# Regular expressions for locating a date or datetime inside free text.
#
# Every entry has one capturing group (group 1) that wraps the complete
# matched date/datetime string.
#
# The list is ordered from most specific to least specific: full
# "date + HH:MM:SS" forms come first, then "date + HH:MM", then the Chinese
# "HH时MM分" form, and finally the date-only fallbacks. Presumably consumers
# try the patterns in order and stop at the first hit -- confirm at the call
# site before reordering.
#
# Each clock-time shape is listed twice -- once for hours 0-19
# (`[0-1]?[0-9]`) and once for hours 20-23 (`2[0-3]`).
DATETIME_PATTERN = [
    # --- 4-digit year, "-", "/" or "." separated -------------------------
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
    # --- 2-digit year, "-", "/" or "." separated -------------------------
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
    # --- 4-digit year, Chinese 年/月/日 markers ---------------------------
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
    # --- 2-digit year, Chinese 年/月/日 markers ---------------------------
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
    # --- no year, Chinese 月/日 markers -----------------------------------
    r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
    # --- date-only fallbacks (no clock time) ------------------------------
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2})",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2})",
    r"(\d{4}年\d{1,2}月\d{1,2}日)",
    r"(\d{2}年\d{1,2}月\d{1,2}日)",
    r"(\d{1,2}月\d{1,2}日)",
]

# Some particularly well-structured news sites expose the publish time
# directly in their HTML <meta> tags. Each entry is an XPath expression that
# selects the `content` attribute of one such tag.
PUBLISH_TIME_META = [
    '//meta[starts-with(@property, "rnews:datePublished")]/@content',
    '//meta[starts-with(@property, "article:published_time")]/@content',
    '//meta[starts-with(@property, "og:published_time")]/@content',
    '//meta[starts-with(@property, "og:release_date")]/@content',
    '//meta[starts-with(@itemprop, "datePublished")]/@content',
    '//meta[starts-with(@itemprop, "dateUpdate")]/@content',
    '//meta[starts-with(@name, "OriginalPublicationDate")]/@content',
    '//meta[starts-with(@name, "article_date_original")]/@content',
    '//meta[starts-with(@name, "og:time")]/@content',
    '//meta[starts-with(@name, "apub:time")]/@content',
    '//meta[starts-with(@name, "publication_date")]/@content',
    '//meta[starts-with(@name, "sailthru.date")]/@content',
    '//meta[starts-with(@name, "PublishDate")]/@content',
    '//meta[starts-with(@name, "publishdate")]/@content',
    '//meta[starts-with(@name, "PubDate")]/@content',
    '//meta[starts-with(@name, "pubtime")]/@content',
    '//meta[starts-with(@name, "_pubtime")]/@content',
    # NOTE(review): the space after "weibo:" is kept exactly as in the
    # original -- confirm against real pages before "fixing" it.
    '//meta[starts-with(@name, "weibo: article:create_at")]/@content',
    '//meta[starts-with(@pubdate, "pubdate")]/@content',
]