zoukankan      html  css  js  c++  java
  • 发布时间提取正则汇总

    # Regular expressions for locating a date or date-time inside free text.
    #
    # Every pattern has exactly one capture group (group 1) holding the whole
    # matched date/time string. The list is ordered from most to least
    # specific: date + time with seconds, date + time without seconds,
    # date + "H时M分", and finally date-only forms, so a consumer that tries
    # the patterns in order gets the most complete match first.
    #
    # Date forms covered (each with a 4-digit year, 2-digit year, or no year):
    #   2020-11-18 / 2020/11/18 / 2020.11.18   and   2020年11月18日 / 11月18日
    #
    # NOTE: these must be raw strings containing real regex escapes (\d, \s).
    # Without the backslashes "d{4}" matches four literal "d" characters and
    # the patterns never match an actual date.
    # NOTE: the separator class is [-/.]; a "|" inside a character class is a
    # literal pipe, not alternation, so it is deliberately not included.
    DATETIME_PATTERN = [
        # 4-digit year, "-", "/" or "." separated
        r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
        r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
        # Hour 0-23 followed by 时, minute 0-59 followed by 分.
        r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
        # 2-digit year, "-", "/" or "." separated
        r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
        r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
        r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
        # 4-digit year, 年/月/日 form
        r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
        r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
        r"(\d{4}年\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
        # 2-digit year, 年/月/日 form
        r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
        r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
        r"(\d{2}年\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
        # No year, 月/日 form
        r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
        r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
        r"(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
        r"(\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时[0-5]?\d分)",
        # Date only, no time component
        r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2})",
        r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2})",
        r"(\d{4}年\d{1,2}月\d{1,2}日)",
        r"(\d{2}年\d{1,2}月\d{1,2}日)",
        r"(\d{1,2}月\d{1,2}日)",
    ]
    
    # Some well-structured news sites expose the publish time directly in the
    # page's <meta> tags. Each entry below is an XPath that selects the
    # @content of a <meta> element whose named attribute starts with the
    # given value; the list is built from (attribute, value-prefix) pairs.
    PUBLISH_TIME_META = [
        '//meta[starts-with(@{}, "{}")]/@content'.format(attribute, prefix)
        for attribute, prefix in (
            ("property", "rnews:datePublished"),
            ("property", "article:published_time"),
            ("property", "og:published_time"),
            ("property", "og:release_date"),
            ("itemprop", "datePublished"),
            ("itemprop", "dateUpdate"),
            ("name", "OriginalPublicationDate"),
            ("name", "article_date_original"),
            ("name", "og:time"),
            ("name", "apub:time"),
            ("name", "publication_date"),
            ("name", "sailthru.date"),
            ("name", "PublishDate"),
            ("name", "publishdate"),
            ("name", "PubDate"),
            ("name", "pubtime"),
            ("name", "_pubtime"),
            ("name", "weibo: article:create_at"),
            ("pubdate", "pubdate"),
        )
    ]
  • 相关阅读:
    【转】《基于MFC的OpenGL编程》Part 5 Transformations Rotations, Translations and Scaling
    【转】 《基于MFC的OpenGL编程》Part 10 Texture Mapping
    【转】 《基于MFC的OpenGL编程》Part 11 Blending, Antialiasing and Fog
    win form 托盘功能的实现(引用CSDN)
    C# win form退出窗体时对话框实用
    智能DNS 笔记
    iis无法启动, 找出占用80端口的罪魁祸首
    gvim for windows的剪贴板操作
    内容交换
    Content Networking 读书笔记
  • 原文地址:https://www.cnblogs.com/xuchunlin/p/14000003.html
Copyright © 2011-2022 走看看