zoukankan      html  css  js  c++  java
  • 正文提取中用到的正则表达式

     #region 相关正则表达式
     /// <summary>
     /// Removes all HTML tags: a single alternation whose every alternative is a piece of
     /// markup to strip from a page.
     /// </summary>
     /// <remarks>
     /// Alternatives, in pattern order:
     /// 1. (unnamed) a square-bracket tag with an optional "=value" part, followed lazily by
     ///    any text up to a closing "[/...]" bracket tag.
     ///    NOTE(review): RegexOptions.ExplicitCapture turns the unnamed groups into
     ///    non-capturing ones, so the \1 back-reference presumably does not refer to the
     ///    bracket tag name as the pattern layout suggests - verify against real input.
     /// 2. lj      - an anchor element with attributes and at least two characters of link
     ///    text, wrapped in two lookaheads that require the next character not to be in
     ///    U+4E00-U+9FA5, U+FE30-U+FFA0 or one of , . " ) ;
     ///    NOTE(review): the first lookahead sits directly before the opening angle bracket
     ///    of the anchor, so it always succeeds; possibly a lookbehind was intended - confirm.
     /// 3. Style   - a style element including its content.
     /// 4. select  - a select element including its content.
     /// 5. Script  - a script element including its content.
     /// 6. Explein - an HTML comment.
     /// 7. li      - a list item element including its content.
     /// 8. Html    - any other opening or closing tag, with or without attributes.
     /// 9. Other   - a named character entity (ampersand, letters, semicolon).
     /// 10. Other2 - a "#" followed by six letters or digits.
     /// 11. Space  - a run of whitespace.
     /// 12. (unnamed) a numeric character entity (ampersand, "#", digits, semicolon).
     /// Matching is case-insensitive.
     /// </remarks>
     private static readonly Regex FilterAll = new Regex(
     @"(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?<lj>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");]))|(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)|(\&\#\d+\;)",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase); // Disabled alternative that removed every anchor element: (?<Link><a[\s\S]*?</a>)|
     // Commented-out variant of the pattern without the first two and the last alternatives:
     //(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)
     /// <summary>
     /// 找出title标签
     /// </summary>
     private static readonly Regex FindTitle = new Regex(
     @"<\s*/?title\s*>",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出title标签内容
     /// </summary>
     private static readonly Regex FindTitleContent = new Regex(
     @"<\s*/?title\s*>(?<Content>[\s\S]*?)<\s*/?title\s*>",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出h 和Strong标签
     /// </summary>
     private static readonly Regex FindHStrong = new Regex(
     @"<\s*/?h\s*>|<\s*/?strong\s*>",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出p 和br标签
     /// </summary>
     private static readonly Regex FindPB = new Regex(
     @"<\s*/?p\s*>|<\s*br\s*/?>|<\s*/?tr\s*>",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出nbsp标签
     /// </summary>
     private static readonly Regex FindNbsp = new Regex(
     @"&nbsp",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出结尾标签
     /// </summary>
     private static readonly Regex FindS = new Regex(
     @"(?<Content>[\s\S]*?)\$",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出是否为标准句
     /// </summary>
     private static readonly Regex IsSen = new Regex(
     @"[,.,。!!;;::……??《》“”""]",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出是否为垃圾句[strong][h]标签过多的
     /// </summary>
     private static readonly Regex IsWs = new Regex(
     @"\[\(h\)\]",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出是否为垃圾句冒号和·-过多的
     /// </summary>
     private static readonly Regex IsWsM = new Regex(
     @"\[·]|[-]|[::]",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出是否为BBS特征
     /// </summary>
     private static readonly Regex IsBbsInfo = new Regex(
     @"第[^楼]{1,50}楼|Powered\s*/?by[\s\S]*?Dvbbs|Powered\s*/?by[\s\S]*?Discuz",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     
     /// <summary>
     /// 取KEYWORD
     /// </summary>
     private static readonly Regex mKeyWord = new Regex(
     @"<meta\s*name\s*=\s*['""]?keywords['""]?\s*content\s*=\s*['""]?(?<KeyWords>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<KeyWords>[^'"">]*)['""]?\s*name\s*=\s*['""]?keywords['""]?\s*[^>]*>
    ",RegexOptions.ExplicitCapture| RegexOptions.Multiline| RegexOptions.IgnoreCase);
     /// <summary>
     /// 取DESCRIPTION
     /// </summary>
     private static readonly Regex mDescription = new Regex(
     @"<meta\s*name\s*=\s*['""]?description['""]?\s*content\s*=\s*['""]?(?<description>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<description>[^'"">]*)['""]?\s*name\s*=\s*['""]?description['""]?\s*[^>]*>
    ",RegexOptions.ExplicitCapture| RegexOptions.Multiline| RegexOptions.IgnoreCase);
     
     /// <summary>
     /// 取Tags
     /// </summary>
     private static readonly Regex mTag = new Regex(
     @"<meta\s*name\s*=\s*['""]?tagwords['""]?\s*content\s*=\s*['""]?(?<tagwords>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<tagwords>[^'"">]*)['""]?\s*name\s*=\s*['""]?tagwords['""]?\s*[^>]*>
    ", RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出是否为垃圾句:后字符号过少,:号前无“说”字,:号后无"关于"
     /// </summary>
     private static readonly Regex IsWsMM = new Regex(
     @"^[^说\s]{0,8}?[::].{0,10}$",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出spider写入的url标记
     /// </summary>
     private static readonly Regex txtUrl = new Regex(
     @"当前URL为:http://(?<URL>.*)",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     /// <summary>
     /// 找出spider写入的锚点描述标记
     /// </summary>
     private static readonly Regex txtDescription = new Regex(
     @"当前链接描述为:(?<Describe>.*)",
     RegexOptions.ExplicitCapture
     | RegexOptions.Multiline
     | RegexOptions.IgnoreCase);
     ///// <summary>
     ///// 取需要a标签
     ///// </summary>
     //private static readonly Regex cleanFirst = new Regex(
     // @"([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])(?<Robbish1><a\s+[^>]*>)[^<]{1,6}(?<Robbish2></a>)([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])", RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
     #endregion
  • 相关阅读:
    题目834-组队-nyoj20140818
    题目806-HEIHEI的心情-nyoj20140818
    如何配置:断路器Dashboard监控仪表盘
    Hystrix降级策略和超时调整
    微服务调用时的超时异常,使用feign的时候负载均衡策略的调整
    SpringCloud服务间调用:负载均衡策略调整
    微服务调用方式ribbon
    FastJson:Json树的CRUD操作方法实现
    java 面向对象String类
    java 面向对象内部类
  • 原文地址:https://www.cnblogs.com/top5/p/2356057.html
Copyright © 2011-2022 走看看