zoukankan      html  css  js  c++  java
  • 用正则表达式去截取网页里文字的方法。参数为读取的网页源代码

     //抓取文字方法,参数为网页源代码
            /// <summary>
            /// Reduces a page's HTML source to text by running it through the
            /// comment, script, style and tag removal helpers, in that order.
            /// </summary>
            /// <param name="strHtml">HTML source of the page.</param>
            /// <returns>The remaining text with leading and trailing whitespace trimmed.</returns>
            public string ExtractText(string strHtml)
            {
                // Each stage feeds the next; the order matches the pipeline's original sequence.
                string withoutComments = RemoveComment(strHtml);
                string withoutScripts = RemoveScript(withoutComments);
                string withoutStyles = RemoveStyle(withoutScripts);
                string plainText = RemoveTags(withoutStyles);

                return plainText.Trim();
            }
            #region 
            //去除符号方法。把网页源代码作为参数,根据正则表达式去除相应符号。代码需要背过
            /// <summary>
            /// Removes HTML comments (<c>&lt;!-- ... --&gt;</c>) from the input.
            /// </summary>
            /// <param name="input">HTML text to clean.</param>
            /// <returns>The input with every comment block deleted.</returns>
            private string RemoveComment(string input)
            {
                // Lazy ".*?" stops at the first "-->" and Singleline lets '.' cross line
                // breaks, so comments that contain hyphens or span several lines are
                // removed as well. A "[^-]*" body would skip any comment containing '-'.
                return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
            }
            /// <summary>
            /// Deletes every <c>&lt;style&gt;...&lt;/style&gt;</c> block, including its contents.
            /// </summary>
            /// <param name="input">HTML text to clean.</param>
            /// <returns>The input without style blocks.</returns>
            private string RemoveStyle(string input)
            {
                // Case-insensitive tag match; Singleline lets the block span multiple lines.
                const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
                Regex styleBlock = new Regex(@"<style[^>]*?>.*?</style>", options);

                return styleBlock.Replace(input, string.Empty);
            }
            //去掉js方法
            /// <summary>
            /// Deletes every <c>&lt;script&gt;...&lt;/script&gt;</c> block, including its contents.
            /// </summary>
            /// <param name="input">HTML text to clean.</param>
            /// <returns>The input without script blocks.</returns>
            private string RemoveScript(string input)
            {
                string scriptPattern = @"<script[^>]*?>.*?</script>";

                // Ignore tag casing and allow the script body to contain line breaks.
                return Regex.Replace(
                    input,
                    scriptPattern,
                    string.Empty,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);
            }
            //Strips HTML tags and converts common HTML entities to plain characters
            /// <summary>
            /// Strips the remaining HTML tags and decodes the common entities
            /// <c>&amp;nbsp;</c>, <c>&amp;lt;</c>, <c>&amp;gt;</c> and <c>&amp;amp;</c>.
            /// </summary>
            /// <param name="input">HTML text with comments, scripts and styles already removed.</param>
            /// <returns>Text in which <c>&lt;br&gt;</c> became a line break and all other tags are gone.</returns>
            private string RemoveTags(string input)
            {
                // Turn line-break tags into real line breaks before the generic tag strip
                // would discard them. Accepts <br>, <BR>, <br/> and <br />.
                string result = Regex.Replace(input, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);

                // Remove every remaining tag; [\s\S] also matches line breaks inside a tag.
                result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

                // Decode entities only after tags are gone, so a decoded '<' or '>' in the
                // text is never mistaken for markup and deleted.
                result = result.Replace("&nbsp;", " ");
                result = result.Replace("&lt;", "<");
                result = result.Replace("&gt;", ">");
                // "&amp;" goes last so that "&amp;lt;" ends up as the literal text "&lt;".
                result = result.Replace("&amp;", "&");

                return result;
            }
            #endregion
            //批量抓取邮箱
            /// <summary>
            /// Menu click handler: finds every e-mail address in <c>respHtml</c> and
            /// lists the matches in <c>textBox2</c>, one per line.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void 转换工具ZToolStripMenuItem_Click(object sender, EventArgs e)
            {
                textBox2.Clear();

                // Regex.Matches throws on a null input; with no text to scan the
                // output box simply stays empty.
                if (string.IsNullOrEmpty(respHtml))
                {
                    return;
                }

                // Local part: letters, digits, '_', '.', '-' (the '-' sits last in the class
                // so it is a literal, not a range). Domain: word characters separated by dots.
                MatchCollection mc = Regex.Matches(respHtml, @"[a-zA-Z0-9_.-]+@\w+(\.\w+)+");

                StringBuilder sb = new StringBuilder();
                foreach (Match mm in mc)
                {
                    sb.AppendLine(mm.Value);
                }

                textBox2.Text = sb.ToString();
            }
         
  • 相关阅读:
    Codeforces Round #171 (Div. 2)
    ACdream 1079 郭式树
    HDOJ 1517 博弈论
    ACdream 1080 面面数
    博弈论 Nim 博弈
    Codeforces Round #172 (Div. 2)
    ACdream 1084 同心树
    STL bitset
    博弈论 bash博弈
    POJ 3261 后缀数组
  • 原文地址:https://www.cnblogs.com/275147378abc/p/4581580.html
Copyright © 2011-2022 走看看