zoukankan      html  css  js  c++  java
  • 随手正则写的 CSDN【只看楼主】功能

    写这个功能的时候,我没注意到 CSDN 其实已经自带【只看楼主】了,等代码写完才发现它早就有了。

    现把代码贴出来吧,虽然有很多解析HTML的开源类库如:http://htmlagilitypack.codeplex.com/,但我一直习惯于正则匹配。

    截图:

    呵呵,起码还能看吧@——#

      /// <summary>
      /// Click handler: downloads the topic whose address is typed into
      /// <c>txtCsdnUrl</c>, extracts the posts returned by <c>GetLZArticle</c>
      /// from every page, and shows the combined text in <c>richTextBox1</c>.
      /// </summary>
      /// <param name="sender">Event source (unused).</param>
      /// <param name="e">Event data (unused).</param>
      private void button1_Click(object sender, EventArgs e)
      {
          string url = txtCsdnUrl.Text.Trim();
          if (string.IsNullOrEmpty(url))
          {
              MessageBox.Show("请输入地址");
              return;
          }

          try
          {
              // The page fetched to read the page count is also used as page 1,
              // so it is not downloaded a second time as "?page=1".
              // NOTE(review): assumes the address is a bare topic URL without a
              // query string, as the "?page=" suffix below already requires.
              string firstPage = GetHtmlSource(url);
              int pageCount = GetPageCount(firstPage);

              StringBuilder content = new StringBuilder(GetLZArticle(firstPage));
              for (int page = 2; page <= pageCount; page++)
              {
                  content.Append(GetLZArticle(GetHtmlSource(url + "?page=" + page)));
              }

              richTextBox1.Text = content.ToString();
          }
          catch (WebException ex)
          {
              // A failed download should be reported, not crash the UI handler.
              MessageBox.Show(ex.Message);
          }
      }
     33 
     /// <summary>
     /// Downloads the resource at <paramref name="Url"/> and returns its body
     /// decoded as UTF-8 text.
     /// </summary>
     /// <param name="Url">Address of the page to download.</param>
     /// <returns>The complete response body as a string.</returns>
     public string GetHtmlSource(string Url)
     {
         // WebClient and the response stream are both IDisposable; the nested
         // using blocks release them even when the request or the read throws.
         using (WebClient client = new WebClient())
         {
             // Send a browser-like user agent with the request.
             client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");

             using (Stream data = client.OpenRead(Url))
             using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
             {
                 return reader.ReadToEnd();
             }
         }
     }
     52 
     /// <summary>
     /// Reads the total number of pages of a topic from the page-jump drop-down
     /// (<c>&lt;select class="jumpMenu" name="jumpMenu"&gt;</c>) in the page HTML.
     /// Topic page URLs look like http://bbs.csdn.net/topics/390730011?page=2
     /// </summary>
     /// <param name="HtmlSource">HTML of a topic page.</param>
     /// <returns>
     /// The text of the last <c>&lt;option&gt;</c> in the drop-down parsed as an
     /// integer; 0 when the drop-down has no options or that text is not a number.
     /// </returns>
     public int GetPageCount(string HtmlSource)
     {
         Regex menuRegex = new Regex("<select class=\"jumpMenu\" name=\"jumpMenu\">(?<val>.*?)</select>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
         Regex optionRegex = new Regex("<option.*?>(?<val>.*?)</option>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

         // Evaluate the drop-down once; an unmatched menu yields an empty string
         // and therefore an empty option collection.
         string menuHtml = menuRegex.Match(HtmlSource).Groups["val"].Value;
         MatchCollection options = optionRegex.Matches(menuHtml);

         // No pager on the page: indexing [Count - 1] would throw, so report 0
         // and let the caller treat the topic as a single page.
         if (options.Count == 0)
         {
             return 0;
         }

         int pageCount;
         int.TryParse(options[options.Count - 1].Groups["val"].Value, out pageCount);
         return pageCount;
     }
     71 
     /// <summary>
     /// Extracts the article title: the content of the first
     /// <c>&lt;span class="title text_overflow"&gt;</c> element in the page.
     /// </summary>
     /// <param name="HtmlSource">HTML of the page.</param>
     /// <returns>The matched title text, or an empty string when no such span exists.</returns>
     public string GetArticleTitle(string HtmlSource)
     {
         // Quotes inside the pattern must be escaped to form a valid literal.
         Regex titleRegex = new Regex("<span class=\"title text_overflow\">(?<title>.*?)</span>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

         // An unsuccessful match exposes empty groups, so this yields "".
         return titleRegex.Match(HtmlSource).Groups["title"].Value;
     }
     87 
     88 
     /// <summary>
     /// Extracts the author name: the content of the first
     /// <c>&lt;a class="p-author" href="#"&gt;</c> link in the page.
     /// </summary>
     /// <param name="HtmlSource">HTML of the page.</param>
     /// <returns>The matched link text, or an empty string when no such link exists.</returns>
     public string GetAuthorName(string HtmlSource)
     {
         // Default options on purpose: case-sensitive, and '.' stops at a line
         // break, so the name must sit on one line. Quotes are escaped to form
         // a valid literal.
         Regex authorRegex = new Regex("<a class=\"p-author\" href=\"#\">(?<value>.*?)</a>");

         return authorRegex.Match(HtmlSource).Groups["value"].Value;
     }
     99 
    /// <summary>
    /// Collects the bodies of all posts on the page whose
    /// <c>data-username</c> attribute equals the name returned by
    /// <c>GetAuthorName</c> for the same page.
    /// </summary>
    /// <param name="HtmlSource">HTML of a topic page.</param>
    /// <returns>
    /// The matched <c>post_body</c> contents, each followed by a separator line,
    /// trimmed, with every <c>&lt;br /&gt;</c> replaced by a line break; an empty
    /// string when nothing matches.
    /// </returns>
    public string GetLZArticle(string HtmlSource)
    {
        string authorName = GetAuthorName(HtmlSource);

        // The name is data, not pattern text: escape it so characters such as
        // '.', '+' or '(' in a user name cannot alter or break the expression.
        Regex postRegex = new Regex("<td valign=\"top\" class=\"post_info .*?\" data-username=\"" + Regex.Escape(authorName) + "\".*?>.*?<div class=\"post_body\">(?<value>.*?)</div>.*?</td>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Run the expression once and accumulate in a builder instead of
        // re-matching the whole page on every iteration.
        StringBuilder posts = new StringBuilder();
        foreach (Match post in postRegex.Matches(HtmlSource))
        {
            posts.Append(post.Groups["value"].Value);
            posts.Append("--------------------分隔线--------------------");
        }

        // NOTE(review): the published listing shows a raw line break inside this
        // literal (the escape sequence was lost); "\n" is assumed — confirm.
        return posts.ToString().Trim().Replace("<br />", "\n");
    }

    代码都在这里了。

  • 相关阅读:
    06 is和==的区别 encode()编码 decode()解码
    05 dic的增删改查 字典的嵌套 考试题dic.get()的相关使用
    03 编码 int ,bool,str的常用操作 主要讲str
    01 基本数据类型 变量 if语句
    04 列表的增删改查 常用方法 元祖 range
    02 while循环 格式化输出 运算符
    多校2 Harmonious Army hdu6598 网络流
    P3159 [CQOI2012]交换棋子 网络流
    P2172 [国家集训队]部落战争 最大流
    P2402 奶牛隐藏 网络流
  • 原文地址:https://www.cnblogs.com/pandait/p/CSDN_See_LouZhu.html
Copyright © 2011-2022 走看看