zoukankan      html  css  js  c++  java
  • 随手正则写的 CSDN【只看楼主】功能

    写这个的时候没有注意到 CSDN 原来已经自带这个功能,代码写完之后才发现早就有了。

    现在把代码贴出来吧。虽然有很多解析 HTML 的开源类库(例如 HtmlAgilityPack:http://htmlagilitypack.codeplex.com/ ),但我一直习惯用正则匹配。

    截图:

    呵呵,起码还能看吧@——#

      /// <summary>
      /// Click handler: downloads the thread whose address is in <c>txtCsdnUrl</c>,
      /// extracts the posts returned by <c>GetLZArticle</c> from every page and shows
      /// the concatenated text in <c>richTextBox1</c>.
      /// </summary>
      /// <param name="sender">Event source (unused).</param>
      /// <param name="e">Event data (unused).</param>
      private void button1_Click(object sender, EventArgs e)
      {
          // Trim once and bail out early when nothing was entered.
          string url = txtCsdnUrl.Text.Trim();
          if (string.IsNullOrEmpty(url))
          {
              MessageBox.Show("请输入地址");
              return;
          }

          try
          {
              string htmlSource = GetHtmlSource(url);
              int pageCount = GetPageCount(htmlSource);

              // StringBuilder avoids re-copying the accumulated text on every page.
              StringBuilder context = new StringBuilder();

              if (pageCount > 1)
              {
                  // Multi-page thread: every page is requested explicitly via "?page=N".
                  for (int i = 1; i <= pageCount; i++)
                  {
                      htmlSource = GetHtmlSource(url + "?page=" + i);
                      context.Append(GetLZArticle(htmlSource));
                  }
              }
              else
              {
                  // Single page: the HTML that was already downloaded is enough.
                  context.Append(GetLZArticle(htmlSource));
              }

              richTextBox1.Text = context.ToString();
          }
          catch (WebException ex)
          {
              // A failed download must not take the whole form down; report it instead.
              MessageBox.Show(ex.Message);
          }
      }
     33 
     /// <summary>
     /// Downloads the resource at <paramref name="Url"/> and returns its body
     /// decoded as UTF-8 text.
     /// </summary>
     /// <param name="Url">Address of the page to download.</param>
     /// <returns>The complete response body as a string.</returns>
     public string GetHtmlSource(string Url)
     {
         // WebClient, the response stream and the reader are all IDisposable;
         // the using blocks release the connection even if reading throws.
         using (WebClient client = new WebClient())
         {
             client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");

             using (Stream data = client.OpenRead(Url))
             using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
             {
                 return reader.ReadToEnd();
             }
         }
     }
     52 
     /// <summary>
     /// Reads the total number of pages of a thread from its page-jump menu.
     /// Page URLs have the form http://bbs.csdn.net/topics/390730011?page=2
     /// </summary>
     /// <param name="HtmlSource">HTML of one page of the thread.</param>
     /// <returns>
     /// The number shown in the last option of the jump menu, or 0 when the menu
     /// is missing, has no options, or its last option is not a number.
     /// </returns>
     public int GetPageCount(string HtmlSource)
     {
         Regex menuRegex = new Regex("<select class=\"jumpMenu\" name=\"jumpMenu\">(?<val>.*?)</select>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
         Regex optionRegex = new Regex("<option.*?>(?<val>.*?)</option>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

         // Evaluate both patterns once; an unmatched menu yields an empty string here.
         string menuHtml = menuRegex.Match(HtmlSource).Groups["val"].Value;
         MatchCollection options = optionRegex.Matches(menuHtml);

         // No jump menu (e.g. a single-page thread): indexing [Count - 1] would throw.
         if (options.Count == 0)
         {
             return 0;
         }

         // The last option carries the highest page number.
         int pageCount;
         int.TryParse(options[options.Count - 1].Groups["val"].Value, out pageCount);

         return pageCount;
     }
     71 
     /// <summary>
     /// Extracts the thread title from the page HTML.
     /// </summary>
     /// <param name="HtmlSource">HTML of the page.</param>
     /// <returns>
     /// The inner text of the first <c>&lt;span class="title text_overflow"&gt;</c>
     /// element, or an empty string when no such element is found.
     /// </returns>
     public string GetArticleTitle(string HtmlSource)
     {
         // Quotes inside the pattern must be escaped for the literal to compile.
         Regex reg = new Regex("<span class=\"title text_overflow\">(?<title>.*?)</span>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

         // An unsuccessful match exposes empty groups, so this yields "" on a miss.
         return reg.Match(HtmlSource).Groups["title"].Value;
     }
     87 
     88 
     /// <summary>
     /// Extracts the author name from the page HTML.
     /// </summary>
     /// <param name="HtmlSource">HTML of the page.</param>
     /// <returns>
     /// The inner text of the first <c>&lt;a class="p-author" href="#"&gt;</c>
     /// element, or an empty string when no such element is found.
     /// </returns>
     public string GetAuthorName(string HtmlSource)
     {
         // Quotes inside the pattern must be escaped for the literal to compile.
         // No RegexOptions here: the match is case-sensitive and stays on one line.
         Regex regex = new Regex("<a class=\"p-author\" href=\"#\">(?<value>.*?)</a>");

         return regex.Match(HtmlSource).Groups["value"].Value;
     }
     99 
    /// <summary>
    /// Collects the bodies of all posts on the page whose <c>data-username</c>
    /// attribute equals the name returned by <c>GetAuthorName</c> for the same page.
    /// </summary>
    /// <param name="HtmlSource">HTML of one page of the thread.</param>
    /// <returns>
    /// The matching post bodies, each followed by a separator line, with
    /// <c>&lt;br /&gt;</c> tags turned into line breaks; an empty string when
    /// nothing matches.
    /// </returns>
    public string GetLZArticle(string HtmlSource)
    {
        string authorName = GetAuthorName(HtmlSource);

        // The name is data, not pattern syntax: escape it so characters such as
        // '.', '+' or '(' in a user name cannot alter or break the expression.
        Regex regex = new Regex(
            "<td valign=\"top\" class=\"post_info .*?\" data-username=\"" + Regex.Escape(authorName) + "\".*?>.*?<div class=\"post_body\">(?<value>.*?)</div>.*?</td>",
            RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Run the match once and append into a StringBuilder instead of
        // re-evaluating Matches() and re-copying the string on every iteration.
        StringBuilder result = new StringBuilder();
        foreach (Match post in regex.Matches(HtmlSource))
        {
            result.Append(post.Groups["value"].Value);
            result.Append("--------------------分隔线--------------------");
        }

        return result.ToString().Trim().Replace("<br />", "\n");
    }

    代码都在这里了。

  • 相关阅读:
    Unit of Work
    Layered Supertype
    Domain Model
    ASP.Net设计模式读书笔记
    VS2010无法使用nuget安装第三方包的问题
    数据库对象命名
    sql50题
    RESTFul API
    EasyUI日历控件
    ASP.NET MVC 防止前端点击劫持
  • 原文地址:https://www.cnblogs.com/pandait/p/CSDN_See_LouZhu.html
Copyright © 2011-2022 走看看