写这个的时候没有注意到CSDN其实已经有这个功能了,代码写完之后才发现。
现把代码贴出来吧,虽然有很多解析HTML的开源类库如:http://htmlagilitypack.codeplex.com/,但我一直习惯于正则匹配。
截图:
呵呵,起码还能看吧@——#
/// <summary>
/// Click handler: reads the topic URL from <c>txtCsdnUrl</c>, downloads the topic
/// (every page when the page count is greater than one), extracts the posts written
/// by the topic author and shows the concatenated text in <c>richTextBox1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string url = txtCsdnUrl.Text.Trim();
    if (string.IsNullOrEmpty(url))
    {
        MessageBox.Show("请输入地址");
        return;
    }

    try
    {
        string htmlSource = GetHtmlSource(url);
        int pageCount = GetPageCount(htmlSource);
        StringBuilder context = new StringBuilder();

        if (pageCount > 1)
        {
            // Multi-page topic: request each page explicitly via the "?page=N" query.
            for (int i = 1; i <= pageCount; i++)
            {
                htmlSource = GetHtmlSource(url + "?page=" + i);
                context.Append(GetLZArticle(htmlSource));
            }
        }
        else
        {
            // Single page (or page count could not be determined): use the page already downloaded.
            context.Append(GetLZArticle(htmlSource));
        }

        richTextBox1.Text = context.ToString();
    }
    catch (WebException ex)
    {
        // A failed download should be reported to the user instead of escaping the UI event handler.
        MessageBox.Show(ex.Message);
    }
}

/// <summary>
/// Downloads the resource at the given address and returns its body decoded as UTF-8.
/// </summary>
/// <param name="Url">Address to download.</param>
/// <returns>The response body as a string.</returns>
/// <exception cref="WebException">Thrown by <see cref="WebClient.OpenRead(string)"/> when the request fails.</exception>
public string GetHtmlSource(string Url)
{
    // WebClient, the response stream and the reader are all IDisposable; release them deterministically.
    using (WebClient client = new WebClient())
    {
        client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");

        using (Stream data = client.OpenRead(Url))
        using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
}

/// <summary>
/// Gets the total number of pages of a topic by reading the last option of the
/// "jumpMenu" select element. URL format: http://bbs.csdn.net/topics/390730011?page=2
/// </summary>
/// <param name="HtmlSource">HTML of a topic page.</param>
/// <returns>
/// The number parsed from the last option of the jump menu, or 0 when the menu
/// has no options (or is absent) or the option text is not an integer.
/// </returns>
public int GetPageCount(string HtmlSource)
{
    Regex menuRegex = new Regex("<select class=\"jumpMenu\" name=\"jumpMenu\">(?<val>.*?)</select>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
    Regex optionRegex = new Regex("<option.*?>(?<val>.*?)</option>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Evaluate the option list once; an unmatched menu yields an empty group and therefore no options.
    MatchCollection options = optionRegex.Matches(menuRegex.Match(HtmlSource).Groups["val"].Value);
    if (options.Count == 0)
    {
        // No jump menu options: indexing [Count - 1] would throw, so report "unknown" as 0.
        return 0;
    }

    int pageCount;
    // TryParse leaves pageCount at 0 when the option text is not a number.
    int.TryParse(options[options.Count - 1].Groups["val"].Value, out pageCount);

    return pageCount;
}

/// <summary>
/// Gets the topic title from the span with class "title text_overflow".
/// </summary>
/// <param name="HtmlSource">HTML of a topic page.</param>
/// <returns>The captured title text, or an empty string when the span is not found.</returns>
public string GetArticleTitle(string HtmlSource)
{
    Regex reg = new Regex("<span class=\"title text_overflow\">(?<title>.*?)</span>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    return reg.Match(HtmlSource).Groups["title"].Value;
}

/// <summary>
/// Gets the author name from the first anchor with class "p-author".
/// </summary>
/// <param name="HtmlSource">HTML of a topic page.</param>
/// <returns>The captured anchor text, or an empty string when the anchor is not found.</returns>
public string GetAuthorName(string HtmlSource)
{
    Regex regex = new Regex("<a class=\"p-author\" href=\"#\">(?<value>.*?)</a>");

    return regex.Match(HtmlSource).Groups["value"].Value;
}

/// <summary>
/// Extracts the bodies of all posts on the page whose <c>data-username</c> attribute
/// equals the name returned by <see cref="GetAuthorName(string)"/>.
/// </summary>
/// <param name="HtmlSource">HTML of a topic page.</param>
/// <returns>
/// The matching post bodies, each followed by a separator line, trimmed and with
/// every "&lt;br /&gt;" replaced by a space. Empty when nothing matches.
/// </returns>
public string GetLZArticle(string HtmlSource)
{
    string authorName = GetAuthorName(HtmlSource);

    // The author name is page content, not a pattern: escape it so regex metacharacters
    // in a user name cannot change (or break) the expression.
    Regex regex = new Regex(
        "<td valign=\"top\" class=\"post_info .*?\" data-username=\"" + Regex.Escape(authorName) + "\".*?>.*?<div class=\"post_body\">(?<value>.*?)</div>.*?</td>",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    StringBuilder result = new StringBuilder();

    // Run the match once and walk the collection instead of re-matching on every iteration.
    foreach (Match match in regex.Matches(HtmlSource))
    {
        result.Append(match.Groups["value"].Value);
        result.Append("--------------------分隔线--------------------");
    }

    return result.ToString().Trim().Replace("<br />", " ");
}
代码都在这里了。