使用到了以下技术点:
1)使用 WebClient 获取网页源码;
2)正则表达式,解析网页中想要的数据;
3)使用线程池加快网页数据的采集;
4)……
以前写过几次类似的,但是找不到了,又重新写了一个。
代码比较粗糙,求拍砖。
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

namespace SpiderMan
{
    /// <summary>
    /// Main window of the scraper: downloads a range of listing pages on the
    /// thread pool, extracts the table rows with a regular expression and
    /// writes them to the log text box.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Number of queued work items that have not finished yet.
        /// </summary>
        private static int threadCount = 0;

        /// <summary>
        /// Matches one table row and captures its nine cells by name.
        /// The character classes are regex escapes (\d, \w); without the
        /// backslashes the pattern would only match the literal letters
        /// 'd' and 'w' and never find a row.
        /// </summary>
        private static readonly Regex RowPattern = new Regex(
            @"<tr[^<]*<td[^>]*>(?<id>\d*?)</td>[^>]*>(?<prov>\w*)</td>[^>]*>[^>]*>(?<city>\w*)</a></td>[^>]*>(?<city2>\w*)</td>[^>]*>(?<py>\w*)</td>[^>]*>(?<qh>\w*)</td>[^>]*>(?<yb>\w*)</td>[^>]*>(?<dj>[\d.]*)</td>[^>]*>(?<bw>[\d.]*)</td>[^>]*>",
            RegexOptions.Compiled);

        /// <summary>
        /// Shared random source for the politeness delay. Random is not
        /// thread-safe, so every access goes through <see cref="DelayRandomGate"/>.
        /// </summary>
        private static readonly Random DelayRandom = new Random();

        private static readonly object DelayRandomGate = new object();

        /// <summary>
        /// Queues one download/parse job per page, then pumps the message loop
        /// until every job has finished.
        /// </summary>
        /// <param name="sender">Control that raised the click; disabled while the run is active.</param>
        /// <param name="e">Unused.</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // The wait loop below calls DoEvents, which would otherwise let the
            // user click again and start a nested, overlapping run.
            var trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            try
            {
                string urlPattern = "http://www.3464.com/data/zhongguochengshijingweidu/?PageNo={0}";
                int pageFirstIndex = 1;
                int pageLastIndex = 125;

                for (int pageIndex = pageFirstIndex; pageIndex <= pageLastIndex; pageIndex++)
                {
                    string url = string.Format(urlPattern, pageIndex);
                    Log("开始读取url:" + url);

                    // Count the job before it is queued so the counter can never
                    // be decremented ahead of its own increment.
                    Interlocked.Increment(ref threadCount);
                    ThreadPool.QueueUserWorkItem(state =>
                    {
                        var pageUrl = (string)state;
                        try
                        {
                            ParseHtml(GetHttpSource(pageUrl));
                        }
                        catch (Exception ex)
                        {
                            // An exception escaping a thread-pool callback would
                            // take the whole process down; report it and move on.
                            Log("采集失败:" + pageUrl + " " + ex.Message);
                        }
                        finally
                        {
                            // Always release the slot, otherwise a single failed
                            // page would keep the wait loop spinning forever.
                            Interlocked.Decrement(ref threadCount);
                        }
                    }, url);
                }

                // CompareExchange(x, 0, 0) is used as an atomic read of the counter.
                while (Interlocked.CompareExchange(ref threadCount, 0, 0) > 0)
                {
                    // Workers marshal their log output through Invoke, so the UI
                    // thread has to keep processing messages while it waits.
                    Application.DoEvents();
                    Loading();

                    // Yield briefly instead of burning a full core.
                    Thread.Sleep(10);
                }

                Log("数据采集结束");
            }
            finally
            {
                if (trigger != null)
                {
                    trigger.Enabled = true;
                }
            }
        }

        #region ParseHtml

        /// <summary>
        /// Extracts the rows of the data table from a page and logs them,
        /// one comma-separated line per row.
        /// </summary>
        /// <param name="html">Full page source.</param>
        private void ParseHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                Log("解析错误:页面内容为空");
                return;
            }

            // The table of interest starts at the "编号" header cell and ends at
            // the next closing table tag. Either marker may be missing (error
            // page, layout change), which must not throw.
            int beginPos = html.IndexOf("编号", StringComparison.Ordinal);
            int endPos = beginPos < 0
                ? -1
                : html.IndexOf("</table>", beginPos, StringComparison.Ordinal);
            if (endPos < 0)
            {
                Log("解析错误:未找到数据表格");
                return;
            }

            string partHtml = html.Substring(beginPos, endPos - beginPos);

            // Collect all rows of the page and log them in one call: each Log
            // from a worker thread is a synchronous round trip to the UI thread.
            var rows = new StringBuilder();
            foreach (Match m in RowPattern.Matches(partHtml))
            {
                if (rows.Length > 0)
                {
                    rows.Append(Environment.NewLine);
                }

                rows.AppendFormat(
                    "{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}",
                    m.Groups["id"].Value,     // record number
                    m.Groups["prov"].Value,   // province
                    m.Groups["city"].Value,   // prefecture-level city
                    m.Groups["city2"].Value,  // city / county
                    m.Groups["py"].Value,     // pinyin
                    m.Groups["qh"].Value,     // area code
                    m.Groups["yb"].Value,     // postal code
                    m.Groups["dj"].Value,     // east longitude
                    m.Groups["bw"].Value);    // north latitude
            }

            if (rows.Length > 0)
            {
                Log(rows.ToString());
            }
        }

        #endregion

        /// <summary>
        /// Minimal progress indicator: rewrites the window title with a run of
        /// dots whose length depends on the current title length.
        /// </summary>
        private void Loading()
        {
            if (this.InvokeRequired)
            {
                this.Invoke(new MethodInvoker(Loading));
                return;
            }

            const int maxLength = 100;

            // Clamp at zero: a title longer than maxLength would otherwise ask
            // for a negative number of dots and throw.
            int residue = Math.Max(0, maxLength - this.Text.Length);
            this.Text = "采集中" + new string('.', residue);
        }

        #region Log

        /// <summary>
        /// Appends a line to the log text box, marshalling to the UI thread
        /// when called from a worker.
        /// </summary>
        /// <param name="msg">Text to append; a line break is added after it.</param>
        private void Log(string msg)
        {
            if (this.textBox1.InvokeRequired)
            {
                this.Invoke(new MethodInvoker(() => Log(msg)));
                return;
            }

            this.textBox1.AppendText(msg);
            this.textBox1.AppendText(System.Environment.NewLine);
        }

        #endregion

        #region GetHttpSource

        /// <summary>
        /// Downloads the source of a page after a short random delay.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
        private string GetHttpSource(string url)
        {
            // Be gentle with the remote site: wait 100-499 ms before each
            // request. A shared Random is used because instances created at the
            // same moment on several threads can be seeded identically, which
            // would give every request the same delay.
            int delay;
            lock (DelayRandomGate)
            {
                delay = DelayRandom.Next(100, 500);
            }

            Thread.Sleep(delay);

            using (var wc = new WebClient { Encoding = Encoding.Default })
            {
                wc.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
                return wc.DownloadString(url);
            }
        }

        #endregion
    }
}