关键点
1、正则匹配有效数据
2、多线程并发获取数据
代码展示
/// <summary>
/// Minimal page crawler: downloads the HTML behind a URI and extracts
/// values from it with a regular expression.
/// </summary>
class Crawler
{
    /// <summary>
    /// Address of the page to download.
    /// </summary>
    public string Uri { get; set; }

    /// <summary>
    /// Creates a crawler bound to the given URI.
    /// </summary>
    /// <param name="uri">Address that <see cref="DownlodHtml"/> will download.</param>
    public Crawler(string uri)
    {
        this.Uri = uri;
    }

    /// <summary>
    /// Downloads the page at <see cref="Uri"/> and returns it as a string,
    /// using UTF-8 as the client encoding.
    /// </summary>
    /// <returns>
    /// The downloaded HTML, or <c>null</c> when the download threw; in that
    /// case the exception message is written to the console.
    /// </returns>
    public string DownlodHtml()
    {
        using (WebClient client = new WebClient())
        {
            client.Encoding = Encoding.UTF8;
            try
            {
                return client.DownloadString(this.Uri);
            }
            catch (Exception e)
            {
                // Deliberate best effort: report the failure and hand back null
                // so the caller can skip this page instead of crashing.
                Console.WriteLine(e.Message);
                return null;
            }
        }
    }

    /// <summary>
    /// Runs <paramref name="regx"/> over <paramref name="html"/> and collects
    /// the value of capture group <paramref name="i"/> from every match.
    /// </summary>
    /// <param name="regx">Regular expression pattern to apply.</param>
    /// <param name="html">
    /// Text to search. May be <c>null</c> (which is what
    /// <see cref="DownlodHtml"/> returns on failure); that is treated as
    /// "nothing to extract".
    /// </param>
    /// <param name="i">Index of the capture group whose value is collected.</param>
    /// <returns>
    /// One entry per match, in match order, or <c>null</c> when
    /// <paramref name="html"/> is <c>null</c> or the pattern matches nothing.
    /// </returns>
    public List<string> GetData(string regx, string html, int i)
    {
        // Regex.Matches throws ArgumentNullException on a null input, and a
        // failed download yields exactly that; report it the same way as
        // "no matches" so the download/extract pipeline does not crash.
        if (html == null)
        {
            return null;
        }

        MatchCollection matches = Regex.Matches(html, regx);
        if (matches.Count == 0)
        {
            return null;
        }

        List<string> values = new List<string>(matches.Count);
        foreach (Match match in matches)
        {
            values.Add(match.Groups[i].Value);
        }
        return values;
    }
}
/// <summary>
/// Wraps one crawl job so it can run as a thread-pool work item and signal
/// completion through a shared <see cref="ManualResetEvent"/>.
/// </summary>
class MultiThreadCrawler
{
    // Address handed to the crawl routines.
    private readonly string _uri;

    // Event used to notify waiting threads that this job has finished.
    private readonly ManualResetEvent _doneEvent;

    /// <summary>
    /// Creates a job for <paramref name="uri"/> that signals
    /// <paramref name="doneEvent"/> when it finishes.
    /// </summary>
    /// <param name="uri">Address to crawl.</param>
    /// <param name="doneEvent">Event that is set once the job has run.</param>
    public MultiThreadCrawler(string uri, ManualResetEvent doneEvent)
    {
        _uri = uri;
        _doneEvent = doneEvent;
    }

    /// <summary>
    /// Work-item entry point (signature matches <c>WaitCallback</c>): runs
    /// <c>AutoGetData</c> for the stored URI, then signals the done event.
    /// </summary>
    /// <param name="threadContext">Unused state object.</param>
    public void ThreadPoolCallback(Object threadContext)
    {
        // NOTE(review): AutoGetData is not defined in the visible excerpt;
        // it is presumably declared elsewhere in this class - confirm.
        try
        {
            AutoGetData(_uri);
        }
        finally
        {
            // Signal even when the crawl throws; otherwise a thread waiting
            // on the event would block forever.
            _doneEvent.Set();
        }
    }

    /// <summary>
    /// Work-item entry point (signature matches <c>WaitCallback</c>): runs
    /// <c>AutoGetDataByPageNum</c> for the stored URI, then signals the done
    /// event.
    /// </summary>
    /// <param name="state">Unused state object.</param>
    public void ThreadPoolCallBackGetOnce(Object state)
    {
        // NOTE(review): AutoGetDataByPageNum is not defined in the visible
        // excerpt; it is presumably declared elsewhere in this class - confirm.
        try
        {
            AutoGetDataByPageNum(_uri);
        }
        finally
        {
            // Signal even when the crawl throws; otherwise a thread waiting
            // on the event would block forever.
            _doneEvent.Set();
        }
    }
}
参考资料
1、https://msdn.microsoft.com/zh-CN/library/3dasc8as.aspx