zoukankan      html  css  js  c++  java
  • .NET抓取网页数据

    关键点

    1、正则匹配有效数据

    2、多线程并发获取数据

    代码展示

     /// <summary>
     /// Downloads the text of a single address and extracts regular-expression
     /// capture groups from HTML text.
     /// </summary>
     class Crawler
     {
         /// <summary>
         /// Address of the resource to download.
         /// </summary>
         public string Uri { get; set; }

         /// <summary>
         /// Creates a crawler bound to the given address.
         /// </summary>
         /// <param name="uri">Address stored in <see cref="Uri"/>.</param>
         public Crawler(string uri)
         {
             this.Uri = uri;
         }

         /// <summary>
         /// Downloads the resource at <see cref="Uri"/> as a string, using a
         /// <see cref="WebClient"/> whose <c>Encoding</c> is set to UTF-8.
         /// </summary>
         /// <returns>
         /// The downloaded text, or <c>null</c> when the download throws; in that
         /// case the exception message is written to the console.
         /// </returns>
         public string DownloadHtml()
         {
             using (WebClient wc = new WebClient())
             {
                 wc.Encoding = Encoding.UTF8;
                 try
                 {
                     return wc.DownloadString(this.Uri);
                 }
                 catch (Exception e)
                 {
                     // Deliberate best effort: any failure is reported on the
                     // console and signalled to the caller as null.
                     Console.WriteLine(e.Message);
                     return null;
                 }
             }
         }

         /// <summary>
         /// Misspelled original name of <see cref="DownloadHtml"/>, kept so that
         /// existing callers continue to compile.
         /// </summary>
         /// <returns>Same result as <see cref="DownloadHtml"/>.</returns>
         public string DownlodHtml()
         {
             return DownloadHtml();
         }

         /// <summary>
         /// Collects the value of one capture group from every match of a pattern.
         /// </summary>
         /// <param name="regx">Regular-expression pattern to apply.</param>
         /// <param name="html">
         /// Text to search. May be <c>null</c> (for example a failed download),
         /// in which case no data is returned.
         /// </param>
         /// <param name="i">Index of the capture group to read from each match.</param>
         /// <returns>
         /// One entry per match, in match order, or <c>null</c> when
         /// <paramref name="html"/> is <c>null</c> or nothing matches.
         /// </returns>
         public List<string> GetData(string regx, string html, int i)
         {
             // The download method reports failure as null; treat that as
             // "no data" instead of letting Regex.Matches throw
             // ArgumentNullException.
             if (html == null)
             {
                 return null;
             }

             MatchCollection matches = Regex.Matches(html, regx);
             if (matches.Count == 0)
             {
                 // Callers rely on null (not an empty list) to mean "no match".
                 return null;
             }

             List<string> values = new List<string>(matches.Count);
             foreach (Match match in matches)
             {
                 values.Add(match.Groups[i].Value);
             }
             return values;
         }
     }
     /// <summary>
     /// Pairs an address with a <see cref="ManualResetEvent"/> and exposes
     /// callbacks that process the address and then signal the event.
     /// </summary>
     class MultiThreadCrawler
     {
         // Address handed to the crawl routines.
         private readonly string _uri;

         // Signalled when a callback has finished so that waiting threads can
         // proceed.
         private readonly ManualResetEvent _doneEvent;

         /// <summary>
         /// Creates a work item for one address.
         /// </summary>
         /// <param name="uri">Address passed to the crawl routine.</param>
         /// <param name="doneEvent">Event set once a callback has finished.</param>
         public MultiThreadCrawler(string uri, ManualResetEvent doneEvent)
         {
             _uri = uri;
             _doneEvent = doneEvent;
         }

         /// <summary>
         /// Runs <c>AutoGetData</c> for the stored address, then sets the
         /// completion event.
         /// </summary>
         /// <param name="threadContext">Unused state object.</param>
         public void ThreadPoolCallback(Object threadContext)
         {
             try
             {
                 // NOTE(review): AutoGetData is defined outside this class;
                 // its behaviour is not visible here.
                 AutoGetData(_uri);
             }
             finally
             {
                 // Signal in a finally block so the event is not left unset
                 // when the call above throws and the exception is handled
                 // further up the stack.
                 _doneEvent.Set();
             }
         }

         /// <summary>
         /// Runs <c>AutoGetDataByPageNum</c> for the stored address, then sets
         /// the completion event.
         /// </summary>
         /// <param name="state">Unused state object.</param>
         public void ThreadPoolCallBackGetOnce(Object state)
         {
             try
             {
                 // NOTE(review): AutoGetDataByPageNum is defined outside this
                 // class; its behaviour is not visible here.
                 AutoGetDataByPageNum(_uri);
             }
             finally
             {
                 // Same reasoning as ThreadPoolCallback: always signal.
                 _doneEvent.Set();
             }
         }
     }

    参考资料
    1、https://msdn.microsoft.com/zh-CN/library/3dasc8as.aspx

  • 相关阅读:
    AC自动机
    【洛谷P1972】HH的项链
    【洛谷P4341】外星联络
    【洛谷P4576】棋盘游戏
    【JZOJ3800】败屩妖
    【JZOJ3798】临洮巨人
    【洛谷P3830】随机树
    【JZOJ3799】青蛙神
    牛客练习赛56 题解
    【洛谷P5300】与或和
  • 原文地址:https://www.cnblogs.com/BurtBlog/p/5036323.html
Copyright © 2011-2022 走看看