zoukankan      html  css  js  c++  java
  • 使用代理(WebProxy)爬虫

    关键代码:

     // Proxy pool; filled by GetDailiList() and consumed by GetOneDaili().
     private Hashtable hash;

     // Proxy used by default for outgoing requests.
     private WebProxy currentdaili;

     // Maximum number of requests to send through a single proxy.
     private int dailiExecMaxCount;

     // Number of requests counted so far for the current proxy.
     private int currentDailiExecCount;

     /// <summary>
     /// Creates the handler, sets the request limit and selects the initial proxy.
     /// </summary>
     public Handler2()
     {
         currentDailiExecCount = 0;
         dailiExecMaxCount = 100;

         // The proxy pool itself is loaded on demand inside GetOneDaili().
         currentdaili = GetOneDaili();
     }
    12 
    13 
    /// <summary>
    /// Downloads the proxy listing page at http://www.xici.net.co and collects
    /// the rows of the "ip_list" table whose sixth cell reads "HTTP".
    /// KK 2015-04-22
    /// </summary>
    /// <returns>
    /// A table whose keys and values are both the text of the second and third
    /// cells joined by ":" (presumably "ip:port" - confirm against the page
    /// layout), or <c>null</c> when the page could not be fetched or parsed, or
    /// contains no matching rows.
    /// </returns>
    private Hashtable GetDailiList()
    {
        const string strUrl = "http://www.xici.net.co";
        try
        {
            // isRetry is passed as true on purpose: the retry path of
            // GetHtmlByUrl calls GetOneDaili(), which calls this method again
            // while the pool is empty, so a failed download would otherwise
            // recurse without end.
            string detailContext = GetHtmlByUrl(strUrl, true);
            if (string.IsNullOrEmpty(detailContext))
            {
                return null;
            }

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(detailContext);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection trlist = doc.DocumentNode.SelectNodes("//table[@id='ip_list']//tr[@class='odd' or @class='']");
            if (trlist == null)
            {
                webframework.common.logclass.Debug("======取代理ip出错====GetDaili==" + "no proxy rows found");
                return null;
            }

            Hashtable result = new Hashtable();
            foreach (HtmlNode item in trlist)
            {
                HtmlNodeCollection cells = item.SelectNodes("td");

                // Skip header or malformed rows instead of discarding the whole list.
                if (cells == null || cells.Count < 6)
                {
                    continue;
                }

                if (!string.Equals(cells[5].InnerText.Trim(), "HTTP", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string strkeyvalue = cells[1].InnerText.Trim() + ":" + cells[2].InnerText.Trim();

                // Indexer assignment tolerates a proxy that is listed twice;
                // Hashtable.Add would throw and abort the whole scan.
                result[strkeyvalue] = strkeyvalue;
            }

            return result;
        }
        catch (Exception ex)
        {
            webframework.common.logclass.Debug("======取代理ip出错====GetDaili==" + ex.Message);
            return null;
        }
    }
    57 
    /// <summary>
    /// Picks a random proxy from the pool and makes it the current proxy.
    /// The proxy being replaced is removed from the pool first, and the pool
    /// is reloaded through GetDailiList() when it is missing or empty.
    /// </summary>
    /// <returns>
    /// The newly selected proxy, or <c>null</c> when no proxy is available or
    /// the selection failed.
    /// </returns>
    private WebProxy GetOneDaili()
    {
        try
        {
            // Pool keys have the form "host:port". Uri.Authority already
            // includes a non-default port, so Host is used to rebuild the key.
            string usedKey = null;
            if (currentdaili != null && currentdaili.Address != null)
            {
                usedKey = currentdaili.Address.Host + ":" + currentdaili.Address.Port;
            }

            // Hashtable.Remove is a no-op when the key is absent.
            if (hash != null && usedKey != null)
            {
                hash.Remove(usedKey);
            }

            if (hash == null || hash.Count == 0)
            {
                hash = GetDailiList();

                // Avoid re-selecting the proxy that is being replaced, unless
                // it is the only entry in the reloaded pool.
                if (hash != null && usedKey != null && hash.Count > 1)
                {
                    hash.Remove(usedKey);
                }
            }

            if (hash == null || hash.Count == 0)
            {
                currentdaili = null;
            }
            else
            {
                // Random.Next(n) yields 0..n-1, which maps directly onto the
                // copied key array, so every entry can be chosen.
                object[] keys = new object[hash.Count];
                hash.Keys.CopyTo(keys, 0);
                Random rd = new Random();
                currentdaili = new WebProxy(keys[rd.Next(keys.Length)].ToString(), true);
            }
        }
        catch (Exception ex)
        {
            webframework.common.logclass.Debug("======从hashtable代理中取任意ip代理出错====GetOneDaili==" + ex.Message);
            currentdaili = null;
        }

        // currentdaili may legitimately be null here, so it must not be dereferenced blindly.
        string selected = currentdaili == null
            ? "null"
            : currentdaili.Address.Host + ":" + currentdaili.Address.Port;
        logclass.Debug("======当前代理======" + selected);
        return currentdaili;
    }
    View Code

    使用:

     /// <summary>
     /// Sends a GET request through a proxy and returns the response body
     /// decoded as UTF-8. A failed first attempt is retried once with a newly
     /// selected proxy.
     /// </summary>
     /// <param name="strUrl">Address to request.</param>
     /// <param name="isRetry">
     /// True when this call is already the retry; a failure is then rethrown
     /// instead of being retried again.
     /// </param>
     /// <param name="daili">
     /// Proxy to use for this request. When null, the current proxy is used,
     /// and it is replaced first if it has exceeded its request limit.
     /// </param>
     /// <returns>The response body text.</returns>
     private string GetHtmlByUrl(string strUrl, bool isRetry = false, WebProxy daili = null)
     {
         currentDailiExecCount++;
         try
         {
             WebProxy proxy = daili;
             if (proxy == null)
             {
                 if (currentDailiExecCount > dailiExecMaxCount)
                 {
                     string exhausted = currentdaili == null
                         ? "null"
                         : currentdaili.Address.Host + ":" + currentdaili.Address.Port;
                     logclass.Debug("======当前代理======" + exhausted + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString() + ")");

                     // Restart the count before rotating: otherwise every later
                     // request (including the proxy-list download triggered by
                     // GetOneDaili) would rotate again.
                     currentDailiExecCount = 0;
                     proxy = GetOneDaili();
                 }
                 else
                 {
                     proxy = currentdaili;
                 }
             }

             webframework.common.HttpHelper helper = new webframework.common.HttpHelper()
             {
                 URL = string.Format("{0}", strUrl),
                 Proxy = proxy,
                 Timeout = 5 * 1000,
             };

             // Dispose the response so the underlying connection is released.
             using (HttpWebResponse response = helper.CreateGetHttpResponse())
             {
                 return response.HttpString(Encoding.UTF8);
             }
         }
         catch (Exception)
         {
             if (isRetry)
             {
                 // Rethrow the original failure so callers see the real cause.
                 throw;
             }

             // Retry once with a different proxy; a new proxy starts a new count.
             currentDailiExecCount = 0;
             return GetHtmlByUrl(strUrl, true, GetOneDaili());
         }
     }
    37 
    38 
    /// <summary>
    /// Sends a POST request through a proxy and returns the response body
    /// decoded as UTF-8. A failed first attempt is retried once with a newly
    /// selected proxy.
    /// </summary>
    /// <param name="strUrl">Address to request.</param>
    /// <param name="strPostString">Request body, sent UTF-8 encoded.</param>
    /// <param name="isRetry">
    /// True when this call is already the retry; a failure is then rethrown
    /// instead of being retried again.
    /// </param>
    /// <param name="daili">
    /// Proxy to use for this request. When null, the current proxy is used,
    /// and it is replaced first if it has exceeded its request limit.
    /// </param>
    /// <returns>The response body text.</returns>
    private string PostHtmlByUrl(string strUrl, string strPostString, bool isRetry = false, WebProxy daili = null)
    {
        currentDailiExecCount++;
        try
        {
            WebProxy proxy = daili;
            if (proxy == null)
            {
                if (currentDailiExecCount > dailiExecMaxCount)
                {
                    string exhausted = currentdaili == null
                        ? "null"
                        : currentdaili.Address.Host + ":" + currentdaili.Address.Port;
                    logclass.Debug("======当前代理======" + exhausted + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString() + ")");

                    // Restart the count before rotating: otherwise every later
                    // request (including the proxy-list download triggered by
                    // GetOneDaili) would rotate again.
                    currentDailiExecCount = 0;
                    proxy = GetOneDaili();
                }
                else
                {
                    proxy = currentdaili;
                }
            }

            HttpHelper helper = new HttpHelper()
            {
                URL = strUrl,
                PostString = strPostString,
                Proxy = proxy,
                PostEncoding = Encoding.UTF8,
                Timeout = 5 * 1000,
            };

            // Dispose the response so the underlying connection is released.
            using (HttpWebResponse response = helper.CreatePostHttpResponse())
            {
                return response.HttpString(Encoding.UTF8);
            }
        }
        catch (Exception)
        {
            if (isRetry)
            {
                // Rethrow the original failure so callers see the real cause.
                throw;
            }

            // Retry once with a different proxy; a new proxy starts a new count.
            currentDailiExecCount = 0;
            return PostHtmlByUrl(strUrl, strPostString, true, GetOneDaili());
        }
    }
    View Code

    参考资料:

    http://www.haolizi.net/example/view_199.html

  • 相关阅读:
    基于C++ Qt实现的红色警戒3修改器
    Java多线程(十五):CountDownLatch,Semaphore,Exchanger,CyclicBarrier,Callable和Future
    Java多线程(十四):Timer
    Java多线程(十三):线程池
    Java多线程(十二):中断机制
    Java多线程(十一):线程组
    Java多线程(十):BlockingQueue实现生产者消费者模型
    德国语言+留学签证递交材料详解(上海)
    Java多线程(九):生产者消费者模型
    Java多线程(八):ReentrantReadWriteLock
  • 原文地址:https://www.cnblogs.com/systemkk/p/4449634.html
Copyright © 2011-2022 走看看