zoukankan      html  css  js  c++  java
  • 使用代理(WebProxy)爬虫

    关键代码:

     // Proxy pool; GetOneDaili() fills it from GetDailiList() when it is null or empty.
     private Hashtable hash;

     // Proxy used for requests that do not pass an explicit one.
     private WebProxy currentdaili;

     // Maximum number of requests to run through a single proxy.
     private int dailiExecMaxCount = 100;

     // Number of requests counted so far (incremented by the request methods).
     private int currentDailiExecCount = 0;

     /// <summary>
     /// Creates the handler and selects the initial proxy.
     /// </summary>
     public Handler2()
     {
         // The pool is not pre-loaded here; GetOneDaili() loads it when it finds it missing.
         currentdaili = GetOneDaili();
     }
    12 
    13 
    // Proxy list source: http://www.xici.net.co
    /// <summary>
    /// Downloads the proxy list page and collects the rows whose protocol cell is "HTTP".
    /// KK 2015-04-22
    /// </summary>
    /// <returns>
    /// A Hashtable whose keys and values are both "ip:port" strings, or null when the page
    /// could not be downloaded, contained no matching rows, or could not be parsed.
    /// </returns>
    private Hashtable GetDailiList()
    {
        Hashtable result = new Hashtable();
        try
        {
            // Requested with isRetry = true on purpose: the retry path of GetHtmlByUrl calls
            // GetOneDaili(), which calls back into this method while the pool is still empty,
            // so a failing list page would otherwise recurse without bound.
            string detailContext = GetHtmlByUrl("http://www.xici.net.co", true);
            if (string.IsNullOrEmpty(detailContext))
            {
                return null;
            }

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(detailContext);

            HtmlNodeCollection trlist = doc.DocumentNode.SelectNodes("//table[@id='ip_list']//tr[@class='odd' or @class='']");
            if (trlist == null)
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                webframework.common.logclass.Debug("======取代理ip出错====GetDaili==no rows matched in table ip_list");
                return null;
            }

            foreach (HtmlNode item in trlist)
            {
                // Query the cells once per row and skip rows that are too short
                // instead of letting one malformed row discard the whole list.
                HtmlNodeCollection cells = item.SelectNodes("td");
                if (cells == null || cells.Count < 6)
                {
                    continue;
                }

                string protocol = cells[5].InnerText.Trim();
                if (!string.Equals(protocol, "HTTP", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Cells 1 and 2 are joined into the "ip:port" key that GetOneDaili() passes to WebProxy.
                string strkeyvalue = cells[1].InnerText.Trim() + ":" + cells[2].InnerText.Trim();

                // Indexer instead of Add: a proxy listed twice must not throw.
                result[strkeyvalue] = strkeyvalue;
            }
        }
        catch (Exception ex)
        {
            webframework.common.logclass.Debug("======取代理ip出错====GetDaili==" + ex.Message);
            result = null;
        }

        return result;
    }
    57 
    // True while GetOneDaili() is downloading the proxy list; see the guard at the top of the method.
    private bool isLoadingDailiList;

    /// <summary>
    /// Picks a random proxy from the pool and makes it the current proxy.
    /// The pool is (re)loaded through GetDailiList() when it is null or empty, and the
    /// proxy that was current on entry is removed from the pool before picking.
    /// </summary>
    /// <returns>
    /// The proxy that is current after the call. Null when the pool could not be loaded or
    /// an error occurred. When removing the previous proxy leaves the pool empty, the
    /// previous proxy is kept and returned.
    /// </returns>
    private WebProxy GetOneDaili()
    {
        if (isLoadingDailiList)
        {
            // Re-entered from a request issued by GetDailiList(): do not start a second
            // download of the list, just hand back whatever proxy is current.
            return currentdaili;
        }

        try
        {
            if (hash == null || hash.Count == 0)
            {
                isLoadingDailiList = true;
                try
                {
                    hash = GetDailiList();
                }
                finally
                {
                    isLoadingDailiList = false;
                }
            }

            if (hash == null)
            {
                webframework.common.logclass.Debug("======从hashtable代理中取任意ip代理出错====GetOneDaili==proxy list is unavailable");
                currentdaili = null;
                return null;
            }

            if (currentdaili != null)
            {
                // Pool keys are "ip:port". Uri.Authority already carries a non-default port,
                // so the key is rebuilt from Host and Port; Remove is a no-op for unknown keys.
                hash.Remove(currentdaili.Address.Host + ":" + currentdaili.Address.Port);
            }

            if (hash.Count > 0)
            {
                // Random.Next(count) is 0-based, so the position counter is 0-based as well.
                int n = new Random().Next(hash.Count);
                int position = 0;
                foreach (DictionaryEntry entry in hash)
                {
                    if (position == n)
                    {
                        currentdaili = new WebProxy(entry.Key.ToString(), true);
                        // The counter tracks the current proxy, so it restarts with a new one.
                        currentDailiExecCount = 0;
                        break;
                    }
                    position++;
                }
            }
        }
        catch (Exception ex)
        {
            webframework.common.logclass.Debug("======从hashtable代理中取任意ip代理出错====GetOneDaili==" + ex.Message);
            currentdaili = null;
        }

        // currentdaili is null after a failure; only log when there is something to report.
        if (currentdaili != null)
        {
            logclass.Debug("======当前代理======" + currentdaili.Address.Host + ":" + currentdaili.Address.Port);
        }
        return currentdaili;
    }
    View Code

    使用:

     /// <summary>
     /// Sends a GET request through a proxy and returns the response read as UTF-8.
     /// A failed request is retried once with a proxy obtained from GetOneDaili().
     /// </summary>
     /// <param name="strUrl">URL to request.</param>
     /// <param name="isRetry">True when this call is already the retry; a failure is then rethrown instead of retried.</param>
     /// <param name="daili">
     /// Proxy for this request. When null, the current proxy is used, or a new one from
     /// GetOneDaili() once the request counter has passed dailiExecMaxCount.
     /// </param>
     /// <returns>The response content.</returns>
     private string GetHtmlByUrl(string strUrl, bool isRetry = false, WebProxy daili = null)
     {
         currentDailiExecCount++;
         bool overLimit = currentDailiExecCount > dailiExecMaxCount;
         if (overLimit && currentdaili != null)
         {
             // Host + Port: Uri.Authority already includes a non-default port.
             logclass.Debug("======当前代理======" + currentdaili.Address.Host + ":" + currentdaili.Address.Port + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString() + ")");
         }

         try
         {
             WebProxy proxy = daili;
             if (proxy == null)
             {
                 if (overLimit)
                 {
                     // Restart the count before rotating so that any request made while
                     // picking the new proxy does not trigger another rotation.
                     currentDailiExecCount = 0;
                     proxy = GetOneDaili();
                 }
                 else
                 {
                     proxy = currentdaili;
                 }
             }

             using (HttpWebResponse response = new webframework.common.HttpHelper()
             {
                 URL = strUrl,
                 Proxy = proxy,
                 Timeout = 5 * 1000,
             }.CreateGetHttpResponse())
             {
                 return response.HttpString(Encoding.UTF8);
             }
         }
         catch (Exception)
         {
             if (isRetry)
             {
                 // Rethrow the real failure; "throw null" would replace it with a NullReferenceException.
                 throw;
             }

             // Retry once with a different proxy.
             return GetHtmlByUrl(strUrl, true, GetOneDaili());
         }
     }
    37 
    38 
    /// <summary>
    /// Sends a POST request through a proxy and returns the response read as UTF-8.
    /// A failed request is retried once with a proxy obtained from GetOneDaili().
    /// </summary>
    /// <param name="strUrl">URL to request.</param>
    /// <param name="strPostString">Value assigned to the helper's PostString; sent with UTF-8 post encoding.</param>
    /// <param name="isRetry">True when this call is already the retry; a failure is then rethrown instead of retried.</param>
    /// <param name="daili">
    /// Proxy for this request. When null, the current proxy is used, or a new one from
    /// GetOneDaili() once the request counter has passed dailiExecMaxCount.
    /// </param>
    /// <returns>The response content.</returns>
    private string PostHtmlByUrl(string strUrl, string strPostString, bool isRetry = false, WebProxy daili = null)
    {
        currentDailiExecCount++;
        bool overLimit = currentDailiExecCount > dailiExecMaxCount;
        if (overLimit && currentdaili != null)
        {
            // Host + Port: Uri.Authority already includes a non-default port.
            logclass.Debug("======当前代理======" + currentdaili.Address.Host + ":" + currentdaili.Address.Port + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString() + ")");
        }

        try
        {
            WebProxy proxy = daili;
            if (proxy == null)
            {
                if (overLimit)
                {
                    // Restart the count before rotating so that any request made while
                    // picking the new proxy does not trigger another rotation.
                    currentDailiExecCount = 0;
                    proxy = GetOneDaili();
                }
                else
                {
                    proxy = currentdaili;
                }
            }

            using (HttpWebResponse response = new HttpHelper()
            {
                URL = strUrl,
                PostString = strPostString,
                Proxy = proxy,
                PostEncoding = Encoding.UTF8,
                Timeout = 5 * 1000,
            }.CreatePostHttpResponse())
            {
                return response.HttpString(Encoding.UTF8);
            }
        }
        catch (Exception)
        {
            if (isRetry)
            {
                // Rethrow the real failure; "throw null" would replace it with a NullReferenceException.
                throw;
            }

            // Retry once with a different proxy.
            return PostHtmlByUrl(strUrl, strPostString, true, GetOneDaili());
        }
    }
    View Code

    参考资料:

    http://www.haolizi.net/example/view_199.html

  • 相关阅读:
    PostgreSQL中的partition-wise join
    Partition-wise join
    外观模式 门面模式 Facade 结构型 设计模式(十三)
    桥接模式 桥梁模式 bridge 结构型 设计模式(十二)
    组合模式 合成模式 COMPOSITE 结构型 设计模式(十一)
    创建型设计模式对比总结 设计模式(八)
    原型模式 prototype 创建型 设计模式(七)
    单例模式 创建型 设计模式(六)
    建造者模式 生成器模式 创建型 设计模式(五)
    抽象工厂模式 创建型 设计模式(四)
  • 原文地址:https://www.cnblogs.com/systemkk/p/4449634.html
Copyright © 2011-2022 走看看