zoukankan      html  css  js  c++  java
  • dotNet使用HttpWebRequest模拟浏览器

    在编写网络爬虫时,HttpWebRequest几乎可以完成绝大多数网站的抓取。为了更好地使用这一技术,我将常用的几个功能进行了封装,以方便调用。这个类已经在多个项目中得到使用,主要解决了Cookies相关的一些问题;如果有其它方面的问题可以提出来,我会进一步完善。

    目前HttpHelper包含了以下几个方面:

    • GetHttpContent:通过Get或Post来获取网页的Html
    • SetCookie:根据response中头部的set-cookie对cookie进行设置,能识别httponly
    • GetAllCookies:将CookieContainer转换为键值对,方便存储和跨程序间调用
    • ConvertToCookieContainer:将键值对转换回CookieContainer供程序调用
    • BuildPostData:通过一个需要post的html构建出postdata

    代码如下:

      1 using System;
      2 using System.Collections.Generic;
      3 using System.Collections.Specialized;
      4 using System.IO;
      5 using System.IO.Compression;
      6 using System.Linq;
      7 using System.Net;
      8 using System.Net.Security;
      9 using System.Security.Cryptography.X509Certificates;
     10 using System.Text;
     11 using System.Text.RegularExpressions;
     12 using System.Collections;
     13 using HtmlAgilityPack;
     14 
     15 namespace TNIdea.Common.Helper
     16 {
     17     public class HttpHelper
     18     {
     /// <summary>
     /// Regular expression that extracts a charset name from an HTML &lt;meta&gt; tag
     /// (<c>charset=...</c>) or from an XML declaration (<c>encoding=...</c>).
     /// The name is captured in the group called "Charset". The value may be unquoted,
     /// double-quoted or single-quoted; it ends at whitespace, a quote or '&gt;'.
     /// </summary>
     public const string CharsetReg = @"(meta.*?charset=[""']?(?<Charset>[^\s""'>]+)[""']?)|(xml.*?encoding=[""']?(?<Charset>[^\s""'>?]+)[""']?)";
     20 
     /// <summary>
     /// Downloads a page with GET (or with POST when <paramref name="postData"/> is not blank)
     /// and returns its body decoded to text.
     /// </summary>
     /// <remarks>
     /// When <paramref name="encode"/> is null the encoding is resolved as follows: the charset
     /// declared by the response is used unless it is blank, "ISO-8859-1" or "zh-cn"; in those
     /// cases the start of the document (up to the closing head tag, at most 10000 bytes) is
     /// searched with <see cref="CharsetReg"/>. GB2312 and finally UTF-8 are the fallbacks.
     /// </remarks>
     /// <param name="url">Address to request.</param>
     /// <param name="postData">Form body to POST; null or blank means a GET request is sent.</param>
     /// <param name="cookies">Cookie container handed to the request.</param>
     /// <param name="userAgent">User-Agent header; the request helpers substitute a default when blank.</param>
     /// <param name="referer">Referer header value.</param>
     /// <param name="cookiesDomain">Retained for backward compatibility; it currently has no effect.</param>
     /// <param name="encode">Encoding used to decode the body; null means auto-detect.</param>
     /// <returns>
     /// The decoded body. If reading or decoding the body fails, the text of that exception is
     /// returned instead; if the request itself fails, an empty string is returned.
     /// </returns>
     public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
     {
         try
         {
             HttpWebResponse httpResponse = string.IsNullOrWhiteSpace(postData)
                 ? CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer)
                 : CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer);

             try
             {
                 return ReadResponseText(httpResponse, encode);
             }
             catch (Exception ex)
             {
                 // Kept from the original contract: a failure while reading the body is
                 // reported through the return value rather than thrown.
                 return ex.ToString();
             }
             finally
             {
                 httpResponse.Close();
             }

             // NOTE(review): the previous version rebuilt a brand-new CookieContainer from the
             // Set-Cookie headers at this point, after the response had been closed. Because
             // `cookies` is passed by value that container was never visible to the caller, and
             // any exception raised while touching the closed response replaced the downloaded
             // content with an empty string. The step was removed; the caller's container is
             // still attached to the request by CreateGetHttpResponse/CreatePostHttpResponse.
         }
         catch
         {
             // Request creation/sending failed (bad URL, timeout, HTTP error status, ...).
             return string.Empty;
         }
     }

     /// <summary>Reads the whole (optionally gzip/deflate-compressed) response body and decodes it.</summary>
     private static string ReadResponseText(HttpWebResponse httpResponse, Encoding encode)
     {
         Stream rawStream = httpResponse.GetResponseStream();
         Stream bodyStream;
         switch ((httpResponse.ContentEncoding ?? string.Empty).ToUpperInvariant())
         {
             case "GZIP":
                 bodyStream = new GZipStream(rawStream, CompressionMode.Decompress);
                 break;
             case "DEFLATE":
                 bodyStream = new DeflateStream(rawStream, CompressionMode.Decompress);
                 break;
             default:
                 bodyStream = rawStream;
                 break;
         }

         byte[] payload;
         using (bodyStream)
         using (MemoryStream buffer = new MemoryStream())
         {
             // Buffer every byte first so that a multi-byte character can never be split
             // between the charset probe and the remainder of the document.
             bodyStream.CopyTo(buffer);
             payload = buffer.ToArray();
         }

         if (encode == null)
             encode = ResolveEncoding(httpResponse.CharacterSet, ExtractHeadProbe(payload));

         return encode.GetString(payload, 0, payload.Length);
     }

     /// <summary>
     /// Returns the start of the document, one char per byte, cut after the first closing
     /// head tag or after 10000 bytes, whichever comes first.
     /// </summary>
     private static string ExtractHeadProbe(byte[] payload)
     {
         const int probeLimit = 10000;
         const string headEnd = "</head>";

         int length = Math.Min(payload.Length, probeLimit);
         StringBuilder probe = new StringBuilder(length);
         for (int i = 0; i < length; i++)
             probe.Append((char)payload[i]);

         string text = probe.ToString();
         int cut = text.IndexOf(headEnd, StringComparison.OrdinalIgnoreCase);
         return cut >= 0 ? text.Substring(0, cut + headEnd.Length) : text;
     }

     /// <summary>Chooses the decoding for a body from the declared charset and the head probe; never returns null.</summary>
     private static Encoding ResolveEncoding(string declaredCharset, string headProbe)
     {
         bool mustSniff = string.IsNullOrWhiteSpace(declaredCharset)
             || string.Equals(declaredCharset, "ISO-8859-1", StringComparison.OrdinalIgnoreCase)
             || string.Equals(declaredCharset, "zh-cn", StringComparison.OrdinalIgnoreCase);

         Encoding resolved = null;
         if (!mustSniff)
             resolved = TryGetEncoding(declaredCharset.Trim(' ', '"', '\''));

         if (resolved == null)
         {
             Match match = Regex.Match(headProbe, CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
             if (match.Success)
                 resolved = TryGetEncoding(match.Groups["Charset"].Value);
         }

         // GB2312 was the original default; UTF-8 covers runtimes where that code page is not available.
         return resolved ?? TryGetEncoding("GB2312") ?? Encoding.UTF8;
     }

     /// <summary>Looks an encoding up by name, returning null instead of throwing when the name is unknown.</summary>
     private static Encoding TryGetEncoding(string name)
     {
         if (string.IsNullOrWhiteSpace(name))
             return null;
         try
         {
             return Encoding.GetEncoding(name);
         }
         catch (ArgumentException)
         {
             return null;
         }
         catch (NotSupportedException)
         {
             return null;
         }
     }
    142 
    143 
    /// <summary>
    /// Sends an HTTP or HTTPS GET request and returns the response.
    /// </summary>
    /// <param name="url">Absolute http:// or https:// address to request.</param>
    /// <param name="timeout">Value assigned to the request's Timeout, in milliseconds.</param>
    /// <param name="userAgent">User-Agent header; a desktop Chrome string is used when blank.</param>
    /// <param name="cookies">Cookie container attached to the request; a new one is created when null.</param>
    /// <param name="referer">Referer header value.</param>
    /// <returns>The response; the caller is responsible for closing it.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="NotSupportedException"><paramref name="url"/> does not produce an HTTP request/response.</exception>
    public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
    {
        if (url == null)
            throw new ArgumentNullException("url");

        if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
        {
            // Installs the process-wide certificate callback. CheckValidationResult accepts a
            // server certificate only when no SSL policy error is reported.
            ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
        }

        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
            throw new NotSupportedException("Only http and https URLs are supported: " + url);

        request.Method = "GET";
        request.Referer = referer;
        request.UserAgent = string.IsNullOrWhiteSpace(userAgent)
            ? "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36"
            : userAgent;
        request.Timeout = timeout;
        request.KeepAlive = true;
        request.AllowAutoRedirect = true;
        request.CookieContainer = cookies ?? new CookieContainer();

        HttpWebResponse response = request.GetResponse() as HttpWebResponse;
        if (response == null)
            throw new NotSupportedException("The server did not return an HTTP response: " + url);
        return response;
    }
    186 
    /// <summary>
    /// Sends an HTTP or HTTPS POST request with a form-url-encoded body and returns the response.
    /// </summary>
    /// <param name="url">Absolute http:// or https:// address to request.</param>
    /// <param name="postData">Body to send, written as UTF-8; nothing is written when it is null or blank.</param>
    /// <param name="timeout">Value assigned to the request's Timeout, in milliseconds.</param>
    /// <param name="userAgent">User-Agent header; a desktop Chrome string is used when blank.</param>
    /// <param name="cookies">Cookie container attached to the request; a new one is created when null.</param>
    /// <param name="referer">Referer header value.</param>
    /// <returns>The response; the caller is responsible for closing it.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="NotSupportedException"><paramref name="url"/> does not produce an HTTP request/response.</exception>
    public static HttpWebResponse CreatePostHttpResponse(string url, string postData, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
    {
        if (url == null)
            throw new ArgumentNullException("url");

        if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
        {
            // Installs the process-wide certificate callback. CheckValidationResult accepts a
            // server certificate only when no SSL policy error is reported.
            ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
        }

        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
            throw new NotSupportedException("Only http and https URLs are supported: " + url);

        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.Referer = referer;
        request.UserAgent = string.IsNullOrWhiteSpace(userAgent)
            ? "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
            : userAgent;
        request.Timeout = timeout;
        request.KeepAlive = true;
        request.AllowAutoRedirect = true;
        request.CookieContainer = cookies ?? new CookieContainer();

        if (!string.IsNullOrWhiteSpace(postData))
        {
            byte[] payload = Encoding.UTF8.GetBytes(postData);
            request.ContentLength = payload.Length;
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(payload, 0, payload.Length);
            }
        }

        HttpWebResponse response = request.GetResponse() as HttpWebResponse;
        if (response == null)
            throw new NotSupportedException("The server did not return an HTTP response: " + url);
        return response;
    }
    241 
    /// <summary>
    /// Certificate validation callback: accepts the server certificate only when no
    /// SSL policy error was reported.
    /// </summary>
    /// <param name="sender">Not used.</param>
    /// <param name="certificate">Not used.</param>
    /// <param name="chain">Not used.</param>
    /// <param name="errors">Policy errors reported for the certificate.</param>
    /// <returns>True when <paramref name="errors"/> is <see cref="SslPolicyErrors.None"/>; otherwise false.</returns>
    private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
    {
        bool accepted = errors == SslPolicyErrors.None;
        return accepted;
    }
    256 
    /// <summary>
    /// Builds cookies from the Set-Cookie headers of a response, recognising the
    /// Path, Domain, Expires, HttpOnly and Secure attributes.
    /// </summary>
    /// <param name="response">Response whose Set-Cookie headers are parsed.</param>
    /// <param name="defaultDomain">Domain given to cookies that do not declare one.</param>
    /// <returns>
    /// The parsed cookies, or null when the response has no Set-Cookie header or the
    /// headers cannot be read. Headers without a valid name=value pair are skipped.
    /// </returns>
    private static CookieCollection SetCookie(HttpWebResponse response, string defaultDomain)
    {
        try
        {
            string[] headerPieces = response.Headers.GetValues("Set-Cookie");
            if (headerPieces == null || headerPieces.Length == 0)
                return null;

            CookieCollection cookies = new CookieCollection();
            foreach (string header in RejoinSplitExpires(headerPieces))
            {
                Cookie parsed = ParseSetCookie(header, defaultDomain);
                if (parsed != null)
                    cookies.Add(parsed);
            }
            return cookies;
        }
        catch (Exception)
        {
            // Deliberately best-effort: null tells the caller that no cookies could be parsed.
            return null;
        }
    }

    /// <summary>
    /// Re-joins header values that were split at the comma inside an "expires=Day, dd-Mon-yyyy ..." date.
    /// </summary>
    private static List<string> RejoinSplitExpires(string[] pieces)
    {
        List<string> headers = new List<string>(pieces.Length);
        foreach (string piece in pieces)
        {
            int last = headers.Count - 1;
            // A piece that ends with "expires=<weekday>" lost the rest of its date to the next piece.
            if (last >= 0 && Regex.IsMatch(headers[last], @"expires\s*=\s*[A-Za-z]+\s*$", RegexOptions.IgnoreCase))
                headers[last] = headers[last] + ", " + piece;
            else
                headers.Add(piece);
        }
        return headers;
    }

    /// <summary>Parses one Set-Cookie header value; returns null when it has no usable name=value pair.</summary>
    private static Cookie ParseSetCookie(string header, string defaultDomain)
    {
        string[] segments = header.Split(';');

        // The first segment is always the cookie itself; everything after it is an attribute.
        int nameEnd = segments[0].IndexOf('=');
        if (nameEnd <= 0)
            return null;

        Cookie cookie;
        try
        {
            cookie = new Cookie(segments[0].Substring(0, nameEnd).Trim(), segments[0].Substring(nameEnd + 1).Trim());
        }
        catch (CookieException)
        {
            // Name rejected by System.Net.Cookie; skip only this cookie.
            return null;
        }

        for (int i = 1; i < segments.Length; i++)
        {
            string segment = segments[i].Trim();
            if (segment.Length == 0)
                continue;

            int separator = segment.IndexOf('=');
            string attribute = (separator < 0 ? segment : segment.Substring(0, separator)).Trim().ToLowerInvariant();
            string attributeValue = separator < 0 ? string.Empty : segment.Substring(separator + 1).Trim();

            switch (attribute)
            {
                case "path":
                    cookie.Path = attributeValue;
                    break;
                case "domain":
                    cookie.Domain = attributeValue;
                    break;
                case "expires":
                    DateTime expires;
                    // An unparsable date leaves Expires unset instead of discarding every cookie.
                    if (DateTime.TryParse(attributeValue, System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out expires))
                        cookie.Expires = expires;
                    break;
                case "httponly":
                    cookie.HttpOnly = true;
                    break;
                case "secure":
                    cookie.Secure = true;
                    break;
                // Any other attribute (Max-Age, SameSite, Version, ...) is ignored so that it
                // can no longer overwrite the cookie's name and value.
            }
        }

        if (string.IsNullOrEmpty(cookie.Domain))
            cookie.Domain = defaultDomain;
        return cookie;
    }
    337 
    /// <summary>
    /// Flattens a CookieContainer into a dictionary with one entry per key of the container's
    /// internal domain table; each value is that entry's cookies written as "name=value"
    /// pairs separated by ';'.
    /// </summary>
    /// <remarks>
    /// The private fields "m_domainTable" and "m_list" are read through reflection.
    /// </remarks>
    /// <param name="cookieContainer">Container to enumerate.</param>
    /// <returns>Dictionary of domain-table key to "name=value;name=value" text.</returns>
    public static Dictionary<string, string> GetAllCookies(CookieContainer cookieContainer)
    {
        const System.Reflection.BindingFlags privateInstanceField =
            System.Reflection.BindingFlags.NonPublic
            | System.Reflection.BindingFlags.GetField
            | System.Reflection.BindingFlags.Instance;

        Dictionary<string, string> result = new Dictionary<string, string>();

        Type containerType = cookieContainer.GetType();
        Hashtable domainTable = (Hashtable)containerType.InvokeMember(
            "m_domainTable", privateInstanceField, null, cookieContainer, new object[] { });

        foreach (string domainKey in domainTable.Keys)
        {
            object pathList = domainTable[domainKey];
            SortedList collectionsByPath = (SortedList)pathList.GetType().InvokeMember(
                "m_list", privateInstanceField, null, domainTable[domainKey], new object[] { });

            StringBuilder pairs = new StringBuilder();
            foreach (CookieCollection bucket in collectionsByPath.Values)
            {
                foreach (Cookie item in bucket)
                {
                    pairs.Append(item.Name).Append("=").Append(item.Value).Append(";");
                }
            }

            string joined = pairs.ToString().TrimEnd(';');
            result.Add(domainKey, joined);
        }

        return result;
    }
    365 
    /// <summary>
    /// Rebuilds a CookieContainer from a dictionary of domain to "name=value;name=value" text.
    /// </summary>
    /// <remarks>
    /// Every cookie is created with path "/" and the dictionary key as its domain. Segments
    /// that are empty or have no name before '=' are skipped. Names are trimmed, so text
    /// written as "a=1; b=2" is accepted; values are kept exactly as written.
    /// </remarks>
    /// <param name="cookies">Domain to cookie-text map.</param>
    /// <returns>A new container holding the parsed cookies.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="cookies"/> is null.</exception>
    public static CookieContainer ConvertToCookieContainer(Dictionary<string, string> cookies)
    {
        if (cookies == null)
            throw new ArgumentNullException("cookies");

        CookieContainer cookieContainer = new CookieContainer();

        foreach (KeyValuePair<string, string> domainEntry in cookies)
        {
            if (string.IsNullOrEmpty(domainEntry.Value))
                continue;

            foreach (string pair in domainEntry.Value.Split(';'))
            {
                // Split on the first '=' only, so values that contain '=' stay intact.
                int separator = pair.IndexOf('=');
                if (separator <= 0)
                    continue;

                string name = pair.Substring(0, separator).Trim();
                if (name.Length == 0)
                    continue;

                string value = pair.Substring(separator + 1);
                cookieContainer.Add(new Cookie(name, value, "/", domainEntry.Key));
            }
        }

        return cookieContainer;
    }
    398 
    /// <summary>
    /// Builds an application/x-www-form-urlencoded body from the input elements of the
    /// first form found in an HTML document.
    /// </summary>
    /// <remarks>
    /// Only inputs that have both a non-empty "name" and a "value" attribute are included.
    /// Names and values are HTML-decoded and then URL-encoded.
    /// </remarks>
    /// <param name="htmlContent">HTML of the page that contains the form.</param>
    /// <returns>
    /// "name=value" pairs joined by '&amp;'; an empty string when the HTML is blank or has
    /// no form or no usable inputs.
    /// </returns>
    public static string BuildPostData(string htmlContent)
    {
        if (string.IsNullOrEmpty(htmlContent))
            return string.Empty;

        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlContent);

        HtmlNode form = htmlDoc.DocumentNode.SelectSingleNode("//form");
        if (form == null)
            return string.Empty;

        // Prefer the inputs that belong to this form.
        // NOTE(review): presumably some HtmlAgilityPack configurations parse <form> without
        // children, leaving its inputs as siblings; the document-wide query used by the
        // original is kept as the fallback for that case - confirm against the HAP version in use.
        HtmlNodeCollection inputs = form.SelectNodes(".//input") ?? htmlDoc.DocumentNode.SelectNodes("//input");
        if (inputs == null)
            return string.Empty;

        List<string> pairs = new List<string>();
        foreach (HtmlNode input in inputs)
        {
            HtmlAttribute name = input.Attributes["name"];
            HtmlAttribute value = input.Attributes["value"];
            if (name == null || value == null || string.IsNullOrEmpty(name.Value))
                continue;

            pairs.Add(
                WebUtility.UrlEncode(WebUtility.HtmlDecode(name.Value))
                + "="
                + WebUtility.UrlEncode(WebUtility.HtmlDecode(value.Value)));
        }

        return string.Join("&", pairs.ToArray());
    }
    416     }
    417 }

    部分网站需要登录的问题我已经着手通过另一个项目来解决(imitate-login),目前还有许多网页使用了JavaScript或各种基于JS的框架来对网页进行数据加载,如何来模拟执行JavaScript暂时还没找到比较优美的解决方案,如果大家有什么好的方案可以发给我,谢谢!

     未经授权,拒绝任何全文及摘要转载!

  • 相关阅读:
    hmac模块和hashlib模块
    logging模块
    sys模块
    datetime模块
    time模块
    例题:100节楼梯,0-49节,分数等于节数。50节(包括50节)以后每节10分。输入节数,得出分数。这个题如果按照讲页来做是错误的,所以再写一遍,请大家指导
    C# .ToString() 格式化
    例题:判断平年还是闰年。理解使用异常语句try catch finally 和datetime 时间类
    SQL Server第一堂课:创建数据库,创建表,以及表中最基本的增,删,改
    例题:输入学生的各项资料,然后根据学生的分数,重新排序。重新复习结构体,集合,数组,for循环,冒泡排序,水平符的使用。
  • 原文地址:https://www.cnblogs.com/NewIdea/p/http-helper-at-csharp.html
Copyright © 2011-2022 走看看