zoukankan      html  css  js  c++  java
  • 网络蜘蛛(网络爬虫)核心C#源代码

    网络蜘蛛或爬虫需要能够下载网页、图片(流)以及登录的Cookies等信息,以下的C#代码是比较实用的核心程序。
    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.IO;
    using System.IO.Compression;
    using System.Xml;
    using System.Web;
    using System.Collections;
    using System.Runtime.InteropServices;
    using System.Net;
    using System.Net.Security;
    using System.Security.Authentication;
    using System.Security.Cryptography.X509Certificates;
    namespace Common
    {
    /// <summary>
    /// Callback signature for the "request is ready" hook: receives the fully configured
    /// request so the caller can inspect or adjust it before it is sent.
    /// </summary>
    /// <param name="httpRequest">The request that has just been configured.</param>
    public delegate void OnGetPostReady(HttpWebRequest httpRequest);
    /// <summary>
    /// Callback signature for the "response is ready" hook.
    /// NOTE(review): no member visible in this file is declared with this delegate type;
    /// HttpWebHelper.OnGetResponseReadyHandler is declared as OnGetPostReady instead.
    /// </summary>
    /// <param name="httpRequest">The request whose response has been received.</param>
    public delegate void OnGetResponseReady(HttpWebRequest httpRequest);
    public class HttpWebHelper
    {
    // Request/response pair of the most recent call; both are overwritten by every request.
    protected HttpWebRequest httpRequest;
    protected HttpWebResponse httpResponse;

    // Cookie container attached to every request issued through this helper.
    protected CookieContainer cookieContainer;

    // Credentials sent on https requests in certificated mode; only the
    // uri/method/username/password constructor populates it.
    protected CredentialCache credentialCache;

    // When true, https requests get the accept-all certificate callback, credentials
    // and (if configured) the client certificate. Defaults to false.
    protected bool certificatedMode;

    // Path of the client certificate file; empty means no client certificate is attached.
    protected string certFilepath = string.Empty;

    // Invoked at the end of SetHttpRequestOptions, before the request is sent.
    public OnGetPostReady OnGetPostReadyHandler;

    // Invoked by GetMemoryStream after the response has been received.
    // NOTE(review): typed as OnGetPostReady although an OnGetResponseReady delegate with the
    // same signature exists; left unchanged so existing assignments keep compiling.
    public OnGetPostReady OnGetResponseReadyHandler;

    // Chunk size, in bytes, used when copying the response body.
    protected readonly int DEFAULT_BUFFER_SIZE = 4096;

    // Optional proxy applied to every request; null leaves the request's proxy untouched.
    public WebProxy webProxySrv;

    // Connection limit applied to ServicePointManager and to each request's ServicePoint.
    private static readonly int MyConnectionLimit = 300;
    /// <summary>
    /// Caller-controlled flag; AbortHttpRequest returns true only when both this
    /// and <see cref="DoBetIsGotoRecv"/> are true.
    /// </summary>
    public bool CheckGotoRecv { get; set; }

    /// <summary>
    /// Set to true by GetMemoryStream once a response has been received.
    /// This class never resets it to false; callers may.
    /// </summary>
    public bool DoBetIsGotoRecv { get; set; }

    /// <summary>
    /// Set to true when a request or one of the callbacks throws.
    /// NOTE(review): nothing in this class resets it to false, so it stays true after the first failure.
    /// </summary>
    public bool LastAccessError { get; private set; }

    /// <summary>
    /// URL that answered the most recent request (after automatic redirects).
    /// </summary>
    public string CurrentUrl { get; private set; }

    /// <summary>
    /// Value of the "Location" header of the most recent response, or null when absent.
    /// </summary>
    public string CurrentLocation { get; private set; }

    /// <summary>
    /// Raw "Set-Cookie" header of the most recent response that carried one.
    /// </summary>
    public string CurSetCookie { get; set; }

    /// <summary>
    /// Free slot for callers; no code visible in this file writes it.
    /// </summary>
    public string CurSetCookie2 { get; set; }
    /// <summary>
    /// Creates a helper with an empty cookie container and applies the
    /// ServicePointManager settings this class relies on.
    /// </summary>
    public HttpWebHelper()
    {
        // ServicePointManager members are static: these settings affect every
        // HTTP request in the process, not only this instance.
        ServicePointManager.MaxServicePointIdleTime = 10000; // milliseconds
        ServicePointManager.Expect100Continue = false;
        ServicePointManager.DefaultConnectionLimit = MyConnectionLimit;

        cookieContainer = new CookieContainer();
    }
    /// <summary>
    /// Creates a helper whose requests are routed through the given proxy.
    /// </summary>
    /// <param name="wp">Proxy to use; null means the request's proxy is left untouched.</param>
    public HttpWebHelper(WebProxy wp)
        : this()
    {
        webProxySrv = wp;
    }
    /// <summary>
    /// Creates a helper with certificated mode switched on or off.
    /// In certificated mode, https requests accept any server certificate and send credentials.
    /// </summary>
    /// <param name="cred">Value stored in certificatedMode.</param>
    public HttpWebHelper(bool cred)
        : this()
    {
        certificatedMode = cred;
    }

    /// <summary>
    /// Creates a helper with certificated mode switched on or off and a proxy.
    /// </summary>
    /// <param name="cred">Value stored in certificatedMode.</param>
    /// <param name="wp">Proxy to use; null means the request's proxy is left untouched.</param>
    public HttpWebHelper(bool cred, WebProxy wp)
        : this(cred)
    {
        webProxySrv = wp;
    }
    /// <summary>
    /// Creates a helper with certificated mode and a client certificate file.
    /// The certificate is attached to https requests only while certificated mode is on.
    /// </summary>
    /// <param name="cred">Value stored in certificatedMode.</param>
    /// <param name="certFilepath">Path of the client certificate file.</param>
    public HttpWebHelper(bool cred, string certFilepath)
        : this(cred)
    {
        this.certFilepath = certFilepath;
    }

    /// <summary>
    /// Creates a helper with certificated mode, a proxy and a client certificate file.
    /// </summary>
    /// <param name="cred">Value stored in certificatedMode.</param>
    /// <param name="wp">Proxy to use; null means the request's proxy is left untouched.</param>
    /// <param name="certFilepath">Path of the client certificate file.</param>
    public HttpWebHelper(bool cred, WebProxy wp, string certFilepath)
        : this(cred, wp)
    {
        this.certFilepath = certFilepath;
    }
    /// <summary>
    /// Creates a helper in certificated mode with one username/password credential.
    /// </summary>
    /// <param name="uri">Absolute URI prefix the credential applies to.</param>
    /// <param name="method">
    /// Passed as the authentication-type argument of CredentialCache.Add
    /// (for example "Basic" or "Digest"), not an HTTP verb.
    /// </param>
    /// <param name="username">User name of the credential.</param>
    /// <param name="password">Password of the credential.</param>
    public HttpWebHelper(string uri, string method, string username, string password)
        : this(true)
    {
        credentialCache = new CredentialCache();

        Uri prefix = new Uri(uri);
        NetworkCredential login = new NetworkCredential(username, password);
        credentialCache.Add(prefix, method, login);
    }
    /// <summary>
    /// Server-certificate validation callback that accepts every certificate.
    /// SECURITY: this disables TLS server authentication (self-signed, expired and
    /// mismatched certificates all pass). Use only against hosts you already trust.
    /// </summary>
    /// <param name="sender">Ignored.</param>
    /// <param name="certificate">Ignored.</param>
    /// <param name="chain">Ignored.</param>
    /// <param name="errors">Ignored; policy errors do not cause rejection.</param>
    /// <returns>Always true.</returns>
    public bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
    {
        const bool acceptUnconditionally = true;
        return acceptUnconditionally;
    }
    /// <summary>
    /// Builds the request via SetHttpRequestOptions and then sets its Accept header.
    /// The Accept header is applied after the "request ready" callback has already run.
    /// </summary>
    private void SetHttpRequestOptions_Accept(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm, string httpAccept)
    {
        SetHttpRequestOptions(url, method, cc, referUrl, nocache, dm);
        httpRequest.Accept = httpAccept;
    }
    /// <summary>
    /// Creates a new HttpWebRequest for <paramref name="url"/>, stores it in httpRequest and
    /// configures proxy, TLS/credentials, cookies, headers and timeout. Finally invokes
    /// OnGetPostReadyHandler (if set) with the configured request.
    /// </summary>
    /// <param name="url">Absolute http/https URL.</param>
    /// <param name="method">HTTP verb assigned to the request.</param>
    /// <param name="cc">Extra cookies merged into the shared cookie container; may be null.</param>
    /// <param name="referUrl">Referer header value; skipped when null or empty.</param>
    /// <param name="nocache">When true, adds "Cache-Control: no-cache".</param>
    /// <param name="dm">Automatic decompression methods for the response.</param>
    private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        this.httpRequest = request;

        request.UnsafeAuthenticatedConnectionSharing = true;
        request.ServicePoint.ConnectionLimit = MyConnectionLimit;
        if (this.webProxySrv != null)
        {
            request.Proxy = this.webProxySrv;
        }

        if (this.certificatedMode)
        {
            string schemePrefix = url.ToLower().Substring(0, 5);
            if (schemePrefix.Equals("https"))
            {
                // SECURITY: this callback is process-wide and accepts every server certificate.
                ServicePointManager.ServerCertificateValidationCallback = new System.Net.Security.RemoteCertificateValidationCallback(CheckValidationResult);

                if (this.credentialCache != null)
                {
                    request.Credentials = this.credentialCache;
                }
                else
                {
                    request.UseDefaultCredentials = true;
                }

                if (!string.IsNullOrEmpty(this.certFilepath))
                {
                    X509Certificate clientCert = X509Certificate.CreateFromCertFile(this.certFilepath);
                    request.ClientCertificates.Add(clientCert);
                }
            }
        }

        request.CookieContainer = this.cookieContainer;
        if (!string.IsNullOrEmpty(referUrl))
        {
            request.Referer = referUrl;
        }
        request.AutomaticDecompression = dm;

        // ServicePoint is read again here on purpose: it is looked up after the proxy was assigned.
        request.ServicePoint.Expect100Continue = false;
        request.ServicePoint.UseNagleAlgorithm = false;

        // AllowWriteStreamBuffering, AllowAutoRedirect and MaximumAutomaticRedirections
        // are intentionally left at their framework defaults; ReadWriteTimeout is not set.
        request.ContentType = "application/x-www-form-urlencoded";
        request.Method = method;
        request.Timeout = ApplicationConfig.HTTP_REQUEST_TIMEOUT;
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
        request.Headers.Add("Accept-Language", "zh-cn");
        request.Headers.Add("UA-CPU", "x86");
        if (nocache)
        {
            request.Headers.Add("Cache-Control", "no-cache");
        }

        if (cc != null)
        {
            request.CookieContainer.Add(cc);
        }

        // Give the caller a last chance to adjust the request before it is sent.
        if (null == this.OnGetPostReadyHandler)
        {
            return;
        }
        try
        {
            this.OnGetPostReadyHandler(request);
        }
        catch (System.Exception ex)
        {
            // A failing callback is recorded and logged, but does not stop the request.
            this.LastAccessError = true;
            BaseDebug.DebugPrint(ex.ToString());
        }
    }
    /// <summary>
    /// Convenience overload: caching allowed, gzip and deflate decompression enabled,
    /// and the given Accept header.
    /// </summary>
    private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
    {
        const DecompressionMethods gzipAndDeflate = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        SetHttpRequestOptions_Accept(url, method, cc, referUrl, false, gzipAndDeflate, httpAccept);
    }
    /// <summary>
    /// Refreshes the cached cookie container, the effective URL and the Location header
    /// from the request/response pair of the call that has just completed.
    /// Must only be called after httpResponse has been assigned.
    /// </summary>
    private void ManualResetMember()
    {
        this.cookieContainer = httpRequest.CookieContainer;
        // Address is the URI that actually responded, i.e. the target after automatic redirects.
        this.CurrentUrl = httpRequest.Address.OriginalString;
        // Null when the response carries no Location header.
        this.CurrentLocation = httpResponse.Headers["Location"];
    }
    /// <summary>
    /// Sends a body-less request and returns the whole response body as a MemoryStream
    /// positioned at the start. The caller owns the returned stream.
    /// </summary>
    /// <param name="url">Absolute http/https URL.</param>
    /// <param name="method">HTTP verb.</param>
    /// <param name="cc">Extra cookies merged into the shared cookie container; may be null.</param>
    /// <param name="referUrl">Referer header value; may be null.</param>
    /// <param name="httpAccept">Accept header value sent with the request.</param>
    /// <returns>
    /// The response body. On any failure the exception is logged, LastAccessError is set,
    /// the request is aborted, and whatever was read so far (possibly nothing) is returned
    /// without rewinding the stream.
    /// </returns>
    public MemoryStream GetMemoryStream(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
    {
        MemoryStream ms = new MemoryStream();
        try
        {
            this.SetHttpRequestOptions(url, method, cc, referUrl, "*/*");
            // The placeholder Accept value above is replaced by the caller's value.
            this.httpRequest.Accept = httpAccept;
            this.httpResponse = (HttpWebResponse)httpRequest.GetResponse();
            try
            {
                // Was a response actually received?
                if (!this.httpRequest.HaveResponse)
                {
                    this.httpResponse.Close();
                    this.httpRequest.Abort();
                    return ms;
                }
                this.ManualResetMember();
                if (null != this.OnGetResponseReadyHandler)
                {
                    try
                    {
                        this.OnGetResponseReadyHandler(this.httpRequest);
                    }
                    catch (System.Exception ex)
                    {
                        // A failing callback is recorded and logged, but the body is still read.
                        this.LastAccessError = true;
                        BaseDebug.DebugPrint(ex.ToString());
                    }
                }
                this.DoBetIsGotoRecv = true;
                // using guarantees the network stream is released even when a read throws.
                using (Stream sm = httpResponse.GetResponseStream())
                {
                    if (null != sm && sm.CanRead)
                    {
                        // One reusable buffer instead of a fresh array per chunk.
                        byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
                        int read;
                        while ((read = sm.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            ms.Write(buffer, 0, read);
                        }
                    }
                }
                string setCookie = httpResponse.Headers["Set-Cookie"];
                if (setCookie != null)
                {
                    this.CurSetCookie = setCookie;
                }
            }
            finally
            {
                // Always release the connection, including on the error path.
                httpResponse.Close();
            }
            // Essential: rewind so the caller reads from the beginning.
            ms.Seek(0, SeekOrigin.Begin);
        }
        catch (System.Exception ex)
        {
            this.LastAccessError = true;
            BaseDebug.DebugPrint("異常網址:" + url);
            BaseDebug.DebugPrint(ex.ToString());
            if (null != httpRequest) httpRequest.Abort();
        }
        return ms;
    }
    /// <summary>
    /// Fetches <paramref name="url"/> with Accept "text/html", no extra cookies and no referer.
    /// </summary>
    public MemoryStream SimpleGetMemoryStream(string url, string method)
    {
        return SimpleGetMemoryStream(url, method, "text/html");
    }

    /// <summary>
    /// Fetches <paramref name="url"/> with the given Accept header, no extra cookies and no referer.
    /// </summary>
    public MemoryStream SimpleGetMemoryStream(string url, string method, string httpAccept)
    {
        CookieCollection noCookies = null;
        string noReferer = null;
        return GetMemoryStream(url, method, noCookies, noReferer, httpAccept);
    }
    /// <summary>
    /// Sends a body-less request (Accept "text/html") and returns the response body as text.
    /// </summary>
    /// <param name="url">Absolute http/https URL.</param>
    /// <param name="method">HTTP verb.</param>
    /// <param name="coding">Encoding used to decode the response; null uses the StreamReader default.</param>
    /// <param name="cc">Extra cookies; may be null.</param>
    /// <param name="referUrl">Referer header value; may be null.</param>
    /// <returns>The decoded response text; empty when the request failed.</returns>
    public string SimpleDoPostWrapper(string url, string method, Encoding coding, CookieCollection cc, string referUrl)
    {
        MemoryStream body = this.GetMemoryStream(url, method, cc, referUrl, "text/html");
        StreamReader reader = (coding != null)
            ? new StreamReader(body, coding)
            : new StreamReader(body);

        string text = reader.ReadToEnd();
        reader.Close();
        body.Close();
        return text;
    }
    /// <summary>Body-less request with default decoding, no extra cookies and no referer.</summary>
    public string SimpleDoPostWrapper(string url, string method)
    {
        return SimpleDoPostWrapper(url, method, (Encoding)null, (CookieCollection)null, (string)null);
    }

    /// <summary>Body-less request with default decoding and extra cookies.</summary>
    public string SimpleDoPostWrapper(string url, string method, CookieCollection cc)
    {
        return SimpleDoPostWrapper(url, method, (Encoding)null, cc, (string)null);
    }

    /// <summary>Body-less request with default decoding and a referer.</summary>
    public string SimpleDoPostWrapper(string url, string method, string referUrl)
    {
        return SimpleDoPostWrapper(url, method, (Encoding)null, (CookieCollection)null, referUrl);
    }
    /// <summary>
    /// Sends <paramref name="data"/> as the request body and returns the whole response body
    /// as a MemoryStream positioned at the start. The caller owns the returned stream.
    /// </summary>
    /// <param name="url">Absolute http/https URL.</param>
    /// <param name="data">Request body text (typically url-encoded form data).</param>
    /// <param name="method">HTTP verb; must be one that allows a request body.</param>
    /// <param name="coding">Encoding used to turn <paramref name="data"/> into bytes; must not be null.</param>
    /// <param name="cc">Extra cookies merged into the shared cookie container; may be null.</param>
    /// <param name="referUrl">Referer header value; may be null.</param>
    /// <returns>
    /// The response body. On any failure (including a null <paramref name="coding"/> or
    /// <paramref name="data"/>) the exception is logged, LastAccessError is set, the request is
    /// aborted, and whatever was read so far (possibly nothing) is returned without rewinding.
    /// </returns>
    public MemoryStream GetMemoryStream(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
    {
        MemoryStream ms = new MemoryStream();
        try
        {
            this.SetHttpRequestOptions(url, method, cc, referUrl, "text/html");
            byte[] bytesData = coding.GetBytes(data);
            // using guarantees the upload stream is released even when the write throws.
            using (Stream requestStream = httpRequest.GetRequestStream())
            {
                requestStream.Write(bytesData, 0, bytesData.Length);
                requestStream.Flush();
            }
            this.httpResponse = (HttpWebResponse)httpRequest.GetResponse();
            try
            {
                // Was a response actually received?
                if (!this.httpRequest.HaveResponse)
                {
                    this.httpResponse.Close();
                    this.httpRequest.Abort();
                    return ms;
                }
                this.ManualResetMember();
                if (null != this.OnGetResponseReadyHandler)
                {
                    try
                    {
                        this.OnGetResponseReadyHandler(this.httpRequest);
                    }
                    catch (System.Exception ex)
                    {
                        // A failing callback is recorded and logged, but the body is still read.
                        this.LastAccessError = true;
                        BaseDebug.DebugPrint(ex.ToString());
                    }
                }
                this.DoBetIsGotoRecv = true;
                using (Stream sm = httpResponse.GetResponseStream())
                {
                    if (null != sm && sm.CanRead)
                    {
                        // One reusable buffer instead of a fresh array per chunk.
                        byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
                        int read;
                        while ((read = sm.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            ms.Write(buffer, 0, read);
                        }
                    }
                }
                string setCookie = httpResponse.Headers["Set-Cookie"];
                if (setCookie != null)
                {
                    this.CurSetCookie = setCookie;
                }
            }
            finally
            {
                // Always release the connection, including on the error path.
                httpResponse.Close();
            }
            // Essential: rewind so the caller reads from the beginning.
            ms.Seek(0, SeekOrigin.Begin);
        }
        catch (System.Exception ex)
        {
            this.LastAccessError = true;
            BaseDebug.DebugPrint("異常網址:" + url);
            BaseDebug.DebugPrint(ex.ToString());
            if (null != httpRequest) httpRequest.Abort();
        }
        return ms;
    }
    /// <summary>Uploads <paramref name="data"/> with no extra cookies and no referer.</summary>
    public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding)
    {
        return SimpleGetMemoryStream(url, data, method, coding, (string)null);
    }

    /// <summary>Uploads <paramref name="data"/> with a referer and no extra cookies.</summary>
    public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding, string referUrl)
    {
        CookieCollection noCookies = null;
        return GetMemoryStream(url, data, method, coding, noCookies, referUrl);
    }
    /// <summary>
    /// Uploads <paramref name="data"/> and returns the response body as text.
    /// </summary>
    /// <param name="url">Absolute http/https URL.</param>
    /// <param name="data">Request body text.</param>
    /// <param name="method">HTTP verb.</param>
    /// <param name="coding">Encoding used for the request body.</param>
    /// <param name="cc">Extra cookies; may be null.</param>
    /// <param name="referUrl">Referer header value; may be null.</param>
    /// <returns>The response text; empty when the request failed.</returns>
    public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
    {
        MemoryStream body = this.GetMemoryStream(url, data, method, coding, cc, referUrl);

        // NOTE(review): the response is decoded with the StreamReader default encoding, not with
        // 'coding' (unlike SimpleDoPostWrapper) - confirm this is intended for non-UTF-8 sites.
        StreamReader reader = new StreamReader(body);
        string text = reader.ReadToEnd();

        reader.Close();
        body.Close();
        return text;
    }
    /// <summary>Uploads <paramref name="data"/> with no extra cookies and no referer.</summary>
    public string DoPostWrapper(string url, string data, string method, Encoding coding)
    {
        return DoPostWrapper(url, data, method, coding, (CookieCollection)null, (string)null);
    }

    /// <summary>Uploads <paramref name="data"/> with extra cookies and no referer.</summary>
    public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc)
    {
        return DoPostWrapper(url, data, method, coding, cc, (string)null);
    }

    /// <summary>Uploads <paramref name="data"/> with a referer and no extra cookies.</summary>
    public string DoPostWrapper(string url, string data, string method, Encoding coding, string referUrl)
    {
        return DoPostWrapper(url, data, method, coding, (CookieCollection)null, referUrl);
    }
    /// <summary>
    /// Uploads the url-encoded form built from <paramref name="dicArguments"/> and returns
    /// the response body as text.
    /// </summary>
    /// <param name="url">Absolute http/https URL.</param>
    /// <param name="dicArguments">Form fields (name to value).</param>
    /// <param name="method">HTTP verb.</param>
    /// <param name="coding">Encoding used to turn the built form string into bytes.</param>
    /// <param name="cc">Extra cookies; may be null.</param>
    /// <param name="referUrl">Referer header value; may be null.</param>
    /// <returns>The response text; empty when the request failed.</returns>
    public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc, string referUrl)
    {
        // NOTE(review): the form is percent-encoded with the default encoding of
        // BuildRequestArguments(dictionary); 'coding' is not forwarded to it - confirm intended.
        return this.DoPostWrapper(url, this.BuildRequestArguments(dicArguments), method, coding, cc, referUrl);
    }
    /// <summary>Uploads the form fields with no extra cookies and no referer.</summary>
    public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding)
    {
        return DoPostWrapper(url, dicArguments, method, coding, (CookieCollection)null, (string)null);
    }

    /// <summary>Uploads the form fields with extra cookies and no referer.</summary>
    public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc)
    {
        return DoPostWrapper(url, dicArguments, method, coding, cc, (string)null);
    }

    /// <summary>Uploads the form fields with a referer and no extra cookies.</summary>
    public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, string referUrl)
    {
        return DoPostWrapper(url, dicArguments, method, coding, (CookieCollection)null, referUrl);
    }
    /// <summary>
    /// Downloads arbitrary content (for example a captcha image) with Accept "*/*".
    /// The caller is responsible for closing the returned stream.
    /// </summary>
    /// <param name="url">Absolute http/https URL.</param>
    /// <param name="method">HTTP verb.</param>
    /// <returns>The response body; empty when the request failed.</returns>
    public MemoryStream DownloadStream(string url, string method)
    {
        const string acceptAnything = "*/*";
        return GetMemoryStream(url, method, null, null, acceptAnything);
    }
    /// <summary>
    /// Builds a url-encoded "name=value&amp;name=value" string from a dictionary.
    /// </summary>
    /// <param name="dicArguments">Form fields (name to value); null is treated as empty.</param>
    /// <param name="coding">
    /// Encoding used for percent-encoding names and values; null uses the
    /// HttpUtility.UrlEncode default.
    /// </param>
    /// <returns>The encoded pairs joined by '&amp;', or an empty string when there are none.</returns>
    public string BuildRequestArguments(Dictionary<string, string> dicArguments, Encoding coding)
    {
        if (null == dicArguments || 0 == dicArguments.Count) return string.Empty;

        StringBuilder sb = new StringBuilder();
        bool first = true;
        foreach (KeyValuePair<string, string> kvp in dicArguments)
        {
            // Write the separator between pairs instead of trimming a trailing one afterwards.
            if (!first) sb.Append('&');
            first = false;

            if (null != coding)
            {
                sb.Append(HttpUtility.UrlEncode(kvp.Key, coding));
                sb.Append('=');
                sb.Append(HttpUtility.UrlEncode(kvp.Value, coding));
            }
            else
            {
                sb.Append(HttpUtility.UrlEncode(kvp.Key));
                sb.Append('=');
                sb.Append(HttpUtility.UrlEncode(kvp.Value));
            }
        }
        return sb.ToString();
    }
    /// <summary>
    /// Builds a url-encoded "name=value" string from a dictionary using the default
    /// percent-encoding (no custom encoding).
    /// </summary>
    /// <param name="dicArguments">Form fields (name to value).</param>
    /// <returns>The encoded pairs joined by '&amp;'.</returns>
    public string BuildRequestArguments(Dictionary<string, string> dicArguments)
    {
        Encoding useDefault = null;
        return BuildRequestArguments(dicArguments, useDefault);
    }
    /// <summary>
    /// Looks up the value of a cookie in the shared cookie container.
    /// </summary>
    /// <param name="key">Cookie name.</param>
    /// <param name="domain">
    /// Absolute URI (for example "http://example.com/") whose cookies are searched;
    /// it is passed to the Uri constructor, so a bare host name is not accepted.
    /// </param>
    /// <returns>
    /// The cookie value, or an empty string when the container is missing or empty
    /// or no cookie named <paramref name="key"/> applies to the URI.
    /// </returns>
    public string GetCookieValue(string key, string domain)
    {
        if (null == this.cookieContainer || 0 == this.cookieContainer.Count)
        {
            return string.Empty;
        }
        CookieCollection cc = this.cookieContainer.GetCookies(new Uri(domain));
        // The indexer yields null for an unknown name; previously this caused a NullReferenceException.
        Cookie cookie = cc[key];
        return (null == cookie) ? string.Empty : cookie.Value;
    }
    /// <summary>
    /// Replaces the cookie container attached to subsequent requests.
    /// </summary>
    /// <param name="cc">The container to use from now on; stored as-is, without a null check.</param>
    public void SetCookieContainer(CookieContainer cc)
    {
        cookieContainer = cc;
    }
    /// <summary>
    /// Aborts the most recent request, if there is one.
    /// </summary>
    /// <returns>True only when both CheckGotoRecv and DoBetIsGotoRecv are set.</returns>
    public bool AbortHttpRequest()
    {
        if (this.httpRequest != null)
        {
            this.httpRequest.Abort();
        }

        if (!this.CheckGotoRecv)
        {
            return false;
        }
        return this.DoBetIsGotoRecv;
    }
    }
    }
  • 相关阅读:
    SQL大全基本语法
    ASP.NET实现二维码
    JMeter下载及安装配置完整版
    windows下搭建nginx负载均衡
    C# 接口生成工具Swagger用法
    在IIS上部署网站,应用程序池选择.Net Framework版本为 v4.0.30319后,打开网站报错
    sqlserver 中WITH NOLOCK、HOLDLOCK、UPDLOCK、TABLOCK、TABLOCKX
    C#进阶学习笔记
    C#单例模式的理解
    实现WebService只返还json结构数据
  • 原文地址:https://www.cnblogs.com/top5/p/1976917.html
Copyright © 2011-2022 走看看