zoukankan      html  css  js  c++  java
  • C#获取网页信息核心方法(入门一)

    目录:信息采集入门系列目录

     下面记录的是我自己整理的C#请求页面核心类,主要有如下几个方法

    1.HttpWebRequest Get请求获得页面html

    2.HttpWebRequest Post请求获得页面html

    3.模拟登录获得cookie内容

    4.模拟登录获得cookie字符串

    5.代理的设置

    6.利用webbrowser 获取js生成的页面

    7.为webbrowser设置cookie,模拟登录

    8.使用demo

    HttpWebRequest Get请求获得页面html

    注意点:以前抓取觉得很慢,最后发现是代理的问题,没有代理就设置为null,这样就不用每次去找代理,影响执行效率,还有一些参数可以自行设置,比如模拟浏览器等。

            /// <summary>
            /// Performs an HTTP GET request and returns the response body as a string.
            /// </summary>
            /// <param name="url">URL to fetch.</param>
            /// <param name="proxy">Proxy to use; pass null to skip proxy lookup entirely (otherwise every request pays the proxy-resolution cost and is slow).</param>
            /// <param name="cookie">Cookie container holding the cookies the site requires, or null.</param>
            /// <param name="timeout">Request timeout in milliseconds.</param>
            /// <returns>The response body decoded as UTF-8.</returns>
            public static string Crawl(string url, WebProxy proxy, CookieContainer cookie, int timeout = 10000)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                try
                {
                    request.Proxy = proxy;
                    request.Timeout = timeout;
                    request.AllowAutoRedirect = true;
                    request.CookieContainer = cookie;

                    // using blocks guarantee the response and reader are released
                    // even when ReadToEnd throws (e.g. connection reset mid-body).
                    // NOTE(review): response charset is assumed UTF-8 regardless of
                    // the Content-Type header — confirm for the target sites.
                    using (WebResponse response = request.GetResponse())
                    using (StreamReader streamReader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
                    {
                        return streamReader.ReadToEnd();
                    }
                }
                finally
                {
                    // Abort releases the connection on failure paths; harmless after success.
                    request.Abort();
                }
            }

    HttpWebRequest Post请求获得页面html

            /// <summary>
            /// Performs an HTTP POST request and returns the response body as a string.
            /// </summary>
            /// <param name="url">URL to post to.</param>
            /// <param name="postdata">URL-encoded form data, e.g. id=1&amp;name=test.</param>
            /// <param name="proxy">Proxy to use; pass null to skip proxy lookup entirely.</param>
            /// <param name="cookie">Cookie container holding the cookies the site requires, or null.</param>
            /// <param name="timeout">Request timeout in milliseconds.</param>
            /// <returns>The response body decoded as UTF-8.</returns>
            public static string Crawl(string url, string postdata, WebProxy proxy, CookieContainer cookie, int timeout = 10000)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                try
                {
                    request.Proxy = proxy;
                    request.Timeout = timeout;
                    request.AllowAutoRedirect = true;
                    request.CookieContainer = cookie;

                    // NOTE(review): ASCII is only safe because postdata is expected to be
                    // URL-encoded already; raw non-ASCII characters would be mangled.
                    byte[] bs = Encoding.ASCII.GetBytes(postdata);
                    request.Method = "POST";
                    request.ContentType = "application/x-www-form-urlencoded";
                    request.ContentLength = bs.Length;
                    using (Stream reqStream = request.GetRequestStream())
                    {
                        reqStream.Write(bs, 0, bs.Length);
                    }

                    using (WebResponse response = request.GetResponse())
                    using (StreamReader streamReader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
                    {
                        return streamReader.ReadToEnd();
                    }
                }
                finally
                {
                    // Abort releases the connection on failure paths; harmless after success.
                    request.Abort();
                }
            }

    模拟登录获得cookie内容

    先找到登录的页面,分析登录页面的post参数和链接,获得cookie后可以直接传到上面的方法

            /// <summary>
            /// Requests the given URL (typically a login page) and returns the
            /// cookie container populated by the server's Set-Cookie headers.
            /// The container can be passed straight into Crawl.
            /// </summary>
            /// <param name="url">URL to request.</param>
            /// <param name="proxy">Proxy to use; pass null to skip proxy lookup.</param>
            /// <param name="timeout">Request timeout in milliseconds.</param>
            /// <returns>Cookie container holding the cookies issued for the URL.</returns>
            public static CookieContainer GetCookie(string url, WebProxy proxy, int timeout = 10000)
            {
                CookieContainer cc = new CookieContainer();
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                try
                {
                    request.Proxy = proxy;
                    request.Timeout = timeout;
                    request.AllowAutoRedirect = true;
                    request.CookieContainer = cc;
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        // Surface the collected cookies on the response object as well.
                        response.Cookies = request.CookieContainer.GetCookies(request.RequestUri);
                    }
                    return cc;
                }
                finally
                {
                    request.Abort();
                }
            }

    模拟登录获得cookie字符串

            /// <summary>
            /// Requests the given URL and returns the resulting cookies as a single
            /// header string ("name=value; name2=value2"), suitable for SetCookie /
            /// a WebBrowser control.
            /// </summary>
            /// <param name="url">URL to request.</param>
            /// <param name="proxy">Proxy to use; pass null to skip proxy lookup.</param>
            /// <param name="timeout">Request timeout in milliseconds.</param>
            /// <returns>Cookie header string for the requested URI.</returns>
            public static string GetCookieString(string url, WebProxy proxy, int timeout = 10000)
            {
                CookieContainer cc = new CookieContainer();
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                try
                {
                    request.Proxy = proxy;
                    request.Timeout = timeout;
                    request.AllowAutoRedirect = true;
                    request.CookieContainer = cc;
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        response.Cookies = request.CookieContainer.GetCookies(request.RequestUri);
                    }
                    return request.CookieContainer.GetCookieHeader(request.RequestUri);
                }
                finally
                {
                    request.Abort();
                }
            }

    代理的设置

           /// <summary>
            /// Builds a WebProxy with the given address and credentials.
            /// </summary>
            /// <param name="port">Proxy address URI, e.g. "http://host:port".</param>
            /// <param name="user">Proxy account user name.</param>
            /// <param name="password">Proxy account password.</param>
            /// <returns>The configured proxy.</returns>
            public static WebProxy CreatePorxy(string port, string user, string password)
            {
                return new WebProxy
                {
                    Address = new Uri(port),
                    Credentials = new NetworkCredential(user, password)
                };
            }

    利用webbrowser 获取js生成的页面

    说明:由于不知道页面什么时候执行完成,这里是等待5s,默认执行完成,效率有待提高。

    另外执行需要线程安全添加[STAThread]

            /// <summary>
            /// Loads a page in a WebBrowser control so its JavaScript executes, then
            /// returns the rendered HTML of the active element.
            /// Waits for document completion plus a fixed 5-second grace period for
            /// scripts (there is no reliable "scripts finished" signal, so this is a
            /// heuristic). The calling thread must be [STAThread].
            /// </summary>
            /// <param name="url">URL to load.</param>
            /// <returns>InnerHtml of the rendered document's active element.</returns>
            public static string CrawlDynamic(string url)
            {
                // WebBrowser and Timer are IDisposable; the originals leaked both.
                using (WebBrowser browser = new WebBrowser())
                {
                    browser.ScriptErrorsSuppressed = true;
                    browser.Navigate(url);

                    // Pump messages until the document itself has loaded.
                    while (browser.ReadyState != WebBrowserReadyState.Complete)
                    {
                        Application.DoEvents();
                    }

                    // Keep pumping for a further 5 s so page scripts can run.
                    var isComplete = false;
                    using (var timer = new System.Timers.Timer(1000 * 5))
                    {
                        timer.AutoReset = false; // fire once, equivalent to Stop() in the handler
                        timer.Elapsed += (sender, e) => isComplete = true;
                        timer.Start();

                        while (!isComplete)
                        {
                            Application.DoEvents();
                        }
                    }

                    return browser.Document.ActiveElement.InnerHtml;
                }
            }

    为webbrowser设置cookie,模拟登录

     刚开始始终不成功,以为这个方法不能用,后面发现原来是domain设置有问题,我的例子是www.aa.xxx.com,设置为http://xx.com可以使用,这个地方可能需要根据自己的情况来选择域名。

            // P/Invoke into WinINet: stores a cookie for the given URL in the
            // WinINet cookie jar, which the WebBrowser (IE) control reads.
            [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
            public static extern bool InternetSetCookie(string lpszUrlName, string lbszCookieName, string lpszCookieData);
    
            /// <summary>
            /// Pushes each cookie from a "name=value; name2=value2" header string into
            /// the WinINet store so a WebBrowser control sends them (simulated login).
            /// The domain may need to be the registrable domain (e.g. "http://xx.com"
            /// rather than a subdomain) — per the article author's experience.
            /// </summary>
            /// <param name="cookieStr">Cookie header string, e.g. from GetCookieString.</param>
            /// <param name="domain">URL/domain to associate the cookies with.</param>
            public static void SetCookie(string cookieStr, string domain)
            {
                foreach (string c in cookieStr.Split(';'))
                {
                    // Split on the FIRST '=' only: cookie values frequently contain '='
                    // themselves (base64 payloads), and Split('=') silently dropped them.
                    string[] item = c.Split(new[] { '=' }, 2);
                    if (item.Length == 2)
                    {
                        // Trim the name: entries after the first arrive as " name".
                        string name = item[0].Trim();
                        string value = item[1];
                        InternetSetCookie(domain, name, value);
                    }
                }
            }

    使用demo

                // Proxy — pass null if you have none.
                WebProxy proxy = WebCrawl.WebRequestHelper.CreatePorxy("xx.com", "user", "password");
    
                // Obtain the cookie container from the login page.
                CookieContainer cookie = WebCrawl.WebRequestHelper.GetCookie("http://xxxx.login.com", proxy);
    
                // Fetch a page with that cookie.
                string html = WebCrawl.WebRequestHelper.Crawl("http://xxx.index.com", proxy, cookie);
    
                // Obtain the cookie header string from the login page.
                string cookiestr = WebCrawl.WebRequestHelper.GetCookieString("http://xxxx.login.com", proxy);
    
                // Push the cookies into WinINet for the WebBrowser control.
                WebCrawl.WebRequestHelper.SetCookie(cookiestr, "https://xx.com");
    
                // Fetch a page that requires login and is rendered by JS (plain pages work too).
                string htmlWithJs = WebCrawl.WebRequestHelper.CrawlDynamic("http://xxx.index.com");
  • 相关阅读:
    Linux文件系统介绍
    httpd 2.4连接php-fpm
    基于lnmp环境安装Discuz
    apache 与 php-fpm 几种处理方式
    Discuz!安装搭建
    Linux中实现文本过滤
    httpd-2.4安装配置
    firewall-cmd.man
    了解JSON
    JSTL和EL表达式
  • 原文地址:https://www.cnblogs.com/xiaoshuai1992/p/webcrawl.html
Copyright © 2011-2022 走看看