zoukankan      html  css  js  c++  java
  • C# 采集页面数据

    using HtmlAgilityPack;
    using Nito.AsyncEx;
    using System;
    using System.Diagnostics;
    using System.IO;
    using System.IO.Compression;
    using System.Net;
    using System.Text;
    using System.Threading;
    using System.Threading.Tasks;
    
    namespace test1
    {
        /// <summary>
        /// Console sample that downloads a web page and extracts a link from it
        /// with HtmlAgilityPack.
        /// </summary>
        class Program
        {
            /// <summary>Maximum number of download attempts before the failure is propagated.</summary>
            private const int MaxAttempts = 5;

            /// <summary>Pause applied before the first request, in milliseconds.</summary>
            private const int RequestDelayMilliseconds = 1000;

            /// <summary>Pause between a failed attempt and the next one, in milliseconds.</summary>
            private const int RetryDelayMilliseconds = 1000;

            /// <summary>Timeout of a single request, in milliseconds.</summary>
            private const int RequestTimeoutMilliseconds = 5000;

            /// <summary>User-Agent header that makes the request look like Google Chrome.</summary>
            private const string BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";

            /// <summary>XPath of the anchor whose <c>href</c> is printed by <see cref="Main"/>.</summary>
            private const string LoginLinkXPath = "//*/a[@name=\"tj_login\"]";

            /// <summary>
            /// Cookie container shared by every request. It is attached to each request,
            /// so cookies received in one response are sent with the following requests.
            /// </summary>
            public static CookieContainer CookiesContainer = new CookieContainer();

            /// <summary>
            /// Downloads the Baidu home page and prints the <c>href</c> of the login link
            /// when the page contains one.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                var uri = new Uri("https://www.baidu.com/");

                string pageHtml;
                try
                {
                    pageHtml = AsyncContext.Run(() => GetHttpDomByUrl(uri));
                }
                catch (Exception ex)
                {
                    // Top-level boundary: report the failure instead of dying with an unhandled exception.
                    Console.WriteLine(uri + "请求页面失败:" + ex.Message);
                    Console.ReadKey();
                    return;
                }

                var document = new HtmlDocument();
                document.LoadHtml(pageHtml);

                // Query once and tolerate both a missing anchor and an anchor without href.
                var loginLink = document.DocumentNode.SelectSingleNode(LoginLinkXPath);
                var loginHref = loginLink?.Attributes["href"];
                if (loginHref != null)
                {
                    Console.WriteLine("获取到的数据为:" + loginHref.Value);
                }

                Console.WriteLine("测试成功");
                Console.ReadKey();
            }

            /// <summary>
            /// Downloads the page at <paramref name="uri"/> and returns its body decoded as UTF-8.
            /// Network and I/O failures are retried up to <see cref="MaxAttempts"/> times.
            /// </summary>
            /// <param name="uri">Address of the page to download.</param>
            /// <param name="proxy">Optional proxy address; <c>null</c> leaves the request's proxy setting untouched.</param>
            /// <returns>The page source.</returns>
            /// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
            /// <exception cref="WebException">The request still fails on the last attempt.</exception>
            /// <exception cref="IOException">Reading the response still fails on the last attempt.</exception>
            public static async Task<string> GetHttpDomByUrl(Uri uri, string proxy = null)
            {
                if (uri == null)
                {
                    throw new ArgumentNullException(nameof(uri));
                }

                // Pause before requesting, without blocking the calling thread.
                await Task.Delay(RequestDelayMilliseconds);

                for (var attempt = 1; ; attempt++)
                {
                    try
                    {
                        // The request is issued synchronously on a pool thread because
                        // HttpWebRequest.Timeout is only honoured by the synchronous GetResponse().
                        return await Task.Run(() => DownloadPage(uri, proxy));
                    }
                    catch (Exception ex) when (attempt < MaxAttempts && (ex is WebException || ex is IOException))
                    {
                        // Only network/I-O failures are retried; on the last attempt the filter
                        // is false and the exception reaches the caller.
                        Console.WriteLine(uri + $"请求页面失败正在重新请求,当前线程{Thread.CurrentThread.ManagedThreadId}:" + ex.Message);
                    }

                    await Task.Delay(RetryDelayMilliseconds);
                }
            }

            /// <summary>
            /// Performs one browser-like GET request and reads the whole response body as UTF-8.
            /// </summary>
            /// <param name="uri">Address of the page to download.</param>
            /// <param name="proxy">Optional proxy address; <c>null</c> leaves the request's proxy setting untouched.</param>
            /// <returns>The response body.</returns>
            private static string DownloadPage(Uri uri, string proxy)
            {
                var request = (HttpWebRequest)WebRequest.Create(uri);
                request.Method = "GET";
                request.Accept = "*/*";
                request.UserAgent = BrowserUserAgent;
                request.ContentType = "application/x-www-form-urlencoded";
                // Let the framework negotiate and transparently decode gzip/deflate bodies.
                request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
                request.AllowAutoRedirect = true;           // follow redirects automatically
                request.AllowWriteStreamBuffering = false;
                request.KeepAlive = true;                   // use a persistent connection
                request.Timeout = RequestTimeoutMilliseconds;
                // Response cookies are stored in this same container by the framework.
                request.CookieContainer = CookiesContainer;

                // The proxy is configured before the ServicePoint settings are applied.
                if (proxy != null)
                {
                    request.Proxy = new WebProxy(proxy);
                }

                request.ServicePoint.Expect100Continue = false;
                request.ServicePoint.UseNagleAlgorithm = false;
                request.ServicePoint.ConnectionLimit = int.MaxValue;

                using (var response = (HttpWebResponse)request.GetResponse())
                using (var stream = response.GetResponseStream())
                using (var reader = new StreamReader(stream, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
  • 相关阅读:
    Leetcode788.Rotated Digits旋转数字
    Leetcode788.Rotated Digits旋转数字
    Leetcode796.Rotate String旋转字符串
    Leetcode796.Rotate String旋转字符串
    Leetcode784.Letter Case Permutation字母大小写全排列
    Leetcode784.Letter Case Permutation字母大小写全排列
    Leetcode771.Jewels and Stones宝石与石头
    Leetcode771.Jewels and Stones宝石与石头
    Leetcode724.Find Pivot Index寻找数组的中心索引
    Leetcode724.Find Pivot Index寻找数组的中心索引
  • 原文地址:https://www.cnblogs.com/lbonet/p/13594643.html
Copyright © 2011-2022 走看看