zoukankan      html  css  js  c++  java
  • c# http请求ajax页面

      我们在用Http请求的时候,某些页面是ajax加载的,所以请求过来的页面数据不完整。也就是说ajax局部加载数据的地方,我们请求不到,这时候该怎么办呢?

      WebDriver+phantomjs 这两个组合在一起使用,可以完成此任务。分别简单介绍下,WebDriver是一个前端的自动化测试框架,phantomjs是一个无界面的浏览器,基于webkit。WebDriver调用phantomjs.exe工作。下面是WebDriver提供的API,看来它能驱动各种浏览器工作。

            

      使用前准备:

           在 NuGet 上下载 Selenium.WebDriver 和 Selenium.PhantomJS.WebDriver 两个包,在项目中引用 WebDriver.dll,并确保输出目录下有 phantomjs.exe。

      我们看一个完整的例子:

      

    using OpenQA.Selenium;
    using OpenQA.Selenium.PhantomJS;
    using OpenQA.Selenium.Support.UI;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading;
    using System.Threading.Tasks;
    
    namespace ConsoleApplication1
    {
        /// <summary>
        /// Contract for a crawler that processes one URI per call and reports its
        /// progress through start / completed / error events.
        /// </summary>
        public interface ICrawler
        {
            /// <summary>Start notification; the payload carries the URI being crawled.</summary>
            event EventHandler<OnStartEventArgs> OnStart;
            /// <summary>Completion notification; the payload carries the page source, timing, thread id and driver.</summary>
            event EventHandler<OnCompletedEvent> OnCompleted;
            /// <summary>Failure notification; the payload carries the URI and the exception.</summary>
            event EventHandler<OnErrorEventArgs> OnError;
    
            /// <summary>
            /// Begins crawling <paramref name="uri"/> and returns a task representing that work.
            /// </summary>
            /// <param name="uri">Address of the page to load.</param>
            /// <param name="script">Script (code plus arguments) associated with the crawl.</param>
            /// <param name="opreation">
            /// Action / wait condition / timeout for the crawl.
            /// NOTE(review): the name is a misspelling of "operation"; it is kept because
            /// renaming an interface parameter would break callers that use named arguments.
            /// </param>
            /// <returns>A task for the crawl.</returns>
            Task Start(Uri uri, Script script, Operation opreation);
        }
    
        /// <summary>
        /// Bundles the per-crawl hooks handed to <c>ICrawler.Start</c>: an optional callback,
        /// an optional wait condition, and the timeout that applies to that wait.
        /// </summary>
        public class Operation
        {
    
            /// <summary>
            /// Optional callback that receives the PhantomJS driver. HighCrawler invokes it
            /// after navigating to the page, and only when it is non-null.
            /// </summary>
            public Action<PhantomJSDriver> Action;
    
            /// <summary>
            /// Optional predicate over the driver. HighCrawler passes it to
            /// <c>WebDriverWait.Until</c>, and only when it is non-null.
            /// </summary>
            public Func<IWebDriver, bool> Condition;
    
            /// <summary>
            /// Timeout for the wait, in milliseconds (HighCrawler converts it with
            /// <c>TimeSpan.FromMilliseconds</c>).
            /// </summary>
            public int timeout { get; set; }
        }
    
        /// <summary>
        /// A script together with the arguments to run it with. HighCrawler hands both
        /// values to the driver's <c>ExecuteScript</c> after the page has been opened.
        /// </summary>
        public class Script
        {
            /// <summary>Source text of the script.</summary>
            public string Code { get; set; }

            /// <summary>Arguments supplied to the script.</summary>
            public object[] Args { get; set; }
        }
    
        /// <summary>
        /// Event data for the crawler's start notification.
        /// Derives from <see cref="EventArgs"/> so it follows the standard .NET event pattern.
        /// </summary>
        public class OnStartEventArgs : EventArgs
        {
            /// <summary>Address of the page about to be crawled.</summary>
            public Uri Uri { get; set; }

            /// <summary>Creates the event data for the given address.</summary>
            /// <param name="uri">Address of the page about to be crawled.</param>
            public OnStartEventArgs(Uri uri)
            {
                Uri = uri;
            }
        }
    
        /// <summary>
        /// Event data for the crawler's error notification.
        /// Derives from <see cref="EventArgs"/> so it follows the standard .NET event pattern.
        /// </summary>
        public class OnErrorEventArgs : EventArgs
        {
            /// <summary>Address of the page that was being crawled when the failure occurred.</summary>
            public Uri Uri { get; set; }

            /// <summary>The exception that was caught.</summary>
            public Exception Exception { get; set; }

            /// <summary>Creates the event data for a failed crawl.</summary>
            /// <param name="uri">Address of the page that was being crawled.</param>
            /// <param name="ex">The exception that was caught.</param>
            public OnErrorEventArgs(Uri uri, Exception ex)
            {
                Uri = uri;
                Exception = ex;
            }
        }
    
    
    
        /// <summary>
        /// Event data for the crawler's completion notification.
        /// Derives from <see cref="EventArgs"/> so it follows the standard .NET event pattern;
        /// the type name is kept as is so existing references keep compiling.
        /// </summary>
        public class OnCompletedEvent : EventArgs
        {
            /// <summary>Address of the page that was crawled.</summary>
            public Uri Uri { get; set; }

            /// <summary>Managed thread id supplied by the crawler.</summary>
            public int ThreadId { get; set; }

            /// <summary>Page source captured by the crawler.</summary>
            public string PageSource { get; private set; }

            /// <summary>Elapsed time supplied by the crawler, in milliseconds.</summary>
            public long Milliseconds { get; private set; }

            /// <summary>Driver that loaded the page, so handlers can query the live document.</summary>
            public PhantomJSDriver Driver { get; private set; }

            /// <summary>Creates the event data for a finished crawl.</summary>
            /// <param name="uri">Address of the page that was crawled.</param>
            /// <param name="threadId">Managed thread id the crawl ran on.</param>
            /// <param name="pageSource">Page source captured by the crawler.</param>
            /// <param name="milliseconds">Elapsed time in milliseconds.</param>
            /// <param name="driver">Driver that loaded the page.</param>
            public OnCompletedEvent(Uri uri, int threadId, string pageSource, long milliseconds, PhantomJSDriver driver)
            {
                Uri = uri;
                ThreadId = threadId;
                PageSource = pageSource;
                Milliseconds = milliseconds;
                Driver = driver;
            }
        }
    
        /// <summary>
        /// <see cref="ICrawler"/> implementation that opens the page in a headless PhantomJS
        /// browser, optionally runs a script / callback, optionally waits for a condition,
        /// and then publishes the resulting page source.
        /// </summary>
        public class HighCrawler : ICrawler
        {
            /// <summary>Raised on the worker thread before the browser is launched.</summary>
            public event EventHandler<OnStartEventArgs> OnStart;

            /// <summary>
            /// Raised on the worker thread after the page has been loaded and the wait condition
            /// (if any) has been satisfied. The driver exposed by the event data is shut down as
            /// soon as the handlers return, so it must only be used inside the handler.
            /// </summary>
            public event EventHandler<OnCompletedEvent> OnCompleted;

            /// <summary>
            /// Raised on the worker thread when launching the browser, loading the page, waiting,
            /// or an <see cref="OnCompleted"/> handler throws. Without a subscriber the exception
            /// is dropped.
            /// </summary>
            public event EventHandler<OnErrorEventArgs> OnError;

            private static readonly PhantomJSOptions _options = new PhantomJSOptions();

            /// <summary>
            /// Crawls <paramref name="uri"/> on a background task.
            /// </summary>
            /// <param name="uri">Address of the page to load.</param>
            /// <param name="script">Script to execute after navigation, or null for none.</param>
            /// <param name="operation">
            /// Callback, wait condition and timeout (milliseconds) to apply after the script;
            /// null means no callback and no wait.
            /// </param>
            /// <returns>
            /// A task that finishes when the crawl is over. Crawl failures are reported through
            /// <see cref="OnError"/> rather than by faulting the task.
            /// </returns>
            public Task Start(Uri uri, Script script, Operation operation)
            {
                return Task.Factory.StartNew(() => Crawl(uri, script, operation));
            }

            /// <summary>Builds a PhantomJS driver service configured for headless crawling.</summary>
            private static PhantomJSDriverService CreateService()
            {
                var service = PhantomJSDriverService.CreateDefaultService();
                service.DiskCache = true;
                service.IgnoreSslErrors = true;
                service.HideCommandPromptWindow = true;
                service.LoadImages = false;
                service.LocalToRemoteUrlAccess = true;
                return service;
            }

            /// <summary>Runs one crawl synchronously and raises the corresponding events.</summary>
            private void Crawl(Uri uri, Script script, Operation operation)
            {
                // Copy each delegate to a local before invoking so a concurrent unsubscribe
                // cannot null it between the check and the call.
                var onStart = OnStart;
                if (onStart != null)
                {
                    onStart(this, new OnStartEventArgs(uri));
                }

                PhantomJSDriver driver = null;
                try
                {
                    // One service per crawl: the service is tied to the driver that starts it,
                    // so a shared instance could be shut down by another crawl quitting.
                    // Creating the driver inside the try routes launch failures to OnError too.
                    driver = new PhantomJSDriver(CreateService(), _options);

                    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
                    driver.Navigate().GoToUrl(uri.ToString());

                    if (script != null)
                    {
                        driver.ExecuteScript(script.Code, script.Args);
                    }

                    if (operation != null)
                    {
                        if (operation.Action != null)
                        {
                            operation.Action(driver);
                        }

                        if (operation.Condition != null)
                        {
                            // The timeout bounds how long the condition is polled.
                            var wait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.timeout));
                            wait.Until(operation.Condition);
                        }
                    }

                    var threadId = Thread.CurrentThread.ManagedThreadId;

                    // Total elapsed time; TimeSpan.Milliseconds would only give the 0-999
                    // component and under-report anything longer than a second.
                    var milliseconds = stopwatch.ElapsedMilliseconds;

                    var pageSource = driver.PageSource;

                    var onCompleted = OnCompleted;
                    if (onCompleted != null)
                    {
                        onCompleted(this, new OnCompletedEvent(uri, threadId, pageSource, milliseconds, driver));
                    }
                }
                catch (Exception ex)
                {
                    var onError = OnError;
                    if (onError != null)
                    {
                        onError(this, new OnErrorEventArgs(uri, ex));
                    }
                }
                finally
                {
                    if (driver != null)
                    {
                        // Quit() alone ends the session and closes every window. A preceding
                        // Close() could throw on a dead session and skip Quit(), leaving the
                        // browser process running.
                        driver.Quit();
                    }
                }
            }
        }
    }

      这是封装了一个类,方便使用,我们看如何使用:

            /// <summary>
            /// Crawls a page, waits until a marker element reports itself as displayed, then
            /// prints the text of every element matched by an XPath expression.
            /// The crawl runs on the task returned by the crawler; this method does not wait for it.
            /// </summary>
            /// <param name="url">Address of the site to parse.</param>
            /// <param name="waitId">Id of the element to wait for, e.g. "search-main".</param>
            /// <param name="xpath">XPath of the elements to print, e.g. "//div[@class="article panel article-result"]//h5[@class="title"]//a".</param>
            /// <param name="timeout">Upper bound for the wait, in milliseconds.</param>
            private static void TestWaitForReady(string url, string waitId, string xpath, int timeout = 10000)
            {
                // No extra interaction with the page; only wait for the marker element.
                var waitForMarker = new Operation
                {
                    timeout = timeout,
                    Condition = browser => browser.FindElement(By.Id(waitId)).Displayed,
                    Action = browser => { }
                };

                var crawler = new HighCrawler();

                crawler.OnStart += (sender, args) =>
                    Console.WriteLine("爬虫开始抓取地址:" + args.Uri.ToString());

                crawler.OnError += (sender, args) =>
                    Console.WriteLine("爬虫出现错误:" + args.Uri.ToString() + ",异常信息" + args.Exception.ToString());

                crawler.OnCompleted += (sender, args) =>
                {
                    Console.WriteLine("接收到的源码长度:" + args.PageSource.Length);

                    Thread.Sleep(1000);
                    Console.WriteLine("爬虫结束,花费时间:" + args.Milliseconds);

                    // Query the live document through the driver supplied with the event.
                    foreach (var match in args.Driver.FindElements(By.XPath(xpath)))
                    {
                        Console.WriteLine(match.Text);
                    }
                };

                crawler.Start(new Uri(url), null, waitForMarker);
            }

      取ajax异步结果的核心原理:WebDriver把页面上的某个元素,作为标识,一旦出现此元素,表明ajax结束,这时候再返回结果,中间有个等待的过程。

      

  • 相关阅读:
    chrome浏览器中安装以及使用Elasticsearch head 插件
    windows10 升级并安装配置 jmeter5.3
    linux下部署Elasticsearch6.8.1版本的集群
    【Rollo的Python之路】Python 爬虫系统学习 (八) logging模块的使用
    【Rollo的Python之路】Python 爬虫系统学习 (七) Scrapy初识
    【Rollo的Python之路】Python 爬虫系统学习 (六) Selenium 模拟登录
    【Rollo的Python之路】Python 爬虫系统学习 (五) Selenium
    【Rollo的Python之路】Python 爬虫系统学习 (四) XPath学习
    【Rollo的Python之路】Python 爬虫系统学习 (三)
    【Rollo的Python之路】Python sys argv[] 函数用法笔记
  • 原文地址:https://www.cnblogs.com/wangqiang3311/p/8989681.html
Copyright © 2011-2022 走看看