  • Requesting AJAX pages over HTTP in C#

      When we fetch a page with a plain HTTP request, some pages load part of their content via AJAX, so the HTML that comes back is incomplete. In other words, whatever the page fills in through AJAX never shows up in our response. What can we do about that?
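
      For example, a plain request like the sketch below (the URL is just a placeholder) returns only the server-rendered HTML; nothing that the page's own JavaScript loads afterwards is in it:

    using System;
    using System.Net.Http;

    class PlainHttpDemo
    {
        static void Main()
        {
            using (var client = new HttpClient())
            {
                // Only the initial HTML comes back; content the page inserts
                // later via its own AJAX calls is missing from this string.
                var html = client.GetStringAsync("http://example.com/some-ajax-page").Result;
                Console.WriteLine(html.Length);
            }
        }
    }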

      Using WebDriver together with PhantomJS gets the job done. Briefly: WebDriver is a browser automation framework widely used for front-end testing, and PhantomJS is a headless, WebKit-based browser. WebDriver does its work by driving phantomjs.exe. Judging from the API WebDriver exposes, it can drive all kinds of browsers.
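
      As a quick orientation before the full example, a minimal sketch (assuming phantomjs.exe sits next to the built executable; the URL is a placeholder) looks roughly like this:

    using System;
    using OpenQA.Selenium.PhantomJS;

    class MinimalPhantomDemo
    {
        static void Main()
        {
            // PhantomJSDriver starts phantomjs.exe, loads the page like a real
            // browser (executing its JavaScript), and exposes the rendered DOM.
            using (var driver = new PhantomJSDriver())
            {
                driver.Navigate().GoToUrl("http://example.com/some-ajax-page");
                Console.WriteLine(driver.PageSource.Length);
            }
        }
    }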

            

      Setup before use:

           From NuGet, install the two packages Selenium.WebDriver and Selenium.PhantomJS.WebDriver, reference WebDriver.dll in the project, and make sure phantomjs.exe is present in the output directory.

      Let's look at a complete example:

      

    using OpenQA.Selenium;
    using OpenQA.Selenium.PhantomJS;
    using OpenQA.Selenium.Support.UI;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading;
    using System.Threading.Tasks;
    
    namespace ConsoleApplication1
    {
        public interface ICrawler
        {
            event EventHandler<OnStartEventArgs> OnStart;
            event EventHandler<OnCompletedEvent> OnCompleted;
            event EventHandler<OnErrorEventArgs> OnError;
    
            Task Start(Uri uri, Script script, Operation operation);
        }
    
        public class Operation
        {
    
            public Action<PhantomJSDriver> Action;
    
            public Func<IWebDriver, bool> Condition;
    
            public int Timeout { get; set; }
        }
    
        public class Script
        {
            public string Code { set; get; }
    
            public object[] Args { set; get; }
    
        }
    
        public class OnStartEventArgs
        {
            public Uri Uri { set; get; }
    
            public OnStartEventArgs(Uri uri)
            {
                this.Uri = uri;
            }
        }
    
        public class OnErrorEventArgs
        {
            public Uri Uri { set; get; }
    
            public Exception Exception { set; get; }
    
            public OnErrorEventArgs(Uri uri, Exception ex)
            {
                this.Uri = uri;
    
                this.Exception = ex;
            }
        }
    
    
    
        public class OnCompletedEvent
        {
            public Uri Uri { set; get; }
    
            public int ThreadId { set; get; }
    
            public string PageSource { get; private set; }
    
            public long Milliseconds { get; private set; }
    
            public PhantomJSDriver Driver { get; private set; }
    
            public OnCompletedEvent(Uri uri, int threadId, string pageSource, long milliseconds, PhantomJSDriver driver)
            {
                this.Uri = uri;
                this.ThreadId = threadId;
                this.PageSource = pageSource;
                this.Milliseconds = milliseconds;
                this.Driver = driver;
            }
        }
    
        public class HighCrawler : ICrawler
        {
    
            public event EventHandler<OnStartEventArgs> OnStart;
    
            public event EventHandler<OnCompletedEvent> OnCompleted;
    
            public event EventHandler<OnErrorEventArgs> OnError;
    
            private static PhantomJSOptions _options;
            private static PhantomJSDriverService _service;
    
    
            static HighCrawler()
            {
                var service = PhantomJSDriverService.CreateDefaultService();
                service.DiskCache = true;
                service.IgnoreSslErrors = true;
                service.HideCommandPromptWindow = true;
                service.LoadImages = false;
                service.LocalToRemoteUrlAccess = true;
    
                _service = service;
    
                _options = new PhantomJSOptions();
            }
    
    
            public Task Start(Uri uri, Script script, Operation operation)
            {
                return Task.Factory.StartNew(() =>
                {
                    if (OnStart != null)
                    {
                        this.OnStart(this, new OnStartEventArgs(uri));
                    }
    
                    var driver = new PhantomJSDriver(_service, _options);
                    try
                    {
                        var watch = DateTime.Now;
                        driver.Navigate().GoToUrl(uri.ToString());
    
                        if (script != null)
                            driver.ExecuteScript(script.Code, script.Args);
    
                        if (operation.Action != null) operation.Action.Invoke(driver);
    
                        var driverWait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.Timeout));  // set the wait timeout
    
                        if (operation.Condition != null) driverWait.Until(operation.Condition);
    
                        var threadId = Thread.CurrentThread.ManagedThreadId;
    
                        var milliseconds = (long)DateTime.Now.Subtract(watch).TotalMilliseconds;  // TotalMilliseconds: full elapsed time, not just the millisecond component
    
                        var pageSource = driver.PageSource;
    
                        if (this.OnCompleted != null)
                            this.OnCompleted(this, new OnCompletedEvent(uri, threadId, pageSource, milliseconds, driver));
    
                    }
                    catch (Exception ex)
                    {
                        if (OnError != null)
                            this.OnError(this, new OnErrorEventArgs(uri, ex));
                    }
                    finally
                    {
                        driver.Close();
                        driver.Quit();
                    }
                });
            }
        }
    }

      That wraps everything into a reusable class. Here is how to use it:

            /// <summary>
            /// Crawl a site and parse the result
            /// </summary>
            /// <param name="url">URL of the site to crawl</param>
            /// <param name="waitId">Id of the element to wait for, e.g. "search-main"</param>
            /// <param name="xpath">XPath of the nodes to extract, e.g. "//div[@class="article panel article-result"]//h5[@class="title"]//a"</param>
            /// <param name="timeout">Wait timeout in milliseconds</param>
            private static void TestWaitForReady(string url, string waitId, string xpath, int timeout = 10000)
            {
    
                var crawler = new HighCrawler();
    
                crawler.OnStart += (s, e) =>
                {
    
                    Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
                };
    
                crawler.OnError += (s, e) =>
                {
                    Console.WriteLine("爬虫出现错误:" + e.Uri.ToString() + ",异常信息" + e.Exception.ToString());
                };
    
                crawler.OnCompleted += (s, e) =>
                {
                    Console.WriteLine("接收到的源码长度:" + e.PageSource.Length);
    
                    Thread.Sleep(1000);
                    Console.WriteLine("爬虫结束,花费时间:" + e.Milliseconds);
                    var items = e.Driver.FindElements(By.XPath(xpath));
    
                    foreach (var item in items)
                    {
                        Console.WriteLine(item.Text);
                    }
                };
    
                var operation = new Operation
                {
                    Action = (x) =>
                    {
                        // nothing extra to do before waiting in this example
                    },
                    Condition = (x) =>
                    {
                        return x.FindElement(By.Id(waitId)).Displayed;
                    },
                    Timeout = timeout
                };
    
                crawler.Start(new Uri(url), null, operation);
    
            }
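
      For instance (placed in the same class; the URL below is only a placeholder, while the waitId and XPath follow the examples in the doc comments above), it could be called like this:

            static void Main(string[] args)
            {
                TestWaitForReady(
                    "http://example.com/search?q=test",   // placeholder URL
                    "search-main",                        // marker element that appears once the AJAX content is loaded
                    "//div[@class=\"article panel article-result\"]//h5[@class=\"title\"]//a");

                Console.ReadKey();  // keep the console open while the crawler task runs in the background
            }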

      The core idea behind capturing the asynchronous AJAX result: WebDriver uses a particular element on the page as a marker. Once that element appears, the AJAX work is considered finished and the page source can be returned; in between, the crawler simply waits.
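
      Condensed to its essence (a self-contained sketch using the same Selenium types; the URL and element id are placeholders), the whole technique is:

    using System;
    using OpenQA.Selenium;
    using OpenQA.Selenium.PhantomJS;
    using OpenQA.Selenium.Support.UI;

    class WaitForMarkerDemo
    {
        static void Main()
        {
            using (var driver = new PhantomJSDriver())
            {
                driver.Navigate().GoToUrl("http://example.com/search?q=test");  // placeholder URL

                // Block until the marker element is displayed (or the timeout expires);
                // its appearance signals that the AJAX calls have finished.
                var wait = new WebDriverWait(driver, TimeSpan.FromSeconds(10));
                wait.Until(d => d.FindElement(By.Id("search-main")).Displayed);

                // PageSource now contains the fully rendered DOM, including the AJAX content.
                Console.WriteLine(driver.PageSource.Length);
            }
        }
    }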

      

  • Original post: https://www.cnblogs.com/wangqiang3311/p/8989681.html