爬虫获取数据时，可能会遇到通过 AJAX 加载的页面。如果无法分析出数据接口，就只能使用秘密武器——WebDriverDownloader。不过最好还是先尝试分析出接口，因为 WebDriver 的性能实在太低。现在更流行的方案是 Puppeteer，也可以了解一下。
#region WebDriverDownloader
// Browser options: images and Flash loading are switched off and the browser runs headless.
var option = new Option
{
    LoadImage = false,
    LoadFlashPlayer = false,
    AlwaysLoadNoFocusLibrary = false,
    Headless = true
};

// A single WebDriver action, collected into the action list handed to the downloader.
IWebDriverAction webDriverAction = new Click();
var webDriverActions = new List<IWebDriverAction> { webDriverAction };

// NOTE(review): 5000 is presumably a wait time in milliseconds — confirm against
// the WebDriverDownloader constructor.
var downloader = new WebDriverDownloader(Browser.Chrome, 5000, option)
{
    // Actions attached to the browser; per the original note they run once the page load completes.
    Actions = webDriverActions
};

spider.Downloader = downloader;
#endregion
/// <summary>
/// WebDriver action that executes a JavaScript snippet in the browser.
/// Note: despite its name, this implementation does not click an element;
/// each iteration runs <c>window.open(...)</c> to open a fixed URL in a new window.
/// </summary>
public class Click : IWebDriverAction
{
    /// <summary>
    /// Script executed on every iteration. The inner double quotes are escaped so the
    /// C# literal contains the complete JavaScript statement.
    /// </summary>
    private const string OpenWindowScript = "window.open(\"https://www.baidu.com/\");";

    /// <summary>
    /// Pause, in milliseconds, before the script is executed.
    /// </summary>
    private const int DelayBeforeScriptMs = 2000;

    /// <summary>
    /// Pause, in milliseconds, after the script is executed.
    /// </summary>
    private const int DelayAfterScriptMs = 1000;

    /// <summary>
    /// Number of times the script is executed. Defaults to 1.
    /// </summary>
    public int ClickTimes { get; set; } = 1;

    /// <summary>
    /// Executes the script <see cref="ClickTimes"/> times, sleeping before and after each run.
    /// </summary>
    /// <param name="webDriver">The WebDriver used to execute the script.</param>
    /// <returns>
    /// <c>true</c> if every iteration completed (including when <see cref="ClickTimes"/> is
    /// zero or negative and nothing runs); <c>false</c> if any exception was thrown.
    /// </returns>
    public bool Invoke(RemoteWebDriver webDriver)
    {
        try
        {
            // Optionally maximize the window first:
            // webDriver.Manage().Window.Maximize();
            for (var i = 0; i < ClickTimes; i++)
            {
                // NOTE(review): presumably gives the page time to settle — confirm the intent.
                Thread.Sleep(DelayBeforeScriptMs);
                webDriver.ExecuteScript(OpenWindowScript);
                Thread.Sleep(DelayAfterScriptMs);
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: any failure is reported to the caller as "false"
            // rather than propagated, matching the bool success/failure return contract.
            return false;
        }

        return true;
    }
}