zoukankan      html  css  js  c++  java
  • 使用HtmlAgilityPack批量抓取网页数据

    相关软件点击下载

    登录的处理:有些网页数据需要登录后才能提取,这里使用 ieHTTPHeaders 来提取登录时提交的信息。

    抓取网页

     HtmlAgilityPack.HtmlDocument htmlDoc;

                
    if (!string.IsNullOrEmpty(登录URL))
                
    {
                    htmlDoc 
    = htmlWeb.Load(登录URL, 提交的用户验证信息, 获取数据的网页URL);
                }

                
    else
                
    {
                    htmlDoc 
    = htmlWeb.Load(获取数据的网页URL);
                }

            
     ArrayList list = new ArrayList();
                list.add(
    "//table/tr[1]/td");
                list.add(
    "//table/tr[2]/td");
                
    //获取循环的节点的xpath,比如://table/tr
                HtmlNodeCollection repeatNodes = htmlDoc.DocumentNode.SelectNodes("//table/tr");

                
    //循环节点
                foreach (HtmlNode node in repeatNodes)
                
    {
                    
    //循环获取数据
                    foreach (string dataPath in list)
                    
    {

                        HtmlNode dataNode 
    = node.SelectSingleNode(list);
                        
    if (dataNode != null)
                        
    {
                            
    string text = dataNode.InnerText;
                        }


                    }

                }

    如果抓取结果出现乱码,请将编码设置为 gb2312 或 utf-8(需与目标网页的实际编码一致):

    // Set htmlWeb's default encoding to the one named by strEncode (e.g. "gb2312" or "utf-8").
    // NOTE(review): presumably this must be assigned before calling htmlWeb.Load — confirm.
    htmlWeb.DefaultEncoding = System.Text.Encoding.GetEncoding(strEncode);

    -------------------------------------------------------------------------------------------

    using System;

    using System.Collections.Generic;

    using System.Text;

    using Microsoft.VisualStudio.TestTools.WebTesting;

    using HtmlAgilityPack;

    /// <summary>
    /// Coded web test that requests http://www.microsoft.com/ and validates the
    /// response by parsing its body with HtmlAgilityPack.
    /// </summary>
    public class WebTest1Coded : WebTest
    {
        /// <summary>
        /// Yields the single request of this test, with a response validation
        /// handler attached.
        /// </summary>
        /// <returns>An enumerator over the requests to execute.</returns>
        public override IEnumerator<WebTestRequest> GetRequestEnumerator()
        {
            WebTestRequest request1 = new WebTestRequest("http://www.microsoft.com/");
            request1.ValidateResponse += new EventHandler<ValidationEventArgs>(request1_ValidateResponse);
            yield return request1;
        }

        /// <summary>
        /// Marks the response valid only when the first &lt;li&gt; under the
        /// element with id "Nav" has the inner text "Windows".
        /// </summary>
        /// <param name="sender">The object raising the event.</param>
        /// <param name="e">Validation arguments carrying the response and the result flag.</param>
        void request1_ValidateResponse(object sender, ValidationEventArgs e)
        {
            // Load the response body string as an HtmlAgilityPack.HtmlDocument.
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(e.Response.BodyString);

            // Locate the "Nav" element; a missing element is a validation
            // failure, not a crash.
            HtmlNode navNode = doc.GetElementbyId("Nav");
            if (navNode == null)
            {
                e.IsValid = false;
                return;
            }

            // Pick the first <li> element; SelectSingleNode returns null when
            // there is no match.
            HtmlNode firstNavItemNode = navNode.SelectSingleNode(".//li");
            if (firstNavItemNode == null)
            {
                e.IsValid = false;
                return;
            }

            // Validate that the first list item in the Nav element says "Windows".
            e.IsValid = firstNavItemNode.InnerText == "Windows";
        }
    }

  • 相关阅读:
    json to dictionary
    ie下 频繁异步加载导致崩溃
    如何实现数据在表内部置顶
    js货币转换,replaceAll写法
    js奇怪的问题
    mssql中public和db_owner权限下拿到webshell或者系统权限
    关于Alexa Toolbar的破解方法
    如何备份和还原 IIS
    WIN下IIS备份与恢复操作
    汽车品牌标志数据库
  • 原文地址:https://www.cnblogs.com/chuncn/p/1561564.html
Copyright © 2011-2022 走看看