zoukankan      html  css  js  c++  java
  • 利用HtmlAgilityPack库进行HTML数据抓取

    本文主要介绍基于 XPath 的文本分析方式的实现，代码如下：

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    using HtmlAgilityPack;
    namespace MyIdea.Spider
    {
        /// <summary>
        /// Console demo that extracts text from HTML with HtmlAgilityPack XPath queries:
        /// once from a local file (test.html) and once from a page downloaded over HTTP.
        /// </summary>
        class Program
        {
            /// <summary>
            /// Placeholder printed when an XPath query or descendant lookup finds no node,
            /// so that a changed page layout produces readable output instead of an exception.
            /// </summary>
            private const string MissingText = "(未找到)";

            /// <summary>
            /// Runs the file demo, then the URL demo, then waits for a key press.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                GetDataFromFile();
                GetDataFromUrl();
                Console.ReadKey();
            }

            /// <summary>
            /// Loads test.html and prints the inner text of two div nodes selected by
            /// absolute XPath expressions.
            /// </summary>
            static void GetDataFromFile()
            {
                // The verbatim literal had lost its backslashes ("binDebug"), which can never
                // match a real path. Stripping "\bin\Debug" from the base directory makes the
                // lookup resolve next to the project rather than next to the executable
                // (presumably where test.html lives -- confirm against the project layout).
                string baseDir = AppDomain.CurrentDomain.BaseDirectory.Replace(@"\bin\Debug", "");

                HtmlDocument doc = new HtmlDocument();
                doc.Load(baseDir + "/test.html");

                Console.WriteLine("==================html文件解析结果(test1)==================");

                HtmlNodeCollection secondLevel = doc.DocumentNode.SelectNodes("/html/body/div/div");
                Console.WriteLine("div节点2的内容:{0}", InnerTextAt(secondLevel, 1));

                HtmlNodeCollection thirdLevel = doc.DocumentNode.SelectNodes("/html/body/div/div/div");
                Console.WriteLine("div节点3中第一个div子节点内容:{0}", InnerTextAt(thirdLevel, 0));
                // The original format string ended with two line breaks; keep the blank separator line.
                Console.WriteLine();
            }

            /// <summary>
            /// Downloads the Douban movie chart page and prints the title and introduction
            /// of every movie entry matched by the XPath expression.
            /// </summary>
            static void GetDataFromUrl()
            {
                string url = "http://movie.douban.com/chart";
                string movieXpath = "/html/body/div[3]/div[1]/div/div[1]/div/div/table/tr/td[2]/div";
                HtmlWeb request = new HtmlWeb();
                HtmlDocument doc = request.Load(url);

                Console.WriteLine("==========豆瓣电影排行榜页面电影信息解析结果(test2)==========");

                HtmlNodeCollection movieItems = doc.DocumentNode.SelectNodes(movieXpath);
                if (movieItems == null)
                {
                    // SelectNodes yields null (not an empty collection) when nothing matches,
                    // e.g. after the remote page layout changes.
                    Console.WriteLine(MissingText);
                    return;
                }

                foreach (HtmlNode item in movieItems)
                {
                    // FirstOrDefault instead of First: an entry without <a>/<p> must not abort the loop.
                    string title = CompactText(item.Descendants("a").FirstOrDefault());
                    string introduce = CompactText(item.Descendants("p").FirstOrDefault());

                    Console.WriteLine();
                    Console.WriteLine("<片名>:" + title);
                    Console.WriteLine("<介绍>:" + introduce);
                }
            }

            /// <summary>
            /// Returns the inner text of the node at <paramref name="index"/>, or
            /// <see cref="MissingText"/> when the collection is null or too short.
            /// </summary>
            /// <param name="nodes">Result of an XPath query; may be null.</param>
            /// <param name="index">Zero-based position of the wanted node.</param>
            private static string InnerTextAt(HtmlNodeCollection nodes, int index)
            {
                if (nodes == null || index < 0 || index >= nodes.Count)
                {
                    return MissingText;
                }

                return nodes[index].InnerText;
            }

            /// <summary>
            /// Returns the node's inner text with spaces and line breaks removed, or
            /// <see cref="MissingText"/> when <paramref name="node"/> is null.
            /// </summary>
            /// <param name="node">Node whose text is wanted; may be null.</param>
            private static string CompactText(HtmlNode node)
            {
                if (node == null)
                {
                    return MissingText;
                }

                // Remove CR and LF separately so both "\r\n" and bare "\n" line endings are stripped.
                return node.InnerText.Replace(" ", "").Replace("\r", "").Replace("\n", "");
            }
        }
    }

    解析结果

  • 相关阅读:
    MediaRecorder.AudioSource参数
    putty连接服务器
    支持库
    ImageView
    .net下MD5算法和加盐
    SqlHelper文件复习
    .net下连接数据库
    Windows Server 2003 R2 64位简体中文版下载
    gacutil.exe 注册assembly
    Sharepoint Powershell
  • 原文地址:https://www.cnblogs.com/huangzelin/p/4564380.html
Copyright © 2011-2022 走看看