zoukankan      html  css  js  c++  java
  • csharp: using HtmlAgilityPack and ScrapySharp to read a URL and find text

    https://github.com/exaphaser/ScrapySharp

    https://github.com/zzzprojects/html-agility-pack

    https://github.com/atifaziz/Fizzler

    https://archive.codeplex.com/?p=fizzlerex

    https://github.com/aspnet/blazor

    https://github.com/SteveSanderson/Blazor

    https://www.mathjax.org/#samples 数学公式

     https://github.com/Ivony/Jumony

    https://github.com/GeReV/NSoup

    https://github.com/robinvanderknaap/MvcJqGrid

    http://www.defenseinnovationmarketplace.mil/strategy.html

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.IO;
    using System.Net;
    using System.Collections;
    using ScrapySharp;
    using ScrapySharp.Network;
    using ScrapySharp.Core;
    using HtmlAgilityPack;
    
    
    namespace HtmlAgilityPackDemo
    {
    
        /// <summary>
        /// Demo form for HtmlAgilityPack, a handy HTML parsing library.
        /// Author: geovindu (涂聚文)
        /// Date: 2018-03-05
        /// </summary>
        public partial class Form1 : Form
        {
            /// <summary>
            /// Creates the form by running InitializeComponent, which is defined in
            /// another part of this partial class.
            /// </summary>
            public Form1()
            {
                this.InitializeComponent();
            }
            /// <summary>
            /// Event handler that pre-fills <c>textBox1</c> with the code "ln".
            /// </summary>
            /// <param name="sender">Event source; not used.</param>
            /// <param name="e">Event data; not used.</param>
            private void Form1_Load(object sender, EventArgs e)
            {
                // Initial value shown in the code text box.
                const string initialCode = "ln";
                textBox1.Text = initialCode;
            }
            /// <summary>
            /// Downloads the resource at <paramref name="url"/> and returns its whole body as text.
            /// </summary>
            /// <param name="url">Address of the resource to download.</param>
            /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
            public static string GetWebClient(string url)
            {
                // WebClient, the response stream and the reader are all IDisposable; the using
                // blocks release them even when OpenRead or ReadToEnd throws.
                using (WebClient client = new WebClient())
                using (Stream responseStream = client.OpenRead(url))
                // Mind the encoding: Encoding.Default is kept from the original code, so a page
                // served in a different charset will be decoded incorrectly.
                using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
                {
                    return reader.ReadToEnd();
                }
            }
    
            /// <summary>
            /// Downloads the listing page for one area code from tianqihoubao.com and extracts
            /// the city links found on it.
            /// </summary>
            /// <param name="cityCode">Code substituted into the listing URL.</param>
            /// <param name="listcity">Receives one entry per city link found; empty (never null) when none are found.</param>
            /// <returns>
            /// One "name:code" entry per city, each preceded by a CR LF line break;
            /// an empty string when the page does not have the expected structure.
            /// </returns>
            public  string ParsePageByArea(String cityCode, out List<CityList> listcity)
            {
                StringBuilder summary = new StringBuilder();
                List<CityList> cities = new List<CityList>();
                // Assign the out parameter up front so every early return leaves it valid.
                listcity = cities;

                // Build the URL from the link format and the area code.
                String url = String.Format("http://www.tianqihoubao.com/lishi/{0}.htm", cityCode);
                // Download the page source and load it into a document object.
                var docText = GetWebClient(url);
                var doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(docText);

                // Locate the container by XPath; the city groups are its dl children.
                var container = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[1]/div[6]/div[1]/div[1]/div[3]");
                if (container == null)
                {
                    return "";
                }

                // SelectNodes yields null (not an empty collection) when nothing matches.
                var groups = container.SelectNodes(@"dl");
                if (groups == null || groups.Count < 1)
                {
                    return "";
                }

                foreach (var group in groups)
                {
                    var dd = group.SelectSingleNode(@"dd");
                    if (dd == null)
                    {
                        continue;
                    }

                    var links = dd.SelectNodes("a");
                    if (links == null)
                    {
                        continue;
                    }

                    foreach (var link in links)
                    {
                        var hrefAttribute = link.Attributes["href"];
                        if (hrefAttribute == null)
                        {
                            continue;
                        }

                        // The code is the path segment just before the file extension,
                        // so split the href on both '/' and '.'.
                        var hrefParts = hrefAttribute.Value.Trim().Split('/', '.');
                        if (hrefParts.Length < 2)
                        {
                            continue;
                        }

                        var name = link.InnerText.Trim();
                        var code = hrefParts[hrefParts.Length - 2];

                        CityList city = new CityList();
                        city.CityName = name;
                        city.CityCode = code;
                        cities.Add(city);

                        summary.Append("\r\n" + string.Format("{0}:{1}", name, code));
                    }
                }

                return summary.ToString();
            }
            /// <summary>
            /// Downloads one city's monthly history page from tianqihoubao.com and extracts
            /// the per-day rows of its weather table.
            /// Example page: http://www.tianqihoubao.com/lishi/dalian/month/201802.html
            /// </summary>
            /// <param name="cityCode">City code substituted into the URL.</param>
            /// <param name="year">Year substituted into the URL.</param>
            /// <param name="month">Month substituted into the URL, zero-padded to two digits.</param>
            /// <param name="wea">Receives one entry per data row; empty (never null) when none are found.</param>
            /// <returns>
            /// The rows formatted as "date:climate,temperature,wind" and concatenated without
            /// a separator; an empty string when the table is not found.
            /// </returns>
            /// <exception cref="FormatException">A row's date cell cannot be parsed as a date.</exception>
            public  string ParsePageByCityMonth(String cityCode, Int32 year, Int32 month,out List<WeatherList> wea)
            {
                StringBuilder summary = new StringBuilder();
                List<WeatherList> days = new List<WeatherList>();
                // Assign the out parameter up front so every early return leaves it valid.
                wea = days;

                // Build the URL from the city code and the year/month.
                String url = String.Format("http://www.tianqihoubao.com/lishi/{0}/month/{1}{2:D2}.html", cityCode, year, month);
                var docText = GetWebClient(url);
                var doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(docText);

                // Locate the table by XPath.
                var table = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[2]/div[6]/div[1]/div[1]/table[1]");
                if (table == null)
                {
                    return "";
                }

                // SelectNodes yields null (not an empty collection) when nothing matches.
                var rows = table.SelectNodes(@"tr");
                if (rows == null)
                {
                    return "";
                }

                // Start at index 1 to skip the header row without mutating the node collection.
                for (int i = 1; i < rows.Count; i++)
                {
                    var cells = rows[i].SelectNodes(@"td");
                    // Expected columns: date, climate, temperature, wind.
                    if (cells == null || cells.Count != 4)
                    {
                        continue;
                    }

                    var dateText = CleanCellText(cells[0].InnerText);
                    var climate = CleanCellText(cells[1].InnerText);
                    var temperature = CleanCellText(cells[2].InnerText);
                    var wind = CleanCellText(cells[3].InnerText);

                    summary.Append(string.Format("{0}:{1},{2},{3}", dateText, climate, temperature, wind));

                    // Try a culture-independent exact format first so parsing does not depend on
                    // the machine's regional settings (assumes the site renders dates like
                    // 2018年02月01日 -- TODO confirm). Anything else falls back to the original
                    // culture-sensitive parse, which still throws FormatException on bad input.
                    DateTime date;
                    if (!DateTime.TryParseExact(
                            dateText,
                            "yyyy年MM月dd日",
                            System.Globalization.CultureInfo.InvariantCulture,
                            System.Globalization.DateTimeStyles.None,
                            out date))
                    {
                        date = DateTime.Parse(dateText);
                    }

                    WeatherList day = new WeatherList();
                    day.Climate = climate;
                    day.Date = date;
                    day.Temperature = temperature;
                    day.WindDirection = wind;
                    days.Add(day);
                }

                return summary.ToString();
            }

            /// <summary>
            /// Removes line breaks and spaces from a table cell's text and trims the result.
            /// </summary>
            private static string CleanCellText(string raw)
            {
                return raw.Replace("\r", "").Replace("\n", "").Replace(" ", "").Trim();
            }
            /// <summary>
            /// Downloads a page with ScrapySharp's <c>ScrapingBrowser</c> and returns its title text,
            /// followed by the inner HTML of the paragraphs inside the element with id "endText".
            /// Example page: http://www.dusystem.com/geovindu.html
            /// </summary>
            /// <param name="url">Absolute URL of the page to download.</param>
            /// <returns>The concatenated title text and paragraph HTML; empty when neither is present.</returns>
            public string getHtmlTitle(string url)
            {
                StringBuilder result = new StringBuilder();

                var browser = new ScrapingBrowser();
                var pageSource = browser.DownloadString(new Uri(url));
                var doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(pageSource);
                var root = doc.DocumentNode;

                // "//title" searches the whole document; a bare "title" only looks at direct
                // children of the document node. SelectNodes yields null when nothing matches.
                var titles = root.SelectNodes("//title");
                if (titles != null)
                {
                    foreach (var titleNode in titles)
                    {
                        result.Append(titleNode.InnerText);
                    }
                }

                // XPath equivalent of the CSS selector "div#endText p". The original passed
                // "div#endText" as an element name, which can never match.
                // NOTE(review): assumes the intent was the paragraphs inside div#endText -- confirm.
                var paragraphs = root.SelectNodes("//div[@id='endText']//p");
                if (paragraphs != null)
                {
                    foreach (var paragraph in paragraphs)
                    {
                        result.Append(paragraph.InnerHtml);
                    }
                }

                return result.ToString();
            }
            /// <summary>
            /// Click handler: looks up the cities for the code typed into <c>textBox1</c>,
            /// shows the textual result in <c>richTextBox1</c> and binds the city list to <c>comboBox1</c>.
            /// </summary>
            /// <param name="sender">Event source; not used.</param>
            /// <param name="e">Event data; not used.</param>
            private void button1_Click(object sender, EventArgs e)
            {
                string areaCode = this.textBox1.Text.Trim();

                List<CityList> cities;
                string report = ParsePageByArea(areaCode, out cities);
                this.richTextBox1.Text = report;

                // Show the city name in the drop-down; expose the city code as the selected value.
                this.comboBox1.DataSource = cities;
                this.comboBox1.DisplayMember = "CityName";
                this.comboBox1.ValueMember = "CityCode";
            }
            /// <summary>
            /// Click handler: loads the previous calendar month's weather for the city selected
            /// in <c>comboBox1</c>, shows the text in <c>richTextBox2</c> and binds the rows to
            /// <c>dataGridView1</c>.
            /// </summary>
            /// <param name="sender">Event source; not used.</param>
            /// <param name="e">Event data; not used.</param>
            private void button2_Click(object sender, EventArgs e)
            {
                // SelectedValue is null until a city list has been loaded and an item selected.
                object selectedCity = this.comboBox1.SelectedValue;
                if (selectedCity == null)
                {
                    return;
                }

                // AddMonths(-1) rolls the year back in January; "Month - 1" would produce
                // month 0 paired with the current year.
                DateTime previousMonth = DateTime.Now.AddMonths(-1);

                List<WeatherList> days;
                this.richTextBox2.Text = ParsePageByCityMonth(selectedCity.ToString(), previousMonth.Year, previousMonth.Month, out days);
                this.dataGridView1.DataSource = days;
            }
    
        }
        /// <summary>
        /// One city entry: a display name plus the code used to build that city's URLs.
        /// </summary>
        public class CityList
        {
            /// <summary>
            /// City name as shown in the link text.
            /// </summary>
            public string CityName { get; set; }
            /// <summary>
            /// City code taken from the link's href.
            /// </summary>
            public string CityCode { get; set; }
        }
    
        /// <summary>
        /// One day's weather record: climate, temperature, wind direction and date.
        /// </summary>
        public class WeatherList
        {
            /// <summary>
            /// Climate (weather conditions).
            /// </summary>
            public string Climate { get; set; }
            /// <summary>
            /// Temperature.
            /// </summary>
            public string Temperature { get; set; }
            /// <summary>
            /// Wind direction.
            /// </summary>
            public string WindDirection { get; set; }
            /// <summary>
            /// Date the record applies to.
            /// </summary>
            public DateTime Date { get; set; }
        }
       
    }
    

      

      /// <summary>
      /// Click handler that downloads a sample monthly weather page and walks through the
      /// HtmlAgilityPack node API on its first element, writing what it finds to the console.
      /// </summary>
      /// <param name="sender">Event source; not used.</param>
      /// <param name="e">Event data; not used.</param>
      private void button3_Click(object sender, EventArgs e)
      {
          string url = "http://www.tianqihoubao.com/lishi/dalian/month/201802.html";
          var docText = GetWebClient(url);
          HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
          document.LoadHtml(docText);

          // Remove script and style elements. ToArray() takes a snapshot first so the
          // tree is not modified while it is being enumerated.
          foreach (var script in document.DocumentNode.Descendants("script").ToArray())
          {
              script.Remove();
          }
          foreach (var style in document.DocumentNode.Descendants("style").ToArray())
          {
              style.Remove();
          }

          // Other queries that are useful on this page:
          //   //table[@class='b']//tr   rows of the weather table ('b' is the class name)
          //   //td/input                first input element inside a table cell

          // First element in the document; SelectSingleNode returns null when nothing matches.
          HtmlNode node = document.DocumentNode.SelectSingleNode("//*");
          if (node == null)
          {
              return;
          }

          // All ancestors of the node.
          foreach (HtmlNode ancestor in node.Ancestors())
          {
              Console.Write(ancestor.Name + " ");
          }

          // Attributes on the start tag.
          foreach (var attribute in node.Attributes)
          {
              Console.WriteLine(attribute.Name + " : " + attribute.Value);
          }

          // All child nodes; text nodes count as children too.
          foreach (HtmlNode child in node.ChildNodes)
          {
              Console.WriteLine(child.Name + "-" + child.InnerText);
          }

          // Attributes written on the closing tag, e.g. </ul class="">.
          Console.WriteLine(node.ClosingAttributes.Count);

          // The first and last child are often line-break text nodes rather than elements.
          HtmlNode firstChild = node.FirstChild;
          if (firstChild != null)
          {
              Console.WriteLine(firstChild.NodeType);
          }
          HtmlNode lastChild = node.LastChild;
          if (lastChild != null)
          {
              Console.WriteLine(lastChild.NodeType);
          }

          // First div child of the node, if it has one.
          HtmlNode firstDiv = node.SelectSingleNode("child::div[1]");
          if (firstDiv != null)
          {
              Console.WriteLine(firstDiv.XPath);     // XPath expression generated for the node
          }

          Console.WriteLine(node.HasAttributes);          // whether the node has attributes
          Console.WriteLine(node.HasChildNodes);          // whether the node has child nodes
          Console.WriteLine(node.HasClosingAttributes);   // whether the closing tag has attributes

          Console.WriteLine(node.Line);           // line of the start tag in the page source
          Console.WriteLine(node.LinePosition);   // column of the start tag
          Console.WriteLine(node.NodeType);       // node type
          Console.WriteLine(node.OriginalName);   // tag name as written in the source

          if (firstDiv != null)
          {
              Console.WriteLine(firstDiv.InnerText);

              // Step twice to skip the line-break text node that sits between sibling elements.
              HtmlNode nextElement = firstDiv.NextSibling == null ? null : firstDiv.NextSibling.NextSibling;
              if (nextElement != null)
              {
                  Console.WriteLine(nextElement.InnerText);

                  // Step back twice for the same reason.
                  HtmlNode previousElement = nextElement.PreviousSibling == null ? null : nextElement.PreviousSibling.PreviousSibling;
                  if (previousElement != null)
                  {
                      Console.WriteLine(previousElement.InnerText);

                      HtmlNode parent = previousElement.ParentNode;
                      if (parent != null)
                      {
                          Console.WriteLine(parent.Name);
                      }
                  }
              }
          }

          Console.WriteLine(node.OuterHtml);       // markup of the node including its own tags
          Console.WriteLine(node.StreamPosition);  // position of the node within the page source

          // Skeleton for sorting div text into table columns by CSS class.
          // Note: "//div" is an absolute path, so it matches every div in the document.
          HtmlNode body = document.DocumentNode.SelectSingleNode("//body");
          HtmlNodeCollection divs = body == null ? null : body.SelectNodes("//div");
          if (divs != null)
          {
              foreach (HtmlAgilityPack.HtmlNode div in divs)
              {
                  var classValue = div.Attributes["class"] == null ? null : div.Attributes["class"].Value;

                  if (classValue == "first")
                  {
                      //write innerText into a table at place [i][column1]
                  }
                  else if (classValue == "second")
                  {
                      //write innerText into the same table in [i][column2]
                  }
              }
          }
      }
    

      

  • 相关阅读:
    nginx 服务器重启命令,关闭
    eclipse实现热部署和热启动
    Intellij IDEA 文件修改提示星号
    IntelliJ IDEA 自动编译功能无法使用,On 'update' action:选项里面没有update classes and resources这项
    idea最常使用的快捷键
    centos 切换用户显示bash-4.2$,不显示用户名路径的问题
    汉诺塔
    C语言笔记
    @org.springframework.beans.factory.annotation.Autowired(required=true)
    Error creating bean with name 'xxxx' defined in URL
  • 原文地址:https://www.cnblogs.com/geovindu/p/8509035.html
Copyright © 2011-2022 走看看