zoukankan      html  css  js  c++  java
  • HtmlAgilityPack抓取搜房网数据简单示例

    HtmlAgilityPack是一个开源的解析HTML元素的类库,最大的特点是可以通过XPath来解析HTML,如果您以前用C#操作过XML,那么使用HtmlAgilityPack也会得心应手。目前最新版本为1.4.6。

    程序示例如下:

     

    代码如下:

    using HtmlAgilityPack;
    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading;
    using System.Threading.Tasks;
    using System.Windows.Forms;
    
    namespace WinformHtmlAgilityPack
    {
        public partial class Form1 : Form
        {
    
            /// <summary>
            /// Creates the form by running <c>InitializeComponent</c>, which is defined
            /// in the other part of this partial class.
            /// </summary>
            public Form1()
            {
                this.InitializeComponent();
            }
            /// <summary>
            /// Load handler: starts <see cref="DownloadData"/> on its own thread so the
            /// scraping loop does not block the UI thread.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void Form1_Load(object sender, EventArgs e)
            {
                Thread thread = new Thread(DownloadData);

                // A foreground thread would keep the process alive after the form is
                // closed and then fail when it calls Invoke on disposed controls.
                // A background thread is terminated together with the application.
                thread.IsBackground = true;
                thread.Start();
            }
            /// <summary>
            /// Downloads listing pages 1..50 from newhouse.fang.com, reads up to 20 list
            /// items per page via XPath, and shows every item in the label and the grid.
            /// </summary>
            /// <remarks>
            /// Runs on the thread started by <c>Form1_Load</c>, so every control access is
            /// marshalled to the UI thread with <c>Invoke</c>. A <see cref="HouseModel"/> is
            /// appended for every list index, even when the page has no matching node
            /// (its text properties then stay null).
            /// </remarks>
            private void DownloadData()
            {
                const int pageCount = 50;
                const int itemsPerPage = 20;

                List<HouseModel> houseList = new List<HouseModel>();
                int index = 1;

                // WebClient is IDisposable; release it once all pages are fetched.
                using (GZipWebClient c = new GZipWebClient())
                {
                    for (int m = 1; m <= pageCount; m++)
                    {
                        byte[] data = c.DownloadData(new Uri("http://newhouse.fang.com/house/s/b9" + m + "/"));

                        // NOTE(review): decodes with the machine's default code page; presumably
                        // this matches the site's encoding on the author's machine - confirm.
                        string strx = Encoding.Default.GetString(data);

                        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                        htmlDoc.LoadHtml(strx);

                        for (int i = 1; i <= itemsPerPage; i++)
                        {
                            // XPath of the i-th <li> of the result list (XPath indexes are 1-based);
                            // the individual fields hang off this common prefix.
                            string itemXPath = "//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]";

                            HouseModel houseModel = new HouseModel();

                            HtmlNode nodeli_name = htmlDoc.DocumentNode.SelectSingleNode(itemXPath + "/div[1]/div[1]/a");
                            if (nodeli_name != null)
                            {
                                houseModel.楼盘名称 = NoHTML(nodeli_name.InnerHtml);
                            }

                            HtmlNode nodeli_price = htmlDoc.DocumentNode.SelectSingleNode(itemXPath + "/div[4]/span");
                            if (nodeli_price != null)
                            {
                                houseModel.价格 = NoHTML(nodeli_price.InnerHtml);
                            }

                            HtmlNode nodeli_unit = htmlDoc.DocumentNode.SelectSingleNode(itemXPath + "/div[4]/em");
                            if (nodeli_unit != null)
                            {
                                houseModel.单位 = NoHTML(nodeli_unit.InnerHtml);
                            }

                            HtmlNode nodeli_address = htmlDoc.DocumentNode.SelectSingleNode(itemXPath + "/div[2]/div[1]/a");
                            if (nodeli_address != null)
                            {
                                // The text before the first ']' is taken as the district; its first
                                // character (presumably the opening '[') is dropped.
                                string district = NoHTML(nodeli_address.InnerHtml.Split(']')[0]);

                                // Substring(1) throws on an empty string, so only strip when there
                                // is something to strip.
                                houseModel.所在区域 = district.Length > 0 ? district.Substring(1) : district;
                                houseModel.地址 = NoHTML(nodeli_address.InnerHtml);
                            }

                            houseModel.序号 = index;
                            houseList.Add(houseModel);
                            index++;

                            // One synchronous hop to the UI thread refreshes both the counter and the grid.
                            this.dataGridView1.Invoke(new MethodInvoker(delegate
                            {
                                this.label1.Text = "已经抓取:" + houseList.Count.ToString() + " 条.....";

                                // Re-assigning the DataSource makes the grid pick up the new list item.
                                this.dataGridView1.DataSource = null;
                                this.dataGridView1.DataSource = houseList;
                                this.dataGridView1.Columns[5].Width = 500;

                                // Keep the most recently added row in view.
                                this.dataGridView1.FirstDisplayedScrollingRowIndex = this.dataGridView1.Rows.Count - 1;
                            }));

                            Thread.Sleep(100);
                        }
                    }
                }

                this.label1.Invoke(new MethodInvoker(delegate
                {
                    this.label1.Text = "已经抓取完毕。";
                }));
            }
            /// <summary>
            /// <see cref="WebClient"/> whose HTTP requests have automatic GZip/Deflate
            /// decompression switched on.
            /// </summary>
            public class GZipWebClient : WebClient
            {
                /// <summary>
                /// Builds the request through the base class and enables automatic
                /// decompression when it is an HTTP request.
                /// </summary>
                /// <param name="address">Resource to request.</param>
                /// <returns>The request created by the base class.</returns>
                protected override WebRequest GetWebRequest(Uri address)
                {
                    WebRequest request = base.GetWebRequest(address);

                    // The base class returns non-HTTP request types for schemes such as
                    // file://; a hard cast would throw InvalidCastException for those.
                    HttpWebRequest httpRequest = request as HttpWebRequest;
                    if (httpRequest != null)
                    {
                        httpRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
                    }

                    return request;
                }
            }
            /// <summary>
            /// One scraped listing; a list of these is bound to the grid in DownloadData.
            /// </summary>
            /// <remarks>
            /// NOTE(review): DownloadData widens grid column index 5. Presumably the grid
            /// auto-generates its columns from these properties in declaration order, which
            /// would make index 5 the last property below - confirm before reordering or
            /// renaming anything here.
            /// </remarks>
            public class HouseModel
            {
                // Running sequence number; DownloadData assigns 1, 2, 3, ...
                public int 序号 { get; set; }
                // Name of the development (text of the item's name link).
                public string 楼盘名称 { get; set; }
                // District: the part of the address link text before the first ']', minus its first character.
                public string 所在区域 { get; set; }
                // Price text taken from the item's price <span>.
                public string 价格 { get; set; }
                // Unit text taken from the <em> next to the price.
                public string 单位 { get; set; }
                // Full text of the address link, tags stripped.
                public string 地址 { get; set; }
            }
    
    
            /// <summary>
            /// Click handler that passes the results grid to <c>DataGridViewToExcel</c>.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void btn_exportexcel_Click(object sender, EventArgs e)
            {
                DataGridView resultsGrid = this.dataGridView1;
                this.DataGridViewToExcel(resultsGrid);
            }
            #region
            /// <summary>
            /// Asks the user for a target file and writes the grid's header row followed
            /// by every grid row to it as tab-separated text.
            /// </summary>
            /// <remarks>
            /// The file gets an .xls extension but contains plain tab-delimited text.
            /// Any exception raised while opening or writing the file is shown in a
            /// message box instead of being propagated.
            /// </remarks>
            /// <param name="dgv">Grid whose column headers and cell values are exported.</param>
            private void DataGridViewToExcel(DataGridView dgv)
            {
                // The dialog wraps native resources; dispose it when done.
                using (SaveFileDialog dlg = new SaveFileDialog())
                {
                    dlg.Filter = "Excel files (*.xls)|*.xls";
                    // FilterIndex is 1-based: 1 selects the first (and only) filter entry.
                    dlg.FilterIndex = 1;
                    dlg.RestoreDirectory = true;
                    dlg.CreatePrompt = true;
                    dlg.Title = "保存为Excel文件";

                    if (dlg.ShowDialog() != DialogResult.OK)
                    {
                        return;
                    }

                    try
                    {
                        // using guarantees the writer is flushed and the stream closed exactly
                        // once, whether or not writing succeeds.
                        // Code page 0 selects the system default encoding.
                        using (Stream myStream = dlg.OpenFile())
                        using (StreamWriter sw = new StreamWriter(myStream, Encoding.GetEncoding(0)))
                        {
                            string[] cells = new string[dgv.ColumnCount];

                            // Header row.
                            for (int i = 0; i < cells.Length; i++)
                            {
                                cells[i] = dgv.Columns[i].HeaderText;
                            }
                            sw.WriteLine(string.Join("\t", cells));

                            // Data rows; a null cell value becomes an empty field.
                            for (int j = 0; j < dgv.Rows.Count; j++)
                            {
                                for (int k = 0; k < cells.Length; k++)
                                {
                                    object value = dgv.Rows[j].Cells[k].Value;
                                    cells[k] = value == null ? "" : value.ToString().Trim();
                                }
                                sw.WriteLine(string.Join("\t", cells));
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        MessageBox.Show(ex.ToString());
                    }
                }
            }
            #endregion
            /// <summary>
            /// Reduces an HTML fragment to plain text: removes single-line script blocks,
            /// tags and comment markers, decodes a few common entities, drops any other
            /// numeric entity, and finally removes stray angle brackets and CR/LF pairs.
            /// </summary>
            /// <param name="Htmlstring">HTML fragment to clean; null is treated as empty.</param>
            /// <returns>The cleaned text, or an empty string for null/empty input.</returns>
            public static string NoHTML(string Htmlstring)
            {
                // Regex.Replace rejects null input, so settle the trivial case first.
                if (string.IsNullOrEmpty(Htmlstring))
                {
                    return string.Empty;
                }

                // Remove script blocks.
                Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
                // Remove HTML tags.
                Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
                // Remove a line break together with the whitespace that follows it.
                Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
                // Remove leftover comment terminators and unterminated comments.
                Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

                // Decode the common named/numeric entities.
                Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);
                // Drop every remaining numeric entity.
                Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

                // string.Replace returns a new string; the result has to be assigned
                // back or the call has no effect.
                Htmlstring = Htmlstring.Replace("<", "");
                Htmlstring = Htmlstring.Replace(">", "");
                Htmlstring = Htmlstring.Replace("\r\n", "");
                return Htmlstring;
            }
        }
    }
    

     源码地址:http://files.cnblogs.com/files/sunyj/WinformHtmlAgilityPack.rar

  • 相关阅读:
    java对redis的基本操作(一)
    Android RelativeLayout 属性
    查看jdk的位数
    Java简单文件传输 socket简单文件传输示例
    Java使用socket实现两人聊天对话
    Java观察者设计模式
    Java装饰设计模式的例子
    php邮件发送 phpmailer
    php smarty 缓存和配置文件的基本使用方法
    php smarty insert用法
  • 原文地址:https://www.cnblogs.com/sunyj/p/5025240.html
Copyright © 2011-2022 走看看