zoukankan      html  css  js  c++  java
  • 网页爬虫【原创】【开源】

    使用到了以下技术点:
    1)使用 WebClient 获取网页源码;
    2)使用正则表达式解析网页中想要的数据;
    3)使用线程池加快网页数据采集;
    4)……
     
    以前写过几次类似的,但是找不到了,又重新写了一个。
    代码比较粗糙,求拍砖。
     
    using System;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading;
    using System.Windows.Forms;
    
    namespace SpiderMan
    {
        /// <summary>
        /// Main window of the crawler: downloads every page of a paged listing on
        /// thread-pool threads, extracts the table rows with a regular expression and
        /// writes them to the on-form text box.
        /// </summary>
        public partial class Form1 : Form
        {
            /// <summary>URL template of the listing; <c>{0}</c> is the page number.</summary>
            private const string UrlPattern = "http://www.3464.com/data/zhongguochengshijingweidu/?PageNo={0}";

            /// <summary>First page number to download (inclusive).</summary>
            private const int FirstPageIndex = 1;

            /// <summary>Last page number to download (inclusive).</summary>
            private const int LastPageIndex = 125;

            /// <summary>Text shown in the window title while collecting.</summary>
            private const string LoadingPrefix = "采集中";

            /// <summary>Maximum title length before the loading animation starts over.</summary>
            private const int LoadingMaxLength = 100;

            /// <summary>
            /// Matches one data row of the table and captures its nine cells.
            /// The character classes are <c>\d</c> / <c>\w</c>; without the backslashes the
            /// pattern would only match the literal letters "d" and "w".
            /// Built once and shared by all worker threads.
            /// </summary>
            private static readonly Regex RowPattern = new Regex(
                @"<tr[^<]*<td[^>]*>(?<id>\d*?)</td>[^>]*>(?<prov>\w*)</td>[^>]*>[^>]*>(?<city>\w*)</a></td>[^>]*>(?<city2>\w*)</td>[^>]*>(?<py>\w*)</td>[^>]*>(?<qh>\w*)</td>[^>]*>(?<yb>\w*)</td>[^>]*>(?<dj>[\d.]*)</td>[^>]*>(?<bw>[\d.]*)</td>[^>]*>",
                RegexOptions.Compiled);

            /// <summary>
            /// Shared random source for the politeness delay. A single instance is used
            /// because instances created per call with the default (time-based) seed can
            /// return identical values when created at nearly the same moment.
            /// </summary>
            private static readonly Random DelayRandom = new Random();

            /// <summary>Guards <see cref="DelayRandom"/>, which is not thread-safe.</summary>
            private static readonly object DelayRandomLock = new object();

            /// <summary>
            /// Number of queued work items that have not finished yet.
            /// Only modified through <see cref="Interlocked"/>.
            /// </summary>
            private static int threadCount = 0;

            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>
            /// Queues one download-and-parse work item per page, then keeps the UI
            /// responsive until every item has finished.
            /// </summary>
            /// <param name="sender">Control that raised the event; disabled while the run is in progress.</param>
            /// <param name="e">Unused.</param>
            private void button1_Click(object sender, EventArgs e)
            {
                // Application.DoEvents() below keeps processing input, so without this the
                // button could be clicked again and start a second, nested run.
                var trigger = sender as Control;
                if (trigger != null)
                {
                    trigger.Enabled = false;
                }

                string originalTitle = this.Text;
                try
                {
                    for (int pageIndex = FirstPageIndex; pageIndex <= LastPageIndex; pageIndex++)
                    {
                        string url = string.Format(UrlPattern, pageIndex);
                        Log("开始读取url:" + url);

                        ThreadPool.QueueUserWorkItem(FetchAndParse, url);

                        // A fast worker may decrement before this increment runs; that is
                        // harmless because the counter is only inspected after the loop.
                        Interlocked.Increment(ref threadCount);
                    }

                    // Workers report through Control.Invoke, so the UI thread has to keep
                    // pumping messages while it waits. The short sleep stops this loop
                    // from occupying a CPU core the whole time.
                    while (Interlocked.CompareExchange(ref threadCount, 0, 0) > 0)
                    {
                        Application.DoEvents();
                        Loading();
                        Thread.Sleep(10);
                    }

                    Log("数据采集结束");
                }
                finally
                {
                    // Do not leave the "collecting" animation in the title once finished.
                    this.Text = originalTitle;
                    if (trigger != null)
                    {
                        trigger.Enabled = true;
                    }
                }
            }

            /// <summary>
            /// Work item body: downloads one page and parses it.
            /// </summary>
            /// <param name="state">The page URL as a <see cref="string"/>.</param>
            private void FetchAndParse(object state)
            {
                var url = (string)state;
                try
                {
                    ParseHtml(GetHttpSource(url));
                }
                catch (WebException ex)
                {
                    // A failed download must not take the whole run down.
                    Log("读取失败:" + url + " " + ex.Message);
                }
                finally
                {
                    // Always release the slot; otherwise one failed page would keep the
                    // wait loop in button1_Click running forever.
                    Interlocked.Decrement(ref threadCount);
                }
            }

            /// <summary>
            /// Extracts the table rows from a downloaded page and logs them, one
            /// comma-separated line per row.
            /// </summary>
            /// <param name="html">Full page source.</param>
            private void ParseHtml(string html)
            {
                if (string.IsNullOrEmpty(html))
                {
                    Log("解析错误:页面内容为空");
                    return;
                }

                // Markup is compared ordinally; the slice runs from the header cell text
                // to the end of the table that follows it.
                int beginPos = html.IndexOf("编号", StringComparison.Ordinal);
                int endPos = beginPos < 0
                    ? -1
                    : html.IndexOf("</table>", beginPos, StringComparison.Ordinal);
                if (endPos < 0)
                {
                    Log("解析错误:未找到数据表格");
                    return;
                }

                string partHtml = html.Substring(beginPos, endPos - beginPos);

                // Collect the whole page and log it in one call: rows of one page stay
                // together and only one cross-thread Invoke is needed per page.
                var rows = new StringBuilder();
                foreach (Match m in RowPattern.Matches(partHtml))
                {
                    if (rows.Length > 0)
                    {
                        rows.Append(Environment.NewLine);
                    }

                    rows.AppendFormat(
                        "{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}",
                        m.Groups["id"].Value,
                        m.Groups["prov"].Value,
                        m.Groups["city"].Value,
                        m.Groups["city2"].Value,
                        m.Groups["py"].Value,
                        m.Groups["qh"].Value,
                        m.Groups["yb"].Value,
                        m.Groups["dj"].Value,
                        m.Groups["bw"].Value);
                }

                if (rows.Length > 0)
                {
                    Log(rows.ToString());
                }
            }

            /// <summary>
            /// Simple progress indicator: each call adds one dot to the window title and
            /// starts over once the title reaches <see cref="LoadingMaxLength"/> characters.
            /// </summary>
            private void Loading()
            {
                if (this.InvokeRequired)
                {
                    this.Invoke(new MethodInvoker(Loading));
                    return;
                }

                string current = this.Text;
                bool canGrow = current.StartsWith(LoadingPrefix, StringComparison.Ordinal)
                               && current.Length < LoadingMaxLength;
                this.Text = canGrow ? current + "." : LoadingPrefix;
            }

            /// <summary>
            /// Appends a line to the log text box, marshalling to the UI thread when
            /// called from another thread.
            /// </summary>
            /// <param name="msg">Text to append; a line break is added after it.</param>
            private void Log(string msg)
            {
                if (this.textBox1.InvokeRequired)
                {
                    this.Invoke(new MethodInvoker(() => Log(msg)));
                    return;
                }

                this.textBox1.AppendText(msg);
                this.textBox1.AppendText(System.Environment.NewLine);
            }

            /// <summary>
            /// Downloads the source of a page after a short random pause.
            /// </summary>
            /// <param name="url">Address of the page.</param>
            /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
            /// <exception cref="WebException">The download failed.</exception>
            private string GetHttpSource(string url)
            {
                // Be gentle with the remote site: pause 100-499 ms before each request.
                int delayMs;
                lock (DelayRandomLock)
                {
                    delayMs = DelayRandom.Next(100, 500);
                }

                Thread.Sleep(delayMs);

                // NOTE(review): Encoding.Default is the machine's ANSI code page, so the
                // decoded text depends on the OS locale — confirm it matches the site.
                using (var wc = new WebClient { Encoding = Encoding.Default })
                {
                    // NOTE(review): a Content-Type header on a request without a body
                    // looks unnecessary — confirm before removing.
                    wc.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
                    return wc.DownloadString(url);
                }
            }
        }
    }
    

      

     
     
  • 相关阅读:
    python系列十二:python3模块
    python系列十一:python3数据结构
    python系列十:python3函数
    python系列九:python3迭代器和生成器
    python系列八:Python3条件控制&循环语句
    python系列七:Python3字典dict
    python系列六:Python3元组tuple
    Linux Ubuntu 安装SSH服务
    Linux Ubuntu 查看IP
    Linux 基础命令
  • 原文地址:https://www.cnblogs.com/luqingfei/p/5056116.html
Copyright © 2011-2022 走看看