zoukankan      html  css  js  c++  java
  • 网站数据采集程序(爬虫)

    采集数据无非就是三步,抓取页面,分析数据,入库。

    第一步:抓取页面

     抓取页面也是在网上找的例子,主要是用到了2个方法

    1,获取网站内容;2,清除html标签。具体看代码:

    /// <summary>
            /// Downloads the page at <paramref name="url"/> and returns its body as text.
            /// </summary>
            /// <remarks>
            /// This is a best-effort call: every failure (bad URL, network error, timeout)
            /// is reported as an empty string rather than an exception. The method sleeps
            /// for 500 ms before each request.
            /// </remarks>
            /// <param name="url">URL of the page to download.</param>
            /// <returns>The response body, or an empty string when the request failed.</returns>
            public string GetContentUrl(string url)
            {
                try
                {
                    // Pause before every request so consecutive calls are spaced out.
                    System.Threading.Thread.Sleep(500);
                    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
                    // Send a browser-like User-Agent so anti-scraping filters do not reject the request.
                    req.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215; CrazyCoder.cn;www.aligong.com)";
                    req.ReadWriteTimeout = 30000;
                    req.Timeout = 300000;
                    req.Proxy = null;

                    // Stacked usings release the response, its stream and the reader on every
                    // path, including when reading the body throws part-way through.
                    using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                    using (Stream body = response.GetResponseStream())
                    using (StreamReader sr = new StreamReader(body))
                    {
                        return sr.ReadToEnd();
                    }
                }
                catch (Exception ex)
                {
                    // Callers treat an empty string as "page unavailable"; keep that contract
                    // but leave a trace of the reason for debugging.
                    System.Diagnostics.Debug.WriteLine("GetContentUrl failed for '" + url + "': " + ex.Message);
                    return string.Empty;
                }
            }
    View Code
     /// <summary>
     /// Removes HTML tags and <c>&amp;nbsp;</c> entities from a fragment and trims the result.
     /// </summary>
     /// <remarks>
     /// A tag is the text from the first '&lt;' up to the next '&gt;' that follows it.
     /// Every occurrence of that exact tag text is removed in one step. A '&gt;' that
     /// appears before the first '&lt;' is ordinary text and is left untouched.
     /// </remarks>
     /// <param name="ContentStr">HTML fragment; null is treated as empty.</param>
     /// <returns>The fragment with tags and <c>&amp;nbsp;</c> removed, trimmed.</returns>
     public string ClearLable(string ContentStr)
     {
         if (string.IsNullOrEmpty(ContentStr))
         {
             return string.Empty;
         }

         while (true)
         {
             int begin = ContentStr.IndexOf('<');
             if (begin < 0)
             {
                 break;
             }

             // Search for the closing bracket AFTER the opening one. Looking for the first
             // '>' anywhere in the string produced a negative Substring length (and an
             // exception) whenever a stray '>' preceded the first tag.
             int end = ContentStr.IndexOf('>', begin + 1);
             if (end < 0)
             {
                 break;
             }

             string tag = ContentStr.Substring(begin, end - begin + 1);
             ContentStr = ContentStr.Replace(tag, "");
         }

         ContentStr = ContentStr.Replace("&nbsp;", "");
         return ContentStr.Trim();
     }
    View Code

    第二步:分析数据

    先按固定的html起止标记截取出目标代码段,再用正则表达式匹配其中的每一项,把匹配结果放入集合后逐条分析。

     /// <summary>
     /// Downloads a list page and returns the <c>&lt;li&gt;</c> fragments found in its house list.
     /// </summary>
     /// <param name="url">URL of the list page.</param>
     /// <returns>
     /// The matched list-item fragments. An empty list (never null) is returned when the
     /// page could not be downloaded or was blank, so callers can enumerate the result
     /// without a null check.
     /// </returns>
     public List<String> GetListURl(string url)
     {
         string htmlContent = GetContentUrl(url);
         if (string.IsNullOrWhiteSpace(htmlContent))
         {
             return new List<String>();
         }

         return DealHtmlContentList(htmlContent);
     }
    /// <summary>
    /// Extracts every <c>&lt;li&gt;…&lt;/li&gt;</c> fragment from the house list of a list page.
    /// </summary>
    /// <remarks>
    /// The list is the text between the opening
    /// <c>&lt;ul id="house-lst" class="house-lst"&gt;</c> tag and the first <c>&lt;/ul&gt;</c>
    /// after it. Only that slice is scanned.
    /// </remarks>
    /// <param name="htmlContent">Full HTML of the list page.</param>
    /// <returns>The matched fragments including their li tags; empty when the list is not present.</returns>
    private List<String> DealHtmlContentList(string htmlContent)
    {
        List<string> listStr = new List<string>();
        if (string.IsNullOrEmpty(htmlContent))
        {
            return listStr;
        }

        string sLi = "<ul id=\"house-lst\" class=\"house-lst\">";
        string eLi = "</ul>";

        // Ordinal search: these are markup tokens, not linguistic text. A hit at index 0 is valid.
        int start = htmlContent.IndexOf(sLi, StringComparison.Ordinal);
        if (start < 0)
        {
            return listStr;
        }

        // Search from the opening tag onwards instead of copying the tail of the page first.
        int end = htmlContent.IndexOf(eLi, start, StringComparison.Ordinal);
        if (end < 0)
        {
            return listStr;
        }

        string arryLi = htmlContent.Substring(start, end - start);

        // Singleline lets '.' span line breaks inside an item.
        Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
        for (Match mch = regli.Match(arryLi); mch.Success; mch = mch.NextMatch())
        {
            listStr.Add(mch.Value);
        }

        return listStr;
    }
    View Code

    这是获取网页内容代码,截取到列表页集合那段html代码。匹配正则变成集合返回。这只是列表页的数据

     /// <summary>
     /// Downloads a detail page and converts it into an INSERT statement template.
     /// </summary>
     /// <param name="url">URL of the detail page.</param>
     /// <returns>
     /// The value produced by <c>DealHtmlContentDetail</c> for the downloaded page,
     /// or null when the page came back empty or blank.
     /// </returns>
     public string GetListDetail(string url) {
         string pageHtml = GetContentUrl(url);
         return string.IsNullOrWhiteSpace(pageHtml)
             ? null
             : DealHtmlContentDetail(pageHtml);
     }
     9 
    /// <summary>
    /// Maps the label text of a detail row to the LJHostInfo column that stores its value.
    /// NOTE(review): the labels end with an ASCII colon exactly as they appear in this
    /// file; confirm that matches the punctuation used by the real page markup.
    /// </summary>
    private static readonly Dictionary<string, string> DetailLabelColumns = new Dictionary<string, string>
    {
        { "建筑年代:", "BuildYear" },
        { "建筑类型:", "BuildType" },
        { "物业费用:", "PropertyPrice" },
        { "物业公司:", "PropertyCompany" },
        { "开发商:", "Developers" },
        { "楼栋总数:", "FloorNum" },
        { "房屋总数:", "HousesNum" },
        { "所属学区:", "SchoolAddress" },
        { "附近门店:", "NearbyAddress" },
    };

    /// <summary>
    /// Makes a scraped value safe to embed in the returned template: single quotes are
    /// doubled for the SQL string literal, and braces are doubled so a later
    /// string.Format / AppendFormat call does not read them as placeholders.
    /// </summary>
    private static string EscapeForSqlTemplate(string value)
    {
        return value.Replace("'", "''").Replace("{", "{{").Replace("}", "}}");
    }

    /// <summary>
    /// Builds an INSERT statement template for table LJHostInfo from a detail page.
    /// </summary>
    /// <remarks>
    /// Only the text between the first <c>&lt;ol&gt;</c> and the following <c>&lt;/ol&gt;</c>
    /// is examined. Each <c>&lt;li&gt;</c> in it is matched by its <c>&lt;label&gt;</c> text
    /// against a fixed set of known labels; unknown labels are skipped. The returned
    /// text still contains the placeholders <c>{0}</c> (Title) and <c>{1}</c> (AveragePrice),
    /// which the caller fills in with a composite-format call.
    /// </remarks>
    /// <param name="htmlContent">Full HTML of the detail page.</param>
    /// <returns>The INSERT template, or an empty string when the page has no detail list.</returns>
    private string DealHtmlContentDetail(string htmlContent) {
        if (string.IsNullOrEmpty(htmlContent))
        {
            return string.Empty;
        }

        const string openTag = "<ol>";
        const string closeTag = "</ol>";

        // Ordinal search; a match at index 0 is valid.
        int start = htmlContent.IndexOf(openTag, StringComparison.Ordinal);
        if (start < 0)
        {
            return string.Empty;
        }

        int close = htmlContent.IndexOf(closeTag, start, StringComparison.Ordinal);
        if (close < 0)
        {
            return string.Empty;
        }

        string detailBlock = htmlContent.Substring(start, close - start);

        Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
        Regex reglable = new Regex("<label>(.*?)</label>", RegexOptions.Singleline);
        Regex regspan = new Regex("<span class=\"other\">(.*?)</span>", RegexOptions.Singleline);

        // Title and AveragePrice are supplied later by the caller through {0} and {1}.
        List<string> columns = new List<string> { "Title", "AveragePrice" };
        List<string> values = new List<string> { "'{0}'", "'{1}'" };

        for (Match row = regli.Match(detailBlock); row.Success; row = row.NextMatch())
        {
            Match label = reglable.Match(row.Value);
            if (!label.Success)
            {
                continue;
            }

            string labelText = ClearLable(label.Value);
            string column;
            if (!DetailLabelColumns.TryGetValue(labelText, out column))
            {
                continue;
            }

            Match span = regspan.Match(row.Value);

            // The store row has no "other" span; its value is the whole row text minus the label.
            string value = labelText == "附近门店:"
                ? ClearLable(row.Value).Replace("附近门店:", "").Trim().Replace(" ", "")
                : ClearLable(span.Value);

            columns.Add(column);
            values.Add("'" + EscapeForSqlTemplate(value) + "'");

            // Two rows carry a second label/span pair in the same <li>:
            // the plot ratio next to the building count, the green rate next to the house count.
            string pairedColumn = null;
            if (labelText == "楼栋总数:")
            {
                pairedColumn = "Rate";
            }
            else if (labelText == "房屋总数:")
            {
                pairedColumn = "GreenRates";
            }

            if (pairedColumn != null && label.NextMatch().Success)
            {
                columns.Add(pairedColumn);
                values.Add("'" + EscapeForSqlTemplate(ClearLable(span.NextMatch().Value)) + "'");
            }
        }

        return "INSERT INTO LJHostInfo(" + string.Join(",", columns) + ") "
            + "VALUES(" + string.Join(",", values) + ")";
    }
    View Code

     需要注意的就是匹配数据去掉html标签,加入sql语句。重复的匹配再插入

    第三步:多线程任务类

     /// <summary>
     /// Processes the items of one list page: for each item it fetches the detail page,
     /// appends the resulting INSERT statement to the form's SQL buffer and reports progress.
     /// </summary>
     public class ThreadWorker
     {
         // Private lock target; locking on "this" would let outside code contend for the same monitor.
         private readonly object gate = new object();
         private ClumbForm cForm;
         private List<String> list;
         private string siteUrl = "@$#@$#@$#@$@#$#@$#@$#@$";// site root, deliberately obfuscated by the author
         private LianJiaCaiJi caiji=new LianJiaCaiJi();

         /// <summary>
         /// Creates a worker for one page of list items.
         /// </summary>
         /// <param name="cf">Form that receives counters, SQL text and progress messages.</param>
         /// <param name="_list">List-item HTML fragments to process; null is treated as empty.</param>
         public ThreadWorker(ClumbForm cf, List<String> _list)
         {
             cForm = cf;
             list = _list ?? new List<String>();
         }

         /// <summary>
         /// Processes every list item that contains an <c>&lt;h2&gt;</c> heading.
         /// </summary>
         public void StartWorker()
         {
             Regex regh2 = new Regex("<h2>(.*?)</h2>", RegexOptions.Singleline);
             Regex regspan = new Regex("<span class=\"num\">(.*?)</span>", RegexOptions.Singleline);
             foreach (var item in list)
             {
                 Match m = regh2.Match(item);
                 if (!m.Success)
                 {
                     continue;
                 }

                 lock (gate)
                 {
                     Match ms = regspan.Match(item);
                     string title = GetQuotationContent(m.Value, "title");
                     string price = ms.Success ? caiji.ClearLable(ms.Value) : "0.00";

                     cForm.TotalCount += 1;

                     // GetListDetail returns null or empty when the detail page is unavailable
                     // or has no detail list; AppendFormat would throw on a null format.
                     string sqlTemplate = caiji.GetListDetail(siteUrl + GetQuotationContent(m.Value, "href"));
                     if (!string.IsNullOrEmpty(sqlTemplate))
                     {
                         // Title and price land inside single-quoted SQL literals, so double any quote.
                         cForm.SBINSERTSQL.AppendFormat(sqlTemplate, title.Replace("'", "''"), price.Replace("'", "''"));
                     }

                     cForm.ShowMsg("已完成:" + title + "小区,价格:"+ price + " 完成时间:" + System.DateTime.Now.ToString());
                     cForm.ShowLableMsg(cForm.TotalCount+"");
                 }
             }
             cForm.ShowMsg("已完成第:" + cForm.TotalCount + "页数据采集, 完成时间:" + System.DateTime.Now.ToString());
         }

         /// <summary>
         /// Returns the double-quoted value of attribute <paramref name="tag"/> in an HTML fragment.
         /// </summary>
         /// <param name="content">HTML fragment to search.</param>
         /// <param name="tag">Attribute name, e.g. "href" or "title".</param>
         /// <returns>
         /// The text between the quotes of the first <c>tag="…"</c>, or an empty string
         /// when the attribute or its closing quote is missing.
         /// </returns>
         private string GetQuotationContent(string content,string tag) {
             // Look for the full tag=" marker; the old "IndexOf + 2 >= 0" test could never fail,
             // so a missing attribute produced garbage or an exception.
             string marker = tag + "=\"";
             int markerAt = content.IndexOf(marker, StringComparison.Ordinal);
             if (markerAt < 0)
             {
                 return "";
             }

             int valueStart = markerAt + marker.Length;
             int valueEnd = content.IndexOf('"', valueStart);
             if (valueEnd < 0)
             {
                 return "";
             }

             return content.Substring(valueStart, valueEnd - valueStart);
         }

     }
    View Code

     然后是任务执行

     /// <summary>
     /// Validates the page range and starts the crawl on a thread-pool thread.
     /// </summary>
     /// <param name="sender">Event source (unused).</param>
     /// <param name="e">Event data (unused).</param>
     private void btnCaiJi_Click(object sender, EventArgs e)
     {
         // Reset state from any previous run.
         listBoxMessage.Items.Clear();
         IsComplete = false;

         // TryParse rejects blank and non-numeric input without throwing.
         int pageStart;
         int pageEnd;
         if (!int.TryParse(txtPageStart.Text, out pageStart) || !int.TryParse(txtPageEnd.Text, out pageEnd))
         {
             MessageBox.Show("请输入采集页数!");
             return;
         }

         // The limit applies to both ends of the range, not only the start page.
         if (pageStart > 100 || pageEnd > 100)
         {
             MessageBox.Show("采集页数只能在100以内!");
             return;
         }

         ShowMsg("开始时间:" + System.DateTime.Now.ToString() + " 处理中请等待....");
         _cts = new CancellationTokenSource();

         // Capture the values here on the UI thread; the queued delegate runs on a pool
         // thread and must not touch the text box or re-read the _cts field.
         CancellationToken token = _cts.Token;
         ThreadPool.QueueUserWorkItem(state => CountTo(pageStart, token));
     }
    21 
    /// <summary>
    /// Crawls list pages from <paramref name="countTo"/> up to the page number entered
    /// in txtPageEnd, one page at a time, until finished or cancelled.
    /// </summary>
    /// <param name="countTo">First page number to crawl; incremented for each page.</param>
    /// <param name="ct">Token checked before each page to stop early.</param>
    private void CountTo(int countTo, CancellationToken ct)
    {
        // This method runs on a pool thread: read the end page once, through the UI
        // thread, instead of touching the text box on every loop iteration.
        string endText = (string)this.Invoke(new Func<string>(() => txtPageEnd.Text));
        int lastPage;
        if (!int.TryParse(endText, out lastPage))
        {
            // Unparseable end page: skip the loop and report completion.
            lastPage = countTo - 1;
        }

        for (; countTo <= lastPage; countTo++)
        {
            // Check before downloading so a cancelled run does not fetch one more page.
            if (ct.IsCancellationRequested)
            {
                break;
            }

            // Guard against a null list so the worker can always enumerate it.
            List<String> pageItems = caiji.GetListURl(string.Format(url, countTo)) ?? new List<String>();
            tw = new ThreadWorker(this, pageItems);

            // Invoke runs StartWorker on the thread that owns this control.
            // NOTE(review): StartWorker downloads detail pages, so that thread is blocked
            // while a page is processed; calling it directly here would need the form
            // members it touches to be thread-safe — confirm before changing.
            this.Invoke(new Action(tw.StartWorker));
            Thread.Sleep(200);
        }
        IsComplete = true;
        ShowMsg("结束时间:" + System.DateTime.Now.ToString() + " 采集完成,总条数:"+TotalCount);
    }
    43 
    /// <summary>
    /// Appends a progress message to the message list, scrolls to it and updates the
    /// enabled state of the action buttons from <c>IsComplete</c>.
    /// </summary>
    /// <remarks>
    /// May be called from any thread: when called off the control's thread it
    /// re-invokes itself through <c>Invoke</c>. Failures are never propagated.
    /// </remarks>
    /// <param name="msg">Message to display; null is shown as an empty line.</param>
    public void ShowMsg(string msg)
    {
        try
        {
            if (listBoxMessage.InvokeRequired)
            {
                this.Invoke(new GetMsgDelegate(ShowMsg), new object[] { msg });
                return;
            }

            listBoxMessage.Items.Add(msg ?? string.Empty);
            // Select by index: selecting by item picks the FIRST equal string, which is
            // the wrong row when the same message text was added before.
            listBoxMessage.SelectedIndex = listBoxMessage.Items.Count - 1;

            // Buttons are usable only once the crawl has finished.
            btnCaiJi.Enabled = IsComplete;
            btnExceSql.Enabled = IsComplete;
        }
        catch (InvalidOperationException)
        {
            // Expected when the form is closing or already disposed; nothing to display to.
        }
        catch (Exception ex)
        {
            // Progress display is best-effort and must not crash the crawl, but record why it failed.
            System.Diagnostics.Debug.WriteLine("ShowMsg failed: " + ex);
        }
    }
    View Code

    执行时界面

  • 相关阅读:
    跨境支付与业务流程介绍
    idea 编译级别的设置
    WebSocket客户端学习
    2018年 新年目标
    enum 的使用
    前段技术学习网站
    堆 和 栈 区别
    CMU Deep Learning 2018 by Bhiksha Raj 学习记录(11)) Lecture 12: Recurrent Neural Networks 2
    CMU Deep Learning 2018 by Bhiksha Raj 学习记录(10)
    CMU Deep Learning 2018 by Bhiksha Raj 学习记录(9)
  • 原文地址:https://www.cnblogs.com/starts/p/5381523.html
Copyright © 2011-2022 走看看