zoukankan      html  css  js  c++  java
  • 信息采集

    这两天编码实现了新蛋网手机信息的采集,Web页面数据通过WebClient类下载。使用时调用方法Gather(startIndex, endIndex)即可。希望能有帮助。

    代码如下:

    /* 
     * Created By ChinaAgan 2012-1-18
     * 
     */
    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Collections;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    
    using CnBlogCollector.Properties;
    
    namespace CnBlogCollector
    {
        /// <summary>
        /// Downloads newegg.com.cn sub-category listing pages, extracts the phone
        /// name, old price and current price of each listed item with regular
        /// expressions, and writes one text file per page.
        /// </summary>
        public class Collector
        {
            #region Fields
            // URL template of a listing page; {0} is replaced by the page index.
            private string pageUrlFormat = "http://www.newegg.com.cn/SubCategory/1043-{0}.htm";

            private WebClient wc = new WebClient();

            // The patterns are loop-invariant, so they are built once and reused for every page.
            private static readonly Regex NameRegex = new Regex(
                @"<p\s+class=""info""><a\s+href=(?<url>.+?)\s+title=""(?<title>.+?)"">(?<content>.+?)</a>",
                RegexOptions.IgnoreCase | RegexOptions.Compiled);

            private static readonly Regex OldPriceRegex = new Regex(
                @"<p\s+class=""bypast""><span>¥(?<OldPrice>.+?)</span></p>",
                RegexOptions.IgnoreCase | RegexOptions.Compiled);

            private static readonly Regex NewPriceRegex = new Regex(
                @"<p\s+class=""current""><strong\s+class=""price""><span>¥</span>(?<NewPrice>\d+?\..+?)</strong></p>",
                RegexOptions.IgnoreCase | RegexOptions.Compiled);
            #endregion


            #region Create directory
            /// <summary>
            /// Ensures that a directory exists, creating it when it does not.
            /// </summary>
            /// <param name="path">Directory path; may be relative.</param>
            /// <returns>The absolute form of <paramref name="path"/> as produced by <see cref="Path.GetFullPath(string)"/>.</returns>
            public string CreateFolderIfNot(string path)
            {
                string fullPath = Path.GetFullPath(path);
                if (!Directory.Exists(fullPath))
                {
                    Directory.CreateDirectory(fullPath);
                }
                return fullPath;
            }
            #endregion

            #region Gather page data
            /// <summary>
            /// Downloads the listing pages with indexes from <paramref name="startIndex"/>
            /// up to, but not including, <paramref name="endIndex"/> and writes the items
            /// found on page <c>i</c> to the file <c>i.txt</c> inside the folder configured
            /// by <c>Settings.Default.OutPath</c>. An existing file is overwritten.
            /// </summary>
            /// <param name="startIndex">Index of the first page to download (inclusive).</param>
            /// <param name="endIndex">Index at which to stop (exclusive).</param>
            public void Gather(int startIndex, int endIndex)
            {
                // NOTE(review): the proxy host and credentials are hard-coded placeholders;
                // real values should come from configuration, not from source code.
                WebProxy webProxy = new WebProxy("proxy.cn1.global.***.com:8080");
                webProxy.Credentials = new System.Net.NetworkCredential("user", "password");
                wc.Proxy = webProxy;

                // The pages are decoded as GB2312, and the output files use the same encoding.
                Encoding gb2312 = Encoding.GetEncoding("GB2312");

                for (int i = startIndex; i < endIndex; i++)
                {
                    // Download the page and strip CRLF line breaks so that the
                    // patterns can match markup that spans several lines.
                    string url = string.Format(pageUrlFormat, i.ToString());
                    string mainData = gb2312.GetString(wc.DownloadData(url)).Replace("\r\n", "");

                    List<string> nameList = ExtractGroup(NameRegex, mainData, "content");
                    List<string> oldPriceList = ExtractGroup(OldPriceRegex, mainData, "OldPrice");
                    List<string> newPriceList = ExtractGroup(NewPriceRegex, mainData, "NewPrice");

                    StringBuilder outContent = new StringBuilder();
                    for (int k = 0; k < nameList.Count; k++)
                    {
                        // The three lists are matched independently, so a price list can be
                        // shorter than the name list; a missing entry is written as an empty
                        // string instead of throwing ArgumentOutOfRangeException.
                        // NOTE(review): when an item in the middle of the page lacks a price,
                        // later prices shift up by one row - confirm against the real markup.
                        outContent.AppendFormat(
                            "手机名称:{0},原价:{1},现价:{2}",
                            nameList[k],
                            ValueOrEmpty(oldPriceList, k),
                            ValueOrEmpty(newPriceList, k));
                        outContent.Append("\r\n");
                    }

                    // Original author's note: the current price and symbols such as "&32;"
                    // (presumably HTML entities) still need handling.

                    // Path.Combine inserts the directory separator when OutPath does not end
                    // with one, so the file always lands inside the output folder.
                    string pth = Path.Combine(CreateFolderIfNot(Settings.Default.OutPath), i + ".txt");

                    // WriteAllText creates the file or overwrites an existing one.
                    File.WriteAllText(pth, outContent.ToString(), gb2312);
                }
            }

            /// <summary>
            /// Collects the value of one named group from every match of a pattern.
            /// </summary>
            private static List<string> ExtractGroup(Regex regex, string input, string groupName)
            {
                List<string> values = new List<string>();
                foreach (Match match in regex.Matches(input))
                {
                    values.Add(match.Groups[groupName].Value);
                }
                return values;
            }

            /// <summary>
            /// Returns the element at <paramref name="index"/>, or an empty string when the list is too short.
            /// </summary>
            private static string ValueOrEmpty(List<string> values, int index)
            {
                return index < values.Count ? values[index] : String.Empty;
            }
            #endregion
        }
    }
    
  • 相关阅读:
    c#读取.config文件内容
    c# 读取配置文件方法
    C# Log4net详细说明
    C# 运算符集
    LeetCode 69_ x 的平方根
    LeetCode 172 _ 阶乘后的零
    LeetCode 171 _ Excel表列序号
    LeetCode 88 _ 合并两个有序数组
    LeetCode 581 _ 最短无序连续子数组
    LeetCode 283 _ 移动零
  • 原文地址:https://www.cnblogs.com/chinaagan/p/2325231.html
Copyright © 2011-2022 走看看