using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Crawler.Model;

namespace Crawler.DataService
{
    /// <summary>
    /// Stores crawled categories in the "Category" database table and keeps a
    /// JSON text record of every saved batch under <see cref="ObjectFactory.DataPath"/>.
    /// </summary>
    /// <remarks>
    /// The class deliberately does not implement <c>IRepository&lt;T&gt;</c>
    /// (the original declaration had that base commented out).
    /// </remarks>
    public class CategoryRepository
    {
        private Logger logger = new Logger(typeof(CategoryRepository));

        /// <summary>
        /// Inserts the categories into the database and then writes the text
        /// record on a background thread.
        /// </summary>
        /// <param name="categoryList">Categories to persist; a null or empty list is ignored.</param>
        public void Save(List<Category> categoryList)
        {
            // Nothing to insert: an empty batch would otherwise reach the database
            // layer as an empty SQL command.
            if (categoryList == null || categoryList.Count == 0)
            {
                return;
            }

            SqlHelper.InsertList<Category>(categoryList, "Category");

            // Fire-and-forget is safe here because SaveList catches and logs every
            // exception itself. Task.Run replaces Delegate.BeginInvoke, which was
            // never paired with EndInvoke.
            Task.Run(() => SaveList(categoryList));
        }

        /// <summary>
        /// Returns the categories whose CategoryLevel column equals <paramref name="level"/>.
        /// </summary>
        /// <param name="level">Category level to filter on.</param>
        /// <returns>The matching categories as mapped by <c>SqlHelper.QueryList</c>.</returns>
        public List<Category> QueryListByLevel(int level)
        {
            // The only interpolated value is an int, so it cannot carry SQL text.
            string sql = string.Format("SELECT * FROM category WHERE categorylevel={0};", level);
            return SqlHelper.QueryList<Category>(sql);
        }

        /// <summary>
        /// Appends the categories, serialized as one JSON line, to a timestamped
        /// text file below <see cref="ObjectFactory.DataPath"/>.
        /// Failures are logged and never propagated.
        /// </summary>
        /// <param name="categoryList">Categories to record.</param>
        public void SaveList(List<Category> categoryList)
        {
            try
            {
                string recordFileName = string.Format("{0}_Category.txt", DateTime.Now.ToString("yyyyMMddHHmmss"));
                string totolPath = Path.Combine(ObjectFactory.DataPath, recordFileName);
                string json = JsonConvert.SerializeObject(categoryList);

                // CreateDirectory is a no-op for an existing directory; skip it when
                // the path has no directory component (it rejects empty strings).
                string directory = Path.GetDirectoryName(totolPath);
                if (!string.IsNullOrEmpty(directory))
                {
                    Directory.CreateDirectory(directory);
                }

                // AppendText creates the file when it does not exist yet.
                using (StreamWriter sw = File.AppendText(totolPath))
                {
                    sw.WriteLine(json);
                }
            }
            catch (Exception e)
            {
                logger.Error("CategoryRepository.SaveList出现异常", e);
            }
        }
    }
}
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Crawler.Model;

namespace Crawler.DataService
{
    /// <summary>
    /// Stores crawled commodities, either in the sharded JD_Commodity_NNN
    /// database tables or as JSON text records under <see cref="ObjectFactory.DataPath"/>.
    /// </summary>
    /// <remarks>
    /// The class deliberately does not implement <c>IRepository&lt;T&gt;</c>
    /// (the original declaration had that base commented out).
    /// </remarks>
    public class CommodityRepository
    {
        /// <summary>Number of JD_Commodity_NNN shard tables (001..030).</summary>
        private const int TableCount = 30;

        private Logger logger = new Logger(typeof(CommodityRepository));

        /// <summary>
        /// Inserts the commodities into their shard tables, one insert batch per table.
        /// </summary>
        /// <param name="commodityList">Commodities to persist; a null or empty list is ignored.</param>
        public void SaveList(List<Commodity> commodityList)
        {
            if (commodityList == null || commodityList.Count == 0)
            {
                return;
            }

            foreach (IGrouping<string, Commodity> shard in commodityList.GroupBy(c => GetTableName(c)))
            {
                SqlHelper.InsertList<Commodity>(shard.ToList(), shard.Key);
            }
        }

        /// <summary>
        /// Maps a commodity to its shard table name, "JD_Commodity_001" .. "JD_Commodity_030",
        /// based on its ProductId.
        /// </summary>
        private string GetTableName(Commodity commodity)
        {
            // C# '%' keeps the sign of the dividend; normalise so that a negative
            // ProductId still lands on an existing shard. Non-negative ids map
            // exactly as before.
            long shard = ((commodity.ProductId % TableCount) + TableCount) % TableCount + 1;
            return string.Format("JD_Commodity_{0}", shard.ToString("000"));
        }

        /// <summary>
        /// Appends the commodities of one result page, serialized as one JSON line,
        /// to "{CategoryLevel}/{ParentCode}/{Id}/{page}.txt" below
        /// <see cref="ObjectFactory.DataPath"/>. Failures are logged and never propagated.
        /// </summary>
        /// <param name="commodityList">Commodities found on the page.</param>
        /// <param name="category">Category the page belongs to; determines the folder.</param>
        /// <param name="page">Page number; determines the file name.</param>
        public void SaveList(List<Commodity> commodityList, Category category, int page)
        {
            try
            {
                string recordFileName = string.Format("{0}/{1}/{2}/{3}.txt", category.CategoryLevel, category.ParentCode, category.Id, page);
                string totolPath = Path.Combine(ObjectFactory.DataPath, recordFileName);
                string json = JsonConvert.SerializeObject(commodityList);

                // CreateDirectory is a no-op for an existing directory; skip it when
                // the path has no directory component (it rejects empty strings).
                string directory = Path.GetDirectoryName(totolPath);
                if (!string.IsNullOrEmpty(directory))
                {
                    Directory.CreateDirectory(directory);
                }

                // AppendText creates the file when it does not exist yet.
                using (StreamWriter sw = File.AppendText(totolPath))
                {
                    sw.WriteLine(json);
                }
            }
            catch (Exception e)
            {
                logger.Error("CommodityRepository.SaveList出现异常", e);
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Crawler;

namespace Crawler.DataService
{
    /// <summary>
    /// Drops and re-creates the crawler's database tables.
    /// Use with care: every call destroys the existing data of the affected tables.
    /// </summary>
    public class DBInit
    {
        /// <summary>SQL Server error number raised when DROP targets a missing object.</summary>
        private const int CannotDropMissingObjectError = 3701;

        /// <summary>Number of JD_Commodity_NNN shard tables (001..030).</summary>
        private const int CommodityTableCount = 30;

        private static Logger logger = new Logger(typeof(DBInit));

        /// <summary>
        /// Drops all JD_Commodity_001..030 tables and creates them again, empty.
        /// Use with care: all stored commodities are deleted.
        /// </summary>
        /// <exception cref="Exception">
        /// Any database error, except "table does not exist" during the drop phase,
        /// is logged and rethrown.
        /// </exception>
        public void InitCommodityTable()
        {
            #region Delete
            try
            {
                StringBuilder sb = new StringBuilder();
                for (int i = 1; i <= CommodityTableCount; i++)
                {
                    sb.AppendFormat("DROP TABLE [dbo].[JD_Commodity_{0}];", i.ToString("000"));
                }
                SqlHelper.ExecuteNonQuery(sb.ToString());
            }
            catch (Exception ex)
            {
                if (IsMissingTableError(ex))
                {
                    logger.Warn("初始化数据库InitCommodityTable删除的时候,原表不存在");
                }
                else
                {
                    logger.Error("初始化数据库InitCommodityTable失败", ex);
                    throw; // keep the original stack trace
                }
            }
            #endregion Delete

            #region Create
            try
            {
                StringBuilder sb = new StringBuilder();
                for (int i = 1; i <= CommodityTableCount; i++)
                {
                    sb.AppendFormat(@"CREATE TABLE [dbo].[JD_Commodity_{0}]( [Id] [int] IDENTITY(1,1) NOT NULL, [ProductId] [bigint] NULL, [CategoryId] [int] NULL, [Title] [nvarchar](500) NULL, [Price] [decimal](18, 2) NULL, [Url] [varchar](1000) NULL, [ImageUrl] [varchar](1000) NULL, CONSTRAINT [PK_JD_Commodity_{0}] PRIMARY KEY CLUSTERED ( [Id] ASC )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] ) ON [PRIMARY];", i.ToString("000"));
                }
                SqlHelper.ExecuteNonQuery(sb.ToString());
            }
            catch (Exception ex)
            {
                logger.Error("InitCommodityTable创建异常", ex);
                throw; // keep the original stack trace
            }
            #endregion Create
        }

        /// <summary>
        /// Drops the Category table and creates it again, empty.
        /// Use with care: all stored categories are deleted.
        /// </summary>
        /// <exception cref="Exception">
        /// Any database error, except "table does not exist" during the drop phase,
        /// is logged and rethrown.
        /// </exception>
        public void InitCategoryTable()
        {
            #region Delete
            try
            {
                SqlHelper.ExecuteNonQuery("DROP TABLE [dbo].[Category];");
            }
            catch (Exception ex)
            {
                if (IsMissingTableError(ex))
                {
                    logger.Warn("初始化数据库InitCategoryTable删除的时候,原表不存在");
                }
                else
                {
                    logger.Error("初始化数据库InitCategoryTable失败", ex);
                    throw; // keep the original stack trace
                }
            }
            #endregion Delete

            #region Create
            try
            {
                SqlHelper.ExecuteNonQuery(@"CREATE TABLE [dbo].[Category]( [Id] [int] IDENTITY(1,1) NOT NULL, [Code] [varchar](100) NULL, [ParentCode] [varchar](100) NULL, [CategoryLevel] [int] NULL, [Name] [nvarchar](50) NULL, [Url] [varchar](1000) NULL, [State] [int] NULL, CONSTRAINT [PK_Category] PRIMARY KEY CLUSTERED ( [Id] ASC )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] ) ON [PRIMARY];");
            }
            catch (Exception ex)
            {
                logger.Error("初始化数据库InitCategoryTable 创建失败", ex);
                throw; // keep the original stack trace
            }
            #endregion Create
        }

        /// <summary>
        /// Tells whether <paramref name="ex"/> reports a DROP on a table that does not exist
        /// (or that the caller may not drop).
        /// </summary>
        private static bool IsMissingTableError(Exception ex)
        {
            // Preferred check: the server's error number is independent of the
            // language the server reports its messages in.
            System.Data.SqlClient.SqlException sqlEx = ex as System.Data.SqlClient.SqlException;
            if (sqlEx != null && sqlEx.Number == CannotDropMissingObjectError)
            {
                return true;
            }

            // Fallback kept from the original implementation: match the tail of the
            // Chinese-locale message text.
            return ex.Message != null && ex.Message.Contains("因为它不存在,或者您没有所需的权限。");
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Crawler.DataService
{
    /// <summary>
    /// Minimal persistence contract for entities of type <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">
    /// Entity type; must be a reference type (a <c>new()</c> constraint is intentionally not required).
    /// </typeparam>
    public interface IRepository<T>
        where T : class
    {
        /// <summary>Persists a single entity.</summary>
        /// <param name="entity">Entity to persist.</param>
        void Save(T entity);

        /// <summary>Persists a batch of entities.</summary>
        /// <param name="entity">Entities to persist.</param>
        void SaveList(List<T> entity);
    }
}
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Crawler.DataService
{
    /// <summary>
    /// Holds application-wide settings read from the configuration file.
    /// </summary>
    public class ObjectFactory
    {
        /// <summary>
        /// Value of the "DataPath" entry in appSettings, read once when the type is
        /// first used. The repositories in this namespace combine it with relative
        /// file names to build their text-record paths.
        /// </summary>
        public static string DataPath =
            ConfigurationManager.AppSettings["DataPath"];
    }
}
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.Data.SqlClient;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Threading.Tasks;

namespace Crawler.DataService
{
    /// <summary>
    /// Small ADO.NET helper for the "mvc5" connection string: executes raw SQL,
    /// maps query results onto objects by property name, and inserts objects by
    /// reflecting over their public properties.
    /// </summary>
    public class SqlHelper
    {
        private static Logger logger = new Logger(typeof(SqlHelper));
        private static string ConnStr = ConfigurationManager.ConnectionStrings["mvc5"].ConnectionString;

        /// <summary>
        /// Executes a SQL batch without an explicit transaction.
        /// </summary>
        /// <param name="sql">SQL text to execute.</param>
        public static void ExecuteNonQuery(string sql)
        {
            using (SqlConnection sqlConn = new SqlConnection(ConnStr))
            using (SqlCommand cmd = new SqlCommand(sql, sqlConn))
            {
                sqlConn.Open();
                cmd.ExecuteNonQuery();
            }
        }

        /// <summary>
        /// Executes a SQL batch inside a transaction; the transaction is committed on
        /// success and rolled back when execution fails.
        /// </summary>
        /// <param name="sql">SQL text to execute.</param>
        /// <exception cref="Exception">The execution error is rethrown after the rollback attempt.</exception>
        public static void ExecuteNonQueryWithTrans(string sql)
        {
            using (SqlConnection sqlConn = new SqlConnection(ConnStr))
            {
                sqlConn.Open();
                using (SqlTransaction trans = sqlConn.BeginTransaction())
                using (SqlCommand cmd = new SqlCommand(sql, sqlConn, trans))
                {
                    try
                    {
                        cmd.ExecuteNonQuery();
                        trans.Commit();
                    }
                    catch
                    {
                        // Roll back while the connection is still open. A transaction
                        // that has already ended reports a null Connection.
                        try
                        {
                            if (trans.Connection != null)
                            {
                                trans.Rollback();
                            }
                        }
                        catch (Exception rollbackEx)
                        {
                            // Never let a rollback failure hide the original error.
                            logger.Error("ExecuteNonQueryWithTrans rollback failed", rollbackEx);
                        }
                        throw; // keep the original stack trace
                    }
                }
            }
        }

        /// <summary>
        /// Runs a query and maps every row onto a new <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">Result type; each writable public property must have a same-named column.</typeparam>
        /// <param name="sql">SELECT statement to execute.</param>
        /// <returns>One object per row; empty when the query returns no rows.</returns>
        public static List<T> QueryList<T>(string sql) where T : new()
        {
            using (SqlConnection sqlConn = new SqlConnection(ConnStr))
            using (SqlCommand cmd = new SqlCommand(sql, sqlConn))
            {
                sqlConn.Open();
                using (SqlDataReader reader = cmd.ExecuteReader())
                {
                    return TransList<T>(reader);
                }
            }
        }

        /// <summary>
        /// Inserts one object into <paramref name="tableName"/>. Every readable public
        /// property except "Id" becomes a column of the same name.
        /// </summary>
        /// <param name="model">Object to insert.</param>
        /// <param name="tableName">Target table; must come from trusted code because it is placed in the SQL text as-is.</param>
        /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
        public static void Insert<T>(T model, string tableName) where T : new()
        {
            using (SqlConnection sqlConn = new SqlConnection(ConnStr))
            {
                sqlConn.Open();
                ExecuteInsert(sqlConn, model, tableName);
            }
        }

        /// <summary>
        /// Inserts every object of <paramref name="list"/> into <paramref name="tableName"/>
        /// over a single connection, one parameterized INSERT per object.
        /// </summary>
        /// <remarks>
        /// Rows are inserted in list order without an enclosing transaction: when a
        /// row fails, the rows before it stay inserted, the error propagates and the
        /// remaining rows are not attempted. An empty list is a no-op.
        /// </remarks>
        /// <param name="list">Objects to insert.</param>
        /// <param name="tableName">Target table; must come from trusted code because it is placed in the SQL text as-is.</param>
        /// <exception cref="ArgumentNullException"><paramref name="list"/> or one of its elements is null.</exception>
        public static void InsertList<T>(List<T> list, string tableName) where T : new()
        {
            if (list == null)
            {
                throw new ArgumentNullException("list");
            }
            if (list.Count == 0)
            {
                return; // an empty command text would be rejected by ADO.NET
            }

            using (SqlConnection sqlConn = new SqlConnection(ConnStr))
            {
                sqlConn.Open();
                foreach (T item in list)
                {
                    ExecuteInsert(sqlConn, item, tableName);
                }
            }
        }

        #region Private

        /// <summary>Builds and executes the parameterized INSERT for one object on an open connection.</summary>
        private static void ExecuteInsert<T>(SqlConnection sqlConn, T model, string tableName)
        {
            using (SqlCommand cmd = BuildInsertCommand(model, tableName))
            {
                cmd.Connection = sqlConn;
                cmd.ExecuteNonQuery();
            }
        }

        /// <summary>
        /// Creates "INSERT INTO table ([A],[B]) VALUES (@p0,@p1);" with one parameter per
        /// readable public property other than "Id".
        /// </summary>
        /// <remarks>
        /// Values travel as parameters instead of quoted text, so apostrophes and
        /// Unicode text are stored unchanged, numbers do not depend on the current
        /// culture, and a null property is stored as SQL NULL.
        /// </remarks>
        private static SqlCommand BuildInsertCommand<T>(T model, string tableName)
        {
            if (model == null)
            {
                throw new ArgumentNullException("model");
            }

            SqlCommand cmd = new SqlCommand();
            StringBuilder sbFields = new StringBuilder();
            StringBuilder sbValues = new StringBuilder();
            int index = 0;

            foreach (PropertyInfo p in model.GetType().GetProperties())
            {
                if (!p.CanRead || p.GetIndexParameters().Length > 0)
                {
                    continue;
                }
                if (p.Name.Equals("id", StringComparison.OrdinalIgnoreCase))
                {
                    continue; // the Id column is generated by the database
                }

                string parameterName = "@p" + index;
                index++;

                if (sbFields.Length > 0)
                {
                    sbFields.Append(',');
                    sbValues.Append(',');
                }
                sbFields.AppendFormat("[{0}]", p.Name);
                sbValues.Append(parameterName);

                cmd.Parameters.AddWithValue(parameterName, p.GetValue(model) ?? DBNull.Value);
            }

            cmd.CommandText = string.Format("INSERT INTO {0} ({1}) VALUES ({2});", tableName, sbFields, sbValues);
            return cmd;
        }

        /// <summary>Maps every remaining row of <paramref name="reader"/> onto a new <typeparamref name="T"/>.</summary>
        private static List<T> TransList<T>(SqlDataReader reader) where T : new()
        {
            List<T> tList = new List<T>();
            PropertyInfo[] properties = GetWritableProperties(typeof(T));
            while (reader.Read())
            {
                tList.Add(ReadRow<T>(reader, properties));
            }
            return tList;
        }

        /// <summary>
        /// Maps the first row of <paramref name="reader"/> onto a new <typeparamref name="T"/>;
        /// returns an object with default property values when there is no row.
        /// </summary>
        private static T TransModel<T>(SqlDataReader reader) where T : new()
        {
            if (reader.Read())
            {
                return ReadRow<T>(reader, GetWritableProperties(typeof(T)));
            }
            return new T();
        }

        /// <summary>Returns the public, non-indexed properties of <paramref name="type"/> that have a setter.</summary>
        private static PropertyInfo[] GetWritableProperties(Type type)
        {
            return type.GetProperties()
                       .Where(p => p.CanWrite && p.GetIndexParameters().Length == 0)
                       .ToArray();
        }

        /// <summary>Creates a <typeparamref name="T"/> from the reader's current row, matching columns to properties by name.</summary>
        private static T ReadRow<T>(SqlDataReader reader, PropertyInfo[] properties) where T : new()
        {
            T t = new T();
            foreach (PropertyInfo p in properties)
            {
                p.SetValue(t, ConvertDbValue(reader[p.Name], p.PropertyType));
            }
            return t;
        }

        /// <summary>
        /// Converts a column value to the property type. SQL NULL becomes null, which
        /// reflection stores as the default value for value-type properties.
        /// </summary>
        private static object ConvertDbValue(object value, Type propertyType)
        {
            if (value == null || value is DBNull)
            {
                return null;
            }

            // Convert.ChangeType cannot target Nullable<T>; convert to the underlying type.
            Type targetType = Nullable.GetUnderlyingType(propertyType) ?? propertyType;
            return Convert.ChangeType(value, targetType);
        }

        #endregion Private
    }
}
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Crawler.Model;

namespace Crawler.DataService
{
    /// <summary>
    /// Writes crawl warnings to text files under <see cref="ObjectFactory.DataPath"/>.
    /// </summary>
    /// <remarks>
    /// The class deliberately does not implement <c>IRepository&lt;T&gt;</c>
    /// (the original declaration had that base commented out).
    /// </remarks>
    public class WarnRepository
    {
        private Logger logger = new Logger(typeof(WarnRepository));

        /// <summary>
        /// Appends a warning to "warn/{CategoryLevel}/{ParentCode}/{Id}.txt": first the
        /// message, then the category serialized as one JSON line.
        /// Failures are logged and never propagated.
        /// </summary>
        /// <param name="category">Category the warning is about; determines the file path.</param>
        /// <param name="msg">Human-readable warning text.</param>
        public void SaveWarn(Category category, string msg)
        {
            try
            {
                string recordFileName = string.Format("warn/{0}/{1}/{2}.txt", category.CategoryLevel, category.ParentCode, category.Id);
                string totolPath = Path.Combine(ObjectFactory.DataPath, recordFileName);

                // Serialize exactly once: serializing the JSON text a second time
                // produced an escaped string literal instead of a JSON object.
                string categoryJson = JsonConvert.SerializeObject(category);

                // CreateDirectory is a no-op for an existing directory; skip it when
                // the path has no directory component (it rejects empty strings).
                string directory = Path.GetDirectoryName(totolPath);
                if (!string.IsNullOrEmpty(directory))
                {
                    Directory.CreateDirectory(directory);
                }

                // AppendText creates the file when it does not exist yet.
                using (StreamWriter sw = File.AppendText(totolPath))
                {
                    sw.WriteLine(msg);
                    sw.WriteLine(categoryJson);
                }
            }
            catch (Exception e)
            {
                logger.Error("SaveWarn出现异常", e);
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Crawler.Model
{
    /// <summary>
    /// Base type for the crawler's models; contributes the shared identifier.
    /// </summary>
    public class BaseModel
    {
        /// <summary>Identifier of the model instance.</summary>
        public int Id { get; set; }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Crawler.Model
{
    /// <summary>
    /// One node of the crawled category tree.
    /// </summary>
    /// <remarks>
    /// Property declaration order is kept as-is because reflection-based code
    /// (insert generation, JSON output) enumerates the properties.
    /// </remarks>
    public class Category : BaseModel
    {
        /// <summary>Code assigned to this category by the crawler.</summary>
        public string Code { get; set; }

        /// <summary>Code of the parent category.</summary>
        public string ParentCode { get; set; }

        /// <summary>Display name of the category.</summary>
        public string Name { get; set; }

        /// <summary>Address of the category's listing page, when known.</summary>
        public string Url { get; set; }

        /// <summary>Depth of the category in the tree.</summary>
        public int CategoryLevel { get; set; }

        /// <summary>State flag of the category.</summary>
        public int State { get; set; }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Crawler.Model
{
    /// <summary>
    /// One crawled product.
    /// </summary>
    /// <remarks>
    /// Property declaration order is kept as-is because reflection-based code
    /// (insert generation, JSON output) enumerates the properties.
    /// </remarks>
    public class Commodity : BaseModel
    {
        /// <summary>Numeric product identifier.</summary>
        public long ProductId { get; set; }

        /// <summary>Identifier of the category the product was found in.</summary>
        public int CategoryId { get; set; }

        /// <summary>Product title.</summary>
        public string Title { get; set; }

        /// <summary>Product price.</summary>
        public decimal Price { get; set; }

        /// <summary>Address of the product page.</summary>
        public string Url { get; set; }

        /// <summary>Address of the product image.</summary>
        public string ImageUrl { get; set; }
    }

    /// <summary>
    /// One entry of a price response. The response is a JSONP call wrapping an
    /// array of such entries, for example (shortened):
    /// <c>jQuery5427073([{"id":"J_1707419","p":"5149.00","m":"5499.00"},{"id":"J_1589214","p":"1999.00","m":"2999.00"}]);</c>
    /// </summary>
    /// <remarks>
    /// The lower-case member names mirror the JSON keys of the response.
    /// </remarks>
    public class CommodityPrice
    {
        /// <summary>Value of the "id" key, e.g. "J_1707419".</summary>
        public string id { get; set; }

        /// <summary>Value of the "p" key.</summary>
        public decimal p { get; set; }

        /// <summary>Value of the "m" key.</summary>
        public decimal m { get; set; }
    }
}
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace Crawler
{
    /// <summary>
    /// Downloads the text content of a URL with <see cref="HttpWebRequest"/>.
    /// Background reading: http://tool.sufeinet.com/HttpHelper.aspx
    /// </summary>
    public class HttpHelper
    {
        private static Logger logger = new Logger(typeof(HttpHelper));

        /// <summary>
        /// Downloads the content of <paramref name="url"/>, decoding it as UTF-8
        /// (an earlier version used GB2312).
        /// </summary>
        /// <param name="url">Address to download.</param>
        /// <returns>See <see cref="DownloadHtml"/>.</returns>
        public static string DownloadUrl(string url)
        {
            return DownloadHtml(url, Encoding.UTF8);
        }

        /// <summary>
        /// Downloads the content of <paramref name="url"/> and decodes it with
        /// <paramref name="encode"/>. The method never throws; every failure is logged.
        /// </summary>
        /// <param name="url">Address to download.</param>
        /// <param name="encode">Encoding used to decode the response body; switch between UTF-8 and GB2312 if the text comes back garbled.</param>
        /// <returns>
        /// The response body on success; an empty string when the response status is
        /// not 200 or the request fails with a <see cref="WebException"/> other than
        /// the "(306)" error; null when reading the body fails, on the "(306)" error,
        /// or on any other exception.
        /// </returns>
        public static string DownloadHtml(string url, Encoding encode)
        {
            string html = string.Empty;
            try
            {
                HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;
                request.Timeout = 30 * 1000; // 30 seconds
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
                request.ContentType = "text/html; charset=utf-8";
                // Further headers (Host, Cookie, Accept, Accept-Encoding, Referer) can
                // be set here when a site requires them.

                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                    }
                    else
                    {
                        try
                        {
                            // 'using' releases the reader even when ReadToEnd fails.
                            using (StreamReader sr = new StreamReader(response.GetResponseStream(), encode))
                            {
                                html = sr.ReadToEnd();
                            }
                        }
                        catch (Exception ex)
                        {
                            logger.Error(string.Format("DownloadHtml抓取{0}保存失败", url), ex);
                            html = null;
                        }
                    }
                }
            }
            catch (System.Net.WebException ex)
            {
                if (ex.Message.Equals("远程服务器返回错误: (306)。"))
                {
                    logger.Error("远程服务器返回错误: (306)。", ex);
                    return null;
                }

                // Every other web failure (timeout, DNS, 4xx/5xx, ...) used to be
                // swallowed without a trace; log it. The returned value is unchanged.
                logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
            }
            catch (Exception ex)
            {
                logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
                html = null;
            }
            return html;
        }
    }
}
using System;
using System.IO;
using System.Diagnostics;
using System.Configuration;
using log4net.Config;
using log4net;
using Newtonsoft.Json;
using System.Collections.Generic;

namespace Crawler
{
    /// <summary>
    /// Thin wrapper around log4net that also echoes every message to the console.
    /// </summary>
    public class Logger
    {
        /// <summary>
        /// Configures log4net once from "CfgFiles/log4net.cfg.xml" below the
        /// application's base directory and logs a start-up message.
        /// </summary>
        static Logger()
        {
            // Let Path.Combine insert the separator: the previous literal
            // "CfgFiles\log4net.cfg.xml" contained the invalid escape sequence "\l".
            string configPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "CfgFiles", "log4net.cfg.xml");
            XmlConfigurator.Configure(new FileInfo(configPath));
            ILog Log = LogManager.GetLogger(typeof(Logger));
            Log.Info("系统初始化Logger模块");
        }

        private ILog loger = null;

        /// <summary>
        /// Creates a logger named after <paramref name="type"/>.
        /// </summary>
        /// <param name="type">Type whose name identifies the log source.</param>
        public Logger(Type type)
        {
            loger = LogManager.GetLogger(type);
        }

        /// <summary>
        /// Writes an error entry to the console and to log4net.
        /// </summary>
        /// <param name="msg">Message text.</param>
        /// <param name="ex">Exception to attach; may be null.</param>
        public void Error(string msg = "出现异常", Exception ex = null)
        {
            Console.WriteLine(msg);
            loger.Error(msg, ex);
        }

        /// <summary>
        /// Writes a warning entry to the console and to log4net.
        /// </summary>
        /// <param name="msg">Message text.</param>
        public void Warn(string msg)
        {
            Console.WriteLine(msg);
            loger.Warn(msg);
        }

        /// <summary>
        /// Writes an informational entry to the console and to log4net.
        /// </summary>
        /// <param name="msg">Message text.</param>
        public void Info(string msg)
        {
            Console.WriteLine(msg);
            loger.Info(msg);
        }

        /// <summary>
        /// Writes a debug entry to the console and to log4net.
        /// </summary>
        /// <param name="msg">Message text.</param>
        public void Debug(string msg)
        {
            Console.WriteLine(msg);
            loger.Debug(msg);
        }
    }
}
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;
using Crawler.Model;

namespace Crawler
{
    /// <summary>
    /// Extracts a three-level category tree from a category overview page.
    /// </summary>
    public class CategorySearch
    {
        private static Logger logger = new Logger(typeof(CategorySearch));

        /// <summary>Running counter used to number the created categories.</summary>
        private static int count = 1;

        /// <summary>
        /// Downloads <paramref name="url"/> and returns every category found on it.
        /// </summary>
        /// <param name="url">Address of the category overview page.</param>
        /// <returns>
        /// The categories of all levels in document order. When the download or the
        /// parsing fails, the error is logged and the categories collected so far
        /// (possibly none) are returned.
        /// </returns>
        public static List<Category> Crawler(string url)
        {
            List<Category> categoryList = new List<Category>();
            try
            {
                string html = HttpHelper.DownloadUrl(url);
                if (string.IsNullOrWhiteSpace(html))
                {
                    logger.Warn(string.Format("CategorySearch.Crawler: no html downloaded, url={0}", url));
                    return categoryList;
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(html);
                string fristPath = "//*[@class='category-item m']";
                HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(fristPath);
                if (nodeList == null) // SelectNodes yields null when nothing matches
                {
                    logger.Warn(string.Format("CategorySearch.Crawler: no category blocks found, url={0}", url));
                    return categoryList;
                }

                int k = 1;
                foreach (HtmlNode node in nodeList)
                {
                    categoryList.AddRange(First(node.InnerHtml, k++.ToString("00") + "f", "root"));
                }
            }
            catch (Exception ex)
            {
                logger.Error("CrawlerMuti出现异常", ex);
            }
            return categoryList;
        }

        /// <summary>
        /// Reads the first-level category (or categories) of one top-level block,
        /// followed by everything below it.
        /// </summary>
        /// <param name="html">Inner html of the top-level block.</param>
        /// <param name="code">Code assigned to the first-level category.</param>
        /// <param name="parentCode">Code of the parent ("root" for the first level).</param>
        /// <returns>First-level categories followed by their second- and third-level descendants.</returns>
        private static List<Category> First(string html, string code, string parentCode)
        {
            List<Category> categoryList = new List<Category>();
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            string path = "//*[@class='mt']/h2/span";
            HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
            if (nodeList != null) // a block without a heading must not abort the crawl
            {
                foreach (HtmlNode node in nodeList)
                {
                    Category category = new Category()
                    {
                        Id = count++,
                        State = 0,
                        CategoryLevel = 1,
                        Code = code,
                        ParentCode = parentCode
                    };
                    category.Name = node.InnerText;
                    category.Url = ""; // first-level headings carry no link
                    categoryList.Add(category);
                }
            }
            categoryList.AddRange(Second(html, code));
            return categoryList;
        }

        /// <summary>
        /// Reads all second-level categories of one first-level category, each followed
        /// by its third-level categories.
        /// </summary>
        /// <param name="html">Inner html of the top-level block.</param>
        /// <param name="parentCode">Code of the owning first-level category.</param>
        /// <returns>Second-level categories interleaved with their third-level children.</returns>
        private static List<Category> Second(string html, string parentCode)
        {
            List<Category> categoryList = new List<Category>();
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            string path = "//*[@class='items']/dl";
            HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
            if (nodeList == null)
            {
                return categoryList;
            }

            int k = 1;
            foreach (HtmlNode node in nodeList)
            {
                string secondHtml = node.InnerHtml;
                if (string.IsNullOrWhiteSpace(secondHtml))
                {
                    continue;
                }

                HtmlDocument secondDoc = new HtmlDocument();
                secondDoc.LoadHtml(secondHtml);

                HtmlNode secondNode = secondDoc.DocumentNode.SelectSingleNode("//dt/a");
                if (secondNode == null)
                {
                    // Some sections (e.g. books / audio-video) have a plain <dt> without a link.
                    secondNode = secondDoc.DocumentNode.SelectSingleNode("//dt");
                }
                if (secondNode == null)
                {
                    continue; // no title element at all: nothing to record
                }

                // Consume the sequence number as soon as a category is created.
                // Previously the increment sat at the end of the loop body and was
                // skipped by 'continue', so the next category reused the same code.
                string code = string.Format("{0}{1}s", parentCode, k.ToString("00"));
                k++;

                Category category = new Category()
                {
                    Id = count++,
                    State = 0,
                    CategoryLevel = 2,
                    Code = code,
                    ParentCode = parentCode
                };
                category.Name = secondNode.InnerText;
                HtmlAttribute href = secondNode.Attributes["href"];
                if (href != null)
                {
                    category.Url = NormalizeUrl(href.Value);
                }
                categoryList.Add(category);

                HtmlNode thirdNode = secondDoc.DocumentNode.SelectSingleNode("//dd");
                if (thirdNode == null)
                {
                    continue;
                }
                categoryList.AddRange(Third(thirdNode.InnerHtml, code));
            }
            return categoryList;
        }

        /// <summary>
        /// Reads all third-level categories of one second-level category.
        /// </summary>
        /// <param name="html">Inner html of the second-level category's &lt;dd&gt; element.</param>
        /// <param name="parentCode">Code of the owning second-level category.</param>
        /// <returns>One category per link; a link without href yields a category without Url.</returns>
        private static List<Category> Third(string html, string parentCode)
        {
            List<Category> categoryList = new List<Category>();
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            string path = "//a";
            HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
            if (nodeList == null || nodeList.Count == 0)
            {
                return categoryList;
            }

            int k = 1;
            foreach (HtmlNode node in nodeList)
            {
                string code = string.Format("{0}{1}t", parentCode, k.ToString("00"));
                Category category = new Category()
                {
                    Id = count++,
                    State = 0,
                    CategoryLevel = 3,
                    Code = code,
                    ParentCode = parentCode
                };
                category.Name = node.InnerText;
                HtmlAttribute href = node.Attributes["href"];
                if (href != null) // same treatment as the second level: keep the category, leave Url unset
                {
                    category.Url = NormalizeUrl(href.Value);
                }
                categoryList.Add(category);
                k++;
            }
            return categoryList;
        }

        /// <summary>
        /// Prefixes scheme-less addresses (e.g. "//host/path") with "http:".
        /// Addresses that already start with "http:" or "https:" and empty values
        /// are returned unchanged.
        /// </summary>
        private static string NormalizeUrl(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return url;
            }
            if (url.StartsWith("http:", StringComparison.OrdinalIgnoreCase) ||
                url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
            {
                return url;
            }
            return string.Concat("http:", url);
        }
    }
}
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;
using Crawler.DataService;
using Crawler.Model;

namespace Crawler
{
    /// <summary>
    /// Crawls the product list pages of one category and looks up the product prices.
    /// </summary>
    public class CommoditySearch
    {
        private Logger logger = new Logger(typeof(CommoditySearch));
        private WarnRepository warnRepository = new WarnRepository();
        private CommodityRepository commodityRepository = new CommodityRepository();
        private Category category = null;

        /// <summary>
        /// Creates a crawler for one category.
        /// </summary>
        /// <param name="_category">Category whose Url points at the first list page.</param>
        public CommoditySearch(Category _category)
        {
            category = _category;
        }

        /// <summary>
        /// Downloads the category's first list page, reads the page count from it and
        /// crawls every page. Problems are logged and recorded through
        /// <see cref="WarnRepository"/>; the method never throws.
        /// </summary>
        public void Crawler()
        {
            try
            {
                if (string.IsNullOrEmpty(category.Url))
                {
                    warnRepository.SaveWarn(category, string.Format("Url为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                    return;
                }

                string html = HttpHelper.DownloadUrl(category.Url);
                if (string.IsNullOrEmpty(html))
                {
                    warnRepository.SaveWarn(category, string.Format("出现异常,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                    return;
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(html);
                string pageNumberPath = @"//*[@id='J_topPage']/span/i";
                HtmlNode pageNumberNode = doc.DocumentNode.SelectSingleNode(pageNumberPath);
                if (pageNumberNode == null)
                {
                    warnRepository.SaveWarn(category, string.Format("分页数据为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                    return;
                }

                // Parse the page count once instead of on every loop iteration.
                int pageCount;
                if (!int.TryParse(pageNumberNode.InnerText, out pageCount))
                {
                    warnRepository.SaveWarn(category, string.Format("出现异常,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                    return;
                }

                for (int i = 1; i <= pageCount; i++)
                {
                    string pageUrl = string.Format("{0}&page={1}", category.Url, i);
                    try
                    {
                        List<Commodity> commodityList = GetCommodityList(category, pageUrl.Replace("&page=1&", string.Format("&page={0}&", i)));
                        // NOTE(review): persisting the page is disabled in the original
                        // code; it is left disabled here. Re-enable when the crawl
                        // should write to the database:
                        //commodityRepository.SaveList(commodityList);
                    }
                    catch (Exception ex)
                    {
                        // A failure on one page must not stop the remaining pages.
                        logger.Error("Crawler的commodityRepository.SaveList(commodityList)出现异常", ex);
                    }
                }
            }
            catch (Exception ex)
            {
                logger.Error("CrawlerMuti出现异常", ex);
                warnRepository.SaveWarn(category, string.Format("出现异常,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
            }
        }

        /// <summary>
        /// Downloads one list page and extracts its products, then fills in their prices.
        /// </summary>
        /// <param name="category">Category the page belongs to.</param>
        /// <param name="url">Address of the list page.</param>
        /// <returns>The products found on the page; empty when the page could not be read.</returns>
        private List<Commodity> GetCommodityList(Category category, string url)
        {
            string html = HttpHelper.DownloadUrl(url);
            List<Commodity> commodityList = new List<Commodity>();
            try
            {
                if (string.IsNullOrEmpty(html))
                {
                    return commodityList;
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(html);
                string liPath = "//*[@id='plist']/ul/li";
                HtmlNodeCollection noneNodeList = doc.DocumentNode.SelectNodes(liPath);
                if (noneNodeList == null || noneNodeList.Count == 0)
                {
                    warnRepository.SaveWarn(category, string.Format("GetCommodityList商品数据为空,Name={0} Level={1} category.Url={2} url={3}", category.Name, category.CategoryLevel, category.Url, url));
                    return commodityList;
                }

                foreach (HtmlNode node in noneNodeList)
                {
                    try
                    {
                        Commodity commodity = ParseCommodity(category, node);
                        if (commodity != null)
                        {
                            commodityList.Add(commodity);
                        }
                    }
                    catch (Exception ex)
                    {
                        // One malformed list item must not discard the rest of the page.
                        logger.Error(string.Format("GetCommodityList出现异常,url={0}", url), ex);
                    }
                }
                Console.WriteLine("{0}一共获取了{1}条数据", url, commodityList.Count);
            }
            catch (Exception ex)
            {
                logger.Error(string.Format("GetCommodityList出现异常,url={0}", url), ex);
            }
            return GetCommodityPrice(category, commodityList);
        }

        /// <summary>
        /// Builds a <see cref="Commodity"/> from one list item.
        /// </summary>
        /// <param name="category">Category the item belongs to.</param>
        /// <param name="node">The list item element.</param>
        /// <returns>The product, or null when link, numeric product id, title or image is missing.</returns>
        private Commodity ParseCommodity(Category category, HtmlNode node)
        {
            // Load the item into its own document so the "//" expressions below only
            // see this item.
            HtmlDocument docChild = new HtmlDocument();
            docChild.LoadHtml(node.OuterHtml);

            HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode("//*[@class='p-name']/a");
            if (urlNode == null)
            {
                return null;
            }
            HtmlAttribute href = urlNode.Attributes["href"];
            if (href == null || string.IsNullOrEmpty(href.Value))
            {
                return null;
            }

            Commodity commodity = new Commodity() { CategoryId = category.Id };
            commodity.Url = NormalizeUrl(href.Value);

            // The product id is the file name of the product page, e.g. ".../1707419.html".
            string sId = Path.GetFileName(commodity.Url).Replace(".html", "");
            long productId;
            if (!long.TryParse(sId, out productId))
            {
                return null;
            }
            commodity.ProductId = productId;

            HtmlNode titleNode = docChild.DocumentNode.SelectSingleNode("//*[@class='p-name']/a/em");
            if (titleNode == null)
            {
                return null;
            }
            commodity.Title = titleNode.InnerText;

            HtmlNode imageNode = docChild.DocumentNode.SelectSingleNode("//*[@class='p-img']/a/img");
            if (imageNode == null)
            {
                return null;
            }

            // The image address sits in one of several attributes, depending on
            // whether the image is loaded eagerly or lazily.
            if (imageNode.Attributes.Contains("src"))
            {
                commodity.ImageUrl = imageNode.Attributes["src"].Value;
            }
            else if (imageNode.Attributes.Contains("original"))
            {
                commodity.ImageUrl = imageNode.Attributes["original"].Value;
            }
            else if (imageNode.Attributes.Contains("data-lazy-img"))
            {
                commodity.ImageUrl = imageNode.Attributes["data-lazy-img"].Value;
            }
            else
            {
                return null;
            }
            commodity.ImageUrl = NormalizeUrl(commodity.ImageUrl);

            return commodity;
        }

        /// <summary>
        /// Requests the prices of the given products in one call and stores them in
        /// <see cref="Commodity.Price"/>. Products missing from the response keep
        /// their current price. Failures are logged and never propagated.
        /// </summary>
        /// <param name="category">Category the products belong to (currently unused).</param>
        /// <param name="commodityList">Products to price; null or empty is returned unchanged.</param>
        /// <returns>The same list instance that was passed in.</returns>
        private List<Commodity> GetCommodityPrice(Category category, List<Commodity> commodityList)
        {
            try
            {
                if (commodityList == null || commodityList.Count == 0)
                {
                    return commodityList;
                }

                string skuIds = string.Join("%2C", commodityList.Select(c => string.Format("J_{0}", c.ProductId)));
                string priceUrl = string.Format("http://p.3.cn/prices/mgets?callback=jQuery1069298&type=1&area=1_72_4137_0&skuIds={0}&pdbp=0&pdtk=&pdpin=&pduid=1945966343&_=1469022843655", skuIds);

                string html = HttpHelper.DownloadUrl(priceUrl);
                if (string.IsNullOrWhiteSpace(html))
                {
                    logger.Warn(string.Format("获取url={0}时获取的html为空", priceUrl));
                    return commodityList; // nothing to parse
                }

                // The response is JSONP: callbackName([...]); keep what is between the
                // outermost parentheses.
                int start = html.IndexOf('(');
                int end = html.LastIndexOf(')');
                if (start >= 0 && end > start)
                {
                    html = html.Substring(start + 1, end - start - 1);
                }

                List<CommodityPrice> priceList = JsonConvert.DeserializeObject<List<CommodityPrice>>(html);
                if (priceList == null)
                {
                    return commodityList;
                }

                // Index the prices by id; on duplicate ids the first entry wins.
                Dictionary<string, decimal> priceById = new Dictionary<string, decimal>();
                foreach (CommodityPrice price in priceList)
                {
                    if (price != null && price.id != null && !priceById.ContainsKey(price.id))
                    {
                        priceById[price.id] = price.p;
                    }
                }

                foreach (Commodity commodity in commodityList)
                {
                    decimal value;
                    if (priceById.TryGetValue(string.Format("J_{0}", commodity.ProductId), out value))
                    {
                        commodity.Price = value;
                    }
                }
            }
            catch (Exception ex)
            {
                logger.Error("GetCommodityPrice出现异常", ex);
            }
            return commodityList;
        }

        /// <summary>
        /// Prefixes scheme-less addresses (e.g. "//host/path") with "http:".
        /// Addresses that already start with "http:" or "https:" and empty values
        /// are returned unchanged.
        /// </summary>
        private static string NormalizeUrl(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return url;
            }
            if (url.StartsWith("http:", StringComparison.OrdinalIgnoreCase) ||
                url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
            {
                return url;
            }
            return "http:" + url;
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Newtonsoft.Json;
using Crawler.DataService;
using Crawler.Model;

namespace Crawler
{
    /// <summary>
    /// Console entry point of the crawler sample. Topics covered by the project:
    /// 1. Crawlers and anti-crawling countermeasures
    /// 2. Downloading HTML
    /// 3. Parsing HTML with XPath, extracting data and deep crawling
    /// 4. Handling differing attributes and fetching AJAX data
    /// 5. Multi-threaded crawling
    /// </summary>
    class Program
    {
        /// <summary>
        /// Number of JD_Commodity_NNN shard tables; matches the <c>ProductId % 30 + 1</c>
        /// table selection in CommodityRepository.
        /// </summary>
        private const int CommodityTableCount = 30;

        /// <summary>Once more than this many crawl tasks are tracked, the scheduler waits for one to finish.</summary>
        private const int MaxPendingCrawlTasks = 15;

        private static Logger logger = new Logger(typeof(Program));

        /// <summary>
        /// Prints a banner and waits for Enter. The crawl itself is currently disabled
        /// (see the commented-out calls below).
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("今天是爬虫的学习");

                //// Test DownloadHtml
                //string html = HttpHelper.DownloadHtml(@"http://list.jd.com/list.html?cat=9987,653,655", Encoding.UTF8);

                //// Test fetching the category index page
                //string html1 = HttpHelper.DownloadHtml("http://www.jd.com/allSort.aspx", Encoding.UTF8);

                //// Test crawling the commodity list of a single category
                //string testCategory = "{\"Id\":73,\"Code\":\"02f01s01T\",\"ParentCode\":\"02f01s\",\"Name\":\"烟机/灶具\",\"Url\":\"http://list.jd.com/list.html?cat=737,13297,1300\",\"Level\":3}";
                //new CommoditySearch(JsonConvert.DeserializeObject<Category>(testCategory)).Crawler();

                //List<Category> categoryList = CategorySearch.Crawler("http://www.jd.com/allSort.aspx");

                //Crawl();
            }
            catch (Exception ex)
            {
                logger.Error("异常啦,", ex);
                Console.WriteLine("*****************木有成功**********************");
            }

            // Single pause for both the success and the failure path, so the user
            // presses Enter once instead of twice after an error.
            Console.ReadLine();
        }

        /// <summary>
        /// Interactive crawl: asks whether to re-initialise and crawl the category data,
        /// then whether to re-initialise the commodity tables and crawl commodities.
        /// </summary>
        private static void Crawl()
        {
            DBInit dbInit = new DBInit();
            CategoryRepository categoryRepository = new CategoryRepository();

            if (Confirm("请输入Y/N进行类别表初始化确认! Y 删除Category表然后重新创建,然后抓取类型数据,N(或者其他)跳过"))
            {
                dbInit.InitCategoryTable();
                List<Category> categoryList = CategorySearch.Crawler("http://www.jd.com/allSort.aspx");
                categoryRepository.Save(categoryList);
                Console.WriteLine("类型数据初始化完成,共抓取类别{0}个", categoryList.Count);
            }
            else
            {
                Console.WriteLine("你选择不初始化类别数据");
            }
            Console.WriteLine("*****************^_^**********************");

            if (Confirm("请输入Y/N进行商品数据初始化确认! Y 删除全部商品表表然后重新创建,然后抓取商品数据,N(或者其他)跳过"))
            {
                dbInit.InitCommodityTable();
                CrawlerCommodity();
            }
            Console.WriteLine("*****************^_^**********************");
            //CleanAll();
        }

        /// <summary>
        /// Prints <paramref name="prompt"/> and reads one line from the console.
        /// </summary>
        /// <param name="prompt">Text shown to the user.</param>
        /// <returns>
        /// True only when the answer is "Y" (case-insensitive). Any other input, including
        /// end of input (<c>Console.ReadLine</c> returning null), counts as "no".
        /// </returns>
        private static bool Confirm(string prompt)
        {
            Console.WriteLine(prompt);
            string input = Console.ReadLine();
            // Static Equals is null-safe; input.Equals(...) would throw on a closed/redirected stdin.
            return string.Equals(input, "Y", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Crawls commodities for every level-3 category, one task per category,
        /// then removes duplicate rows via <see cref="CleanAll"/>.
        /// </summary>
        private static void CrawlerCommodity()
        {
            Console.WriteLine("{0} jd商品开始抓取 - -", DateTime.Now);

            CategoryRepository categoryRepository = new CategoryRepository();
            List<Category> categoryList = categoryRepository.QueryListByLevel(3);

            List<Task> pendingTasks = new List<Task>();
            foreach (Category category in categoryList)
            {
                CommoditySearch searcher = new CommoditySearch(category);
                //searcher.Crawler();
                pendingTasks.Add(Task.Factory.StartNew(searcher.Crawler));

                if (pendingTasks.Count > MaxPendingCrawlTasks)
                {
                    // IsCompleted is also true for faulted and cancelled tasks,
                    // so one check drops every finished task.
                    pendingTasks = pendingTasks.Where(t => !t.IsCompleted).ToList();
                    if (pendingTasks.Count > 0)
                    {
                        Task.WaitAny(pendingTasks.ToArray());
                    }
                }
            }
            Task.WaitAll(pendingTasks.ToArray());

            Console.WriteLine("{0} jd商品抓取全部完成 - -", DateTime.Now);
            CleanAll();
        }

        /// <summary>
        /// Removes duplicate commodity rows from every JD_Commodity_NNN table, keeping
        /// the row with the highest ID for each (productid, CategoryId) pair.
        /// Exceptions are logged and not rethrown.
        /// </summary>
        private static void CleanAll()
        {
            try
            {
                Console.WriteLine("{0} 开始清理重复数据 - -", DateTime.Now);

                StringBuilder sb = new StringBuilder();
                for (int i = 1; i <= CommodityTableCount; i++)
                {
                    // Keep exactly one row (max ID) per (productid, CategoryId) group.
                    // The earlier "productid IN(dups) AND ID NOT IN(max ID of dup groups)" form
                    // also deleted the only row of a product in category B whenever that
                    // product had duplicates in category A.
                    sb.AppendFormat(
                        @"DELETE FROM [dbo].[JD_Commodity_{0}] WHERE ID NOT IN(SELECT MAX(ID) FROM [dbo].[JD_Commodity_{0}] GROUP BY productid,CategoryId);",
                        i.ToString("000"));
                }

                string cleanupSql = sb.ToString();
                Console.WriteLine("执行清理sql:{0}", cleanupSql);
                SqlHelper.ExecuteNonQuery(cleanupSql);
                Console.WriteLine("{0} 完成清理重复数据 - -", DateTime.Now);
            }
            catch (Exception ex)
            {
                logger.Error("CleanAll出现异常", ex);
            }
            finally
            {
                Console.WriteLine("{0} 结束清理重复数据 - -", DateTime.Now);
            }
        }
    }
}