using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for dooioo.com pages. Depth 0 pages are walked for community links,
    /// <c>DataDepth.Houses</c> pages are stored as a community record together with the
    /// rows of their "#history-list" table, and <c>DataDepth.Deal</c> pages contribute
    /// further "#history-list" rows for an already stored community.
    /// </summary>
    internal class Dooioo : AnalyzerBase
    {
        /// <summary>
        /// Number of <c>td</c> cells read from every "#history-list" row (indexes 0 to 5).
        /// </summary>
        private const int ListingColumnCount = 6;

        /// <summary>
        /// Dispatches the landed page to the handling that matches its crawl depth.
        /// Depths other than 0, <c>DataDepth.Houses</c> and <c>DataDepth.Deal</c> are ignored.
        /// </summary>
        /// <param name="current">The page being analyzed.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        var dom = lander.GetDocument(pHandler);
                        AnalyzeListPage(current, dom);
                    }
                    break;
                case DataDepth.Houses:
                    {
                        var dom = lander.GetDocument(pHandler);
                        AnalyzeHousesPage(current, dom);
                    }
                    break;
                case DataDepth.Deal:
                    {
                        Guid housesID = (Guid)current.State;
                        pHandler.CrossLoad = (arg, xDom) =>
                        {
                            const string pageParam = "p";
                            // The previous invocation already clicked the pager link;
                            // reset the flag and leave the freshly loaded page alone.
                            if (arg.IsRepost)
                            {
                                arg.IsRepost = false;
                                return;
                            }

                            var query = System.Web.HttpUtility.ParseQueryString(arg.RequestUrl.Query);
                            int pageIndex;
                            if (!int.TryParse(query[pageParam], out pageIndex))
                            {
                                pageIndex = 1;
                            }

                            // NOTE(review): depending on the browser document mode the CSS class
                            // may be exposed as "className" rather than "class" - confirm.
                            var pager = xDom.GetElementsByTagName("ul")
                                .Cast<System.Windows.Forms.HtmlElement>()
                                .FirstOrDefault(p => p.GetAttribute("class").Contains("pagination"));
                            if (pager == null)
                            {
                                App.LogInfo("CrossLoad xPaing:{0} {1}", this.GetType().Name, xDom.Body.InnerHtml);
                                return;
                            }

                            string pageText = pageIndex.ToString();
                            var btn = pager.GetElementsByTagName("a")
                                .Cast<System.Windows.Forms.HtmlElement>()
                                .FirstOrDefault(p => p.InnerText == pageText);
                            if (btn == null)
                            {
                                // No link carries the requested page number: log and give up
                                // instead of throwing from inside the callback.
                                App.LogInfo("CrossLoad xPaing:{0} page link {1} not found", this.GetType().Name, pageText);
                                return;
                            }

                            btn.InvokeMember("click");
                            arg.IsRepost = true;
                        };
                        var dom = lander.GetDocument(pHandler);
                        SaveHouselisting(housesID, current, dom);
                    }
                    break;
            }
        }

        /// <summary>
        /// Handles a depth 0 page: runs paging on the ".pagination a:last-child" selector and
        /// pushes the href of every "#hlist a" node at depth <c>DataDepth.Houses</c>.
        /// </summary>
        /// <param name="current">The list page being analyzed.</param>
        /// <param name="dom">The parsed document of <paramref name="current"/>.</param>
        private void AnalyzeListPage(PageLandEntity current, HtmlAgilityPack.HtmlDocument dom)
        {
            DoPerPaging(current, dom.DocumentNode, ".pagination a:last-child");
            foreach (var node in QueryNodes(dom.DocumentNode, "#hlist a"))
            {
                var url = GetHref(node, current.Url);
                Crawler.PushUrl(url, DataDepth.Houses);
            }
        }

        /// <summary>
        /// Handles a <c>DataDepth.Houses</c> page: fills and saves the community record,
        /// queues the remaining transaction pages and stores the rows shown on this page.
        /// </summary>
        /// <param name="current">The community page being analyzed.</param>
        /// <param name="dom">The parsed document of <paramref name="current"/>.</param>
        private void AnalyzeHousesPage(PageLandEntity current, HtmlAgilityPack.HtmlDocument dom)
        {
            var attrs = new AttributeFiller();
            // Each "#building-info li" contributes one "first span:second span" pair; items
            // with fewer than two spans are skipped rather than indexed out of range.
            var pairs = QueryNodes(dom.DocumentNode, "#building-info li")
                .Select(li => QueryTexts(li, "span").ToArray())
                .Where(spans => spans.Length >= 2)
                .Select(spans => string.Format("{0}:{1}", spans[0], spans[1]));
            attrs.Append(pairs);

            Guid hashKey = GenHashKey(current.Url.OriginalString);
            var bo = Crawler.Repository.LoadHouses(hashKey);
            bo.SiteID = current.Url.GetDomain();
            bo.PageUrl = current.Url.OriginalString;
            bo.CityName = Crawler.Config.CityName;
            // Presumably maps a scraped label (key) to the entity member it fills (value);
            // verify against AttributeFiller.FillEntity.
            attrs.FillEntity(bo, new Dictionary<string, string>()
            {
                {"小区名", "小区名称"},
                {"板块", "所属区域"},
                {"建造年代", "竣工时间"},
                {"地址", "小区地址"},
                {"物业类型", "物业类别"},
            });
            MapMark(bo);
            Repository.Save(bo);
            Crawler.OutWrite("保存楼盘 {0}", bo.小区名称);

            // Materialize once: the pager links are inspected several times below.
            var pageLinks = QueryNodes(dom.DocumentNode, ".pagination a", false).ToArray();
            if (pageLinks.Length > 0)
            {
                // The second-to-last link supplies the page count (the first link when
                // there is only one).
                string pageCount = pageLinks[Math.Max(0, pageLinks.Length - 2)].InnerText;
                // NOTE(review): the "s117862" identifier is hard-coded, so every community
                // queues the same listing URL - confirm whether it should come from the page.
                Crawler.PushUrl(new Uri(string.Format("http://www.dooioo.com/ershoufang/s117862?p=[2-{0}]", pageCount)), DataDepth.Deal, bo.RowID);
            }

            SaveHouselisting(bo.RowID, current, dom);
        }

        /// <summary>
        /// Saves one <c>HouselistingEntity</c> per usable "#history-list tr" row of
        /// <paramref name="dom"/>, tagged with <paramref name="housesID"/>.
        /// </summary>
        /// <param name="housesID">Identifier written to the "HousesID" attribute of each row.</param>
        /// <param name="current">The page the rows come from (not read by this method).</param>
        /// <param name="dom">The parsed document containing the "#history-list" table.</param>
        private void SaveHouselisting(Guid housesID, PageLandEntity current, HtmlAgilityPack.HtmlDocument dom)
        {
            foreach (var node in QueryNodes(dom.DocumentNode, "#history-list tr"))
            {
                var cells = QueryTexts(node, "td").ToArray();
                if (cells.Length < ListingColumnCount)
                {
                    // Rows without the full set of cells cannot be mapped; skip them.
                    continue;
                }

                // A fresh filler per row keeps values from an earlier row (for example a
                // TransactionDate that failed to parse here) out of this entity.
                var attrs = new AttributeFiller();
                attrs.Append("HousesID:{0}", housesID);
                DateTime dealDate;
                if (DateTime.TryParse(cells[4], out dealDate))
                {
                    attrs.Append("TransactionDate:{0}", dealDate);
                }
                attrs.Append("SoldPriceOrRent:{0}", cells[2]);
                attrs.Append("UnitPriceOrLease:{0}", cells[3]);
                attrs.Append("Apartment:{0}", cells[0]);
                attrs.Append("ServiceBroker:{0}", cells[5]);
                attrs.Append("Area:{0}", cells[1]);

                var bo = new HouselistingEntity();
                attrs.FillEntity(bo);
                Repository.SaveHouselisting(bo);
                Crawler.OutWrite("保存小区出售记录 {0}", housesID);
            }
        }
    }
}