zoukankan      html  css  js  c++  java
  • Mytophome Deal

    using AnfleCrawler.Common;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace AnfleCrawler.DataAnalyzer
    {
        /// <summary>
        /// Page analyzer for Mytophome.com: walks the estate deal-listing pages,
        /// persists each sold/rented listing row and, on first sight of an estate,
        /// scrapes and saves the estate ("楼盘") detail page.
        /// </summary>
        internal class Mytophome : AnalyzerBase
        {
            /// <summary>
            /// Dispatches on crawl depth. Depth 0 is the deal-listing page:
            /// sets up paging, then saves one HouselistingEntity per table row.
            /// </summary>
            protected override void AnalyzeInternal(PageLandEntity current)
            {
                var lander = Crawler.Lander;
                var pHandler = CreateContentHandler(current);
                switch (current.Depth)
                {
                    case 0:
                        {
                            var dom = lander.GetDocument(pHandler);
                            // The pager container has no stable selector; stamp its
                            // parent with a synthetic id so DoPerPaging can address
                            // it via a CSS id selector.
                            var pagerNode = QueryNode(dom.DocumentNode, "nobr").ParentNode;
                            pagerNode.SetAttributeValue("id", PagingHack);
                            DoPerPaging(current, dom.DocumentNode, string.Format("#{0}", PagingHack));

                            foreach (var node in QueryNodes(dom.DocumentNode, ".deD_ctt li"))
                            {
                                var spanNodes = QueryNodes(node, "span").ToArray();
                                if (spanNodes.Length < 2)
                                {
                                    // Malformed/decorative row (no detail link cell):
                                    // skip it instead of crashing the whole page.
                                    continue;
                                }
                                var detailUrl = GetHref(QueryNode(spanNodes[1], "a"), current.Url);
                                var query = System.Web.HttpUtility.ParseQueryString(detailUrl.Query);
                                string estateId = query["estateId"];
                                // Normalize to the canonical wiki detail page for this estate.
                                detailUrl = new Uri(string.Format("http://{0}/wiki/{1}/detail.html", detailUrl.Authority, estateId));
                                Guid housesID;
                                try
                                {
                                    CheckHouses(detailUrl, out housesID);
                                }
                                catch (HtmlNodeMissingException ex)
                                {
                                    // BUGFIX: first placeholder is the estate id, not
                                    // the original URL — label it accordingly.
                                    App.LogError(ex, "EstateId={0} HousesUrl={1}", estateId, detailUrl);
                                    continue;
                                }

                                var vals = spanNodes.Select(p => p.InnerText.HtmlTrim()).ToArray();
                                // Last cell is the transaction date when parseable;
                                // otherwise the listing is saved without a date.
                                DateTime? transactionDate = null;
                                DateTime parsedDate;
                                if (DateTime.TryParse(vals.Last(), out parsedDate))
                                {
                                    transactionDate = parsedDate;
                                }
                                if (vals.Length == 6)
                                {
                                    // Six-column layout carries the building name at index 2.
                                    Repository.SaveHouselisting(new HouselistingEntity()
                                    {
                                        HousesID = housesID,
                                        TransactionDate = transactionDate,
                                        BuildingName = vals[2],
                                        Area = string.Format("{0}平方", vals[3]),
                                        SoldPriceOrRent = string.Format("{0}万", vals[4]),
                                        UnitPriceOrLease = string.Format("{0}元/平方", vals[5]),
                                    });
                                }
                                else if (vals.Length >= 5)
                                {
                                    // Five-column layout: same fields shifted left, no building name.
                                    Repository.SaveHouselisting(new HouselistingEntity()
                                    {
                                        HousesID = housesID,
                                        TransactionDate = transactionDate,
                                        Area = string.Format("{0}平方", vals[2]),
                                        SoldPriceOrRent = string.Format("{0}万", vals[3]),
                                        UnitPriceOrLease = string.Format("{0}元/平方", vals[4]),
                                    });
                                }
                                else
                                {
                                    // Unexpected row shape — previously an unguarded
                                    // vals[2..4] access would have thrown here; skip
                                    // instead so the remaining rows still get saved.
                                    continue;
                                }
                                Crawler.OutWrite("保存小区出售记录 {0}", housesID);
                            }
                        }
                        break;
                }
            }

            /// <summary>
            /// Loads (or first-time scrapes and saves) the estate identified by
            /// <paramref name="housesUrl"/>, returning its deterministic id via
            /// <paramref name="housesID"/>. Throws HtmlNodeMissingException when
            /// the detail page lacks the expected attribute list.
            /// </summary>
            private void CheckHouses(Uri housesUrl, out Guid housesID)
            {
                var pHandler = CreateContentHandler(new PageLandEntity()
                {
                    Url = housesUrl,
                    Depth = DataDepth.Houses
                });
                pHandler.AjaxBlocks.Add(HACK);
                var dom = Crawler.Lander.GetDocument(pHandler);
                var attrs = new AttributeFiller();

                attrs.Append(QueryTexts(dom.DocumentNode, ".xxjs_rbar_ct li"));

                // The id is a hash of the canonical URL, so repeat visits map to
                // the same record.
                housesID = GenHashKey(housesUrl.OriginalString);
                var bo = Crawler.Repository.LoadHouses(housesID);
                if (!string.IsNullOrEmpty(bo.SiteID))
                {
                    // Already scraped for some site — nothing more to do.
                    return;
                }
                bo.SiteID = "Mytophome.com";
                bo.PageUrl = housesUrl.OriginalString;
                bo.CityName = Crawler.Config.CityName;
                // Map this site's label vocabulary onto the entity's canonical
                // attribute names (e.g. "楼盘名称" → "小区名称").
                attrs.FillEntity(bo, new Dictionary<string, string>()
                {
                    {"楼盘名称", "小区名称"},
                    {"楼盘地址", "小区地址"},
                    {"发展商", "开发商"},
                    {"物管公司", "物业公司"},
                    {"物管电话", "物业办公电话"},
                });
                MapMark(bo);
                Crawler.Repository.Save(bo);
                Crawler.OutWrite("保存楼盘 {0}", bo.小区名称);
            }
        }
    }
  • 相关阅读:
    webpack 性能优化
    Bert模型实现垃圾邮件分类
    基于SKLearn的SVM模型垃圾邮件分类——代码实现及优化
    sklearn中,数据集划分函数 StratifiedShuffleSplit.split() 使用踩坑
    mysql5.7安装教程【转载】
    Postman 使用小技巧/指南
    如何知道 window 的 load 事件已经触发
    前端常用库 CDN
    使用 rollup 打包可按需加载的 NPM 包
    webpack 4 快速搭建
  • 原文地址:https://www.cnblogs.com/Googler/p/4272703.html
Copyright © 2011-2022 走看看