zoukankan      html  css  js  c++  java
  • realestate.cei.gov.cn

    using AnfleCrawler.Common;
    using System;
    using System.Collections.Concurrent;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace AnfleCrawler.DataAnalyzer
    {
        /// <summary>
        /// Analyzer for the report pages of realestate.cei.gov.cn.
        /// <see cref="Init"/> queues one report URL per option found on a seed page,
        /// <see cref="AnalyzeInternal"/> collects the table rows of every depth-1 page,
        /// and the collected rows are written to a text file whenever the lander
        /// raises its <c>Idle</c> event.
        /// </summary>
        public class ManualAnalyzer : AnalyzerBase
        {
            // Report URL; {0} is the "rq" query value (a yyyyMMdd date).
            private const string ReportUrlFormat = "http://www.realestate.cei.gov.cn/traden/br2.aspx?rq={0}&lx=w6&r1=20140830";

            // "rq" value of the seed page whose "#qrq option" nodes are enumerated.
            private const string SeedDate = "20140601";

            // NOTE(review): this is a drive-relative path (resolved against the current
            // directory of drive D:). It looks like path separators may have been lost
            // (e.g. D:\out\dict.txt) - confirm the intended location before changing it.
            private const string OutputPath = @"D:outdict.txt";

            // Rows gathered by AnalyzeInternal; each entry becomes one comma-joined line.
            private readonly ConcurrentQueue<string[]> _dict = new ConcurrentQueue<string[]>();

            /// <summary>
            /// Subscribes to the lander's <c>Idle</c> event, runs the base initialization,
            /// then loads the seed page and pushes one depth-1 report URL for every
            /// non-blank option value found under "#qrq".
            /// </summary>
            /// <param name="crawler">Crawler this analyzer is attached to.</param>
            public override void Init(PageCrawler crawler)
            {
                crawler.Lander.Idle += Lander_Idle;
                base.Init(crawler);

                var seedUrl = new Uri(string.Format(ReportUrlFormat, SeedDate));
                var dom = Crawler.Lander.GetDocument(new PageContentHandler() { Url = seedUrl });
                foreach (var option in QueryNodes(dom.DocumentNode, "#qrq option"))
                {
                    string val = option.GetAttributeValue("value", string.Empty);
                    if (string.IsNullOrWhiteSpace(val))
                    {
                        // An option without a value would yield "rq=", which
                        // AnalyzeInternal cannot parse as a date.
                        continue;
                    }
                    Crawler.PushUrl(new Uri(string.Format(ReportUrlFormat, val)), 1);
                }
            }

            /// <summary>
            /// Handler for the lander's <c>Idle</c> event: rewrites the output file
            /// (UTF-8, not appended) with every row currently held in the queue,
            /// one comma-joined line per row.
            /// </summary>
            void Lander_Idle(object sender, EventArgs e)
            {
                Crawler.OutWrite("Start step2...");
                App.LogInfo("Start step2...");
                using (var writer = new System.IO.StreamWriter(OutputPath, false, Encoding.UTF8))
                {
                    foreach (var row in _dict)
                    {
                        writer.WriteLine(string.Join(",", row));
                    }
                }
            }

            /// <summary>
            /// Processes a landed page. Only depth-1 pages are handled: the "rq" query
            /// value is parsed as a yyyyMMdd date, the "#str1" node is read, and every
            /// "tr" row under it is queued as [date, cell texts...]. A single
            /// <see cref="Environment.NewLine"/> entry is queued after the rows of a page.
            /// </summary>
            /// <param name="current">The landed page to analyze.</param>
            /// <exception cref="FormatException">The "rq" value is not a yyyyMMdd date.</exception>
            /// <exception cref="ArgumentNullException">The URL carries no "rq" value.</exception>
            protected override void AnalyzeInternal(PageLandEntity current)
            {
                Crawler.OutWrite("*Start step1...");
                var lander = Crawler.Lander;
                var pHandler = CreateContentHandler(current);
                switch (current.Depth)
                {
                    case 1:
                        {
                            // Invariant culture keeps parsing/formatting on the Gregorian
                            // calendar regardless of the machine's regional settings.
                            var invariant = System.Globalization.CultureInfo.InvariantCulture;
                            var query = System.Web.HttpUtility.ParseQueryString(current.Url.Query);
                            var dt = DateTime.ParseExact(query["rq"], "yyyyMMdd", invariant);
                            var dom = lander.GetDocument(pHandler);

                            var checkNode = QueryNode(dom.DocumentNode, "#str1");
                            if (checkNode == null || string.IsNullOrWhiteSpace(checkNode.InnerText))
                            {
                                // Nothing to collect for this date.
                                return;
                            }
                            checkNode.InnerHtml = RepairRowMarkup(checkNode.InnerHtml);
                            App.LogInfo("WTF CN:{0}", checkNode.InnerHtml);

                            string day = dt.ToString("yyyy-MM-dd", invariant);
                            int rowCount = 0;
                            // Count while iterating so the row query is enumerated only once.
                            foreach (var node in QueryNodes(checkNode, "tr"))
                            {
                                var x = new List<string>();
                                x.Add(day);
                                x.AddRange(QueryTexts(node, "td"));
                                _dict.Enqueue(x.ToArray());
                                rowCount++;
                            }
                            _dict.Enqueue(new string[] { Environment.NewLine });
                            Crawler.OutWrite("#Stop step1 {0} {1}", dt.ToShortDateString(), rowCount);
                        }
                        break;
                }
            }

            /// <summary>
            /// Inserts a closing <c>&lt;/tr&gt;</c> in front of every <c>&lt;tr</c> except the
            /// first one. Presumably the site emits rows without closing tags - verify
            /// against the live markup.
            /// </summary>
            /// <param name="html">Inner HTML of the table container.</param>
            /// <returns>
            /// The repaired markup; <paramref name="html"/> unchanged when it is empty or
            /// contains no <c>&lt;tr</c>.
            /// </returns>
            private static string RepairRowMarkup(string html)
            {
                const string rowOpen = "<tr";
                if (string.IsNullOrEmpty(html))
                {
                    return html;
                }
                int firstRow = html.IndexOf(rowOpen, StringComparison.Ordinal);
                if (firstRow < 0)
                {
                    return html;
                }
                // Keep everything up to and including the first "<tr" untouched, so leading
                // whitespace or short content can neither be truncated nor cause an
                // out-of-range Substring.
                int split = firstRow + rowOpen.Length;
                return html.Substring(0, split) + html.Substring(split).Replace(rowOpen, "</tr>" + rowOpen);
            }
        }
    }
  • 相关阅读:
    类 7.2访问控制与封装 笔记
    第七章 类(class)7.1 笔记
    10.4.3反向迭代器Reverse_iterator笔记
    10.4再探迭代器笔记
    10.3.4参数绑定 bind
    10.3lambda表达式笔记
    第10章 10.1-10.3笔记
    关于按下ctrl+z后,之后的cin失效的问题
    构造和改变一个string的其他方法
    BZOJ2527 & 洛谷3527:[Poi2011]Meteors——题解
  • 原文地址:https://www.cnblogs.com/Googler/p/4110974.html
Copyright © 2011-2022 走看看