zoukankan      html  css  js  c++  java
  • realestate.cei.gov.cn

    using AnfleCrawler.Common;
    using System;
    using System.Collections.Concurrent;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace AnfleCrawler.DataAnalyzer
    {
        /// <summary>
        /// Analyzer for the "br2.aspx" pages of realestate.cei.gov.cn.
        /// <see cref="Init"/> reads the options of the "#qrq" select on a seed page and queues one page URL
        /// per option value (depth 1). <see cref="AnalyzeInternal"/> extracts the table rows found under
        /// "#str1" on each queued page, and the rows collected so far are written to a text file every time
        /// the lander raises its Idle event.
        /// </summary>
        public class ManualAnalyzer : AnalyzerBase
        {
            /// <summary>Page URL template; {0} is the value of the "rq" query parameter.</summary>
            private const string PageUrlFormat = "http://www.realestate.cei.gov.cn/traden/br2.aspx?rq={0}&lx=w6&r1=20140830";

            /// <summary>"rq" value of the seed page whose "#qrq" options enumerate the pages to crawl.</summary>
            private const string SeedDate = "20140601";

            /// <summary>Format of the "rq" query parameter.</summary>
            private const string QueryDateFormat = "yyyyMMdd";

            /// <summary>
            /// Collected output rows: the page date followed by the cell texts of one table row.
            /// A single-element entry holding <see cref="Environment.NewLine"/> separates pages.
            /// </summary>
            private readonly ConcurrentQueue<string[]> _dict = new ConcurrentQueue<string[]>();

            /// <summary>
            /// Hooks the lander's Idle event, initializes the base analyzer and queues one depth-1 URL for
            /// every non-blank option value of the "#qrq" select on the seed page.
            /// </summary>
            /// <param name="crawler">The crawler this analyzer is attached to.</param>
            public override void Init(PageCrawler crawler)
            {
                crawler.Lander.Idle += Lander_Idle;
                base.Init(crawler);

                var url = new Uri(string.Format(PageUrlFormat, SeedDate));
                var dom = Crawler.Lander.GetDocument(new PageContentHandler() { Url = url });
                foreach (var node in QueryNodes(dom.DocumentNode, "#qrq option"))
                {
                    string val = node.GetAttributeValue("value", string.Empty);
                    if (string.IsNullOrWhiteSpace(val))
                    {
                        // An option without a value would produce "rq=", which can never be parsed as a date later on.
                        continue;
                    }
                    Crawler.PushUrl(new Uri(string.Format(PageUrlFormat, val)), 1);
                }
            }

            /// <summary>
            /// Idle handler: writes every collected row as one comma-separated line, overwriting the output file.
            /// </summary>
            private void Lander_Idle(object sender, EventArgs e)
            {
                Crawler.OutWrite("Start step2...");
                App.LogInfo("Start step2...");
                // NOTE(review): "D:outdict.txt" is a drive-relative path (no separator after "D:"), so it resolves
                // against the current directory of drive D. Path separators may have been lost - confirm the
                // intended location before relying on it.
                using (var writer = new System.IO.StreamWriter(@"D:outdict.txt", false, Encoding.UTF8))
                {
                    foreach (var set in _dict)
                    {
                        writer.WriteLine(string.Join(",", set));
                    }
                }
            }

            /// <summary>
            /// Extracts the rows under "#str1" of a depth-1 page and enqueues them, each prefixed with the page
            /// date taken from the "rq" query parameter. Pages of any other depth are ignored; pages with a
            /// missing/invalid "rq" value or without "#str1" content are skipped.
            /// </summary>
            /// <param name="current">The landed page to analyze.</param>
            protected override void AnalyzeInternal(PageLandEntity current)
            {
                Crawler.OutWrite("*Start step1...");
                var lander = Crawler.Lander;
                var pHandler = CreateContentHandler(current);
                if (current.Depth != 1)
                {
                    return;
                }

                var query = System.Web.HttpUtility.ParseQueryString(current.Url.Query);
                DateTime dt;
                // TryParseExact instead of ParseExact: a missing or malformed "rq" must not abort the analysis.
                // The invariant culture keeps parsing independent of the machine's calendar settings.
                if (!DateTime.TryParseExact(query["rq"], QueryDateFormat,
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None, out dt))
                {
                    App.LogInfo("Skip page, invalid rq: {0}", current.Url);
                    return;
                }

                var dom = lander.GetDocument(pHandler);
                var checkNode = QueryNode(dom.DocumentNode, "#str1");
                if (checkNode == null || string.IsNullOrWhiteSpace(checkNode.InnerText))
                {
                    return;
                }

                // Presumably the page omits the closing </tr> tags; they are re-inserted so that the rows
                // parse as siblings instead of nesting.
                checkNode.InnerHtml = CloseRowTags(checkNode.InnerHtml);
                App.LogInfo("WTF CN:{0}", checkNode.InnerHtml);

                string date = dt.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
                int rowCount = 0;
                foreach (var node in QueryNodes(checkNode, "tr"))
                {
                    var x = new List<string>();
                    x.Add(date);
                    x.AddRange(QueryTexts(node, "td"));
                    _dict.Enqueue(x.ToArray());
                    rowCount++; // counted here so the node query is not enumerated a second time
                }
                _dict.Enqueue(new string[] { Environment.NewLine });
                Crawler.OutWrite("#Stop step1 {0} {1}", dt.ToShortDateString(), rowCount);
            }

            /// <summary>
            /// Inserts "&lt;/tr&gt;" in front of every "&lt;tr" except the first one. Anything preceding the
            /// first "&lt;tr" is kept untouched, and markup without any "&lt;tr" is returned unchanged.
            /// </summary>
            /// <param name="html">Inner markup of the table container.</param>
            /// <returns>The markup with the row-closing tags inserted.</returns>
            private static string CloseRowTags(string html)
            {
                const string open = "<tr";
                const string close = "</tr>";

                if (string.IsNullOrEmpty(html))
                {
                    return html;
                }
                int first = html.IndexOf(open, StringComparison.Ordinal);
                if (first < 0)
                {
                    return html;
                }
                string head = html.Substring(0, first + open.Length);
                string tail = html.Substring(first + open.Length).Replace(open, close + open);
                return head + tail;
            }
        }
    }
  • 相关阅读:
    SpringBoot集成RocketMQ报错:Bad annotation definition in @ExtRocketMQTemplateConfiguration...
    RocketMQ分析
    SpringBoot 自定义 health Actuator 原理
    【质量】容错机制
    【Java】ByteBuffer介绍
    【AWS】Essentials
    【QA123】NFR 非功能性需求
    【JVM123】OOM分析和解决
    【网络123】Http返回码
    【网络123】HTTP连接
  • 原文地址:https://www.cnblogs.com/Googler/p/4110974.html
Copyright © 2011-2022 走看看