zoukankan      html  css  js  c++  java
  • realestate.cei.gov.cn

    using AnfleCrawler.Common;
    using System;
    using System.Collections.Concurrent;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace AnfleCrawler.DataAnalyzer
    {
        /// <summary>
        /// Analyzer for the realestate.cei.gov.cn "br2.aspx" listing pages.
        /// <see cref="Init"/> seeds one URL per date found in the <c>#qrq</c> select box,
        /// <see cref="AnalyzeInternal"/> collects the table rows of every depth-1 page, and
        /// the lander's Idle event handler writes the collected rows to a text file.
        /// </summary>
        public class ManualAnalyzer : AnalyzerBase
        {
            /// <summary>URL template of a listing page; <c>{0}</c> is the "rq" date (yyyyMMdd).</summary>
            private const string ListUrlFormat = "http://www.realestate.cei.gov.cn/traden/br2.aspx?rq={0}&lx=w6&r1=20140830";

            /// <summary>Date of the page that is fetched to discover all available dates.</summary>
            private const string SeedDate = "20140601";

            /// <summary>Format of the "rq" query-string value.</summary>
            private const string QueryDateFormat = "yyyyMMdd";

            // NOTE(review): this is a drive-relative path (no backslash after "D:"), so the file
            // lands in the current directory of drive D. It may have lost its backslashes when
            // the code was published - confirm the intended location before changing it.
            private const string OutputPath = @"D:outdict.txt";

            /// <summary>Rows collected so far; each entry is one output line (date + cell texts).</summary>
            private readonly ConcurrentQueue<string[]> _rows = new ConcurrentQueue<string[]>();

            /// <summary>
            /// Hooks the lander's Idle event, initializes the base analyzer, then downloads the
            /// seed page and pushes one depth-1 URL for every date option it lists.
            /// </summary>
            /// <param name="crawler">Crawler this analyzer is attached to.</param>
            public override void Init(PageCrawler crawler)
            {
                crawler.Lander.Idle += Lander_Idle;
                base.Init(crawler);

                var seedUrl = new Uri(string.Format(ListUrlFormat, SeedDate));
                var dom = Crawler.Lander.GetDocument(new PageContentHandler() { Url = seedUrl });
                foreach (var option in QueryNodes(dom.DocumentNode, "#qrq option"))
                {
                    string val = option.GetAttributeValue("value", string.Empty);
                    if (string.IsNullOrWhiteSpace(val))
                    {
                        // An option without a value would yield "rq=" and can never be parsed
                        // as a date by AnalyzeInternal, so do not queue it.
                        continue;
                    }
                    Crawler.PushUrl(new Uri(string.Format(ListUrlFormat, val)), 1);
                }
            }

            /// <summary>
            /// Idle handler: writes every collected row to <see cref="OutputPath"/> as one
            /// comma-separated line, overwriting any previous file content.
            /// </summary>
            private void Lander_Idle(object sender, EventArgs e)
            {
                Crawler.OutWrite("Start step2...");
                App.LogInfo("Start step2...");
                using (var writer = new System.IO.StreamWriter(OutputPath, false, Encoding.UTF8))
                {
                    foreach (var row in _rows)
                    {
                        writer.WriteLine(string.Join(",", row));
                    }
                }
            }

            /// <summary>
            /// Processes a depth-1 listing page: reads the date from the "rq" query parameter,
            /// extracts every table row below <c>#str1</c> and queues it prefixed with that date.
            /// Pages of any other depth are ignored.
            /// </summary>
            /// <param name="current">The page to analyze.</param>
            protected override void AnalyzeInternal(PageLandEntity current)
            {
                Crawler.OutWrite("*Start step1...");
                var lander = Crawler.Lander;
                // Created before the depth check, exactly as before, in case creating the
                // handler has side effects the base class relies on.
                var pHandler = CreateContentHandler(current);
                if (current.Depth != 1)
                {
                    return;
                }

                var query = System.Web.HttpUtility.ParseQueryString(current.Url.Query);
                DateTime dt;
                // Invariant culture: the value is machine data, so it must not be interpreted
                // through the calendar of whatever culture the process happens to run under.
                if (!DateTime.TryParseExact(query["rq"], QueryDateFormat,
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None, out dt))
                {
                    App.LogInfo("Skip page without a valid rq date: {0}", current.Url);
                    return;
                }

                var dom = lander.GetDocument(pHandler);
                var checkNode = QueryNode(dom.DocumentNode, "#str1");
                if (checkNode == null || string.IsNullOrWhiteSpace(checkNode.InnerText))
                {
                    return;
                }

                checkNode.InnerHtml = CloseRowTags(checkNode.InnerHtml);
                App.LogInfo("WTF CN:{0}", checkNode.InnerHtml);

                string day = dt.ToString("yyyy-MM-dd");
                int rowCount = 0;
                foreach (var rowNode in QueryNodes(checkNode, "tr"))
                {
                    var cells = new List<string>();
                    cells.Add(day);
                    cells.AddRange(QueryTexts(rowNode, "td"));
                    _rows.Enqueue(cells.ToArray());
                    rowCount++;
                }
                // Separator entry between the rows of two pages.
                _rows.Enqueue(new string[] { Environment.NewLine });
                Crawler.OutWrite("#Stop step1 {0} {1}", dt.ToShortDateString(), rowCount);
            }

            /// <summary>
            /// Inserts a closing <c>&lt;/tr&gt;</c> in front of every <c>&lt;tr</c> except the
            /// first one, so that each row ends where the next one starts.
            /// </summary>
            /// <param name="html">Inner HTML of the table container.</param>
            /// <returns>The patched markup; unchanged when it contains no <c>&lt;tr</c>.</returns>
            private static string CloseRowTags(string html)
            {
                const string rowEnd = "</tr>";
                var patched = html.Replace("<tr", rowEnd + "<tr");
                // The first inserted "</tr>" has no row to close. Locate it instead of blindly
                // cutting five characters, so leading whitespace or very short markup can
                // neither corrupt the result nor throw.
                int first = patched.IndexOf(rowEnd + "<tr", StringComparison.Ordinal);
                return first < 0 ? patched : patched.Remove(first, rowEnd.Length);
            }
        }
    }
  • 相关阅读:
    将Nginx添加到windows服务中
    springboot使用redis管理session
    GIT常用命令
    阻止360、谷歌浏览器表单自动填充
    谈谈对Spring IOC的理解
    同一个Nginx服务器同一端口配置多个代理服务
    LeetCode 653. Two Sum IV
    109. Convert Sorted List to Binary Search Tree(根据有序链表构造平衡的二叉查找树)
    108. Convert Sorted Array to Binary Search Tree(从有序数组中构造平衡的BST)
    LeetCode 236. Lowest Common Ancestor of a Binary Tree(二叉树求两点LCA)
  • 原文地址:https://www.cnblogs.com/Googler/p/4110974.html
Copyright © 2011-2022 走看看