zoukankan      html  css  js  c++  java
  • 58.com qiyi

    using AnfleCrawler.Common;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace AnfleCrawler.DataAnalyzer
    {
        /// <summary>
        /// Analyzer for the qy.58.com company directory. Depth 0 pages are listing
        /// pages from which company links are collected; depth 1 pages are parsed
        /// into a <c>CompanyEntity</c> and saved through <c>Repository</c>.
        /// </summary>
        internal class Qy58 : AnalyzerBase
        {
            /// <summary>
            /// Initializes the base analyzer and seeds the crawler with the first
            /// listing page at depth 0.
            /// </summary>
            /// <param name="crawler">The crawler that receives the seed URL.</param>
            public override void Init(PageCrawler crawler)
            {
                base.Init(crawler);
    
                var url = new Uri("http://qy.58.com/caohejing/pn1/?PGTID=14177711280840.45006677554920316&ClickID=1");
                // NOTE(review): presumably a sample of a depth-1 company page URL, kept for reference:
                // http://qy.58.com/19583455460359/?PGTID=14177659184690.5166369006238447&ClickID=4
                crawler.PushUrl(url, 0);
            }
    
            /// <summary>
            /// Processes one landed page according to its depth: depth 0 enqueues
            /// every link found under <c>.compList a</c> at depth 1; depth 1 reads the
            /// company name and the <c>.basicMsg</c> table, then saves the company.
            /// Other depths are ignored.
            /// </summary>
            /// <param name="current">The page being analyzed.</param>
            protected override void AnalyzeInternal(PageLandEntity current)
            {
                var lander = Crawler.Lander;
                var pHandler = CreateContentHandler(current);
                switch (current.Depth)
                {
                    case 0:
                        {
                            // HACK is declared outside this file (not visible here).
                            pHandler.AjaxBlocks.Add(HACK);
                            var dom = lander.GetDocument(pHandler);
                            // NOTE(review): presumably follows the ".next" pager link - confirm in AnalyzerBase.
                            DoPerPaging(current, dom.DocumentNode, ".next");
    
                            foreach (var node in QueryNodes(dom.DocumentNode, ".compList a"))
                            {
                                var url = GetHref(node, current.Url);
                                Crawler.PushUrl(url, 1);
                            }
                        }
                        break;
                    case 1:
                        {
                            var dom = lander.GetDocument(pHandler);
                            var attr = new AttributeFiller();
    
                            attr.Append("Name:{0}", QueryTexts(dom.DocumentNode, ".compT").First());
    
                            // The first header cell is skipped, as in the original implementation.
                            foreach (var th in QueryNodes(dom.DocumentNode, ".basicMsg table th").Skip(1))
                            {
                                string sTh = th.InnerText, sTd;
                                switch (sTh)
                                {
                                    case "联系电话":
                                    case "邮箱":
                                        // These two values are read from an <img> in the sibling cell and
                                        // passed through OCR.
                                        var iNode = QueryNode(th.NextSibling, "img");
                                        byte[] imgRaw;
                                        // WebClient is IDisposable; release it as soon as the download is done.
                                        using (var client = new System.Net.WebClient())
                                        {
                                            imgRaw = client.DownloadData(GetHref(iNode, current.Url, attrName: "src"));
                                        }
                                        // GDI+ needs the stream to stay open for the Bitmap's lifetime, so the
                                        // stream is disposed only after the Bitmap.
                                        // NOTE(review): assumes OCR does not keep a reference to the image - confirm.
                                        using (var imgStream = new System.IO.MemoryStream(imgRaw))
                                        using (var img = new System.Drawing.Bitmap(imgStream))
                                        {
                                            sTd = OCR(img);
                                        }
                                        break;
                                    case "公司地址":
                                        sTd = QueryTexts(th.NextSibling, "span").First();
                                        break;
                                    default:
                                        sTd = th.NextSibling.InnerText.HtmlTrim();
                                        break;
                                }
                                attr.Append("{0}:{1}", sTh, sTd);
                            }
    
                            var bo = new CompanyEntity();
                            bo.City = "上海";
                            bo.GroupName = "漕河泾企业";
                            bo.PageUrl = current.Url.OriginalString;
                            bo.UpdateDate = DateTime.Now;
                            // Maps the table header text to CompanyEntity property names.
                            attr.FillEntity(bo, new Dictionary<string, string>() 
                            {
                                {"公司性质", "Nature"},
                                {"公司行业", "Industry"},
                                {"公司规模", "Scale"},
                                {"联系人", "ContactPerson"},
                                {"企业网址", "Website"},
    
                                {"联系电话", "Tel"},
                                {"邮箱", "Email"},
                                {"公司地址", "Address"},
                            });
                            Repository.SaveCompany(bo);
                            Crawler.OutWrite("保存企业 {0}", bo.Name);
                        }
                        break;
                }
            }
        }
    }
  • 相关阅读:
    金山词霸注册表怎么删
    新手学习jquery
    《企业应用架构模式》(POEAA)读书笔记
    Silverlight 4 tools
    asp.net非常基础的面试题
    VS 2010 中文版正式版无法安装Silverlight4 Tools的解决办法
    OnPreRender(EventArgs e) 事件常用的方法
    各大搜索引擎网站登录入口
    向用户控件传递参数的问题
    URLRewriter
  • 原文地址:https://www.cnblogs.com/Googler/p/4211492.html
Copyright © 2011-2022 走看看