zoukankan      html  css  js  c++  java
  • Soufun_News

    using AnfleCrawler.Common;
    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace AnfleCrawler.DataAnalyzer
    {
        /// <summary>
        /// Analyzer for the news section of news.sh.soufun.com.
        /// Depth 0 pages are category listings from which article links are collected;
        /// depth 1 pages are the articles themselves, which are parsed and saved through the repository.
        /// </summary>
        internal class Soufun_News : AnalyzerBase
        {
            /// <summary>
            /// News categories to crawl. The numeric value of each member is placed into the
            /// listing URL pattern built in <see cref="Init"/>.
            /// </summary>
            private enum Kind
            {
                [Description("市场")]
                Market = 32,
                [Description("政策")]
                Policy = 35,
                [Description("公司")]
                Company = 736,
            }

            /// <summary>
            /// Tag names whose nodes are removed from the article body before it is saved.
            /// </summary>
            private static readonly string[] FilterTags = new string[] { "script", "iframe" };

            /// <summary>
            /// Queues the listing pages (depth 0) for every <see cref="Kind"/> value and then
            /// runs the base initialization.
            /// </summary>
            /// <param name="crawler">The crawler that receives the seed URL pattern.</param>
            public override void Init(PageCrawler crawler)
            {
                // The enum values are joined as "32,35,736" and placed between brackets in the pattern.
                // NOTE(review): the bracket expansion semantics belong to StringPatternGenerator, which is not visible here.
                string exp = string.Format("http://news.sh.soufun.com/more/[{0}]/[1-50].html", string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>()));
                crawler.PushUrl(new StringPatternGenerator(exp), 0);
                base.Init(crawler);
            }

            /// <summary>
            /// Processes one landed page according to its crawl depth.
            /// Depth 0: pushes the "_all" variant of every article link found on the listing page at depth 1.
            /// Depth 1: extracts kind, title, body, publish date and source, and saves them as a news record.
            /// Other depths are ignored.
            /// </summary>
            /// <param name="current">The page being analyzed.</param>
            protected override void AnalyzeInternal(PageLandEntity current)
            {
                var lander = Crawler.Lander;
                dynamic repository = Repository;
                var pHandler = CreateContentHandler(current);
                switch (current.Depth)
                {
                    case 0:
                        {
                            var dom = lander.GetDocument(pHandler);
                            foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
                            {
                                var linkNode = QueryNode(node, "a.link_01");
                                string url = GetHref(linkNode, current.Url).OriginalString;
                                Crawler.PushUrl(new Uri(ToFullArticleUrl(url)), 1);
                            }
                        }
                        break;
                    case 1:
                        {
                            var dom = lander.GetDocument(pHandler);
                            var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
                            // The text of the last anchor inside the node is used as the news kind.
                            string kind = QueryNodes(hackNode, "a").Last().InnerText;
                            string title = QueryNode(dom.DocumentNode, "h1").InnerText;
                            var contentNode = QueryNode(dom.DocumentNode, "#news_body");
                            foreach (string tag in FilterTags)
                            {
                                // Snapshot with ToArray so nodes can be removed while iterating.
                                foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
                                {
                                    node.Remove();
                                }
                            }
                            // First span is parsed as the publish date, the optional second one is the source.
                            var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take(2).ToArray();
                            string source = null;
                            // Stays at default(DateTime) when the span is missing or unparsable,
                            // which is the same value a failed TryParse leaves behind.
                            DateTime publishDate = default(DateTime);
                            if (set.Length > 0)
                            {
                                DateTime.TryParse(set[0].InnerText, out publishDate);
                            }
                            if (set.Length == 2)
                            {
                                source = set[1].InnerText;
                            }
                            repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
                            Crawler.OutWrite("保存新闻 {0}", title);
                        }
                        break;
                }
            }

            /// <summary>
            /// Inserts "_all" in front of the file extension of the last path segment,
            /// e.g. ".../123.html" becomes ".../123_all.html".
            /// </summary>
            /// <param name="url">The article URL taken from a listing page.</param>
            /// <returns>
            /// The rewritten URL, or <paramref name="url"/> unchanged when the last path segment
            /// has no dot (so the host name is never altered and no out-of-range insert occurs).
            /// </returns>
            private static string ToFullArticleUrl(string url)
            {
                int slash = url.LastIndexOf('/');
                int dot = url.LastIndexOf('.');
                // A dot located before the last slash belongs to the host or a directory, not to the file name.
                if (dot <= slash)
                {
                    return url;
                }
                return url.Insert(dot, "_all");
            }
        }
    }
            /// <summary>
            /// Creates or updates the news record that corresponds to a page URL.
            /// The record key is the MD5-based Guid of the URL's original string; when no record
            /// with that key exists a new one is added, then all content fields are overwritten and saved.
            /// </summary>
            /// <param name="pageUrl">URL of the article page; also supplies the SiteID (its authority) for new records.</param>
            /// <param name="kind">News category text.</param>
            /// <param name="source">News source text; may be null.</param>
            /// <param name="title">Article title.</param>
            /// <param name="content">Article body markup.</param>
            /// <param name="publishDate">Publish date of the article.</param>
            public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
            {
                Guid newsId = CryptoManaged.MD5Hash(pageUrl.OriginalString);
                using (var context = Create())
                {
                    // Look up an existing record by its key.
                    var entity = context.News.Where(row => row.RowID == newsId).SingleOrDefault();
                    if (entity == null)
                    {
                        // First time this URL is seen: key and site are set only on creation.
                        entity = new News()
                        {
                            RowID = newsId,
                            SiteID = pageUrl.Authority,
                        };
                        context.News.Add(entity);
                    }

                    // Content fields are refreshed on every save, for new and existing records alike.
                    entity.Kind = kind;
                    entity.Source = source;
                    entity.Title = title;
                    entity.Content = content;
                    entity.PublishDate = publishDate;

                    context._SaveChanges();
                }
            }
  • 相关阅读:
    坚持的力量 第一篇
    有声似无声
    坚持的力量
    新浪技术面试题
    单词的个数
    我的研究生规划
    go to the train station
    百度面试题求绝对值最小的数
    关于CIW认证考试CIW 常见问题解答
    [恢]hdu 2087
  • 原文地址:https://www.cnblogs.com/Googler/p/4181664.html
Copyright © 2011-2022 走看看