using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Crawler/analyzer for news pages on news.sh.soufun.com.
    /// Depth 0 = category listing pages; depth 1 = full ("_all") article pages.
    /// </summary>
    internal class Soufun_News : AnalyzerBase
    {
        /// <summary>
        /// News category IDs substituted into the listing URL pattern.
        /// The <see cref="DescriptionAttribute"/> holds the site's Chinese category label.
        /// </summary>
        private enum Kind
        {
            [Description("市场")]
            Market = 32,
            [Description("政策")]
            Policy = 35,
            [Description("公司")]
            Company = 736,
        }

        // Elements stripped from the article body before persisting its HTML.
        private static readonly string[] FilterTags = new string[] { "script", "iframe" };

        /// <summary>
        /// Seeds the crawler with listing URLs for every <see cref="Kind"/>, pages 1-50.
        /// </summary>
        /// <param name="crawler">Crawler to receive the generated seed URLs at depth 0.</param>
        public override void Init(PageCrawler crawler)
        {
            string exp = string.Format("http://news.sh.soufun.com/more/[{0}]/[1-50].html",
                string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>()));
            crawler.PushUrl(new StringPatternGenerator(exp), 0);
            base.Init(crawler);
        }

        /// <summary>
        /// Dispatches on crawl depth: depth 0 extracts article links from a listing page
        /// and enqueues their single-page ("_all") variants; depth 1 parses an article page
        /// and persists it via the repository.
        /// </summary>
        /// <param name="current">The landed page being analyzed.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            dynamic repository = Repository;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        var dom = lander.GetDocument(pHandler);
                        foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
                        {
                            var linkNode = QueryNode(node, "a.link_01");
                            string url = GetHref(linkNode, current.Url).OriginalString;
                            // Rewrite "...foo.html" to "...foo_all.html" (single-page article view).
                            // char overload is ordinal; the original string overload was
                            // culture-sensitive (CA1307).
                            int i = url.LastIndexOf('.');
                            if (i < 0)
                            {
                                // No extension: nothing to rewrite. Skip instead of letting
                                // string.Insert throw ArgumentOutOfRangeException on -1.
                                continue;
                            }
                            Crawler.PushUrl(new Uri(url.Insert(i, "_all")), 1);
                        }
                    }
                    break;
                case 1:
                    {
                        var dom = lander.GetDocument(pHandler);
                        // Last breadcrumb anchor in #newxq_B01_26 is the category label.
                        var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
                        string kind = QueryNodes(hackNode, "a").Last().InnerText;
                        string title = QueryNode(dom.DocumentNode, "h1").InnerText;
                        var contentNode = QueryNode(dom.DocumentNode, "#news_body");
                        // Strip script/iframe elements; ToArray() snapshots the node list so
                        // removal does not mutate the collection while enumerating.
                        foreach (string tag in FilterTags)
                        {
                            foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
                            {
                                node.Remove();
                            }
                        }
                        // First span: publish date; optional second span: source attribution.
                        var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take(2).ToArray();
                        string source = null;
                        DateTime publishDate = default(DateTime);
                        if (set.Length > 0)
                        {
                            // NOTE(review): culture-sensitive parse; site dates appear to be
                            // numeric (e.g. yyyy-MM-dd) — confirm before forcing InvariantCulture.
                            DateTime.TryParse(set[0].InnerText, out publishDate);
                        }
                        if (set.Length == 2)
                        {
                            source = set[1].InnerText;
                        }
                        repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
                        Crawler.OutWrite("保存新闻 {0}", title);
                    }
                    break;
            }
        }
    }
}
/// <summary>
/// Upserts a news record keyed by the MD5 hash of the page URL: creates the row
/// when absent, then overwrites its mutable fields and commits.
/// </summary>
/// <param name="pageUrl">Article page URL; its hash is the row key, its authority the site ID.</param>
/// <param name="kind">Category label of the article.</param>
/// <param name="source">Source attribution; may be null.</param>
/// <param name="title">Article title.</param>
/// <param name="content">Article body HTML.</param>
/// <param name="publishDate">Publication date parsed from the page.</param>
public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
{
    Guid rowID = CryptoManaged.MD5Hash(pageUrl.OriginalString);
    using (var db = Create())
    {
        var news = db.News.SingleOrDefault(t => t.RowID == rowID);
        if (news == null)
        {
            news = new News()
            {
                RowID = rowID,
                SiteID = pageUrl.Authority,
            };
            db.News.Add(news);
        }
        news.Kind = kind;
        news.Source = source;
        news.Title = title;
        news.Content = content;
        news.PublishDate = publishDate;
        db._SaveChanges();
    }
}