zoukankan      html  css  js  c++  java
  • .net HttpCrawler

    using HtmlAgilityPack;
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace HttpCrawler
    {
        /// <summary>
        /// Console sample that downloads the CSDN .NET forum index, follows every
        /// thread link in parallel and collects the threads that contain a post by
        /// a given nickname, timing the whole run with a <see cref="Stopwatch"/>.
        /// </summary>
        class Program
        {
            /// <summary>Upper bound on the number of thread pages fetched concurrently.</summary>
            const int MaxParallelRequests = 20;

            /// <summary>
            /// Entry point: crawls the forum index, filters threads by poster and
            /// prints the elapsed time in milliseconds.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            /// <exception cref="InvalidOperationException">
            /// The forum index page does not contain the expected thread table.
            /// </exception>
            static void Main(string[] args)
            {
                // On .NET Framework the per-host connection limit defaults to a very
                // small number, which would serialize most of the parallel downloads.
                ServicePointManager.DefaultConnectionLimit = MaxParallelRequests;

                Stopwatch sw = Stopwatch.StartNew();

                // SelectSingleNode returns null when nothing matches; fail with a
                // clear message instead of a bare null dereference.
                var table = GetHtml("http://bbs.csdn.net/forums/DotNET/")
                    .DocumentNode
                    .SelectSingleNode("//table[@class='table_list parent_forum ']");
                if (table == null)
                {
                    throw new InvalidOperationException(
                        "Thread table not found on the forum index page; the page layout may have changed.");
                }

                // Skip(1) drops the first row of the table before the link of each
                // remaining row is read.
                var titles = from row in table.Elements("tr").Skip(1)
                             let td = row.Element("td")
                             where td != null
                             let a = td.Descendants("a").FirstOrDefault()
                             where a != null
                             // The indexer yields null for a missing attribute, so
                             // check it before reading Value.
                             let hrefAttribute = a.Attributes["href"]
                             where hrefAttribute != null
                             select new
                             {
                                 href = hrefAttribute.Value,
                                 text = a.InnerText
                             };

                var pages = from t in titles
                                .AsParallel().WithDegreeOfParallelism(MaxParallelRequests)
                            where t.href != null
                            let path = "http://bbs.csdn.net" + t.href
                            where HasPostBy(path, "sp1234")
                            select new
                            {
                                title = t.text,
                                href = path
                            };

                // ToList forces the deferred (parallel) query to execute so that the
                // stopwatch measures the actual crawl.
                var results = pages.ToList();
                sw.Stop();

                // NOTE(review): the label below reads "time without concurrency", but
                // the query above runs with AsParallel — confirm which run is intended.
                Console.WriteLine("不加并发的时间:"+sw.ElapsedMilliseconds);
                Console.ReadKey();
            }

            /// <summary>
            /// Downloads a thread page and reports whether any
            /// <c>span.name2nick</c> element has exactly the given text.
            /// </summary>
            /// <param name="url">Absolute URL of the thread page.</param>
            /// <param name="nickName">Nickname to look for (exact, case-sensitive match).</param>
            /// <returns><c>true</c> if at least one matching span exists; otherwise <c>false</c>.</returns>
            static bool HasPostBy(string url, string nickName)
            {
                // SelectNodes returns null (not an empty collection) when no node
                // matches, so a page without nickname spans counts as "no match".
                var nicks = GetHtml(url).DocumentNode.SelectNodes("//span[@class='name2nick']");
                return nicks != null && nicks.Any(nick => nick.InnerText == nickName);
            }

            /// <summary>
            /// Downloads the resource at <paramref name="url"/>, decodes it as UTF-8
            /// and parses it into an <see cref="HtmlDocument"/>.
            /// </summary>
            /// <param name="url">Address of the page to download.</param>
            /// <returns>The parsed HTML document.</returns>
            static HtmlDocument GetHtml(string url)
            {
                string content;
                // WebClient is IDisposable; release it as soon as the download is done.
                using (var client = new WebClient())
                {
                    content = Encoding.UTF8.GetString(client.DownloadData(url));
                }

                var doc = new HtmlDocument();
                doc.LoadHtml(content);
                return doc;
            }
        }
    }
    

      

  • 相关阅读:
    03_ if 练习 _ little2big
    uva 11275 3D Triangles
    uva 12296 Pieces and Discs
    uvalive 3218 Find the Border
    uvalive 2797 Monster Trap
    uvalive 4992 Jungle Outpost
    uva 2218 Triathlon
    uvalive 3890 Most Distant Point from the Sea
    uvalive 4728 Squares
    uva 10256 The Great Divide
  • 原文地址:https://www.cnblogs.com/c-x-a/p/7792750.html
Copyright © 2011-2022 走看看