zoukankan      html  css  js  c++  java
  • .net HttpCrawler

    using HtmlAgilityPack;
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace HttpCrawler
    {
        class Program
        {
            /// <summary>
            /// Downloads the CSDN .NET forum index, fetches every listed thread in parallel
            /// (up to 20 at a time) and keeps the threads that contain a post whose nick is
            /// "sp1234". The elapsed time of the whole run is printed in milliseconds.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                Stopwatch sw = new Stopwatch();
                sw.Start();

                // SelectSingleNode returns null when no node matches (for example if the page
                // layout changed), so fall back to an empty row set instead of dereferencing null.
                var forumTable = GetHtml("http://bbs.csdn.net/forums/DotNET/").DocumentNode.SelectSingleNode("//table[@class='table_list parent_forum ']");
                var rows = forumTable != null
                    ? forumTable.Elements("tr").Skip(1)
                    : Enumerable.Empty<HtmlNode>();

                var titles = from row in rows
                             let td = row.Element("td")
                             where td != null
                             let a = td.Descendants("a").FirstOrDefault()
                             where a != null
                             // The attribute indexer yields null for a missing attribute; skip
                             // such links rather than throwing on ".Value".
                             let hrefAttribute = a.Attributes["href"]
                             where hrefAttribute != null
                             select new
                             {
                                 href = hrefAttribute.Value,
                                 text = a.InnerText
                             };
                var pages = from t in titles
                                .AsParallel().WithDegreeOfParallelism(20)
                            where t.href != null
                            let path = "http://bbs.csdn.net" + t.href
                            // SelectNodes returns null (not an empty collection) when nothing
                            // matches, so the null check must come before the enumeration.
                            let nicks = GetHtml(path).DocumentNode.SelectNodes("//span[@class='name2nick']")
                            where nicks != null && nicks.Any(nick => nick.InnerText == "sp1234")
                            select new
                            {
                                title = t.text,
                                href = path
                            };

                // ToList forces the deferred parallel query to run, so the downloads are timed.
                var results = pages.ToList();
                sw.Stop();
                // NOTE(review): the label reads "time without concurrency" although the query
                // above uses AsParallel; the text is kept unchanged - confirm the intended wording.
                Console.WriteLine("不加并发的时间:"+sw.ElapsedMilliseconds);
                Console.ReadKey();
            }
            /// <summary>
            /// Downloads the resource at <paramref name="url"/>, decodes the bytes as UTF-8
            /// and parses the text into an <see cref="HtmlDocument"/>.
            /// </summary>
            /// <param name="url">Absolute URL of the page to download.</param>
            /// <returns>The parsed document.</returns>
            static HtmlDocument GetHtml(string url)
            {
                string content;
                // WebClient is IDisposable; this method is called from many parallel workers,
                // so release each client deterministically instead of waiting for finalization.
                using (var client = new WebClient())
                {
                    content = Encoding.UTF8.GetString(client.DownloadData(url));
                }

                var doc = new HtmlDocument();
                // LoadHtml parses the string directly, so no undisposed StringReader is left behind.
                doc.LoadHtml(content);
                return doc;
            }
        }
    }
    

      

  • 相关阅读:
    两数之和
    IDEA类的旁边有个对勾
    Markdown 常用语法
    GitLab 使用指南(IntelliJ IDEA)
    Python yield 用法
    Mac for MySQL 5.7 安装教程
    Mac Hadoop2.6(CDH5.9.2)伪分布式集群安装
    Hive 建外链表到 Hbase(分内部表、外部表两种方式)
    hive grouping sets 等聚合函数
    hive row_number等窗口分析函数
  • 原文地址:https://www.cnblogs.com/c-x-a/p/7792750.html
Copyright © 2011-2022 走看看