zoukankan      html  css  js  c++  java
  • tdf sample

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using GearUp.Crawler.Entities;
    using HtmlAgilityPack;
    using System.Threading.Tasks;
    using System.Threading.Tasks.Dataflow;
    using System.Text.RegularExpressions;
    using System.Collections.Concurrent;
    using System.Threading;
    
    namespace GearUp.Crawler
    {
        /// <summary>
        /// Crawls pages of a single domain with a TPL Dataflow pipeline:
        /// a downloader feeds a broadcaster, which fans each page out to a link
        /// parser (whose links loop back into the downloader) and to a writer
        /// that persists parsed entries through the repository.
        /// </summary>
        public class Crawler
        {
            private readonly ILoreBookItemRepository repository;
            private readonly ILorebookItemParser parser;
            private readonly LinkManager linkManager;

            // Domain of the start URL; assigned by StartCrawl and used by
            // ExtactLinksFromPage to discard links that leave that domain.
            private string linkDomain;

            // URLs that have already been claimed for download. Static, so the set is
            // shared by every Crawler instance in the process.
            private static readonly ConcurrentDictionary<string, bool> urls = new ConcurrentDictionary<string, bool>();

            // Serializes console output so the colour set by one block is not
            // overwritten by another block running in parallel.
            private static readonly object consoleLock = new object();

            // Per-download timeout, in seconds.
            private const int DownloadTimeout = 10;

            /// <summary>
            /// Creates a crawler that parses entries with <paramref name="parser"/>,
            /// stores them in <paramref name="repository"/> and resolves relative
            /// links with <paramref name="linkManager"/>.
            /// </summary>
            public Crawler(ILoreBookItemRepository repository, ILorebookItemParser parser, LinkManager linkManager)
            {
                this.repository = repository;
                this.parser = parser;
                this.linkManager = linkManager;
            }

            /// <summary>
            /// Builds the dataflow pipeline, seeds it with <paramref name="targetUrl"/>
            /// and blocks the calling thread until the user presses Esc; then cancels
            /// the pipeline and waits for every block to finish.
            /// </summary>
            /// <param name="targetUrl">First URL to download; its domain bounds the crawl.</param>
            /// <remarks>
            /// The method stays <c>async void</c> to keep the existing signature, so
            /// every exception is handled here: nothing can observe it afterwards.
            /// </remarks>
            public async void StartCrawl(string targetUrl)
            {
                using (var cts = new CancellationTokenSource())
                {
                    var ct = cts.Token;

                    linkDomain = LinkManager.LinkDomain(targetUrl);

                    var downloaderOptions = new ExecutionDataflowBlockOptions
                    {
                        MaxMessagesPerTask = 3,
                        MaxDegreeOfParallelism = 4,
                        BoundedCapacity = 10,
                        // Without the token the downloader never completes and the
                        // final WhenAll below would wait forever.
                        CancellationToken = ct
                    };

                    var downloader = new TransformBlock<string, PageAndUrl>(async (url) => await DownloadUrl(url), downloaderOptions);

                    var pipelineOptions = new ExecutionDataflowBlockOptions
                    {
                        MaxMessagesPerTask = 2,
                        CancellationToken = ct
                    };

                    var linkParser = new TransformManyBlock<PageAndUrl, string>(page => ExtactLinksFromPage(page), pipelineOptions);

                    var writer = new ActionBlock<PageAndUrl>(async page => await SaveEntry(page), new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

                    var contentBroadcaster = new BroadcastBlock<PageAndUrl>(p => p, new DataflowBlockOptions { CancellationToken = ct });

                    // Flow setup: downloader -> broadcaster -> (linkParser, writer),
                    // and linkParser -> downloader closes the crawl loop.
                    downloader.LinkTo(contentBroadcaster);
                    contentBroadcaster.LinkTo(linkParser);
                    contentBroadcaster.LinkTo(writer);
                    linkParser.LinkTo(downloader);

                    // Kick off the TPL dataflow here
                    downloader.Post(targetUrl);
                    WriteToConsole("Crawling...", ConsoleColor.Green);
                    PromptUser("Press <Esc> to Stop:", ConsoleColor.White, ConsoleKey.Escape);
                    cts.Cancel();
                    // The writer is deliberately not bound to the token so that pages
                    // it has already accepted are still saved; completing it lets it
                    // drain its queue and then finish.
                    writer.Complete();
                    WriteToConsole("Stopping...", ConsoleColor.Green);

                    try
                    {
                        await Task.WhenAll(downloader.Completion, contentBroadcaster.Completion, linkParser.Completion, writer.Completion);
                    }
                    catch (OperationCanceledException)
                    {
                        // Expected: the cancelled blocks finish in the Canceled state.
                    }
                    catch (Exception ex)
                    {
                        // A block faulted (for example a save failed). Report it instead
                        // of letting it escape an async void method and kill the process.
                        WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
                    }
                }
            }

            /// <summary>
            /// Returns the fully qualified links found in <paramref name="page"/> whose
            /// domain equals the domain of the crawl's start URL.
            /// </summary>
            /// <param name="page">Downloaded page; <c>null</c> yields an empty sequence.</param>
            /// <returns>The in-domain links, in the order the document reported them.</returns>
            public IEnumerable<string> ExtactLinksFromPage(PageAndUrl page)
            {
                if (page == null) return Enumerable.Empty<string>();

                var discoveredLinks = new List<string>();
                var document = new LorebookDocument(page.Html);
                foreach (var link in document.LinksInArticleBodyDiv())
                {
                    var fullUrl = linkManager.FullyQualifyLink(page.Url, link);
                    // Static Equals: same ordinal comparison as before, but it does not
                    // throw when linkDomain has not been set yet.
                    if (string.Equals(linkDomain, LinkManager.LinkDomain(fullUrl), StringComparison.Ordinal))
                        discoveredLinks.Add(fullUrl);
                }
                WriteToConsole("   {0} --> {1} links", ConsoleColor.Gray, page.Url, discoveredLinks.Count);
                return discoveredLinks;
            }

            /// <summary>
            /// Hands the document's official lorebook entry node to the parser and
            /// returns whatever the parser produces for <paramref name="url"/>.
            /// </summary>
            public LorebookItem ExtractLoreBookItem(LorebookDocument document, string url)
            {
                WriteToConsole("Parsing: {0}", ConsoleColor.Cyan, url);
                var itemDetails = document.OfficialLorebookEntry();
                return parser.ParseHtmlNode(itemDetails, url);
            }

            /// <summary>
            /// Downloads <paramref name="url"/> as a string unless it has already been
            /// claimed by an earlier call.
            /// </summary>
            /// <param name="url">Address to fetch.</param>
            /// <returns>
            /// The page with its URL, or <c>null</c> when the URL was seen before, the
            /// download exceeded <see cref="DownloadTimeout"/> seconds, or it failed
            /// (failures are written to the console, never thrown).
            /// </returns>
            public async Task<PageAndUrl> DownloadUrl(string url)
            {
                try
                {
                    // TryAdd is a single atomic claim; a separate ContainsKey check would
                    // let two parallel downloads both pass for the same URL.
                    if (!urls.TryAdd(url, true)) return null;

                    using (var client = new WebClient())
                    using (var timeoutSource = new CancellationTokenSource())
                    {
                        WriteToConsole("Fetching: {0}", ConsoleColor.DarkGreen, url);
                        var download = client.DownloadStringTaskAsync(url);
                        var timeout = Task.Delay(TimeSpan.FromSeconds(DownloadTimeout), timeoutSource.Token);
                        var finished = await Task.WhenAny(download, timeout);
                        if (finished == timeout)
                        {
                            client.CancelAsync();
                            WriteToConsole("Cancel: [{0}]", ConsoleColor.Gray, url);
                            return null;
                        }

                        // The download won: release the pending timer right away.
                        timeoutSource.Cancel();

                        // The task is already complete; awaiting it rethrows the original
                        // exception rather than an AggregateException wrapper.
                        string result = await download;

                        WriteToConsole("Downloaded: {0}", ConsoleColor.White, url);

                        return new PageAndUrl() { Url = url, Html = result };
                    }
                }
                catch (WebException ex)
                {
                    WriteToConsole("Error: [{0}]\n\t{1}", ConsoleColor.Red, url, ex.Message);
                }
                catch (AggregateException ex)
                {
                    foreach (var exc in ex.Flatten().InnerExceptions)
                    {
                        WriteToConsole("Error: [{0}]\n\t{1}", ConsoleColor.Red, url, exc.Message);
                    }
                }
                catch (Exception ex)
                {
                    WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
                }

                return null;
            }

            /// <summary>
            /// Parses <paramref name="page"/> into a lorebook item and saves it under
            /// the page URL. Does nothing when the page is <c>null</c> or the parser
            /// returns no item.
            /// </summary>
            public async Task SaveEntry(PageAndUrl page)
            {
                if (page == null) return;
                var document = new LorebookDocument(page.Html);
                var item = ExtractLoreBookItem(document, page.Url);
                if (item != null) await repository.Save(page.Url, item);
            }

            /// <summary>
            /// Writes one formatted line to the console in the given colour and then
            /// restores the default colours.
            /// </summary>
            private static void WriteToConsole(string format, ConsoleColor color, params object[] texts)
            {
                // Colour change, write and reset must happen as one unit; the pipeline
                // blocks call this from several threads at once.
                lock (consoleLock)
                {
                    Console.ForegroundColor = color;
                    Console.WriteLine(format, texts);
                    Console.ResetColor();
                }
            }

            /// <summary>
            /// Prints <paramref name="message"/> and blocks until the user responds:
            /// with a full line of input when <paramref name="key"/> is <c>null</c>,
            /// otherwise until exactly that key is pressed.
            /// </summary>
            private void PromptUser(string message, ConsoleColor color, ConsoleKey? key = null)
            {
                WriteToConsole(message, color);
                if (key == null)
                {
                    Console.ReadLine();
                    return;
                }

                ConsoleKeyInfo entry;
                do
                {
                    // intercept: true keeps the pressed key from being echoed.
                    entry = Console.ReadKey(true);
                } while (key != entry.Key);
            }

        }
    }
    
  • 相关阅读:
    surfer插值方法及提取插值结果 转载
    Surfer的grd数据转换成gmt可用的grd数据方法
    Appium+Python3+ Android入门
    Flask入门的第一个项目
    测试报告模板
    火狐浏览器之伪造IP地址
    获取apk的签名信息
    初识kibana
    Fiddler模拟post四种请求数据
    Python-正则表达式
  • 原文地址:https://www.cnblogs.com/zeroone/p/4418338.html
Copyright © 2011-2022 走看看