zoukankan      html  css  js  c++  java
  • DotnetSpider5 爬博客园新闻

      只要是写爬虫,就必须爬一下博客园。不知道为什么,反正大家都这样,就跟 Hello World 一样吧。

      DotnetSpider 是一个非常优秀的爬虫框架,无论是扩展性、易用性还是可读性都很出色。我已经跳进作者的坑 4 次了:DotnetSpider 现在的版本是 5,我是从版本 2 开始用的,最近打算跳入新坑。

    版本5的文档 https://github.com/dotnetcore/DotnetSpider/wiki

    爬博客园其实作者已经提供了 Sample,不过比较简单。

    我这边为了跳新坑,重新改了一下,并对接了 MySQL。

    /// <summary>
    /// Spider that crawls news.cnblogs.com, extracts the title and body of each
    /// news page, and stores them in a MySQL <c>article</c> table.
    /// </summary>
    public class CnblogsSpider : Spider
    	{
    		/// <summary>
    		/// Builds the spider host with Serilog logging and a de-duplicating
    		/// breadth-first scheduler, then runs it to completion.
    		/// </summary>
    		public static async Task RunAsync()
    		{
    			var builder = Builder.CreateDefaultBuilder<CnblogsSpider>();
    			builder.UseSerilog();
    			builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
    			await builder.Build().RunAsync();
    		}
    
    		/// <summary>Forwards the injected dependencies to the <see cref="Spider"/> base class.</summary>
    		public CnblogsSpider(IOptions<SpiderOptions> options,
    			SpiderServices services,
    			ILogger<Spider> logger) : base(
    			options, services, logger)
    		{
    		}
    
    		/// <summary>
    		/// Seeds the crawl with one news page and the first list page, then
    		/// registers the parser followed by the MySQL storage.
    		/// </summary>
    		protected override async Task InitializeAsync(CancellationToken stoppingToken)
    		{
    			await AddRequestsAsync(new Request("https://news.cnblogs.com/n/666228/"));
    			await AddRequestsAsync(new Request("https://news.cnblogs.com/n/page/1/"));
    
    			// The parser is registered before the storage so that the storage
    			// sees the Article the parser attached to the context.
    			AddDataFlow(new ListNewsParser());
    			AddDataFlow(new MysqlNewStorage());
    		}
    
    		/// <summary>Returns a freshly generated GUID as the id and "cnblogs" as the name.</summary>
    		protected override (string Id, string Name) GetIdAndName()
    		{
    			return (Guid.NewGuid().ToString(), "cnblogs");
    		}
    
    		/// <summary>
    		/// Storage data flow that persists parsed <see cref="Article"/> items into MySQL.
    		/// </summary>
    		protected class MysqlNewStorage : StorageBase
    		{
    			/// <summary>Opens a connection using the "Default" connection string.</summary>
    			private static MySqlConnection CreateConnection()
    			{
    				return new MySqlConnection(AppConfig.Configuration.GetConnectionString("Default"));
    			}
    
    			/// <summary>Creates the <c>article</c> table when it does not exist yet.</summary>
    			public override async Task InitAsync()
    			{
    				await using var conn = CreateConnection();
    				await conn.ExecuteAsync(@"
    create table if not exists article
    (
        id       int auto_increment
        primary key,
        title    varchar(500)      not null,
        sContent  varchar(2000)     null
    );
    ");
    			}
    
    			/// <summary>
    			/// Inserts the <see cref="Article"/> carried by <paramref name="context"/>
    			/// unless a row with the same title already exists.
    			/// </summary>
    			/// <param name="context">Data context of the page that was just parsed.</param>
    			protected override async Task StoreAsync(DataContext context)
    			{
    				var typeName = typeof(Article).FullName;
    
    				// Pages that produced no Article (e.g. list pages) have nothing to store.
    				if (!(context.GetData(typeName) is Article article))
    				{
    					return;
    				}
    
    				await using var conn = CreateConnection();
    
    				// The title comes from scraped HTML, so it is passed as a parameter
    				// rather than spliced into the SQL text (avoids SQL injection and
    				// failures on titles that contain quotes).
    				var existing = await conn.ExecuteScalarAsync<int>(
    					"SELECT count(id) FROM article WHERE title = @Title",
    					new { article.Title });
    
    				if (existing > 0)
    				{
    					return;
    				}
    
    				// NOTE(review): check-then-insert is not atomic, and the table has no
    				// unique key on title, so IGNORE alone does not prevent duplicates
    				// when several pages are stored concurrently.
    				await conn.ExecuteAsync(
    					"INSERT IGNORE INTO article (title, sContent) VALUES (@Title,@SContent);",
    					article);
    			}
    		}
    
    		/// <summary>
    		/// Parser that follows every link found on news.cnblogs.com pages and
    		/// extracts an <see cref="Article"/> from pages containing a news body.
    		/// </summary>
    		protected class ListNewsParser : DataParser
    		{
    			public ListNewsParser()
    			{
    				// Dots are escaped so they match a literal '.' instead of any character.
    				AddRequiredValidator(request =>
    					Regex.IsMatch(request.RequestUri.ToString(), @"news\.cnblogs\.com"));
    
    				// "." selects the whole document as the source of follow-up links.
    				// To restrict following to pagination links, use
    				// Selectors.XPath(".//div[@class='pager']") instead.
    				AddFollowRequestQuerier(Selectors.XPath("."));
    			}
    
    			/// <summary>
    			/// Reads the title and content from the <c>news_main</c> element and,
    			/// when a title is present, attaches an <see cref="Article"/> to the context.
    			/// </summary>
    			/// <param name="context">Data context of the downloaded page.</param>
    			protected override Task Parse(DataContext context)
    			{
    				var newsMain = context.Selectable.Select(Selectors.XPath(".//div[@id='news_main']"));
    				if (newsMain == null)
    				{
    					// Not a news detail page; nothing to extract.
    					return Task.CompletedTask;
    				}
    
    				var title = newsMain.Select(Selectors.XPath(".//div[@id='news_title']"))?.Value?.Trim();
    				var content = newsMain.Select(Selectors.XPath(".//div[@id='news_content']"))?.Value?.Trim();
    
    				// The title column is NOT NULL, so an article without a title is skipped
    				// instead of failing later (the original threw NullReferenceException here).
    				if (string.IsNullOrWhiteSpace(title))
    				{
    					return Task.CompletedTask;
    				}
    
    				context.AddData(typeof(Article).FullName,
    					new Article
    					{
    						Title = title,
    						SContent = content,
    					});
    
    				return Task.CompletedTask;
    			}
    		}
    
    		/// <summary>A scraped news article; property names match the SQL parameters used on insert.</summary>
    		public class Article
    		{
    			/// <summary>Trimmed text of the <c>news_title</c> element.</summary>
    			public string Title { get; set; }
    
    			/// <summary>Trimmed text of the <c>news_content</c> element; may be null.</summary>
    			public string SContent { get; set; }
    		}
    	}
    

      

     源码(https://files.cnblogs.com/files/leoxjy/ConsoleDotnetSpider5Sample.zip)

  • 相关阅读:
    线程池略略观
    spring-mvc的工作原理
    openstack cinder-backup流程与源码分析
    为何说只有 1 种实现线程的方法?
    经典排序算法原理解析与优劣对比
    Java中List和ArrayList的区别
    openstack-taskflow 组件记录
    递归:如何利用递归求解汉诺塔问题?
    登录MySQL提示ERROR 1045 (28000)错误解决方法
    回归JavaScript基础(九)
  • 原文地址:https://www.cnblogs.com/leoxjy/p/13216904.html
Copyright © 2011-2022 走看看