zoukankan      html  css  js  c++  java
  • 利用HtmlAgilityPack抓取网站图片并下载~~~~~~邪恶完善版

    今日看博客园发现一个不错的抓取贴(主要是那个url。。。你懂的),花几分钟改了下,代码增加了按年月日建立目录,按文章建立子目录,图片都保存于内,命令行方式运行,增加了全站的参数。。。

    原始版本:

    利用HtmlAgilityPack抓取XX网站图片并下载~~~~~~邪恶版。。。。

    新版本代码:

    #region Using namespace

    using System;
    using System.IO;
    using System.Linq;
    using System.Net;
    using HtmlAgilityPack;

    #endregion

    namespace DownloadImages
    {
        internal class Program
        {
            private static readonly WebClient Wc = new WebClient();
            private static readonly char[] InvalidFileNameChars = new[]
                                                                      {
                                                                          '"',
                                                                          '<',
                                                                          '>',
                                                                          '|',
                                                                          '\0',
                                                                          '\u0001',
                                                                          '\u0002',
                                                                          '\u0003',
                                                                          '\u0004',
                                                                          '\u0005',
                                                                          '\u0006',
                                                                          '\a',
                                                                          '\b',
                                                                          '\t',
                                                                          '\n',
                                                                          '\v',
                                                                          '\f',
                                                                          '\r',
                                                                          '\u000e',
                                                                          '\u000f',
                                                                          '\u0010',
                                                                          '\u0011',
                                                                          '\u0012',
                                                                          '\u0013',
                                                                          '\u0014',
                                                                          '\u0015',
                                                                          '\u0016',
                                                                          '\u0017',
                                                                          '\u0018',
                                                                          '\u0019',
                                                                          '\u001a',
                                                                          '\u001b',
                                                                          '\u001c',
                                                                          '\u001d',
                                                                          '\u001e',
                                                                          '\u001f',
                                                                          ':',
                                                                          '*',
                                                                          '?',
                                                                          '\\',
                                                                          '/'
                                                                      };
            public static string CleanInvalidFileName(string fileName)
            {
                fileName = fileName + "";
                fileName = InvalidFileNameChars.Aggregate(fileName, (current, c) => current.Replace(c + """"));

                if (fileName.Length > 1)
                    if (fileName[0] == '.')
                        fileName = "dot" + fileName.TrimStart('.');

                return fileName;
            }
            private static void Main(string[] args)
            {
                Start();
            }

            private static void Start()
            {
                var web = new HtmlWeb();
                var startDate = int.Parse(DateTime.Parse("2010-08-18").ToString("yyyyMMdd"));
                var endDate = int.Parse(DateTime.Now.ToString("yyyyMMdd"));
                const int startPageId = 49430;
                const int endPageId = 124621;
                for (int k = startDate; k <= endDate; k++)
                {
                    for (int j = startPageId; j <= endPageId; j++)
                    {
                        string cnblogs = http://xxxxxxxx/ + k + "/" + j + ".html";  //此处省略……源码内详
                        HtmlDocument doc = web.Load(cnblogs);
                        var titles = doc.DocumentNode.SelectNodes("//title");
                        var titleName = j.ToString();
                        if( titles!=null && titles.Count>0)
                            titleName = titles[0].InnerText;
                        HtmlNode node = doc.GetElementbyId("ks_xp");
                        if (node == null)
                        {
                            continue;
                        }
                        foreach (HtmlNode child in node.SelectNodes("//img"))
                        {
                            if (child.Attributes["src"] == null)
                                continue;

                            string imgurl = child.Attributes["src"].Value;
                            DownLoadImg(imgurl, k + "", CleanInvalidFileName(titleName));
                            Console.WriteLine("正在下载:" + titleName + " " + imgurl);
                        }
                    }
                }
                //善后
                CleanEmptyFolders();
            }

            private static void CleanEmptyFolders()
            {
                var rootFolders = Environment.CurrentDirectory + "\\Images\\";
                var folders = Directory.GetDirectories(rootFolders, "*.*", SearchOption.AllDirectories);
                foreachvar f in folders)
                {
                    if (Directory.GetFiles(f, "*.*", SearchOption.AllDirectories).Length == 0)
                        Directory.Delete(f);
                }
            }

            private static void DownLoadImg(string url, string folderName, string subFolderName)
            {
                var fileName = CleanInvalidFileName(url.Substring(url.LastIndexOf("/") + 1));
                var fileFolder = Environment.CurrentDirectory + "\\Images\\" + folderName + "\\" + subFolderName + "\\" ;
                if (!Directory.Exists(fileFolder))
                    Directory.CreateDirectory(fileFolder);
                fileName = fileFolder + fileName;
                try
                {
                    Wc.DownloadFile(url, fileName);
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.Message);
                }
            }
        }
    }
     

    测试程序和源码下载:

    /Files/Chinasf/DownloadImages.rar

    版权信息
    作者:Chinasf
    本文版权归作者所有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保留追究法律责任的权利。
  • 相关阅读:
    《软件性能测试与LoadRunner实战教程》新书上市
    《你必须掌握的Entity Framework 6.x与Core 2.0》正式出版感想
    《你必须掌握的Entity Framework 6.x与Core 2.0》书籍出版
    别跟我谈EF抵抗并发,敢问你到底会不会用EntityFramework
    EntityFramework Core 运行dotnet ef命令迁移背后本质是什么?(EF Core迁移原理)
    已被.NET基金会认可的弹性和瞬态故障处理库Polly介绍
    WebAPi之SelfHost自创建证书启动Https疑难解惑及无法正确返回结果
    Web APi之认证(Authentication)两种实现方式【二】(十三)
    读懂操作系统之虚拟内存TLB与缓存(cache)关系篇(四)
    读懂操作系统之缓存原理(cache)(三)
  • 原文地址:https://www.cnblogs.com/Chinasf/p/2354971.html
Copyright © 2011-2022 走看看