zoukankan      html  css  js  c++  java
  • C#备份博客园随笔文章和图片----使用HtmlAgilityPack解析html

    之前用.NET做网页采集实现采用正则表达式去匹配解析,比较繁琐,花费时间较多,若是Html复杂的话真是欲哭无泪。
    很早就听说过HtmlAgilityPack,它是.NET下用XPath来解析HTML的一个类库(包)。但是一直没时间尝试,简单了解了HtmlAgilityPack的API后,发现它真是HTML解析利器,于是花些时间做一个例子记录一下。
     
    本次是以下载博客园随笔分类文章为例,采用两部分实现,第一部分是将采集到的文章放到集合变量中,第二部分是通过操作集合变量将文章下载到本地,
    这样做效率较低,因为可以直接边采集文章边下载。暂时没有考虑效率问题,仅仅只是实现功能。下面简单阐述下。
     
    获取随笔分类
     
    根据输入的博客名取得对应的随笔分类。
     
    复制代码
       /// <summary>
            /// Fetches the post categories listed in the sidebar of a cnblogs blog.
            /// </summary>
            /// <param name="uname">Blog name used to build the sidebar URL.</param>
            /// <returns>
            /// One <see cref="BlogType"/> per category link, or <c>null</c> when the request
            /// fails or no usable category link is found.
            /// </returns>
            private static List<BlogType> GettBlogTypeList(string uname)
            {
                string url = "http://www.cnblogs.com/" + uname + "/mvc/blog/sidecolumn.aspx?blogApp=" + uname;
                string htmlStr = CommonHelper.GetRequestStr(url);

                // GetRequestStr signals a failed request with null, and LoadHtml(null) throws.
                if (string.IsNullOrEmpty(htmlStr))
                    return null;

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlStr);
                var nodes = doc.DocumentNode.SelectNodes("//div[@id='sidebar_postcategory']//a"); // post categories
                if (nodes == null || nodes.Count <= 0)
                    return null;

                List<BlogType> list = new List<BlogType>();
                foreach (var node in nodes)
                {
                    // An anchor without an href cannot be crawled; skip it instead of crashing.
                    var hrefAttr = node.Attributes["href"];
                    if (hrefAttr == null || string.IsNullOrEmpty(hrefAttr.Value))
                        continue;

                    // The link text looks like "Name(12)"; keep the part before the first '('
                    // as the plain name and the full text as the display name.
                    string name = node.InnerText;
                    int parenIndex = name.IndexOf('(');
                    list.Add(new BlogType
                    {
                        BlogTypeUrl = hrefAttr.Value,
                        BlogTypeName = parenIndex >= 0 ? name.Substring(0, parenIndex) : name,
                        BlogTypeNameShow = name
                    });
                }

                // Keep the "nothing found => null" contract even if every anchor was skipped.
                return list.Count > 0 ? list : null;
            }
    
     
    
      /// <summary>
      /// A post category of a blog, as read from a category link in the sidebar.
      /// Instances are used as dictionary keys by the crawling methods; the class does not
      /// override equality, so keys compare by reference.
      /// </summary>
      public class BlogType
        {
            /// <summary>Value of the category link's href attribute.</summary>
            public string BlogTypeUrl { get; set; }
            /// <summary>Link text up to (not including) the first '(' character.</summary>
            public string BlogTypeName { get; set; }
            /// <summary>Full, unmodified link text.</summary>
            public string BlogTypeNameShow { get; set; }
        }
    复制代码
     如获取到的随笔分类如下:
     
     
    采集分类的文章
     
    采用两步实现,第一步获取只包含标题和url的文章,第二步再获取文章内容。
         
    复制代码
     /// <summary>
            /// Collects the title and URL of every article listed on each category page.
            /// Article bodies are not downloaded here.
            /// </summary>
            /// <param name="blogTypes">Categories to crawl; <c>null</c> is treated as an empty list.</param>
            /// <param name="useTime">Elapsed time of the whole call in milliseconds.</param>
            /// <returns>A dictionary mapping each category to the articles found on its page.</returns>
            public static Dictionary<BlogType, List<BlogInfo>> GetBlogsByType(List<BlogType> blogTypes, out long useTime)
            {
                Stopwatch sw = new Stopwatch();
                sw.Start();
                Dictionary<BlogType, List<BlogInfo>> dic = new Dictionary<BlogType, List<BlogInfo>>();

                // The category lookup returns null when nothing was found; do not crash on it.
                if (blogTypes != null)
                {
                    foreach (var blogType in blogTypes)
                    {
                        List<BlogInfo> list = new List<BlogInfo>();
                        HtmlDocument doc = new HtmlDocument();

                        // A failed request yields null; parse an empty document so the category
                        // simply ends up with no articles.
                        doc.LoadHtml(CommonHelper.GetRequestStr(blogType.BlogTypeUrl) ?? string.Empty);

                        // Fall back to the sidebar name when the page has no category heading.
                        var typeNameNode = doc.DocumentNode.SelectSingleNode("//div[@class='entrylist']/h1");
                        string typeName = typeNameNode != null ? typeNameNode.InnerText : blogType.BlogTypeName;

                        var listPosttitleNodes = doc.DocumentNode.SelectNodes("//div[@class='entrylistPosttitle']/a");
                        if (listPosttitleNodes != null)
                        {
                            foreach (var titleNode in listPosttitleNodes)
                            {
                                var hrefAttr = titleNode.Attributes["href"];
                                if (hrefAttr == null)
                                    continue; // no URL to fetch the article from

                                Console.WriteLine("正在爬取文章【{0}】...", titleNode.InnerText);
                                list.Add(new BlogInfo
                                {
                                    BlogUrl = hrefAttr.Value,
                                    BlogTitle = titleNode.InnerText,
                                    BlogTypeName = typeName
                                });
                            }
                        }

                        // Indexer instead of Add: the same BlogType instance appearing twice
                        // overwrites its entry rather than throwing.
                        dic[blogType] = list;
                    }
                }

                sw.Stop();
                useTime = sw.ElapsedMilliseconds;
                return dic;
            }
    
     
    
     
    
         /// <summary>
            /// Downloads every article page and fills in body, post time and author
            /// on the <see cref="BlogInfo"/> objects already stored in <paramref name="dic"/>.
            /// </summary>
            /// <param name="dic">Articles grouped by category; the contained objects are updated in place.</param>
            /// <param name="useTime">Elapsed time of the whole call in milliseconds.</param>
            /// <returns>The same dictionary instance that was passed in.</returns>
            public static Dictionary<BlogType, List<BlogInfo>> GetBlogDetail(Dictionary<BlogType, List<BlogInfo>> dic, out long useTime)
            {
                Stopwatch sw = new Stopwatch();
                sw.Start();
                HtmlDocument doc = new HtmlDocument();

                // Only the list items are mutated, never the dictionary itself, so iterating
                // Values directly is safe and avoids the repeated Keys.ElementAt(k) scan.
                foreach (var list in dic.Values)
                {
                    foreach (var blog in list)
                    {
                        Console.WriteLine("正在获取文章【{0}】内容...", blog.BlogTitle);

                        // A failed request yields null; parsing an empty document makes all
                        // three lookups miss, so the placeholder texts below are used.
                        doc.LoadHtml(CommonHelper.GetRequestStr(blog.BlogUrl) ?? string.Empty);

                        var bodyNode = doc.DocumentNode.SelectSingleNode("//div[@id='cnblogs_post_body']");
                        var dateNode = doc.DocumentNode.SelectSingleNode("//span[@id='post-date']");
                        var userNode = doc.DocumentNode.SelectSingleNode("//div[@class='postDesc']/a[1]");
                        blog.BlogContent = bodyNode == null ? "内容获取失败" : bodyNode.InnerHtml;
                        blog.BlogPostTime = dateNode == null ? "发布时间获取失败" : dateNode.InnerText;
                        blog.BlogName = userNode == null ? "用户获取失败" : userNode.InnerText;
                    }
                }

                sw.Stop();
                useTime = sw.ElapsedMilliseconds;
                return dic;
            }
    
     
    
        /// <summary>
        /// One crawled article. Title, URL and category are set when the category page is
        /// read; content, post time and author are filled in when the article page is fetched.
        /// </summary>
        public class BlogInfo
        {
            /// <summary>href of the article's title link on the category page.</summary>
            public string BlogUrl { get; set; }
            /// <summary>Text of the first link in the post description block, or a placeholder when it is missing.</summary>
            public string BlogName { get; set; }
            /// <summary>Text of the article's title link.</summary>
            public string BlogTitle { get; set; }
            /// <summary>Inner HTML of the post body element, or a placeholder when it is missing.</summary>
            public string BlogContent { get; set; }
            /// <summary>Heading text of the category page the article was listed on.</summary>
            public string BlogTypeName { get; set; }
            /// <summary>Text of the post-date element, or a placeholder when it is missing.</summary>
            public string BlogPostTime { get; set; }
        }
    复制代码
     下载到本地
     
    根据上面采集到的文章再一步步下载到本地,期间分两步,第一步下载图片,第二步下载文章内容。
     
     /// <summary>
            /// Writes every collected article to disk as an .html file, one folder per category,
            /// after downloading its images and disabling its hyperlinks.
            /// </summary>
            /// <param name="dic">Articles grouped by category, with content already filled in.</param>
            /// <param name="uname">Blog name; used as the folder name under "BlogFiles".</param>
            /// <param name="useTime">Elapsed time of the whole call in milliseconds.</param>
            /// <returns>The root folder that holds all files written for this blog.</returns>
            public static string DowanloadBlog(Dictionary<BlogType, List<BlogInfo>> dic, string uname, out long useTime)
            {
                Stopwatch sw = new Stopwatch();
                sw.Start();

                // Path.Combine replaces hand-concatenated backslash separators.
                string userRoot = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "BlogFiles", uname);
                int countFlag = 0;

                foreach (var pair in dic)
                {
                    var blogType = pair.Key;
                    var blogList = pair.Value;
                    var dicPath = System.IO.Path.Combine(userRoot, blogType.BlogTypeName);
                    Console.WriteLine("<<开始处理分类【{0}】<<", blogType.BlogTypeName);
                    FileHelper.CreatePath(dicPath);

                    // Declared outside the try block so the catch handler can report the article.
                    var blogModel = new BlogInfo();
                    for (int j = 0; j < blogList.Count; j++)
                    {
                        countFlag++;
                        try
                        {
                            // Assign first: the start message must name the current article,
                            // not the previous one.
                            blogModel = blogList[j];
                            Console.WriteLine("~~~~开始处理文章{0}【{1}】~~~~", countFlag, blogModel.BlogTitle);

                            var filePath = System.IO.Path.Combine(dicPath, FileHelper.FilterInvalidChar(blogModel.BlogTitle, "_") + ".html");
                            HtmlDocument doc = new HtmlDocument();
                            doc.DocumentNode.InnerHtml = blogModel.BlogContent;

                            // Download images and point their src at the local copies.
                            Console.WriteLine("~~开始处理图片");
                            var imgPath = System.IO.Path.Combine(dicPath, "images");
                            FileHelper.CreatePath(imgPath);
                            SaveImage(doc, imgPath);
                            Console.WriteLine("~~处理图片完成");

                            // Neutralise hyperlinks so the offline copy does not navigate away.
                            var aNodes = doc.DocumentNode.SelectNodes("//a");
                            if (aNodes != null)
                            {
                                foreach (var aNode in aNodes)
                                {
                                    var hrefAttr = aNode.Attributes["href"];
                                    if (hrefAttr != null && hrefAttr.Value != "#")
                                    {
                                        // "void()" is a script syntax error; "void(0)" is the valid no-op.
                                        hrefAttr.Value = "javascript:void(0)";
                                    }
                                }
                            }

                            // The title div is now closed before the body div is opened.
                            doc.DocumentNode.InnerHtml = "<div id='div_head'>" + uname + " " + blogType.BlogTypeName + "</div><div id='div_title'>" + blogModel.BlogTitle + "</div><div id='div_body'>" + doc.DocumentNode.InnerHtml + "</div>";
                            doc.Save(filePath, Encoding.UTF8);
                            Console.WriteLine("~~~~处理文章{0}【{1}】完毕~~~~", countFlag, blogModel.BlogTitle);
                        }
                        catch (Exception ex)
                        {
                            // One failing article is logged and skipped; the rest still get saved.
                            string errorMsg = DateTime.Now.ToString("yyyyMMdd HH:mm:ss") + "\r\n"
                                + "url=" + blogModel.BlogUrl + "\r\n"
                                + "title=" + blogModel.BlogTitle + "\r\n"
                                + "errorMsg=" + ex.Message + "\r\n"
                                + "stackTrace=" + ex.StackTrace + "\r\n\r\n\r\n";
                            Console.WriteLine("error>>处理文章【{0}】出现错误,开始记录错误信息~~", blogModel.BlogTitle);
                            FileHelper.SaveTxtFile(dicPath, "errorLog.txt", errorMsg, false);
                            Console.WriteLine("error>>处理文章【{0}】出现错误,记录错误信息完成~~", blogModel.BlogTitle);
                        }
                    }
                    Console.WriteLine("<<处理分类【{0}】完成<<", blogType.BlogTypeName);
                }

                // Stop (not Start) the stopwatch before reading the elapsed time.
                sw.Stop();
                useTime = sw.ElapsedMilliseconds;
                return userRoot;
            }
    
     
    
     /// <summary>
            /// Downloads every image referenced by the document into <paramref name="filePath"/>
            /// and rewrites the src of each successfully downloaded image to the local file.
            /// </summary>
            /// <param name="doc">Parsed article content; its img nodes are modified in place.</param>
            /// <param name="filePath">Existing folder that receives the image files.</param>
            /// <exception cref="Exception">
            /// Thrown with the "SaveImage_Error:" prefix when processing an image fails;
            /// the original exception is attached as InnerException.
            /// </exception>
            public static void SaveImage(HtmlDocument doc, string filePath)
            {
                var imgNodes = doc.DocumentNode.SelectNodes("//img");
                if (imgNodes == null || imgNodes.Count == 0)
                    return;

                foreach (var imgNode in imgNodes)
                {
                    try
                    {
                        // An img without src has nothing to download; skip it rather than
                        // failing the whole article.
                        var srcAttr = imgNode.Attributes["src"];
                        string src = srcAttr == null ? null : srcAttr.Value;
                        if (src == null || !src.Contains("/"))
                            continue;

                        // Inline data: images are already embedded in the HTML; leave them as is.
                        if (src.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
                            continue;

                        // The file name is the last URL segment without query string or fragment,
                        // neither of which belongs in a file name.
                        string fileName = src.Substring(src.LastIndexOf("/") + 1);
                        int cut = fileName.IndexOfAny(new[] { '?', '#' });
                        if (cut >= 0)
                            fileName = fileName.Substring(0, cut);
                        if (fileName.Length == 0)
                            continue;

                        Console.WriteLine("~~开始下载图片【{0}】~~", fileName);
                        string imgPath = System.IO.Path.Combine(filePath, fileName);
                        byte[] imgByte = CommonHelper.GetRequestByteArr(src);
                        if (imgByte != null)
                        {
                            FileHelper.SaveImage(imgPath, imgByte);
                            // Rewrite src only after the download succeeded, so a failed
                            // download keeps the working remote URL.
                            srcAttr.Value = imgPath;
                            Console.WriteLine("~~下载图片【{0}】完成~~", fileName);
                        }
                        else
                        {
                            Console.WriteLine("~~下载图片【{0}】失败~~", fileName);
                        }
                    }
                    catch (Exception ex)
                    {
                        // Same exception type and message as before, but the cause is preserved.
                        throw new Exception("SaveImage_Error:" + ex.Message, ex);
                    }
                }
            }
    View Code
    程序入口
     
    主要代码如下
                    
    复制代码
        // Entry-point excerpt: crawl categories, titles and article details, then save
        // everything to disk. uname and handlerRight are declared outside this excerpt.
        var types = GettBlogTypeList(uname);
                        // Timings in milliseconds, filled through the out parameters below.
                        long time1 = 0;
                        long time2 = 0;
                        long timeDownload = 0;
                        Console.WriteLine("正在爬取,请耐心等待..." );
                        // Step 1: titles and URLs per category; step 2: article details.
                        var blogList = GetBlogsByType(types,out time1);
                        var blogDetailList = GetBlogDetail(blogList,out time2);
                        Console.WriteLine("爬取完毕,开始下载..." );
                        // Step 3: write articles and images; returns the output folder.
                        string filePath=DowanloadBlog(blogDetailList, uname,out timeDownload);
                        Console.WriteLine("**处理完毕,爬取用时{0}ms,下载用时{1}ms,{2}" , time1+time2, timeDownload, filePath);
                        handlerRight = false;
     
    复制代码
    演示效果
     
     
     
    文件存储在项目bin目录下,一个用户一个文件夹
     
    按随笔分类生成不同的文件夹
     
     
     
    生成.html文件,一个分类的所有图片都存在该分类下的images下。
     
     

    欢迎指出程序bug,提出优化意见,(●'◡'●)
     

    出处:https://www.cnblogs.com/kungge/p/5956501.html

    ==========================================================================

    上面的获取文章分类已经无法使用了,查看博客园的API说明文档,参考文章如下,

    http://wcf.open.cnblogs.com/blog/help

    根据这个文档里的提示,我们可以分两步备份文章:先获取文章标题和摘要,然后再下载每一篇文章,并将文章和图片保存到本地文件夹。

    首先我们定义一个博客文章类

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace BackupBlogs.Models
    {
        /// <summary>
        /// One blog post as read from an entry node of the post-list response.
        /// All values are kept as the raw strings found in the response.
        /// </summary>
        public class BlogInfo
        {
            /// <summary>Text of the entry's id node; used to build the URL that returns the post body.</summary>
            public string id { get; set; }
    
            /// <summary>Text of the entry's title node.</summary>
            public string title { get; set; }
    
            /// <summary>Text of the entry's summary node.</summary>
            public string summary { get; set; }
    
            /// <summary>Text of the entry's published node; parsed as a date when the index page is written.</summary>
            public string published { get; set; }
    
            /// <summary>Text of the entry's updated node.</summary>
            public string updated { get; set; }
    
            /// <summary>href attribute of the entry's link node.</summary>
            public string link { get; set; }
    
            /// <summary>Text of the entry's diggs node.</summary>
            public string diggs { get; set; }
    
            /// <summary>Text of the entry's views node.</summary>
            public string views { get; set; }
    
            /// <summary>Text of the entry's comments node.</summary>
            public string comments { get; set; }
    
            /// <summary>HTML-decoded post body, or a placeholder text when the body could not be fetched.</summary>
            public string body { get; set; }
        }
    }
    View Code

    再定义两个工具类

    using HtmlAgilityPack;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Net.Http;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace BackupBlogs.Common
    {
        /// <summary>
        /// Shared HTTP request and MD5 hashing helpers.
        /// Request failures are appended to Error.txt in the application base directory.
        /// </summary>
        public static class CommonHelper
        {
            #region HttpClient
            private static HttpClient _httpClient;

            /// <summary>
            /// The <see cref="HttpClient"/> used by all request helpers. It is created on first
            /// access with a 4-minute timeout; the creation is not synchronized.
            /// </summary>
            public static HttpClient httpClient
            {
                get
                {
                    if (_httpClient == null)
                    {
                        _httpClient = new HttpClient();
                        _httpClient.Timeout = new TimeSpan(0, 4, 0);
                    }
                    return _httpClient;
                }
                set { _httpClient = value; }
            }

            #endregion

            #region GET requests
            /// <summary>
            /// Sends a GET request and returns the response body as text.
            /// </summary>
            /// <param name="url">Absolute URL to request.</param>
            /// <returns>The response body, or <c>null</c> when the request throws (the error is logged).</returns>
            public static string GetRequestStr(string url)
            {
                try
                {
                    using (var response = httpClient.GetAsync(new Uri(url)).Result)
                    {
                        return response.Content.ReadAsStringAsync().Result;
                    }
                }
                catch (Exception ex)
                {
                    LogRequestError(url, ex);
                    return null;
                }
            }

            /// <summary>
            /// Returns the bytes behind an image reference: downloaded for an absolute URL,
            /// decoded for a base64 "data:image" URI.
            /// </summary>
            /// <param name="url">Absolute URL or "data:image/...;base64,..." URI.</param>
            /// <returns>
            /// The image bytes, or <c>null</c> when the reference is of neither kind, the server
            /// answers with an error status, or any exception occurs (the error is logged).
            /// </returns>
            public static byte[] GetRequestByteArr(string url)
            {
                try
                {
                    if (url.Contains("://"))
                    {
                        using (var response = httpClient.GetAsync(new Uri(url)).Result)
                        {
                            // Without this check the body of a 404/500 page would be
                            // returned and saved as if it were the image.
                            response.EnsureSuccessStatusCode();
                            return response.Content.ReadAsByteArrayAsync().Result;
                        }
                    }

                    if (url.StartsWith("data:image", StringComparison.Ordinal))
                    {
                        // "data:image/png;base64,<payload>" -> "<payload>"
                        return Convert.FromBase64String(url.Split(';')[1].Split(',')[1]);
                    }

                    return null;
                }
                catch (Exception ex)
                {
                    LogRequestError(url, ex);
                    return null;
                }
            }
            #endregion

            #region POST requests
            /// <summary>
            /// Sends a POST request with an empty form-urlencoded body and returns the response body as text.
            /// </summary>
            /// <param name="url">Absolute URL to request.</param>
            /// <returns>The response body, or <c>null</c> when the request throws (the error is logged).</returns>
            public static string PostRequestStr(string url)
            {
                try
                {
                    string contentStr = "";
                    using (StringContent sc = new StringContent(contentStr))
                    {
                        sc.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("application/x-www-form-urlencoded");//todo
                        using (var response = httpClient.PostAsync(new Uri(url), sc).Result)
                        {
                            return response.Content.ReadAsStringAsync().Result;
                        }
                    }
                }
                catch (Exception ex)
                {
                    LogRequestError(url, ex);
                    return null;
                }
            }
            #endregion

            #region Error logging
            /// <summary>
            /// Appends one request failure to Error.txt in the application base directory.
            /// </summary>
            private static void LogRequestError(string url, Exception ex)
            {
                // Blocking on .Result wraps failures in AggregateException, whose own message
                // is generic; the base exception carries the actual cause.
                string reason = ex.GetBaseException().Message;
                string logPath = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Error.txt");
                string entry = $"[{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff")}]  请求URL发生异常:" + url + Environment.NewLine + reason + Environment.NewLine;
                FileHelper.SaveTxtFile(logPath, entry, false);
            }
            #endregion

            #region MD5 hashing

            /// <summary>
            /// Computes the MD5 hash of a string as lowercase hex, in 32- or 16-character form.
            /// </summary>
            /// <param name="strWord">Text to hash.</param>
            /// <param name="bit">32 for the full hash, 16 for hex characters 8 to 23 of it.</param>
            /// <returns>The requested hash form, or an empty string for any other <paramref name="bit"/> value.</returns>
            public static string MD5Encrypt(string strWord, int bit)
            {
                string hash = MD5Encrypt(strWord);
                if (bit == 16)
                    return hash.Substring(8, 16);
                else if (bit == 32)
                    return hash;
                else
                    return string.Empty;
            }

            /// <summary>
            /// Computes the MD5 hash of a string, using its gb2312 byte representation.
            /// </summary>
            /// <param name="strWord">Text to hash.</param>
            /// <returns>The hash as 32 lowercase hex characters.</returns>
            public static string MD5Encrypt(string strWord)
            {
                // The provider is disposable; release it once the hash is computed.
                using (var md5Hasher = new System.Security.Cryptography.MD5CryptoServiceProvider())
                {
                    byte[] hashedDataBytes = md5Hasher.ComputeHash(Encoding.GetEncoding("gb2312").GetBytes(strWord));
                    StringBuilder tmp = new StringBuilder(hashedDataBytes.Length * 2);
                    foreach (byte i in hashedDataBytes)
                    {
                        tmp.Append(i.ToString("x2"));
                    }
                    return tmp.ToString();
                }
            }
            #endregion
        }
    }
    View Code
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace BackupBlogs.Common
    {
        /// <summary>
        /// File-system helpers: directory creation, writing images and text files,
        /// and sanitising file names.
        /// </summary>
        public static class FileHelper
        {
            #region Create directory
            /// <summary>
            /// Creates the directory when it does not exist yet.
            /// </summary>
            /// <param name="path">Directory path; null or empty means "nothing to create".</param>
            /// <returns><c>true</c> when the directory was created by this call, otherwise <c>false</c>.</returns>
            public static bool CreatePath(string path)
            {
                // Directory.CreateDirectory rejects null/empty; there is no directory to create.
                if (string.IsNullOrEmpty(path))
                    return false;

                if (!Directory.Exists(path))
                {
                    Directory.CreateDirectory(path);
                    return true;
                }
                return false;
            }
            #endregion

            #region Save image
            /// <summary>
            /// Writes the bytes to the given file. Failures are reported on the console only.
            /// </summary>
            /// <param name="filePath">Target file path.</param>
            /// <param name="bt">File content.</param>
            public static void SaveImage(string filePath, byte[] bt)
            {
                try
                {
                    File.WriteAllBytes(filePath, bt);
                }
                catch (Exception ex)
                {
                    Console.WriteLine("SaveImage 方法发生异常:" + ex.Message);
                }
            }
            #endregion

            #region Save text file
            /// <summary>
            /// Writes or appends text to a file, creating the containing directory when needed.
            /// Failures are reported on the console only.
            /// </summary>
            /// <param name="filePath">Target file path; may be a bare file name.</param>
            /// <param name="txtStr">Text to write.</param>
            /// <param name="isCover"><c>true</c> to overwrite the file, <c>false</c> to append to it.</param>
            public static void SaveTxtFile(string filePath, string txtStr, bool isCover = true)
            {
                try
                {
                    // GetDirectoryName yields "" for a bare file name (and null for a root);
                    // CreatePath treats both as "no directory needed" so the write still happens.
                    CreatePath(System.IO.Path.GetDirectoryName(filePath));
                    if (isCover)
                        File.WriteAllText(filePath, txtStr, Encoding.Default);
                    else
                        File.AppendAllText(filePath, txtStr, Encoding.Default);
                }
                catch (Exception ex)
                {
                    Console.WriteLine("SaveTxtFile 方法发生异常:" + ex.Message);
                }
            }
            #endregion

            #region Filter invalid file-name characters
            /// <summary>
            /// Replaces every character that is invalid in a file name, and every space,
            /// with <paramref name="replaceStr"/>.
            /// </summary>
            /// <param name="fileName">Candidate file name; null or empty is returned unchanged.</param>
            /// <param name="replaceStr">Replacement text for each filtered character.</param>
            /// <returns>The sanitised file name.</returns>
            public static string FilterInvalidChar(string fileName, string replaceStr)
            {
                if (string.IsNullOrEmpty(fileName))
                    return fileName;

                foreach (var c in Path.GetInvalidFileNameChars())
                {
                    fileName = fileName.Replace(c.ToString(), replaceStr);
                }
                fileName = fileName.Replace(" ", replaceStr);
                return fileName;
            }
            #endregion
        }
    }
    View Code

    最后,再来个主要的类

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace BackupBlogs
    {
        using Common;
        using HtmlAgilityPack;
        using Models;
        using System.Diagnostics;
    
        class Program
        {
            private static string userPath = "";
            private static string blogPath = "";
            private static string imgPath = "";
            private static string htmlTemp = @"
    <!DOCTYPE HTML>
    <html>
    <head>
    <title>C#备份博客园文章列表</title>
    </head>
    <body>
    </body>
    </html>";
    
            /// <summary>
            /// Console entry point: asks for a blog name, fetches the post list, writes the
            /// index page, then downloads and saves every post.
            /// </summary>
            static void Main(string[] args)
            {
                // Ask for the blog name until something non-empty is entered. A single prompt
                // per attempt means a valid answer is never thrown away by a re-prompt.
                string uname;
                do
                {
                    ShowLog("--请输入要下载的博客名称--");
                    uname = Console.ReadLine();
                } while (string.IsNullOrEmpty(uname));

                // Fetch the post list; ask for another name while nothing is found.
                List<BackupBlogs.Models.BlogInfo> blogList;
                while (true)
                {
                    // Output folders are derived from the current name and created before the fetch.
                    userPath = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "cnblogFiles", uname + "_" + DateTime.Now.ToString("yyyyMMdd"));
                    blogPath = System.IO.Path.Combine(userPath, "Blogs");
                    FileHelper.CreatePath(blogPath);
                    imgPath = System.IO.Path.Combine(userPath, "Images");
                    FileHelper.CreatePath(imgPath);

                    // Exactly one fetch per entered name.
                    blogList = GettBlogSummaryList(uname);
                    if (blogList != null && blogList.Count > 0)
                        break;

                    ShowLog("--未获取到文章列表,请重新输入要下载的博客名称--", true);
                    uname = Console.ReadLine();
                }

                // Write the index page listing all posts.
                SaveBlogsSummary(blogList);

                // Download and save the posts. time1 is kept only for the summary message;
                // the list-fetch stage does not report a duration.
                long time1 = 0;
                long time2 = 0;
                long timeDownload = 0;
                ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
                ShowLog($"--正在爬取博客文章,共计 {blogList.Count} 篇,请耐心等待...", true);
                var blogDetailCount = GetBlogDetail(blogList, out time2);
                ShowLog($"--爬取完毕,成功下载了【{blogDetailCount}/{blogList.Count}】篇博客文章.", true);
                ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
                int saveCount = SaveBlogDetail(blogList, uname, out timeDownload);
                ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
                ShowLog(saveCount == blogList.Count ? "保存全部文章成功!" : $"保存文章【{saveCount}】条成功");
                ShowLog($"--处理完毕,爬取用时{(time1 + time2)}ms,下载用时{timeDownload}ms,\r\n保存路径:{userPath}", true);

                Console.ReadKey();
            }
    
    
    
            #region 获取博客标题
            /// <summary>
            /// Reads the post list of a blog page by page and returns one
            /// <see cref="BlogInfo"/> per entry (without the post body).
            /// </summary>
            /// <param name="uname">Blog name used in the post-list URL.</param>
            /// <returns>
            /// The posts collected so far; paging stops at the first page without entries
            /// or at the first failed request.
            /// </returns>
            private static List<BackupBlogs.Models.BlogInfo> GettBlogSummaryList(string uname)
            {
                const int pageSize = 10;
                string msgTitle = "第1阶段:";
                List<BlogInfo> list = new List<BlogInfo>();
                ShowLog(msgTitle + $"获取{uname}的随笔如下:", true);
                HtmlDocument doc = new HtmlDocument();

                for (int pageNum = 1; ; pageNum++)
                {
                    string url = "http://wcf.open.cnblogs.com/blog/u/" + uname + $"/posts/{pageNum}/{pageSize}";
                    string htmlStr = CommonHelper.GetRequestStr(url);

                    // GetRequestStr signals a failed request with null, and LoadHtml(null)
                    // throws; stop paging and return what has been collected.
                    if (string.IsNullOrEmpty(htmlStr))
                        break;

                    doc.LoadHtml(htmlStr);
                    var nodes = doc.DocumentNode.SelectNodes("//entry");
                    if (nodes == null || nodes.Count <= 0)
                        break; // past the last page

                    int currPageBlogCount = 0;
                    foreach (var item in nodes)
                    {
                        // Counted before the filter so the running number reflects the
                        // entry's position in the feed.
                        currPageBlogCount++;

                        // Only entries with exactly ten child nodes are read; others are skipped.
                        if (item.ChildNodes.Count != 10)
                            continue;

                        BlogInfo blogSummary = new BlogInfo()
                        {
                            id = item.ChildNodes["id"].InnerText,
                            comments = item.ChildNodes["comments"].InnerText,
                            diggs = item.ChildNodes["diggs"].InnerText,
                            link = item.ChildNodes["link"].Attributes["href"].Value,
                            published = item.ChildNodes["published"].InnerText,
                            summary = item.ChildNodes["summary"].InnerText,
                            title = item.ChildNodes["title"].InnerText,
                            updated = item.ChildNodes["updated"].InnerText,
                            views = item.ChildNodes["views"].InnerText
                        };
                        list.Add(blogSummary);
                        ShowLog(msgTitle + $"【{currPageBlogCount + (pageNum - 1) * pageSize}】获取文章标题【{blogSummary.title}】", true);
                    }
                }
                return list;
            }
    
    
            /// <summary>
            /// Writes index.html into the user folder: a table with publish time, view count,
            /// comment count and a link to the saved file of every post.
            /// </summary>
            /// <param name="blogSummaries">Posts to list, in output order.</param>
            private static void SaveBlogsSummary(List<BackupBlogs.Models.BlogInfo> blogSummaries)
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlTemp);
                HtmlNode nList = doc.DocumentNode.SelectSingleNode("/html/body");
                nList.AppendChild(HtmlNode.CreateNode("<table border='1' cellpadding='0' cellspacing='0' width='98%'><tr>" +
                    "<th width='150'>发布时间</th>" +
                    "<th width='100'>阅读数</th>" +
                    "<th width='100'>评论数</th>" +
                    "<th width='*'>博文标题</th>" +
                    "</tr></table>"));

                // Look the table up once instead of once per row.
                HtmlNode table = nList.SelectSingleNode("table");

                ShowLog(Environment.NewLine + "开始保存博客文章标题");
                foreach (var item in blogSummaries)
                {
                    // Show the raw value when the publish time cannot be parsed, rather than
                    // aborting the whole index.
                    DateTime publishedAt;
                    string publishedText = DateTime.TryParse(item.published, out publishedAt)
                        ? publishedAt.ToString("yyyy-MM-dd HH:mm")
                        : item.published;

                    // The file name is the sanitised title plus ".html"; the link uses forward
                    // slashes, which are the valid separator in a URL.
                    string fileName = System.Web.HttpUtility.UrlEncode(FileHelper.FilterInvalidChar(item.title, "_")) + ".html";

                    string divTR = "<tr>";
                    divTR += "<td>" + publishedText + "</td>";
                    divTR += "<td align='right'>" + item.views + "</td>";
                    divTR += "<td align='right'>" + item.comments + "</td>";
                    divTR += "<td><a href='./Blogs/" + fileName + "' target='_blank'>" + item.title + "</a></td>";
                    divTR += "</tr>";
                    table.AppendChild(HtmlNode.CreateNode(divTR));
                }
                doc.Save(System.IO.Path.Combine(userPath, "index.html"), Encoding.UTF8);
                ShowLog($"共保存【{blogSummaries.Count}】篇博客标题保存完成", true);
            }
    
            #endregion
    
    
            #region 获取博客详细信息
            /// <summary>
            /// Downloads the body of every post and stores it in <c>body</c>; posts whose body
            /// cannot be fetched get a placeholder text and are listed in the log.
            /// </summary>
            /// <param name="blogs">Posts to complete; updated in place.</param>
            /// <param name="useTime">Elapsed time of the whole call in milliseconds.</param>
            /// <returns>Number of posts whose body was fetched successfully.</returns>
            private static int GetBlogDetail(List<BlogInfo> blogs, out long useTime)
            {
                const string failedBody = "内容获取失败";
                Stopwatch sw = new Stopwatch();
                sw.Start();
                string msgTitle = "第2阶段:";
                HtmlDocument doc = new HtmlDocument();

                for (int k = 0; k < blogs.Count; k++)
                {
                    string url = $"http://wcf.open.cnblogs.com/blog/post/body/{blogs[k].id}";
                    ShowLog(msgTitle + string.Format("【{0}/{1}】正在获取文章【{2}】", k + 1, blogs.Count, blogs[k].title), true);
                    string blogBody = CommonHelper.GetRequestStr(url);

                    // GetRequestStr signals a failed request with null, and LoadHtml(null)
                    // throws; treat that case like a response without a body node.
                    HtmlNode bodyNode = null;
                    if (!string.IsNullOrEmpty(blogBody))
                    {
                        doc.LoadHtml(blogBody);
                        bodyNode = doc.DocumentNode.SelectSingleNode("//string");
                    }
                    blogs[k].body = bodyNode == null ? failedBody : System.Web.HttpUtility.HtmlDecode(bodyNode.InnerHtml);
                }

                ShowLog("下载失败的文章如下:", true);
                // Materialise once: the result is both listed and counted.
                var errBlogs = blogs.Where(x => x.body == failedBody).ToList();
                foreach (var item in errBlogs)
                {
                    ShowLog(Newtonsoft.Json.JsonConvert.SerializeObject(item), true);
                }

                int GetDetailCount = blogs.Count - errBlogs.Count;
                sw.Stop();
                useTime = sw.ElapsedMilliseconds;
                return GetDetailCount;
            }
            #endregion
    
    
            #region 保存博客
            /// <summary>
            /// Stage 3 helper: saves every image referenced by the blog body into <paramref name="filePath"/>
            /// and points the matching <c>img</c> tags at the local copy.
            /// An image that cannot be saved keeps its original <c>src</c>.
            /// </summary>
            /// <param name="blog">Blog whose <c>body</c> HTML is scanned; the body is replaced by the rewritten HTML when it contains images.</param>
            /// <param name="filePath">Directory that receives the image files.</param>
            /// <param name="uname">Not used by this method; kept so existing callers keep compiling.</param>
            private static void SaveImage(BlogInfo blog, string filePath, string uname)
            {
                const string msgTitle = "第3阶段:";
                const string base64Marker = ";base64,";

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(blog.body);
                var imgNodes = doc.DocumentNode.SelectNodes("//img");
                if (imgNodes == null || imgNodes.Count == 0)
                {
                    return;
                }

                foreach (var imgNode in imgNodes)
                {
                    try
                    {
                        var srcAttr = imgNode.Attributes["src"];
                        if (srcAttr == null || string.IsNullOrEmpty(srcAttr.Value))
                        {
                            continue;
                        }

                        string src = srcAttr.Value;
                        // Protocol-relative URL ("//host/path"): give it an explicit scheme.
                        if (src.StartsWith("//", StringComparison.Ordinal))
                        {
                            src = "http:" + src;
                        }
                        if (!src.Contains("/"))
                        {
                            continue;
                        }

                        string fileName;
                        byte[] imgByte;
                        if (src.StartsWith("data:image", StringComparison.Ordinal))
                        {
                            // "data:image/png;base64,...." -> extension "png"; the name is derived from a hash of the URI.
                            fileName = blog.id + "_" + CommonHelper.MD5Encrypt(src) + "." + src.Split(';')[0].Split('/')[1];
                            // A data URI carries its own bytes, so decode the payload instead of requesting it.
                            int markerIndex = src.IndexOf(base64Marker, StringComparison.Ordinal);
                            imgByte = markerIndex >= 0
                                ? Convert.FromBase64String(src.Substring(markerIndex + base64Marker.Length))
                                : null;
                        }
                        else
                        {
                            // Drop the query string / fragment so they do not end up in the file name.
                            string pathPart = src;
                            int cut = pathPart.IndexOfAny(new[] { '?', '#' });
                            if (cut >= 0)
                            {
                                pathPart = pathPart.Substring(0, cut);
                            }
                            fileName = pathPart.Substring(pathPart.LastIndexOf('/') + 1);
                            imgByte = fileName.Length == 0 ? null : CommonHelper.GetRequestByteArr(src);
                        }

                        if (imgByte != null)
                        {
                            FileHelper.SaveImage(filePath + "\\" + fileName, imgByte);
                            // Rewrite the tag only once the file exists, relative to the user directory.
                            srcAttr.Value = filePath.Replace(userPath, "..") + "\\" + fileName;
                        }
                        else
                        {
                            ShowLog(msgTitle + $"下载图片失败!   下载URL:{src}", true);
                            ShowLog(msgTitle + $"下载图片失败详细博客:【id:{blog.id};title:{blog.title};url:{blog.link};】");
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: one bad image must not stop the remaining ones.
                        ShowErrorLog(msgTitle + " SaveImage 方法发生异常:" + ex.Message, true);
                    }
                }

                blog.body = doc.DocumentNode.InnerHtml;
            }
    
            /// <summary>
            /// Stage 3: writes every blog to an HTML file under <c>blogPath</c>, after saving its images via <c>SaveImage</c>.
            /// A failure on one blog is logged and does not stop the remaining ones.
            /// </summary>
            /// <param name="blogs">Blogs to save; each body is rewritten by <c>SaveImage</c> before being written.</param>
            /// <param name="uname">Blog user name, forwarded to <c>SaveImage</c>.</param>
            /// <param name="useTime">Elapsed time of the whole stage, in milliseconds.</param>
            /// <returns>Number of blogs processed, including those whose processing failed.</returns>
            public static int SaveBlogDetail(List<BlogInfo> blogs, string uname, out long useTime)
            {
                const string msgTitle = "第3阶段:";
                Stopwatch sw = Stopwatch.StartNew();
                int countFlag = 0;

                foreach (var item in blogs)
                {
                    countFlag++;
                    try
                    {
                        ShowLog(string.Format(msgTitle + "【{0}/{1}】开始处理【{2}】", countFlag, blogs.Count, item.title), true);
                        var filePath = blogPath + "\\" + FileHelper.FilterInvalidChar(item.title, "_") + ".html";

                        // Download the images and rewrite their links inside item.body.
                        SaveImage(item, imgPath, uname);

                        // Fill the page template: title, header block, then the body.
                        HtmlDocument doc = new HtmlDocument();
                        doc.LoadHtml(htmlTemp);
                        doc.DocumentNode.SelectSingleNode("/html/head/title").InnerHtml = item.title;

                        var headNode = HtmlNode.CreateNode("<div id='div_head'><h1>" + item.title + "</h1><br />" +
                            "【发布:" + DateTime.Parse(item.published).ToString("yyyy-MM-dd HH:mm") +
                            "【阅读数:" + item.views +
                            "【评论数:" + item.comments +
                            "<a href='" + item.link + "' target='_blank' >阅读原文</a>" +
                            "</div>");
                        var bodyNode = HtmlNode.CreateNode("<div id='div_body'>" + item.body + "</div>");

                        var pageBody = doc.DocumentNode.SelectSingleNode("/html/body");
                        pageBody.AppendChild(headNode);
                        pageBody.AppendChild(bodyNode);
                        doc.Save(filePath, Encoding.UTF8);
                        ShowLog(msgTitle + string.Format("【{0}/{1}】处理文章【{2}】完毕", countFlag, blogs.Count, item.title), true);
                    }
                    catch (Exception ex)
                    {
                        string nl = Environment.NewLine;
                        string errorMsg = DateTime.Now.ToString("yyyyMMdd HH:mm:ss") + nl
                            + "url=" + item.link + nl
                            + "title=" + item.title + nl
                            + "errorMsg=" + ex.Message + nl
                            + "stackTrace=" + ex.StackTrace + nl + nl + nl;
                        ShowErrorLog(msgTitle + $"error>>处理文章【{item.title}】出现错误:{ex.Message}" + Environment.NewLine + errorMsg, true);
                    }
                }

                sw.Stop();
                useTime = sw.ElapsedMilliseconds;
                return countFlag;
            }
    
    
            #endregion
    
            /// <summary>
            /// Writes <paramref name="msg"/> to the console and, when requested, also hands it
            /// (prefixed with a timestamp) to <c>FileHelper.SaveTxtFile</c> for <c>Log.txt</c> under <c>userPath</c>.
            /// </summary>
            /// <param name="msg">Message text.</param>
            /// <param name="isSaveLog">When true, the message is also written to the log file.</param>
            private static void ShowLog(string msg, bool isSaveLog = false)
            {
                Console.WriteLine(msg);
                if (isSaveLog)
                {
                    FileHelper.SaveTxtFile(userPath + "\\Log.txt", $"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]  " + msg + Environment.NewLine, false);
                }
            }
            /// <summary>
            /// Writes <paramref name="msg"/> to the console and, when requested, also hands it
            /// (prefixed with a timestamp) to <c>FileHelper.SaveTxtFile</c> for <c>Error.txt</c> under <c>userPath</c>.
            /// </summary>
            /// <param name="msg">Error message text.</param>
            /// <param name="isSaveLog">When true, the message is also written to the error file.</param>
            private static void ShowErrorLog(string msg, bool isSaveLog = false)
            {
                Console.WriteLine(msg);
                if (isSaveLog)
                {
                    FileHelper.SaveTxtFile(userPath + "\\Error.txt", $"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]  " + msg + Environment.NewLine, false);
                }
            }
    
        }
    }
    View Code

    执行试试吧

    这里只给出代码的轮廓和思路,代码中同样没有考虑多线程、内存资源释放等问题,难免会出现异常和错误,后续会继续优化。

  • 相关阅读:
    确保EF上下文线程内唯一
    linq的join
    编码:隐匿在计算机软硬件背后的语言
    EF删除数据
    插入数据返回主键值用 output inserted.UId
    Fancybox丰富的弹出层效果
    回车登录
    “:Choose a destination with a supported architecture in order to run on this device.”
    How to Enable Multi-Touch
    How does CCFileUTils::fullPathForFilename work
  • 原文地址:https://www.cnblogs.com/mq0036/p/11569415.html
Copyright © 2011-2022 走看看