zoukankan      html  css  js  c++  java
  • C# 爬虫 正则、NSoup、HtmlAgilityPack、Jumony四种方式抓取小说

    心血来潮,想爬点小说。通过百度选择了个小说网站,随便找了一本小说http://www.23us.so/files/article/html/13/13655/index.html

    1、分析html规则

    思路是获取小说章节目录,循环目录,抓取所有章节中的内容,拼到txt文本中。最后形成完本小说。

    1、获取小说章节目录

    通过分析,我在标注的地方获取小说名字及章节目录。

    <meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>// 获取小说名字
    <table cellspacing="1" cellpadding="0" bgcolor="#E4E4E4" id="at">// 所有的章节都在这个table中。

    下面是利用正则,获取名字与目录。

    //获取小说名字
    Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
    string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
    
    //获取章节目录
    Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|\n)*?</table>");
    var mat_mulu = reg_mulu.Match(html);
    string mulu = mat_mulu.Groups[0].ToString();

    2、获取小说正文内容

    通过章节a标签中的url地址,查看章节内容。

    通过分析,正文内容在<dd id="contents">中。

    //获取正文
    Regex reg = new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");
    MatchCollection mc = reg.Matches(html_z);
    var mat = reg.Match(html_z);
    string content = mat.Groups[0].ToString().Replace("<dd id=\"contents\">", "").Replace("</dd>", "").Replace("&nbsp;", "").Replace("<br />", "\r\n");

    2、C#完整代码

    using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Web;
    using System.Web.Mvc;
    
    namespace Test.Controllers
    {
        public class CrawlerController : BaseController
        {
            // GET: Crawler
            /// <summary>
            /// Crawls a complete novel from its index page: extracts the novel
            /// name and the chapter list, downloads every chapter and appends
            /// it to a txt file under the application's Txt/ directory.
            /// </summary>
            public void Index()
            {
                // Download the chapter index page of the novel
                // (顶点小说 site). No need to allocate a second controller
                // instance — call our own instance methods directly.
                string html = HttpGet("http://www.23us.so/files/article/html/13/13655/index.html", "");

                // The novel name is the first comma-separated item of the
                // keywords meta tag, e.g.
                // <meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>
                Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
                string name = ma_name.Groups[1].Value.Split(',')[0];

                // All chapter links live in <table ... id="at">.
                // RegexOptions.Singleline lets '.' match newlines, replacing
                // the fragile (.|\n)*? idiom with embedded line breaks.
                Regex reg_mulu = new Regex(
                    @"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">.*?</table>",
                    RegexOptions.Singleline);
                string mulu = reg_mulu.Match(html).Groups[0].Value;

                // Match every <a href="...">title</a> inside the table.
                // Group 1 = chapter url, group 2 = chapter title.
                Regex tmpreg = new Regex("<a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a>", RegexOptions.Compiled);
                MatchCollection sMC = tmpreg.Matches(mulu);

                //循环目录url,获取正文内容
                for (int i = 0; i < sMC.Count; i++)
                {
                    // e.g. group 0 is <a href="http://www.23us.so/files/article/html/13/13655/5638725.html">第一章 泰山之巅</a>
                    //      group 1 is http://www.23us.so/files/article/html/13/13655/5638725.html
                    //      group 2 is 第一章 泰山之巅

                    //获取章节标题
                    string title = sMC[i].Groups[2].Value;

                    //获取文章内容
                    string html_z = HttpGet(sMC[i].Groups[1].Value, "");

                    // Chapter text sits inside <dd id="contents"> ... </dd>;
                    // strip the wrapper tags and convert <br /> to CRLF.
                    Regex reg = new Regex(@"<dd id=""contents"">.*?</dd>", RegexOptions.Singleline);
                    string content = reg.Match(html_z).Groups[0].Value
                        .Replace("<dd id=\"contents\">", "")
                        .Replace("</dd>", "")
                        .Replace("&nbsp;", "")
                        .Replace("<br />", "\r\n");

                    //txt文本输出
                    string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";
                    Novel(title + "\r\n" + content, name, path);
                }
            }

            /// <summary>
            /// Appends a chunk of text to "<paramref name="path"/><paramref name="name"/>.txt",
            /// creating the directory and the file on first use.
            /// </summary>
            /// <param name="content">内容 — text to append.</param>
            /// <param name="name">名字 — novel name (file name without extension).</param>
            /// <param name="path">路径 — target directory, ending with a separator.</param>
            public void Novel(string content, string name, string path)
            {
                // CreateDirectory is a no-op when the directory already
                // exists, so no Exists check is needed.
                Directory.CreateDirectory(path);

                // FileMode.Append creates the file when it is missing, which
                // collapses the original exists/create + append branches into
                // one path. using guarantees the stream is flushed and closed
                // even when WriteLine throws.
                using (FileStream fs = new FileStream(path + name + ".txt", FileMode.Append, FileAccess.Write))
                using (StreamWriter sw = new StreamWriter(fs))
                {
                    // Original wrote content + "\r\n" via WriteLine, i.e. the
                    // content followed by a blank line.
                    sw.WriteLine(content + "\r\n");
                }
            }

            //Post
            /// <summary>
            /// Issues a POST with a form-url-encoded body (gb2312 encoded) and
            /// returns the response body decoded as UTF-8.
            /// </summary>
            /// <param name="Url">Request url.</param>
            /// <param name="postDataStr">Raw form body, e.g. "a=1&amp;b=2".</param>
            public string HttpPost(string Url, string postDataStr)
            {
                Encoding bodyEncoding = Encoding.GetEncoding("gb2312");

                CookieContainer cookie = new CookieContainer();
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                request.Method = "POST";
                request.ContentType = "application/x-www-form-urlencoded";
                // The body is written as gb2312 below, so the declared length
                // must use the same encoding (the original used UTF-8, which
                // is wrong for any non-ASCII payload).
                request.ContentLength = bodyEncoding.GetByteCount(postDataStr);
                request.CookieContainer = cookie;

                using (Stream requestStream = request.GetRequestStream())
                using (StreamWriter writer = new StreamWriter(requestStream, bodyEncoding))
                {
                    writer.Write(postDataStr);
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    response.Cookies = cookie.GetCookies(response.ResponseUri);
                    using (Stream responseStream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }

            //Get
            /// <summary>
            /// Issues a GET request (postDataStr, when non-empty, is appended
            /// as the query string) and returns the response body decoded as
            /// UTF-8.
            /// </summary>
            /// <param name="Url">Request url without query string.</param>
            /// <param name="postDataStr">Query string without the leading '?', or "".</param>
            public string HttpGet(string Url, string postDataStr)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(
                    Url + (postDataStr == "" ? "" : "?") + postDataStr);
                request.Method = "GET";
                request.ContentType = "text/html;charset=UTF-8";

                // The original caught WebException only to call GetResponse()
                // again on the same request, which fails identically — a fake
                // retry. Let the exception propagate to the caller instead.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    View Code

    3、最后效果


    4、补充

    wlong 同学提了个建议,说用NSoup解析html更方便,我就去查了查,目前没有太大的感触,可能不太会用。DLL下载地址http://nsoup.codeplex.com/

    NSoup版:

    NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
    //获取小说名字
    //<meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>
    //获取meta
    NSoup.Select.Elements ele = doc.GetElementsByTag("meta");
    string name = "";
    foreach (var i in ele)
    {
        if (i.Attr("name") == "keywords")
        {
            name = i.Attr("content").ToString();
        }
    }
    //获取章节
    NSoup.Select.Elements eleChapter = doc.GetElementsByTag("table");//查找table,获取table里的html
    NSoup.Nodes.Document docChild = NSoup.NSoupClient.Parse(eleChapter.ToString());
    NSoup.Select.Elements eleChild = docChild.GetElementsByTag("a");//查找a标签
    //循环目录,获取正文内容
    foreach (var j in eleChild)
    {
        string title = j.Text();//获取章节标题
        string htmlChild = cra.HttpGet(j.Attr("href").ToString(), "");//获取文章内容
    }

    HtmlAgilityPack版(NaYoung提供):

    DLL下载地址:HtmlAgilityPack.1.4.6.zip

    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument document = htmlWeb.Load("http://www.23us.so/files/article/html/13/13694/index.html");
    HtmlNodeCollection nodeCollection = document.DocumentNode.SelectNodes(@"//table/tr/td/a[@href]");  //代表获取所有
    string name = document.DocumentNode.SelectNodes(@"//meta[@name='keywords']")[0].GetAttributeValue("content", "").Split(',')[0];
    foreach (var node in nodeCollection)
    {
        HtmlAttribute attribute = node.Attributes["href"];
        String val = attribute.Value;  //章节url
        var title = htmlWeb.Load(val).DocumentNode.SelectNodes(@"//h1")[0].InnerText;  //文章标题
        var doc = htmlWeb.Load(val).DocumentNode.SelectNodes(@"//dd[@id='contents']");
    var content = doc[0].InnerHtml.Replace("&nbsp;", "").Replace("<br>", "\r\n");  //文章内容
    //txt文本输出
    string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";
    Novel(title + "\r\n" + content, name, path);
    }

    Jumony版:

    C# 爬虫 Jumony-html解析

  • 相关阅读:
    使用 media 实现响应式布局
    Django组件的中间件
    Django组件的cookie和 session,用户认证组件
    Django的文件上传和分页
    Ajax
    Django模型层的多表操作(2)
    Django模型层的多表操作(1)
    Django的查询表记录
    Django2.0版本的路由层和ORM但表操作
    Django的路由层,视图层和模版层
  • 原文地址:https://www.cnblogs.com/cang12138/p/7464226.html
Copyright © 2011-2022 走看看