zoukankan      html  css  js  c++  java
  • C# 爬虫 正则、NSoup、HtmlAgilityPack、Jumony四种方式抓取小说

    心血来潮,想爬点小说。通过百度选择了个小说网站,随便找了一本小说http://www.23us.so/files/article/html/13/13655/index.html

    1、分析html规则

    思路是获取小说章节目录,循环目录,抓取所有章节中的内容,拼到txt文本中。最后形成完本小说。

    1、获取小说章节目录

    通过分析,我在标注的地方获取小说名字及章节目录。

    <meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>// 获取小说名字
    <table cellspacing="1" cellpadding="0" bgcolor="#E4E4E4" id="at">// 所有的章节都在这个table中。

    下面是利用正则,获取名字与目录。

    //获取小说名字
    Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
    string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
    
    //获取章节目录
    Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|\n)*?</table>");
    var mat_mulu = reg_mulu.Match(html);
    string mulu = mat_mulu.Groups[0].ToString();

    2、获取小说正文内容

    通过章节a标签中的url地址,查看章节内容。

    通过分析,正文内容在<dd id="contents">中。

    //获取正文
    Regex reg = new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");
    MatchCollection mc = reg.Matches(html_z);
    var mat = reg.Match(html_z);
    string content = mat.Groups[0].ToString().Replace("<dd id=\"contents\">", "").Replace("</dd>", "").Replace("&nbsp;", "").Replace("<br />", "\r\n");

    2、C#完整代码

    using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Web;
    using System.Web.Mvc;
    
    namespace Test.Controllers
    {
        public class CrawlerController : BaseController
        {
            // GET: Crawler
            /// <summary>
            /// Crawls a complete novel from its index page: extracts the novel
            /// name and the chapter list, downloads every chapter and appends
            /// it to a txt file under the application's Txt/ directory.
            /// </summary>
            public void Index()
            {
                // Download the chapter index page of the novel
                // (顶点小说 site). No need to allocate a second controller
                // instance — call our own instance methods directly.
                string html = HttpGet("http://www.23us.so/files/article/html/13/13655/index.html", "");

                // The novel name is the first comma-separated item of the
                // keywords meta tag, e.g.
                // <meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>
                Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
                string name = ma_name.Groups[1].Value.Split(',')[0];

                // All chapter links live in <table ... id="at">.
                // RegexOptions.Singleline lets '.' match newlines, replacing
                // the fragile (.|\n)*? idiom with embedded line breaks.
                Regex reg_mulu = new Regex(
                    @"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">.*?</table>",
                    RegexOptions.Singleline);
                string mulu = reg_mulu.Match(html).Groups[0].Value;

                // Match every <a href="...">title</a> inside the table.
                // Group 1 = chapter url, group 2 = chapter title.
                Regex tmpreg = new Regex("<a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a>", RegexOptions.Compiled);
                MatchCollection sMC = tmpreg.Matches(mulu);

                //循环目录url,获取正文内容
                for (int i = 0; i < sMC.Count; i++)
                {
                    // e.g. group 0 is <a href="http://www.23us.so/files/article/html/13/13655/5638725.html">第一章 泰山之巅</a>
                    //      group 1 is http://www.23us.so/files/article/html/13/13655/5638725.html
                    //      group 2 is 第一章 泰山之巅

                    //获取章节标题
                    string title = sMC[i].Groups[2].Value;

                    //获取文章内容
                    string html_z = HttpGet(sMC[i].Groups[1].Value, "");

                    // Chapter text sits inside <dd id="contents"> ... </dd>;
                    // strip the wrapper tags and convert <br /> to CRLF.
                    Regex reg = new Regex(@"<dd id=""contents"">.*?</dd>", RegexOptions.Singleline);
                    string content = reg.Match(html_z).Groups[0].Value
                        .Replace("<dd id=\"contents\">", "")
                        .Replace("</dd>", "")
                        .Replace("&nbsp;", "")
                        .Replace("<br />", "\r\n");

                    //txt文本输出
                    string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";
                    Novel(title + "\r\n" + content, name, path);
                }
            }

            /// <summary>
            /// Appends a chunk of text to "<paramref name="path"/><paramref name="name"/>.txt",
            /// creating the directory and the file on first use.
            /// </summary>
            /// <param name="content">内容 — text to append.</param>
            /// <param name="name">名字 — novel name (file name without extension).</param>
            /// <param name="path">路径 — target directory, ending with a separator.</param>
            public void Novel(string content, string name, string path)
            {
                // CreateDirectory is a no-op when the directory already
                // exists, so no Exists check is needed.
                Directory.CreateDirectory(path);

                // FileMode.Append creates the file when it is missing, which
                // collapses the original exists/create + append branches into
                // one path. using guarantees the stream is flushed and closed
                // even when WriteLine throws.
                using (FileStream fs = new FileStream(path + name + ".txt", FileMode.Append, FileAccess.Write))
                using (StreamWriter sw = new StreamWriter(fs))
                {
                    // Original wrote content + "\r\n" via WriteLine, i.e. the
                    // content followed by a blank line.
                    sw.WriteLine(content + "\r\n");
                }
            }

            //Post
            /// <summary>
            /// Issues a POST with a form-url-encoded body (gb2312 encoded) and
            /// returns the response body decoded as UTF-8.
            /// </summary>
            /// <param name="Url">Request url.</param>
            /// <param name="postDataStr">Raw form body, e.g. "a=1&amp;b=2".</param>
            public string HttpPost(string Url, string postDataStr)
            {
                Encoding bodyEncoding = Encoding.GetEncoding("gb2312");

                CookieContainer cookie = new CookieContainer();
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                request.Method = "POST";
                request.ContentType = "application/x-www-form-urlencoded";
                // The body is written as gb2312 below, so the declared length
                // must use the same encoding (the original used UTF-8, which
                // is wrong for any non-ASCII payload).
                request.ContentLength = bodyEncoding.GetByteCount(postDataStr);
                request.CookieContainer = cookie;

                using (Stream requestStream = request.GetRequestStream())
                using (StreamWriter writer = new StreamWriter(requestStream, bodyEncoding))
                {
                    writer.Write(postDataStr);
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    response.Cookies = cookie.GetCookies(response.ResponseUri);
                    using (Stream responseStream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }

            //Get
            /// <summary>
            /// Issues a GET request (postDataStr, when non-empty, is appended
            /// as the query string) and returns the response body decoded as
            /// UTF-8.
            /// </summary>
            /// <param name="Url">Request url without query string.</param>
            /// <param name="postDataStr">Query string without the leading '?', or "".</param>
            public string HttpGet(string Url, string postDataStr)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(
                    Url + (postDataStr == "" ? "" : "?") + postDataStr);
                request.Method = "GET";
                request.ContentType = "text/html;charset=UTF-8";

                // The original caught WebException only to call GetResponse()
                // again on the same request, which fails identically — a fake
                // retry. Let the exception propagate to the caller instead.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    View Code

    3、最后效果


    4、补充

    wlong 同学提了个建议,说用NSoup解析html更方便,我就去查了查,目前没有太大的感触,可能不太会用。DLL下载地址http://nsoup.codeplex.com/

    NSoup版:

    NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
    //获取小说名字
    //<meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>
    //获取meta
    NSoup.Select.Elements ele = doc.GetElementsByTag("meta");
    string name = "";
    foreach (var i in ele)
    {
        if (i.Attr("name") == "keywords")
        {
            name = i.Attr("content").ToString();
        }
    }
    //获取章节
    NSoup.Select.Elements eleChapter = doc.GetElementsByTag("table");//查找table,获取table里的html
    NSoup.Nodes.Document docChild = NSoup.NSoupClient.Parse(eleChapter.ToString());
    NSoup.Select.Elements eleChild = docChild.GetElementsByTag("a");//查找a标签
    //循环目录,获取正文内容
    foreach (var j in eleChild)
    {
        string title = j.Text();//获取章节标题
        string htmlChild = cra.HttpGet(j.Attr("href").ToString(), "");//获取文章内容
    }

    HtmlAgilityPack版(NaYoung提供):

    DLL下载地址:HtmlAgilityPack.1.4.6.zip

    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument document = htmlWeb.Load("http://www.23us.so/files/article/html/13/13694/index.html");
    HtmlNodeCollection nodeCollection = document.DocumentNode.SelectNodes(@"//table/tr/td/a[@href]");  //代表获取所有
    string name = document.DocumentNode.SelectNodes(@"//meta[@name='keywords']")[0].GetAttributeValue("content", "").Split(',')[0];
    foreach (var node in nodeCollection)
    {
        HtmlAttribute attribute = node.Attributes["href"];
        String val = attribute.Value;  //章节url
        var title = htmlWeb.Load(val).DocumentNode.SelectNodes(@"//h1")[0].InnerText;  //文章标题
        var doc = htmlWeb.Load(val).DocumentNode.SelectNodes(@"//dd[@id='contents']");
    var content = doc[0].InnerHtml.Replace("&nbsp;", "").Replace("<br>", "\r\n");  //文章内容
    //txt文本输出
    string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";
    Novel(title + "\r\n" + content, name, path);
    }

    Jumony版:

    C# 爬虫 Jumony-html解析

  • 相关阅读:
    使用 media 实现响应式布局
    Django组件的中间件
    Django组件的cookie和 session,用户认证组件
    Django的文件上传和分页
    Ajax
    Django模型层的多表操作(2)
    Django模型层的多表操作(1)
    Django的查询表记录
    Django2.0版本的路由层和ORM但表操作
    Django的路由层,视图层和模版层
  • 原文地址:https://www.cnblogs.com/cang12138/p/7464226.html
Copyright © 2011-2022 走看看