zoukankan      html  css  js  c++  java
  • C#网络爬虫抓取小说

    心血来潮,想研究下爬虫,爬点小说。

    通过百度选择了个小说网站,随便找了一本小书http://www.23us.so/files/article/html/13/13655/index.html

    一、分析html规则

    思路是获取小说章节目录,循环目录,抓取所有章节中的内容,拼到txt文本中。最后形成完本小说。

    1、获取小说章节目录

    通过分析,我在标注的地方获取小说名字及章节目录。

    <meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>// 获取小说名字
    <table cellspacing="1" cellpadding="0" bgcolor="#E4E4E4" id="at">// 所有的章节都在这个table中。

    下面是利用正则,获取名字与目录。

    // 获取小说名字
    Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
    string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
    
    // 获取章节目录
    Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|\n)*?</table>");
    var mat_mulu = reg_mulu.Match(html);
    string mulu = mat_mulu.Groups[0].ToString();

    2、获取小说正文内容

    通过章节a标签中的url地址,查看章节内容。

     通过分析,正文内容在<dd id="contents">中。

    // 获取正文
    Regex reg = new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");
    MatchCollection mc = reg.Matches(html_z);
    var mat = reg.Match(html_z);
    string content = mat.Groups[0].ToString().Replace("<dd id=\"contents\">", "").Replace("</dd>", "").Replace("&nbsp;", "").Replace("<br />", "\r\n");

    二、C#完整代码

      1 using System;
      2 using System.Collections;
      3 using System.Collections.Generic;
      4 using System.IO;
      5 using System.Linq;
      6 using System.Net;
      7 using System.Text;
      8 using System.Text.RegularExpressions;
      9 using System.Web;
     10 using System.Web.Mvc;
     11 
     12 namespace TestInsect.Controllers
     13 {
     14     public class CrawlerController : Controller
     15     {
     16         // GET: Crawler
     17         public ActionResult Index()
     18         {
     19             Index1();
     20             return View();
     21         }
     22         // GET: Crawler
     23         public void Index1()
     24         {
     25             //抓取整本小说
     26             CrawlerController cra = new CrawlerController();// 顶点抓取小说网站小说
     27             string html = cra.HttpGet("http://www.23us.so/files/article/html/13/13655/index.html", "");
     28 
     29             // 获取小说名字
     30             Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
     31             string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
     32 
     33             // 获取章节目录
     34             Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|
    )*?</table>");
     35             var mat_mulu = reg_mulu.Match(html);
     36             string mulu = mat_mulu.Groups[0].ToString();
     37 
     38             // 匹配a标签里面的url
     39             Regex tmpreg = new Regex("<a[^>]+?href="([^"]+)"[^>]*>([^<]+)</a>", RegexOptions.Compiled);
     40             MatchCollection sMC = tmpreg.Matches(mulu);
     41             if (sMC.Count != 0)
     42             {
     43                 //循环目录url,获取正文内容
     44                 for (int i = 0; i < sMC.Count; i++)
     45                 {
     46                     //sMC[i].Groups[1].Value
     47                     //0是<a href="http://www.23us.so/files/article/html/13/13655/5638725.html">第一章 泰山之巅</a> 
     48                     //1是http://www.23us.so/files/article/html/13/13655/5638725.html
     49                     //2是第一章 泰山之巅
     50 
     51                     // 获取章节标题
     52                     string title = sMC[i].Groups[2].Value;
     53 
     54                     // 获取文章内容
     55                     string html_z = cra.HttpGet(sMC[i].Groups[1].Value, "");
     56 
     57                     // 获取小说名字,章节中也可以查找名字
     58                     //Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)"" />");
     59                     //string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
     60 
     61                     // 获取标题,通过分析h1标签也可以得到章节标题
     62                     //string title = html_z.Replace("<h1>", "*").Replace("</h1>", "*").Split('*')[1];
     63 
     64                     // 获取正文
     65                     Regex reg = new Regex(@"<dd id=""contents"">(.|
    )*?</dd>");
     66                     MatchCollection mc = reg.Matches(html_z);
     67                     var mat = reg.Match(html_z);
     68                     string content = mat.Groups[0].ToString().Replace("<dd id="contents">", "").Replace("</dd>", "").Replace("&nbsp;", "").Replace("<br />", "
    ");
     69 
     70                     // txt文本输出
     71                     string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\", "/") + "Txt/";
     72                     Novel(title + "
    " + content, name, path);
     73                 }
     74             }
     75         }
     76 
     77         /// <summary>
     78         /// 创建文本
     79         /// </summary>
     80         /// <param name="content">内容</param>
     81         /// <param name="name">名字</param>
     82         /// <param name="path">路径</param>
     83         public void Novel(string content, string name, string path)
     84         {
     85             string Log = content + "
    ";
     86             // 创建文件夹,如果不存在就创建file文件夹
     87             if (Directory.Exists(path) == false)
     88             {
     89                 Directory.CreateDirectory(path);
     90             }
     91 
     92             // 判断文件是否存在,不存在则创建
     93             if (!System.IO.File.Exists(path + name + ".txt"))
     94             {
     95                 FileStream fs1 = new FileStream(path + name + ".txt", FileMode.Create, FileAccess.Write);// 创建写入文件 
     96                 StreamWriter sw = new StreamWriter(fs1);
     97                 sw.WriteLine(Log);// 开始写入值
     98                 sw.Close();
     99                 fs1.Close();
    100             }
    101             else
    102             {
    103                 FileStream fs = new FileStream(path + name + ".txt" + "", FileMode.Append, FileAccess.Write);
    104                 StreamWriter sr = new StreamWriter(fs);
    105                 sr.WriteLine(Log);// 开始写入值
    106                 sr.Close();
    107                 fs.Close();
    108             }
    109         }
    110 
    111         public string HttpPost(string Url, string postDataStr)
    112         {
    113             CookieContainer cookie = new CookieContainer();
    114             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    115             request.Method = "POST";
    116             request.ContentType = "application/x-www-form-urlencoded";
    117             request.ContentLength = Encoding.UTF8.GetByteCount(postDataStr);
    118             request.CookieContainer = cookie;
    119             Stream myRequestStream = request.GetRequestStream();
    120             StreamWriter myStreamWriter = new StreamWriter(myRequestStream, Encoding.GetEncoding("gb2312"));
    121             myStreamWriter.Write(postDataStr);
    122             myStreamWriter.Close();
    123 
    124             HttpWebResponse response = (HttpWebResponse)request.GetResponse();
    125 
    126             response.Cookies = cookie.GetCookies(response.ResponseUri);
    127             Stream myResponseStream = response.GetResponseStream();
    128             StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8"));
    129             string retString = myStreamReader.ReadToEnd();
    130             myStreamReader.Close();
    131             myResponseStream.Close();
    132 
    133             return retString;
    134         }
    135 
    136         public string HttpGet(string Url, string postDataStr)
    137         {
    138             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + (postDataStr == "" ? "" : "?") + postDataStr);
    139             request.Method = "GET";
    140             HttpWebResponse response;
    141             request.ContentType = "text/html;charset=UTF-8";
    142             try
    143             {
    144                 response = (HttpWebResponse)request.GetResponse();
    145             }
    146             catch (WebException ex)
    147             {
    148                 response = (HttpWebResponse)request.GetResponse();
    149             }
    150 
    151             Stream myResponseStream = response.GetResponseStream();
    152             StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8"));
    153             string retString = myStreamReader.ReadToEnd();
    154             myStreamReader.Close();
    155             myResponseStream.Close();
    156 
    157             return retString;
    158         }
    159     }
    160 }
    View Code

    补充:

    有人说NSoup解析html更方便,可能不太会用。DLL下载地址http://nsoup.codeplex.com/

 1 NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
 2 // Get the novel name
 3 //<meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>
 4 // Find the meta tags
 5 NSoup.Select.Elements ele = doc.GetElementsByTag("meta");
 6 string name = "";
 7 foreach (var i in ele)
 8 {
 9     if (i.Attr("name") == "keywords")
10     {
11         name = i.Attr("content").ToString();
12     }
13 }
14 // Get the chapter list
15 NSoup.Select.Elements eleChapter = doc.GetElementsByTag("table");// find the table and take the html inside it
16 NSoup.Nodes.Document docChild = NSoup.NSoupClient.Parse(eleChapter.ToString());
17 NSoup.Select.Elements eleChild = docChild.GetElementsByTag("a");// find the a tags
18 // Loop over the chapter links and fetch each chapter body
19 foreach (var j in eleChild)
20 {
21     string title = j.Text();// chapter title
22 
23     string htmlChild = cra.HttpGet(j.Attr("href").ToString(), "");// fetch the chapter html; 'cra' is the CrawlerController instance from the full example
24 }
    View Code

     原文:http://www.cnblogs.com/cang12138/p/7464226.html

  • 相关阅读:
    Http请求头与响应头
    获取ip位置方法
    简单的Http Server实现
    HTTP
    long、int与byte数组之间的相互转换
    GlusterFS简单配置
    创建线程池
    网络编程socket
    面向对象-进阶篇
    面向对象-初级篇
  • 原文地址:https://www.cnblogs.com/Cein/p/7491290.html
Copyright © 2011-2022 走看看