zoukankan      html  css  js  c++  java
  • 批量下载小说网站文章

     1 using HtmlAgilityPack;
     2 using System;
     3 using System.Collections.Generic;
     4 using System.IO;
     5 using System.Net;
     6 using System.Text;
     7 
     8 namespace DownloadNovel
     9 {
    /// <summary>
    /// Console program that downloads every chapter listed on a novel's index
    /// page and appends the chapter text to a dated file in a "novel" folder
    /// under the current directory.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: runs the download for the hard-coded site and novel index page.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Root address of the novel web site; chapter hrefs are appended to it.
            string webSiteUrl = "http://www.biqugew.com";
            // Address of the table-of-contents page of the novel to download.
            string NovelUrl = "http://www.biqugew.com/book/10/";

            DownNovel(webSiteUrl, NovelUrl);
        }

        /// <summary>
        /// Downloads the index page at <paramref name="NovelUrl"/>, follows the link
        /// of every chapter entry, extracts the chapter title and text and hands
        /// them to <see cref="WriteLog"/>.
        /// </summary>
        /// <param name="webSiteUrl">Prefix that is concatenated with each chapter's href.</param>
        /// <param name="NovelUrl">Address of the novel's table-of-contents page.</param>
        private static void DownNovel(string webSiteUrl, string NovelUrl)
        {
            // Separators used to cut the chapter HTML into fragments: <br> tags and
            // line breaks. "\r\n" is listed before "\n" so a Windows line break is
            // consumed as a whole and no stray '\r' is left in the text.
            string[] split = { "<br>", "\r\n", "\n" };
            // XPath of the chapter entries on the index page.
            // NOTE(review): unlike the two expressions below, this one does not start
            // at /html — confirm it matches the index page markup.
            string TableXpath = "/body[1]/div[1]/div[5]/div[1]/dl[1]/dd";
            // XPath of the chapter title on a chapter page.
            string TitleXpath = "/html[1]/body[1]/div[1]/div[3]/div[1]/div[2]/h1[1]";
            // XPath of the chapter text on a chapter page.
            string ContentsXpath = "/html/body/div/div[3]/div/div[3]";

            // WebClient is IDisposable; the using block releases it even if a download throws.
            using (WebClient client = new WebClient { Encoding = Encoding.GetEncoding("GB2312") })
            {
                // Fetch and parse the index page.
                HtmlDocument indexDoc = new HtmlDocument();
                indexDoc.LoadHtml(client.DownloadString(NovelUrl));
                HtmlNodeCollection nodes = indexDoc.DocumentNode.SelectNodes(TableXpath);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                if (nodes == null)
                {
                    Console.Error.WriteLine("No chapter entries found at " + NovelUrl);
                    return;
                }

                // Walk the chapter list.
                foreach (HtmlNode node in nodes)
                {
                    // Resolve the address of this chapter; skip entries without a usable link.
                    HtmlNode link = node.SelectSingleNode("a");
                    HtmlAttribute href = link == null ? null : link.Attributes["href"];
                    if (href == null)
                    {
                        continue;
                    }
                    string url = webSiteUrl + href.Value;

                    // Download and parse the whole chapter page.
                    HtmlDocument chapterDoc = new HtmlDocument();
                    chapterDoc.LoadHtml(client.DownloadString(url));

                    HtmlNode titleNode = chapterDoc.DocumentNode.SelectSingleNode(TitleXpath);
                    HtmlNode contentNode = chapterDoc.DocumentNode.SelectSingleNode(ContentsXpath);
                    if (titleNode == null || contentNode == null)
                    {
                        Console.Error.WriteLine("Title or content not found at " + url);
                        continue;
                    }

                    string title = titleNode.InnerHtml;
                    // Chapter text with non-breaking-space entities removed.
                    string html = contentNode.InnerHtml.Replace("&nbsp;", "");

                    // Keep every fragment except those that contain an anchor tag.
                    StringBuilder article = new StringBuilder();
                    foreach (string fragment in html.Split(split, StringSplitOptions.RemoveEmptyEntries))
                    {
                        if (!fragment.Contains("<a"))
                        {
                            article.Append(fragment);
                        }
                    }

                    Console.WriteLine(title);
                    WriteLog(title + Environment.NewLine + article);
                }
            }
        }

        /// <summary>
        /// Appends <paramref name="msg"/> followed by an empty line to
        /// "novel/yyyy-MM-dd.txt" (today's local date) under the current directory,
        /// creating the directory when needed.
        /// </summary>
        /// <param name="msg">Text to append.</param>
        static void WriteLog(string msg)
        {
            string directory = Path.Combine(Environment.CurrentDirectory, "novel");
            // CreateDirectory does nothing when the directory already exists,
            // so no separate existence check is required.
            Directory.CreateDirectory(directory);

            string fileName = DateTime.Now.ToString("yyyy-MM-dd");
            string filepath = Path.Combine(directory, fileName + ".txt");

            // Both objects are disposed even when a write throws.
            using (Stream fileStream = File.Open(filepath, FileMode.Append, FileAccess.Write, FileShare.Write))
            using (StreamWriter writeAdapter = new StreamWriter(fileStream, Encoding.Default))
            {
                writeAdapter.WriteLine(msg);
                writeAdapter.WriteLine();
            }
        }

    }
    79 }
  • 相关阅读:
    How to clean up BizTalk Message Box
    BizTalk: Database ‘BizTalkMsgBoxDb’ is full
    Install or Uninstall a Windows Service
    Distinguished Fields and Optional Elements
    Stay Hungry, Stay Foolish
    男子英文名大全,来历、释义
    Correlations in BizTalk 2004
    17.域环境解决方案
    18.windows痕迹清理
    Burpsuit手机抓包
  • 原文地址:https://www.cnblogs.com/sunbingqiang/p/10305290.html
Copyright © 2011-2022 走看看