zoukankan      html  css  js  c++  java
  • 想看小说,自己写个采集类,读网页文章写入txt文件

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    
    namespace allen
    {
        /// <summary>
        /// Console scraper: downloads every page of one novel from the txt66.com
        /// paged reader, strips the surrounding HTML and appends the remaining
        /// text to a local file.
        /// </summary>
        class Program
        {
            /// <summary>URL template of the paged reader; {0} receives the page number.</summary>
            private const string PageUrlFormat = "http://www.txt66.com/read2.asp?id=8480&PageNum={0}";

            /// <summary>File the extracted text is appended to.</summary>
            private const string OutputPath = @"F:\g.txt";

            /// <summary>First page number requested.</summary>
            private const int FirstPage = 1;

            /// <summary>Last page number requested (inclusive).</summary>
            private const int LastPage = 123;

            /// <summary>Pause between two page requests, in milliseconds.</summary>
            private const int DelayBetweenPagesMs = 600;

            /// <summary>Markup immediately preceding the article body (after whitespace removal).</summary>
            private const string BodyStartMarker = "点此下载封神演义.txt</font></font></a></div></td>";

            /// <summary>Markup immediately following the article body (after whitespace removal).</summary>
            private const string BodyEndMarker = "</div></td></tr><tr><tdclass=";

            // The patterns never change, so they are built once instead of on every call.
            private static readonly Regex s_whitespace = new Regex(@"\s+");

            // Markers are escaped so that the '.' in ".txt" is matched literally, exactly
            // like the literal string.Replace calls that strip the markers afterwards.
            // '.*' is greedy: the body runs up to the LAST end marker on the page.
            private static readonly Regex s_body = new Regex(
                Regex.Escape(BodyStartMarker) + ".*" + Regex.Escape(BodyEndMarker));

            // Matches <br>, <BR>, <br/> ... (whitespace has already been removed when this runs).
            private static readonly Regex s_lineBreak = new Regex("<br/?>", RegexOptions.IgnoreCase);

            /// <summary>
            /// Downloads the document at <paramref name="url"/> and returns it as a string.
            /// </summary>
            /// <param name="url">Absolute HTTP(S) address of the page to fetch.</param>
            /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
            /// <exception cref="InvalidCastException">
            /// <paramref name="url"/> does not use an HTTP scheme.
            /// </exception>
            /// <exception cref="WebException">The request failed.</exception>
            static string GetHtml(string url)
            {
                // A direct cast fails loudly for non-HTTP URLs instead of producing a
                // NullReferenceException one line later.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

                // The response, its stream and the reader all hold the connection;
                // dispose every one of them even when reading throws.
                // NOTE(review): Encoding.Default is kept from the original; it is only
                // correct if the machine's default encoding matches the page's encoding
                // (presumably GB2312/GBK for this site) - confirm.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.Default))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Filter that keeps only the article body of a reader page.
            /// </summary>
            /// <param name="htmlStr">Raw HTML of one reader page.</param>
            /// <returns>
            /// The article text with line-break tags turned into
            /// <see cref="Environment.NewLine"/>, or an empty string when
            /// <paramref name="htmlStr"/> is null/empty or the body markers are not found.
            /// </returns>
            static string MyFilter(string htmlStr)
            {
                if (string.IsNullOrEmpty(htmlStr))
                {
                    return string.Empty;
                }

                // Drop every whitespace character first so the markers match regardless
                // of how the page is indented or wrapped.
                string compact = s_whitespace.Replace(htmlStr, "");

                Match match = s_body.Match(compact);
                if (!match.Success)
                {
                    return string.Empty;
                }

                string result = match.Value;
                // Strip the two markers (including any repeated occurrence inside the match).
                result = result.Replace(BodyStartMarker, "");
                result = result.Replace(BodyEndMarker, "");
                // Strip leftover table markup and the site's "downloaded from" line.
                result = result.Replace("</tr></table>", "");
                result = result.Replace("本文章下载于www.Txt66.com", "");
                // Turn HTML line breaks (any casing, optionally self-closed) into real ones.
                result = s_lineBreak.Replace(result, Environment.NewLine);
                return result;
            }

            /// <summary>
            /// Reads the article page by page and appends the text to the output file.
            /// </summary>
            /// <remarks>
            /// The file is opened in append mode, so running the program twice
            /// appends the text a second time.
            /// </remarks>
            static void WriteFile()
            {
                // 'using' guarantees the text written so far is flushed and the file is
                // closed even when a download fails half-way through.
                using (StreamWriter writer = new StreamWriter(OutputPath, true, Encoding.Unicode))
                {
                    for (int page = FirstPage; page <= LastPage; page++)
                    {
                        string html = GetHtml(string.Format(PageUrlFormat, page));
                        string text = MyFilter(html);

                        if (text.Length == 0)
                        {
                            // Do not report success for a page whose layout did not match.
                            Console.WriteLine("第{0}页未匹配到正文,已跳过", page);
                        }
                        else
                        {
                            writer.Write(text);
                            Console.WriteLine("写入第{0}页", page);
                        }

                        // Be polite to the server between requests.
                        System.Threading.Thread.Sleep(DelayBetweenPagesMs);
                    }
                }
            }

            /// <summary>
            /// Program entry point: runs the scrape, then waits for a key press so the
            /// console window stays open.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                WriteFile();
                Console.ReadKey();
            }
        }
    }
    
    
  • 相关阅读:
    linux 学习笔记
    linux 子系统折腾记 (三)
    linux子系统折腾记 (二)
    windows linux 子系统折腾记
    会计学习笔记(非专业)
    linux 大冒险
    coreRT 和 Native 编译netcore AOT程序
    dotnet core如何编译exe
    win10的hyper-v共享文件夹
    packagereference 里面的资产是怎么回事?
  • 原文地址:https://www.cnblogs.com/liulun/p/1679690.html
Copyright © 2011-2022 走看看