zoukankan      html  css  js  c++  java
  • 想看小说,自己写个采集类,读网页文章写入txt文件

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    
    namespace allen
    {
        /// <summary>
        /// Console scraper: downloads the pages of one article from a fixed URL,
        /// strips the surrounding HTML from each page and appends the remaining
        /// text to a local text file.
        /// </summary>
        class Program
        {
            /// <summary>URL template of a page; {0} is replaced by the page number.</summary>
            const string PageUrlFormat = "http://www.txt66.com/read2.asp?id=8480&PageNum={0}";

            /// <summary>File the extracted text is appended to.</summary>
            const string OutputPath = @"F:\g.txt";

            /// <summary>Number of the first page to fetch.</summary>
            const int FirstPage = 1;

            /// <summary>Exclusive upper bound of the page numbers to fetch.</summary>
            const int PageLimit = 124;

            /// <summary>Pause after each page, in milliseconds, to avoid hammering the server.</summary>
            const int DelayBetweenPagesMs = 600;

            /// <summary>
            /// Markup that immediately precedes the article body, as it looks after
            /// all whitespace has been removed from the page.
            /// </summary>
            const string BodyStartMarker = "点此下载封神演义.txt</font></font></a></div></td>";

            /// <summary>
            /// Markup that immediately follows the article body, as it looks after
            /// all whitespace has been removed from the page.
            /// </summary>
            const string BodyEndMarker = "</div></td></tr><tr><tdclass=";

            // Built once and reused: Regex instances are immutable and safe to share,
            // so there is no need to rebuild them (or keep a mutable field) per call.
            static readonly Regex WhitespaceRegex = new Regex(@"\s+");

            // Same pattern text as before (start marker, anything, end marker).
            // Note that the '.' inside ".txt" is a regex wildcard here, not a literal dot.
            static readonly Regex BodyRegex = new Regex(BodyStartMarker + ".*" + BodyEndMarker);

            /// <summary>
            /// Downloads the document at the given address and returns it as text.
            /// </summary>
            /// <param name="url">Absolute HTTP(S) address of the page.</param>
            /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
            static string GetHtml(string url)
            {
                // A direct cast reports a non-HTTP URL as an InvalidCastException right
                // here, instead of a NullReferenceException on the following line.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

                // Dispose response, stream and reader even when reading throws, so the
                // underlying connection is always released.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                // NOTE(review): Encoding.Default depends on the runtime/OS configuration;
                // confirm it matches the charset the site actually serves.
                using (StreamReader reader = new StreamReader(stream, Encoding.Default))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Filter that keeps only the article body of a downloaded page.
            /// </summary>
            /// <param name="htmlStr">Raw HTML of one page.</param>
            /// <returns>
            /// The text between the start and end markers with every "&lt;br&gt;" turned
            /// into a line break, or an empty string when the input is null/empty or the
            /// markers are not found.
            /// </returns>
            static string MyFilter(string htmlStr)
            {
                if (string.IsNullOrEmpty(htmlStr))
                {
                    return string.Empty;
                }

                // Remove every whitespace character first; the markers are written
                // without whitespace (e.g. "<tdclass=") and rely on this step.
                string compact = WhitespaceRegex.Replace(htmlStr, "");

                // Match.Value is "" when nothing matches, so the replacements below
                // simply yield an empty result in that case.
                string result = BodyRegex.Match(compact).Value;

                result = result.Replace(BodyStartMarker, "");
                result = result.Replace(BodyEndMarker, "");
                result = result.Replace("</tr></table>", "");
                result = result.Replace("本文章下载于www.Txt66.com", "");
                result = result.Replace("<br>", Environment.NewLine);
                return result;
            }

            /// <summary>
            /// Fetches every page in turn, extracts its text and appends it to the
            /// output file, reporting progress on the console.
            /// </summary>
            static void WriteFile()
            {
                // "using" guarantees the writer is flushed and closed even if a
                // download or write fails part-way through the loop.
                using (StreamWriter writer = new StreamWriter(OutputPath, true, Encoding.Unicode))
                {
                    for (int pageNumber = FirstPage; pageNumber < PageLimit; pageNumber++)
                    {
                        string pageUrl = string.Format(PageUrlFormat, pageNumber);
                        string text = MyFilter(GetHtml(pageUrl));
                        writer.Write(text);
                        Console.WriteLine("写入第{0}页", pageNumber);
                        System.Threading.Thread.Sleep(DelayBetweenPagesMs);
                    }
                }
            }

            /// <summary>
            /// Entry point: runs the download and then waits for a key press so the
            /// console window stays open.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                WriteFile();
                Console.ReadKey();
            }
        }
    }
    
    
  • 相关阅读:
    16 | 网络优化(中):复杂多变的移动网络该如何优化?
    Understanding Temporal Metrics
    Objective-C Runtime 大佬系列文章整理
    面向对象编程中的封装、抽象、继承、多态特性以及应用
    面向接口编程原理
    一次HTTP请求的完整过程——协议篇(DNS、TCP、HTTP)
    计算机网络 | 图解 DNS & HTTPDNS 原理
    CDN的加速原理是什么?
    让WKWebview支持NSURLProtocol总结
    WKWebView 请求拦截
  • 原文地址:https://www.cnblogs.com/liulun/p/1679690.html
Copyright © 2011-2022 走看看