zoukankan      html  css  js  c++  java
  • 控制台爬取小说(大王饶命)

     // Download and print the first chapter; 'next' receives the relative link of the following page.
     var url = GetWBJokeUrl("/book/1719.html");
     string next;
     GetContent(url, out next);

     // Key loop: E exits the program, Right Arrow loads the next page,
     // any other key is echoed back by name.
     while (true)
     {
         ConsoleKeyInfo info = System.Console.ReadKey();
         switch (info.Key)
         {
             case ConsoleKey.E:
                 Environment.Exit(0);
                 break;
             case ConsoleKey.RightArrow:
                 // 'next' is empty when the last page had no "next page" link.
                 // GetWBJokeUrl("") would be the bare site address, not a chapter, so do not fetch it.
                 if (string.IsNullOrEmpty(next))
                 {
                     System.Console.WriteLine("No next page.");
                     break;
                 }
                 GetContent(GetWBJokeUrl(next), out next);
                 break;
             default:
                 System.Console.WriteLine(info.Key);
                 break;
         }
     }
    使用
     /// <summary>
     /// Downloads one chapter page of the novel, prints its text to the console and
     /// reports the link to the following page.
     /// </summary>
     /// <param name="firstUrl">Absolute URL of the page to download.</param>
     /// <param name="nexturl">
     /// Receives the path taken from the "下一页" (next page) anchor, or an empty string
     /// when the page could not be downloaded or contains no such anchor.
     /// </param>
     private static void GetContent(string firstUrl, out string nexturl)
     {
         nexturl = "";

         var html = GetUrlContent(firstUrl);
         if (string.IsNullOrEmpty(html))
         {
             // GetUrlContent returns null on failure; there is nothing to parse or print.
             return;
         }

         // First isolate the whole "next page" anchor, then keep only its "/....html" path.
         var nextAnchor = @"<a href=([^>]+?)>下一页</a>";
         var pathPattern = "/.+html";
         nexturl = MatchReg(pathPattern, MatchReg(nextAnchor, html));

         // (?:\w|\W) matches any character, line breaks included, so the lazy group
         // captures everything up to the first closing </div>.
         var divContent = @"(?m)<div id=""BookText""[^>]*>(?<div>(?:\w|\W)*?)</div[^>]*>";
         var text = MatchReg(divContent, html, "div").Trim().Replace("<br />", "");

         // Strip the first <h4>...</h4> element from the chapter text.
         var delh4 = @"<h4>([\s\S]*?)</h4>";
         var heading = MatchReg(delh4, text);
         if (heading.Length > 0)
         {
             // string.Replace throws ArgumentException for an empty search string,
             // so only replace when a heading was actually found.
             text = text.Replace(heading, "");
         }

         Console.WriteLine(text);
     }
    19 
    /// <summary>
    /// Runs a regular expression against a piece of text and returns one group of the first match.
    /// </summary>
    /// <param name="regStr">Regular expression pattern; applied with the Multiline and IgnoreCase options.</param>
    /// <param name="html">Text to search (page markup). May be null.</param>
    /// <param name="input">Name or number of the group to return; the default "0" is the whole match.</param>
    /// <returns>
    /// The value of the requested group from the first match, or an empty string when
    /// nothing matches or <paramref name="html"/> is null.
    /// </returns>
    public static string MatchReg(string regStr, string html, string input = "0")
    {
        // Regex.Match throws ArgumentNullException on a null input; treat it as "no match".
        if (html == null)
        {
            return "";
        }

        var match = Regex.Match(html, regStr, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        return match.Success ? match.Groups[input].Value : "";
    }
    37 
    /// <summary>
    /// Base address of the site being scraped.
    /// </summary>
    const string qsbkMainUrl = "http://www.dawangraoming.com";

    /// <summary>
    /// Builds the address of a page by prefixing it with the site's base address.
    /// </summary>
    /// <param name="firsturl">Text appended to the base address, e.g. "/book/1719.html".</param>
    /// <returns>The base address immediately followed by <paramref name="firsturl"/>.</returns>
    private static string GetWBJokeUrl(string firsturl)
    {
        return string.Concat(qsbkMainUrl, firsturl);
    }
    54 
    /// <summary>
    /// Downloads a page with an HTTP GET request that carries a desktop-browser User-Agent header.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The response body decoded as UTF-8, or null when the request fails.</returns>
    private static string GetUrlContent(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36";
            request.Method = "GET";
            request.ContentType = "text/html;charset=UTF-8";

            // The using blocks dispose the response, stream and reader even when
            // reading throws part-way through.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best-effort download: the null return is kept for callers, but the
            // reason is reported on stderr instead of being dropped silently.
            Console.Error.WriteLine("GetUrlContent failed for " + url + ": " + ex.Message);
            return null;
        }
    }
    封装方法
    好好学习,天天向上。
  • 相关阅读:
    PythonIDE
    Python学习笔记六:return的用法
    Python学习笔记三:逻辑操作符
    Python编程实现对CodeSys中ST代码的自动排版(一)
    Python学习笔记十二:列表(4)len 、in、For的使用
    Python学习笔记七:字符串的操作(一)
    Python学习笔记五:while语句
    Python编程实现对CodeSys中ST代码的自动排版(二)
    Python学习笔记十:列表(2)列表元素的读写
    Python学习笔记九:列表(1)列表元素介绍及创建方法
  • 原文地址:https://www.cnblogs.com/Zhengxue/p/8864869.html
Copyright © 2011-2022 走看看