zoukankan      html  css  js  c++  java
  • 控制台爬取小说(大王饶命)

     // Download and print the first chapter; `next` receives the site-relative
     // path of the following page (empty when no next-page link was found).
     var url = GetWBJokeUrl("/book/1719.html");
     string next;
     GetContent(url, out next);

     // Key loop: E quits, Right Arrow loads the next page, any other key is echoed by name.
     while (true)
     {
         ConsoleKeyInfo info = System.Console.ReadKey();
         switch (info.Key)
         {
             case ConsoleKey.E:
                 Environment.Exit(0);
                 break;
             case ConsoleKey.RightArrow:
                 if (string.IsNullOrEmpty(next))
                 {
                     // Without a next-page path, GetWBJokeUrl would return the bare
                     // site root, so there is nothing meaningful to fetch.
                     System.Console.WriteLine("No next page.");
                 }
                 else
                 {
                     GetContent(GetWBJokeUrl(next), out next);
                 }
                 break;
             default:
                 System.Console.WriteLine(info.Key);
                 break;
         }
     }
    使用
     /// <summary>
     /// Downloads one chapter page, prints the chapter text to the console and
     /// reports the link to the following page.
     /// </summary>
     /// <param name="firstUrl">Absolute URL of the page to download.</param>
     /// <param name="nexturl">
     /// Receives the path extracted from the "下一页" (next page) anchor, or an empty
     /// string when the page could not be downloaded or contains no such anchor.
     /// </param>
     private static void GetContent(string firstUrl, out string nexturl)
     {
         nexturl = "";

         var html = GetUrlContent(firstUrl);
         if (string.IsNullOrEmpty(html))
         {
             // GetUrlContent returns null when the request fails; there is nothing to parse.
             return;
         }

         // Locate the whole "next page" anchor first, then pull the path out of it.
         var nextAnchorPattern = @"<a href=([^>]+?)>下一页</a>";
         string pathPattern = "/.+html";
         nexturl = MatchReg(pathPattern, MatchReg(nextAnchorPattern, html));

         // [\s\S] matches any character including newlines, so the lazy quantifier
         // captures everything up to the first closing </div>.
         var bookTextPattern = @"(?m)<div id=""BookText""[^>]*>(?<div>[\s\S]*?)</div[^>]*>";
         var text = MatchReg(bookTextPattern, html, "div").Trim().Replace("<br />", "");

         // Strip the <h4> heading block if there is one. string.Replace throws on an
         // empty search string, so only call it when something was matched.
         var headingPattern = @"<h4>([\s\S]*?)</h4>";
         var heading = MatchReg(headingPattern, text);
         if (heading.Length > 0)
         {
             text = text.Replace(heading, "");
         }

         Console.WriteLine(text);
     }
    19 
    /// <summary>
    /// Runs a regular expression against a piece of text and returns one group
    /// of the first match.
    /// </summary>
    /// <param name="regStr">Regular expression pattern; applied with Multiline and IgnoreCase.</param>
    /// <param name="html">Text to search. A null value is treated as "no match".</param>
    /// <param name="input">Name or number of the group to return; the default "0" is the whole match.</param>
    /// <returns>The requested group's text from the first match, or an empty string when nothing matches.</returns>
    public static string MatchReg(string regStr, string html, string input = "0")
    {
        // Regex.Match throws ArgumentNullException on null input; report "no match" instead.
        if (html == null)
        {
            return "";
        }

        var mc = Regex.Match(html, regStr, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        return mc.Success ? mc.Groups[input].Value : "";
    }
    37 
    /// <summary>
    /// Base address of the site being crawled (scheme and host, no trailing slash).
    /// </summary>
    const string qsbkMainUrl = "http://www.dawangraoming.com";

    /// <summary>
    /// Builds an absolute URL by appending a path to the site's base address.
    /// </summary>
    /// <param name="firsturl">Path to append, e.g. "/book/1719.html"; it is appended as-is.</param>
    /// <returns>The base address immediately followed by <paramref name="firsturl"/>.</returns>
    private static string GetWBJokeUrl(string firsturl)
    {
        return string.Concat(qsbkMainUrl, firsturl);
    }
    54 
    /// <summary>
    /// Downloads a page with an HTTP GET while presenting a desktop-browser User-Agent.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The response body decoded as UTF-8, or null when the request fails for any reason.
    /// </returns>
    private static string GetUrlContent(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36";
            request.Method = "GET";
            request.ContentType = "text/html;charset=UTF-8";

            // The using blocks release the connection, stream and reader even when
            // reading the body throws part-way through.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best-effort download: callers receive null on failure, but the cause is
            // reported on stderr instead of being silently discarded.
            Console.Error.WriteLine("Request to " + url + " failed: " + ex.Message);
            return null;
        }
    }
    封装方法
    好好学习,天天向上。
  • 相关阅读:
    poj 3280 Cheapest Palindrome(区间DP)
    POJ 2392 Space Elevator(多重背包)
    HDU 1285 定比赛名次(拓扑排序)
    HDU 2680 Choose the best route(最短路)
    hdu 2899 Strange fuction (三分)
    HDU 4540 威威猫系列故事――打地鼠(DP)
    HDU 3485 Count 101(递推)
    POJ 1315 Don't Get Rooked(dfs)
    脱离eclipse,手动写一个servlet
    解析xml,几种方式
  • 原文地址:https://www.cnblogs.com/Zhengxue/p/8864869.html
Copyright © 2011-2022 走看看