zoukankan      html  css  js  c++  java
  • 没事虫子爬个书...

    x

    看到程序猿写爬虫的故事……一个无聊的周末,也想用 Jumony 爬点书,囤起来……仓鼠症……

    using Ivony.Html;
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Text;
    using System.Windows.Forms;
    
    namespace BookGet
    {
        /// <summary>
        /// Main form of the book scraper. Clicking the button walks the category
        /// listing pages, collects every book link found there and then downloads
        /// each book chapter by chapter into a text file.
        /// </summary>
        /// <remarks>
        /// All network and file work runs synchronously inside the click handler,
        /// so the window does not repaint until the whole run has finished.
        /// </remarks>
        public partial class Form1 : Form
        {
            /// <summary>Site root; prepended to the "start reading" link of a book.</summary>
            private const string SiteRootUrl = "https://m.xxx.net/";

            /// <summary>Base address of the category listing pages.</summary>
            private const string ListingBaseUrl = "https://m.xxx.net/wapsort/";

            /// <summary>First listing page, relative to <see cref="ListingBaseUrl"/>.</summary>
            private const string FirstListingPage = "11_1.html";

            /// <summary>Upper bound on the number of listing pages visited in one run.</summary>
            private const int MaxListingPages = 20;

            /// <summary>Directory that receives one <c>.txt</c> file per book.</summary>
            private const string OutputDirectory = @"D:\yuzhaiwu";

            /// <summary>
            /// Line break written into the book files.
            /// NOTE(review): the original literals were mangled into raw line breaks;
            /// "\r\n" is assumed to be what they contained - confirm.
            /// </summary>
            private const string LineBreak = "\r\n";

            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>
            /// Collects the book links and downloads every book. A failure while
            /// downloading one book is reported in a message box and does not stop
            /// the remaining downloads.
            /// </summary>
            /// <param name="sender">The clicked button (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void button1_Click(object sender, EventArgs e)
            {
                Dictionary<string, string> bookInfoDic = CollectBookLinks();

                foreach (var item in bookInfoDic)
                {
                    try
                    {
                        DownloadBook(item.Key, item.Value);
                    }
                    catch (Exception ex)
                    {
                        // Covers I/O errors as well as parser/network failures;
                        // report it and carry on with the next book.
                        MessageBox.Show(ex.Message);
                    }
                }

                MessageBox.Show("全部成功!");
            }

            /// <summary>
            /// Walks the listing pages, following the "next" link, and gathers the
            /// books found on them.
            /// </summary>
            /// <returns>
            /// Book title mapped to the <c>href</c> of its link; when a title occurs
            /// more than once the first link wins.
            /// </returns>
            private static Dictionary<string, string> CollectBookLinks()
            {
                var books = new Dictionary<string, string>();
                string pageUrl = ListingBaseUrl + FirstListingPage;

                for (int visitedPages = 0;
                     visitedPages < MaxListingPages && !string.IsNullOrEmpty(pageUrl);
                     visitedPages++)
                {
                    try
                    {
                        var doc = new Ivony.Html.Parser.JumonyParser().LoadDocument(pageUrl);

                        // Every book link on this listing page.
                        IEnumerable<IHtmlElement> bookLinks = doc.Find("#nr_body div div.common-bookele h3 a");
                        foreach (var bookLink in bookLinks)
                        {
                            string title = bookLink.InnerText();
                            string link = bookLink.Attribute("href").Value();
                            if (!books.ContainsKey(title))
                            {
                                books.Add(title, link);
                            }
                        }

                        var domNext = doc.FindFirst("#nr_body div#page a.next");
                        string nextUrl = domNext.Attribute("href").Value();

                        // NOTE(review): the element was selected with "a.next", so a class
                        // attribute equal to "prev none" looks unreachable; perhaps
                        // "next none" was intended. Kept unchanged - confirm against the site.
                        if (domNext.Attribute("class").Value() == "prev none")
                        {
                            nextUrl = "";
                        }

                        // NOTE(review): the href is used as-is for the next request, which
                        // presumes the site emits absolute URLs here - confirm.
                        pageUrl = nextUrl;
                    }
                    catch (Exception ex)
                    {
                        // Stop paging on the first failure, but keep what was collected so far.
                        Console.WriteLine(string.Format("{0}没有成功: {1}", pageUrl, ex.Message));
                        pageUrl = "";
                    }
                }

                return books;
            }

            /// <summary>
            /// Downloads one book by following the "next chapter" links from its first
            /// chapter and appends the text to the book's file.
            /// </summary>
            /// <param name="bookName">Book title; used (sanitised) as the file name.</param>
            /// <param name="bookUrl">
            /// Address of the book's main page. A "next chapter" link equal to this
            /// address marks the end of the book.
            /// </param>
            private static void DownloadBook(string bookName, string bookUrl)
            {
                // Load the main page before touching the disk so a dead link does not
                // leave an empty file behind.
                var mainPage = new Ivony.Html.Parser.JumonyParser().LoadDocument(bookUrl);
                var beginReadEle = mainPage.FindFirst("#novelMain a.btn");
                string nextTextPage = SiteRootUrl + beginReadEle.Attribute("href").Value();

                Directory.CreateDirectory(OutputDirectory);
                string bookPath = Path.Combine(OutputDirectory, ToSafeFileName(bookName) + ".txt");

                // append: true opens an existing file at its end or creates a new one,
                // which is what the former File.Exists / FileMode branch did by hand.
                // The using block flushes and closes the file for every book, even when
                // a chapter fails half-way, so the text fetched so far is preserved.
                using (var writer = new StreamWriter(bookPath, true, Encoding.UTF8))
                {
                    // Guards against a "next" link that loops back to an earlier chapter.
                    var visitedPages = new HashSet<string>();

                    while (!string.IsNullOrEmpty(nextTextPage) && visitedPages.Add(nextTextPage))
                    {
                        var chapterPage = new Ivony.Html.Parser.JumonyParser().LoadDocument(nextTextPage, Encoding.UTF8, true);

                        string chapterTitle = chapterPage.FindFirst("#nr_title").InnerText();

                        // Empty paragraphs must be removed first; once "<p>" and "</p>"
                        // have been replaced the "<p></p>" pattern can no longer match.
                        string chapterText = chapterPage.FindFirst("#nr1").InnerHtml()
                            .Replace("<p></p>", "")
                            .Replace("<p>", LineBreak)
                            .Replace("</p>", LineBreak);

                        writer.Write(LineBreak + LineBreak + LineBreak);
                        writer.Write(chapterTitle);
                        writer.Write(LineBreak);
                        writer.Write(chapterText);

                        // Link to the next chapter.
                        var nextPageEle = chapterPage.FindFirst("#nr_body a#pb_next");
                        nextTextPage = nextPageEle.Attribute("href").Value();

                        // On the last chapter the link points back to the book's main page.
                        if (nextTextPage == bookUrl)
                        {
                            nextTextPage = "";
                        }
                    }

                    writer.WriteLine();
                }
            }

            /// <summary>
            /// Replaces every character that is not allowed in a file name with an underscore.
            /// </summary>
            /// <param name="name">Raw name, e.g. a book title taken from the page.</param>
            /// <returns>The name with invalid file-name characters replaced by <c>_</c>.</returns>
            private static string ToSafeFileName(string name)
            {
                var safeName = new StringBuilder(name);
                foreach (char invalidChar in Path.GetInvalidFileNameChars())
                {
                    safeName.Replace(invalidChar, '_');
                }

                return safeName.ToString();
            }
        }
    }

    x

  • 相关阅读:
    【linux】which和whereis
    【linux】locate介绍
    【linux】find命令详解
    【linux】umask
    【linux】文件目录说明
    【linux】Linux系统信息查看命令大全
    【linux】mkdir -p命令
    【linux】head&&tail
    【linux】less &&more
    【linux】ls常用参数
  • 原文地址:https://www.cnblogs.com/love-zf/p/8612693.html
Copyright © 2011-2022 走看看