zoukankan html css js c++ java

爬虫爬取金庸小说--回顾经典小说

金庸小说网爬虫

爬虫新人欢迎各位大神指出BUG

链接http://www.jinyongwang.com/book/

requests + BeautifulSoup + multiprocessing

github地址：https://github.com/Frankssss/NovelSpider

import random
import requests
import time
from multiprocessing import Process, Queue
from bs4 import BeautifulSoup as bs


class NovelDownload(Process):
    def __init__(self, name, url_queue):
        super(NovelDownload, self).__init__()
        self.name = name
        self.url_queue = url_queue

    def run(self):
        print('%s 开始' % self.name)
        time.sleep(1)
        while 1:
            if self.url_queue.empty():
                break
            book = self.url_queue.get()
            print('%s 开始下载' % book[0])
            for chapter in self.getChapterList(book):
                with open('d:\data\金庸\%s.txt' % book[0], 'a', encoding='utf8') as f:
                    f.write(chapter[0] + '
')
                    for content in self.getContent(chapter):
                        f.write(content + '
')
            print('%s 下载完毕' % book[0])
        print('%s 结束' % self.name)

    @staticmethod
    def getBookList(url_queue):
        book_list = []
        url = 'http://www.jinyongwang.com/book/'
        html = NovelDownload.getHtmlText(url)
        soup = bs(html, 'lxml')
        booklist = soup.find('div', attrs={"class": 'booklist'})
        ul = booklist.find('ul', attrs={'class': 'list'})
        lis = ul.find_all('li')
        for li in lis:
            book_url = url.rsplit('/', 2)[0] + li.find('a').get('href')
            book_name = li.find('img').get('alt')
            book_list.append([book_name, book_url])
            url_queue.put([book_name, book_url])
        return book_list

    def getChapterList(self, book):
        html = NovelDownload.getHtmlText(book[1])
        soup = bs(html, 'lxml')
        ul = soup.find('ul', attrs={'class', 'mlist'})
        lis = ul.find_all('li')
        for li in lis:
            chapter_url = 'http://www.jinyongwang.com' + li.find('a').get('href')
            chapter_name = li.find('a').get_text()
            yield [
                chapter_name, chapter_url
            ]

    def getContent(self, chapter):
        html = NovelDownload.getHtmlText(chapter[1])
        soup = bs(html, 'lxml')
        div = soup.find('div', attrs={'class': 'vcon'})
        ps = div.find_all('p')
        for p in ps:
            content = p.get_text()
            yield content

    @staticmethod
    def getHtmlText(url):
        time.sleep(random.random())
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
        try:
            r = requests.get(url, headers=headers)
            r.encoding = r.apparent_encoding
            r.raise_for_status()
            if r.status_code == 200:
                return r.text
        except:
            return None


def create_pool(url_queue):
    pool_list = []
    pool_name = ['进程1', '进程2', '进程3', '进程4']
    for name in pool_name:
        p = NovelDownload(name, url_queue)
        pool_list.append(p)
    return pool_list


def create_queue():
    url_queue = Queue()
    return url_queue


def main():
    url_queue = create_queue()
    NovelDownload.getBookList(url_queue)
    pool_list = create_pool(url_queue)
    for p in pool_list:
        p.start()
    for p in pool_list:
        p.join()


if __name__ == '__main__':
    temp = time.time()
    main()
    print(time.time() - temp)

查看全文

相关阅读:
[转] DataSet的的几种遍历
 [转] C#实现在Sql Server中存储和读取Word文件（Not Correct Modified）
C# 在根据窗体中的表格数据生成word文档时出错
 【剑指offer】堆栈推弹出序列
 kettle于javascript步骤错误处理
 【算法导论】堆排序
 malloc,free简单的实现
 Lichee （六）优化配置的微内核
 EJBCA于Linux安装在
 【C++】智能指针auto_ptr简单的实现

原文地址：https://www.cnblogs.com/frank-shen/p/9910272.html