zoukankan      html  css  js  c++  java
  • 爬虫爬取金庸小说--回顾经典小说

    金庸小说网爬虫

    爬虫新人 欢迎各位大神指出BUG

    链接http://www.jinyongwang.com/book/

    requests + BeautifulSoup + multiprocessing

    github地址:https://github.com/Frankssss/NovelSpider

    import random
    import requests
    import time
    from multiprocessing import Process, Queue
    from bs4 import BeautifulSoup as bs
    
    
    class NovelDownload(Process):
        def __init__(self, name, url_queue):
            super(NovelDownload, self).__init__()
            self.name = name
            self.url_queue = url_queue
    
        def run(self):
            print('%s 开始' % self.name)
            time.sleep(1)
            while 1:
                if self.url_queue.empty():
                    break
                book = self.url_queue.get()
                print('%s 开始下载' % book[0])
                for chapter in self.getChapterList(book):
                    with open('d:\data\金庸\%s.txt' % book[0], 'a', encoding='utf8') as f:
                        f.write(chapter[0] + '
    ')
                        for content in self.getContent(chapter):
                            f.write(content + '
    ')
                print('%s 下载完毕' % book[0])
            print('%s 结束' % self.name)
    
        @staticmethod
        def getBookList(url_queue):
            book_list = []
            url = 'http://www.jinyongwang.com/book/'
            html = NovelDownload.getHtmlText(url)
            soup = bs(html, 'lxml')
            booklist = soup.find('div', attrs={"class": 'booklist'})
            ul = booklist.find('ul', attrs={'class': 'list'})
            lis = ul.find_all('li')
            for li in lis:
                book_url = url.rsplit('/', 2)[0] + li.find('a').get('href')
                book_name = li.find('img').get('alt')
                book_list.append([book_name, book_url])
                url_queue.put([book_name, book_url])
            return book_list
    
        def getChapterList(self, book):
            html = NovelDownload.getHtmlText(book[1])
            soup = bs(html, 'lxml')
            ul = soup.find('ul', attrs={'class', 'mlist'})
            lis = ul.find_all('li')
            for li in lis:
                chapter_url = 'http://www.jinyongwang.com' + li.find('a').get('href')
                chapter_name = li.find('a').get_text()
                yield [
                    chapter_name, chapter_url
                ]
    
        def getContent(self, chapter):
            html = NovelDownload.getHtmlText(chapter[1])
            soup = bs(html, 'lxml')
            div = soup.find('div', attrs={'class': 'vcon'})
            ps = div.find_all('p')
            for p in ps:
                content = p.get_text()
                yield content
    
        @staticmethod
        def getHtmlText(url):
            time.sleep(random.random())
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            try:
                r = requests.get(url, headers=headers)
                r.encoding = r.apparent_encoding
                r.raise_for_status()
                if r.status_code == 200:
                    return r.text
            except:
                return None
    
    
    def create_pool(url_queue):
        pool_list = []
        pool_name = ['进程1', '进程2', '进程3', '进程4']
        for name in pool_name:
            p = NovelDownload(name, url_queue)
            pool_list.append(p)
        return pool_list
    
    
    def create_queue():
        url_queue = Queue()
        return url_queue
    
    
    def main():
        url_queue = create_queue()
        NovelDownload.getBookList(url_queue)
        pool_list = create_pool(url_queue)
        for p in pool_list:
            p.start()
        for p in pool_list:
            p.join()
    
    
    if __name__ == '__main__':
        temp = time.time()
        main()
        print(time.time() - temp)
    

      

  • 相关阅读:
    powershell初探(七)
    powershell初探(九)
    打造一个有感觉的Vim(一)
    屏幕录像软件Wink
    注释也精彩
    解决XP专业版局域网访问故障十八招
    可以抓文字的抓图软件
    轻松玩转XP系统(一)
    Excel实战技巧之[活用条件格式]
    局域网传输工具飞鸽传书IPMessager
  • 原文地址:https://www.cnblogs.com/frank-shen/p/9910272.html
Copyright © 2011-2022 走看看