zoukankan      html  css  js  c++  java
  • 爬虫爬取金庸小说--回顾经典小说

    金庸小说网爬虫

    爬虫新人 欢迎各位大神指出BUG

    链接：http://www.jinyongwang.com/book/

    requests + BeautifulSoup + multiprocessing

    github地址:https://github.com/Frankssss/NovelSpider

    import random
    import requests
    import time
    from multiprocessing import Process, Queue
    from bs4 import BeautifulSoup as bs
    
    
    class NovelDownload(Process):
        """Worker process that downloads Jin Yong novels from a shared URL queue.

        Each worker repeatedly takes a ``[book_name, book_url]`` pair off the
        queue and appends every chapter of that book to a local text file.
        """

        def __init__(self, name, url_queue):
            """Create a worker.

            :param name: display name for log output (sets Process.name)
            :param url_queue: multiprocessing.Queue of [book_name, book_url]
            """
            super(NovelDownload, self).__init__()
            self.name = name
            self.url_queue = url_queue

        def run(self):
            """Consume books from the queue until it is empty."""
            print('%s 开始' % self.name)
            time.sleep(1)
            # NOTE(review): empty()/get() is not atomic across processes, so a
            # worker could block on get() in a rare race; acceptable here since
            # the queue is fully populated before any worker starts.
            while not self.url_queue.empty():
                book = self.url_queue.get()
                print('%s 开始下载' % book[0])
                # Fix: open the output file once per book rather than once per
                # chapter; 'a' (append) mode preserves the original layout.
                # Raw string avoids the invalid '\d' escape in the path.
                with open(r'd:\data\金庸\%s.txt' % book[0], 'a', encoding='utf8') as f:
                    for chapter in self.getChapterList(book):
                        # Fix: the scrape had split these literals across lines;
                        # restore the intended '\n' escapes.
                        f.write(chapter[0] + '\n')
                        for content in self.getContent(chapter):
                            f.write(content + '\n')
                print('%s 下载完毕' % book[0])
            print('%s 结束' % self.name)

        @staticmethod
        def getBookList(url_queue):
            """Scrape the book index page and enqueue every book.

            Puts ``[book_name, book_url]`` pairs onto *url_queue* and also
            returns them as a list.
            """
            book_list = []
            url = 'http://www.jinyongwang.com/book/'
            html = NovelDownload.getHtmlText(url)
            soup = bs(html, 'lxml')
            booklist = soup.find('div', attrs={"class": 'booklist'})
            ul = booklist.find('ul', attrs={'class': 'list'})
            for li in ul.find_all('li'):
                # rsplit strips the trailing '/book/' so href (absolute path)
                # can be joined onto the site root.
                book_url = url.rsplit('/', 2)[0] + li.find('a').get('href')
                book_name = li.find('img').get('alt')
                book_list.append([book_name, book_url])
                url_queue.put([book_name, book_url])
            return book_list

        def getChapterList(self, book):
            """Yield ``[chapter_name, chapter_url]`` for each chapter of *book*."""
            html = NovelDownload.getHtmlText(book[1])
            soup = bs(html, 'lxml')
            # Fix: attrs must be a dict, not a set literal.
            ul = soup.find('ul', attrs={'class': 'mlist'})
            for li in ul.find_all('li'):
                chapter_url = 'http://www.jinyongwang.com' + li.find('a').get('href')
                chapter_name = li.find('a').get_text()
                yield [chapter_name, chapter_url]

        def getContent(self, chapter):
            """Yield each paragraph of text for *chapter* (``[name, url]``)."""
            html = NovelDownload.getHtmlText(chapter[1])
            soup = bs(html, 'lxml')
            div = soup.find('div', attrs={'class': 'vcon'})
            for p in div.find_all('p'):
                yield p.get_text()

        @staticmethod
        def getHtmlText(url):
            """Fetch *url* and return the decoded HTML, or ``None`` on failure."""
            time.sleep(random.random())  # polite random delay (< 1 s) between requests
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            try:
                # Fix: add a timeout so a stalled connection cannot hang the
                # worker forever; raise_for_status before touching the body.
                r = requests.get(url, headers=headers, timeout=10)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                return r.text
            # Fix: catch only requests failures instead of a bare except,
            # which also swallowed KeyboardInterrupt/SystemExit.
            except requests.RequestException:
                return None
    
    
    def create_pool(url_queue):
        """Build (without starting) one NovelDownload worker per name.

        :param url_queue: shared queue handed to every worker
        :return: list of un-started NovelDownload processes
        """
        worker_names = ['进程1', '进程2', '进程3', '进程4']
        return [NovelDownload(name, url_queue) for name in worker_names]
    
    
    def create_queue():
        """Return a fresh multiprocessing queue for book download tasks."""
        return Queue()
    
    
    def main():
        """Fill the queue with book URLs, then run all workers to completion."""
        url_queue = create_queue()
        NovelDownload.getBookList(url_queue)
        workers = create_pool(url_queue)
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    
    
    if __name__ == '__main__':
        # Time the whole crawl and report elapsed seconds.
        start = time.time()
        main()
        print(time.time() - start)
    

      

  • 相关阅读:
    codevs 2632 非常好友
    codevs 1213 解的个数
    codevs 2751 军训分批
    codevs 1519 过路费
    codevs 1503 愚蠢的宠物
    codevs 2639 约会计划
    codevs 3369 膜拜
    codevs 3135 River Hopscotch
    数论模板
    JXOJ 9.7 NOIP 放松模拟赛 总结
  • 原文地址:https://www.cnblogs.com/frank-shen/p/9910272.html
Copyright © 2011-2022 走看看