zoukankan      html  css  js  c++  java
  • 爬虫爬取金庸小说--回顾经典小说

    金庸小说网爬虫

    爬虫新人,欢迎各位大神指出 BUG。

    链接:http://www.jinyongwang.com/book/

    requests + BeautifulSoup + multiprocessing

    github地址:https://github.com/Frankssss/NovelSpider

    import os
    import random
    import time
    from multiprocessing import Process, Queue
    from queue import Empty

    import requests
    from bs4 import BeautifulSoup as bs
    
    
    class NovelDownload(Process):
        """Worker process that saves books from jinyongwang.com as text files.

        Each worker repeatedly takes a ``[book_name, book_url]`` item from the
        shared queue and appends every chapter title and paragraph of that book
        to ``<output_dir>/<book_name>.txt``.
        """

        BASE_URL = 'http://www.jinyongwang.com'
        BOOK_INDEX_URL = BASE_URL + '/book/'

        def __init__(self, name, url_queue,
                     output_dir='d:\\data\\金庸', idle_timeout=1.0):
            """Create a worker.

            Args:
                name: Process name, also used in the progress messages.
                url_queue: Queue of ``[book_name, book_url]`` items to download.
                output_dir: Directory that receives one ``.txt`` file per book.
                idle_timeout: Seconds to wait for a queue item before the
                    worker decides the queue is drained and exits.
            """
            super(NovelDownload, self).__init__()
            self.name = name
            self.url_queue = url_queue
            self.output_dir = output_dir
            self.idle_timeout = idle_timeout

        def run(self):
            """Download books from the queue until no item arrives in time."""
            print('%s 开始' % self.name)
            os.makedirs(self.output_dir, exist_ok=True)
            while True:
                # A timed get() replaces the empty()-then-get() pair: with
                # several workers another process can take the last item
                # between the two calls, leaving a blocking get() stuck forever.
                try:
                    book = self.url_queue.get(timeout=self.idle_timeout)
                except Empty:
                    break
                print('%s 开始下载' % book[0])
                path = os.path.join(self.output_dir, '%s.txt' % book[0])
                with open(path, 'a', encoding='utf8') as f:
                    for chapter in self.getChapterList(book):
                        f.write(chapter[0] + '\n')
                        for content in self.getContent(chapter):
                            f.write(content + '\n')
                print('%s 下载完毕' % book[0])
            print('%s 结束' % self.name)

        @staticmethod
        def getBookList(url_queue):
            """Scrape the book index page and enqueue every book found.

            Args:
                url_queue: Queue that receives one ``[book_name, book_url]``
                    item per book.

            Returns:
                A list of the same ``[book_name, book_url]`` items; empty when
                the index page could not be fetched or has an unexpected layout.
            """
            book_list = []
            html = NovelDownload.getHtmlText(NovelDownload.BOOK_INDEX_URL)
            if html is None:
                return book_list
            soup = bs(html, 'lxml')
            booklist = soup.find('div', attrs={'class': 'booklist'})
            ul = booklist.find('ul', attrs={'class': 'list'}) if booklist else None
            if ul is None:
                return book_list
            for li in ul.find_all('li'):
                link = li.find('a')
                cover = li.find('img')
                if link is None or cover is None:
                    continue
                book = [cover.get('alt'), NovelDownload.BASE_URL + link.get('href')]
                book_list.append(book)
                # Enqueue a separate list so the two containers share no state.
                url_queue.put(list(book))
            return book_list

        def getChapterList(self, book):
            """Yield ``[chapter_name, chapter_url]`` for each chapter of *book*.

            Args:
                book: A ``[book_name, book_url]`` item.

            Yields nothing when the book page cannot be fetched or contains no
            chapter list.
            """
            html = NovelDownload.getHtmlText(book[1])
            if html is None:
                return
            soup = bs(html, 'lxml')
            # attrs must be a dict; the original passed the set {'class', 'mlist'}.
            ul = soup.find('ul', attrs={'class': 'mlist'})
            if ul is None:
                return
            for li in ul.find_all('li'):
                link = li.find('a')
                if link is None:
                    continue
                yield [link.get_text(), NovelDownload.BASE_URL + link.get('href')]

        def getContent(self, chapter):
            """Yield the text of each paragraph of *chapter*.

            Args:
                chapter: A ``[chapter_name, chapter_url]`` item.

            Yields nothing when the chapter page cannot be fetched or has no
            content container.
            """
            html = NovelDownload.getHtmlText(chapter[1])
            if html is None:
                return
            soup = bs(html, 'lxml')
            div = soup.find('div', attrs={'class': 'vcon'})
            if div is None:
                return
            for p in div.find_all('p'):
                yield p.get_text()

        @staticmethod
        def getHtmlText(url, timeout=10):
            """Fetch *url* and return its decoded text, or None on failure.

            Args:
                url: Page address to request.
                timeout: Seconds to wait for the server before giving up.

            Returns:
                The response body as text for a 200 response, otherwise None.
            """
            # Random delay of up to one second before every request.
            time.sleep(random.random())
            headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            try:
                r = requests.get(url, headers=headers, timeout=timeout)
                r.raise_for_status()
            except requests.RequestException as exc:
                print('request failed: %s (%s)' % (url, exc))
                return None
            if r.status_code != 200:
                return None
            r.encoding = r.apparent_encoding
            return r.text
    
    
    def create_pool(url_queue):
        """Build four not-yet-started NovelDownload workers sharing *url_queue*.

        Returns:
            A list of the worker processes, in name order.
        """
        worker_names = ('进程1', '进程2', '进程3', '进程4')
        return [NovelDownload(worker_name, url_queue) for worker_name in worker_names]
    
    
    def create_queue():
        """Return a new, empty multiprocessing queue for the download jobs."""
        return Queue()
    
    
    def main():
        """Fill the job queue with every book, then run the workers to completion."""
        jobs = create_queue()
        NovelDownload.getBookList(jobs)
        workers = create_pool(jobs)
        # Start every worker before joining any, so they run concurrently.
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    
    
    if __name__ == '__main__':
        # Run the crawl and print the elapsed wall-clock time in seconds.
        started_at = time.time()
        main()
        elapsed = time.time() - started_at
        print(elapsed)
    

      

  • 相关阅读:
    centos 用户管理
    rsync 实验
    文件共享和传输
    PAT 1109 Group Photo
    PAT 1108 Finding Average
    PAT 1107 Social Clusters
    PAT 1106 Lowest Price in Supply Chain
    PAT 1105 Spiral Matrix
    PAT 1104 Sum of Number Segments
    PAT 1103 Integer Factorization
  • 原文地址:https://www.cnblogs.com/frank-shen/p/9910272.html
Copyright © 2011-2022 走看看