zoukankan      html  css  js  c++  java
  • Python爬取暴走漫画动态图

    最近在知乎上看到比较好的Python爬虫教程,看过之后对爬虫有了大概的了解,随后自己写了个爬取暴走漫画动图的爬虫练练手,另外附上Python爬虫教程的原始链接,完整看一遍教程之后还是会有很多收获的

    源码

    话不多说,直接上代码

    # -*- coding: UTF-8 -*-
    
    import requests
    import bs4
    import sys
    import os
    import re
    from multiprocessing.dummy import Pool as ThreadPool
    import urllib3
    from tqdm import tqdm
    import shutil
    
    # Listing URL of the GIF catalog; download_page appends '?page=N' to it.
    baseUrl = 'http://baozoumanhua.com/catalogs/gif'
    
    # All working directories live under the directory the script is run from.
    curDir = os.getcwd()
    
    # Scratch directory for the downloaded listing pages (page<N>.html).
    htmlDir = os.path.join(curDir, 'htmls')
    
    # Output directory for the downloaded gif files.
    gifDir = os.path.join(curDir, 'gifs')
    
    # Maps gif file name -> download URL; filled by add_gifMap.
    gifMap = {}
    
    # Running counter used by get_gif_name to number items that have no title.
    noneCnt = 0
    
    # Characters not allowed in Windows file names (plus any whitespace);
    # download_gif replaces each match with '_'.
    pat = re.compile(r'[\/|\?|\*|:|\||\\|<|>|\s|"]')
    
    # Number of listing pages to process; set_env lowers it to 10 when the
    # gif directory already holds more than 5000 files.
    total_pages = 1000
    
    # Placeholder; downloader() rebinds this to a tqdm progress bar.
    get_gifs_bar = ''
    
    
    def get_html_text(url):
        """Fetch ``url`` and return the response body as text.

        The response encoding is set from ``apparent_encoding`` before the
        body is decoded.

        Args:
            url: Address of the page to fetch.

        Returns:
            The decoded page text, or the placeholder string
            "Something Wrong!" when the request fails (connection error,
            timeout after 30 seconds, or an HTTP error status).
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Only request failures are turned into the placeholder; a bare
            # except would also hide KeyboardInterrupt and programming errors.
            return "Something Wrong!"
    
    
    def get_pages(num):
        """Download listing pages 1..num with a pool of 8 worker threads.

        Each page is fetched and stored by download_page. Progress is shown
        on the module-level ``get_pages_bar``, which download_page advances
        once per page.

        Args:
            num: Number of listing pages to download.
        """
        global get_pages_bar
        # Size the bar from the pages actually requested rather than the
        # global total_pages, so the bar is correct for any ``num``.
        get_pages_bar = tqdm(total=num, ascii=True)
        tqdm.write('.')
        tqdm.write('.')
        tqdm.write('.')
        tqdm.write('Downloading web pages...')
        pool = ThreadPool(8)
        try:
            pool.map(download_page, range(1, num + 1))
        finally:
            # Release the workers and the bar even if a download raised.
            pool.close()
            pool.join()
            get_pages_bar.close()
    
    
    def get_gif_name(num, item):
        """Build the display name ``<author>:<title>`` for one article item.

        The author is the text of the first ``<a class="text-overflow">``
        tag; the title is the text of the first ``<a>`` tag that carries a
        ``data-full-url`` attribute and has text. When no usable title is
        found the name becomes ``<author>:NA<k>``, where ``k`` is the
        module-level counter ``noneCnt`` (incremented on each such use).
        A missing author is rendered as ``NA``.

        Args:
            num: Page number the item came from (currently unused).
            item: Parsed article element exposing ``find`` / ``find_all``.

        Returns:
            The gif name without file extension.
        """
        global noneCnt
        author_tag = item.find('a', 'text-overflow')
        # ``.string`` is None for tags without a single text child, so guard
        # both the missing tag and the missing text instead of crashing.
        author_text = author_tag.string if author_tag is not None else None
        author = (author_text or 'NA') + ':'
        for a in item.find_all('a'):
            if a.has_attr('data-full-url') and a.string:
                return author + a.string
        gif_name = author + 'NA' + str(noneCnt)
        noneCnt += 1
        return gif_name
    
    
    def get_gif_links(item):
        """Return the ``data-original`` URLs of the lazy-loaded images in ``item``.

        Only ``<img>`` tags with class ``lazy lazy-img none`` are considered,
        and tags lacking a ``data-original`` attribute are skipped. Order
        follows the order of the tags in the item.
        """
        candidates = item.find_all('img', 'lazy lazy-img none')
        return [
            tag['data-original']
            for tag in candidates
            if tag.has_attr('data-original')
        ]
    
    
    def add_gifMap(name, links):
        """Register the download links of one item in the global ``gifMap``.

        A single link is stored under ``<name>.gif``; several links are
        stored under ``<name>1.gif``, ``<name>2.gif``, ... in order. An
        empty ``links`` list registers nothing.
        """
        global gifMap
        count = len(links)
        if count == 0:
            return
        if count == 1:
            gifMap[name + '.gif'] = links[0]
            return
        # Number the files starting from 1 when an item has several gifs.
        for idx, link in enumerate(links, 1):
            gifMap[name + str(idx) + '.gif'] = link
    
    
    def get_gifs(num):
        """Parse saved listing pages 1..num and collect gif names and links.

        Each ``page<N>.html`` in ``htmlDir`` is parsed with BeautifulSoup;
        every ``<div class="article">`` contributes its name and links to
        the global ``gifMap`` via add_gifMap.

        Args:
            num: Number of saved pages to parse.
        """
        tqdm.write('.')
        tqdm.write('.')
        tqdm.write('.')
        tqdm.write('Parsing pages...')
        # Size the bar from the pages actually parsed, not the global total.
        get_links_bar = tqdm(total=num, ascii=True)
        for n in range(1, num + 1):
            file_name = os.path.join(htmlDir, 'page' + str(n) + '.html')
            # The whole file is consumed while the soup is built, so the
            # handle can be closed right away instead of leaking one per page.
            with open(file_name, 'rb') as page_file:
                soup = bs4.BeautifulSoup(page_file, 'lxml')
            for item in soup.find_all('div', 'article'):
                gif_name = get_gif_name(n, item)
                gif_links = get_gif_links(item)
                add_gifMap(gif_name, gif_links)
            get_links_bar.update(1)
        get_links_bar.close()
    
    
    def download_gif(name):
        """Download the gif registered under ``name`` in ``gifMap`` into ``gifDir``.

        Characters matched by ``pat`` are replaced with ``_`` to form the
        file name. A file that already exists in ``gifDir`` is not downloaded
        again. Failures are reported through ``tqdm.write`` and otherwise
        ignored (best effort); the progress bar ``get_gifs_bar`` advances
        once per call regardless of the outcome.

        Args:
            name: Key into ``gifMap``.
        """
        file_name = re.sub(pat, '_', name)
        # Check the directory the gifs are actually written to. The previous
        # check looked under htmlDir/gifs, which never exists, so every gif
        # was downloaded again on each run.
        target = os.path.join(gifDir, file_name)
        try:
            if os.path.exists(target):
                return
            # NOTE(review): verify=False disables TLS certificate checks.
            r = requests.get(gifMap[name], timeout=30, verify=False)
            r.raise_for_status()
            with open(target, 'wb') as fo:
                fo.write(r.content)
        except Exception:
            # Best effort: report and move on, but let KeyboardInterrupt and
            # SystemExit through (a bare except would swallow them).
            tqdm.write('Download ' + name + ' fail...')
        finally:
            get_gifs_bar.update(1)
    
    
    def downloader():
        """Download every gif listed in ``gifMap`` using 8 worker threads.

        Creates the module-level ``get_gifs_bar`` progress bar sized to the
        number of entries in ``gifMap``; download_gif advances it.
        """
        global get_gifs_bar
        names = gifMap.keys()
        for line in ('.', '.', '.', 'Downloading gifs...'):
            tqdm.write(line)
        get_gifs_bar = tqdm(total=len(names), ascii=True)
        workers = ThreadPool(8)
        workers.map(download_gif, names)
        workers.close()
        workers.join()
        get_gifs_bar.close()
    
    
    def download_page(num):
        """Fetch listing page ``num`` and save it as ``page<num>.html`` in ``htmlDir``.

        Whatever get_html_text returns is written to the file, then the
        ``get_pages_bar`` progress bar is advanced by one.

        Args:
            num: 1-based page number appended to ``baseUrl`` as ``?page=``.
        """
        url = baseUrl + '?page=' + str(num)
        file_name = os.path.join(htmlDir, 'page' + str(num) + '.html')
        text = get_html_text(url)
        # The file is opened in binary mode, so encode text explicitly as
        # UTF-8 instead of relying on the interpreter's default encoding.
        data = text if isinstance(text, bytes) else text.encode('utf-8')
        with open(file_name, 'wb') as fo:
            fo.write(data)
        get_pages_bar.update(1)
    
    
    def set_env():
        """Prepare the working directories before a run.

        If ``gifDir`` already holds more than 5000 files, the global
        ``total_pages`` is reduced to 10 so only recent pages are processed.
        ``gifDir`` is created when missing, and ``htmlDir`` is always
        recreated empty.
        """
        global total_pages
        if os.path.exists(gifDir):
            file_count = 0
            for _, _, files in os.walk(gifDir):
                file_count += len(files)
            if file_count > 5000:
                total_pages = 10
                tqdm.write('Find many gifs in dir, just update gifs...')
        else:
            os.mkdir(gifDir)
        # Start from a clean scratch directory for the listing pages.
        if os.path.exists(htmlDir):
            shutil.rmtree(htmlDir)
        os.mkdir(htmlDir)
    
    
    def main():
        """Run the whole pipeline: prepare dirs, fetch pages, parse, download.

        The temporary ``htmlDir`` is removed once the downloads finish.

        Returns:
            0, used by the caller as the process exit status.
        """
        set_env()
        get_pages(total_pages)
        get_gifs(total_pages)
        downloader()
        shutil.rmtree(htmlDir)
        tqdm.write('Congratulatins!!!')
        tqdm.write('All pictures in folder : gifs...')
        tqdm.write('Just open the folder and enjoy yourself!!!')
        # 'pause' is a Windows shell built-in; on other systems the call only
        # produces a "command not found" error, so skip it there.
        if os.name == 'nt':
            os.system('pause')
        return 0
    
    
    if __name__ == "__main__":
        # Silence urllib3 warnings (download_gif sends requests with
        # verify=False).
        urllib3.disable_warnings()
        # Python 2 only: reload(sys) makes sys.setdefaultencoding available
        # again so the process-wide default string encoding can be set to
        # UTF-8. Neither call exists in Python 3.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        # Use main()'s return value as the process exit status.
        sys.exit(main())
    
    

    程序运行示例

    首次执行会下载1000页的动图,请耐心等待,根据网络状况不同,可能需要30分钟的时间,并且确保磁盘有13G的空间
    完整执行过一次该软件后,再次执行只会更新最新10页动图
    执行完毕后,所有动图保存在当前文件夹下的gifs文件夹中

  • 相关阅读:
    tornado+websocket+mongodb实现在线视屏文字聊天
    mongoexport 导出需要授权数据库中的集合 报错 Authentication failed.
    nginx日志中添加请求的response日志
    SSE(Server-sent events)技术在web端消息推送和实时聊天中的使用
    RESTful接口设计原则和优点
    一次请求中,经过 nginx+uWSGI+flask应用程序搭建服务的执行过程
    项目中记录影响性能的缓慢数据库查询
    macos Item2 添加 Shell Integration (ftp传输)
    windows安装 阿里云的Fun工具
    windows10安装docker[含百度网盘docker安装包]
  • 原文地址:https://www.cnblogs.com/migoo/p/8953314.html
Copyright © 2011-2022 走看看