zoukankan      html  css  js  c++  java
  • Python爬取暴走漫画动态图

    最近在知乎上看到比较好的Python爬虫教程,看过之后对爬虫有了大概的了解,随后自己写了个爬取暴走漫画动图的爬虫练练手,另外附上Python爬虫教程的原始链接,完整看一遍教程之后还是会有很多收获的

    源码

    话不多说,直接上代码

    # -*- coding: UTF-8 -*-
    
    import requests
    import bs4
    import sys
    import os
    import re
    from multiprocessing.dummy import Pool as ThreadPool
    import urllib3
    from tqdm import tqdm
    import shutil
    
    # Listing page for animated GIFs; individual pages are fetched as
    # baseUrl + '?page=N'.
    baseUrl = 'http://baozoumanhua.com/catalogs/gif'
    
    # Working directory of the script; both output folders live under it.
    curDir = os.getcwd()
    
    # Scratch directory holding the downloaded HTML listing pages.
    htmlDir = os.path.join(curDir, 'htmls')
    
    # Output directory where the .gif files are saved.
    gifDir = os.path.join(curDir, 'gifs')
    
    # Maps the chosen gif file name -> its download URL.
    gifMap = {}
    
    # Counter used to generate unique names for gifs whose title is missing.
    noneCnt = 0
    
    # Characters not allowed in Windows file names (replaced with '_').
    pat = re.compile(r'[\/|\?|\*|:|\||\\|<|>|\s|"]')
    
    # Number of listing pages to crawl; set_env() lowers this to 10 on
    # incremental runs.
    total_pages = 1000
    
    # Shared tqdm progress bar for the gif-download phase; replaced with a
    # real tqdm instance in downloader().
    get_gifs_bar = ''
    
    
    def get_html_text(url):
        """Fetch *url* and return the decoded response body.

        On any network/HTTP failure a placeholder string is returned
        instead of raising, so the thread-pool workers never die on a
        bad page.
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            # Prefer the content-sniffed encoding over the HTTP header,
            # which is often wrong or missing on Chinese sites.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # programming errors are no longer swallowed.
            # NOTE(review): the placeholder is kept for backward
            # compatibility -- download_page() writes it to disk as-is.
            return "Something Wrong!"
    
    
    def get_pages(num):
        """Download listing pages 1..num into htmlDir via a thread pool."""
        global get_pages_bar
        get_pages_bar = tqdm(total=total_pages, ascii=True)
        for msg in ('.', '.', '.', 'Downloading web pages...'):
            tqdm.write(msg)
        workers = ThreadPool(8)
        workers.map(download_page, range(1, num + 1))
        workers.close()
        workers.join()
        get_pages_bar.close()
    
    
    def get_gif_name(num, item):
        """Build a display name "author:title" for one article *item*.

        *num* is the page number (unused; kept for interface compatibility).
        Falls back to "author:NA<n>" using the global counter when the
        article carries no usable title text.
        """
        global noneCnt
        # NOTE(review): item.find(...) is assumed non-None, as in the
        # original code -- every article div carries an author link.
        author = item.find('a', 'text-overflow').string + ':'
        for a in item.find_all('a'):
            if a.has_attr('data-full-url'):
                # BUG FIX: bs4 returns .string == None for tags with mixed
                # children; the original crashed with TypeError on the `+`.
                # Skip such anchors and keep looking / fall through.
                if a.string is not None:
                    return author + a.string
        gif_name = author + 'NA' + str(noneCnt)
        noneCnt += 1
        return gif_name
    
    
    def get_gif_links(item):
        """Return the 'data-original' URLs of every lazy-loaded image in *item*."""
        lazy_imgs = item.find_all('img', 'lazy lazy-img none')
        return [tag['data-original'] for tag in lazy_imgs
                if tag.has_attr('data-original')]
    
    
    def add_gifMap(name, links):
        """Register *links* in the global gifMap under names derived from *name*.

        A single link is stored as "<name>.gif"; multiple links receive a
        1-based numeric suffix ("<name>1.gif", "<name>2.gif", ...).
        """
        global gifMap
        if not links:
            return
        if len(links) == 1:
            gifMap[name + '.gif'] = links[0]
        else:
            for idx, link in enumerate(links, start=1):
                gifMap[name + str(idx) + '.gif'] = link
    
    
    def get_gifs(num):
        """Parse the downloaded listing pages and fill gifMap with name -> URL."""
        for msg in ('.', '.', '.', 'Parsing pages...'):
            tqdm.write(msg)
        parse_bar = tqdm(total=total_pages, ascii=True)
        for page in range(1, num + 1):
            path = os.path.join(htmlDir, 'page' + str(page) + '.html')
            # NOTE: the file handle is left to the GC, as in the original.
            soup = bs4.BeautifulSoup(open(path, 'rb'), 'lxml')
            for item in soup.find_all('div', 'article'):
                add_gifMap(get_gif_name(page, item), get_gif_links(item))
            parse_bar.update(1)
        parse_bar.close()
    
    
    def download_gif(name):
        """Download one gif from gifMap[name] into gifDir.

        Skips the download when the target file already exists; logs and
        continues on any failure so one bad gif doesn't stop the pool.
        """
        global gifMap
        global pat
        global get_gifs_bar
        # Sanitize characters that Windows forbids in file names.
        file_name = re.sub(pat, '_', name)
        try:
            # BUG FIX: the existence check used htmlDir/'gifs' while the
            # file is written to gifDir, so already-downloaded gifs were
            # always re-fetched. Check the actual destination path.
            if os.path.exists(os.path.join(gifDir, file_name)):
                return
            # verify=False mirrors the original behavior (urllib3 warnings
            # are disabled in __main__).
            r = requests.get(gifMap[name], timeout=30, verify=False)
            r.raise_for_status()
            with open(os.path.join(gifDir, file_name), 'wb') as fo:
                fo.write(r.content)
        except Exception:
            # Narrowed from a bare `except:`; keep the best-effort logging.
            tqdm.write('Download ' + name + ' fail...')
        finally:
            get_gifs_bar.update(1)
    
    
    def downloader():
        """Fan the gif downloads out across a small thread pool."""
        global get_gifs_bar
        names = list(gifMap.keys())
        for msg in ('.', '.', '.', 'Downloading gifs...'):
            tqdm.write(msg)
        get_gifs_bar = tqdm(total=len(names), ascii=True)
        workers = ThreadPool(8)
        workers.map(download_gif, names)
        workers.close()
        workers.join()
        get_gifs_bar.close()
    
    
    def download_page(num):
        """Fetch listing page *num* and save it as htmls/page<num>.html."""
        url = baseUrl + '?page=' + str(num)
        file_name = os.path.join(htmlDir, 'page' + str(num) + '.html')
        text = get_html_text(url)
        with open(file_name, 'wb') as fo:
            # BUG FIX: encode explicitly. Writing a unicode str to a binary
            # file only worked via the sys.setdefaultencoding('utf-8') hack
            # (Python 2) and raises TypeError on Python 3.
            fo.write(text.encode('utf-8'))
        get_pages_bar.update(1)
    
    
    def set_env():
        """Prepare the working directories and pick how many pages to crawl.

        A gifs folder that already holds more than 5000 files is treated as
        a completed first run, so only the 10 newest pages are refreshed.
        The htmls scratch directory is always recreated empty.
        """
        global total_pages
        if os.path.exists(gifDir):
            gif_count = sum(len(files) for _, _, files in os.walk(gifDir))
            if gif_count > 5000:
                total_pages = 10
                tqdm.write('Find many gifs in dir, just update gifs...')
        else:
            os.mkdir(gifDir)
        if os.path.exists(htmlDir):
            shutil.rmtree(htmlDir)
        os.mkdir(htmlDir)
    
    
    def main():
        """Run the full crawl: set up dirs, fetch pages, parse, download gifs.

        Returns 0 so the __main__ guard can pass it to sys.exit().
        """
        set_env()
        get_pages(total_pages)
        get_gifs(total_pages)
        downloader()
        # The raw HTML pages are only needed during the parsing phase.
        shutil.rmtree(htmlDir)
        # BUG FIX: corrected the user-facing typo 'Congratulatins'.
        tqdm.write('Congratulations!!!')
        tqdm.write('All pictures in folder : gifs...')
        tqdm.write('Just open the folder and enjoy yourself!!!')
        # Windows-only: keeps the console window open until a key is pressed.
        os.system('pause')
        return 0
    
    
    if __name__ == "__main__":
        # Silence the InsecureRequestWarning spam caused by the
        # verify=False requests in download_gif().
        urllib3.disable_warnings()
        # Python 2 only: reload() restores sys.setdefaultencoding (deleted
        # by site.py) so implicit str<->unicode conversions use UTF-8.
        # These two lines fail on Python 3 (reload is not a builtin there).
        reload(sys)
        sys.setdefaultencoding('utf-8')
        sys.exit(main())
    
    

    程序运行示例

    首次执行会下载1000页的动图,请耐心等待,根据网络状况不同,可能需要30分钟的时间,并且确保磁盘有13G的空间
    完整执行过一次该软件后,再次执行只会更新最新10页动图
    执行完毕后,所有动图保存在当前文件夹下的gifs文件夹中

  • 相关阅读:
    关于在windows平台下将应用制作成windows服务及服务依赖的感想
    mysql 变量赋值的三种方法
    如何上传本地jar至远程仓库供其他项目使用
    maven的标准
    修改idea的缓存
    前端的网站
    读取简单的xml
    IDEA 自动设置compile target变成1.5
    注解导出优化版(推荐,十分强大)
    Linux命令干货!!!最常用的命令
  • 原文地址:https://www.cnblogs.com/migoo/p/8953314.html
Copyright © 2011-2022 走看看