最近在知乎上看到比较好的Python爬虫教程,看过之后对爬虫有了大概的了解,随后自己写了个爬取暴走漫画动图的爬虫练练手。另外附上Python爬虫教程的原始链接,完整看一遍教程之后还是会有很多收获的。
源码
话不多说,直接上代码
# -*- coding: UTF-8 -*-
import requests
import bs4
import sys
import os
import re
from multiprocessing.dummy import Pool as ThreadPool
import urllib3
from tqdm import tqdm
import shutil
# Listing page of the GIF catalogue; a single page is selected with '?page=N'.
baseUrl = 'http://baozoumanhua.com/catalogs/gif'
# Working directories are created under the directory the script is started from.
curDir = os.getcwd()
# Scratch directory for the downloaded listing pages.
htmlDir = os.path.join(curDir, 'htmls')
# Destination directory for the downloaded GIF files.
gifDir = os.path.join(curDir, 'gifs')
# Maps target file name -> image URL; filled by add_gifMap().
gifMap = {}
# Running counter used to build placeholder names for posts without a title.
noneCnt = 0
# Characters that Windows does not allow in file names (plus whitespace).
pat = re.compile(r'[\/|\?|\*|:|\||\\|<|>|\s|"]')
# Number of listing pages to crawl; set_env() may lower this.
total_pages = 1000
# Shared tqdm progress bars. They are created by get_pages() and downloader();
# None (rather than a string placeholder) marks them as "not created yet".
get_pages_bar = None
get_gifs_bar = None
def get_html_text(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the encoding that requests detects from the
    content (``apparent_encoding``).

    Returns the literal string "Something Wrong!" when anything goes wrong
    while fetching (connection error, timeout, HTTP error status, ...), so
    the caller always gets a string back.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # Best effort: one unreachable page must not abort the whole crawl.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
        return "Something Wrong!"
def get_pages(num):
    """Download listing pages 1..num with a pool of 8 worker threads.

    Each page is fetched and stored by download_page(); progress is shown
    on the module-level ``get_pages_bar`` which the workers update.

    Args:
        num: number of listing pages to download (pages are 1-based).
    """
    global get_pages_bar
    # Size the bar from the argument so it matches the work actually done.
    get_pages_bar = tqdm(total=num, ascii=True)
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('Downloading web pages...')
    pool = ThreadPool(8)
    try:
        pool.map(download_page, range(1, num + 1))
    finally:
        # Always release the worker threads and the bar, even if a worker
        # raised and pool.map() re-raised its exception here.
        pool.close()
        pool.join()
        get_pages_bar.close()
def get_gif_name(num, item):
    """Build the base name '<author>:<title>' for one article element.

    The author is the text of the first ``<a class="text-overflow">`` and
    the title is the text of the first ``<a>`` that carries a
    ``data-full-url`` attribute. When no usable title is found, the
    placeholder 'NA<counter>' is used and the module-level ``noneCnt``
    counter is incremented. A missing author becomes 'NA'.

    Args:
        num: page number; unused, kept for caller compatibility.
        item: parsed article element offering find / find_all.

    Returns:
        The name without the '.gif' extension.
    """
    global noneCnt
    author_tag = item.find('a', 'text-overflow')
    # A tag's .string can be None (e.g. nested markup), and find() returns
    # None when the tag is absent; neither may crash the whole parse run.
    author_text = author_tag.string if author_tag is not None else None
    author = (author_text or 'NA') + ':'
    for a in item.find_all('a'):
        if a.has_attr('data-full-url') and a.string:
            return author + a.string
    gif_name = author + 'NA' + str(noneCnt)
    noneCnt += 1
    return gif_name
def get_gif_links(item):
    """Return the image URLs found in one article element.

    Looks at every ``<img>`` whose class is 'lazy lazy-img none' and keeps
    the value of its ``data-original`` attribute; images without that
    attribute are ignored. Document order is preserved.
    """
    candidates = item.find_all('img', 'lazy lazy-img none')
    return [
        tag['data-original']
        for tag in candidates
        if tag.has_attr('data-original')
    ]
def add_gifMap(name, links):
    """Register *links* in the module-level gifMap under names built from *name*.

    - no links: nothing is registered
    - one link: stored as '<name>.gif'
    - several links: stored as '<name>1.gif', '<name>2.gif', ... in order
    """
    global gifMap
    count = len(links)
    if count == 0:
        return
    if count == 1:
        gifMap[name + '.gif'] = links[0]
        return
    # Multiple images for one post get a 1-based numeric suffix.
    for position, url in enumerate(links, 1):
        gifMap[name + str(position) + '.gif'] = url
def get_gifs(num):
    """Parse the saved listing pages 1..num and fill gifMap.

    For every ``<div class="article">`` in 'page<n>.html' under htmlDir the
    name and image links are extracted and handed to add_gifMap().

    Args:
        num: number of saved listing pages to parse (pages are 1-based).
    """
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('Parsing pages...')
    # Size the bar from the argument so it matches the pages actually parsed.
    get_links_bar = tqdm(total=num, ascii=True)
    try:
        for n in range(1, num + 1):
            file_name = os.path.join(htmlDir, 'page' + str(n) + '.html')
            # Close the page file as soon as it is parsed instead of leaking
            # one open handle per page.
            with open(file_name, 'rb') as page:
                soup = bs4.BeautifulSoup(page, 'lxml')
            for item in soup.find_all('div', 'article'):
                gif_name = get_gif_name(n, item)
                gif_links = get_gif_links(item)
                add_gifMap(gif_name, gif_links)
            get_links_bar.update(1)
    finally:
        get_links_bar.close()
def download_gif(name):
    """Download the image registered as *name* in gifMap into gifDir.

    The file name is *name* with every character matched by ``pat``
    replaced by '_'. A file that already exists in gifDir is skipped.
    Failures are reported through tqdm.write and otherwise ignored, so one
    bad image does not stop the other downloads. The shared
    ``get_gifs_bar`` is advanced in every case.

    Args:
        name: key into gifMap.
    """
    file_name = re.sub(pat, '_', name)
    # Check the directory the file is actually written to; the previous
    # check looked under htmlDir and therefore never found an existing file.
    target = os.path.join(gifDir, file_name)
    try:
        if os.path.exists(target):
            return
        # verify=False: TLS certificate verification is deliberately disabled.
        r = requests.get(gifMap[name], timeout=30, verify=False)
        r.raise_for_status()
        with open(target, 'wb') as fo:
            fo.write(r.content)
    except Exception:
        # Best effort per file; `Exception` keeps Ctrl-C / SystemExit working.
        tqdm.write('Download ' + name + ' fail...')
    finally:
        get_gifs_bar.update(1)
def downloader():
    """Download every image registered in gifMap using 8 worker threads.

    Creates the module-level ``get_gifs_bar`` progress bar (one tick per
    gifMap entry), runs download_gif() for each key and closes the bar when
    all workers are done.
    """
    global get_gifs_bar
    for line in ('.', '.', '.', 'Downloading gifs...'):
        tqdm.write(line)
    get_gifs_bar = tqdm(total=len(gifMap), ascii=True)
    workers = ThreadPool(8)
    workers.map(download_gif, gifMap.keys())
    workers.close()
    workers.join()
    get_gifs_bar.close()
def download_page(num):
    """Fetch listing page *num* and save it as 'page<num>.html' in htmlDir.

    Advances the shared ``get_pages_bar`` by one when the page is written.

    Args:
        num: 1-based page number appended to baseUrl as '?page=<num>'.
    """
    url = baseUrl + '?page=' + str(num)
    file_name = os.path.join(htmlDir, 'page' + str(num) + '.html')
    content = get_html_text(url)
    # The file is opened in binary mode, so encode text explicitly instead
    # of relying on the interpreter's default encoding.
    if not isinstance(content, bytes):
        content = content.encode('utf-8')
    with open(file_name, 'wb') as fo:
        fo.write(content)
    get_pages_bar.update(1)
def set_env():
    """Prepare the working directories and choose how many pages to crawl.

    When gifDir already holds more than 5000 files, total_pages is lowered
    to 10 so that only the newest pages are fetched. gifDir is created if
    it is missing; htmlDir is always recreated empty.
    """
    global total_pages
    if os.path.exists(gifDir):
        existing = 0
        for _root, _dirs, files in os.walk(gifDir):
            existing += len(files)
        if existing > 5000:
            total_pages = 10
            tqdm.write('Find many gifs in dir, just update gifs...')
    else:
        os.mkdir(gifDir)
    # Start from a clean scratch directory for the listing pages.
    if os.path.exists(htmlDir):
        shutil.rmtree(htmlDir)
    os.mkdir(htmlDir)
def main():
    """Run the whole crawl: set up, fetch pages, parse them, download GIFs.

    The scratch directory htmlDir is removed once the downloads finish.

    Returns:
        0, used as the process exit status.
    """
    set_env()
    get_pages(total_pages)
    get_gifs(total_pages)
    downloader()
    shutil.rmtree(htmlDir)
    tqdm.write('Congratulations!!!')
    tqdm.write('All pictures in folder : gifs...')
    tqdm.write('Just open the folder and enjoy yourself!!!')
    # 'pause' is a Windows shell built-in that keeps the console window
    # open; elsewhere it only produces a "command not found" error.
    if os.name == 'nt':
        os.system('pause')
    return 0
if __name__ == "__main__":
    # download_gif() requests images with verify=False; silence the warning
    # urllib3 would otherwise print for every such request.
    urllib3.disable_warnings()
    if sys.version_info[0] < 3:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        # Neither reload() as a builtin nor sys.setdefaultencoding() exists
        # on Python 3, where this step is unnecessary.
        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf-8')
    sys.exit(main())
程序运行示例
首次执行会下载1000页的动图,请耐心等待,根据网络状况不同,可能需要30分钟的时间,并且确保磁盘有13G的空间
完整执行过一次该软件后,再次执行只会更新最新10页动图
执行完毕后,所有动图保存在当前文件夹下的gifs文件夹中