Give every link its own independent cache: if a cache file already exists, read it directly; if not, fetch the page, serialize it, and save it locally.
The directory handling is still fairly simple and may be improved later.
#!/usr/bin/env python
#coding:utf-8
#Created by Andy @ 2017/6/28

import os
import hashlib
import urllib.request
import random
import time
import gzip
import pickle

# Simple counter-measure against anti-scraping: pick a random header each time
headers = [{'User-Agent': 'Mozilla/5.0 (Windows NT 6.3 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36'},
           {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'},
           {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)'}]


def cache_html(url, header):
    # Hash each URL to a unique hex digest and use it as the file name,
    # guaranteeing one cache file per link
    md = hashlib.md5()
    md.update(url.encode(encoding='utf8'))
    file_name = md.hexdigest()
    # If the cache file already exists, read it instead of fetching
    path = os.path.join(base_path, file_name)
    if os.path.exists(path) and os.path.getsize(path):
        print("Cache file already exists!")
        with open(path, 'rb') as read_f:
            html = pickle.load(read_f)
        try:
            html = gzip.decompress(html).decode('utf8')  # some sites serve gzip-compressed responses
        except OSError:
            html = html.decode('utf8')
    else:
        req = urllib.request.Request(url, headers=header)
        html = urllib.request.urlopen(req).read()
        if not html:
            print("Connection failed...")
        else:
            time.sleep(random.randint(1, 3))
            with open(path, 'wb') as write_f:  # write to the hashed path, not the bare file name
                pickle.dump(html, write_f)
            try:
                html = gzip.decompress(html).decode('utf8')
            except OSError:
                html = html.decode('utf8')
    return html


if __name__ == '__main__':
    header = random.choice(headers)
    base_path = os.path.dirname(os.path.abspath(__file__))
    url = 'http://www.python.org'
    html = cache_html(url, header)
    print(html)
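To see why this gives exactly one cache file per link, here is a minimal sketch of the hashing step on its own (the URL is just an example):

import hashlib

url = 'http://www.python.org'
file_name = hashlib.md5(url.encode('utf8')).hexdigest()
print(file_name)  # a 32-character hex string; the same URL always yields the same file name

Because MD5 is deterministic, repeated calls for the same URL always land on the same file, while different URLs practically never collide.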
Below, the caching code above is turned into a module and used to build a scraper for Douban's Top 250 movie ranking. Douban applies some anti-scraping measures,
so the cache earns its keep here: once a page has been fetched successfully, every later run reads it from the local cache instead of sending another request to Douban:
#!/usr/bin/env python
#coding:utf-8
#Created by Andy @ 2017/6/28

import os
import hashlib
import urllib.request
import random
import time
import gzip
import pickle

# Simple counter-measure against anti-scraping: pick a random header each time
headers = [{'User-Agent': 'Mozilla/5.0 (Windows NT 6.3 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36'},
           {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'},
           {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)'}]

header = random.choice(headers)
# Keep all cache files in a dedicated "cache" directory next to this module
base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
if not os.path.exists(base_path):
    os.mkdir(base_path)


def cache_html(url, header=header):
    # Hash each URL to a unique hex digest and use it as the file name,
    # guaranteeing one cache file per link
    md = hashlib.md5()
    md.update(url.encode(encoding='utf8'))
    file_name = md.hexdigest()
    # If the cache file already exists, read it instead of fetching
    path = os.path.join(base_path, file_name)
    if os.path.exists(path) and os.path.getsize(path):
        print("Cache file already exists!")
        with open(path, 'rb') as read_f:
            html = pickle.load(read_f)
        try:
            html = gzip.decompress(html).decode('utf8')  # some sites serve gzip-compressed responses
        except OSError:
            html = html.decode('utf8')
    else:
        req = urllib.request.Request(url, headers=header)
        html = urllib.request.urlopen(req).read()
        time.sleep(random.randint(2, 5))
        if not html:
            print("Connection failed...")
        else:
            with open(path, 'wb') as write_f:
                pickle.dump(html, write_f)
            try:
                html = gzip.decompress(html).decode('utf8')
            except OSError:
                html = html.decode('utf8')
    return html
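Saved as cache_html.py, the module is used like this; a minimal usage sketch, assuming the module file sits next to the calling script:

from cache_html import cache_html

html = cache_html('http://www.python.org')  # first call: fetches, sleeps, writes the cache file
html = cache_html('http://www.python.org')  # second call: prints "Cache file already exists!" and reads locally

The Top 250 scraper below does exactly this, one request per ranking page: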
#!/usr/bin/env python
#coding:utf-8
#Created by Andy @ 2017/6/27

import re
import pickle
import sys
import io

from cache_html import cache_html

movie = []
# Rewrap stdout as UTF-8 so Chinese titles print correctly
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

# Ten ranking pages, 25 movies each
for i in range(10):
    page = i * 25
    url = "https://movie.douban.com/top250?start=%s&filter=" % page
    html = cache_html(url)
    # Capture rank, title, director line, rating and rating count from each item
    p = re.compile(r'<div class="item">.*?<em class="">(\d{1,3})</em>.*?'
                   r'<span class="title">([\u4e00-\u9fa5]{0,})</span>.*?'
                   r'<p class="">(.+?)</p>.*?<span.*?>(\d\.\d)</span>.*?'
                   r'<span>(\d{1,7}).+?</span>', re.DOTALL)
    res = p.findall(html)
    for m in res:
        movie_dic = {}
        index, name, director, grade, estimate = m[0], m[1], m[2], m[3], m[4]
        movie_dic['index'] = index
        movie_dic['name'] = name
        movie_dic['director'] = director.replace(' ', '').replace('...<br> ', '').replace(' ', '').strip()
        movie_dic['grade'] = grade
        movie_dic['estimate'] = estimate
        movie.append(movie_dic)

# Serialize the full list once, after all pages have been parsed
with open('movie.pkl', 'wb') as f:
    pickle.dump(movie, f)

# Movies with more than 500,000 ratings
print([m['name'] for m in movie if int(m['estimate']) > 500000])
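Since the result list is pickled to movie.pkl, a later analysis script can reload it without touching the network at all; a minimal sketch:

import pickle

with open('movie.pkl', 'rb') as f:
    movie = pickle.load(f)

print(len(movie))  # should be 250 if all ten pages parsed cleanly
print(movie[0])    # a dict with the keys index, name, director, grade, estimate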