zoukankan      html  css  js  c++  java
  • 缓存网页

    实现每个链接都能独立缓存:如果缓存文件已存在,则直接读取;如果不存在,则先获取网页,序列化后再保存到本地

    目录功能比较简单:后续可能会改进

    #!/usr/bin/env python
    #coding:utf-8
    #Created by Andy @ 2017/6/28
    
    
    import os
    import hashlib
    import urllib.request
    import random
    import time
    import gzip
    import pickle
    
    # Naive counter-measure against anti-scraping checks: several browser
    # User-Agent strings are kept here so that one can be picked at random.
    _USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 6.3 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)',
    )
    # One request-header dict per User-Agent, in the same order as above.
    headers = [{'User-Agent': agent} for agent in _USER_AGENTS]
    
    
    def _decode_html(raw):
        """Return raw page bytes as UTF-8 text, gunzipping them first if needed."""
        # Some sites send gzip-compressed bodies. A gzip stream always starts
        # with the magic bytes 0x1f 0x8b, which can never begin valid UTF-8
        # text, so the check cannot misfire on a plain page.
        if raw[:2] == b'\x1f\x8b':
            raw = gzip.decompress(raw)
        return raw.decode('utf8')


    def cache_html(url, header, cache_dir=None):
        """Return the HTML text of *url*, using a one-file-per-URL disk cache.

        The cache file is named after the MD5 hex digest of the URL, so every
        distinct link gets its own file. If a non-empty cache file exists it is
        read instead of contacting the server; otherwise the page is downloaded
        and the raw response bytes are pickled into the cache.

        Args:
            url: Address of the page to fetch.
            header: Dict of HTTP request headers passed to urllib.
            cache_dir: Directory holding the cache files. When omitted, the
                module-level ``base_path`` is used, so that name must be
                defined before the call.

        Returns:
            The page decoded as UTF-8 text ('' if the server sent an empty body).
        """
        if cache_dir is None:
            cache_dir = base_path

        file_name = hashlib.md5(url.encode(encoding='utf8')).hexdigest()
        path = os.path.join(cache_dir, file_name)

        # Cache hit: a non-empty file for this URL already exists.
        if os.path.exists(path) and os.path.getsize(path):
            print("Cache file already exist!")
            with open(path, 'rb') as read_f:
                # NOTE: pickle executes code while loading; the cache directory
                # must only ever contain files written by this function.
                raw = pickle.load(read_f)
            return _decode_html(raw)

        # Cache miss: download the page and close the connection afterwards.
        req = urllib.request.Request(url, headers=header)
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()

        if not raw:
            print("Connection failed...")
        else:
            # Short random pause so consecutive downloads are spaced out.
            time.sleep(random.randint(1, 3))
            # Write to the same path that the lookup above checks; writing to
            # the bare file name would land in the current working directory
            # and the cache would never be found again.
            with open(path, 'wb') as write_f:
                pickle.dump(raw, write_f)

        return _decode_html(raw)
    
    if __name__ == '__main__':
        # Cache files are stored in the directory that contains this script;
        # cache_html reads this module-level name.
        base_path = os.path.split(os.path.abspath(__file__))[0]

        # Use one randomly chosen header set for this run.
        header = random.choice(headers)

        # Demo: fetch the page (or read it from the cache) and print it.
        url = 'http://www.python.org'
        html = cache_html(url, header)
        print(html)
    

    下面是将上面的缓存网页代码当作一个模块来调用,实现爬取豆瓣电影排行 Top 250 的爬虫,因为豆瓣采取了一定的反爬虫策略

    所以这里的缓存的作用就在于,只要成功爬取一次网页,后面就可以从本地读取缓存,而不用对豆瓣发起请求:

    #!/usr/bin/env python
    #coding:utf-8
    #Created by Andy @ 2017/6/28
    
    
    import os
    import hashlib
    import urllib.request
    import random
    import time
    import gzip
    import pickle
    
    # Naive counter-measure against anti-scraping checks: several browser
    # User-Agent header sets, one of which is picked at random below.
    headers = [{'User-Agent': 'Mozilla/5.0 (Windows NT 6.3 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36'},
               {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'},
               {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)'}]

    # Header set chosen once, when the module is imported.
    header = random.choice(headers)

    # Cache files live in a "cache" directory next to this module.
    base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
    # exist_ok avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs(base_path, exist_ok=True)
    
    def _decode_html(raw):
        """Return raw page bytes as UTF-8 text, gunzipping them first if needed."""
        # Some sites send gzip-compressed bodies. A gzip stream always starts
        # with the magic bytes 0x1f 0x8b, which can never begin valid UTF-8
        # text, so the check cannot misfire on a plain page.
        if raw[:2] == b'\x1f\x8b':
            raw = gzip.decompress(raw)
        return raw.decode('utf8')


    def cache_html(url, header=None, cache_dir=None):
        """Return the HTML text of *url*, using a one-file-per-URL disk cache.

        The cache file is named after the MD5 hex digest of the URL, so every
        distinct link gets its own file. If a non-empty cache file exists it is
        read and no request is made; otherwise the page is downloaded and the
        raw response bytes are pickled into the cache.

        Args:
            url: Address of the page to fetch.
            header: Dict of HTTP request headers. When omitted, one entry of the
                module-level ``headers`` list is picked at random for this
                request (only evaluated when a download is actually needed).
            cache_dir: Directory holding the cache files. When omitted, the
                module-level ``base_path`` is used.

        Returns:
            The page decoded as UTF-8 text ('' if the server sent an empty body).
        """
        if cache_dir is None:
            cache_dir = base_path

        file_name = hashlib.md5(url.encode(encoding='utf8')).hexdigest()
        path = os.path.join(cache_dir, file_name)

        # Cache hit: a non-empty file for this URL already exists.
        if os.path.exists(path) and os.path.getsize(path):
            print("Cache file already exist!")
            with open(path, 'rb') as read_f:
                # NOTE: pickle executes code while loading; the cache directory
                # must only ever contain files written by this function.
                raw = pickle.load(read_f)
            return _decode_html(raw)

        # Cache miss: download the page and close the connection afterwards.
        if header is None:
            header = random.choice(headers)
        req = urllib.request.Request(url, headers=header)
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()

        # Random pause after every real request so downloads are spaced out.
        time.sleep(random.randint(2, 5))

        if not raw:
            print("Connection failed...")
        else:
            with open(path, 'wb') as write_f:
                pickle.dump(raw, write_f)

        return _decode_html(raw)
    
    #!/usr/bin/env python
    #coding:utf-8
    #Created by Andy @ 2017/6/27
    
    
    import urllib.request
    import re
    import random
    import pickle
    import time
    import sys
    import io
    from cache_html import cache_html
    
    
    # Matches one entry of the Top-250 list and captures, in order: the rank,
    # the (Chinese-only) title, the director/cast paragraph, the rating and the
    # number of ratings. Compiled once because it is reused for every page.
    MOVIE_PATTERN = re.compile(
        r'<div class="item">.*?<em class="">(\d{1,3})</em>'
        r'.*?<span class="title">([\u4e00-\u9fa5]{0,})</span>'
        r'.*?<p class="">(.+?)</p>'
        r'.*?<span.*?>(\d\.\d)</span>'
        r'.*?<span>(\d{1,7}).+?</span>',
        re.DOTALL)

    # Markup noise stripped from the director/cast paragraph, in this order.
    # NOTE(review): the published snippet lost its escapes; '&nbsp;' / '\xa0'
    # are presumed to be what the first, otherwise redundant, replace(' ', '')
    # originally removed -- confirm against the live page.
    _DIRECTOR_NOISE = ('&nbsp;', '\xa0', '...<br>\n', ' ')


    def parse_movies(html):
        """Extract the movie entries found in one Top-250 page.

        Args:
            html: Page source as text.

        Returns:
            A list of dicts with the string-valued keys 'index', 'name',
            'director', 'grade' and 'estimate', in page order. Empty when
            nothing matches.
        """
        movies = []
        for index, name, director, grade, estimate in MOVIE_PATTERN.findall(html):
            for noise in _DIRECTOR_NOISE:
                director = director.replace(noise, '')
            movies.append({
                'index': index,
                'name': name,
                'director': director.strip(),
                'grade': grade,
                'estimate': estimate,
            })
        return movies


    def main():
        """Scrape the ten Top-250 pages, pickle the result and print the hits."""
        # Force UTF-8 output so Chinese titles print on any console encoding.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

        movie = []
        for i in range(10):
            # Each page lists 25 movies; 'start' is the offset of the first one.
            url = "https://movie.douban.com/top250?start=%s&filter=" % (i * 25)
            movie.extend(parse_movies(cache_html(url)))

        # Write the complete list once, after all pages have been collected.
        with open('movie.pkl', 'wb') as f:
            pickle.dump(movie, f)

        # Movies with more than 500000 ratings.
        print([m['name'] for m in movie if int(m['estimate']) > 500000])


    if __name__ == '__main__':
        main()
    
  • 相关阅读:
    Ajax beforeSend和complete 方法与防止重复提交
    tablesorter周边文档
    对委托的一些短浅理解
    Nginx核心要领五:worker_processes、worker_connections设置
    二进制安装k8s 教程
    安装 Docker Engine-Community
    centos7.x 安装 NodeJS、yarn、pm2
    cfssl
    k8s各个进程 占用内存大小
    Linux下查看某一进程所占用内存的方法
  • 原文地址:https://www.cnblogs.com/Andy963/p/7103352.html
Copyright © 2011-2022 走看看