1. Browser-disguise wrapper class HeadersHelper.py
import urllib.request
import http.cookiejar


class HeadersHelper:
    def __init__(self, url, path=None):
        # Percent-encode the URL, keeping the scheme and query delimiters intact
        self.url = urllib.request.quote(url, safe='/:?=', encoding='utf-8')
        self.path = path

    # Set request headers that closely mimic a real browser
    def set_Headers(self):
        # Note "Accept-Encoding": "gb2312, utf-8": advertising charsets instead of
        # compression schemes discourages the server from returning gzip-compressed
        # data, which would come out garbled when decoded as text.
        # Host is not hard-coded; urllib fills it in from the target URL.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Encoding": "gb2312, utf-8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
            "Connection": "keep-alive",
        }
        cjar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
        opener.addheaders = list(headers.items())
        # Install globally so every later urlopen() call carries these headers
        urllib.request.install_opener(opener)

    # Return the page content as text
    def feedbak_info(self):
        self.set_Headers()
        # Some sites serve utf-8, others gbk
        # (e.g. http://fjrs168.blog.hexun.com needs gbk)
        data = urllib.request.urlopen(self.url).read()
        try:
            info = data.decode('utf-8')
        except UnicodeDecodeError:
            info = data.decode('gbk')
        return info

    # Save the raw page content to self.path
    def save_InFile(self):
        self.set_Headers()
        info = urllib.request.urlopen(self.url).read()
        with open(self.path, 'wb') as file:
            file.write(info)
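Because set_Headers() installs a global opener, every later urllib.request.urlopen() call in the process carries the disguised headers and shares the cookie jar. If that global side effect is not wanted, the same headers can be attached to a single request instead; the sketch below shows that alternative (the fetch_disguised helper is an illustration, not part of HeadersHelper):

import urllib.request

# Per-request alternative (illustrative sketch): the headers are sent only with
# this one Request object; nothing is installed globally.
def fetch_disguised(url):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    }
    req = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(req).read()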
2. Test script headershelper_test.py
from HeadersHelper import HeadersHelper

url = "https://www.zhibo8.cc"

# ==============================
# hh = HeadersHelper(url)
# print(hh.feedbak_info())
# ==============================

path = "E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo5/headershelper.html"
hh = HeadersHelper(url, path=path)
hh.save_InFile()
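If the run succeeds, headershelper.html is written as raw bytes. A quick sanity check (assuming the path above is valid on your machine) is to re-read the file and decode it with the same utf-8-then-gbk fallback the class uses:

# Re-read the saved file and print the first few hundred characters
with open(path, 'rb') as f:
    raw = f.read()
try:
    print(raw.decode('utf-8')[:300])
except UnicodeDecodeError:
    print(raw.decode('gbk')[:300])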