Crawling proxies
For a detailed walkthrough of urllib in Python 3 (headers, proxies, timeouts, authentication, exception handling), see https://www.cnblogs.com/ifso/p/4707135.html
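A minimal sketch of how a proxy list might be crawled with urllib and saved into the proxy.txt consumed by the validator below. The list URL, the browser User-Agent string, and the ip/port regex are placeholders and would have to be adapted to whatever proxy-list site is actually used:

import re
import urllib.request

# NOTE: the list URL and the parsing regex are placeholders; a real proxy-list
# site needs its own URL and page-specific parsing.
LIST_URL = 'http://www.example.com/free-proxy-list'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}


def crawl_proxies(url=LIST_URL, timeout=5):
    """Fetch the page with a spoofed User-Agent and pull out ip/port pairs."""
    req = urllib.request.Request(url, headers=HEADERS)
    try:
        response = urllib.request.urlopen(req, timeout=timeout)
        html = response.read().decode('utf-8', errors='replace')
    except Exception as e:      # timeouts, connection errors, HTTP errors
        print('fetch failed: %s' % e)
        return []
    return re.findall(r'(\d{1,3}(?:\.\d{1,3}){3})\D+(\d{2,5})', html)


if __name__ == '__main__':
    with open('proxy.txt', 'w') as fp:
        for ip, port in crawl_proxies():
            # one "ip port protocol" line, the format the validator below expects
            fp.write('%s %s HTTP\n' % (ip, port))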
Validating proxies
import urllib.request
import re
import threading


class TestProxy(object):
    def __init__(self):
        self.sFile = r'proxy.txt'        # input: one "ip port protocol" entry per line
        self.dFile = r'alive.txt'        # output: proxies that actually worked
        self.URL = r'http://www.baidu.com/'
        self.threads = 10
        self.timeout = 3
        self.regex = re.compile(r'baidu.com')
        self.aliveList = []

        self.run()

    def run(self):
        with open(self.sFile, 'r') as fp:
            lines = fp.readlines()
        threadList = []
        while lines:
            # launch checker threads in batches of self.threads
            for i in range(self.threads):
                if not lines:
                    break
                t = threading.Thread(target=self.linkWithProxy, args=(lines.pop(),))
                t.start()
                threadList.append(t)
        # wait for every checker to finish before writing the results
        for t in threadList:
            t.join()

        with open(self.dFile, 'w') as fp:
            for aliveLine in self.aliveList:
                fp.write(aliveLine)

    def linkWithProxy(self, line):
        lineList = line.split()          # "ip port protocol", trailing newline dropped
        protocol = lineList[2].lower()
        server = protocol + r'://' + lineList[0] + ':' + lineList[1]
        # install an opener that routes requests through this proxy
        opener = urllib.request.build_opener(urllib.request.ProxyHandler({protocol: server}))
        urllib.request.install_opener(opener)
        try:
            response = urllib.request.urlopen(self.URL, timeout=self.timeout)
        except:
            print('%s connect failed' % server)
            return
        else:
            try:
                strli = response.read()
            except:
                print('%s connect failed' % server)
                return
            if self.regex.search(strli):
                print('%s connect success ..........' % server)
                self.aliveList.append(line)


if __name__ == '__main__':
    TP = TestProxy()
When this runs, the strli = response.read() line raises TypeError: cannot use a string pattern on a bytes-like object, because urlopen returns bytes while the compiled pattern is a str.
Fix: strli = response.read().decode('utf-8')
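If the target page is not guaranteed to be UTF-8, a slightly more defensive variant (not from the original post) reads the charset from the response headers and falls back to utf-8:

charset = response.headers.get_content_charset() or 'utf-8'
strli = response.read().decode(charset, errors='replace')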
Anti-crawling
1. robots protocol: when a crawler visits a site, it checks whether robots.txt exists under the root directory; if it does, the crawler limits its scope to what the file allows.
Workarounds: A. Disguise the crawler as a browser B. In Scrapy's settings file, set ROBOTSTXT_OBEY = False (see the settings sketch after this list)
2. Abnormal IP traffic: when a site notices an abnormal surge of traffic from a single IP, it bans that IP.
Workarounds: A. Increase the interval between requests and randomize it B. Change IP (sketch after this list)
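For a Scrapy project, both workarounds for the robots protocol come down to a couple of lines in settings.py (a sketch; the User-Agent value is just an example of a common browser string):

# settings.py (Scrapy)
ROBOTSTXT_OBEY = False   # B: stop honouring robots.txt
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36'   # A: look like a browser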
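And a plain-urllib sketch of both ideas against the IP ban, reusing the alive.txt produced above: sleep a random interval between requests and route each request through a randomly chosen working proxy. It assumes each alive.txt line is "ip port protocol" as written by the validator; the target URL is a placeholder:

import random
import time
import urllib.request


def fetch_with_random_proxy(url, alive_file='alive.txt', timeout=5):
    """Route the request through a randomly chosen verified proxy."""
    with open(alive_file) as fp:
        proxies = [line.split() for line in fp if line.strip()]
    ip, port, protocol = random.choice(proxies)
    protocol = protocol.lower()
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({protocol: '%s://%s:%s' % (protocol, ip, port)}))
    return opener.open(url, timeout=timeout).read()


if __name__ == '__main__':
    for page in range(1, 4):
        html = fetch_with_random_proxy('http://www.example.com/list?page=%d' % page)  # placeholder URL
        time.sleep(random.uniform(1, 5))   # random pause so request timing looks less robotic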