# Author:Winter Liu
"""Scrape thread-title anchors from a paginated listing into ``Bpage.txt``.

The script downloads a fixed sequence of listing pages, extracts every
anchor matching ``LINK_PATTERN`` and writes the matches, followed by the
elapsed wall-clock time, to ``OUTPUT_PATH``.
"""
import urllib.request
import re
import time

# URL of the first listing page (redacted in the source).
FIRST_PAGE_URL = 'http:/*****'
# Prefix to which the page number is appended for every later page.
PAGE_URL_PREFIX = "http://*******"
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
OUTPUT_PATH = 'Bpage.txt'

# Compiled once instead of being looked up on every page.
# NOTE(review): ``[dd.dd]`` is a character class that matches ONE character,
# either 'd' or '.'.  It looks like it was meant to match a literal bracketed
# date such as ``[05.12]`` (i.e. ``\[\d\d\.\d\d\]``) -- confirm against the
# real page markup before changing it.
LINK_PATTERN = re.compile(
    r'<a href=.{20,40} id=.{10,30}>[dd.dd] .{10,50}</a></h3>'
)


def page_urls(first_url=FIRST_PAGE_URL, prefix=PAGE_URL_PREFIX,
              first_number=2, last_number=998):
    """Yield the page URLs in the order they are fetched.

    The first URL is ``first_url``; it is followed by ``prefix`` plus each
    page number from ``first_number`` to ``last_number`` inclusive.

    NOTE(review): the original loop ran ``for i in range(2, 1000)`` but built
    the URL for page ``i`` only *after* fetching, so page 999 was never
    requested.  The default ``last_number=998`` preserves that; raise it to
    999 if the last page was meant to be included.
    """
    yield first_url
    for number in range(first_number, last_number + 1):
        yield prefix + str(number)


def extract_links(html):
    """Return every substring of ``html`` that matches ``LINK_PATTERN``."""
    return LINK_PATTERN.findall(html)


def fetch_html(url):
    """Download ``url`` with the browser-like headers and decode it as UTF-8.

    Errors from ``urlopen`` or from decoding propagate to the caller.
    """
    request = urllib.request.Request(url=url, headers=HEADERS)
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def crawl(urls):
    """Fetch each URL in ``urls`` and return all extracted links in order."""
    links = []
    for url in urls:
        links.extend(extract_links(fetch_html(url)))
    return links


def main():
    """Run the crawl and write the links plus the elapsed time to disk."""
    start_time = time.time()
    result = crawl(page_urls())

    # ``with`` guarantees the file is closed even if a write raises.
    with open(OUTPUT_PATH, 'w', encoding='UTF-8') as f:
        # Entries are separated by a single space, as in the original output.
        f.writelines(link + ' ' for link in result)
        end_time = time.time()
        # Runtime message ("elapsed: N seconds") is kept verbatim.
        f.write("耗时:{}秒".format(end_time - start_time))


if __name__ == '__main__':
    main()