import requests
import re
import json
import time
from multiprocessing import Pool
from requests.exceptions import RequestException


# Fetch the HTML of a single page
def get_one_page(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/85.0.4183.121 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Parse the movie records out of a single page
def parser_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>'
                         r'.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>'
                         r'.*?</dd>', re.S)
    contents = re.findall(pattern, html)
    for content in contents:
        yield {  # a generator: each iteration produces one movie record
            'index': content[0],
            'image': content[1],
            'name': content[2].strip(),
            'actor': content[3].strip()[3:],   # drop the "主演:" prefix
            'time': content[4][5:],            # drop the "上映时间:" prefix
            'score': content[5] + content[6]   # integer part + fraction part
        }


# Append one record to the output file
def write_to_file(content):
    with open('猫眼电影.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        # no explicit f.close() needed: the with-block closes the file


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:          # skip the page if the request failed
        return
    for item in parser_one_page(html):
        write_to_file(item)


if __name__ == "__main__":
    time1 = time.time()
    for i in range(0, 100, 10):          # sequential crawl of the 10 pages
        main(i)
    time2 = time.time()
    pool = Pool()                        # use multiprocessing to speed up the crawl
    pool.map(main, [i * 10 for i in range(0, 10)])
    pool.close()
    pool.join()
    time3 = time.time()
    print(time2 - time1)                 # time taken by the for...in loop
    print(time3 - time2)                 # time taken by the multiprocessing pool
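Before timing the full crawl, it can help to sanity-check the parser on a single page. This is a minimal sketch that reuses the functions above; it assumes the site still serves the expected markup (and not an anti-crawling page) for offset 0:

html = get_one_page('http://maoyan.com/board/4?offset=0')
if html:                                  # None means the request failed
    for item in parser_one_page(html):
        print(item)                       # one dict per movie: index, image, name, actor, time, score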
The run times are as follows:
A supplementary note on understanding yield (see the short sketch after the link below):
Related blog post: https://blog.csdn.net/qq_33472765/article/details/80839417
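To make the yield in parser_one_page concrete, here is a minimal, self-contained sketch (the function name and sample data are made up for illustration). A function containing yield returns a generator: nothing in its body runs until the caller iterates, and each pass of the loop resumes the body just long enough to produce the next record instead of building the whole list in memory.

def parse_lines(lines):
    # Like parser_one_page above, this returns a generator:
    # one record is produced per iteration, on demand.
    for index, line in enumerate(lines, start=1):
        yield {'index': index, 'name': line.strip()}

records = parse_lines(['  Movie A ', ' Movie B'])   # no parsing has happened yet
for record in records:                              # records are produced lazily here
    print(record)                                   # {'index': 1, 'name': 'Movie A'} ...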