提取猫眼电影TOP100的电影名称、时间、评分、图片等信息,提取的站点URL为 http://maoyan.com/board/4 ,提取的结果会以文件形式保存下来。
from multiprocessing import Pool
import json
import re

import requests
from requests.exceptions import RequestException

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block a worker process forever.
REQUEST_TIMEOUT = 10

# One match per <dd> entry of the board page. Capture groups, in order:
# rank, poster URL, title, cast text, release text, integer part of the
# score, fractional part of the score. Raw strings keep the backslash in
# \d intact; re.S lets '.' span the line breaks inside each <dd>.
_ITEM_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Returns None when the request raises RequestException (including a
    timeout) or when the HTTP status code is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Each dict has the keys 'index', 'image', 'title', 'actor', 'time' and
    'score', all strings. Yields nothing when *html* is None or empty, or
    when no entry matches.
    """
    if not html:
        return
    for item in _ITEM_PATTERN.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Drops the first 3 characters after stripping whitespace;
            # presumably a fixed label prefix on the cast line.
            'actor': item[3].strip()[3:],
            # Drops the first 5 characters after stripping whitespace;
            # presumably a fixed label prefix on the release line.
            'time': item[4].strip()[5:],
            # The page splits the score into two tags, e.g. "9." and "6".
            'score': item[5] + item[6],
        }


def write_to_file(content):
    """Append *content* to result.txt as one JSON document per line.

    ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file
    instead of writing \\uXXXX escapes.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Scrape the board page starting at *offset*, print and save each entry.

    Does nothing when the page could not be fetched.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Ten pages of ten entries each, fetched in parallel worker processes.
    # NOTE(review): workers append to the same result.txt concurrently, so
    # the order of records across pages is not deterministic.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])