spider.py
# -*- coding:utf-8 -*-
"""Scrape the movie board at maoyan.com/board/4 and store the entries as JSON.

Ten board pages (offsets 0, 10, ..., 90) are fetched by a process pool.
Every entry found on a page is echoed to stdout and appended to
``result.txt`` as one JSON object per line.
"""
import codecs
import json
import re
import sys
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 '
                   'SE 2.X MetaSr 1.0'),
}

# Seconds to wait for the server before abandoning a request; without a
# timeout a stalled connection would block its worker process forever.
_REQUEST_TIMEOUT = 10

# One <dd>...</dd> element per board entry.  Capture groups, in order:
# rank, poster URL, title, cast line, release line, and the two halves of
# the score.  re.S lets '.' cross the newlines inside an entry.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>'
    r'.*?data-src="(.*?)"'
    r'.*?<a.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,
)


def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Returns ``None`` when the request raises a ``RequestException``
    (connection failure, timeout, ...) or when the HTTP status is not 200.
    """
    try:
        response = requests.get(url, headers=headers,
                                timeout=_REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per board entry found in *html*.

    Each dict has the keys ``index``, ``image``, ``title``, ``actor``,
    ``time`` and ``score``; all values are strings.  Nothing is yielded
    when *html* is ``None`` or empty, so the result of a failed
    ``get_one_page`` call can be passed in directly.
    """
    if not html:
        return
    for match in _MOVIE_PATTERN.finditer(html):
        (index, image, title, actor,
         release_time, integer, fraction) = match.groups()
        yield {
            'index': index,
            'image': image,
            'title': title,
            # The slices drop a fixed-length prefix from each line --
            # presumably the 3- and 5-character labels the site puts in
            # front of the cast and release date; verify against the page.
            'actor': actor.strip()[3:],
            'time': release_time.strip()[5:],
            # The score arrives split in two elements (e.g. "9." and "5").
            'score': integer + fraction,
        }


def save_to_file(content):
    """Append *content* to ``result.txt`` as a single line of JSON.

    ``ensure_ascii=False`` keeps non-ASCII text readable in the file, which
    is written as UTF-8.  Records are newline-terminated so the file can be
    read back one record per line.
    """
    with codecs.open('result.txt', 'a', 'utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch the board page at *offset*, then print and save its entries.

    A page that cannot be fetched is reported on stderr and skipped, so one
    failed request does not abort the other pages handled by the pool.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        sys.stderr.write('Failed to fetch %s\n' % url)
        return
    for item in parse_one_page(html):
        # Single-argument call form: valid both as a Python 2 print
        # statement and as a Python 3 function call.
        print(json.dumps(item, ensure_ascii=False))
        save_to_file(item)


if __name__ == '__main__':
    pool = Pool()
    try:
        pool.map(main, range(0, 100, 10))
    finally:
        # Release the worker processes even when a page handler raised.
        pool.close()
        pool.join()