基本配置:阿里云服务器低配,单核2G内存
首先看协程(gevent)的效果:

"""Benchmark: scrape song-list pages concurrently with gevent greenlets.

Usage: python SCRIPT START END
Fetches pages START (inclusive) to END (exclusive), one greenlet per page,
and prints the per-page and total elapsed time.
"""
# Patch the standard library BEFORE requests is imported. requests pulls in
# socket/ssl at import time, and gevent's documentation asks for patching as
# early as possible; patching ssl after it was imported emits a
# MonkeyPatchWarning and may lead to errors.
from gevent import monkey
monkey.patch_all()

import sys  # noqa: E402
import time  # noqa: E402

import gevent  # noqa: E402
import lxml.html as HTML  # noqa: E402
import requests  # noqa: E402

BASE_URL = 'http://grri94kmi4.app.tianmaying.com/songs?page='
# Seconds to wait for the server; without a timeout a stalled connection
# would block its greenlet forever.
REQUEST_TIMEOUT = 30

# create url: one listing page per number in [START, END)
urls = [
    BASE_URL + str(page)
    for page in range(int(sys.argv[1]), int(sys.argv[2]))
]


def get_data(url):
    """Download one listing page and return its table rows as dicts.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list with one dict per ``//tbody/tr`` row, holding the keys
        ``name``, ``url``, ``id`` and ``comment``. The list is empty when
        the server does not answer with HTTP 200.

    Raises:
        requests.RequestException: If the request fails or times out.
        IndexError: If a row has no link or no text in its last cell.
    """
    t1 = time.time()
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    if res.status_code != 200:
        # Do not try to parse an error page as a song table.
        print(url + ' : ' + 'url open failed, status: ' + str(res.status_code))
        return []
    print(url + ' : ' + 'url open success' + ' time use: ' + str(time.time() - t1))

    html = HTML.fromstring(res.content)
    data = []
    for tr in html.xpath('//tbody/tr'):
        # lxml returns "smart" string objects that keep a reference to their
        # parent element; str() detaches the values from the parsed tree.
        name = str(tr.xpath('./td/a/text()')[0])
        link = str(tr.xpath('./td/a/@href')[0])
        comment = str(tr.xpath('./td[last()]/text()')[0])
        data.append({
            'name': name,
            'url': link,
            # NOTE(review): drops the first 30 characters of the href,
            # presumably a fixed prefix in front of the song id -- confirm
            # against the page markup.
            'id': link[30:],
            'comment': comment,
        })
    return data


if __name__ == '__main__':
    total = time.time()
    tasks = [gevent.spawn(get_data, url) for url in urls]
    gevent.joinall(tasks)
    print('total time use :', time.time() - total)
    # Greenlet.value is None when the greenlet died with an exception.
    print('records scraped :', sum(len(task.value or []) for task in tasks))
在爬取20个链接的情况下,总用时约4.9s:
total time use : 4.873192071914673
线程和进程的耗时差不多,都在6s左右(下面是 multiprocessing 进程池的版本):
"""Benchmark: scrape song-list pages with a multiprocessing process pool.

Usage: python SCRIPT START END
Fetches pages START (inclusive) to END (exclusive) in worker processes and
prints the per-page and total elapsed time.
"""
import sys
import time
# multiprocessing.Pool is a pool of worker *processes*; it is imported under
# its real name (the thread-backed variant is multiprocessing.dummy.Pool).
from multiprocessing import Pool

import lxml.html as HTML
import requests

BASE_URL = 'http://grri94kmi4.app.tianmaying.com/songs?page='
# Seconds to wait for the server; without a timeout a stalled connection
# would block its worker forever.
REQUEST_TIMEOUT = 30
# Pool() defaults to one worker per CPU, which on a single-core host means
# the pages are fetched one after another. The work is network-bound, so the
# pool size is set explicitly instead. Tune as needed.
MAX_WORKERS = 8

# create url: one listing page per number in [START, END)
urls = [
    BASE_URL + str(page)
    for page in range(int(sys.argv[1]), int(sys.argv[2]))
]


def get_data(url):
    """Download one listing page and return its table rows as dicts.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list with one dict per ``//tbody/tr`` row, holding the keys
        ``name``, ``url``, ``id`` and ``comment``. The list is empty when
        the server does not answer with HTTP 200.

    Raises:
        requests.RequestException: If the request fails or times out.
        IndexError: If a row has no link or no text in its last cell.
    """
    t1 = time.time()
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    if res.status_code != 200:
        # Do not try to parse an error page as a song table.
        print(url + ' : ' + 'url open failed, status: ' + str(res.status_code))
        return []
    print(url + ' : ' + 'url open success' + ' time use: ' + str(time.time() - t1))

    html = HTML.fromstring(res.content)
    data = []
    for tr in html.xpath('//tbody/tr'):
        # lxml returns "smart" string objects that keep a reference to their
        # parent element; str() yields plain strings, so the records can be
        # pickled and sent back from the worker process.
        name = str(tr.xpath('./td/a/text()')[0])
        link = str(tr.xpath('./td/a/@href')[0])
        comment = str(tr.xpath('./td[last()]/text()')[0])
        data.append({
            'name': name,
            'url': link,
            # NOTE(review): drops the first 30 characters of the href,
            # presumably a fixed prefix in front of the song id -- confirm
            # against the page markup.
            'id': link[30:],
            'comment': comment,
        })
    return data


if __name__ == '__main__':
    total = time.time()
    # Pool(processes=0) raises ValueError, so keep at least one worker even
    # when the requested page range is empty.
    pool = Pool(processes=max(1, min(MAX_WORKERS, len(urls))))
    try:
        results = pool.map(get_data, urls)
    finally:
        # Shut the workers down gracefully even if a page raised.
        pool.close()
        pool.join()
    print('total time use :', time.time() - total)
    print('records scraped :', sum(len(rows) for rows in results))