zoukankan      html  css  js  c++  java
  • Python 爬虫(2)多线程

    前面说过由于GIL的存在,Python的多线程效率没有希望的那么高,python的多线程适合IO密集型的情况,而爬虫恰好就是一个IO密集的情况,因为爬虫中很大一部分时间,是在等待socket返回数据。

    下面写一个例子:

    import requests
    import time
    
    if __name__ == '__main__':
        # Sequential version: each quote is requested only after the previous
        # response has been received and printed.
        codes = ['sh600993', 'sh000006', 'sh600658', 'sh600153', 'sh600005']
        start = time.time()
        for code in codes:
            url = 'http://hq.sinajs.cn/list=' + code
            # Without a timeout a single stalled connection blocks forever.
            response = requests.get(url, timeout=10).text
            # Parenthesised single-argument print runs on Python 2 and 3.
            print(response)
        # Elapsed wall-clock time in seconds for all requests.
        print(time.time() - start)
    

      

    var hq_str_sh600993="马应龙,20.020,20.090,20.060,20.060,19.950,20.040,20.060,486809,9740634.000,2100,20.040,8300,20.030,1300,20.020,2300,20.010,4100,20.000,101,20.060,10000,20.070,14400,20.080,19000,20.090,25700,20.100,2017-01-24,11:30:00,00";
    
    var hq_str_sh000006="地产指数,6567.8364,6574.1060,6568.6375,6577.7249,6542.6599,0,0,1486830,1392918131,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2017-01-24,11:35:51,00";
    
    var hq_str_sh600658="电子城,13.320,13.200,13.270,13.320,13.040,13.270,13.280,559733,7389992.000,30800,13.270,300,13.220,6200,13.200,2500,13.100,4900,13.090,9300,13.280,6400,13.290,8200,13.300,6900,13.310,9000,13.320,2017-01-24,11:30:00,00";
    
    var hq_str_sh600153="建发股份,10.520,10.510,10.500,10.540,10.460,10.490,10.500,4834159,50730040.000,32800,10.490,60100,10.480,186000,10.470,181241,10.460,125800,10.450,56600,10.500,105500,10.510,108400,10.520,110400,10.530,139900,10.540,2017-01-24,11:30:00,00";
    
    var hq_str_sh600005="武钢股份,0.000,3.710,3.710,0.000,0.000,0.000,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,2017-01-24,11:30:00,03";
    
    0.110999822617
    

      换成多线程之后:

    import requests
    import threading
    import time
    
    def get_stock(code):
        """Request the quote line for one stock code and print it.

        code -- string appended to the query URL, e.g. 'sh600993'.

        Raises whatever ``requests.get`` raises, including a timeout error
        if the server does not answer within 10 seconds.
        """
        url = 'http://hq.sinajs.cn/list=' + code
        # Without a timeout a stalled connection would block this thread forever.
        response = requests.get(url, timeout=10).text
        # Parenthesised single-argument print runs on Python 2 and 3.
        print(response)
        
    if __name__ == '__main__':
        codes = ['sh600993', 'sh000006', 'sh600658', 'sh600153', 'sh600005']
        start = time.time()
        # One thread per stock code; each thread runs get_stock(code).
        threads = [
            threading.Thread(target=get_stock, args=(code,))
            for code in codes
        ]
        # Start every thread before joining any, so the requests overlap.
        for worker in threads:
            worker.start()
        # Block until all threads have finished before reading the clock.
        for worker in threads:
            worker.join()
        # Parenthesised single-argument print runs on Python 2 and 3.
        print(time.time() - start)
    

      

    var hq_str_sh600993="马应龙,20.020,20.090,20.060,20.060,19.950,20.040,20.060,486809,9740634.000,2100,20.040,8300,20.030,1300,20.020,2300,20.010,4100,20.000,101,20.060,10000,20.070,14400,20.080,19000,20.090,25700,20.100,2017-01-24,11:30:00,00";
    
    var hq_str_sh600658="电子城,13.320,13.200,13.270,13.320,13.040,13.270,13.280,559733,7389992.000,30800,13.270,300,13.220,6200,13.200,2500,13.100,4900,13.090,9300,13.280,6400,13.290,8200,13.300,6900,13.310,9000,13.320,2017-01-24,11:30:00,00";
    
    var hq_str_sh000006="地产指数,6567.8364,6574.1060,6568.6375,6577.7249,6542.6599,0,0,1486830,1392918131,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2017-01-24,11:35:51,00";
    
    var hq_str_sh600153="建发股份,10.520,10.510,10.500,10.540,10.460,10.490,10.500,4834159,50730040.000,32800,10.490,60100,10.480,186000,10.470,181241,10.460,125800,10.450,56600,10.500,105500,10.510,108400,10.520,110400,10.530,139900,10.540,2017-01-24,11:30:00,00";
    
    var hq_str_sh600005="武钢股份,0.000,3.710,3.710,0.000,0.000,0.000,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,2017-01-24,11:30:00,03";
    
    0.0379998683929
    

      速度有了很大的提升

    线程池

    import requests
    import threadpool
    import time
    
    def get_stock(code):
        """Request the quote line for one stock code and print it.

        code -- string appended to the query URL, e.g. 'sh600993'.

        Raises whatever ``requests.get`` raises, including a timeout error
        if the server does not answer within 10 seconds.
        """
        url = 'http://hq.sinajs.cn/list=' + code
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, timeout=10).text
        # Parenthesised single-argument print runs on Python 2 and 3.
        print(response)
        
    if __name__ == '__main__':
        codes = ['sh600993', 'sh000006', 'sh600658', 'sh600153', 'sh600005']
        start = time.time()
        # Pool sized to 5, the same as the number of codes requested here.
        pool = threadpool.ThreadPool(5)
        # Build one work request per code, each calling get_stock(code).
        tasks = threadpool.makeRequests(get_stock, codes)
        # Queue every request on the pool.
        [pool.putRequest(task) for task in tasks]
        # Block until every queued request has been processed.
        pool.wait()
        # Parenthesised single-argument print runs on Python 2 and 3.
        print(time.time() - start)
    

    threadpool.ThreadPool(5)定义了一个线程池,表示可以创建5个线程;

    makeRequests根据要在多线程中执行的函数、函数的参数以及回调函数来创建任务请求,回调函数callback可以不写,默认是无。

    [pool.putRequest(task) for task in tasks]是将所有多线程的请求扔进了线程池,等价于
    for task in tasks:
        pool.putRequest(task)
    

     pool.wait()是等待所有工作完成后退出。这里执行的数量还比较少,基本的多线程就够用了,当数量多了起来之后,线程池的效果会好一些。

  • 相关阅读:
    django filefield
    django HttpResponseRedirect
    Django 自定义 error_messages={} 返回错误信息
    Django validators 官方文档
    djnago 官方关系反向查询案例
    django logging settings.py
    分页 restframe work 排序,分页,自定义过滤
    论文阅读笔记六十三:DeNet: Scalable Real-time Object Detection with Directed Sparse Sampling(CVPR2017)
    基于Distiller的模型压缩工具简介
    基于Intel OpenVINO的搭建及应用,包含分类,目标检测,及分割,超分辨
  • 原文地址:https://www.cnblogs.com/zephyr-1/p/6346889.html
Copyright © 2011-2022 走看看