zoukankan      html  css  js  c++  java
  • python_爬虫_multiprocessing.dummy以及multiprocessing

    ''' # 代码范本
    任务添加函数、任务执行函数;进程、线程切换函数;进、线程开启函数;
    '''
    from multiprocessing import Pool as processPoll
    from multiprocessing.dummy import Pool as ThreadPool
    
    def get_page():
        """Template placeholder: the worker function that executes one task."""
    
    def url_list():
        """Template placeholder: the function that builds the list of tasks."""
    
    def get_pool():
        """Template placeholder: choose between a process pool and a thread pool."""
    
    def open_pool():
        """Template placeholder: the entry point that starts the pool."""
    
    # Run the pool only when this file is executed as a script.
    if "__main__" == __name__:
        open_pool()

    使用16个线程爬取腾讯招聘的100页分页信息,用时约6秒(3M带宽)

    '''
    任务添加函数、任务执行函数;进程、线程切换函数;进、线程开启函数;
    '''
    import requests
    from urllib import request
    import ssl
    # NOTE(review): this replaces the default HTTPS context factory with the
    # private `ssl._create_unverified_context`, which turns off certificate
    # verification for HTTPS requests made through urllib in this process.
    # Acceptable for a throwaway demo only; do not copy into production code.
    ssl._create_default_https_context = ssl._create_unverified_context
    from datetime import datetime
    from multiprocessing import Pool as ProcessPoll
    from multiprocessing.dummy import Pool as ThreadPool
    
    def get_page(task_q, timeout=10):
        """Request one URL and print the final URL of the response.

        This is the per-task worker passed to ``pool.map``.

        Args:
            task_q: URL of the page to fetch (one element of the task list).
            timeout: Seconds to wait on blocking network operations, so a
                stalled server cannot hold a pool worker forever.

        Returns:
            None. The final response URL is printed to stdout.

        Raises:
            Propagates whatever ``urlopen`` raises, e.g.
            ``urllib.error.URLError`` on connection failure.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}

        req = request.Request(task_q, headers=headers)
        # Use the response as a context manager so its connection is closed
        # as soon as the task finishes instead of waiting for garbage
        # collection.
        with request.urlopen(req, timeout=timeout) as response:
            print(response.url)
    
    def url_list(pages=100, page_size=10,
                 base_url='http://hr.tencent.com/position.php?start={}'):
        """Build the list of paginated URLs to crawl.

        Args:
            pages: Number of result pages to generate. Zero or a negative
                value yields an empty list.
            page_size: Step between consecutive ``start`` offsets. Must be a
                positive integer.
            base_url: Format string with one ``{}`` placeholder that receives
                the start offset.

        Returns:
            A list of URL strings with offsets 0, page_size, 2 * page_size, ...
            With the defaults this is 100 URLs with offsets 0 through 990.
        """
        return [base_url.format(offset)
                for offset in range(0, page_size * pages, page_size)]
    
    def get_pool(way=True,count=4):
        """Create a worker pool of the requested kind.

        Args:
            way: Truthy selects a process pool; falsy selects a thread pool.
            count: Number of workers handed to the pool constructor.

        Returns:
            The newly created pool object.
        """
        pool_factory = ProcessPoll if way else ThreadPool
        return pool_factory(count)
    
    def open_pool():
        """Crawl every URL from ``url_list`` with a 16-thread pool and print the elapsed time.

        The pool is always closed and joined, even when a task raises, so
        worker threads are not left behind after a failed run.
        """
        start = datetime.now()
        pool = get_pool(way=False,count=16)
        try:
            task_q = url_list()
            # map() blocks until every task has finished and re-raises the
            # first exception raised by a worker.
            pool.map(get_page,task_q)
        finally:
            # Stop accepting work, then wait for the workers to exit.
            pool.close()
            pool.join()
        end = datetime.now()
        print('程序结束,用时',end-start)
    # Start the crawl only when this file is executed as a script.
    if "__main__" == __name__:
        open_pool()
  • 相关阅读:
    http://git.oschina.net/
    六、jquery操作下拉列表
    在线JS/CSS/HTML压缩,格式化
    网站安全考虑:1、sql注入 2、跨站脚本攻击
    Thinkphp列表搜索排序-----查
    控制器里面打印sql语句
    Thinkphp增加操作(Controller到模型Model的逻辑)
    将文本转换成语音
    随点击来改变按钮的style
    那些年挠头的git
  • 原文地址:https://www.cnblogs.com/hejianlong/p/9345450.html
Copyright © 2011-2022 走看看