  • Synchronous and Asynchronous Processing in Web Crawlers

      A crawler essentially simulates a client making frequent requests to a server, fetches the response data, and parses and processes it. The conventional serial approach executes synchronously and blocks: each task must finish before the next one can start, which is very inefficient. The focused crawlers in common use are relatively IO-intensive (blocking) when handling data, so an asynchronous approach is worth considering.

    1. Synchronous serial: after a task is submitted, nothing else runs until it finishes and returns its result, which makes this approach quite inefficient!

'''
Synchronous serial crawling
'''
import os
import time
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]

# synchronous serial crawl: each request blocks until the previous one finishes
os.makedirs('response', exist_ok=True)  # make sure the output directory exists
start = time.time()
for url_info in urls:
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)
t = time.time() - start
print(t)  # 4.652341365814209
'''
Asynchronous multiprocessing
'''
import os
import time
import requests
from multiprocessing import Process

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]

def get_html(url_info):
    # each task runs in its own child process: the parent PID is shared,
    # but every call prints a different child PID
    print(os.getppid(), os.getpid())
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    p_list = []
    for url_info in urls:
        p = Process(target=get_html, args=(url_info,))
        p_list.append(p)
        p.start()
    # block until every child process has finished
    for p in p_list:
        p.join()
    t = time.time() - start
    print(t)  # 3.1241235733032227
'''
Asynchronous process pool
'''
import os
import time
import requests
from concurrent.futures import ProcessPoolExecutor

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]


def get_html(url_info):
    print(os.getppid(), os.getpid())
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    return response, path


def done_callback(task):
    # invoked in the parent process once a worker finishes its download
    response, path = task.result()
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    ps = ProcessPoolExecutor(8)
    for url_info in urls:
        task = ps.submit(get_html, url_info)
        task.add_done_callback(done_callback)
    ps.shutdown()  # wait for all submitted tasks to complete
    t = time.time() - start
    print(t)  # 3.589127779006958
'''
Asynchronous multithreading
'''
import os
import time
import requests
from threading import Thread, active_count  # activeCount is a deprecated camelCase alias

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]

def get_html(url_info):
    # threads share one process, so every call prints the same PIDs
    print(os.getppid(), os.getpid())
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)

if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    t_list = []
    for url_info in urls:
        # start one thread per task
        t = Thread(target=get_html, args=(url_info,))
        t_list.append(t)
        t.start()
    print(active_count())  # number of live threads
    # block until every thread has finished
    for t in t_list:
        t.join()
    elapsed = time.time() - start
    print(elapsed)  # 1.2163612842559814
'''
Asynchronous thread pool
'''
import os
import time
import requests
from threading import active_count  # activeCount is a deprecated camelCase alias
from concurrent.futures import ThreadPoolExecutor

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]


def get_html(url_info):
    print(os.getppid(), os.getpid())
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    return response, path


def done_callback(task):
    # invoked when the download finishes; receives the completed Future
    response, path = task.result()
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    ts = ThreadPoolExecutor(8)
    for url_info in urls:
        task = ts.submit(get_html, url_info)
        task.add_done_callback(done_callback)
    print(active_count())  # number of live threads
    ts.shutdown()  # wait for all submitted tasks to complete
    t = time.time() - start
    print(t)  # 1.2402942180633545
    Another kind of thread pool:
'''
Asynchronous thread pool via multiprocessing.dummy.Pool
'''
import os
import time
import requests
from multiprocessing.dummy import Pool  # thread-backed clone of multiprocessing.Pool

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]


def get_html(url_info):
    print(os.getppid(), os.getpid())
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    pool = Pool(8)
    pool.map(get_html, urls)  # map blocks until every task has finished
    t = time.time() - start
    print(t)  # 0.7495629787445068

    Analysis and comparison: in these tests the multiprocessing versions took the longest and the multithreading versions the shortest. Although spawning multiple processes or threads does speed up multi-task workloads, at runtime it is also constrained by factors such as startup overhead and scheduling cost, so the gains may fall short of expectations.

    2. Asynchronous processing is the way to work around blocking. The following compares asynchronous multiprocessing/process pools with multithreading/thread pools:

      (1) Solving synchronous calls with multithreading/multiprocessing

      •   Benefit: use multiple threads (or processes) on the server side, so that each connection gets its own thread (or process); a block on any one connection then does not stall the others.
      •   Drawback: we cannot spawn processes or threads without limit. Faced with hundreds or thousands of simultaneous connection requests, both multiprocessing and multithreading will seriously tie up system resources and reduce the system's responsiveness to the outside world, and the threads or processes themselves become more prone to hanging (see the sketch below).
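      To make the "can't spawn without limit" point concrete, here is a minimal sketch, not from the original post (the names MAX_CONCURRENT and fetch are made up for illustration), that uses threading.BoundedSemaphore so that no more than a fixed number of downloads run at once even though one thread is still created per task:

import threading
import requests

MAX_CONCURRENT = 8  # hypothetical cap on simultaneous downloads
sem = threading.BoundedSemaphore(MAX_CONCURRENT)
urls = ['https://www.sina.com.cn/', 'https://www.qq.com/']  # small sample list

def fetch(url):
    # once MAX_CONCURRENT threads hold the semaphore, the rest block here
    with sem:
        return requests.get(url, timeout=10)

threads = [threading.Thread(target=fetch, args=(u,)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()

      Note this only caps concurrent work; a thread object is still created per task, which is exactly the overhead a pool avoids.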

      (2) Solving synchronous calls with thread/process pools

      •   Benefit: many programmers reach for a "thread pool" or "connection pool". A thread pool reduces how often threads are created and destroyed: it maintains a reasonable number of threads and lets idle ones take on new tasks, which nicely lowers system overhead.

      •   Drawback: thread pools and connection pools only mitigate, to a degree, the resource cost of frequent IO calls. A "pool" always has an upper bound; once requests far exceed it, the pooled system responds little better than one without a pool. So when using a pool you must consider the scale of responses it will face and size the pool accordingly, as in the sketch below.
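      For sizing, note that ThreadPoolExecutor already applies a heuristic when max_workers is omitted (min(32, os.cpu_count() + 4) since Python 3.8); this small sketch simply makes that choice explicit so it can be tuned to the expected request volume:

import os
from concurrent.futures import ThreadPoolExecutor

# make the pool size an explicit, tunable decision rather than a default
n_workers = min(32, (os.cpu_count() or 1) + 4)
with ThreadPoolExecutor(max_workers=n_workers) as pool:
    # trivial stand-in tasks; real use would submit download jobs here
    print(list(pool.map(len, ['a', 'bb', 'ccc'])))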

      A "thread pool" or "connection pool" can relieve some of the pressure, but it cannot solve everything. In short, the multithreading model handles small-scale service requests conveniently and efficiently, but it hits a bottleneck at large scale: while blocked on IO it cannot fully utilize the CPU, so non-blocking interfaces are needed to solve this problem.
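      As a pointer toward those non-blocking interfaces, here is a minimal single-threaded sketch built on asyncio plus the third-party aiohttp library (an assumption: aiohttp does not appear in the original scripts and must be installed separately). Every request is in flight concurrently on one event loop, with no extra threads or processes:

import asyncio
import aiohttp  # third-party, not used in the original scripts: pip install aiohttp

urls = ['https://www.sina.com.cn/', 'https://www.qq.com/']  # small sample list

async def fetch(session, url):
    # await yields control while the response is in flight, so the other
    # requests make progress on the same thread instead of blocking it
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, u) for u in urls))
        print([len(p) for p in pages])

asyncio.run(main())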
