zoukankan      html  css  js  c++  java
  • 进程、线程和协程的结合使用

    # -*- coding: utf-8 -*-
    #2019/12/16 10:35
    import os
    import time
    import requests
    import traceback
    from bs4 import BeautifulSoup
    from  multiprocessing import Process, Pool
    import threading
    from threading import Thread
    from concurrent.futures import ThreadPoolExecutor  # 线程池
    import gevent
    from conf import settings as ss
    from core.log import Log as log
    
    def save_img(file_name,content):
        """Write the raw image bytes ``content`` to the path ``file_name``.

        The file is opened in binary write mode, so an existing file at that
        path is overwritten.
        """
        with open(file_name, mode='wb') as image_file:
            image_file.write(content)
    
    def spider(num, timeout=10):
        """Download the article images found on one autohome listing page.

        Fetches listing page ``num``, locates the ``div`` with class
        ``article-wrapper`` and, for every ``<li>`` inside it that carries an
        ``<img>`` with a ``src``, downloads that image into ``ss.IMG_PATH``.
        A failure on one image is logged and does not stop the remaining ones.

        :param num: page number substituted into the listing URL.
        :param timeout: seconds passed to every ``requests.get`` call so that a
            stalled connection cannot block the worker indefinitely.
        """
        rep_url = r"https://www.autohome.com.cn/all/{}/#liststart".format(num)
        print(rep_url)
        # 1. Request the listing page; errors here propagate to the caller.
        response = requests.get(url=rep_url, timeout=timeout)
        # 2. Parse the returned HTML with the built-in html.parser backend.
        soup = BeautifulSoup(response.text, 'html.parser')
        # 3. Narrow the search to the box that wraps all article entries.
        div_obj = soup.find(name='div', attrs={"class": "article-wrapper"})
        if div_obj is None:
            # find() returns None when the box is absent (e.g. page past the
            # end or an error page); report it instead of crashing.
            log.error("article-wrapper not found: {}".format(rep_url))
            return
        # 4. Every <li> in the box is a candidate image holder.
        for li in div_obj.find_all(name="li"):
            img = li.find(name='img')
            src = img.get("src") if img is not None else None
            if not src:
                # Not every <li> holds an image; nothing to download here.
                continue
            try:
                # 5. The last path component of the link is the local file name.
                img_name = src.rsplit('/', 1)[-1]
                file_name = os.path.join(ss.IMG_PATH, img_name)
                # Links on the page may omit the scheme; prepend it.
                if not src.startswith('http'):
                    src = 'http:' + src
                # 6. Fetch the image; reject HTTP error responses so that an
                #    error page body is never stored as an image.
                res = requests.get(url=src, timeout=timeout)
                res.raise_for_status()
                # 7. Persist the image bytes.
                save_img(file_name, res.content)
            except Exception:
                # Best effort: log the failure and move on to the next image.
                log.error(traceback.format_exc())
    
            # break
    
    
    def thread_func(n):
        """Crawl the ``n``-th batch of 50 listing pages with one greenlet per page.

        Batch ``n`` covers pages ``50 * (n - 1) + 1`` through ``50 * n``
        inclusive (``n = 1`` -> pages 1..50).

        NOTE(review): no gevent monkey-patching is visible in this file, so the
        blocking ``requests`` calls inside ``spider`` may not yield to other
        greenlets -- confirm whether patching happens elsewhere.
        """
        batch_size = 50  # greenlets spawned per call
        last_page = batch_size * n
        first_page = 1 + (last_page - batch_size)
        greenlets = [
            gevent.spawn(spider, page)
            for page in range(first_page, last_page + 1)
        ]
        # Block until every spawned greenlet has finished.
        gevent.joinall(greenlets)
    
    def process_func(n):
        """Run the ``n``-th group of 20 crawler threads and wait for them.

        Group ``n`` starts one thread for each batch number from
        ``20 * (n - 1) + 1`` through ``20 * n`` inclusive; each thread runs
        ``thread_func`` with its batch number.

        :param n: 1-based index of this process's thread group.
        """
        thread_count = 20  # threads started per process
        last_batch = thread_count * n
        first_batch = 1 + (last_batch - thread_count)
        workers = []
        for num in range(first_batch, last_batch + 1):
            # A dedicated name for the Thread object avoids shadowing the
            # range bound, which the original reused for both purposes.
            worker = threading.Thread(target=thread_func, args=(num,))
            worker.start()
            workers.append(worker)
        # Join only after all threads are started so they run concurrently,
        # and so this function returns once the whole group has finished.
        for worker in workers:
            worker.join()
    
    if __name__ == '__main__':
        start = time.time()
        # Create and start all 5 worker processes before waiting on any of
        # them, so they run in parallel.
        workers = [Process(target=process_func, args=(i,)) for i in range(1, 6)]
        for worker in workers:
            worker.start()
        # Join every process (the original joined only the last one started),
        # so the elapsed time below covers all of the work.
        for worker in workers:
            worker.join()
        print(time.time() - start)
    # -*- coding: utf-8 -*-
    #2019/12/19 16:41
    
    import os
    import threading
    import gevent
    from multiprocessing import Process
    # 进程、线程和协程结合使用
    
    def gevent_func(i):
        """Print a marker line identifying child greenlet ``i``."""
        fields = ('@@子协程@@', i)
        print(*fields)
    
    def thread_func():
        """Print this thread's process id and thread ident, then run 3 greenlets.

        Each greenlet calls ``gevent_func`` with its index (0, 1, 2); the
        function returns once all of them have finished.
        """
        print('-->子线程-->', os.getpid(), threading.current_thread().ident)
        greenlets = [gevent.spawn(gevent_func, index) for index in range(3)]
        # Block until every spawned greenlet has completed.
        gevent.joinall(greenlets)
    
    
    def process_func():
        """Print this process's id and main-thread ident, then start 3 threads.

        Each thread runs ``thread_func``; the threads are started but not
        joined here.
        """
        print('!!子进程!!', os.getpid(), threading.current_thread().ident)
        for _ in range(3):
            worker = threading.Thread(target=thread_func)
            worker.start()
    
    if __name__ == '__main__':
        # Identify the parent process, then launch 3 child processes that each
        # run process_func.
        print('-->主进程',os.getpid(), threading.current_thread().ident)
        for i in range(3):
            child = Process(target=process_func)
            child.start()

    只用线程池的情况

    # -*- coding: utf-8 -*-
    #2019/12/16 10:35
    import os
    import time
    import requests
    import traceback
    from bs4 import BeautifulSoup
    from  multiprocessing import Process, Pool
    from threading import Thread
    from concurrent.futures import ThreadPoolExecutor  # 线程池
    import gevent
    from conf import settings as ss
    from core.log import Log as log
    
    def save_img(file_name,content):
        """Write the raw image bytes ``content`` to the path ``file_name``.

        The file is opened in binary write mode, so an existing file at that
        path is overwritten.
        """
        with open(file_name, mode='wb') as image_file:
            image_file.write(content)
    
    def spider(num, timeout=10):
        """Download the article images found on one autohome listing page.

        Fetches listing page ``num``, locates the ``div`` with class
        ``article-wrapper`` and, for every ``<li>`` inside it that carries an
        ``<img>`` with a ``src``, downloads that image into ``ss.IMG_PATH``.
        A failure on one image is logged and does not stop the remaining ones.

        :param num: page number substituted into the listing URL.
        :param timeout: seconds passed to every ``requests.get`` call so that a
            stalled connection cannot block the worker indefinitely.
        """
        rep_url = r"https://www.autohome.com.cn/all/{}/#liststart".format(num)
        print(rep_url)
        # 1. Request the listing page; errors here propagate to the caller.
        response = requests.get(url=rep_url, timeout=timeout)
        # 2. Parse the returned HTML with the built-in html.parser backend.
        soup = BeautifulSoup(response.text, 'html.parser')
        # 3. Narrow the search to the box that wraps all article entries.
        div_obj = soup.find(name='div', attrs={"class": "article-wrapper"})
        if div_obj is None:
            # find() returns None when the box is absent (e.g. page past the
            # end or an error page); report it instead of crashing.
            log.error("article-wrapper not found: {}".format(rep_url))
            return
        # 4. Every <li> in the box is a candidate image holder.
        for li in div_obj.find_all(name="li"):
            img = li.find(name='img')
            src = img.get("src") if img is not None else None
            if not src:
                # Not every <li> holds an image; nothing to download here.
                continue
            try:
                # 5. The last path component of the link is the local file name.
                img_name = src.rsplit('/', 1)[-1]
                file_name = os.path.join(ss.IMG_PATH, img_name)
                # Links on the page may omit the scheme; prepend it.
                if not src.startswith('http'):
                    src = 'http:' + src
                # 6. Fetch the image; reject HTTP error responses so that an
                #    error page body is never stored as an image.
                res = requests.get(url=src, timeout=timeout)
                res.raise_for_status()
                # 7. Persist the image bytes.
                save_img(file_name, res.content)
            except Exception:
                # Best effort: log the failure and move on to the next image.
                log.error(traceback.format_exc())
    
            # break
    
    
    def thread_pool():
        """Crawl listing pages 1..100 with a pool of 10 worker threads.

        Submits one ``spider`` call per page, waits for all of them to finish,
        then logs the traceback of every call that raised. The original
        discarded the futures, so such failures were lost without a trace.
        """
        # The context manager shuts the pool down (waiting for submitted work)
        # even if submitting a task raises.
        with ThreadPoolExecutor(max_workers=10) as pool:
            futures = [pool.submit(spider, num) for num in range(1, 101)]
        # All futures are complete here, so exception() returns immediately.
        for future in futures:
            exc = future.exception()
            if exc is not None:
                log.error(''.join(
                    traceback.format_exception(type(exc), exc, exc.__traceback__)
                ))
    
    
    if __name__ == '__main__':
        # Time one full thread-pool crawl and print the elapsed seconds.
        began_at = time.time()
        thread_pool()
        elapsed = time.time() - began_at
        print(elapsed)
  • 相关阅读:
    专题——递归
    今日听郝斌老师鸡汤
    线性结构的两种常见应用之一 队列
    线性结构的两种应用之一——栈
    C++常用库函数
    洛谷 标志重捕法?
    c++复习——临考前的女娲补天 >=.<
    c++复习——类(2)
    c++复习——类(1)
    Python单元测试
  • 原文地址:https://www.cnblogs.com/hulk-1029/p/12084766.html
Copyright © 2011-2022 走看看