zoukankan      html  css  js  c++  java
  • 多线程,进程,协程用法

    import requests
    import re
    from lxml import etree
    import os
    import random
    import threading
    import time
    import datetime
    import hashlib
    import multiprocessing
    from string import punctuation
    from gevent import monkey
    import gevent

    # Apply gevent's monkey patch to the socket module before any request is made.
    # NOTE(review): presumably this is what lets the blocking HTTP downloads in
    # greenlets overlap; gevent advises patching as early as possible (before
    # other imports) -- confirm that patching after `import requests` is enough.
    monkey.patch_socket()
    class Dou:
        """Scraper that downloads the picture sets listed on www.doutula.com.

        Each article on a listing page gets its own directory below
        ``base_dir``; every picture of the article is saved into it by a
        separate greenlet.
        """

        def __init__(self, base_dir='F:\\27\\dou'):
            """Store the download root and the default HTTP request headers.

            :param base_dir: directory under which one sub-directory per
                article is created.
            """
            self.base_dir = base_dir
            self.headers = {
                'Referer': 'http://www.doutula.com/',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36'
            }

        def filters(self, data):
            """Return ``data`` without ASCII punctuation and outer whitespace.

            Used to turn page titles / alt texts into names that are safe to
            use as file-system path components.
            """
            return ''.join(ch for ch in data if ch not in punctuation).strip()

        def get_source(self, url, headers=1):
            """Fetch ``url`` and return the raw response body as bytes.

            :param url: address to download.
            :param headers: request headers; the sentinel value ``1`` means
                "use ``self.headers``".
            :raises requests.RequestException: on connection errors or when
                the 5 second timeout expires.
            """
            if headers == 1:
                headers = self.headers
            # The timeout keeps a stalled request from blocking its greenlet
            # (and therefore the whole worker process) forever.
            return requests.get(url, headers=headers, timeout=5).content

        def make_dir(self, dir_name):
            """Create (if needed) and return the directory for one article.

            :param dir_name: raw article title; it is sanitised with
                :meth:`filters` before being appended to ``self.base_dir``.
            :returns: the full path of the article directory.
            """
            dir_name = os.path.join(self.base_dir, self.filters(dir_name))
            if os.path.isdir(dir_name):
                print(dir_name, '已经存在')
            else:
                # makedirs also creates a missing download root; exist_ok
                # tolerates another worker creating the directory first.
                os.makedirs(dir_name, exist_ok=True)
                print(dir_name, '开始创建')
            return dir_name

        def save_pic(self, data):
            """Download one picture and write it to disk (best effort).

            :param data: sequence ``(dir_name, pic_name, pic_url, pic_type)``
                where ``pic_type`` is the file extension including the dot.

            Download or write failures are reported on stdout and otherwise
            ignored so that one broken picture does not stop the others.
            """
            dir_name, pic_name, pic_url, pic_type = data[0], data[1], data[2], data[3]
            try:
                pic_path = os.path.join(dir_name, self.filters(pic_name) + pic_type)
                # Download first so a failed request leaves no empty file.
                pic_source = self.get_source(pic_url)
                with open(pic_path, 'wb') as pic_file:
                    pic_file.write(pic_source)
            except Exception as exc:
                print('图片保存失败,地址为', pic_url, exc)

        def begin_by_page(self, page):
            """Download every picture of every article on one listing page.

            :param page: listing page number.

            Any error while processing the page is reported on stdout
            together with the page number; the method never raises.
            """
            try:
                source = self.get_source('http://www.doutula.com/article/list/?page=' + str(page)).decode('utf8')
                listing = etree.HTML(source)
                dir_name_lists = listing.xpath('//*[@id="home"]/div/div[2]/a/div[1]/text()')
                pic_lists = listing.xpath('//*[@id="home"]/div/div[2]/a/@href')
                print(len(pic_lists))
                # zip pairs each title with its link and stops at the shorter
                # list instead of raising IndexError on a mismatch.
                for title, pic_href in zip(dir_name_lists, pic_lists):
                    print(title)

                    dir_name = self.make_dir(title)

                    show_source = self.get_source(pic_href).decode('utf8')
                    detail = etree.HTML(show_source)
                    pic_src = detail.xpath('/html/body/div[2]/div[1]/div/div[2]/li/div[3]/div/table/tbody/tr[1]/td/a/img/@src')
                    pic_name = detail.xpath('/html/body/div[2]/div[1]/div/div[2]/li/div[3]/div/table/tbody/tr[1]/td/a/img/@alt')

                    # One greenlet per picture; gevent.spawn already schedules
                    # the greenlet, so no explicit start() is needed.
                    # The last four characters of the URL are taken as the
                    # file extension (e.g. ".jpg").
                    ths_lists = [
                        gevent.spawn(self.save_pic, (dir_name, pic_names, pic_url, pic_url[-4:]))
                        for pic_url, pic_names in zip(pic_src, pic_name)
                    ]

                    print(ths_lists)
                    # Wait for all downloads of this article before moving on.
                    gevent.joinall(ths_lists)
            except Exception as exc:
                print('有问题产生,问题页面为', page, exc)

    if __name__ == '__main__':
        # Worker processes that have been started and not yet pruned.
        thread_lists = []

        def run(first_page=50, last_page=80, max_workers=8):
            """Scrape listing pages concurrently, one process per page.

            :param first_page: first listing page to download (inclusive).
            :param last_page: last listing page to download (inclusive).
            :param max_workers: maximum number of processes tracked at once.
            """
            dou = Dou()
            for page in range(first_page, last_page + 1):
                # Pool is full: wait, then drop the processes that finished.
                while len(thread_lists) >= max_workers:
                    time.sleep(3)
                    print(thread_lists)
                    print('进程池已经满了')
                    # Rebuild the list in place; removing items while
                    # iterating over the same list would skip entries.
                    thread_lists[:] = [proc for proc in thread_lists if proc.is_alive()]
                # th = threading.Thread(target=dou.begin_by_page,args=(page,))
                th = multiprocessing.Process(target=dou.begin_by_page, args=(page,))
                th.start()
                thread_lists.append(th)
            # Wait for the processes that are still running.
            for ths in thread_lists:
                ths.join()

        def custom_run():
            """Scrape listing pages 1-10 sequentially in the current process."""
            dou = Dou()
            for page in range(1, 11):
                dou.begin_by_page(page)

        # Measure and print the wall-clock duration of the whole run.
        start_time = datetime.datetime.now()
        run()
        over_time = datetime.datetime.now()
        the_time = over_time - start_time
        print(the_time)


    # 多线程,10页时间:0:01:04.329867
    # 不加任何并发,10页时间:0:05:48.150459
    # 多进程,10页时间:0:01:23.375896
    # 多进程加线程,10页时间:0:01:19.268139

    这段代码为抓取斗图网的代码,现在开启的是进程加协程,协程的数量没有控制,每保存一个图片生成一个协程,进程开启了8个,因为我这个电脑是八核的,线程的代码先注释掉了,期间遇到问题,程序运行完毕无法自动结束,最后找到原因是因为请求卡住了,导致协程一直处于活跃状态,那么进程也就一直结束不了,整个程序也就结束不了,在请求的地方加上一个timeout,即可解决这个问题了
  • 相关阅读:
    Django项目:CRM(客户关系管理系统)--20--12PerfectCRM实现King_admin分页上下页
    Django项目:CRM(客户关系管理系统)--19--11PerfectCRM实现King_admin分页显示条数
    Django项目:CRM(客户关系管理系统)--18--10PerfectCRM实现King_admin日期优化
    Django项目:CRM(客户关系管理系统)--17--09PerfectCRM实现King_admin显示注册表的内容
    Oracle数据库,非空约束、主键约束、外键约束、唯一约束

    Oracle数据库,用户的创建及表的创建
    点击时显示遮罩层,登录时灰色遮罩效果
    多个视频时,利用函数实现播放一个,其他自动暂停
    正则表达式、事件调用
  • 原文地址:https://www.cnblogs.com/mypath/p/9086293.html
Copyright © 2011-2022 走看看