zoukankan      html  css  js  c++  java
  • 批量抓取表情包爬虫脚本

      

    import re
    import os
    import time
    import requests
    import multiprocessing
    from multiprocessing.pool import ThreadPool
    # Work queues shared by the workers:
    #   pagequeue - page numbers still to be fetched (read by getpage)
    #   picqueue  - (name, url) tuples still to be downloaded (read by savefile)
    #   logqueue  - every image URL discovered so far (only its size is reported)
    pagequeue = multiprocessing.Queue()
    picqueue = multiprocessing.Queue()
    logqueue = multiprocessing.Queue()

    # Thread pools: 5 threads for fetching list pages, 50 for downloading images.
    pagepool = ThreadPool(5)
    picpool = ThreadPool(50)

    # Downloads that failed are collected here as [name, url] entries.
    error = []

    # Seed the page queue with list pages 1..837 in ascending order.
    first_page, last_page = 1, 837
    for pageid in range(first_page, last_page + 1):
        pagequeue.put(pageid)
    
    
    def getimglist(body):
        """Queue every image found in one list page for download.

        Scans *body* for ``data-original`` / ``alt`` attribute pairs. For each
        match the rebuilt URL is put on ``logqueue`` and a ``(name, url)`` tuple
        is put on ``picqueue``, where ``name`` is the alt text plus the URL's
        file extension. When nothing matches, *body* is printed so the
        unexpected page can be inspected.

        Args:
            body: HTML text of a list page.
        """
        # "ws\d" matches the numbered image hosts (ws1, ws2, ...). The pattern
        # previously read "wsd" -- presumably a backslash lost when the script
        # was published -- and so could never match the hosts that the
        # "http://ws1" prefix below is rebuilt for.
        imglist = re.findall(
            r'data-original="//ws\d([^"]+)" data-backup="[^"]+" alt="([^"]+)"',
            body)
        if not imglist:
            # Single-argument parenthesised form behaves the same on Python 2
            # and Python 3.
            print(body)
            return
        for url, name in imglist:
            # savefile() opens `name` directly as a path, so a separator in the
            # alt text would point into a directory that does not exist.
            name = name.replace("/", "_").replace("\\", "_")
            # Take the real extension rather than assuming it is exactly the
            # last four characters (which mangles ".jpeg" / ".webp").
            name = name + os.path.splitext(url)[1]
            # The captured group starts right after the "wsN" host token; the
            # download always goes through the ws1 host.
            url = "http://ws1" + url
            logqueue.put(url)
            picqueue.put((name, url))
    
    
    
    def savefile():
        """Worker loop: download queued images to files.

        Takes ``(name, url)`` tuples from ``picqueue`` forever. An entry whose
        ``name`` already exists as a file is skipped. Any failure to download
        or write is recorded in ``error`` as ``[name, url]`` and the loop
        continues with the next entry. Never returns.
        """
        http = requests.Session()
        while True:
            name, url = picqueue.get()
            if os.path.isfile(name):
                continue
            try:
                # The request lives inside the try: an exception escaping this
                # loop would end the worker task for good, and the discarded
                # apply_async result would hide it.
                # The timeout keeps a stalled connection from blocking this
                # worker indefinitely.
                req = http.get(url, timeout=30)
                # Do not store an HTTP error page under an image file name.
                req.raise_for_status()
                with open(name, 'wb') as out:
                    out.write(req.content)
            except Exception:
                # Best-effort by design: remember the failure and move on.
                # Unlike a bare except, this lets KeyboardInterrupt and
                # SystemExit propagate.
                error.append([name, url])
    
    
    def getpage():
        """Worker loop: fetch list pages and pass their HTML to ``getimglist``.

        Takes page numbers from ``pagequeue`` forever, requests the matching
        list page and hands the response text to ``getimglist``. A failed
        request is recorded in ``error`` as ``[pageid, url]`` instead of ending
        the worker. Sleeps one second after every page. Never returns.
        """
        http = requests.Session()
        while True:
            pageid = pagequeue.get()
            url = "https://www.doutula.com/photo/list/?page={}".format(pageid)
            try:
                # The timeout keeps a stalled connection from blocking this
                # worker indefinitely.
                req = http.get(url, timeout=30)
                req.raise_for_status()
            except requests.RequestException:
                # Without this, one network error would end this worker task
                # and the page would be dropped without any trace.
                error.append([pageid, url])
            else:
                getimglist(req.text)
            # Pause between page requests (presumably to go easy on the site).
            time.sleep(1)
    
    
    # Start the long-running workers: 5 page fetchers first, then 50 image
    # downloaders. The apply_async results are intentionally not kept.
    for pool, worker, count in ((pagepool, getpage, 5), (picpool, savefile, 50)):
        for _ in range(count):
            pool.apply_async(worker)

    # Report progress once per second, forever: images waiting to be
    # downloaded, pages waiting to be fetched, image URLs discovered.
    while True:
        print("{} {} {}".format(
            picqueue.qsize(), pagequeue.qsize(), logqueue.qsize()))
        time.sleep(1)

    7 分钟左右，即可爬完。

  • 相关阅读:
    构建之法阅读笔记03
    周进度条
    周活动总结表
    电脑桌面美化
    如何让自己进步,去做成一件事
    后台网站
    laravel RBAC权限管理学习
    laravle定时任务
    django第一次简单讲解使用
    css3网页的淡入淡出效果
  • 原文地址:https://www.cnblogs.com/howmp/p/6947151.html
Copyright © 2011-2022 走看看