  • Batch downloading meme images via the Doutula (doutula.com) API

    decorator.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    
    import logging
    import os
    from functools import wraps
    
    # set the log record format; imageurl and imagename are supplied through the `extra` dict
    FORMAT = '%(asctime)-15s %(filename)s %(message)s %(imageurl)s %(imagename)s'
    logging.basicConfig(format=FORMAT, level=logging.INFO, filename="biaoqingDownloader.log", datefmt="[%Y-%m-%d %H:%M:%S]")
    my_log_extra = {"imageurl": "", "imagename": ""}
    
    # decorator that logs each image download
    def downloader_logger(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            try:
                image_name = args[0]
                image_url = args[1]
            except IndexError as e:
                # the wrapped function must be called with (image_name, image_url) as positional arguments
                raise e
            my_log_extra["imagename"] = image_name
            my_log_extra["imageurl"] = image_url
            logging.info("biaoqingbaoDownloader downloaded image:", extra=my_log_extra)
            return result
        return wrapper
    
    if __name__ == '__main__':
        # test this logger
        @downloader_logger
        def foo(filename, imageurl):
            print('logging test')
    
        foo('test.png', 'www.baidu.com')
        # if no error appears, clear the log file
        with open('./biaoqingDownloader.log', 'w') as f:
            f.truncate()
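
    The decorator above is meant to wrap a per-image download function so that every downloaded image gets a log entry (the commented-out image_download method in downloader.py hints at this). A minimal sketch of how it could be wired up, assuming a plain function that receives (filename, image_url) as positional arguments; the file name and URL below are only placeholders:

    import requests
    from decorator import downloader_logger

    @downloader_logger
    def image_download(filename, image_url):
        # download one image and write it to disk; the decorator logs the name and URL afterwards
        response = requests.get(image_url, timeout=5)
        with open(filename, 'wb') as f:
            f.write(response.content)

    image_download('doge0.jpg', 'https://example.com/doge0.jpg')  # placeholder arguments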
    

    downloader.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Link    : https://github.com/coderchaser
    
    
    import os
    import time
    import argparse
    import requests
    import random
    import json
    import threading
    from decorator import downloader_logger
    
    # code 0 represents https://www.doutula.com/apidoc
    API_URL_DICT={0:"https://www.doutula.com/api/search?keyword={keyword}&mime={image_type}&page={page}"}
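    # (inferred from how __get_image_url parses it below) each page's JSON response is assumed
    # to look roughly like: {"data": {"list": [{"image_url": "..."}, ...], "more": 1}}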
    # a User-Agent is chosen at random from this tuple for each request
    USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
                   'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
                   'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
                   ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
                    'Chrome/19.0.1084.46 Safari/536.5'),
                   ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 '
                    'Safari/536.5'), )
    
    class Downloader(object):
        def __init__(self, number, keyword, image_type, filepath, verbose, api_code=0):
            # TODO: define self.__image_url_list as a queue instead of a list to support multi-threaded downloading
            self.__image_url_list = []      # list that stores the collected image URLs
            self.__number = number          # number of images to be downloaded
            self.__api_code = api_code      # which site's API to use; currently only one is supported
            self.__keyword = keyword        # keyword the images are searched by
            self.__image_type = image_type  # image type: 0 = all kinds, 1 = GIF, 2 = static images
            self.__filepath = filepath      # directory where the images are stored
            self.__verbose = verbose        # whether to print verbose download info
        def __get_image_url(self):
            # page through the API results until enough URLs are collected or no more pages are available
            for i in range(1, 51):
                api_url = API_URL_DICT[self.__api_code].format(keyword=self.__keyword, image_type=self.__image_type, page=i)
                rq = None
                try:
                    rq = requests.get(api_url, headers={'User-Agent': random.choice(USER_AGENTS)}, timeout=5)
                    response_dict = rq.json()
                    self.__image_url_list.extend([entry['image_url'] for entry in response_dict['data']['list']])
                    # TODO: could multiple threads be used here? This is IO-intensive work.
                    if len(self.__image_url_list) >= self.__number:
                        break
                    if response_dict['data']['more'] != 1:
                        break
                except requests.RequestException as e:
                    print(e)
                finally:
                    # rq is None when the request itself failed, so only close an actual response
                    if rq is not None:
                        rq.close()
    
        def __download(self):
            self.__get_image_url()
            print('Now downloading images from https://www.doutula.com ...')
            if not os.path.exists(self.__filepath):
                os.makedirs(self.__filepath)
            # never try to download more images than the API actually returned
            count = min(self.__number, len(self.__image_url_list))
            for i in range(count):
                image_url = self.__image_url_list[i]
                if self.__verbose:
                    print("Downloading image: {}".format(image_url))
                extension = '.' + image_url.split('.')[-1]
                try:
                    filename = os.path.join(self.__filepath, '{0}{1}'.format(self.__keyword, i) + extension)
                    # self.image_download(filename, image_url)
                    download_rq = requests.get(image_url, headers={'User-Agent': random.choice(USER_AGENTS)}, timeout=5)
                    with open(filename, 'wb') as f:
                        f.write(download_rq.content)
                except Exception as e:
                    print(e)
                time.sleep(1)
            print("Images about {} have been downloaded.".format(self.__keyword))

        def run(self):
            self.__download()
    
        # @downloader_logger
        # def image_download(self,filename,image_url):
        #     download_rq=requests.get(image_url)
        #     with open(filename,'wb') as f :
        #                 f.write(download_rq.content)
    
    
    def get_parser():
        parser = argparse.ArgumentParser(description="download interesting emoji images from www.doutula.com via the command line")
        parser.add_argument('keywords', metavar='KEYWORD', type=str, nargs='*',
            help='the keywords to be searched')
        parser.add_argument('-t', '--type', type=int, default=0, choices=range(0, 3),
            help='image type to download: 0 for all, 1 for GIF, 2 for static images')
        parser.add_argument('-n', '--num', type=int, default=50,
            help='number of images to download')
        parser.add_argument('-c', '--clear', action='store_true',
            help='clear the log file before downloading')
        parser.add_argument('-d', '--dir', type=str,
            help='where to store the images, default is ./tmp/keyword/')
        parser.add_argument('-v', '--verbose', action='store_true',
            help='show verbose download info')
    
        return parser
    
    def download(**kwargs):
        for keyword in kwargs['keywords']:
            if kwargs['dir']:
                dirpath = os.path.join(kwargs['dir'], keyword)
            else:
                dirpath = os.path.join('./tmp', keyword)
            print('Making dir:', dirpath)
            downloader = Downloader(kwargs['num'], keyword, kwargs['type'], dirpath, kwargs['verbose'])
            downloader.run()
    
    def command_line_runner():
        parser = get_parser()
        kwargs = vars(parser.parse_args())
    
        if kwargs['clear']:
            with open('./biaoqingDownloader.log', 'w') as f:
                f.truncate()
    
        if not kwargs['keywords']:
            # if no keywords are given, print the help info and return
            parser.print_help()
            return
    
        download(**kwargs)
    
    
    
    if __name__ == '__main__':
        ###
        # test this downloader
        ###
        # downloader = Downloader(20, '金馆长', 0, './tmp', False)
        # downloader.run()
        command_line_runner()
    

    The code has been uploaded to GitHub: link
    Usage: python downloader.py -h
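
    For example, python downloader.py 金馆长 -n 20 -t 1 -v should fetch 20 GIFs for that keyword into ./tmp/金馆长/. The Downloader class can also be driven directly from Python; a minimal sketch, with the keyword and output directory chosen here only as placeholders:

    from downloader import Downloader

    # download 20 GIFs for one keyword; keyword and directory are placeholders
    downloader = Downloader(number=20, keyword='金馆长', image_type=1,
                            filepath='./tmp/金馆长', verbose=True)
    downloader.run()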

    The affairs of the world stretch on without end; how limited our time is!
  • Original post: https://www.cnblogs.com/bobliao/p/9205172.html