zoukankan      html  css  js  c++  java
  • python多线程采集图片

    cmd中运行

    >python untitled2.py    图片的网站

    import requests 
    import threading 
    from bs4 import BeautifulSoup 
    import sys 
    import os 
    
    if len(sys.argv) != 2: 
        print("Usage : " )
        print(" python main.py [URL]" )
        exit(1) 
    # config-start 
    url = sys.argv[1] 
    threadNumber = 20 
    # 设置线程数 # config-end 
    def getContent(url): 
        try: 
            response = requests.get(url) 
            response.raise_for_status() 
            response.encoding = response.apparent_encoding 
            return response.text 
        except Exception  as e:
            print(e)
            return str(e) 
    def getTitle(soup): 
        try: 
            return soup.title.string 
        except: 
            return "UnTitled" 
    def getImageLinks(soup): 
        imgs = soup.findAll("img") 
        result = [] 
        for img in imgs: 
            try: 
                src = img['src'] 
                if src.startswith("http"): 
                    result.append(img['src']) 
                else: 
                    result.append(domain + img['src']) 
            except: 
                continue 
        return result 
    def makeDirectory(dicName): 
        if not os.path.exists(dicName): 
            os.mkdir(dicName)
    def downloadImage(imgUrl,savePath): 
        local_filename = imgUrl.split('/')[-1] 
        local_filename = formatFileName(local_filename) 
        r = requests.get(imgUrl, stream=True) 
        counter = 0 
        if not savePath.endswith("/"): 
            savePath += "/" 
        f = open(savePath + local_filename, 'wb') 
        for chunk in r.iter_content(chunk_size=1024): 
            if chunk: 
                f.write(chunk) 
                f.flush() 
                counter += 1 
        f.close()
    def formatFileName(fileName): 
        fileName = fileName.replace("/","_") 
        fileName = fileName.replace("\","_") 
        fileName = fileName.replace(":","_") 
        fileName = fileName.replace("*","_") 
        fileName = fileName.replace("?","_") 
        fileName = fileName.replace(""","_") 
        fileName = fileName.replace(">","_") 
        fileName = fileName.replace("<","_") 
        fileName = fileName.replace("|","_") 
        fileName = fileName.replace(" ","_") 
        return fileName
    def threadFunction(imgSrc,directoryName): 
        downloadImage(imgSrc,directoryName) 
        
    class myThread (threading.Thread): 
        def __init__(self, imgSrc, directoryName): 
            threading.Thread.__init__(self) 
            self.imgSrc = imgSrc 
            self.directoryName = directoryName 
        def run(self): 
            threadFunction(self.imgSrc, self.directoryName) 
    def getPrefix(url): 
        # http://doamin/xxx.jpg 
        return ''.join(i+"/" for i in url.split("/")[0:4]) 
    def getDomain(url): 
        return ''.join(i+"/" for i in url.split("/")[0:3]) 
    content = getContent(url) 
    prefix = getPrefix(url) 
    domain = getDomain(url) 
    soup = BeautifulSoup(content, "html.parser") 
    images = getImageLinks(soup) 
    title = getTitle(soup) 
    title = formatFileName(title) 
    print(u"页面标题 : " , title )
    print(u"本页图片数量 :",len(images))
    print(u"正在创建文件夹以用来保存所有图片") 
    makeDirectory(title) 
    threads = [] 
    for image in images: 
        print(u"图片地址 : " + image) 
        threads.append(myThread(image, title)) 
    for t in threads: 
        t.start() 
        while True: 
            if(len(threading.enumerate()) < threadNumber): 
                break 
    print(u"所有图片已加入下载队列 ! 正在下载...")
    

    关注公众号:Python爬虫数据分析挖掘,学习更多python知识

    耐得住寂寞,才能登得顶
    Gitee码云:https://gitee.com/lyc96/projects
  • 相关阅读:
    Windows下通过Xmanager远程桌面控制Linux
    kk
    Wingware.WingIDE.Professional.v3.2.9.1破解并激活
    CentOS LInux启动关闭和服务管理(zt)
    Windows 7开启ping
    apache和cgi问题
    CentOS启动时自动加载内核模块
    bash shell执行、排错、启动配置文件
    程序员都应该好好想想!
    有点意思啊!
  • 原文地址:https://www.cnblogs.com/chenlove/p/14038666.html
Copyright © 2011-2022 走看看