zoukankan      html  css  js  c++  java
  • python多线程采集图片

    cmd中运行

    >python untitled2.py <图片所在网页的URL>

    import requests
    import threading
    from bs4 import BeautifulSoup
    import sys
    import os

    # Exactly one command-line argument (the page URL) is required.
    if len(sys.argv) != 2:
        print("Usage : " )
        print(" python main.py [URL]" )
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing under "python -S" or in frozen builds.
        sys.exit(1)
    # config-start
    url = sys.argv[1]  # page URL given on the command line
    threadNumber = 20  # thread limit consulted by the download loop
    # config-end
    def getContent(url, timeout=30):
        """Fetch ``url`` and return the response body as text.

        The body is decoded with the encoding that requests detects from the
        content (``response.apparent_encoding``).

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up; passed
                straight to ``requests.get``.

        Returns:
            The decoded body on success. On any failure the exception is
            printed and its string form is returned instead; this function
            never raises.
        """
        try:
            # Without a timeout requests waits indefinitely on a stalled server.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text
        except Exception as e:
            print(e)
            return str(e)
    def getTitle(soup):
        """Return the page title, or ``"UnTitled"`` when none is usable.

        Args:
            soup: Parsed document exposing a ``title`` attribute whose
                ``string`` attribute holds the title text.

        Returns:
            The title text. ``"UnTitled"`` is returned when the document has
            no title element, or when the element's ``string`` is ``None`` or
            empty, so callers always receive something they can turn into a
            folder name.
        """
        titleTag = getattr(soup, "title", None)
        text = getattr(titleTag, "string", None)
        # ``string`` can be None even when the element exists; fall back
        # rather than handing None to the caller.
        if not text:
            return "UnTitled"
        return text
    def getImageLinks(soup, baseUrl=None):
        """Collect absolute http(s) URLs for every ``<img>`` in the document.

        Args:
            soup: Parsed document; must provide ``find_all``.
            baseUrl: URL that relative ``src`` values are resolved against.
                Defaults to the module-level ``url`` (the page being scraped).

        Returns:
            A list of absolute image URLs in document order. Images without a
            ``src`` are skipped, as are sources that do not resolve to an
            http/https address (for example inline ``data:`` images).
        """
        from urllib.parse import urljoin, urlparse

        if baseUrl is None:
            baseUrl = url
        result = []
        for img in soup.find_all("img"):
            src = img.get("src")
            if not src:
                continue
            # urljoin handles absolute, root-relative ("/x"), protocol-relative
            # ("//host/x") and page-relative ("x", "../x") sources alike; plain
            # string concatenation only got the last form right by accident.
            absolute = urljoin(baseUrl, src.strip())
            if urlparse(absolute).scheme in ("http", "https"):
                result.append(absolute)
        return result
    def makeDirectory(dicName):
        """Create the directory ``dicName`` unless that path already exists."""
        if os.path.exists(dicName):
            # Something is already there; leave it untouched.
            return
        os.mkdir(dicName)
    def downloadImage(imgUrl,savePath):
        """Download ``imgUrl`` into the directory ``savePath``.

        The file name is the last "/"-separated piece of the URL after it has
        been passed through ``formatFileName``; if that piece is empty (the
        URL ends with "/") the name ``image`` is used instead.

        Args:
            imgUrl: Absolute URL of the image.
            savePath: Existing directory to write the file into.

        Returns:
            None. Network and file errors are printed rather than raised, so
            one bad image does not kill its worker thread with a traceback.
        """
        local_filename = formatFileName(imgUrl.split('/')[-1]) or "image"
        target = os.path.join(savePath, local_filename)
        try:
            # Both context managers guarantee the connection and the file are
            # released even when the transfer fails part-way through.
            with requests.get(imgUrl, stream=True, timeout=30) as r:
                # Check the status before opening the file so an HTTP error
                # page is never saved under an image name.
                r.raise_for_status()
                with open(target, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
        except (requests.RequestException, OSError) as e:
            print(e)
    def formatFileName(fileName):
        """Return ``fileName`` with unsafe file-name characters replaced by "_".

        The characters replaced are ``/ \\ : * ? " > < |`` and the space.

        Args:
            fileName: Candidate file or folder name.

        Returns:
            A copy of ``fileName`` in which each listed character has been
            replaced by an underscore.
        """
        # The backslash and the double quote must be escaped/quoted correctly;
        # written unescaped they terminate the literal early and the module
        # no longer parses.
        forbidden = '/\\:*?"><| '
        return fileName.translate(str.maketrans(forbidden, "_" * len(forbidden)))
    def threadFunction(imgSrc,directoryName):
        """Thread entry point: download ``imgSrc`` into ``directoryName``."""
        downloadImage(imgUrl=imgSrc, savePath=directoryName)
        
    class myThread (threading.Thread):
        """Thread that passes one image URL and a target folder to threadFunction."""

        def __init__(self, imgSrc, directoryName):
            """Remember the image URL and destination folder for ``run``."""
            super().__init__()
            self.directoryName = directoryName
            self.imgSrc = imgSrc

        def run(self):
            """Call threadFunction with the stored URL and folder."""
            threadFunction(self.imgSrc, self.directoryName)
    def getPrefix(url):
        """Return the first four "/"-separated pieces of ``url``, ending in "/".

        Example: ``http://domain/dir/xxx.jpg`` -> ``http://domain/dir/``.
        """
        leading = url.split("/")[:4]
        return "/".join(leading) + "/"
    def getDomain(url):
        """Return the first three "/"-separated pieces of ``url``, ending in "/".

        Example: ``http://domain/dir/xxx.jpg`` -> ``http://domain/``.
        """
        # Only the first three pieces are needed, so stop splitting after them.
        pieces = url.split("/", 3)[:3]
        return "/".join(pieces) + "/"
    # Fetch and parse the page, then derive the values the rest of the script uses.
    content = getContent(url)
    prefix = getPrefix(url)  # not used below; kept so the module-level name still exists
    domain = getDomain(url)
    soup = BeautifulSoup(content, "html.parser")
    images = getImageLinks(soup)
    title = getTitle(soup)
    title = formatFileName(title)
    print(u"页面标题 : " , title )
    print(u"本页图片数量 :",len(images))
    print(u"正在创建文件夹以用来保存所有图片")
    # The sanitized page title doubles as the output folder name.
    makeDirectory(title)
    threads = []
    for image in images:
        print(u"图片地址 : " + image)
        threads.append(myThread(image, title))
    # Start the downloads while keeping at most threadNumber workers alive.
    # Waiting with join(timeout) blocks the main thread instead of spinning a
    # CPU core in a tight polling loop.
    workerLimit = max(1, threadNumber)
    running = []
    for t in threads:
        running = [w for w in running if w.is_alive()]
        while len(running) >= workerLimit:
            running[0].join(0.1)
            running = [w for w in running if w.is_alive()]
        t.start()
        running.append(t)
    # Workers are non-daemon threads, so the interpreter waits for the
    # remaining downloads before exiting.
    print(u"所有图片已加入下载队列 ! 正在下载...")



  • 相关阅读:
    韩式英语
    Daily dictation 听课笔记
    words with same pronunciation
    you will need to restart eclipse for the changes to take effect. would you like to restart now?
    glottal stop(britain fountain mountain)
    education 的发音
    第一次用Matlab 的lamada语句
    SVN的switch命令
    String的split
    SVN模型仓库中的资源从一个地方移动到另一个地方的办法(很久才解决)
  • 原文地址:https://www.cnblogs.com/chenlove/p/8991828.html
Copyright © 2011-2022 走看看