zoukankan      html  css  js  c++  java
  • 【python】nvshens按目录图片批量下载爬虫1.00(多线程版)

    # nvshens按目录图片批量下载爬虫1.00(多线程版)
    from bs4 import BeautifulSoup
    import requests
    import datetime
    import urllib.request
    import os
    import threading
    
    # Request headers sent with every page fetch so the crawler identifies
    # itself as a browser.
    # NOTE(review): "MEIE" looks like a typo for "MSIE"; the string is left
    # untouched because it is sent to the server verbatim.
    user_agent = 'Mozilla/4.0 (compatible;MEIE 5.5;windows NT)'
    headers = {
        'User-Agent': user_agent,
    }
    
    
    # Download every queued picture URL into a local folder.
    def downloadPics(pictures, max_retries=3, timeout=30):
        """Drain ``pictures`` and save each URL as ``./<folder>/<name>``.

        ``<name>`` and ``<folder>`` are the last and second-to-last
        ``/``-separated segments of the URL.  The list is consumed in place
        (entries are popped from the end), so it is empty on return.

        Args:
            pictures: Mutable list of picture URLs. Falsy entries (``None``
                or ``''``) are dropped without a download attempt.
            max_retries: Number of additional attempts after the first
                failure. A URL that keeps failing is abandoned instead of
                being retried forever.
            timeout: Seconds handed to ``urlopen`` so a stalled connection
                cannot block the calling thread indefinitely.

        Returns:
            None. Progress and failures are reported with ``print``.
        """
        attempts_allowed = max(0, max_retries) + 1

        while pictures:
            pic = pictures.pop()
            if not pic:
                # Nothing to fetch for an empty entry.
                continue

            segments = pic.split('/')
            name = segments[-1]
            # A URL without any '/' has no folder segment; it fails in
            # urlopen below and is reported like any other bad URL.
            folder = segments[-2] if len(segments) > 1 else ''
            target_dir = os.path.join('.', folder)
            target_file = os.path.join(target_dir, name)

            for attempt in range(attempts_allowed):
                try:
                    # exist_ok avoids the check-then-create race when several
                    # threads download into the same folder.
                    os.makedirs(target_dir, exist_ok=True)
                    # The context manager closes the response even on error.
                    with urllib.request.urlopen(pic, timeout=timeout) as rsp:
                        img = rsp.read()
                    with open(target_file, 'wb') as f:
                        f.write(img)
                except Exception as e:
                    if attempt + 1 < attempts_allowed:
                        print('图片'+pic+'下载异常,塞回重试')
                    else:
                        print('图片'+pic+'下载失败,已放弃: '+str(e))
                else:
                    print('图片'+pic+'下载完成')
                    break
    
    
    # Download worker thread class.
    class dldThread(threading.Thread):
        """Crawl one gallery page by page, then download its pictures.

        Starting from ``url``, each page is fetched and scanned for ``<img>``
        tags inside elements with class ``gallery_wrapper``; the "next page"
        link (an ``<a class="a1">`` whose text is '下一页' and whose href
        contains '.html') is followed until none is found.  The collected
        picture URLs are then passed to ``downloadPics``.

        Attributes:
            url: Page to fetch next; the sentinel string "none" once
                crawling has finished.
            pictures: Picture URLs collected so far.
        """

        # Consecutive failures tolerated on one page before crawling stops.
        MAX_PAGE_RETRIES = 3
        # Seconds to wait for the server; without a timeout requests.get
        # may block forever.
        REQUEST_TIMEOUT = 30

        def __init__(self, name, url):
            """Create a worker named ``name`` that starts crawling at ``url``."""
            # Thread.__init__ already stores the name, so no extra
            # assignment to self.name is needed.
            threading.Thread.__init__(self, name=name)
            self.url = url
            self.pictures = []

        def _parse_page(self, html):
            """Return ``(picture_urls, next_url)`` extracted from ``html``.

            ``next_url`` is the sentinel "none" when the page has no usable
            next-page link.
            """
            # The markup is already a str, so from_encoding would be ignored
            # (with a warning) and is therefore not passed.
            soup = BeautifulSoup(html, 'html.parser')
            found = []
            next_url = "none"

            for wrapper in soup.find_all(class_="gallery_wrapper"):
                # Collect the picture URLs of this page.
                for img in wrapper.find_all('img'):
                    src = img.get("src")
                    if not src:
                        # An <img> without src has nothing to download.
                        continue
                    print(src)
                    found.append(src)

                # Look for the next page.
                for link in wrapper.find_all('a', class_='a1'):
                    href = link.get("href")
                    if link.string == '下一页' and href and '.html' in href:
                        next_url = 'https://www.nvshens.com' + href

            return found, next_url

        def run(self):
            """Crawl all pages of the gallery, then download what was found.

            A page that fails is retried up to ``MAX_PAGE_RETRIES`` times;
            after that crawling stops, but the pictures gathered from earlier
            pages are still downloaded.
            """
            failures = 0

            while self.url != "none":
                current = self.url
                print("线程"+self.name+"开始爬取页面"+current)

                try:
                    rsp = requests.get(current, headers=headers,
                                       timeout=self.REQUEST_TIMEOUT)
                    found, next_url = self._parse_page(rsp.text)
                except Exception as e:
                    failures += 1
                    if failures <= self.MAX_PAGE_RETRIES:
                        # self.url is unchanged, so the same page is retried.
                        print("线程"+self.name+"发生异常。重新爬行")
                        continue
                    print("线程"+self.name+"多次失败,放弃页面"+current+": "+str(e))
                    break

                failures = 0
                # Results are added only after a page parsed completely, so a
                # retried page never contributes duplicates.
                self.pictures.extend(found)
                self.url = next_url
                if next_url != "none":
                    print("线程"+self.name+"前往下一页")

            self.url = "none"
            print("线程"+self.name+'爬取结束,开始下载...')
            downloadPics(self.pictures)
            print("线程"+self.name+'下载图片结束.')
    
    
    # Crawl a range of gallery ids with a bounded number of worker threads.
    def main(start=10000, stop=20000, max_threads=16):
        """Start one ``dldThread`` per gallery id in ``range(start, stop)``.

        At most ``max_threads`` workers are alive at any time; starting one
        thread per id all at once would create thousands of simultaneous
        threads and connections.

        Args:
            start: First gallery id (inclusive).
            stop: Last gallery id (exclusive).
            max_threads: Upper bound on concurrently running workers;
                values below 1 are treated as 1.

        Returns:
            None, after every started worker has finished.
        """
        limit = max(1, max_threads)
        running = []

        for i in range(start, stop):
            url = 'https://www.nvshens.com/g/'+str(i)+'/'

            th = dldThread(name=str(i), url=url)
            th.start()
            running.append(th)

            # Forget workers that are already done, then wait for the oldest
            # one if the window is still full.
            running = [worker for worker in running if worker.is_alive()]
            if len(running) >= limit:
                running.pop(0).join()

        for worker in running:
            worker.join()
    
    # Script entry point: start crawling only when this file is executed
    # directly, so importing it does not launch the crawl as a side effect.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    GoldenGate 19.1实时文本文件加载攻略
    windows 10 excel 打开超连接提示 组织策略阻止...
    验证ogg同步数据库表无主键表且目标表包含隐藏字段
    配置ogg从Oracle到PostgreSQL的同步复制json数据
    pi
    GoldenGate 19.1 发布
    ogg同步DDL时,源和目标端表空间名称不同的解决思路
    总目录索引(开发精华总结)
    Spring Cloud Nacos分布式配置中心
    Spring Cloud Nacos&Feign负载均衡
  • 原文地址:https://www.cnblogs.com/heyang78/p/8696984.html
Copyright © 2011-2022 走看看