zoukankan      html  css  js  c++  java
  • 【Python】nvshens按目录批量下载图片爬虫1.00(单线程版)

    # nvshens按目录批量下载图片爬虫1.00(单线程版)
    from bs4 import BeautifulSoup
    import requests
    import datetime
    import urllib.request
    import os
    
    # HTTP headers sent with the page requests made in crawl(); the User-Agent
    # string imitates a browser.
    # NOTE(review): 'MEIE' looks like a typo for 'MSIE' — confirm before changing,
    # since the string is sent to the server as-is.
    user_agent='Mozilla/4.0 (compatible;MEIE 5.5;windows NT)'
    headers={'User-Agent':user_agent}
    
    # Image URLs found so far: filled by crawl(), drained by downloadPics()
    pictures=[]
    
    # 不断追逐,直到结束
    def crawl(url, max_retries=5, timeout=30):
        """Walk a gallery page by page, queue its image URLs, then download them.

        Starting at ``url``, every ``gallery_wrapper`` element is scanned for
        ``<img>`` sources (appended to the module-level ``pictures`` list) and
        for a "next page" link. When no further page is found, ``downloadPics()``
        is called to fetch everything that was queued.

        Args:
            url: Address of the first page of the gallery.
            max_retries: How many consecutive failures of the same page are
                tolerated before crawling stops for this gallery.
            timeout: Seconds to wait for the server before a request is
                treated as failed.

        The pages are followed in a loop rather than by recursion, so neither a
        long gallery nor a persistently failing page can exhaust the call stack.
        """
        page_url = url
        visited = set()
        failures = 0

        while page_url is not None and page_url not in visited:
            print("爬取页面"+page_url)

            try:
                rsp = requests.get(page_url, headers=headers, timeout=timeout)
                # Pass the raw bytes: BeautifulSoup ignores from_encoding (and
                # warns) when it is given an already-decoded str.
                soup = BeautifulSoup(rsp.content, 'html.parser', from_encoding='utf-8')

                found = []
                next_url = None
                for wrapper in soup.find_all(class_="gallery_wrapper"):
                    # Collect the image addresses of this page
                    for img in wrapper.find_all('img'):
                        src = img.get("src")
                        if src:
                            found.append(src)

                    # Look for the link to the next page
                    for link in wrapper.find_all('a', class_='a1'):
                        href = link.get("href")
                        if link.string == '下一页' and href and '.html' in href:
                            next_url = 'https://www.nvshens.com' + href
            except Exception as e:
                failures += 1
                if failures > max_retries:
                    print("页面"+page_url+"连续失败,放弃爬取: "+str(e))
                    break
                print("发生异常。重新爬行")
                continue

            # The page was parsed completely: only now publish its images, so
            # that a retried page never queues the same picture twice.
            failures = 0
            visited.add(page_url)
            for src in found:
                print(src)
            pictures.extend(found)

            if next_url is None:
                break
            print("前往下一页")
            page_url = next_url

        print('爬取结束,开始下载...')
        downloadPics()
        print('下载结束.')
    
    # 下载图片到本地
    def downloadPics(max_retries=3):
        """Download every URL queued in the module-level ``pictures`` list.

        Each image is saved as ``./<folder>/<name>``, where ``folder`` and
        ``name`` are the last two path segments of its URL. The list is emptied
        as a side effect.

        Args:
            max_retries: Maximum number of download attempts per image. An
                image that keeps failing is reported and dropped instead of
                being retried forever.
        """
        attempts = {}

        while pictures:
            pic = pictures.pop()

            # Skip entries that cannot be mapped to <folder>/<name>
            parts = pic.split('/') if pic else []
            if len(parts) < 2 or not parts[-1]:
                continue
            name = parts[-1]
            folder_path = os.path.join('.', parts[-2])

            try:
                # exist_ok avoids the check-then-create race of a separate
                # os.path.exists() test
                os.makedirs(folder_path, exist_ok=True)

                # The context manager closes the connection even on errors
                with urllib.request.urlopen(pic, timeout=30) as rsp:
                    img = rsp.read()
                with open(os.path.join(folder_path, name), 'wb') as f:
                    f.write(img)
                print('图片'+pic+'下载完成')
            except Exception as e:
                attempts[pic] = attempts.get(pic, 0) + 1
                if attempts[pic] < max_retries:
                    print('图片'+pic+'下载异常,塞回重试')
                    # Re-queue at the front so the other images are tried
                    # before this one is attempted again
                    pictures.insert(0, pic)
                else:
                    print('图片'+pic+'下载失败,已放弃: '+str(e))
    
    # 循环下载图片
    def main(start=10000, end=30000):
        """Crawl and download a range of galleries, one after another.

        Args:
            start: First gallery id to process (inclusive).
            end: Gallery id at which to stop (exclusive).

        For every id the gallery URL is built, handed to ``crawl()``, and the
        elapsed wall-clock time is printed in whole seconds.
        """
        for gallery_id in range(start, end):
            url = 'https://www.nvshens.com/g/' + str(gallery_id) + '/'
            starttime = datetime.datetime.now()
            crawl(url)
            elapsed = datetime.datetime.now() - starttime
            # timedelta.seconds is only the seconds component and drops whole
            # days; total_seconds() is the full duration.
            print("下载用时" + str(int(elapsed.total_seconds())))
            print(url + '的下载结束.')
    
    # Entry point: the crawl starts as soon as this file is executed (this call
    # is unguarded, so it also runs when the file is imported)
    main()
  • 相关阅读:
    面向对象的三个基本特征
    OGRE启动过程详解(OGRE HelloWorld程序原理解析)
    Bullet核心类介绍(Bullet 2.82 HelloWorld程序及其详解,附程序代码)
    windows下Bullet 2.82编译安装(Bullet Physics开发环境配置)
    1303: Decimal
    分组背包,每组最多选1个
    椒盐效果
    自我介绍
    题目1539:师弟
    upper_bound()
  • 原文地址:https://www.cnblogs.com/heyang78/p/8696961.html
Copyright © 2011-2022 走看看