zoukankan      html  css  js  c++  java
  • 【Python】nvshens按目录批量下载图片爬虫1.00(单线程版)

    # nvshens按目录批量下载图片爬虫1.00(单线程版)
    from bs4 import BeautifulSoup
    import requests
    import datetime
    import urllib.request
    import os
    
    # Browser-like User-Agent string sent with page requests.
    # NOTE(review): "MEIE" looks like a typo for "MSIE" - left untouched, confirm.
    user_agent = 'Mozilla/4.0 (compatible;MEIE 5.5;windows NT)'
    headers = {
        'User-Agent': user_agent,
    }

    # Shared queue of image URLs that were found but not downloaded yet
    pictures = list()
    
    # Walk a gallery page by page, then download everything that was found
    def crawl(url, *, max_retries=5, timeout=30):
        """Collect the image URLs of one gallery and download them.

        Starting at ``url``, every page is fetched and parsed.  The ``src`` of
        each ``<img>`` inside a ``gallery_wrapper`` element is queued in the
        module-level ``pictures`` list.  The next page is the ``a.a1`` link
        whose text is '下一页' and whose href contains '.html'; when no such
        link exists the walk ends and ``downloadPics()`` is called once.

        The walk is a loop rather than recursion, so neither a long gallery
        nor repeated failures can exhaust the interpreter's recursion limit.

        Args:
            url: Address of the first page of the gallery.
            max_retries: How many times in a row one page may be re-requested
                after an exception before pagination is abandoned.  Pictures
                collected up to that point are still downloaded.
            timeout: Seconds handed to ``requests.get`` so a stalled
                connection raises instead of blocking forever.
        """
        pageUrl = url
        visited = set()
        failures = 0

        while pageUrl is not None:
            print("爬取页面"+pageUrl)

            # Only fetching and parsing sit inside the try: a problem while
            # downloading must not trigger a re-crawl of this page.
            try:
                rsp = requests.get(pageUrl, headers=headers, timeout=timeout)
                # rsp.text is already decoded, so no from_encoding is passed
                # (BeautifulSoup ignores it for str input and emits a warning).
                soup = BeautifulSoup(rsp.text, 'html.parser')

                # Stage results locally so a failure halfway through the page
                # cannot leave duplicates in the shared queue on retry.
                found = []
                nextUrl = None
                for wrapper in soup.find_all(class_="gallery_wrapper"):
                    for img in wrapper.find_all('img'):
                        src = img.get("src")
                        if src:  # skip <img> tags without a usable src
                            found.append(src)

                    # Look for the "next page" link
                    for link in wrapper.find_all('a', class_='a1'):
                        href = link.get("href")
                        if link.string == '下一页' and href and '.html' in href:
                            nextUrl = 'https://www.nvshens.com'+href
            except Exception as e:
                # Deliberately broad: any failure on this page is retried,
                # but only a bounded number of times.
                failures += 1
                if failures > max_retries:
                    print("页面"+pageUrl+"连续失败,停止翻页: "+repr(e))
                    break
                print("发生异常。重新爬行")
                continue

            failures = 0
            visited.add(pageUrl)
            for src in found:
                print(src)
            pictures.extend(found)

            # A "next" link pointing at an already-seen page would loop forever
            if nextUrl in visited:
                nextUrl = None
            if nextUrl is not None:
                print("前往下一页")
            pageUrl = nextUrl

        print('爬取结束,开始下载...')
        downloadPics()
        print('下载结束.')
    
    # Download the queued pictures to local disk
    def downloadPics(*, max_retries=3, timeout=30):
        """Drain the module-level ``pictures`` queue, saving each image.

        Every URL is stored as ``./<folder>/<name>`` where ``name`` and
        ``folder`` are the last and second-to-last '/'-separated parts of the
        URL.  A failed download is put back in the queue, but only up to
        ``max_retries`` times per URL, so a permanently broken link can no
        longer keep the loop spinning forever.

        Args:
            max_retries: Number of extra attempts granted to one URL after its
                first failure before it is dropped.
            timeout: Seconds handed to ``urllib.request.urlopen``.
        """
        failures = {}  # URL -> number of failed attempts so far

        while pictures:
            pic = pictures.pop()

            parts = pic.split('/')
            # The URL comes from scraped HTML: refuse anything that does not
            # yield a file name, or whose folder part would escape the
            # working directory.
            if len(parts) < 2 or not parts[-1] or parts[-2] == '..':
                print('图片'+pic+'地址无法解析,跳过')
                continue
            name = parts[-1]
            folder = parts[-2]

            try:
                # Create the target directory if needed (no exists/create race)
                os.makedirs('./'+folder, exist_ok=True)
                # The with-block closes the HTTP response even if read() fails
                with urllib.request.urlopen(pic, timeout=timeout) as rsp:
                    img = rsp.read()
                with open('./'+folder+"/"+name, 'wb') as f:
                    f.write(img)
                print('图片'+pic+'下载完成')
            except Exception as e:
                failures[pic] = failures.get(pic, 0) + 1
                if failures[pic] > max_retries:
                    print('图片'+pic+'多次下载失败,放弃: '+repr(e))
                else:
                    print('图片'+pic+'下载异常,塞回重试')
                    # Re-queue at the far end so the other pictures are tried
                    # before this one is attempted again.
                    pictures.insert(0, pic)
    
    # Crawl a range of gallery ids one after another
    def main(start=10000, stop=30000):
        """Crawl every gallery whose numeric id lies in ``[start, stop)``.

        For each id the gallery URL is built, ``crawl`` is run on it, and the
        elapsed wall-clock time in whole seconds is printed.

        Args:
            start: First gallery id to crawl (inclusive).
            stop: Gallery id at which to stop (exclusive).
        """
        for i in range(start, stop):
            url = 'https://www.nvshens.com/g/'+str(i)+'/'
            starttime = datetime.datetime.now()
            crawl(url)
            elapsed = datetime.datetime.now() - starttime
            # total_seconds() covers the whole duration; the .seconds
            # attribute silently drops full days.
            print("下载用时"+str(int(elapsed.total_seconds()))+"")
            print(''+url+'的下载结束.')
    
    # Script entry point: start crawling only when the file is executed
    # directly, so importing it does not kick off a crawl.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    Python学习札记(十五) 高级特性1 切片
    LeetCode Longest Substring Without Repeating Characters
    Python学习札记(十四) Function4 递归函数 & Hanoi Tower
    single number和变体
    tusen 刷题
    实验室网站
    leetcode 76. Minimum Window Substring
    leetcode 4. Median of Two Sorted Arrays
    leetcode 200. Number of Islands 、694 Number of Distinct Islands 、695. Max Area of Island 、130. Surrounded Regions 、434. Number of Islands II(lintcode) 并查集 、178. Graph Valid Tree(lintcode)
    刷题注意事项
  • 原文地址:https://www.cnblogs.com/heyang78/p/8696961.html
Copyright © 2011-2022 走看看