zoukankan      html  css  js  c++  java
  • 抓取网页图片-以本地IIS网页为实践对象

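    #Scrape the images of a web page
    #Works for pages structured as div#main > li, where each li holds an <a href> (original image) and an <img src> (thumbnail)
    #Small single-page image scraper
    #version:V1.0
    #author:yxmichael
    #last updated:20210511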
    
    import requests
    from bs4 import BeautifulSoup
    import os,shutil
    import time
    
    def getHtmlText(url,code='utf-8'):
        """Fetch ``url`` and return the response body decoded as ``code``.

        The request is sent with the module-level ``my_headers`` dict (assigned
        in ``main``) and a 30-second timeout.

        Args:
            url: Address of the page to download.
            code: Encoding used to decode the response body.

        Returns:
            The page text, or an empty string when the request fails or the
            server answers with an HTTP error status.
        """
        try:
            r = requests.get(url,timeout = 30,headers = my_headers)
            # raise_for_status is a method: without the call parentheses the
            # status check never ran and error pages were returned as content.
            r.raise_for_status()
        except requests.RequestException:
            return ""
        r.encoding = code
        return r.text
    
    def parseHtml(nlist,html):
        """Collect image entries from the ``<li>`` items inside ``div#main``.

        For every ``<li>`` whose first ``<a>`` carries an ``href`` other than
        ``'#'`` and which contains an ``<img>`` with a ``src``, the triple
        ``[img_name, a_href, img_src]`` is appended to ``nlist``; ``img_name``
        is the last ``/``-separated segment of ``a_href``.

        Args:
            nlist: List extended in place with the entries found.
            html: HTML text of the page to parse.

        Returns:
            None. ``nlist`` is left untouched when the page has no ``div``
            with ``id="main"``.
        """
        soup = BeautifulSoup(html,'html.parser')
        div_main = soup.find('div',attrs={'id':'main'})
        if div_main is None:
            return
        for li in div_main.find_all('li'):
            anchor = li.find('a')
            image = li.find('img')
            # Skip an incomplete item instead of aborting the whole page.
            if anchor is None or image is None:
                continue
            a_href = anchor.get('href')
            img_src = image.get('src')
            if not a_href or a_href == '#' or not img_src:
                continue
            img_name = a_href.split('/')[-1]
            nlist.append([img_name,a_href,img_src])
    
    def delOldDir(dir_path):
        """Remove the directory tree at ``dir_path``; do nothing if the path is absent.

        Args:
            dir_path: Path of the directory to delete together with its contents.
        """
        if not os.path.exists(dir_path):
            return
        shutil.rmtree(dir_path)
    
    def downImg(nlist,nums,site_url,dir_path):
        """Download the original images of the first ``nums`` entries.

        Args:
            nlist: Entries shaped ``[img_name, a_href, img_src]``.
            nums: Number of leading entries of ``nlist`` to download.
            site_url: Prefix prepended to each entry's ``a_href``.
            dir_path: Directory receiving the files; created when missing.

        Side effects:
            Changes the process working directory to ``dir_path`` and prints
            progress to stdout. An image whose request fails is reported and
            skipped; the remaining images are still downloaded.
        """
        # Resolve before chdir so a relative dir_path still names the same
        # directory when it is used to build the file paths below.
        dir_path = os.path.abspath(dir_path)
        os.makedirs(dir_path,exist_ok=True)
        os.chdir(dir_path)
        print("\n正在获取原图……")
        for i in range(nums):
            img = nlist[i]
            img_name = img[0]
            img_href = site_url + img[1]
            file_name = os.path.join(dir_path,img_name)
            try:
                r = requests.get(img_href,timeout=30)
                r.raise_for_status()
            except requests.RequestException as err:
                # Do not save an error page under an image file name.
                print("\n下载失败:{} ({})".format(img_href,err))
            else:
                with open(file_name,'wb') as f:
                    f.write(r.content)
            progressBar(i,nums)
    
    def downImgMicro(nlist,nums,site_url,dir_path):
        """Download the thumbnails of the first ``nums`` entries.

        Each file is saved as ``'缩微图_' + img_name`` inside ``dir_path``.

        Args:
            nlist: Entries shaped ``[img_name, a_href, img_src]``.
            nums: Number of leading entries of ``nlist`` to download.
            site_url: Prefix prepended to each entry's ``img_src``.
            dir_path: Directory receiving the files; created when missing.

        Side effects:
            Changes the process working directory to ``dir_path`` and prints
            progress to stdout. A thumbnail whose request fails is reported
            and skipped; the remaining ones are still downloaded.
        """
        # Resolve before chdir so a relative dir_path still names the same
        # directory when it is used to build the file paths below.
        dir_path = os.path.abspath(dir_path)
        os.makedirs(dir_path,exist_ok=True)
        os.chdir(dir_path)
        print("\n正在获取缩微图……")
        prefix = '缩微图_'
        for i in range(nums):
            img = nlist[i]
            img_name = img[0]
            img_src = site_url + img[2]
            file_name = os.path.join(dir_path,prefix + img_name)
            try:
                r = requests.get(img_src,timeout=30)
                r.raise_for_status()
            except requests.RequestException as err:
                # Do not save an error page under an image file name.
                print("\n下载失败:{} ({})".format(img_src,err))
            else:
                with open(file_name,'wb') as f:
                    f.write(r.content)
            progressBar(i,nums)
        
    
    def progressBar(i,total):
        """Redraw a one-line text progress bar on stdout.

        Prints ``i + 1`` block characters followed by the completed percentage,
        without a trailing newline.

        Args:
            i: Zero-based index of the item that just finished.
            total: Total number of items; nothing is printed when it is not
                positive, since no percentage can be computed.
        """
        if total <= 0:
            return
        # NOTE(review): the published listing shows this literal split across
        # two lines; a leading carriage return (redraw in place) is assumed
        # because the call ends with end=''. Confirm against the original.
        # flush so the bar appears immediately although no newline is written.
        print('\r当前进度:{0}{1:.0f}%'.format('▉'*(i+1),((i+1)/total*100)),end='',flush=True)
        
    def printHead():
        """Print the start-up banner and the "scraping in progress" notice.

        The banner is the program description framed by two rules of 80
        asterisks.
        """
        rule = "*" * 80
        print(rule)
        str_intro = '''
                        抓取单个网页图片小程序
                        version:V1.0
                        author:yxmichael
                        更新时间:20210511
        '''
        print(str_intro)
        print(rule)
        # The published listing had this literal split over three lines; the
        # line breaks are restored as escape sequences so the file parses.
        print("\n正在抓取……\n")
            
    
    def main():
        """Scrape the images listed on the local test page and report the result.

        Fetches ``site_url``, extracts the image entries, replaces the two
        output folders under the current working directory with fresh
        downloads (original images and thumbnails), prints a summary and
        waits for the user to press Enter.

        Side effects:
            Assigns the module-level ``my_headers`` read by ``getHtmlText``.
            Existing output folders are deleted only when at least one image
            entry was found, so a failed fetch does not wipe earlier
            downloads.
        """
        global my_headers
        my_headers ={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
        site_url = 'http://127.0.0.1/pg/'
        imgList = []
        start_time = time.time()
        # Capture the working directory now: the download helpers chdir.
        cur_path = os.getcwd()
        tmp = '老照片'
        dir_path = os.path.join(cur_path,tmp)
        dir_path_micro = os.path.join(cur_path,tmp + '_缩微图')

        printHead()
        html = getHtmlText(site_url)
        parseHtml(imgList,html)
        nums = len(imgList)
        if nums == 0:
            # Nothing to download: keep whatever an earlier run saved.
            print("\n未找到任何图片,已保留原有目录。")
        else:
            delOldDir(dir_path)
            delOldDir(dir_path_micro)
            downImg(imgList,nums,site_url,dir_path)
            downImgMicro(imgList,nums,site_url,dir_path_micro)
            seconds = time.time() - start_time
            print("\n成功下载{}张图片,耗时:{:.1f}秒。\n保存路径{}".format(nums,seconds,dir_path))
        input("请按任意键退出……")
        
    # Run the scraper only when executed as a script, so importing this module
    # does not trigger network requests or delete the output folders.
    if __name__ == "__main__":
        main()
    
  • 相关阅读:
    内存管理3 Win32汇编语言056
    高级强制类型转换 C++快速入门37
    内存管理3 Win32汇编语言056
    密码学基础
    危险API的禁用列表
    危险API的禁用列表
    《那些年啊,那些事——一个程序员的奋斗史》——68
    《那些年啊,那些事——一个程序员的奋斗史》——68
    《那些年啊,那些事——一个程序员的奋斗史》——68
    春节期间停止更新
  • 原文地址:https://www.cnblogs.com/yuexiao/p/14756234.html
Copyright © 2011-2022 走看看