zoukankan      html  css  js  c++  java
  • python3爬虫.3.下载网页图片

    目标:豆瓣读书(https://book.douban.com/),下载页面上的书籍封面图片。

    import urllib.request 
    import re                   #使用正则表达式
    
    
    def getJpg(date):
        """Find image tags that have an http(s) .jpg source followed by an alt attribute.

        Args:
            date: Page HTML as a string.

        Returns:
            A list of 3-tuples of regex groups, one per match:
            (the 'img src="...jpg"' text, the text skipped between the source
            and the line holding the alt attribute, the text ending in
            'alt="..."').
        """
        # Group 2 uses [\s\S] so the gap between src and alt may span line
        # breaks ('.' alone does not match a newline); the dot before "jpg" is
        # escaped so it only matches a literal file-extension dot.
        pattern = r'(img src="http.+?\.jpg")([\s\S]*?)(.+?.alt=".+?.")'
        return re.findall(pattern, date)
    
    def downLoad(jpgUrl,sTitle,n):
        """Download one image and save it as '<sTitle>.jpg' in the fixed image folder.

        Any error raised while building the path or downloading is printed
        rather than propagated, so one failed image does not stop the caller's
        loop. A completion line is printed in every case.

        Args:
            jpgUrl: URL of the image to fetch.
            sTitle: Book title, used as the file name (without extension).
            n: Sequence number of the image; only used in the progress message.
        """
        try:
            # Raw string literal: in a plain literal the backslash-U at the
            # start of the "Users" directory begins a \UXXXXXXXX escape and the
            # file fails to compile with a unicodeescape SyntaxError.
            savePath = r'C:\Users\74172\source\repos\Python\spidertest1\images\book.douban\%s.jpg' % sTitle
            urllib.request.urlretrieve(jpgUrl, savePath)
        except Exception as e:
            # Best effort: report the failure and carry on.
            print(e)
        finally:
            print('图片%s下载操作完成' % n)
    
    def getTitle(date):
        """Return every title-attribute snippet found in the page text.

        Args:
            date: Page HTML as a string.

        Returns:
            A list of the matched substrings, each of the form
            'title="' + one character + '">'.
        """
        # NOTE(review): the pattern only accepts a single character between
        # the quotes; it looks like a quantifier was lost - confirm the
        # intended pattern before relying on this function.
        titlePattern = re.compile(r'title=".">')
        return titlePattern.findall(date)
    
    
    # Script entry point: fetch the Douban Books front page and download every
    # cover image that getJpg finds, naming each file after its alt text.
    if __name__ == '__main__':
        url = 'https://book.douban.com/'
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(url) as res:
            date = res.read().decode('utf-8')
        date_jpg = getJpg(date)
        # Not used by the loop below; kept so the script still computes it.
        imageTitle = getTitle(date)
        # Each jpginfo is a 3-tuple of regex groups: group 0 is
        # 'img src="<url>"' and group 2 ends with 'alt="<title>"'. Reading the
        # groups directly avoids regexing str(tuple), whose repr escaping can
        # corrupt titles that contain quotes or backslashes.
        for n, jpginfo in enumerate(date_jpg, start=1):
            imgTag, altPart = jpginfo[0], jpginfo[2]
            # Strip the leading 'img src="' and the closing quote.
            jpgUrl = imgTag[len('img src="'):-1]
            print(n,'--- url -->',jpgUrl)
            sTitle = re.search(r'alt="(.+?.)"', altPart).group(1)
            downLoad(jpgUrl,sTitle,n)
            
    

    后来又做了一些修改,并把书名写入 txt 文件中:

    import urllib.request 
    import re                   #使用正则表达式
    
    
    # One pattern shared by getJpg and getTitle so that the two result lists
    # always have the same length and order (the caller pairs them by index).
    # Group 1 is the image URL, group 2 is the alt text including its quotes.
    # [\s\S]*? lets the gap between src and alt span line breaks, and the dot
    # before "jpg" is escaped so it only matches a literal extension dot.
    _IMG_WITH_ALT = re.compile(r'img src="(http.+?\.jpg)"[\s\S]*?.+?.alt=(".+?.")')


    def getJpg(html):
        """Return the URL of every http(s) .jpg image that is followed by an alt attribute.

        Args:
            html: Page HTML as a string.

        Returns:
            A list of URL strings, one per matched image, in page order.
        """
        # Taking the URL from each match (instead of regexing the str() of the
        # whole result list) yields exactly one URL per image.
        return [match.group(1) for match in _IMG_WITH_ALT.finditer(html)]

    def downLoad(jpgUrl,sTitle,n):
        """Download one image and save it as '<sTitle>.jpg' in the fixed image folder.

        A completion line is printed whether or not the download succeeded;
        any exception from the download still propagates to the caller.

        Args:
            jpgUrl: URL of the image to fetch.
            sTitle: Book title, used as the file name and in the message.
            n: Sequence number of the image (not used by this version).
        """
        try:
            urllib.request.urlretrieve(jpgUrl,
                'C:/Users/74172/source/repos/Python/spidertest1/images/book.douban/%s.jpg'  %sTitle)
        finally:
            print('图片---%s----下载操作完成' % sTitle)

    def getTitle(html):
        """Return the alt text of every image that getJpg reports.

        Args:
            html: Page HTML as a string.

        Returns:
            A list of titles, each still wrapped in its double quotes
            (the caller strips them with [1:-1]), index-aligned with the
            list returned by getJpg for the same html.
        """
        # Reading the group directly avoids the repr escaping that
        # str(list) introduces for titles containing quotes or backslashes.
        return [match.group(2) for match in _IMG_WITH_ALT.finditer(html)]
    
    def writeTxt(imageTitle, txtPath=None):
        """Append one book title as a line of a UTF-8 text file.

        Args:
            imageTitle: Title text to write; a newline is appended after it.
            txtPath: Optional path of the file to append to. When omitted the
                name is built from the module-level ``url`` as
                ``url[8:-5] + '.txt'``.

        Raises:
            OSError: If the file cannot be opened or written.
            NameError: If txtPath is omitted and no module-level ``url`` exists.
        """
        if txtPath is None:
            # For 'https://book.douban.com/' the slice drops the 8-character
            # 'https://' prefix and the 5-character '.com/' suffix.
            txtPath = url[8:-5] + '.txt'
        # The with-block closes the file even if the write fails, and avoids
        # touching an unbound file variable when open() itself raises.
        with open(txtPath, "a", encoding="utf-8") as f:
            f.write(imageTitle + '\n')
    
    # Script entry point: fetch the Douban Books front page, download every
    # cover image and append each book title to a text file.
    if __name__ == '__main__':
        url = 'https://book.douban.com/'
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(url) as res:
            html = res.read().decode('utf-8')
        urlJpgs = getJpg(html)
        imageTitle = getTitle(html)
        # zip pairs each URL with its title and stops at the shorter list, so
        # a length mismatch can no longer raise IndexError mid-run.
        for n, (urlJpg, quotedTitle) in enumerate(zip(urlJpgs, imageTitle)):
            # getTitle returns the alt text with its surrounding double quotes.
            title = quotedTitle[1:-1]
            print(n,'--- url -->',urlJpg)
            downLoad(urlJpg,title,n)
            writeTxt(title)
       
    
  • 相关阅读:
    python使用数据库的一些操作
    正则表达式整理
    tomcat启动成功但是访问不到官网
    控制反转 依赖注入 AOP 和 IOC
    abstract 和 interface区别
    input文本框 鼠标点击默认消失,不输入离开鼠标恢复默认值
    声明式管理事务:基于AspectJ的xml方式
    使用注解配置声明式事务
    spring事物 (mu课)
    oracle表之数据类型
  • 原文地址:https://www.cnblogs.com/protogenoi/p/8908309.html
Copyright © 2011-2022 走看看