zoukankan      html  css  js  c++  java
  • python抓取网页图片

    网页中的图片大多是通过<img>标签引入的,并且通常使用相对路径,例如

    <img src="image/bg.jpg"/>

    通过匹配可以获取image/bg.jpg,与页面地址组合可以得到图片的地址

    除了页面中直接引入的图片,还有通过CSS文件以及其他HTML页面间接引入的图片,也需要处理

    # -*- coding: utf-8 -*-
    import urllib, httplib, urlparse
    import sys
    import re
    
    def httpExists(url, max_redirects=5):
        """Report whether *url* answers a HEAD request with status 200.

        Redirect responses (301, 302, 303, 307, 308) that carry a Location
        header are followed, at most *max_redirects* times, so a redirect
        loop cannot recurse forever.  Any other status, a malformed port, or
        any exception raised while talking to the server is printed and
        reported as "not found".

        url           -- absolute http:// or https:// URL to probe
        max_redirects -- how many more redirects may still be followed

        Returns True when the final response status is 200, otherwise False.
        """
        parts = urlparse.urlsplit(url)
        scheme, host = parts[0], parts[1]
        # An empty path is not a valid request target, and the query string
        # is part of what identifies the resource, so both are sent.
        target = parts[2] or '/'
        if parts[3]:
            target = target + '?' + parts[3]

        port = None     # None lets the connection class use its default port
        if ':' in host:
            host, port_text = host.split(':', 1)
            try:
                port = int(port_text)
            except ValueError:
                print('invalid port number %r' % (port_text,))
                return False

        redirect_to = None
        connection = None
        try:
            if scheme == 'https':
                connection = httplib.HTTPSConnection(host, port=port)
            else:
                connection = httplib.HTTPConnection(host, port=port)
            connection.request("HEAD", target)
            resp = connection.getresponse()
            if resp.status == 200:
                return True
            if resp.status in (301, 302, 303, 307, 308) and max_redirects > 0:
                location = resp.getheader('location', '')
                if location:
                    redirect_to = urlparse.urljoin(url, location)
            if redirect_to is None:
                print("Status %d %s : %s" % (resp.status, resp.reason, url))
                return False
        except Exception as e:
            # Deliberately broad: any failure to reach the server means the
            # URL is treated as not existing.
            print('%s %s %s' % (e.__class__, e, url))
            return False
        finally:
            # Always release the socket, including before following a redirect.
            if connection is not None:
                connection.close()
        return httpExists(redirect_to, max_redirects - 1)
    
    """根据url获取文件名"""
    def gGetFileName(url):
        """Return the part of *url* that follows its last "/".

        None is passed through as None and an empty string as "".  A url
        without any "/" is returned whole; a url ending in "/" yields "".
        """
        if url is None:
            return None
        # rsplit with maxsplit=1 keeps only the text after the final slash.
        return url.rsplit("/", 1)[-1]
    
    """根据url下载文件,文件名参数指定"""
    def gDownloadWithFilename(url,savePath,file):
        """Download *url* and store its content as savePath + file.

        url      -- address of the resource to fetch
        savePath -- target directory prefix; it is concatenated with *file*
                    as-is, so it has to end with a path separator
        file     -- name of the file to create (parameter name kept for
                    backward compatibility although it shadows a builtin)

        An IOError raised while fetching or writing is reported on stdout and
        swallowed; nothing is returned.
        """
        #Argument validation is intentionally skipped for now.
        try:
            opener = urllib.URLopener()
            response = opener.open(url)
            try:
                data = response.read()
            finally:
                # Close the response even when read() fails.
                response.close()
            print('download file url : %s' % (url,))
            # The with-block closes the output file even when write() fails.
            with open(savePath + file, 'w+b') as out:
                out.write(data)
        except IOError:
            print("download error!" + url)
    
    def gDownload(url,savePath):
        """Download *url* into *savePath*, naming the file via gGetFileName(url)."""
        gDownloadWithFilename(url, savePath, gGetFileName(url))
    
    def getRexgList(lines,regx,searchRegx):
        """Collect the distinct captured groups of *searchRegx* from *lines*.

        lines      -- iterable of text lines, or None
        regx       -- pattern a line must contain to be examined at all
        searchRegx -- pattern whose capture groups are collected

        Both patterns are matched case-insensitively.  Every match on a
        qualifying line is used, not just the first one, so several
        references on a single line are all found.  Groups that did not take
        part in a match are skipped.

        Returns None when *lines* is None, otherwise a list of the captured
        strings in order of first appearance, without duplicates.
        """
        if lines is None:
            return None
        # Compile once instead of re-resolving the patterns for every line.
        line_filter = re.compile(regx, re.IGNORECASE)
        extractor = re.compile(searchRegx, re.IGNORECASE)
        found = []
        for line in lines:
            if not line_filter.search(line):
                continue
            for match in extractor.finditer(line):
                for value in match.groups():
                    if value is not None and value not in found:
                        found.append(value)
        return found
    def checkLine(lines):
        """Print the captured target of the first CSS url(...) on each line.

        Debug helper: for every line in *lines* that contains a url(...)
        reference, the tuple of captured groups is printed.  Matching is
        case-insensitive and optional quotes around the target are not part
        of the captured text.  Nothing is returned.
        """
        # The line itself must be passed as the subject of the search;
        # the flag belongs to the compiled pattern.
        css_url = re.compile(r'''url\(\s*["']?([^"')\s]+)''', re.IGNORECASE)
        for line in lines:
            match = css_url.search(line)
            if match is not None:
                print(match.groups())
    def getPageLines(url):
        """Fetch *url* and return its content as a list of lines.

        Returns None when *url* is None, when httpExists(url) reports the URL
        as missing, or when opening/reading the page raises an exception (in
        which case a message is printed).
        """
        if url is None:
            return None
        if not httpExists(url):
            return None
        try:
            page = urllib.urlopen(url)
            try:
                return page.readlines()
            finally:
                # Close the page even when readlines() fails.
                page.close()
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print("getPageLines() error!")
            return None
    def getCurrentPageImage(url,savePath):
        """Download the images the page at *url* references via src="images...".

        url      -- address of the HTML page to scan
        savePath -- directory prefix handed on to gDownload()

        Lines are scanned for src attributes whose value starts with
        "images"; each captured path is resolved against *url* and
        downloaded.  Nothing is done when the page cannot be fetched.
        """
        lines = getPageLines(url)
        if lines is None:
            # getPageLines() returns None when the page is unavailable.
            return
        print('lines.length %d' % len(lines))

        image_paths = getRexgList(lines,
                                  r'src\s*=\s*"images[^"\s]*"',
                                  r'src\s*=\s*"(images[^"\s]*)"')
        if image_paths is None:
            return
        print('getCurrentPageImage() images.length %d' % len(image_paths))
        for image_path in image_paths:
            # urljoin also works when url names a file rather than a directory.
            gDownload(urlparse.urljoin(url, image_path), savePath)
    
    def getCSSImages(link,savePath,url):
        """Download the images referenced by url(...) in the stylesheet at *link*.

        link     -- address of the CSS file to scan
        savePath -- directory prefix handed on to gDownload()
        url      -- address of the page that linked the stylesheet; kept for
                    backward compatibility but no longer used, because
                    relative references inside a stylesheet are relative to
                    the stylesheet itself

        Inline data: URIs are skipped since there is nothing to download.
        Nothing is done when the stylesheet cannot be fetched.
        """
        lines = getPageLines(link)
        if lines is None:
            # getPageLines() returns None when the stylesheet is unavailable.
            return
        print('lines.length %d' % len(lines))
        # Captures the target of url(...), without optional surrounding quotes.
        css_url_regx = r'''url\(\s*["']?([^"')\s]+)'''
        image_refs = getRexgList(lines, css_url_regx, css_url_regx)
        if image_refs is None:
            return
        print('getCSSImages() images.length %d' % len(image_refs))
        for image_ref in image_refs:
            if image_ref.lower().startswith('data:'):
                continue
            gDownload(urlparse.urljoin(link, image_ref), savePath)
    
    """根据url获取其上的相关htm、html链接,返回list"""
    def gGetHtmlLink(url):
        """Return the .htm/.html links found on the page at *url*.

        Each href value ending in .htm or .html is resolved against *url* and
        printed as it is collected.  The result is a list of absolute URLs
        without duplicates; it is empty when the page cannot be fetched.
        """
        #Argument validation is intentionally skipped for now.
        rtnList = []
        lines = getPageLines(url)
        hrefs = getRexgList(lines,
                            r'href\s*=\s*"?\S+\.htm',
                            r'href\s*=\s*"([^"\s]+\.html?)"')
        # getRexgList() returns None when the page could not be read.
        for href in hrefs or []:
            link = urlparse.urljoin(url, href)
            if link not in rtnList:
                rtnList.append(link)
                print(link)
        return rtnList
    """根据url获取其上的相关css链接,返回list"""
    def gGetCSSLink(url):
        """Return the stylesheet (.css) links found on the page at *url*.

        Each href value ending in .css (optionally followed by a query
        string) is resolved against *url*.  The result is a list of absolute
        URLs without duplicates; it is empty when the page cannot be fetched.
        """
        #Argument validation is intentionally skipped for now.
        rtnList = []
        lines = getPageLines(url)
        hrefs = getRexgList(lines,
                            r'href\s*=\s*"?\S+\.css',
                            r'href\s*=\s*"([^"\s]+\.css(?:\?[^"\s]*)?)"')
        # getRexgList() returns None when the page could not be read.
        for href in hrefs or []:
            link = urlparse.urljoin(url, href)
            if link not in rtnList:
                rtnList.append(link)
        return rtnList
    def getPageImage(url,savePath):
        """Download the images referenced by the stylesheets linked from *url*.

        url      -- address of the HTML page whose stylesheets are scanned
        savePath -- directory prefix the images are saved under

        Every CSS link reported by gGetCSSLink(url) is announced on stdout
        and passed to getCSSImages().
        """
        # NOTE: two further steps were disabled by the original author and
        # stay disabled here: downloading the images referenced directly by
        # the page (getCurrentPageImage(url, savePath)) and doing the same for
        # every linked HTML page returned by gGetHtmlLink(url).
        for link in gGetCSSLink(url):
            print('get images on link: %s' % (link,))
            getCSSImages(link, savePath, url)
    if __name__ == '__main__':
        # Optional command-line overrides: <page url> <save directory>.
        # Without arguments the original hard-coded defaults are used.
        args = sys.argv[1:]
        url = args[0] if len(args) > 0 else 'http://www.templatemo.com/templates/templatemo_281_chrome/'
        savePath = args[1] if len(args) > 1 else 'd:/tmp/'
        # The download helpers concatenate savePath and the file name
        # directly, so make sure the directory ends with a separator.
        if not savePath.endswith(('/', '\\')):
            savePath = savePath + '/'
        print('download pic from [' + url + ']')
        print('save to [' + savePath + '] ...')
        getPageImage(url, savePath)
        print("download finished")

    具体使用的时候根据URL的情况,具体分析得到图片地址的方式。

  • 相关阅读:
    CF 633 E. Binary Table
    BZOJ 4589 Hard Nim
    不走弯路,微信小程序的快速入门?
    如果通过cookies和localStorage取值?
    Airbub 弃用React Native
    如何在登陆注册的时候,实现密码框的小眼睛的显示与与隐藏?
    js 实用封装 点击按钮复制到剪贴板
    css渐变写法 从左到右渐变三种颜色示例;
    vue-router 使用二级路由去实现子组件的显示和隐藏
    vue 路由传参中刷新页面参数丢失 及传参的几种方式?
  • 原文地址:https://www.cnblogs.com/yangchengInfo/p/3279374.html
Copyright © 2011-2022 走看看