zoukankan html css js c++ java

python抓取网页图片

网页中的图片大多是通过 <img> 标签引入的，并且通常使用相对路径，例如

<img src="image/bg.jpg"/>

通过正则匹配可以提取出 image/bg.jpg，再与页面地址拼接，即可得到图片的完整地址

除了页面中直接引入的图片，还有通过 CSS 文件以及其他 HTML 页面引入的图片，同样需要处理

# -*- coding: utf-8 -*-
import urllib, httplib, urlparse
import sys
import re

def httpExists(url, max_redirects=5):
    """Return True if a HEAD request for ``url`` ends in a 200 response.

    Args:
        url: Absolute http:// or https:// URL to probe.
        max_redirects: How many redirect responses may still be followed
            before giving up; guards against redirect loops.

    Returns:
        True when the final response status is 200.  False for any other
        status, an invalid port, an exhausted redirect budget, or any
        error raised while connecting or talking to the server (the error
        is printed, never raised).
    """
    parts = urlparse.urlsplit(url)
    scheme, host = parts[0], parts[1]
    # An empty path is not a valid request target, and the query string is
    # part of what identifies the resource, so both are normalised here.
    path = parts[2] or '/'
    if parts[3]:
        path = path + '?' + parts[3]

    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print('invalid port number %r' % (port,))
            return False
    else:
        # no port specified, let httplib pick the scheme's default port
        port = None

    if scheme.lower() == 'https':
        connection_class = httplib.HTTPSConnection
    else:
        connection_class = httplib.HTTPConnection

    connection = None
    try:
        connection = connection_class(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:       # normal 'found' status
            found = True
        elif resp.status in (301, 302, 303, 307, 308):
            location = resp.getheader('location', '')
            if location and max_redirects > 0:
                # Location may be relative, so resolve it against url.
                found = httpExists(urlparse.urljoin(url, location),
                                   max_redirects - 1)
            else:
                print("Status %d %s : %s" % (resp.status, resp.reason, url))
                found = False
        else:                        # everything else -> not found
            print("Status %d %s : %s" % (resp.status, resp.reason, url))
            found = False
    except Exception as e:
        # Best-effort probe: report the failure and treat it as "not found".
        print('%s %s %s' % (e.__class__, e, url))
        found = False
    finally:
        if connection is not None:
            connection.close()
    return found

"""根据url获取文件名"""
def gGetFileName(url):
    """Return the file name part (last path segment) of ``url``.

    The query string and fragment are removed first, so
    ``http://host/img/bg.jpg?v=2`` yields ``bg.jpg`` rather than a name
    containing ``?``, which is not a legal file name on Windows.

    Args:
        url: URL or path string, or None.

    Returns:
        None when ``url`` is None; otherwise the text after the last ``/``
        of the path (an empty string for ``""`` or a URL ending in ``/``).
    """
    if url is None:
        return None
    path = url.split('#', 1)[0].split('?', 1)[0]
    return path.rsplit('/', 1)[-1]

"""根据url下载文件，文件名参数指定"""
def gDownloadWithFilename(url,savePath,file):
    """Download ``url`` and store the body as ``file`` inside ``savePath``.

    Args:
        url: Address of the resource to fetch.
        savePath: Directory to write into; a trailing separator is optional.
        file: File name to create inside ``savePath``.

    Returns:
        None.  An IOError while fetching or writing is reported with a
        "download error!" line instead of being raised.
    """
    import os  # local import: the module header does not import os

    if not file:
        # Nothing sensible to write to; skip the network round trip.
        print("download error!%s" % url)
        return
    try:
        opener = urllib.URLopener()
        response = opener.open(url)
        try:
            data = response.read()
        finally:
            # Close the response even when read() fails part-way.
            response.close()
        print('download file url : %s' % url)
        with open(os.path.join(savePath, file), 'wb') as out:
            out.write(data)
    except IOError:
        print("download error!" + url)

def gDownload(url,savePath):
    """Download ``url`` into ``savePath``, naming the file after the URL.

    The file name is whatever gGetFileName() derives from ``url``; the
    transfer itself is delegated to gDownloadWithFilename().
    """
    gDownloadWithFilename(url, savePath, gGetFileName(url))

def getRexgList(lines,regx,searchRegx):
    """Collect the unique capture-group values from the lines matching ``regx``.

    Each line is first tested against ``regx``; on lines that pass, every
    match of ``searchRegx`` (not only the first) contributes its capture
    groups.  Both patterns are applied case-insensitively.

    Args:
        lines: Iterable of text lines, or None.
        regx: Pattern deciding whether a line is inspected at all.
        searchRegx: Pattern whose capture groups are collected.

    Returns:
        None when ``lines`` is None; otherwise a list of the distinct
        captured strings in order of first appearance.  Groups that did
        not take part in a match are skipped.
    """
    if lines is None:
        return None
    line_filter = re.compile(regx, re.IGNORECASE)
    extractor = re.compile(searchRegx, re.IGNORECASE)
    found = []
    seen = set()   # constant-time duplicate check; `found` keeps the order
    for line in lines:
        if not line_filter.search(line):
            continue
        for match in extractor.finditer(line):
            for value in match.groups():
                if value is None or value in seen:
                    continue
                seen.add(value)
                found.append(value)
    return found
def checkLine(lines):
    """Debug helper: print the target of the first CSS url(...) on each line.

    Args:
        lines: Iterable of text lines (e.g. the lines of a stylesheet).

    Returns:
        None.  For every line containing a ``url(...)`` reference the
        tuple of captured groups is printed; other lines are ignored.
    """
    # Optional quotes around the target are not part of the captured value.
    css_url = re.compile(r'''url\(\s*['"]?([^'")\s]+)''', re.IGNORECASE)
    for line in lines:
        matchs = css_url.search(line)
        if matchs is not None:
            print(matchs.groups())
def  getPageLines(url):
    """Fetch ``url`` and return its content as a list of lines.

    Args:
        url: Address of the page to read, or None.

    Returns:
        The list produced by ``readlines()`` on the opened page, or None
        when ``url`` is None, httpExists() rejects it, or reading fails.
    """
    if url is None:
        return None
    if not httpExists(url):
        return None
    try:
        page = urllib.urlopen(url)
        try:
            return page.readlines()
        finally:
            # Close the page even when readlines() fails.
            page.close()
    except Exception:
        # Best-effort read; unlike a bare except this lets
        # KeyboardInterrupt and SystemExit propagate.
        print("getPageLines() error!")
        return None
def getCurrentPageImage(url,savePath):
    """Download the images that the page at ``url`` references directly.

    Only ``src="images..."`` attributes are considered, i.e. files under
    the page's ``images`` path.

    Args:
        url: Address of the HTML page.
        savePath: Directory the images are saved into.

    Returns:
        None.  Nothing is downloaded when the page cannot be read.
    """
    lines = getPageLines(url)
    if lines is None:
        # Page missing or unreadable; getPageLines() already reported it.
        return
    print('lines.length %d' % len(lines))

    # One pattern both selects the line and extracts the path, so an
    # unrelated src attribute on the same line is never picked up.
    image_src = r'src\s*=\s*"(images[^"\s]+)"'
    regxlists = getRexgList(lines, image_src, image_src)
    if regxlists is None:
        return
    print('getCurrentPageImage() images.length %d' % len(regxlists))
    for jpg in regxlists:
        # urljoin copes with a page URL that lacks a trailing slash.
        gDownload(urlparse.urljoin(url, jpg), savePath)

def getCSSImages(link,savePath,url):
    """Download the images referenced through url(...) in a stylesheet.

    Args:
        link: Address of the CSS file to scan.
        savePath: Directory the images are saved into.
        url: Address of the page that linked the stylesheet.  Kept for
            backward compatibility; relative references are resolved
            against ``link``, because CSS resolves url(...) relative to
            the stylesheet rather than the page.

    Returns:
        None.  Nothing is downloaded when the stylesheet cannot be read.
    """
    lines = getPageLines(link)
    if lines is None:
        # Stylesheet missing or unreadable; getPageLines() reported it.
        return
    print('lines.length %d' % len(lines))

    # Accepts url(x), url('x') and url("x"); quotes are not captured.
    css_url = r'''url\(\s*['"]?([^'")\s]+)'''
    regxlists = getRexgList(lines, css_url, css_url)
    if regxlists is None:
        return
    print('getCSSImages() images.length %d' % len(regxlists))
    for jpg in regxlists:
        if jpg.lower().startswith('data:'):
            # Inline data URIs carry their own content; nothing to fetch.
            continue
        gDownload(urlparse.urljoin(link, jpg), savePath)

"""根据url获取其上的相关htm、html链接，返回list"""
def gGetHtmlLink(url):
    """Return the .htm/.html links found on the page at ``url``.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of distinct absolute link URLs in order of appearance;
        empty when the page cannot be read.  Each link is also printed.
    """
    rtnList = []
    lines = getPageLines(url)
    # First pattern selects lines with an href to a .htm/.html target;
    # the second extracts exactly that quoted target.
    regx = r'href="?(\S+)\.htm'
    hrefs = getRexgList(lines, regx, r'href="([^"\s]+\.html?[^"\s]*)"')
    # getRexgList() returns None when the page could not be read.
    for link in hrefs or []:
        link = urlparse.urljoin(url, link)
        if link not in rtnList:
            rtnList.append(link)
            print(link)
    return rtnList
"""根据url获取其上的相关css链接，返回list"""
def gGetCSSLink(url):
    """Return the stylesheet (.css) links found on the page at ``url``.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of distinct absolute stylesheet URLs in order of
        appearance; empty when the page cannot be read.
    """
    rtnList = []
    lines = getPageLines(url)
    # First pattern selects lines with an href to a .css target; the
    # second extracts exactly that quoted target.
    regx = r'href="?(\S+)\.css'
    hrefs = getRexgList(lines, regx, r'href="([^"\s]+\.css[^"\s]*)"')
    # getRexgList() returns None when the page could not be read.
    for link in hrefs or []:
        link = urlparse.urljoin(url, link)
        if link not in rtnList:
            rtnList.append(link)
    return rtnList
def getPageImage(url,savePath):
    """getCurrentPageImage(url,savePath)"""

    """读取其他的CSS，html文件中的图片
    links=gGetHtmlLink(url)
    for link in links:
        print u'get images on link-html读取'
        getCurrentPageImage(link,savePath)"""
    links=gGetCSSLink(url)
    for link in links:
        print 'get images on link:',link
        getCSSImages(link,savePath,url)
if __name__ == '__main__':
    # Usage: script.py [page_url [save_dir]]
    # Without arguments the original demo page and directory are used.
    url = 'http://www.templatemo.com/templates/templatemo_281_chrome/'
    savePath = 'd:/tmp/'
    if len(sys.argv) > 1:
        url = sys.argv[1]
    if len(sys.argv) > 2:
        savePath = sys.argv[2]
    print('download pic from [' + url + ']')
    print('save to [' + savePath + '] ...')
    getPageImage(url, savePath)
    print("download finished")

具体使用的时候根据URL的情况，具体分析得到图片地址的方式。

查看全文

相关阅读:
Java 8 锁机制
 Elasticsearch学习资料
 数据库相关
 Linux常用Shell命令
 如何高效编写可维护代码?
分布式开放消息系统(RocketMQ)的原理与实践（转载）
学习Linux第六天（学习文件权限和特殊权限）
各大网络厂商列表
 Linux服务器系统信息查询
 关于情感

原文地址：https://www.cnblogs.com/yangchengInfo/p/3279374.html