zoukankan      html  css  js  c++  java
  • python下载图片(3)

    # -*- coding: utf-8 -*-
    """
    some function by metaphy,2007-04-03,copyleft
    version 0.2
    """
    import urllib, httplib, urlparse
    import re
    import random

    """judge url exists or not,by others"""
    def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
    # port specified, try to use it
    host, port = host.split(':', 1)
    try:
    port = int(port)
    except ValueError:
    print 'invalid port number %r' % (port,)
    return False
    else:
    # no port specified, use default port
    port = None
    try:
    connection = httplib.HTTPConnection(host, port=port)
    connection.request("HEAD", path)
    resp = connection.getresponse( )
    if resp.status == 200: # normal 'found' status
    found = True
    elif resp.status == 302: # recurse on temporary redirect
    found = httpExists(urlparse.urljoin(url,resp.getheader('location', '')))
    else: # everything else -> not found
    print "Status %d %s : %s" % (resp.status, resp.reason, url)
    found = False
    except Exception, e:
    print e.__class__, e, url
    found = False
    return found

    """get html src,return lines[]"""
    def gGetHtmlLines(url):
    if url==None : return
    if not httpExists(url): return
    try:
    page = urllib.urlopen(url)
    html = page.readlines()
    page.close()
    return html
    except:
    print "gGetHtmlLines() error!"
    return
    """get html src,return string"""
    def gGetHtml(url):
    if url==None : return
    if not httpExists(url): return
    try:
    page = urllib.urlopen(url)
    html = page.read()
    page.close()
    return html
    except:
    print "gGetHtml() error!"
    return

    """根据url获取文件名"""
    def gGetFileName(url):
    if url==None: return None
    if url=="" : return ""
    arr=url.split("/")
    return arr[len(arr)-1]

    """生成随机文件名"""
    def gRandFilename(type):
    fname = ''
    for i in range(16):
    fname = fname + chr(random.randint(65,90))
    fname = fname + chr(random.randint(48,57))
    return fname + '.' + type
    """根据url和其上的link,得到link的绝对地址"""
    def gGetAbslLink(url,link):
    if url==None or link == None : return
    if url=='' or link=='' : return url
    addr = ''
    if link[0] == '/' :
    addr = gGetHttpAddr(url) + link
    elif len(link)>3 and link[0:4] == 'http':
    addr = link
    elif len(link)>2 and link[0:2] == '..':
    addr = gGetHttpAddrFatherAssign(url,link)
    else:
    addr = gGetHttpAddrFather(url) + link

    return addr

    """根据输入的lines,匹配正则表达式,返回list"""
    def gGetRegList(linesList,regx):
    if linesList==None : return
    rtnList=[]
    for line in linesList:
    matchs = re.search(regx, line, re.IGNORECASE)
    if matchs!=None:
    allGroups = matchs.groups()
    for foundStr in allGroups:
    if foundStr not in rtnList:
    rtnList.append(foundStr)
    return rtnList
    """根据url下载文件,文件名参数指定"""
    def gDownloadWithFilename(url,savePath,file):
    #参数检查,现忽略
    try:
    urlopen=urllib.URLopener()
    fp = urlopen.open(url)
    data = fp.read()
    fp.close()
    file=open(savePath + file,'w+b')
    file.write(data)
    file.close()
    except IOError:
    print "download error!"+ url

    """根据url下载文件,文件名自动从url获取"""
    def gDownload(url,savePath):
    #参数检查,现忽略
    fileName = gGetFileName(url)
    #fileName =gRandFilename('jpg')
    gDownloadWithFilename(url,savePath,fileName)

    """根据某网页的url,下载该网页的jpg"""
    def gDownloadHtmlJpg(downloadUrl,savePath):
    lines= gGetHtmlLines(downloadUrl)
    regx = r"""srcs*="?(S+).jpg"""
    lists =gGetRegList(lines,regx)
    if lists==None: return
    for jpg in lists:
    jpg = gGetAbslLink(downloadUrl,jpg) + '.jpg'
    gDownload(jpg,savePath)
    ### print gGetFileName(jpg)
    """根据url取主站地址"""
    def gGetHttpAddr(url):
    if url== '' : return ''
    arr=url.split("/")
    return arr[0]+"//"+arr[2]
    """根据url取上级目录"""
    def gGetHttpAddrFather(url):
    if url=='' : return ''
    arr=url.split("/")
    addr = arr[0]+'//'+arr[2]+ '/'
    if len(arr)-1>3 :
    for i in range(3,len(arr)-1):
    addr = addr + arr[i] + '/'
    return addr

    """根据url和上级的link取link的绝对地址"""
    def gGetHttpAddrFatherAssign(url,link):
    if url=='' : return ''
    if link=='': return ''
    linkArray=link.split("/")
    urlArray = url.split("/")
    partLink =''
    partUrl = ''
    for i in range(len(linkArray)):
    if linkArray[i]=='..':
    numOfFather = i + 1 #上级数
    else:
    partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray)-1-numOfFather):
    partUrl = partUrl + urlArray[i]
    if i < len(urlArray)-1-numOfFather -1 :
    partUrl = partUrl + '/'
    return partUrl + partLink

    """根据url获取其上的相关htm、html链接,返回list"""
    def gGetHtmlLink(url):
    #参数检查,现忽略
    rtnList=[]
    lines=gGetHtmlLines(url)
    regx = r"""href="?(S+).htm"""
    for link in gGetRegList(lines,regx):
    link = gGetAbslLink(url,link) + '.htm'
    if link not in rtnList:
    rtnList.append(link)
    print link
    return rtnList

    """根据url,抓取其上的jpg和其链接htm上的jpg"""
    def gDownloadAllJpg(url,savePath):
    #参数检查,现忽略
    gDownloadHtmlJpg(url,savePath)
    #抓取link上的jpg
    links=gGetHtmlLink(url)
    for link in links:
    gDownloadHtmlJpg(link,savePath)

    """test"""
    def test():
    u='http://www.gs.xinhuanet.com/news/2007-01/31/content_9188207_1.htm'
    save='d:/tmp/'
    print 'download pic from [' + u +']'
    print 'save to [' +save+'] ...'
    gDownloadHtmlJpg(u,save)
    print "download finished"

    test()
  • 相关阅读:
    11.分类与监督学习,朴素贝叶斯分类算法
    14 深度学习-卷积
    13-垃圾邮件分类2
    12.朴素贝叶斯-垃圾邮件分类
    9、主成分分析
    8、特征选择
    7.逻辑回归实践
    6.逻辑归回
    5.线性回归算法
    15 手写数字识别-小数据集
  • 原文地址:https://www.cnblogs.com/lanzhi/p/6468707.html
Copyright © 2011-2022 走看看