zoukankan      html  css  js  c++  java
  • [网络]_[批量下载网站文件]


    场景:

    1.有时候需要下载某个网站上提供的所有pdf文件,貌似没发现有下载工具提供。


    #! encoding=utf-8
    
    import os
    import re
    import urllib2
    import urlparse
    
    def Download(url,output):
        """Fetch url and save the response body to the file path output.

        The body is copied in fixed-size chunks so a large file is never held
        in memory as a whole, and both the HTTP response and the output file
        are closed even when reading or writing fails.

        url    -- address to fetch; passed unchanged to urllib2.urlopen.
        output -- path of the file to create or overwrite (opened in "wb" mode).

        Errors raised by urllib2.urlopen, open() or the copy itself are not
        caught here and propagate to the caller.
        """
        print("downloading..."+url)
        response = urllib2.urlopen(url)
        try:
            with open(output,"wb") as resourceFile:
                while True:
                    chunk = response.read(64 * 1024)
                    if not chunk:
                        # An empty read marks the end of the response body.
                        break
                    resourceFile.write(chunk)
        finally:
            response.close()
        print("downloaded")
    
    def _FindLinks(content,ext):
        """Return the distinct double-quoted strings in content ending in ".<ext>".

        Links are returned in order of first appearance; repeated links are
        reported once so each file is downloaded a single time.
        """
        # Escape the extension and require the dot, so "pdf" matches "a.pdf"
        # but not an unrelated quoted word that merely ends in "pdf".
        suffix = re.escape("."+ext.lstrip("."))
        pattern = re.compile('"([^"]+'+suffix+')"')
        seen = set()
        links = []
        for link in pattern.findall(content):
            if link not in seen:
                seen.add(link)
                links.append(link)
        return links

    def _LocalPath(output,link):
        """Return the path under output where the file behind link is saved.

        Links carrying a scheme or host are flattened to their file name;
        other links keep their directory layout. Empty, "." and ".."
        components are dropped so the result stays below output.
        """
        parts = urlparse.urlsplit(link)
        if parts.scheme or parts.netloc:
            relative = link[link.rfind("/")+1:]
        else:
            relative = link
        safe = [part for part in relative.split("/") if part not in ("", ".", "..")]
        return os.path.join(output,*safe)

    def Action(url,ext = "pdf",output = "."):
        """Download every file with extension ext referenced by the page at url.

        The page is fetched once and scanned for double-quoted strings ending
        in ".<ext>". Each match is resolved against url with urlparse.urljoin,
        so relative, root-relative, protocol-relative and absolute
        (http or https) links are all handled, and is then saved below output
        through Download.

        url    -- address of the page to scan.
        ext    -- file extension to look for, with or without a leading dot.
        output -- directory that receives the files; missing directories are
                  created as needed.

        Network and file system errors are not caught: the first failure
        stops the run and propagates to the caller.
        """
        #1.content
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            content = response.read()
        finally:
            response.close()

        #2.resource
        links = _FindLinks(content,ext)
        print("file num: "+str(len(links)))

        #3.download
        for link in links:
            fileUrl = urlparse.urljoin(url,link)
            fileOutput = _LocalPath(output,link)
            targetDir = os.path.dirname(fileOutput)
            # Create the directory only when it is missing, so a genuine
            # failure (for example a permission error) is reported, not hidden.
            if targetDir and not os.path.isdir(targetDir):
                os.makedirs(targetDir)
            print(fileUrl)
            print(fileOutput)
            Download(fileUrl,fileOutput)
    
    if __name__=='__main__':
        # Usage: script [page url] [extension] [output directory]
        # Every argument is optional. With none given, the script does what it
        # always did: fetch the jpg files linked from http://tech.qq.com/ into
        # the current directory.
        import sys
        print("download")
        args = sys.argv[1:]
        url = args[0] if len(args) > 0 else "http://tech.qq.com/"
        ext = args[1] if len(args) > 1 else "jpg"
        output = args[2] if len(args) > 2 else "."
        Action(url,ext,output)
        
        
        
    
        
        
        



  • 相关阅读:
    jqurey技术总结
    ie浏览器兼容问题小结
    FIS的合并压缩技术
    对js中数组的一些总结
    浅谈如何面向对象进行封装
    13th week blog
    12th week blog
    11th week blog
    10th week blog
    9th Week blog
  • 原文地址:https://www.cnblogs.com/jiangu66/p/3194162.html
Copyright © 2011-2022 走看看