zoukankan      html  css  js  c++  java
  • [网络]_[批量下载网站文件]


    场景:

    1.有时候需要下载某个网站上提供的所有pdf文件,貌似没发现有下载工具提供。


    #! encoding=utf-8
    
    import urllib2
    import re
    import os
    
    def Download(url,output):
        """Fetch ``url`` and save the response body to the file ``output``.

        Args:
            url: Address of the resource; passed unchanged to
                ``urllib2.urlopen``.
            output: Path of the local file to write. It is opened in binary
                write mode, so an existing file is overwritten.

        Raises:
            Whatever ``urllib2.urlopen`` raises when the resource cannot be
            retrieved, and ``IOError`` when ``output`` cannot be opened or
            written.
        """
        # Imported here so the function does not depend on extra
        # module-level imports.
        import contextlib
        import shutil

        print("downloading..."+url)
        # urllib2 responses are not context managers on Python 2; closing()
        # makes sure the connection is released even if writing fails.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            # The file is only created once the request has succeeded.
            with open(output,"wb") as resourceFile:
                # Stream in chunks instead of holding the whole body in memory.
                shutil.copyfileobj(response, resourceFile)
        print("downloaded")
    
    def Action(url,ext = "pdf",output = "."):
        """Download every file with extension ``ext`` linked from the page at ``url``.

        The page is fetched and scanned for double-quoted strings that end in
        ``.<ext>``. Each hit is resolved against ``url`` and saved below
        ``output`` through ``Download``. Links without a host part keep their
        directory layout under ``output``; links that name a host are saved
        under their bare file name.

        Args:
            url: Address of the page to scan.
            ext: File extension to look for, with or without a leading dot.
                An empty string matches every double-quoted string.
            output: Directory to store the files in; missing sub-directories
                are created as needed.

        Returns:
            None.

        Raises:
            Whatever ``urllib2.urlopen``, ``os.makedirs`` or ``Download``
            raise; the first failure aborts the remaining downloads.
        """
        # Imported here so the function does not depend on extra
        # module-level imports.
        try:
            from urlparse import urljoin, urlparse        # Python 2
        except ImportError:
            from urllib.parse import urljoin, urlparse    # Python 3

        #1.content
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            content = response.read()
        finally:
            response.close()
        if not isinstance(content, str):
            # The body is not a native str here (bytes on Python 3), while the
            # pattern below is a text pattern.
            content = content.decode("utf-8", "replace")

        #2.resource
        ext = ext.lstrip(".")
        # Require a literal dot and escape the extension so that e.g. "pdf"
        # does not match "notapdf" and regex metacharacters stay literal.
        suffix = "\\." + re.escape(ext) if ext else ""
        pattern = re.compile('"([^"]+' + suffix + ')"')
        links = pattern.findall(content)
        print("file num: "+str(len(links)))

        #3.download
        for link in links:
            # urljoin copes with relative, root-relative ("/a/b"),
            # protocol-relative ("//host/a") and absolute http/https links.
            fileUrl = urljoin(url, link)
            if urlparse(link).netloc:
                # Link names a host: store under the file name only.
                localName = link[link.rfind("/")+1:]
            else:
                localName = link
            # Drop empty, "." and ".." segments so a link can never make us
            # write outside of ``output``.
            parts = [p for p in localName.split("/") if p not in ("", ".", "..")]
            if not parts:
                continue
            fileOutput = os.path.join(output, *parts)
            targetDir = os.path.dirname(fileOutput)
            if targetDir and not os.path.isdir(targetDir):
                os.makedirs(targetDir)
            print(fileUrl)
            print(fileOutput)
            Download(fileUrl, fileOutput)
    
    if __name__=='__main__':
        # Demo run: fetch every "jpg" linked from the page below. No output
        # directory is passed, so Action's default ("." = current working
        # directory) is used.
        print("download")
        Action("http://tech.qq.com/","jpg")
        
        
        
    
        
        
        



  • 相关阅读:
    火狐 http://localhost:8080自动跳转到http://www.localhost.com:8080
    Windows下搭建PHP开发环境
    对帝国cms、dedecms、phpcms等负载测试总结
    System.ExecutionEngineException: Attempting to JIT compile method System.Linq.Enumerable
    SQLCMD Mode: give it one more chance
    transition状态下Mecanim动画的跳转
    Lua库-bit32库
    C语言输入输出函数总结
    Lua库-table
    Lua中的数据结构
  • 原文地址:https://www.cnblogs.com/jiangu66/p/3194162.html
Copyright © 2011-2022 走看看