zoukankan      html  css  js  c++  java
  • [网络]_[批量下载网站文件]


    场景:

    1.有时候需要下载某个网站上提供的所有pdf文件,貌似没发现有下载工具提供。


    #! encoding=utf-8
    
    import urllib2
    import re
    import os
    
    def Download(url,output):
        """Fetch `url` and write the response body to the file `output`.

        url    -- address passed to urllib2.urlopen.
        output -- path of the file to create; it is opened in "wb" mode, so
                  an existing file at that path is overwritten.

        Errors from urllib2.urlopen, from opening `output`, or from the
        copy itself are not caught here and propagate to the caller.
        """
        # A parenthesised single expression prints the same text as the
        # Python 2 print statement used elsewhere in this script.
        print("downloading..."+url)
        response = urllib2.urlopen(url)
        try:
            with open(output,"wb") as resourceFile:
                # Copy in fixed-size chunks so a large file is never held
                # in memory all at once.
                while True:
                    chunk = response.read(64*1024)
                    if not chunk:
                        break
                    resourceFile.write(chunk)
        finally:
            # Release the connection even when opening or writing fails.
            response.close()
        print("downloaded")
    
    def _LocalTarget(output, link):
        """Return the path below `output` where `link` is saved, or None.

        Links that carry a scheme or a host ("http://...", "https://...",
        "//host/...") are stored under their file name only; other links keep
        their directory structure. None is returned when no usable path
        segment is left.
        """
        import urlparse

        parsed = urlparse.urlparse(link)
        if parsed.scheme or parsed.netloc:
            relative = link[link.rfind('/')+1:]
        else:
            relative = link
        # The link text comes from the downloaded page, so empty, "." and
        # ".." segments are dropped before it is turned into a local path.
        parts = [part for part in relative.split('/') if part not in ('', '.', '..')]
        if not parts:
            return None
        return os.path.join(output, *parts)

    def Action(url,ext = "pdf",output = "."):
        """Download every double-quoted link on the page `url` that ends in `ext`.

        url    -- page to scan; also the base against which relative links
                  are resolved.
        ext    -- suffix a quoted string must end with to be downloaded.
                  NOTE: it is inserted into the regular expression as-is
                  (not escaped), so regex metacharacters in it are active.
        output -- directory that receives the files; missing directories
                  are created.

        Links without a scheme or host keep their directory structure below
        `output`; links with one are stored under their file name only.
        Errors from fetching the page, creating directories or downloading a
        file are not caught and stop the run.
        """
        # Function-scope import so this block does not depend on a change to
        # the import lines at the top of the script.
        import urlparse

        #1.content
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            content = response.read()
        finally:
            response.close()

        #2.resource
        pattern = re.compile('"([^"]+'+ext+')"')
        links = pattern.findall(content)
        print("file num: "+str(len(links)))

        #3.download
        for link in links:
            fileOutput = _LocalTarget(output, link)
            if fileOutput is None:
                continue
            # urljoin handles relative, root-relative ("/x.pdf"),
            # scheme-relative ("//host/x.pdf") and absolute links alike.
            fileUrl = urlparse.urljoin(url, link)
            directory = os.path.dirname(fileOutput)
            # Create only what is missing; a real failure (e.g. permissions)
            # is raised here instead of being hidden.
            if directory and not os.path.isdir(directory):
                os.makedirs(directory)
            print(fileUrl)
            print(fileOutput)
            Download(fileUrl,fileOutput)
    
    if __name__=='__main__':
        # Script entry point: announce the run, then fetch the "jpg" links
        # found on the page given to Action.
        print("download")
        # Kept from the original script; this value is not passed to Action.
        url = "http://compgeom.cs.uiuc.edu/~jeffe/teaching/algorithms/"
        Action("http://tech.qq.com/", ext="jpg")
        
        
        
    
        
        
        



  • 相关阅读:
    新建安卓项目后,manifest.xml中会出现大段的黄色警告
    TextView设置setCompoundDrawables不生效解决办法
    数据结构->队列->顺序循环队列ADT代码
    数据结构->栈->顺序栈ADT代码
    JAVA环境变量的配置
    计算机科学的范围-----18.12.08
    字符串复制
    扔鸡蛋问题和找零钱问题
    动态规划
    最小二乘法
  • 原文地址:https://www.cnblogs.com/jiangu66/p/3194162.html
Copyright © 2011-2022 走看看