  • Syncing a PyPI mirror

    bandersnatch didn't work the way I wanted, so I put together a simple sync script of my own.

    # -*- coding: utf-8 -*-
    import os
    import shutil
    import json
    import copy
    import wget
    import subprocess as sp
    from bs4 import BeautifulSoup as bs
    import requests
    import concurrent.futures
    
    simpleurl = 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/'
    
    # First fetch the names of all projects listed on the simple index
    def get_package_name():
      simple_index = requests.get(simpleurl)
      simple_index = bs(simple_index.text,'lxml')
      packages_list = [i['href'] for i in simple_index.findAll('a')]
      print('{} projects in total'.format(len(packages_list)))
      return packages_list
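    # The simple index is a flat HTML page of anchors, one per project,
    # roughly: <a href="project-name/">project-name</a> (illustrative structure,
    # not captured from a live response)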
    
    
    
    # Try each project page up to 6 times; on repeated failure return ''
    def get_requests(url):
      for i in range(6):
        try:
          download_urls = requests.get(simpleurl+url, timeout=60)
        except Exception as e:
          print(url, str(e))
        else:
          return download_urls
      return ''
    
    
    # Filter out project pages already visited, leaving the ones still to crawl
    def filter_packages(packages):
      packages_dict = {}
      if os.path.exists('packages_json.txt'):
        packages_dict = json.load(open('packages_json.txt','r'))
      _packages = list(filter(lambda k: k not in packages_dict, packages))
      return _packages
    
    # Fetch project pages with a pool of worker processes
    def get_download_url1(packages):
      packages_dict = {}
      with concurrent.futures.ProcessPoolExecutor() as executor:
        for package,download_urls in zip(packages,executor.map(get_requests,packages)):
          packages_dict[package] = []
          if not download_urls: continue
          download_urls = bs(download_urls.text,'lxml')
          for j in download_urls.findAll('a'):
            packages_dict[package].append(j['href'].replace('../../','../'))
          print('project {}: {} versions'.format(package, len(packages_dict[package])))
        json.dump(packages_dict,open('packages_json.txt','w'))
      return packages_dict
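    # Shape of the result (illustrative; hash directory shortened):
    #   {'requests/': ['../packages/0d/0d/<hash>/requests-2.0.0.tar.gz#sha256=...', ...]}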
          
    
    #----------------------------
    # Filter against what is already in the local mirror, and drop
    # Windows/macOS builds based on the file name
    def filter_local(packages_dict):
      icount = 0
      done_packages = {k.replace('packages/','').strip():1 for k in open('done_packages.txt')}
      done_packages_cp = copy.deepcopy(done_packages)
      for k,v in packages_dict.items():
        for ind,v1 in enumerate(v):
          if 'packages' not in v1:
             print('invalid link: ',k,':',v1)
             icount += 1
             packages_dict[k][ind] = ''
             continue
          prePackage = v1.split('../packages/')[1].split('#')[0]
          package_name = prePackage.split('/')[-1].lower()
          if (prePackage in done_packages or
              'macosx' in package_name or
              'win_' in package_name):
            packages_dict[k][ind] = ''
            # if the file is in done_packages it is still valid upstream,
            # so drop it from the "not seen this run" copy
            if prePackage in done_packages: done_packages_cp.pop(prePackage)
            icount += 1

      print('local mirror filtered out {} files'.format(icount))
      print('filters_packages.txt lists local files not seen in this run')
      with open('filters_packages.txt','w',encoding='utf-8') as fw:
        for i in done_packages_cp.keys():
          fw.write(i+'\n')
      json.dump(packages_dict,open('packages_after_done.txt','w'))

      return packages_dict
    
    
    # Filter by blacklist
    def filter_blacklist(packages_dict):
      blacklist = [k.strip() for k in open('blacklist.txt')]
      iproject = 0
      for k,v in copy.deepcopy(packages_dict).items():

        if not any(v):
          packages_dict.pop(k)
          continue

        for black in blacklist:
          if black.lower() in k.lower():
            iproject += 1
            packages_dict.pop(k)
            break

      print('blacklist removed {} projects'.format(iproject))
      json.dump(packages_dict,open('packages_remain.txt','w'))

      packages_list = []
      for v in packages_dict.values():
        packages_list.extend([v1 for v1 in v if v1])

      if os.path.exists('current_download_url.txt'):
         done_di = {k.strip():1 for k in open('current_download_url.txt')}
         packages_list = list(filter(lambda k: k.split('../packages/')[1].split('#')[0] not in done_di, packages_list))
         print('{} files already downloaded, {} still to download'.format(len(done_di),len(packages_list)))

      with open('un_download_packages_url.txt','w') as fw:
        for package in packages_list:
          fw.write(simpleurl+package+'\n')

      return packages_dict,packages_list
        
    
    # Download one file and move it into the mirrored directory layout
    def wget1(url):
      #out = sp.check_call('wget -r -np {}'.format(simpleurl+url), shell=True)
      try:
        filename = wget.download(simpleurl+url)
      except Exception:
        return ''
      pathname = url.split('../')[1].split('#')[0]
      dirname = os.path.dirname(pathname)
      if not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError:
            pass
      shutil.move(filename,pathname)
      return url.split('../packages/')[1].split('#')[0]
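    # e.g. url = '../packages/0d/0d/<hash>/foo-1.0.tar.gz#sha256=...' (hash shortened)
    # gets saved locally as 'packages/0d/0d/<hash>/foo-1.0.tar.gz'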
      
    
    def download_packages(packages_list):
      with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        for package,package_name in zip(packages_list,executor.map(wget1,packages_list)):
          if not package_name:
             print(package,': download failed')
             continue
          print('finished downloading {}'.format(package_name))
          with open('current_download_url.txt','a') as fa:
            fa.write(package_name+'\n')
          
    
    if __name__ == '__main__':
      if os.path.exists('packages_json.txt'):
        packages_dict = json.load(open('packages_json.txt'))
      else:
        packages = get_package_name()
        packages_dict = get_download_url1(packages)

      nsubpackages = 0
      for k,v in packages_dict.items():
        nsubpackages += len(v)
      print('collected links for {} projects ({} files); starting to filter'.format(len(packages_dict),
                                                                                    nsubpackages))
      #========================================================
      if os.path.exists('done_packages.txt'):
        packages_dict = filter_local(packages_dict)
      if os.path.exists('blacklist.txt'):
        packages_dict,packages_list = filter_blacklist(packages_dict)
      else:
        packages_list = [v1 for v in packages_dict.values() for v1 in v if v1]
      print('{} files to download; {} filtered out'.format(len(packages_list),
                                                           nsubpackages-len(packages_list)))
      print('-'*50)
      download_packages(packages_list)
      #========================================================
      # done_packages.txt is generated with the find command under packages/; its lines look like:
      '''  
    0d/0d/fac29d2f0a57e3321804a84239389255e9ac7d649502c359de888b71ffb1/mastercard_mc_on-1.0.1.tar.gz
    0d/0d/8c53e8b9249fecbfb370f3e3c22ef514f9bfbedf0e8917a1e21aed16bafa/tulgey-0.1.6.tar.gz
    0d/0d/8d553e72a079ca545859cccda9b9df05f6ea7b49376e1beefcc870647b27/keepkey-4.0.0.tar.gz
    0d/0d/2b8dec79ef1a19cdc6ecfa2878bb8eb7c78d8ee82f37abbe2e3df0b8249d/bio2bel_chebi-0.2.0.tar.gz
    0d/0d/5801c7bebf6dfb2e1d81bda054de27d7e26776cbe74ed9db7f7b9b6fcd88/coinbase-1.0.0-py2.py3-none-any.whl
    0d/91/8d860c75c3e70e6bbec7b898b5f753bf5da404be9296e245034360759645/tree-format-0.1.2.tar.gz
    0d/91/c62a6b11ac6839fd39da55c1b04ce89ed460644d34b8cff14a5550890a68/crawlib-0.0.4.zip
    0d/91/ca443d226b42bbfab0e2d1e1bd1e5f69cff948fee6dac2e42d7350e28c47/FlexGet-2.12.11-py2.py3-none-any.whl
    0d/91/7e7b0debbfc755876f496ce99d330f6056e1f679964cbba13c029569f64c/agora-graphql-0.1.5.tar.gz
    0d/91/cea4732387324199505eaca476c4837bd6463bc4efa207862c96352b3390/kervi-0.6.3.zip
      '''
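      # presumably generated with something like this, run from the mirror root
      # (the exact invocation is an assumption, not from the original post):
      #   find packages -type f > done_packages.txt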
    
      # The blacklist is just project names, one per line, e.g.
      '''
    mxnet
    tensorflow
      '''
    

    Then run the createsimple.py below. The idea: the whole simple/ tree could be generated without downloading anything, so for each project we check whether its files actually exist under packages/; if none do, the folder and its index are simply not created. Very simple.
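
    The resulting tree mirrors the layout pip expects (a hypothetical sketch; the hash directory is shortened):

    simple/
      requests/
        index.html    <- links only to files present under packages/
    packages/
      0d/0d/<hash>/requests-2.0.0.tar.gz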

    import os
    import json
    
    def _createsimple(k,v,filepath, packages_path):
    
        prestring = ['''<!DOCTYPE html>
    <html>
      <head>
        <title>Links for {}</title>
      </head>
      <body>
        <h1>Links for {}</h1>
    '''.format(k.strip('./'),k.strip('./')) ]
        flag = False
        for v_ in v:
            packages_basename = v_.split('#')[0].replace('../packages/','')
            packages_filename = os.path.join(packages_path,packages_basename)
            if not os.path.exists(packages_filename): continue
            flag = True
            prestring.append('    <a href="../{}">{}</a><br/>\n'.format(v_,v_.split('#')[0].split('/')[-1]))
        prestring.append('''  </body>
    </html>
    ''')
        if flag:
          os.makedirs(os.path.dirname(filepath), exist_ok=True)
          with open(filepath, 'w') as fw:
            fw.write(''.join(prestring))
        return
    
    def createsimple(packages_json, packages_path, simple):
      ans = json.load(open(packages_json))
      for k,v in ans.items():
        dirname = os.path.join(simple,k)
        if not all(v): continue
    
        filepath = os.path.join(dirname,'index.html')
        try:
          _createsimple(k,v,filepath, packages_path)
        except Exception as e:
          print(str(e))
    
    
    if __name__ == '__main__':
      packages_json = './packages_json.txt'
      packages_path =  './packages'
      simple = './simple'
      createsimple(packages_json, packages_path, simple)
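
    Once simple/ and packages/ are in place, the mirror can be served over plain HTTP and used with pip. A minimal sketch, assuming it runs from the mirror root and port 8000 is free (both are assumptions, not part of the original scripts):

    import functools
    import http.server
    import socketserver

    # serve the mirror root so pip can resolve http://localhost:8000/simple/
    handler = functools.partial(http.server.SimpleHTTPRequestHandler, directory='.')
    with socketserver.TCPServer(('', 8000), handler) as httpd:
      httpd.serve_forever()

    # then, for example: pip install -i http://localhost:8000/simple/ <package>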
    
    