zoukankan      html  css  js  c++  java
  • 同步pypi源

    bandersnatch 不好用,不符合自己心意,所以自己就简单弄了个同步的

    # -*- coding: utf-8 -*-
    import re
    import os
    import shutil
    import urllib
    import json
    import copy
    import wget
    import subprocess as sp
    from bs4 import BeautifulSoup as bs
    import multiprocessing as mp
    import requests
    import concurrent.futures
    
    simpleurl = 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/'
    
    # First step: fetch the simple index and collect the link of every project.
    def get_package_name():
      """Return the href of every project link on the mirror's simple index.

      Returns:
        list of href strings, one per ``<a>`` element that carries an href.

      Raises:
        requests.RequestException: if the index cannot be fetched within the
          timeout or the server answers with an HTTP error status.
      """
      # Bound the request (same 60 s as get_requests) so a stalled connection
      # cannot hang the whole sync.
      response = requests.get(simpleurl, timeout=60)
      # Fail loudly instead of parsing an error page into a bogus project list.
      response.raise_for_status()
      simple_index = bs(response.text, 'lxml')
      # href=True skips anchors without a target, which would raise KeyError.
      packages_list = [a['href'] for a in simple_index.find_all('a', href=True)]
      print('一共 {} 个项目'.format(len(packages_list)))
      return packages_list
    
    
    
    # Each project page is attempted up to 6 times; '' means every attempt failed.
    def get_requests(url):
      """Fetch ``simpleurl + url`` with a 60 s timeout, retrying on any exception.

      Returns the response of the first attempt that does not raise, or the
      empty string when all six attempts raise. Each failure is printed
      together with the url.
      """
      attempts_left = 6
      while attempts_left > 0:
        attempts_left -= 1
        try:
          return requests.get(simpleurl + url, timeout=60)
        except Exception as err:
          print(url, str(err))
      return ''
    
    
    # Filter the project links: keep only those not yet recorded in packages_json.txt.
    def filter_packages(packages):
      """Return the entries of ``packages`` that are not keys of packages_json.txt.

      Args:
        packages: iterable of project links (the keys used in packages_json.txt).

      Returns:
        A new list with the not-yet-recorded entries, in their original order.
        When packages_json.txt does not exist, every entry is returned.
      """
      import ast  # local import: only needed for the legacy-format fallback

      if not os.path.exists('packages_json.txt'):
        # Nothing has been recorded yet, so every project is still pending.
        return list(packages)

      with open('packages_json.txt', 'r') as fr:
        packages_dict = json.load(fr)
      if isinstance(packages_dict, str):
        # The file held a JSON string wrapping a dict repr. literal_eval parses
        # it without executing arbitrary code the way eval() would.
        packages_dict = ast.literal_eval(packages_dict)

      return [k for k in packages if k not in packages_dict]
    
    # Visit every project page in a process pool and collect its file links.
    def get_download_url1(packages):
      """Map each project link to the list of file links found on its page.

      Every entry of ``packages`` is fetched through get_requests in a
      ProcessPoolExecutor. Projects whose fetch returned a falsy result (for
      example '' after all retries failed) keep an empty list. Each href has
      '../../' rewritten to '../'. The mapping is written to packages_json.txt
      as JSON and returned.

      Args:
        packages: sequence of project links relative to simpleurl.

      Returns:
        dict mapping project link -> list of file hrefs.
      """
      packages_dict = {}
      with concurrent.futures.ProcessPoolExecutor() as executor:
        responses = executor.map(get_requests, packages)
        for package, response in zip(packages, responses):
          packages_dict[package] = []
          if not response:
            continue
          page = bs(response.text, 'lxml')
          # href=True skips anchors without a target instead of raising KeyError.
          for anchor in page.find_all('a', href=True):
            packages_dict[package].append(anchor['href'].replace('../../', '../'))
          print('项目:{} 有{}个版本'.format(package,
                                          len(packages_dict[package])))
      # Use a context manager so the file is flushed and closed deterministically.
      with open('packages_json.txt', 'w') as fw:
        json.dump(packages_dict, fw)
      return packages_dict
          
    
     #----------------------------
    # Filter against the local mirror: blank out files that are already on disk
    # and, judging by file name, macOS / Windows builds.
    def filter_local(packages_dict):
      """Blank out links that need no download and report stale local files.

      Reads done_packages.txt (one path per line, any 'packages/' removed).
      Every link in ``packages_dict`` is replaced in place by '' when it
        * does not contain '../packages/' (reported as an illegal link),
        * points to a path listed in done_packages.txt, or
        * has 'macosx' or 'win_' in its lower-cased file name.

      Side effects:
        filters_packages.txt receives the paths from done_packages.txt that no
        link referred to; packages_after_done.txt receives the updated mapping
        as JSON.

      Args:
        packages_dict: dict mapping project link -> list of file links.

      Returns:
        The same ``packages_dict`` object, modified in place.
      """
      done_packages = {}
      with open('done_packages.txt') as fr:
        for line in fr:
          path = line.replace('packages/', '').strip()
          if path:  # ignore blank lines
            done_packages[path] = 1
      # Local files not (yet) matched by any link; a dict keeps file order.
      unvisited = dict(done_packages)

      icount = 0
      for k, v in packages_dict.items():
        for ind, v1 in enumerate(v):
          # Test for the exact marker used by the split below so that a link
          # merely containing 'packages' cannot raise IndexError.
          if '../packages/' not in v1:
            print("非法链接: ", k, ':', v1)
            icount += 1
            v[ind] = ''
            continue
          prePackage = v1.split('../packages/')[1].split('#')[0]
          package_name = prePackage.split('/')[-1].lower()
          already_local = prePackage in done_packages
          if already_local or 'macosx' in package_name or 'win_' in package_name:
            v[ind] = ''
            if already_local:
              # The local file is still referenced upstream, so it is not
              # stale. The default avoids KeyError when two links name the
              # same file.
              unvisited.pop(prePackage, None)
            icount += 1

      print('经过本地库,过滤{}个包'.format(icount))
      print('其中filters_packages.txt 表示当前未访问到的包')
      with open('filters_packages.txt', 'w', encoding='utf-8') as fw:
        fw.writelines(path + '\n' for path in unvisited)
      with open('packages_after_done.txt', 'w') as fw:
        json.dump(packages_dict, fw)

      return packages_dict
    
    
    # Filter by blacklist.
    def filter_blacklist(packages_dict):
      """Drop empty and blacklisted projects and build the download list.

      A project is removed from ``packages_dict`` (in place) when all of its
      links are empty, or when any non-blank line of blacklist.txt is a
      case-insensitive substring of the project key.

      Side effects:
        packages_remain.txt receives the remaining mapping as JSON;
        un_download_packages_url.txt receives one ``simpleurl + link`` line per
        file still to download.

      Args:
        packages_dict: dict mapping project link -> list of file links
          ('' marks a link that was filtered out earlier).

      Returns:
        (packages_dict, packages_list): the same dict object after filtering,
        and the list of non-empty links whose path is not already listed in
        current_download_url.txt (when that file exists).
      """
      with open('blacklist.txt') as fr:
        # Blank lines must be skipped: '' is a substring of every project name
        # and would remove every project.
        blacklist = [line.strip().lower() for line in fr if line.strip()]

      iproject = 0
      # Iterate over a snapshot because entries are deleted during the loop.
      for k, v in list(packages_dict.items()):
        if not any(v):
          del packages_dict[k]
          continue
        project = k.lower()
        # any() stops at the first hit, so a project matching several
        # blacklist entries is counted once.
        if any(black in project for black in blacklist):
          iproject += 1
          del packages_dict[k]

      print('经过黑名单,过滤{}个项目'.format(iproject))
      with open('packages_remain.txt', 'w') as fw:
        json.dump(packages_dict, fw)

      packages_list = [v1 for v in packages_dict.values() for v1 in v if v1]

      if os.path.exists('current_download_url.txt'):
        with open('current_download_url.txt') as fr:
          done_di = {line.strip() for line in fr}
        # [-1] instead of [1]: a link without '../packages/' is kept for
        # download rather than raising IndexError.
        packages_list = [
            link for link in packages_list
            if link.split('../packages/')[-1].split('#')[0] not in done_di
        ]
        print('经过当前下载的部分{}个,还有{}个包要下载'.format(len(done_di),len(packages_list)))

      with open('un_download_packages_url.txt', 'w') as fw:
        fw.writelines(simpleurl + package + '\n' for package in packages_list)

      return packages_dict, packages_list
        
    
    # Process-pool worker: download one file and move it to its mirror location,
    # creating the target directory when needed.
    def wget1(url):
      """Download ``simpleurl + url`` and move it under its relative path.

      The target path is the part of ``url`` after the first '../' with the
      '#...' fragment removed.

      Args:
        url: file link of the form '../packages/<path>#<fragment>'.

      Returns:
        The path after '../packages/' (without fragment) on success, or '' when
        the download raised an exception.
      """
      try:
        filename = wget.download(simpleurl + url)
      except Exception:
        # Best effort: the caller reports the failure. Unlike a bare except,
        # this lets KeyboardInterrupt / SystemExit stop the worker.
        return ''
      pathname = url.split('../')[1].split('#')[0]
      dirname = os.path.dirname(pathname)
      if dirname:
        # exist_ok tolerates another worker creating the directory first,
        # without hiding genuine errors such as missing permissions.
        os.makedirs(dirname, exist_ok=True)
      shutil.move(filename, pathname)
      return url.split('../packages/')[1].split('#')[0]
      
    
    def download_packages(packages_list):
      """Download every link in ``packages_list`` with a pool of 10 processes.

      Each link is handed to wget1. A falsy result is reported as a failed
      download; otherwise the returned path is printed and appended to
      current_download_url.txt.

      Args:
        packages_list: sequence of file links accepted by wget1.
      """
      with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        results = executor.map(wget1, packages_list)
        for package, package_name in zip(packages_list, results):
          if not package_name:
            print(package,":","下载不了")
            continue
          print('包{}下载完毕'.format(package_name))
          # Re-open in append mode per file so every finished download is on
          # disk immediately, even if the run is interrupted later.
          with open('current_download_url.txt', 'a') as fa:
            fa.write(package_name + '\n')
          
    
    if __name__ == '__main__':
      # Reuse the crawled link map when it exists; otherwise crawl the mirror.
      if os.path.exists('packages_json.txt'):
        with open('packages_json.txt') as fr:
          packages_dict = json.load(fr)
      else:
        packages = get_package_name()
        packages_dict = get_download_url1(packages)

      nsubpackages = sum(len(v) for v in packages_dict.values())
      print('获取完所有{}包以及对应版本的链接,开始进行过滤, 一共有{}个包'.format(len(packages_dict),
                                                                   nsubpackages))
      #========================================================
      if os.path.exists('done_packages.txt'):
        packages_dict = filter_local(packages_dict)
      if os.path.exists('blacklist.txt'):
        packages_dict,packages_list = filter_blacklist(packages_dict)
      else:
        # Without blacklist.txt nothing else builds the download list; leaving
        # it undefined raised NameError below. Take every non-empty link.
        packages_list = [v1 for v in packages_dict.values() for v1 in v if v1]
      print('还有{}个包需要更新,过滤掉{}个不下载的包'.format(len(packages_list),
                                                nsubpackages-len(packages_list)))
      print('-'*50)
      download_packages(packages_list)
      #========================================================
      # done_packages.txt is generated by running `find` under packages/;
      # its lines look like:
      #
      # 0d/0d/fac29d2f0a57e3321804a84239389255e9ac7d649502c359de888b71ffb1/mastercard_mc_on-1.0.1.tar.gz
      # 0d/0d/8c53e8b9249fecbfb370f3e3c22ef514f9bfbedf0e8917a1e21aed16bafa/tulgey-0.1.6.tar.gz
      # 0d/0d/8d553e72a079ca545859cccda9b9df05f6ea7b49376e1beefcc870647b27/keepkey-4.0.0.tar.gz
      # 0d/0d/2b8dec79ef1a19cdc6ecfa2878bb8eb7c78d8ee82f37abbe2e3df0b8249d/bio2bel_chebi-0.2.0.tar.gz
      # 0d/0d/5801c7bebf6dfb2e1d81bda054de27d7e26776cbe74ed9db7f7b9b6fcd88/coinbase-1.0.0-py2.py3-none-any.whl
      # 0d/91/8d860c75c3e70e6bbec7b898b5f753bf5da404be9296e245034360759645/tree-format-0.1.2.tar.gz
      # 0d/91/c62a6b11ac6839fd39da55c1b04ce89ed460644d34b8cff14a5550890a68/crawlib-0.0.4.zip
      # 0d/91/ca443d226b42bbfab0e2d1e1bd1e5f69cff948fee6dac2e42d7350e28c47/FlexGet-2.12.11-py2.py3-none-any.whl
      # 0d/91/7e7b0debbfc755876f496ce99d330f6056e1f679964cbba13c029569f64c/agora-graphql-0.1.5.tar.gz
      # 0d/91/cea4732387324199505eaca476c4837bd6463bc4efa207862c96352b3390/kervi-0.6.3.zip
      #
      # blacklist.txt simply lists project names, one per line, for example:
      #
      # mxnet
      # tensorflow
    

    然后接下来就是运行如下的createsimple.py,思路是在不下载的时候都可以完全创建simple文件夹,然后去检测对应的packages里面是否存在对应的文件,如果不存在,则当前文件夹及文件不创建,很简单。

    import os
    import json
    
    def _createsimple(k,v,filepath, packages_path):
    
        prestring = ['''<!DOCTYPE html>
    <html>
      <head>
        <title>Links for {}</title>
      </head>
      <body>
        <h1>Links for {}</h1>
    '''.format(k.strip('./'),k.strip('./')) ]
        flag = False
        for v_ in v:
            packages_basename = v_.split('#')[0].replace('../packages/','')
            packages_filename = os.path.join(packages_path,packages_basename)
            if not os.path.exists(packages_filename): continue
            flag = True
            prestring.append( '    <a href="../{}">{}</a><br/>
    '.format(v_,v_.split('#')[0].split('/')[-1]))
        prestring.append('''  </body>
    </html>
    ''')
        if flag:
    
          os.makedirs(os.path.dirname(filepath))
          with open(filepath, 'w')as fw:
            fw.write(''.join(prestring))
        return
    
    def createsimple(packages_json, packages_path, simple):
      """Create ``<simple>/<project>/index.html`` for the projects in a link map.

      Args:
        packages_json: path of a JSON file mapping project key -> list of links.
        packages_path: local directory that holds the downloaded files.
        simple: root directory under which the index pages are written.

      Projects whose list contains an empty link are skipped. A failure while
      writing one project is printed and does not stop the remaining ones.
      """
      # Context manager so the JSON file is closed as soon as it is parsed.
      with open(packages_json) as fr:
        ans = json.load(fr)

      for k, v in ans.items():
        if not all(v):
          continue
        filepath = os.path.join(simple, k, 'index.html')
        try:
          _createsimple(k, v, filepath, packages_path)
        except Exception as e:
          # Best effort: report the problem and carry on with the next project.
          print(str(e))
    
    
    if __name__ == '__main__':
      # Build ./simple from the link map in ./packages_json.txt, checking file
      # presence under ./packages.
      createsimple(
          packages_json='./packages_json.txt',
          packages_path='./packages',
          simple='./simple',
      )
    
    
  • 相关阅读:
    基于Servlet+smartUpload的文件上传
    基于Servlet+smartUpload的文件上传
    基于Servlet+smartUpload的文件上传
    dispatch_async 的 block 中是否该使用_weak self
    dispatch_async 的 block 中是否该使用_weak self
    dispatch_async 的 block 中是否该使用_weak self
    dispatch_async 的 block 中是否该使用_weak self
    Vagrant Tip: Virtualbox Guest Additions
    Vagrant Tip: Virtualbox Guest Additions
    Vagrant Tip: Virtualbox Guest Additions
  • 原文地址:https://www.cnblogs.com/shouhuxianjian/p/13068345.html
Copyright © 2011-2022 走看看