bandersnatch 不太好用,也不符合自己的需求,所以自己简单写了一个同步脚本:
# -*- coding: utf-8 -*-
import re
import os
import shutil
import urllib
import json
import copy
import wget
import subprocess as sp
from bs4 import BeautifulSoup as bs
import multiprocessing as mp
import requests
import concurrent.futures
simpleurl = 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/'
#首先获取所有的packages的名称以及对应的packages所有版本
def get_package_name():
    """Fetch the mirror's simple index and return every project link.

    Returns:
        list: the ``href`` value of each anchor found on the page at
        ``simpleurl``, in page order.

    Raises:
        requests.RequestException: if the index cannot be fetched within
            the timeout or the server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the script forever;
    # 60 seconds matches the per-project requests made by get_requests.
    response = requests.get(simpleurl, timeout=60)
    # Fail loudly instead of parsing an error page into an empty project list.
    response.raise_for_status()
    soup = bs(response.text, 'lxml')
    # href=True skips anchors without a target, which would raise KeyError.
    packages_list = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    print('一共 {} 个项目'.format(len(packages_list)))
    return packages_list
#每个包链接尝试下载6次,不行就返回空
def get_requests(url):
    """Fetch one project page from the mirror, trying at most six times.

    Args:
        url: project link, appended to ``simpleurl`` to build the address.

    Returns:
        The response of the first request that did not raise, or the empty
        string ``''`` when all six attempts raised.
    """
    attempts_left = 6
    while attempts_left > 0:
        attempts_left -= 1
        try:
            return requests.get(simpleurl + url, timeout=60)
        except Exception as err:
            # Report the failure and fall through to the next attempt.
            print(url, str(err))
    return ''
#过滤当前访问的包链接,看看还有多少包没访问
def filter_packages(packages):
    """Return the project links that are not yet recorded in the link cache.

    The cache is ``packages_json.txt`` in the current directory, a JSON
    object keyed by project link.

    Args:
        packages: iterable of project links.

    Returns:
        list: the links from ``packages`` that are not keys of the cache,
        in their original order. Every link is returned when the cache
        file does not exist.
    """
    if not os.path.exists('packages_json.txt'):
        # Nothing has been visited yet, so nothing can be filtered out.
        return list(packages)
    with open('packages_json.txt', 'r') as fr:
        packages_dict = json.load(fr)
    if isinstance(packages_dict, str):
        # Legacy cache that stored the repr() of the dict as a JSON string.
        # literal_eval parses it without executing arbitrary code, unlike the
        # eval() used before (which also raised TypeError on a real dict).
        import ast
        packages_dict = ast.literal_eval(packages_dict)
    return [k for k in packages if k not in packages_dict]
#多进程访问包链接
def get_download_url1(packages):
    """Collect the file links of every project and cache them as JSON.

    Each project page is fetched with ``get_requests`` in a process pool.
    The resulting table is written to ``packages_json.txt`` in the current
    directory.

    Args:
        packages: iterable of project links relative to ``simpleurl``.

    Returns:
        dict: project link -> list of file links, with every ``'../../'``
        in a link replaced by ``'../'``. A project whose page could not be
        fetched maps to an empty list.
    """
    # The links are consumed twice (by map and by zip), so a one-shot
    # iterator must be materialised first.
    packages = list(packages)
    packages_dict = {}
    with concurrent.futures.ProcessPoolExecutor() as executor:
        responses = executor.map(get_requests, packages)
        for package, response in zip(packages, responses):
            links = packages_dict[package] = []
            # A falsy result means there is no usable page for this project
            # (get_requests returns '' once its retries are exhausted).
            if not response:
                continue
            soup = bs(response.text, 'lxml')
            # href=True skips anchors without a target (KeyError otherwise).
            for anchor in soup.find_all('a', href=True):
                links.append(anchor['href'].replace('../../', '../'))
            print('项目:{} 有{}个版本'.format(package, len(links)))
    # Close the cache file deterministically so it is fully flushed to disk.
    with open('packages_json.txt', 'w') as fw:
        json.dump(packages_dict, fw)
    return packages_dict
#----------------------------
#基于当前本地的pypi源,进行过滤哪些不需要下, 依据文件名,过滤win,macos
def filter_local(packages_dict):
    """Blank out file links that are already mirrored or are not wanted.

    A link is replaced by ``''`` (in place) when it
      * does not contain ``'../packages/'`` (reported as an illegal link),
      * points at a path listed in ``done_packages.txt``, or
      * has a file name containing ``'macosx'`` or ``'win_'``.

    Side effects, all in the current directory:
      * reads ``done_packages.txt`` (one path per line; any ``'packages/'``
        substring is removed and blank lines are ignored),
      * writes ``filters_packages.txt`` with the paths from
        ``done_packages.txt`` that no link referred to,
      * writes the filtered table to ``packages_after_done.txt`` as JSON.

    Args:
        packages_dict: dict of project link -> list of file links.

    Returns:
        dict: the same ``packages_dict`` object, modified in place.
    """
    done_packages = {}
    with open('done_packages.txt') as fr:
        for line in fr:
            path = line.replace('packages/', '').strip()
            if path:
                done_packages[path] = 1
    # Local paths no link has referred to yet; whatever is left at the end
    # is no longer offered by the index.
    unvisited = dict(done_packages)

    icount = 0
    for k, links in packages_dict.items():
        for ind, link in enumerate(links):
            # Checking for the exact marker keeps the split below from
            # raising IndexError on links that merely contain "packages".
            if '../packages/' not in link:
                print("非法链接: ", k, ':', link)
                icount += 1
                links[ind] = ''
                continue
            prePackage = link.split('../packages/')[1].split('#')[0]
            package_name = prePackage.split('/')[-1].lower()
            if (prePackage in done_packages
                    or 'macosx' in package_name
                    or 'win_' in package_name):
                links[ind] = ''
                # The same file can be linked more than once, so tolerate a
                # path that has already been removed.
                unvisited.pop(prePackage, None)
                icount += 1

    print('经过本地库,过滤{}个包'.format(icount))
    print('其中filters_packages.txt 表示当前未访问到的包')
    with open('filters_packages.txt', 'w', encoding='utf-8') as fw:
        fw.writelines(path + '\n' for path in unvisited)
    with open('packages_after_done.txt', 'w') as fw:
        json.dump(packages_dict, fw)
    return packages_dict
#基于黑名单过滤
def filter_blacklist(packages_dict):
    """Drop empty and blacklisted projects and build the download list.

    A project is removed (in place) when it has no non-empty link left, or
    when any entry of ``blacklist.txt`` is a case-insensitive substring of
    its project link.

    Side effects, all in the current directory:
      * reads ``blacklist.txt`` (one entry per line, blank lines ignored),
      * writes the remaining table to ``packages_remain.txt`` as JSON,
      * if ``current_download_url.txt`` exists, links whose path is listed
        in it are left out of the download list,
      * writes the download list, each link prefixed with ``simpleurl``,
        to ``un_download_packages_url.txt``.

    Args:
        packages_dict: dict of project link -> list of file links.

    Returns:
        tuple: ``(packages_dict, packages_list)`` where ``packages_list``
        holds the non-empty file links that still have to be downloaded.
    """
    with open('blacklist.txt') as fr:
        # A blank entry is a substring of every name and would drop every
        # project, so blank lines are skipped.
        blacklist = [line.strip().lower() for line in fr if line.strip()]

    iproject = 0
    # Iterate over a snapshot of the keys because entries are deleted.
    for k in list(packages_dict):
        if not any(packages_dict[k]):
            del packages_dict[k]
            continue
        project = k.lower()
        # any() stops at the first hit, so each project is counted once.
        if any(black in project for black in blacklist):
            iproject += 1
            del packages_dict[k]
    print('经过黑名单,过滤{}个项目'.format(iproject))
    with open('packages_remain.txt', 'w') as fw:
        json.dump(packages_dict, fw)

    packages_list = [link
                     for links in packages_dict.values()
                     for link in links if link]
    if os.path.exists('current_download_url.txt'):
        with open('current_download_url.txt') as fr:
            done_di = {line.strip() for line in fr if line.strip()}
        # partition() yields '' for a link without the marker instead of
        # raising IndexError; such a link simply stays in the list.
        packages_list = [
            link for link in packages_list
            if link.partition('../packages/')[2].split('#')[0] not in done_di
        ]
        print('经过当前下载的部分{}个,还有{}个包要下载'.format(len(done_di), len(packages_list)))
    with open('un_download_packages_url.txt', 'w') as fw:
        fw.writelines(simpleurl + package + '\n' for package in packages_list)
    return packages_dict, packages_list
#多进程下载包,创建对应的位置,目录
def wget1(url):
    """Download one file from the mirror and move it to its package path.

    The file is fetched with ``wget.download`` and then moved to the
    relative path encoded in the link (the text between the first ``'../'``
    and the ``'#'`` fragment), creating parent directories as needed.

    Args:
        url: file link relative to ``simpleurl``, containing
            ``'../packages/'``.

    Returns:
        str: the path below ``packages/`` with the fragment removed, or
        ``''`` when the download failed.
    """
    try:
        filename = wget.download(simpleurl + url)
    except Exception:
        # Best effort: the caller reports an empty result as a failed
        # download. KeyboardInterrupt/SystemExit are no longer swallowed.
        return ''
    pathname = url.split('../')[1].split('#')[0]
    dirname = os.path.dirname(pathname)
    if dirname:
        # exist_ok avoids the check-then-create race between the worker
        # processes that download into the same directory.
        os.makedirs(dirname, exist_ok=True)
    shutil.move(filename, pathname)
    return url.split('../packages/')[1].split('#')[0]
def download_packages(packages_list):
    """Download every link with a pool of ten worker processes.

    Each link is handed to ``wget1``. A failed download is reported on
    stdout; a successful one is reported and its name is appended as one
    line to ``current_download_url.txt``.

    Args:
        packages_list: list of file links to download.
    """
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=10)
    with pool as executor:
        results = executor.map(wget1, packages_list)
        for package, package_name in zip(packages_list, results):
            if package_name:
                print('包{}下载完毕'.format(package_name))
                # Reopened for every entry, so each finished download is
                # written out before the next result is handled.
                with open('current_download_url.txt', 'a') as fa:
                    fa.write(package_name + '\n')
            else:
                print(package, ":", "下载不了")
if __name__ == '__main__':
    # Reuse the cached link table when present; otherwise crawl the mirror.
    if os.path.exists('packages_json.txt'):
        with open('packages_json.txt') as fr:
            packages_dict = json.load(fr)
    else:
        packages = get_package_name()
        packages_dict = get_download_url1(packages)
    nsubpackages = sum(len(v) for v in packages_dict.values())
    print('获取完所有{}包以及对应版本的链接,开始进行过滤, 一共有{}个包'.format(len(packages_dict),
                                                                      nsubpackages))
    #========================================================
    if os.path.exists('done_packages.txt'):
        packages_dict = filter_local(packages_dict)
    if os.path.exists('blacklist.txt'):
        packages_dict, packages_list = filter_blacklist(packages_dict)
    else:
        # Only filter_blacklist builds the download list, so without a
        # blacklist file the name was undefined and the script died with a
        # NameError. Fall back to every link that is still non-empty.
        packages_list = [v1 for v in packages_dict.values() for v1 in v if v1]
    print('还有{}个包需要更新,过滤掉{}个不下载的包'.format(len(packages_list),
                                                      nsubpackages-len(packages_list)))
    print('-'*50)
    download_packages(packages_list)
#========================================================
#done_packages.txt文件是通过find 命令在packages下生成的,形如
'''
0d/0d/fac29d2f0a57e3321804a84239389255e9ac7d649502c359de888b71ffb1/mastercard_mc_on-1.0.1.tar.gz
0d/0d/8c53e8b9249fecbfb370f3e3c22ef514f9bfbedf0e8917a1e21aed16bafa/tulgey-0.1.6.tar.gz
0d/0d/8d553e72a079ca545859cccda9b9df05f6ea7b49376e1beefcc870647b27/keepkey-4.0.0.tar.gz
0d/0d/2b8dec79ef1a19cdc6ecfa2878bb8eb7c78d8ee82f37abbe2e3df0b8249d/bio2bel_chebi-0.2.0.tar.gz
0d/0d/5801c7bebf6dfb2e1d81bda054de27d7e26776cbe74ed9db7f7b9b6fcd88/coinbase-1.0.0-py2.py3-none-any.whl
0d/91/8d860c75c3e70e6bbec7b898b5f753bf5da404be9296e245034360759645/tree-format-0.1.2.tar.gz
0d/91/c62a6b11ac6839fd39da55c1b04ce89ed460644d34b8cff14a5550890a68/crawlib-0.0.4.zip
0d/91/ca443d226b42bbfab0e2d1e1bd1e5f69cff948fee6dac2e42d7350e28c47/FlexGet-2.12.11-py2.py3-none-any.whl
0d/91/7e7b0debbfc755876f496ce99d330f6056e1f679964cbba13c029569f64c/agora-graphql-0.1.5.tar.gz
0d/91/cea4732387324199505eaca476c4837bd6463bc4efa207862c96352b3390/kervi-0.6.3.zip
'''
#黑名单,就是直接输入项目文件,比如
'''
mxnet
tensorflow
'''
接下来运行下面的 createsimple.py。思路是:不需要重新下载,就可以根据 packages_json.txt 生成完整的 simple 文件夹;生成时会检测 packages 目录里是否存在对应的文件,如果某个项目的文件一个都不存在,就不创建该项目的文件夹和 index.html,很简单。
import os
import json
def _createsimple(k,v,filepath, packages_path):
prestring = ['''<!DOCTYPE html>
<html>
<head>
<title>Links for {}</title>
</head>
<body>
<h1>Links for {}</h1>
'''.format(k.strip('./'),k.strip('./')) ]
flag = False
for v_ in v:
packages_basename = v_.split('#')[0].replace('../packages/','')
packages_filename = os.path.join(packages_path,packages_basename)
if not os.path.exists(packages_filename): continue
flag = True
prestring.append( ' <a href="../{}">{}</a><br/>
'.format(v_,v_.split('#')[0].split('/')[-1]))
prestring.append(''' </body>
</html>
''')
if flag:
os.makedirs(os.path.dirname(filepath))
with open(filepath, 'w')as fw:
fw.write(''.join(prestring))
return
def createsimple(packages_json, packages_path, simple):
    """Generate the index page of every project listed in the link table.

    Args:
        packages_json: path of the JSON file mapping project link -> list
            of file links.
        packages_path: directory holding the downloaded files.
        simple: output directory; each project gets
            ``<simple>/<project link>/index.html``.

    Returns:
        None. A project is skipped when any of its links is empty. A
        failure while writing one project is printed and the remaining
        projects are still processed.
    """
    # Close the table file as soon as it is parsed instead of leaking it.
    with open(packages_json) as fr:
        ans = json.load(fr)
    for k, v in ans.items():
        if not all(v):
            continue
        filepath = os.path.join(simple, k, 'index.html')
        try:
            _createsimple(k, v, filepath, packages_path)
        except Exception as e:
            # Best effort: one broken project must not stop the whole run.
            print(str(e))
if __name__ == '__main__':
    # Inputs and output, all relative to the current directory: the link
    # table, the downloaded files and the index tree to generate.
    packages_json, packages_path, simple = (
        './packages_json.txt',
        './packages',
        './simple',
    )
    createsimple(packages_json, packages_path, simple)