代码:
# -*- coding:utf-8 -*-
# @Time : 2021/7/22 22:04
# @Author : 周博 (Zhou Bo)
# @File : test_1.py
# @Blog (cnblogs): https://www.cnblogs.com/smartisn/
"""Download the zip archives of top-starred C# GitHub repositories and unpack them.

Running the file as a script only performs the extraction step: every file
found under ``./data/`` is unpacked into ``./extract_data``.
"""

import os
import re
import zipfile
from urllib import request

# The scraping dependencies are only needed by DownLoadGithub().  Guarding the
# import lets the extraction step (the script entry point) run on a machine
# that does not have them installed; the module-level names are kept either way.
try:
    import requests
    from lxml import etree
except ImportError:
    requests = None
    etree = None


def Get_whole_file(file: str) -> list:
    """Return the paths of all files located anywhere below directory ``file``.

    :param file: root directory to walk recursively.
    :return: list of ``os.path.join(root, name)`` paths, one per file found.
        Sub-directories themselves are not listed.  A missing directory
        yields an empty list because ``os.walk`` produces nothing for it.
    """
    # root is the directory currently visited, names the files directly in it.
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(file)
        for name in names
    ]


def un_zip(zip_filename: str, des_dir: str) -> None:
    """Extract the archive ``zip_filename`` into the directory ``des_dir``.

    :param zip_filename: path of the archive to unpack, e.g. ``a.zip``.
    :param des_dir: destination directory, e.g. ``./文件存储/``.
    :return: None.  A success or error line is printed instead.
    """
    # The archive must be opened *inside* the try block: ZipFile() itself
    # raises BadZipFile for a truncated or non-zip file, so opening it outside
    # would let that error bypass the handler below.
    try:
        with zipfile.ZipFile(zip_filename, 'r') as archive:
            archive.extractall(des_dir)
    except zipfile.BadZipFile:
        # Report the name of the broken archive and carry on.
        print("Error: 压缩文件不完整:", zip_filename)
    else:
        print(zip_filename, "解压成功")


def DownLoadGithub(start: int, end: int) -> None:
    """Download the source zip of every repository on a range of search pages.

    Walks the GitHub search result pages ``start`` .. ``end - 1`` (``end`` is
    exclusive, as with ``range``) for C# repositories sorted by stars, opens
    each repository page, and saves the archive its download link points to
    as ``./data/<repo path with '/' replaced by '_'>.zip``.

    NOTE(review): both XPath expressions depend on GitHub's page markup at the
    time of writing; when they stop matching, nothing is downloaded.

    :param start: first search result page to fetch.
    :param end: page number at which to stop (not fetched).
    :raises ImportError: if ``requests`` or ``lxml`` is not installed.
    """
    if requests is None or etree is None:
        raise ImportError(
            "DownLoadGithub requires the 'requests' and 'lxml' packages")

    # urlretrieve() does not create missing directories.
    os.makedirs("./data/", exist_ok=True)

    # Original author's note here was "51-60" - presumably the page range
    # last used; confirm before relying on it.
    for page in range(start, end):
        url = 'https://github.com/search?l=C%23&o=desc&p='+str(page)+'&q=C%23&s=stars&type=Repositories'
        print("*******************")
        print(url)
        search_page = requests.get(url, timeout=7)
        search_tree = etree.HTML(search_page.text)
        repo_paths = search_tree.xpath('//*[@id="js-pjax-container"]/div/div[3]/div/ul//div[@class="f4 text-normal"]//a//@href')
        for repo_path in repo_paths:
            zip_path = "./data/" + repo_path.replace("/", "_") + '.zip'
            try:
                # Fetch the repository page with a GET request.
                repo_page = requests.get("https://github.com" + repo_path, timeout=7)
                repo_tree = etree.HTML(repo_page.text)
                zip_links = repo_tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[2]/a//@href')
                if not zip_links:
                    print("Error: no download link found:", repo_path)
                    continue
                href_down = "https://github.com" + zip_links[0]
                print(href_down)
                print(zip_path)
                request.urlretrieve(href_down, zip_path)
                print("下载成功")
            except Exception as err:
                # Best effort: one failing repository must not stop the crawl,
                # but the failure is reported instead of silently dropped.
                print("Error: download failed:", repo_path, err)
                continue


if __name__ == "__main__":
    # Author's note (path separators were lost in the paste), presumably:
    # E:\pycharmWorkPlace\.net_analyzer\DownLoad_GitHub\data
    #
    # Single-archive example:
    # un_zip("./data/_5argon_protobuf-unity.zip", "./extract_data")
    List_vals = Get_whole_file("./data/")
    for val in List_vals:
        try:
            un_zip(val, "./extract_data")
        except Exception as e:
            print(e)
            continue