  • Batch-downloading GitHub code, with helper functions for extracting zip archives and walking directories

    Code:

# -*- coding:utf-8 -*-
# @Time : 2021/7/22 22:04
# @Author : 周博
# @File : test_1.py
# @Cnblogs: https://www.cnblogs.com/smartisn/

import os
import zipfile
from urllib import request

import requests
from lxml import etree


def Get_whole_file(file):
    """Walk the directory `file` and return a list of all file paths under it."""
    Lists_val = []
    for root, dirs, files in os.walk(file):
        # root is the directory currently being visited
        # dirs is the list of subdirectory names inside it
        # files is the list of file names inside it
        for f in files:
            Lists_val.append(os.path.join(root, f))
        # # to walk the subdirectories as well:
        # for d in dirs:
        #     print(os.path.join(root, d))
    return Lists_val


def un_zip(zip_filename, des_dir):
    """
    Extract an archive into the directory des_dir.
    :param zip_filename: name of the input archive, e.g. a.zip
    :param des_dir: extraction target, e.g. ./extract_data/
    :return:
    """
    # catch errors and report which archive is broken
    # (a corrupted zip already raises BadZipFile when it is opened)
    try:
        with zipfile.ZipFile(zip_filename, 'r') as zzz:
            zzz.extractall(des_dir)
        print(zip_filename, "extracted successfully")
    except zipfile.BadZipFile:
        print("Error: corrupted or incomplete archive:", zip_filename)


def DownLoadGithub(start, end):
    # e.g. pages 51-60 of the search results
    os.makedirs("./data", exist_ok=True)  # make sure the output directory exists
    for page in range(start, end):
        url = 'https://github.com/search?l=C%23&o=desc&p=' + str(page) + '&q=C%23&s=stars&type=Repositories'
        print("*******************")
        print(url)
        strhtml = requests.get(url, timeout=7)
        tree = etree.HTML(strhtml.text)
        hreff = tree.xpath('//*[@id="js-pjax-container"]/div/div[3]/div/ul//div[@class="f4 text-normal"]//a//@href')
        for hh in hreff:
            try:
                file_name = hh.replace("/", "_")
                hh = "https://github.com" + hh
                strhtml = requests.get(hh, timeout=7)  # fetch the repository page with a GET request
                tree = etree.HTML(strhtml.text)
                href_down = tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[2]/a//@href')[0]
                href_down = "https://github.com" + href_down
                print(href_down)
                print("./data/" + file_name + '.zip')
                request.urlretrieve(href_down, "./data/" + file_name + '.zip')
                print("download succeeded")
            except Exception:
                continue


if __name__ == "__main__":
    # E:pycharmWorkPlace.net_analyzerDownLoad_GitHubdata

    # un_zip("./data/_5argon_protobuf-unity.zip", "./extract_data")
    List_vals = Get_whole_file("./data/")
    for val in List_vals:
        try:
            un_zip(val, "./extract_data")
        except Exception as e:
            print(e)
            continue
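
    Note that the two XPath selectors above are tied to GitHub's current HTML layout, so the scraper tends to break whenever the search or repository pages change. A more stable way to get the same list of top-starred C# repositories is GitHub's REST search API, which returns JSON and lets you build the zip URL of each repository's default branch directly. The sketch below shows that alternative; the `./data/` output directory and the page size of 10 are assumptions chosen to match the script above, and unauthenticated API requests are rate-limited by GitHub.

import os
import requests

def download_top_csharp_repos(page, per_page=10, dest="./data"):
    """Sketch: fetch one page of top-starred C# repositories via the GitHub
    REST search API and download each one as a zip of its default branch."""
    os.makedirs(dest, exist_ok=True)
    resp = requests.get(
        "https://api.github.com/search/repositories",
        params={"q": "language:csharp", "sort": "stars",
                "order": "desc", "page": page, "per_page": per_page},
        headers={"Accept": "application/vnd.github+json"},
        timeout=10,
    )
    resp.raise_for_status()
    for repo in resp.json().get("items", []):
        full_name = repo["full_name"]        # e.g. "owner/name"
        branch = repo["default_branch"]      # e.g. "main" or "master"
        zip_url = "https://github.com/" + full_name + "/archive/refs/heads/" + branch + ".zip"
        out_path = os.path.join(dest, full_name.replace("/", "_") + ".zip")
        # stream the archive to disk in chunks
        with requests.get(zip_url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(out_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 16):
                    f.write(chunk)
        print("downloaded", out_path)

    The archives it writes into ./data/ can then be passed to un_zip() exactly as in the __main__ block above.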
  • Original post: https://www.cnblogs.com/smartisn/p/15047620.html