zoukankan      html  css  js  c++  java
  • 【爬虫】获取Github仓库提交纪录历史的脚本 python

    本脚本为第一版开发；后续会进行扩展。

    #! python3
    
    import requests
    import time, datetime
    import json
    from colorama import Fore,Back,Style,init
    from bs4 import BeautifulSoup
    
    process = 0  # verbosity flag: truthy -> print per-commit details while crawling (set in main())
    output  = 0  # NOTE(review): assigned here but never read in this version — presumably reserved for a later revision
    
    
    def req(type, addr, data='', **args):
        """Issue an HTTP request and return the ``requests.Response``.

        type : 'get' or 'post' (parameter name shadows the builtin but is
               kept for backward compatibility with existing callers).
        addr : target URL.
        data : payload for POST requests.
        args : accepted for compatibility; currently unused.

        Raises requests.exceptions.RequestException on network failure.
        (The original swallowed the exception with ``pass`` and then
        crashed with an UnboundLocalError at the ``return`` line; letting
        the real exception propagate is strictly more informative, and the
        caller ``commits()`` already wraps its work in ``except Exception``.)
        """
        if type == 'get':
            responses = requests.get(addr, timeout=50)
        elif type == 'post':
            # BUG FIX: the original never forwarded the payload.
            responses = requests.post(addr, data=data, timeout=50)
        else:
            # BUG FIX: an unknown type previously fell through to an
            # UnboundLocalError; fail with an explicit error instead.
            raise ValueError("unsupported request type: %r" % type)
        return responses
    
    def access(url_addr):
        """Probe each repository entry and crawl its commit history if reachable.

        url_addr: list of dicts with at least 'git_addr' and 'username'.
        On a non-200 response the status code is recorded in entry['code'];
        on success the entry's URL is pointed at the /commits/ page and
        handed to commits().
        """
        for entry in url_addr:
            print(entry['git_addr'])
            responses = req('get', entry['git_addr'])
            # Guard clause: record and report failures, then move on.
            if responses.status_code != 200:
                print(Fore.BLACK + Back.RED + "[ERROR]   " + "%s [status] %s" % (str(entry['git_addr']), str(responses.status_code)))
                entry['code'] = responses.status_code
                continue
            print("[SUCCESS]" + " %s [status] %s" % (str(entry['git_addr']), str(responses.status_code)))
            entry['git_addr'] = entry['git_addr'] + '/commits/'
            commits(entry)
    
    def commits(addr):
        """Crawl one repository's commit-history pages and report per-day totals.

        addr : dict with 'username' and 'git_addr' (already pointing at the
               /commits/ page).  On HTTP failure the status code is stored
               in addr['code'].  Recurses onto the "Older" pagination link
               until no further page exists.
        """
        url = addr['git_addr']
        responses = req('get', url)
        if responses.status_code != 200:
            # BUG FIX: the original printed "[SUCCESS]" on this failure path.
            print("[ERROR]   %s [status] %s" % (str(url), str(responses.status_code)))
            addr['code'] = responses.status_code
            return
        text = BeautifulSoup(responses.text, "html.parser")

        # Empty-repository check.  BUG FIX: the original tested
        # `"..." in text` against the soup object, which compares child
        # nodes rather than searching the page text; search the raw HTML.
        if "This repository is empty." in responses.text:
            print(Fore.RED + Back.WHITE + "%s 的仓库内容爬取过程中发现告警[This repository is empty.]" % (addr['username']))
            return

        # One TimelineItem-body per calendar day of commits.
        all_commits = text.find_all(class_='TimelineItem-body')

        try:
            for texts in all_commits:
                # Header reads like "Commits on Nov 26, 2020" -> slice off the
                # leading "Commits on " (11 chars), then reformat the date.
                dateBar = texts.find(class_='text-normal').get_text()[11:]
                date = datetime.datetime.strptime(dateBar, '%b %d, %Y').strftime('%Y年%m月%d日')
                commits_second = 0
                if process:
                    # BUG FIX (scrape damage): restore the "\n" that had been
                    # broken into a literal line break inside the string.
                    print("\n=================[%s]=================" % (str(date)))
                all_commits_find = texts.ol.find_all('li')
                for commits_find in all_commits_find:
                    link = commits_find.div.p.a
                    # aria-label holds summary + description concatenated; the
                    # anchor text is exactly the summary, so split at its length.
                    summary_len = len(link.get_text())
                    commits_dict = {
                        'commits_auth': commits_find.div.find('div', class_='d-flex').find('div', class_='f6').find(class_='commit-author').get_text(),
                        'commits_time': commits_find.find('relative-time')['datetime'],
                        'commits_href': "https://github.com" + link['href'],
                        'commits_summary': link['aria-label'][:summary_len],
                        'commits_description': link['aria-label'][summary_len:].strip()
                    }
                    commits_second += 1
                    # Detailed per-commit output, only when requested.
                    if process:
                        print("\n-----------------[%s]-----------------" % (commits_dict['commits_auth']))
                        print("[提交时间] %s \n[提交代码] %s\n[提交主题] %s\n[提交描述] %s"
                              % (commits_dict['commits_time'], commits_dict['commits_href'],
                                 commits_dict['commits_summary'], commits_dict['commits_description']))
                # Per-day summary line.
                print(Fore.BLACK + Back.WHITE + "%s 于 %s 共计提交了 %s 次代码" % (addr['username'], date, commits_second))

            # Pagination: the last <a> of the paginate container is "Older"
            # when an earlier page exists.
            next_a = text.find(class_='paginate-container').find_all('a')
            if len(next_a) and next_a[-1].get_text() == 'Older':
                print("------next page------")
                addr['git_addr'] = next_a[-1]['href']
                commits(addr)
        except Exception as e:
            # BUG FIX: the original nested print(print(...)), emitting "None".
            print(Fore.RED + Back.WHITE + "%s 的仓库爬取过程中发生错误." % (addr['username']))
            print(e)  # surface the underlying parse error instead of hiding it
            return
    
    def main():
        """Entry point: verify the access key, choose verbosity, run the crawl."""
        global process

        # Repositories to crawl; access() records HTTP errors under 'code'.
        url_addr = [
            {
                'username': 'X1',
                'git_addr': 'https://github.com/litbird0/elevator',  # project URL
                'start': '',
                'commins': [],
            },
            {
                'username': 'X2',
                'git_addr': 'https://github.com/1564820398/cjwc_dianti',
                'start': '',
                'commins': [],
            },
        ]

        Webcrawler_key = "mirror"
        if input("请输入爬虫Key:") != Webcrawler_key:
            print(Fore.RED + Back.WHITE + "Key错误!")
            time.sleep(10)
            exit()

        # Truthy process -> commits() prints per-commit details.
        process = 1 if input("是否爬取commits细节(Y/N):").upper() == "Y" else 0

        access(url_addr)
        print("[OK] 爬行结束 ...")
        # Either answer terminates the script; the prompt only keeps the
        # console window open until the user responds.
        input("是否关闭当前窗口(Y/N):")
        exit()
    
    if __name__ == "__main__":
        # colorama: autoreset returns the terminal to default style after each print
        init(autoreset=True)
        main()
    
  • 相关阅读:
    织梦DedeCms网站首页不生成html文件动态显示方法
    PHP7.0下安装DEDE织梦 出现 GD不支持的解决方法
    DEDECMS5.7支持伪静态的方法
    DEDECMS全站伪静态设置方法
    设置 SSH 通过密钥登录
    Windows安装OpenSSH服务
    VS Code远程开发工具错误-找不到SSH安装
    帝国CMS自定义列表的排序
    帝国CMS灵动标签e:loop的使用方法
    cisco 3750交换机堆叠后配置恢复方法
  • 原文地址:https://www.cnblogs.com/wangyuyang1016/p/14125103.html
Copyright © 2011-2022 走看看