zoukankan      html  css  js  c++  java
  • 【爬虫】获取Github仓库提交纪录历史的脚本 python

    本脚本为第一版开发;后续会进行扩展

    #! python3
    
    import requests
    import time, datetime
    import json
    from colorama import Fore,Back,Style,init
    from bs4 import BeautifulSoup
    
    process = 0
    output  = 0
    
    
    def req(type,addr,data='',**args):
        if type == 'get':
            try:
                responses = requests.get(addr,timeout=50)
            except requests.exceptions.RequestException as e:
                pass
        elif type == 'post':
            try:
                responses = requests.post(addr,timeout=50)
            except requests.exceptions.RequestException as e:
                pass
        return responses
    
    def access(url_addr):
        # print("access")
        for i in url_addr:
            print(i['git_addr'])
            responses = req('get',i['git_addr'])
            if responses.status_code == 200:
                print("[SUCCESS]" + " %s [status] %s"%(str(i['git_addr']), str(responses.status_code)))
                i['git_addr'] = i['git_addr'] + '/commits/'
                commits(i)
            else :
                print(Fore.BLACK + Back.RED + "[ERROR]   "+"%s [status] %s"%(str(i['git_addr']), str(responses.status_code)))
                i['code'] = responses.status_code
                # return 
    
    def commits(addr):
        url = addr['git_addr']
        responses = req('get', url)
        if responses.status_code != 200:
            print("[SUCCESS] %s [status] %s"%(str(url), str(responses.status_code)))
            addr['code'] = responses.status_code
            return 
        text = BeautifulSoup(responses.text, "html.parser")
        # 判断空仓库
    
        if "This repository is empty." in text:
            print(print(Fore.RED + Back.WHITE +"%s 的仓库内容爬取过程中发现告警[This repository is empty.]"%(addr['username'])))
            return
        # commits_all_dict = []
        
        all_commits = text.find_all(class_='TimelineItem-body')
        
        # 展露细节内容的
        try:
            for texts in all_commits:
                dateBar = texts.find(class_='text-normal').get_text()[11:] # 日期
                # 我们获取的日期格式是标准的英文格式日期"Nov 26, 2020",所以我们需要进行日期的转换
                date = datetime.datetime.strptime(dateBar, '%b %d, %Y').strftime('%Y年%m月%d日')
                commits_second = 0
                if process:
                    print("
    =================[%s]================="%(str(date)))
                all_commits_find = texts.ol.find_all('li')
                for commits_find in all_commits_find:
                    commits_dict = {
                        'commits_auth' : commits_find.div.find('div',class_='d-flex').find('div',class_='f6').find(class_='commit-author').get_text(),
                        'commits_time' : commits_find.find('relative-time')['datetime'],  # 当前日期所提交的内容  
                        'commits_href' : "https://github.com" + commits_find.div.p.a['href'],
                        # 我们的text中式把summary和description内容融合在一起的于是我们需要把他们分开
                        'commits_summary' : commits_find.div.p.a['aria-label'][:len(commits_find.div.p.a.get_text())] ,
                        'commits_description' :  commits_find.div.p.a['aria-label'][len(commits_find.div.p.a.get_text()):].strip()
                    }
                    # commits_all_dict.append(commits_dict)
                    commits_second += 1
                    # 处理爬取数据的输出
                    if process :
                        print("
    -----------------[%s]-----------------"%(commits_dict['commits_auth']))
                        print ("[提交时间] %s 
    [提交代码] %s
    [提交主题] %s
    [提交描述] %s"
                                    %(commits_dict['commits_time'], commits_dict['commits_href'], 
                                    commits_dict['commits_summary'], commits_dict['commits_description']))
            print(Fore.BLACK + Back.WHITE +"%s 于 %s 共计提交了 %s 次代码"%(addr['username'], date, commits_second))
                    
            # 处理分页爬取
            next_a = text.find(class_='paginate-container').find_all('a')
            if  len(next_a) and next_a[-1].get_text() == 'Older':
                print("------next page------")
                addr['git_addr'] = next_a[-1]['href']
                commits(addr)
        except Exception as e:
            print(print(Fore.RED + Back.WHITE +"%s 的仓库爬取过程中发生错误."%(addr['username'])))
            return
    
    def main():
        global process
        url_addr = [
            {
                'username' : 'X1',
                'git_addr' : 'https://github.com/litbird0/elevator', # 项目地址
                'start'    : '', 
                'commins'  : [],
            },
            {
                'username' : 'X2',
                'git_addr' : 'https://github.com/1564820398/cjwc_dianti',
                'start'    : '', 
                'commins'  : [],
            },
    
        ]
    
        Webcrawler_key = "mirror"
        if input("请输入爬虫Key:") != Webcrawler_key:
            print(Fore.RED + Back.WHITE + "Key错误!")
            time.sleep(10)
            exit()
    
        if input("是否爬取commits细节(Y/N):").upper() == "Y":
            process = 1
        else :
            process = 0
    
        access(url_addr)
        print("[OK] 爬行结束 ...")
        if input("是否关闭当前窗口(Y/N):").upper() == "Y":
            exit()
        else :
            pass
        exit()
    
    if __name__ == "__main__":
        init(autoreset=True)
        main()
    
  • 相关阅读:
    sabaki and leelazero
    apply current folder view to all folders
    string operation in powershell
    wirte function in powershell
    add environment path to powershell
    Module in powershell
    sql prompt
    vmware中鼠标在部分区域不能使用
    调整多个控件的dock的顺序
    行为型模型 策略模式
  • 原文地址:https://www.cnblogs.com/wangyuyang1016/p/14125103.html
Copyright © 2011-2022 走看看