zoukankan      html  css  js  c++  java
  • 你是啥成份?

    各种编程语言我都很喜欢,但平时用的最多的是什么呢?

    一个 GitHub 小爬虫:抓取某个用户的全部 repo 及其主要语言,统计后画出饼图。

    """
    你是什么成份?
    """
    
    import requests
    from pyquery import PyQuery as pq
    import matplotlib.pyplot as plt
    from collections import Counter
    import numpy as np
    
    
    def parse_page(url, timeout=10):
        """Fetch one GitHub repository-list page and extract its repositories.

        Args:
            url: Address of the repository-list page to download.
            timeout: Seconds to wait for the server before ``requests`` gives
                up; without it a stalled connection would hang the crawl.

        Returns:
            A ``(repos, sons)`` tuple. ``repos`` is a list of dicts with the
            keys ``'name'`` and ``'language'``, filled from the text of the
            ``h3`` and ``programmingLanguage`` elements of each list item.
            ``sons`` is a list holding the absolute URL of the next page, or
            an empty list when the page has no usable "Next" link.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urljoin

        print("visiting", url)
        resp = requests.get(url, timeout=timeout)
        html = pq(resp.text)
        repos = [
            {
                'name': repo('h3').text(),
                'language': repo("[itemprop='programmingLanguage']").text(),
            }
            for repo in html("#user-repositories-list li").items()
        ]
        sons = []
        for link in html(".pagination a").items():
            if link.text().strip() == 'Next':
                href = link.attr("href")
                if href:
                    # Resolve against the current page so a relative href
                    # still yields a URL that can be requested directly.
                    sons.append(urljoin(url, href))
                break
        return repos, sons
    
    
    def analyze(repos):
        """Plot a pie chart of how many repositories use each language.

        Repositories are de-duplicated by name (the last entry seen for a name
        wins) and entries with an empty language are left out of the count.
        Up to three of the largest wedges are pulled out slightly from the pie.
        """
        # Collapse duplicates: one record per repository name.
        by_name = {repo['name']: repo for repo in repos}
        language_counts = Counter(
            repo['language'] for repo in by_name.values() if repo['language']
        )
        labels = language_counts.keys()
        sizes = np.array(list(language_counts.values()))
        # An explode value of 0.1 shifts that wedge outward; 0 leaves it in place.
        explode = np.zeros_like(sizes, dtype=np.float32)
        top_three = np.argsort(sizes)[-3:]
        explode[top_three] = 0.1  # emphasise the three largest wedges
        plt.pie(
            sizes,
            explode=explode,
            labels=labels,
            autopct='%1.1f%%',
            shadow=False,
            startangle=90,  # angle at which the first wedge starts
        )
        plt.show()
    
    
    def schedule(user="weiyinfu"):
        """Crawl every repository-list page of a GitHub user.

        Starts from the user's ``?tab=repositories`` page and keeps following
        the next-page URLs reported by ``parse_page`` until none are left.

        Args:
            user: GitHub account name whose repositories are listed. Defaults
                to the account the script was originally written for.

        Returns:
            A list with the repository dicts of all visited pages, in the
            order the pages were crawled.
        """
        seed = "https://github.com/" + user + "?tab=repositories"
        pending = [seed]
        # The seed counts as visited from the start so that a link pointing
        # back to it is not crawled a second time.
        visited = {seed}
        repos = []
        while pending:
            now = pending.pop()
            repo_list, url_list = parse_page(now)
            repos.extend(repo_list)
            for next_url in url_list:
                if next_url not in visited:
                    visited.add(next_url)
                    pending.append(next_url)
        return repos
    
    
    def main():
        """Crawl the repositories, echo them, then draw the language pie chart."""
        collected = schedule()
        print(collected)
        analyze(collected)
    
    
    # Run the crawler and plot only when executed as a script, not on import.
    if __name__ == '__main__':
        main()
    
    
  • 相关阅读:
    阅读笔记(四)——《代码整洁之道》1
    阅读笔记(一)——《需求工程——软件建模与分析》一
    数据提取
    HIVE数据清洗练习
    第八周hadoop-MapReduce
    小程序新增页面导航栏
    个人课程总结
    第十六周总结

    排序
  • 原文地址:https://www.cnblogs.com/weiyinfu/p/9704368.html
Copyright © 2011-2022 走看看