各种编程语言我都很喜欢,但平时用得最多的是什么呢?
一个 GitHub 小爬虫:获取全部 repo 及其主要语言,并画出饼图。
"""
你是什么成份?
"""
import requests
from pyquery import PyQuery as pq
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
def parse_page(url):
    """Download one repository-listing page and extract its repositories.

    Args:
        url: Absolute URL of the listing page to fetch.

    Returns:
        A ``(repos, next_pages)`` tuple. ``repos`` is a list of dicts with
        the keys ``'name'`` (text of the entry's ``h3``) and ``'language'``
        (text of its ``itemprop='programmingLanguage'`` element). ``next_pages``
        is a list containing the absolute URL of the pagination link labelled
        "Next", or an empty list when no such link with an ``href`` exists.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Function-scope import so the block does not depend on a file-level edit.
    from urllib.parse import urljoin

    print("visiting", url)
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as an empty listing.
    resp.raise_for_status()
    html = pq(resp.text)

    repos = []
    for node in html("#user-repositories-list li"):
        repo = pq(node)
        repos.append({
            'name': repo('h3').text(),
            'language': repo("[itemprop='programmingLanguage']").text(),
        })

    next_pages = []
    for node in html(".pagination a"):
        anchor = pq(node)
        if anchor.text().strip() == 'Next':
            href = anchor.attr("href")
            if href:
                # Resolve against the current page so relative links work too.
                next_pages.append(urljoin(url, href))
            break
    return repos, next_pages
def analyze(repos):
    """Draw a pie chart of how often each language occurs among ``repos``.

    Args:
        repos: Iterable of dicts with the keys ``'name'`` and ``'language'``.
            Entries sharing a name are counted once (the last one wins), and
            entries with a falsy language are ignored.
    """
    # De-duplicate by repository name; a later entry replaces an earlier one.
    by_name = {}
    for entry in repos:
        by_name[entry['name']] = entry

    language_counts = Counter(
        entry['language'] for entry in by_name.values() if entry['language']
    )
    labels = language_counts.keys()
    sizes = np.array(list(language_counts.values()))

    # Per-wedge offset from the centre: 0 keeps a wedge flush, 0.1 pulls it out.
    explode = np.zeros_like(sizes, dtype=np.float32)
    # Pull out the (up to) three most frequent languages.
    top_three = np.argsort(sizes)[-3:]
    explode[top_three] = 0.1

    # startangle sets the angle at which the first wedge begins.
    plt.pie(
        sizes,
        explode=explode,
        labels=labels,
        autopct='%1.1f%%',
        shadow=False,
        startangle=90,
    )
    plt.show()
def schedule(user="weiyinfu"):
    """Crawl the repository-listing pages of a GitHub user.

    Starts at the user's ``?tab=repositories`` page and keeps visiting the
    follow-up URLs returned by ``parse_page`` until none remain.

    Args:
        user: GitHub account name to crawl. Defaults to ``"weiyinfu"``, the
            value that used to be hard-coded, so existing calls are unchanged.

    Returns:
        A list with the repo dicts of every visited page, in visit order.
    """
    seed = "https://github.com/" + user + "?tab=repositories"
    pending = [seed]
    # The seed counts as visited so a link back to it is never crawled twice.
    visited = {seed}
    repos = []
    while pending:
        current = pending.pop()
        page_repos, next_urls = parse_page(current)
        for link in next_urls:
            if link not in visited:
                visited.add(link)
                pending.append(link)
        repos.extend(page_repos)
    return repos
def main():
    """Crawl the repositories, print them, then chart their languages."""
    collected = schedule()
    print(collected)
    analyze(collected)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()