zoukankan      html  css  js  c++  java
  • 大学排名爬虫

    # 利用 requests 库爬取网页内容
    import requests
    from bs4 import BeautifulSoup
    import bs4


    def get_text(url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the script forever.

        Returns:
            The decoded page text, or the placeholder string ``" "`` when the
            request fails (connection error, timeout, or HTTP error status).
        """
        try:
            r = requests.get(url, timeout=timeout)
            # Raises requests.HTTPError for 4xx/5xx responses.
            r.raise_for_status()
            # Prefer the encoding detected from the body over the one guessed
            # from the headers, so the text is decoded correctly.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Only network/HTTP failures are handled here; KeyboardInterrupt
            # and programming errors are deliberately allowed to propagate.
            print("异常")
            return " "


    def university_list(ulist, html):
        """Extract ranking rows from ``html`` and append them to ``ulist``.

        Each direct child tag of the first ``<tbody>`` is treated as a table
        row; the stripped text of its first five ``<td>`` cells is appended to
        ``ulist`` as a list of five strings.

        Args:
            ulist: List that receives one ``[c0, c1, c2, c3, c4]`` entry per row.
                It is modified in place.
            html: Markup of the ranking page.

        Returns:
            None. If the markup has no ``<tbody>``, ``ulist`` is left untouched.
        """
        soup = BeautifulSoup(html, "html.parser")
        tbody = soup.find("tbody")
        if tbody is None:
            # find() returns None when nothing matches (e.g. an empty or
            # placeholder document); there is nothing to extract.
            return
        for tr in tbody.children:
            # .children also yields the text nodes between rows; keep only tags.
            if not isinstance(tr, bs4.element.Tag):
                continue
            tds = tr("td")  # calling a tag is shorthand for find_all()
            if len(tds) < 5:
                # Rows without the five expected cells cannot be indexed safely.
                continue
            # strip() removes the whitespace/newlines surrounding the cell text.
            ulist.append([td.text.strip() for td in tds[:5]])


    def university_rank(ulist, num):
        """Print a header line followed by at most ``num`` rows of ``ulist``.

        Args:
            ulist: Sequence of rows; each row must have at least five items
                (the header labels them rank, name, province, type, score).
            num: Maximum number of rows to print. When it exceeds
                ``len(ulist)`` every available row is printed; values of zero
                or less print only the header.
        """
        # Field {5} supplies the fill character for the three middle columns:
        # chr(12288) is the full-width (ideographic) space, which keeps columns
        # containing Chinese text aligned.
        fill = chr(12288)
        template = "{0:^10} {1:{5}^10} {2:{5}^10} {3:{5}^10} {4:^10}"
        print(template.format("排名", "学校名称", "省市", "类型", "总分", fill))
        # Slicing never runs past the end of the list, unlike indexing with
        # range(num); max() keeps a negative num from slicing from the tail.
        for u in ulist[:max(num, 0)]:
            print(template.format(u[0], u[1], u[2], u[3], u[4], fill))


    def main():
        """Fetch the 2020 ranking page, parse it, and print the top rows.

        At most 76 rows are printed; if fewer rows were scraped, only those
        are printed. Nothing is parsed or printed when the download yields no
        content.
        """
        university_info = []
        url = "http://www.shanghairanking.cn/rankings/bcur/2020"
        html = get_text(url)
        if not html.strip():
            # A blank document means the download failed; there is no table
            # to parse, so stop instead of crashing in the parser.
            return
        university_list(university_info, html)
        # Never ask for more rows than were actually collected.
        university_rank(university_info, min(76, len(university_info)))


    # Run the scraper only when this file is executed as a script,
    # not when it is imported as a module.
    if "__main__" == __name__:
        main()

    转载自:https://blog.csdn.net/cjx_up/article/details/77883892

  • 相关阅读:
    reorder-list
    HMM基础
    binary-tree-preorder-traversal
    binary-tree-postorder-traversal
    GMM基础
    (七)打印机驱动设置—认识打印机接口
    (八)打印机驱动设置—串口的设置
    (五)打印机驱动设置—没有开不了的钱箱
    (六)打印机驱动设置—装完驱动后没有打印机图标
    (四)揭开打印机驱动的神秘面纱
  • 原文地址:https://www.cnblogs.com/zhang20200703/p/13949852.html
Copyright © 2011-2022 走看看