zoukankan      html  css  js  c++  java
  • #016 爬虫第一次尝试

     1 import requests
     2 from bs4 import BeautifulSoup
     3 import bs4
     4 
     5 def getHTMLText(url):
     6      try:
     7           r = requests.get(url, timeout = 30)
     8           r.raise_for_status()
     9           r.encoding = r.apparent_encoding
    10           return r.text
    11      
    12           
    13      except:
    14           return ""
    15           
    16      return ""
    17 
    18 def fillUnivList(ulist,html):
    19      soup = BeautifulSoup(html, "html.parser")
    20      for tr in soup.find('tbody').children:
    21           if isinstance(tr, bs4.element.Tag):
    22                tds = tr('td')
    23                ulist.append([tds[0].string, tds[1].string, tds[2].string])
    24 
    25                
    26                
    27      
    28 
    29 def printUnivList(ulist, num):
    30      print("{:^10}	{:^6}	{:^10}".format("排名","学校名称","总分"))
    31      for i in range(num):
    32           u=ulist[i]
    33           print("{:^10}	{:^6}	{:^10}".format(u[0],u[1],u[2]))
    34 
    35 def main():
    36      uinfo = []
    37      url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
    38      html = getHTMLText(url)
    39      fillUnivList(uinfo,html)
    40      printUnivList(uinfo, 20)
    41 main()
    42      

    结合这门课第一次敲的爬虫……没有 Python 语法基础,主要是老师说啥我就咋敲,没有技术含量。就是不知道该咋敲,一直报错。

  • 相关阅读:
    ThreadPoolExecutor线程池和ProcessPoolExecutor进程池
    canvas画一个时钟
    js中一些注意点 ps不断更新中....
    javascript文档节点
    Go网络编程
    Goroutine和Channel
    Go单元测试
    简单了解Go语言JSON包
    Go的命令行参数
    Go语言文件操作
  • 原文地址:https://www.cnblogs.com/hx97/p/10607503.html
Copyright © 2011-2022 走看看