zoukankan      html  css  js  c++  java
  • python初学(三)

    1.以软科中国最好大学排名为分析对象,基于requests库和bs4库编写爬虫程序,对2015年至2019年间的中国大学排名数据进行爬取,并按照排名先后顺序输出不同年份的前10位大学信息,要求对输出结果的排版进行优化。

     1 import requests
     2 from bs4 import BeautifulSoup
     3 
     class Univ:
         """Fetch one ranking page and print its first ``num`` table rows.

         Attributes:
             url: Address of the ranking page to download.
             allUniv: One list per scraped table row; each holds the
                 ``.string`` of every ``<td>`` in that row (``None`` where a
                 cell has no single string).
             num: Maximum number of rows to print.
         """

         # Shared by the header and the data rows so the columns line up.
         _ROW_FORMAT = "{:^4}\t{:^20}\t{:^10}\t{:^8}\t{:^10}\t"
         # Number of leading cells of each row that are printed.
         _COLUMNS = 5

         def __init__(self, url, num):
             self.url = url
             self.allUniv = []
             self.num = num

         def get_htmltext(self):
             """Return the page body decoded as UTF-8, or '' if the request fails."""
             try:
                 r = requests.get(self.url, timeout=30)
                 r.raise_for_status()
                 r.encoding = 'utf8'
                 return r.text
             except requests.RequestException:
                 # Best effort: a failed download yields an empty page instead
                 # of aborting, but unrelated errors are no longer swallowed.
                 return ''

         def fillUnivList(self, soup):
             """Append the cell strings of every ``<tr>`` that has ``<td>`` cells.

             Args:
                 soup: Parsed document exposing ``find_all``.
             """
             for tr in soup.find_all('tr'):
                 cells = tr.find_all('td')
                 if not cells:
                     # Rows without any <td> cell carry no data.
                     continue
                 self.allUniv.append([td.string for td in cells])

         def printUnivList(self):
             """Print a header line followed by at most ``num`` scraped rows.

             Rows with fewer than five cells are skipped, and ``None`` cells
             are printed as empty strings, so a short or partially parsed
             table no longer raises.
             """
             fmt = self._ROW_FORMAT
             print(fmt.format("排名", "学校名称", "省市", "总分", "生源质量"))
             # Slicing caps the loop at the rows that were actually scraped.
             for i, u in enumerate(self.allUniv[:self.num]):
                 if len(u) < self._COLUMNS:
                     continue
                 cells = ["" if c is None else c for c in u[:self._COLUMNS]]
                 # Fall back to the row position when the rank cell is empty.
                 rank = cells[0] if cells[0] else i + 1
                 print(fmt.format(rank, *cells[1:]))

         def main(self):
             """Download the page, parse it, and print the ranking table."""
             html = self.get_htmltext()
             soup = BeautifulSoup(html, 'html.parser')
             self.fillUnivList(soup)
             self.printUnivList()
    44 
    if __name__ == "__main__":
        base = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming"
        # The 2015 page is the only one whose file name carries a "_0" suffix.
        pages = [("2015", base + "2015_0.html")]
        pages.extend((y, base + y + ".html") for y in ("2016", "2017", "2018", "2019"))
        # Print the year as a heading, then the top 10 rows of that year's table.
        for label, address in pages:
            print(label)
            Univ(address, 10).main()

    2.豆瓣图书评论数据爬取。在豆瓣图书上自行选择一本书,编写程序爬取豆瓣图书上针对该图书的短评信息,要求:

    (1)对不同页码的短评信息均可以进行爬取;

    (2)爬取的数据包含用户名、短评内容、评论时间和评分;

    能够根据选择的排序方式进行爬取,并针对热门排序,输出前10个短评信息(包括用户名、短评内容、评论时间和评分);

    (3)能够根据选择的排序方式进行爬取,并针对热门排序,输出前10个短评信息(包括用户名、短评内容、评论时间和评分);

    (4)结合中文分词和词云生成,对前3页的短评内容进行文本分析,并生成一个属于自己的词云图形。

     1 import requests
     2 import re
     3 import jieba
     4 import wordcloud
     5 from bs4 import BeautifulSoup
     6 from fake_useragent import UserAgent
     7 
     8 
     9 
    class com:
        """Scrape Douban short comments for one book and build a word cloud.

        Attributes:
            no: Douban subject id of the book.
            num: Number of comments printed by ``comment``.
            page: Number of comment pages read by ``txtcloud``.
            url: Address of the page fetched by the next ``get_html`` call.
            header: HTTP headers sent with each request.
            bookdata: Rows of ``[user, text, rating_matches, date]``.
            txt: Concatenated comment text used for the word cloud.
        """

        # "allstar50 rating" encodes the score times ten in the CSS class.
        _RATING_RE = re.compile(r'allstar(\d+) rating')
        _DATE_RE = re.compile(r'<span>(\d{4}-\d{2}-\d{2})</span>')

        def __init__(self, no, num, page):
            self.no = no
            self.page = page
            self.num = num
            self.url = None
            self.header = None
            self.bookdata = []
            self.txt = ''

        def set_header(self):
            """Pick a random User-Agent for the following requests."""
            ua = UserAgent()
            self.header = {
                "User-Agent": ua.random
            }

        def set_url(self, page):
            """Point ``url`` at the hot-sorted comment page (``page`` is 0-based)."""
            self.url = 'https://book.douban.com/subject/{0}/comments/hot?p={1}'.format(str(self.no), str(page + 1))

        def get_html(self):
            """Return the page body decoded as UTF-8, or '' if the request fails."""
            try:
                r = requests.get(self.url, headers=self.header, timeout=30)
                r.raise_for_status()
                r.encoding = 'utf8'
                return r.text
            except requests.RequestException:
                # Best effort: a failed download yields an empty page.
                return ''

        def fill_bookdata(self, soup):
            """Append one ``[user, text, rating_matches, date]`` row per comment.

            Missing pieces are stored as empty values instead of raising:
            ``''`` for an absent user, text or date, ``[]`` for an absent rating.

            Args:
                soup: Parsed document exposing ``find_all(name, css_class)``.
            """
            infos = soup.find_all('span', 'comment-info')
            shorts = soup.find_all('span', 'short')
            # zip stops at the shorter list, so a mismatch cannot index past the end.
            for info, short in zip(infos, shorts):
                markup = str(info)
                ratings = self._RATING_RE.findall(markup)
                dates = self._DATE_RE.findall(markup)
                user = info.a.string if info.a is not None else ''
                self.bookdata.append([
                    user,
                    short.string or '',
                    ratings,
                    dates[0] if dates else '',
                ])

        def printList(self, num):
            """Print at most ``num`` stored comments; unrated ones omit time and score."""
            for i, u in enumerate(self.bookdata[:num]):
                if u[2]:
                    # The captured group is digits only, so int() replaces eval().
                    stars = int(u[2][0]) // 10
                    print("序号: {}\n用户名: {}\n评论内容: {}\n时间:{}\n评分: {}星\n".format(i + 1, u[0], u[1], u[3], stars))
                else:
                    print("序号: {}\n用户名: {}\n评论内容: {}\n".format(i + 1, u[0], u[1]))

        def comment(self):
            """Fetch the first hot-sorted page and print the first ``num`` comments."""
            self.set_header()
            self.set_url(0)
            html = self.get_html()
            soup = BeautifulSoup(html, 'html.parser')
            self.fill_bookdata(soup)
            self.printList(self.num)

        def txtcloud(self):
            """Collect comment text from ``page`` pages and write ``comment.png``.

            The text is segmented with jieba before it is passed to WordCloud.
            Nothing is written when no comment text could be collected.
            """
            self.set_header()
            pieces = []
            for i in range(self.page):
                self.bookdata = []
                self.set_url(i)
                html = self.get_html()
                soup = BeautifulSoup(html, 'html.parser')
                self.fill_bookdata(soup)
                pieces.extend(row[1] for row in self.bookdata)
            self.txt += ''.join(pieces)
            if not self.txt.strip():
                # WordCloud.generate needs at least one word to draw.
                return
            # WordCloud splits on whitespace, so join the jieba tokens with spaces.
            words = ' '.join(jieba.lcut(self.txt))
            w = wordcloud.WordCloud(width=1000, font_path="msyh.ttc", height=700, background_color="white")
            w.generate(words)
            w.to_file("comment.png")

        def main(self):
            """Print the hot comments, then build the word cloud."""
            self.comment()
            self.txtcloud()
    81 
    if __name__ == "__main__":
        # Arguments: Douban subject id, comments to print, pages for the word cloud.
        crawler = com(34925415, 10, 10)
        crawler.main()

    3.设 $y_1 = x^2$,$y_2 = \cos x$,$y_3 = y_1 \cdot y_2$,其中 $x = 0, 1, \dots, 359$,完成下列操作:

    (1)在同一坐标系下用不同的颜色和线型绘制y1、y2和y3三条曲线;

     import matplotlib.pyplot as plt
     import numpy as np

     # Sample points and the three curves y1 = x^2, y2 = cos(x), y3 = y1 * y2.
     # NOTE(review): np.cos treats x as radians; if 0-359 was meant as degrees,
     # wrap x in np.radians() - confirm against the original exercise.
     x = np.arange(0, 360)
     y1 = x * x
     y2 = np.cos(x)
     y3 = y1 * y2

     # The task asks for a distinct colour AND line style per curve.
     plt.plot(x, y1, color='blue', linestyle='-', label='y1')
     plt.plot(x, y2, color='red', linestyle='--', label='y2')
     plt.plot(x, y3, color='green', linestyle=':', label='y3')
     plt.legend()
     plt.show()

    (2)在同一绘图框内以子图形式绘制y1、y2和y3三条曲线。

     import matplotlib.pyplot as plt
     import numpy as np

     # Sample points and the three curves y1 = x^2, y2 = cos(x), y3 = y1 * y2.
     x = np.arange(0, 360)
     y1 = x * x
     y2 = np.cos(x)
     y3 = y1 * y2

     # One row per curve in a 3x1 grid; subplot positions are 1-based.
     panels = ((y1, 'blue'), (y2, 'red'), (y3, 'green'))
     for position, (values, colour) in enumerate(panels, start=1):
         plt.subplot(3, 1, position)
         plt.plot(x, values, color=colour)
     plt.show()

    4.已知 $y_1 = \sqrt{2\sqrt{x^2} - x^2}$,$y_2 = -2.14\sqrt{\sqrt{2} - \sqrt{|x|}}$,在-2<=x<=2区间绘制该分段函数的曲线,以及由该曲线所包围的填充图形。

    import matplotlib.pyplot as plt
    import numpy as np

    # Dense sampling of [-2, 2) so the outline looks smooth.
    x = np.arange(-2, 2, 1e-5)

    # Upper boundary: sqrt(2*sqrt(x^2) - x^2).
    x_squared = np.power(x, 2)
    upper = np.sqrt(2 * np.sqrt(x_squared) - x_squared)

    # Lower boundary: -2.14 * sqrt(sqrt(2) - sqrt(|x|)).
    lower = -2.14 * np.sqrt(np.sqrt(2) - np.sqrt(np.abs(x)))

    # Draw both outlines in red, then fill the region between them.
    plt.plot(x, upper, 'r')
    plt.plot(x, lower, 'r')
    plt.fill_between(x, upper, lower, facecolor='red')
    plt.show()

     

  • 相关阅读:
    file is universal (3 slices) but does not contain a(n) armv7s slice error for static libraries on iOS
    WebImageButton does not change images after being enabled in Javascript
    ajax OPTION
    编程遍历页面上所有TextBox控件并给它赋值为string.Empty?
    获取海洋天气预报
    C#线程系列教程(1):BeginInvoke和EndInvoke方法
    js控制只能输入数字和小数点
    Response.AddHeader(,)
    ManualResetEvent的理解
    Convert.ToInt32、int.Parse(Int32.Parse)、int.TryParse、(int) 区别
  • 原文地址:https://www.cnblogs.com/unknowcry/p/12730638.html
Copyright © 2011-2022 走看看