1. Pick a topic you are interested in.
2. Write a crawler in Python to scrape data on that topic from the web.
3. Run text analysis on the scraped data and generate a word cloud.
4. Interpret and explain the results of the text analysis.
5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, the data-analysis approach, and the conclusions.
6. Finally, submit all of the scraped data along with the crawler and analysis source code.
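The topic chosen here is NBA news from Hupu (the tag pages under voice.hupu.com). The script below crawls three list pages, saves each article's text to content.txt, extracts keywords with jieba into keyword.txt, and finally renders a word cloud with wordcloud and matplotlib. A small robustness sketch for the fetching step follows the script.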
import requests
from bs4 import BeautifulSoup
import re
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
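# Pipeline: Get_page (list page) -> getNewsDetail (article page)
#   -> getKeynews (jieba keyword extraction) -> keyword.txt / content.txt
#   -> getWordCloud (reads keyword.txt, writes wordcloud.jpg)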
# Append the keywords extracted from one article to keyword.txt
def writeFilekeynews(keywords):
    with open('keyword.txt', 'a', encoding='utf-8') as f:
        for word in keywords:
            f.write(' ' + word)
# Append one article's text to content.txt
def writeFilecontent(content):
    with open('content.txt', 'a', encoding='utf-8') as f:
        f.write('\n' + content)
def getWordCloud():
    keynewsTowordcloud = open('keyword.txt', 'r', encoding='utf-8').read()
    print(keynewsTowordcloud)
    # The Windows font path must keep its backslashes (hence the raw string),
    # and a Chinese-capable font is required or the words render as boxes
    wc = WordCloud(font_path=r'C:\Windows\Fonts\AdobeKaitiStd-Regular.otf',
                   background_color='white', max_words=150).generate(keynewsTowordcloud)
    wc.to_file('wordcloud.jpg')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
def getKeynews(content):
    # Keep only the Chinese characters, joined into one punctuation-free string
    content = ''.join(re.findall('[\u4e00-\u9fa5]', content))
    # Segment with jieba and deduplicate into a set
    newSet = set(jieba.lcut(content))
    newDict = {}
    for i in newSet:
        newDict[i] = content.count(i)
    deleteList, keynews = [], []
    for i in newDict.keys():
        if len(i) < 2:
            deleteList.append(i)  # drop single-character tokens, which carry little meaning
    deleteList.append('编辑')  # drop the boilerplate byline word '编辑' ("editor")
    for i in deleteList:
        if i in newDict:
            del newDict[i]
    dictList = list(newDict.items())
    dictList.sort(key=lambda item: item[1], reverse=True)  # sort by frequency, descending
    for item in dictList:
        keynews.append(item[0])
    return keynews
def getNewsDetail(newsUrl):
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    # '.artical-main-content' is Hupu's own class name for the article body
    content = soupd.select('.artical-main-content')[0].text
    writeFilecontent(content)
    keynews = getKeynews(content)
    writeFilekeynews(keynews)
def Get_page(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Each '.list' entry in the tag list box holds one article link
    for new in soup.select('.tag-list-box')[0].select('.list'):
        newsUrl = new.select('.list-content')[0].select('.name')[0].select('.n1')[0].select('a')[0]['href']
        getNewsDetail(newsUrl)
# Crawl the first three list pages of the tag, then build the word cloud
Get_page('https://voice.hupu.com/nba/tag/3023-1.html')
for i in range(2, 4):
    Get_page('https://voice.hupu.com/nba/tag/3023-{}.html'.format(i))
getWordCloud()
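One problem worth guarding against when scraping is a slow or rejected request: the script above sets no timeout and uses requests' default User-Agent. Below is a minimal hardening sketch, assuming only the standard requests API; the fetch_html helper and the User-Agent string are illustrative additions, not part of the original script.

def fetch_html(url, retries=3, timeout=10):
    # Retry with a desktop User-Agent and a timeout; some sites reject
    # requests' default User-Agent or hang indefinitely without a timeout
    headers = {'User-Agent': 'Mozilla/5.0'}
    last_exc = None
    for _ in range(retries):
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            res.raise_for_status()
            res.encoding = 'utf-8'
            return res.text
        except requests.RequestException as exc:
            last_exc = exc
    raise last_exc

Get_page and getNewsDetail could then pass fetch_html(url) straight to BeautifulSoup instead of calling requests.get themselves.

For step 4 (explaining the analysis results), it also helps to look at raw counts before interpreting the picture. A quick check of the most frequent words in keyword.txt, assuming the space-separated format written by writeFilekeynews:

from collections import Counter

with open('keyword.txt', encoding='utf-8') as f:
    words = f.read().split()
print(Counter(words).most_common(20))  # top 20 keywords with their counts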
Screenshot: