1. Pick a topic you are interested in.
2. Write a crawler in Python that scrapes data on that topic from the web.
3. Run text analysis on the scraped data and generate a word cloud.
4. Explain and interpret the results of the text analysis.
5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the data-analysis approach and conclusions.
6. Finally, submit all of the scraped data along with the crawler and analysis source code.
The crawler: fetch article links from Sina's rolling-news API, pull out each article's title, source, date, editor, comment count, and body, and save the body text for analysis.

# -*- coding: UTF-8 -*-
import json
import re

import requests
from bs4 import BeautifulSoup

# JSONP endpoint that returns the comment count for a news id
commentURL = ('http://comment5.news.sina.com.cn/page/info?version=1'
              '&format=json&channel=gn&newsid=comos-{}&group=undefined'
              '&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3')
# Rolling-news list API; {} is filled with the page number
listURL = ('http://api.roll.news.sina.com.cn/zt_list?channel=news'
           '&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2'
           '&show_ext=1&show_all=1&show_num=22&tag=1&format=json'
           '&page={}&callback=newsloadercallback&_=1524705663198')
# A sample article URL, handy for testing a single page
url = 'http://finance.sina.com.cn/chanjing/gsnews/2018-04-29/doc-ifzvpatq7964658.shtml'


# Get the comment count of one article
def getCommentsCounts(newsurl):
    # The news id is embedded in the URL, e.g. doc-ifzvpatq7964658.shtml
    newsid = re.search(r'doc-i(.+)\.shtml', newsurl).group(1)
    comment = requests.get(commentURL.format(newsid))
    jd = json.loads(comment.text)
    return jd['result']['count']['total']


def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Title
    result['title'] = soup.select('.main-title')[0].text
    # Source
    result['newssources'] = soup.select('.source')[0].text
    # Publication time
    result['timesource'] = soup.select('.date')[0].text
    # Editor, with the leading "责任编辑:" label stripped off
    result['editor'] = soup.select('.show_author')[0].text.lstrip('责任编辑:')
    # Comment count for this article
    result['comments'] = getCommentsCounts(newsurl)
    # Body text
    result['contents'] = soup.select('.article')[0].text.strip()
    return str(result['contents'])


# Append article text to a txt file
def writeNewsContent(content):
    with open('news.txt', 'a', encoding='utf-8') as f:
        f.write(content)


def parseListLinks(url):
    newsdetails = []
    res = requests.get(url)
    # Strip the JSONP wrapper "newsloadercallback(...);" to get plain JSON
    jss = res.text.lstrip(' newsloadercallback(').rstrip(');')
    jd = json.loads(jss)
    for news in jd['result']['data']:
        allURL = news['url']
        newsdetails.append(getNewsDetail(allURL).split())
    writeNewsContent(str(newsdetails))
    return newsdetails


news_total = []
for i in range(1, 2):
    newssurl = listURL.format(i)
    newsary = parseListLinks(newssurl)
    news_total.extend(newsary)
print(len(news_total))

The analysis: segment the saved text with jieba, count word frequencies, keep the top words, and render them as a word cloud over a mask image.

import jieba
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

# Read back the text that the crawler saved
with open('news.txt', 'r', encoding='utf-8') as f:
    news = f.read()

# Blank out punctuation, then drop common stop words after segmentation
sep = ''',。‘’“”:;()!?、《》[] '''
exclude = {'的', '下', '中', '就', '是', '■'}
for c in sep:
    news = news.replace(c, ' ')

wordList = list(jieba.cut(news))
words = list(set(wordList) - exclude)
# Frequency of each distinct word (substring counting; see the
# Counter-based sketch below for a token-exact alternative)
wordDict = {}
for w in words:
    wordDict[w] = news.count(w)
dictList = sorted(wordDict.items(), key=lambda x: x[1], reverse=True)

# Keep the top words for the cloud and log them to a separate file
cy = {}
with open('wordcounts.txt', 'a', encoding='utf-8') as f:
    for word, count in dictList[:1000]:
        print((word, count))
        f.write(word + ':' + str(count) + ' ')
        cy[word] = count

# Render the cloud; a Chinese font is required or CJK glyphs show as boxes
font = r'C:\Windows\Fonts\wb.ttf'
image = Image.open('./wordcloud.jpg')
graph = np.array(image)
wc = WordCloud(font_path=font, background_color='White', max_words=50, mask=graph)
wc.generate_from_frequencies(cy)
# Recolor the words from the mask image's own colors
image_color = ImageColorGenerator(graph)
plt.imshow(wc.recolor(color_func=image_color))
plt.axis('off')
plt.show()
The generated word cloud:
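A note on the JSONP handling above: the lstrip(' newsloadercallback(') / rstrip(');') pair happens to work for this feed, but str.lstrip and str.rstrip remove any run of the listed characters rather than a literal prefix or suffix, so they can over-strip on other responses. Below is a minimal sketch of a sturdier unwrapper; parse_jsonp is a hypothetical helper name, not part of the original script.

import json
import re

def parse_jsonp(text):
    # Extract the {...} payload from a wrapper like "newsloadercallback({...});".
    # re.S lets "." match newlines inside the JSON body.
    m = re.search(r'\(\s*(\{.*\})\s*\)\s*;?\s*$', text, re.S)
    if m is None:
        raise ValueError('not a JSONP response: %r' % text[:80])
    return json.loads(m.group(1))

# e.g. jd = parse_jsonp(res.text) would replace the lstrip/rstrip pair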
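And a note on the counting step: news.count(w) counts substring occurrences, so a short word is also tallied every time it appears inside a longer one, and the whole text is rescanned once per distinct word. A minimal sketch of an exact, single-pass alternative using collections.Counter over the jieba tokens; the helper name top_words is mine, not from the original script.

from collections import Counter

import jieba

def top_words(text, stopwords, n=1000):
    # Count whole jieba tokens, skipping whitespace and stop words,
    # so "中" is not also counted inside every occurrence of "中国".
    tokens = [w for w in jieba.cut(text) if w.strip() and w not in stopwords]
    return Counter(tokens).most_common(n)

# e.g. cy = dict(top_words(news, exclude)) feeds generate_from_frequencies directly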