I scraped Sina News, starting from the page http://news.sina.com.cn/china/:
Open the page, collect the links we need, and then start the project.
1. Get the comment count:
```python
# Get the comment count for one article
def getCommentsCounts(newsurl):
    # pull the article ID out of a URL like .../doc-ifzihneq2559172.shtml
    bianhao = re.search(r'doc-i(.+)\.shtml', newsurl)  # escape the dot so it matches literally
    newsid = bianhao.group(1)
    comment = requests.get(commentURL.format(newsid))
    jd = json.loads(comment.text)
    counts = jd['result']['count']['total']
    return counts
```
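For a quick sanity check, `getCommentsCounts` can be called directly on the sample article URL used later in the full script. This assumes `commentURL` has already been defined, as in the complete code at the end:

```python
commentURL = ('http://comment5.news.sina.com.cn/page/info?version=1'
              '&format=json&channel=gn&newsid=comos-{}&group=undefined'
              '&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3')

url = 'http://news.sina.com.cn/c/zj/2018-04-20/doc-ifzihneq2559172.shtml'
print(getCommentsCounts(url))  # prints the article's total comment count as an integer
```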
2. Get the article content:
```python
# Get the details of one article
def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # title
    result['title'] = soup.select('.main-title')[0].text
    # source
    result['newssources'] = soup.select('.source')[0].text
    # publication time
    result['timesource'] = soup.select('.date')[0].text
    # editor: drop the "责任编辑:" prefix and keep the whole name
    # (the original .strip('责任编辑:')[-1] kept only the last character)
    result['editor'] = soup.select('.show_author')[0].text.strip().lstrip('责任编辑:')
    # comment count: use the function's own argument, not the global url
    result['comments'] = getCommentsCounts(newsurl)
    # body text
    result['contents'] = soup.select('.article')[0].text.strip()
    # writeNewsContent(content)
    return str(result['contents'])
```
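Called on a single article it looks like this (same sample URL as in the full script; the function returns the article body as one string):

```python
url = 'http://news.sina.com.cn/c/zj/2018-04-20/doc-ifzihneq2559172.shtml'
content = getNewsDetail(url)
print(content[:200])  # preview the first 200 characters of the body
```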
3. Save to a txt file:
```python
# Append one article's text to news.txt
def writeNewsContent(content):
    with open('news.txt', 'a', encoding='utf-8') as f:
        f.write(content)
```
The resulting txt file:
4. Word-frequency analysis and word-cloud generation:
```python
# replace punctuation with spaces, then segment with jieba
for c in sep:
    news = news.replace(c, ' ')
wordList = list(jieba.cut(news))

# count occurrences of each distinct word, excluding stop words
wordDict = {}
words = list(set(wordList) - exclude)
for w in range(0, len(words)):
    wordDict[words[w]] = news.count(str(words[w]))
dictList = list(wordDict.items())
dictList.sort(key=lambda x: x[1], reverse=True)

# write the top frequencies to file and collect them for the word cloud
cy = {}
f = open('news.txt', 'a', encoding='utf-8')
for i in range(min(1000, len(dictList))):  # guard against fewer than 1000 distinct words
    print(dictList[i])
    f.write(dictList[i][0] + ':' + str(dictList[i][1]) + ' ')
    cy[dictList[i][0]] = dictList[i][1]
f.close()

font = r'C:\Windows\Fonts\wb.ttf'      # a font with Chinese glyphs
image = Image.open('./wordcloud.jpg')  # mask image that shapes the cloud
graph = np.array(image)
wc = WordCloud(font_path=font, background_color='White', max_words=50, mask=graph)
wc.generate_from_frequencies(cy)
image_color = ImageColorGenerator(graph)
plt.imshow(wc)
plt.axis("off")
plt.show()
```
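One note: `image_color` is built above but never applied, so the cloud is drawn in wordcloud's default colors. To tint the words with the colors of the mask image instead, wordcloud's `recolor` method can be used (an optional tweak, not part of the original output):

```python
wc.recolor(color_func=image_color)  # repaint the layout with colors sampled from the mask
plt.imshow(wc)
plt.axis("off")
plt.show()
```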
The resulting word-cloud image:
The main problem I ran into during this assignment was installing wordcloud, which failed with Fatal error in launcher: Unable to create process using '"'. It wasn't the only problem, just the key one that I only pinned down at the end; it cost me two days, and so began my battle with wordcloud. After combing through all sorts of material, I finally found the answer in a blog post (https://blog.csdn.net/testcs_dn/article/details/54176504): upgrade pip first. For some reason the first attempt failed, but luckily the second try worked.
Then download the whl file and install it. I won't spell the steps out here since they're easy to find online, but the two commands are sketched below.
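In short, the fix came down to roughly these two commands (the wheel filename here is only an example; pick the build that matches your Python version and architecture from the download page):

```
python -m pip install --upgrade pip
pip install wordcloud-1.4.1-cp36-cp36m-win_amd64.whl
```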
Finally, the full assignment code:
```python
import requests
import json
import re
from bs4 import BeautifulSoup
import jieba

# Get the comment count for one article
def getCommentsCounts(newsurl):
    bianhao = re.search(r'doc-i(.+)\.shtml', newsurl)
    newsid = bianhao.group(1)
    comment = requests.get(commentURL.format(newsid))
    jd = json.loads(comment.text)
    counts = jd['result']['count']['total']
    return counts

# Get the details of one article
def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # title
    result['title'] = soup.select('.main-title')[0].text
    # source
    result['newssources'] = soup.select('.source')[0].text
    # publication time
    result['timesource'] = soup.select('.date')[0].text
    # editor: drop the "责任编辑:" prefix and keep the whole name
    result['editor'] = soup.select('.show_author')[0].text.strip().lstrip('责任编辑:')
    # comment count: use the function's own argument, not the global url
    result['comments'] = getCommentsCounts(newsurl)
    # body text
    result['contents'] = soup.select('.article')[0].text.strip()
    # writeNewsContent(content)
    return str(result['contents'])

# Save to txt
def writeNewsContent(content):
    with open('news.txt', 'a', encoding='utf-8') as f:
        f.write(content)

# Parse one page of the news-list API
def parseListLinks(url):
    newsdetails = []
    res = requests.get(url)
    # strip the JSONP wrapper "newsloadercallback(...);" to get plain JSON
    jss = res.text.lstrip('  newsloadercallback(').rstrip(');')
    jd = json.loads(jss)
    for news in jd['result']['data']:
        allURL = news['url']
        newsdetails.append(getNewsDetail(allURL).split())
    writeNewsContent(str(newsdetails))
    return newsdetails

# URLs joined across lines; stray spaces from line-wrapping removed
commentURL = ('http://comment5.news.sina.com.cn/page/info?version=1'
              '&format=json&channel=gn&newsid=comos-{}&group=undefined'
              '&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3')
url = 'http://news.sina.com.cn/c/zj/2018-04-20/doc-ifzihneq2559172.shtml'
listURL = ('http://api.roll.news.sina.com.cn/zt_list?channel=news'
           '&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj'
           '&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1'
           '&format=json&page={}&callback=newsloadercallback&_=1524705663198')

news_total = []
for i in range(1, 2):
    newssurl = listURL.format(i)
    newsary = parseListLinks(newssurl)
    news_total.extend(newsary)
print(len(news_total))
```
```python
import jieba
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

f = open('content.txt', 'r', encoding='utf-8')
news = f.read()
f.close()

# punctuation to strip and stop words to drop
sep = ''',。‘’“”:;()!?、《》[] '''
exclude = {'的', '下', '中', '就', '是', '■'}

# custom dictionary entries so jieba keeps these names intact
jieba.add_word('中国芯')
jieba.add_word('倪光南')
jieba.add_word('梁宁')
jieba.add_word('沈静文')
jieba.add_word('宋爽')
jieba.add_word('冯志远')
jieba.add_word('霍宇昂')
jieba.add_word('杨冠宇')
jieba.add_word('杨渡')

# replace punctuation with spaces, then segment and count
for c in sep:
    news = news.replace(c, ' ')
wordList = list(jieba.cut(news))
wordDict = {}
words = list(set(wordList) - exclude)
for w in range(0, len(words)):
    wordDict[words[w]] = news.count(str(words[w]))
dictList = list(wordDict.items())
dictList.sort(key=lambda x: x[1], reverse=True)

cy = {}
f = open('news.txt', 'a', encoding='utf-8')
for i in range(min(1000, len(dictList))):  # guard against fewer than 1000 distinct words
    print(dictList[i])
    f.write(dictList[i][0] + ':' + str(dictList[i][1]) + ' ')
    cy[dictList[i][0]] = dictList[i][1]
f.close()

font = r'C:\Windows\Fonts\wb.ttf'      # a font with Chinese glyphs
image = Image.open('./wordcloud.jpg')  # mask image that shapes the cloud
graph = np.array(image)
wc = WordCloud(font_path=font, background_color='White', max_words=50, mask=graph)
wc.generate_from_frequencies(cy)
image_color = ImageColorGenerator(graph)
plt.imshow(wc)
plt.axis("off")
plt.show()
```
The word-cloud mask image:
The generated word cloud: