1. Pick a topic you are personally interested in (no two students may choose the same one).
2. Write a crawler in Python to scrape data on that topic from the web.
3. Run text analysis on the scraped data and generate a word cloud.
4. Explain and interpret the results of the text analysis.
5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the ideas and conclusions of the data analysis.
6. Finally, submit all the scraped data plus the crawler and data-analysis source code.
The topic I chose is to crawl the authors of all the news articles in the 业界 (industry) section of 雷锋网 (Leiphone) and work out which author has published the most articles there.
1. First, get the URL of every news article. The code is as follows:
def getNewsUrl(newsurl):
    # requests and BeautifulSoup are imported in the full script (new.py) below
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.select('.list')[0].select('li')      # every news entry on this list page
    allnewsList = []
    for news in items:
        NewUrl = news.select('.img')[0].select('a')[1].attrs['href']   # link to the article page
        newsList = getListPage(NewUrl)
        print(newsList)
        allnewsList.append(newsList)
    return allnewsList
2. Using the URLs obtained above, get the details of each news article. The code is as follows:
def getListPage(NewUrl):
    res = requests.get(NewUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    article = soup.select('.article-template')[0]
    newList = []
    new = {}
    new['题目'] = article.select('h1')[0].text.strip()            # title
    new['作者'] = article.select('a')[0].text                      # author
    new['时间'] = article.select('.time')[0].text.strip()          # publication time
    new['导语'] = article.select('.article-lead')[0].text.strip()  # lead paragraph
    paragraphs = article.select('p')
    newList.append(new)
    # write the author's name to the txt file once for every body paragraph
    for i in range(0, len(paragraphs) - 2):
        writeNewsContent(new['作者'])
    return newList
3. Get the total number of pages. The code is as follows:
def getPageN():
    res = requests.get('https://www.leiphone.com/category/sponsor')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # the fifth link in the pagination bar shows the number of the last page
    n = int(soup.select('.pages')[0].select('a')[4].text)
    return n

n = getPageN()
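getPageN() relies on the last-page number always being the fifth link in the pagination bar. A slightly more defensive variant, sketched under the assumption that the pagination block is still selected by '.pages', would take the largest numeric label among all page links:

import requests
from bs4 import BeautifulSoup

def getPageNSafe():
    res = requests.get('https://www.leiphone.com/category/sponsor')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # collect every purely numeric page link and keep the largest one
    labels = [a.text.strip() for a in soup.select('.pages a')]
    nums = [int(t) for t in labels if t.isdigit()]
    return max(nums) if nums else 1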
4. Get the information of all the news articles:
allnewList = []
for i in range(1, n + 1):
    newsurl = 'https://www.leiphone.com/category/sponsor/page/{}'.format(i)
    print(newsurl)
    newsList = getNewsUrl(newsurl)
    allnewList.append(newsList)
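new.py imports pandas and openpyxl but never actually writes the collected article details anywhere. A minimal sketch of how the allnewList built above could be saved to an Excel file (the file name news.xlsx is my own choice):

import pandas

# each page entry holds a list of articles, and each article is a one-element list of dicts
rows = [d for page in allnewList for article in page for d in article]
df = pandas.DataFrame(rows)
df.to_excel('news.xlsx', index=False)   # writing .xlsx requires openpyxl to be installed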
5. Write the author of every article into a txt file:
def writeNewsContent(title):
    f = open('gzccNews.txt', 'a', encoding='utf-8')
    f.write(title)
    f.close()
6. Run statistics on the contents of the txt file:
import jieba

f = open('gzccNews.txt', 'r', encoding='utf-8')
text = f.read()
f.close()

# register the author names so jieba keeps them as single tokens
authors = ['归〇', '新智造', '刘芳平', '吕倩', '木子', '李诗', 'Jennings_Zhu', '包永刚',
           '王金许', '李赓', 'Dude', '温晓桦', '李雨晨', '思颖', '李智勇', '咲甜',
           '陈伊莉', '彭赛琼', 'camel', '赵青晖', 'Alter', '聊IT', '大公司日报',
           '又田', '跃斌', '奕欣', '张驰']
for name in authors:
    jieba.add_word(name)

punctuation = ''',。‘’“”:;()!?、 '''
# single characters and name fragments that should not be counted on their own
a = {' ', '子', '张', '秀琴', '李秀', '归', '〇', '亚', '金', '峰', '亮', '恒', '赓',
     '程', '弢', '木子李', '三', '大'}
for i in punctuation:
    text = text.replace(i, '')

tempwords = list(jieba.cut(text))
count = {}
words = list(set(tempwords) - a)
for i in range(0, len(words)):
    count[words[i]] = text.count(str(words[i]))

countList = list(count.items())
countList.sort(key=lambda x: x[1], reverse=True)
print(countList)

# save the 20 most frequent names
f = open('b.txt', 'a', encoding='utf-8')
for i in range(20):
    f.write(countList[i][0] + ':' + str(countList[i][1]) + ' ')
f.close()
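One thing to note about this approach: text.count() counts substrings, so a short name that appears inside a longer one gets over-counted, which is why the fragment set a is needed. A sketch of an alternative that counts whole jieba tokens instead (assuming the author names are registered with jieba.add_word() exactly as in step 6):

from collections import Counter
import jieba

# register the same author names as in step 6 so they stay whole tokens
for name in authors:          # 'authors' is the list defined above
    jieba.add_word(name)

with open('gzccNews.txt', 'r', encoding='utf-8') as f:
    text = f.read()

ignore = set(''',。‘’“”:;()!?、 ''')   # punctuation and spaces to skip
tokens = [w for w in jieba.cut(text) if w.strip() and w not in ignore]
for name, freq in Counter(tokens).most_common(20):
    print(name, freq)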
Problems encountered and solutions
1. Scraping the data itself was not very difficult; the only snag was that announcement-style items from the site sometimes appeared in the news list. The fix was to collect all the matching elements into a list and then remove the elements containing those announcements, as sketched below.
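A minimal sketch of that filtering idea (the assumption here, which may not match the real page structure, is that announcement items lack the second <a> tag inside the '.img' block that normal articles have):

from bs4 import BeautifulSoup

def filterNewsItems(soup):
    # keep only the <li> entries that look like real articles;
    # announcement-style items (assumed to lack the second link) are dropped
    items = soup.select('.list')[0].select('li')
    return [li for li in items if len(li.select('.img a')) >= 2]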
2. I was not very clear on how to install the word-cloud library, and the material online was all over the place, so for this assignment I stuck with using jieba to count the 20 authors who published the most articles; a sketch of how the word cloud itself could be generated is given below.
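A minimal sketch of generating the word cloud from the frequency counts with the wordcloud package, assuming it is installed (pip install wordcloud) and that a Chinese-capable font file such as simhei.ttf is available:

from wordcloud import WordCloud

# 'count' is the {name: frequency} dict built in step 6 above
wc = WordCloud(font_path='simhei.ttf',   # a Chinese font is needed to render the names
               width=800, height=600, background_color='white')
wc.generate_from_frequencies(count)
wc.to_file('authors_wordcloud.png')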
Conclusion:
Analysis of the results shows that 木子 published the most articles, so we can conclude that this author probably has quite a few distinctive views on the industry, and reading his articles is a good way to get to know those views.
All the scraped data, the crawler, and the data-analysis source code:
new.py
import requests
import re
from datetime import datetime
from bs4 import BeautifulSoup
import openpyxl
import pandas


# append one author name to the txt file
def writeNewsContent(title):
    f = open('gzccNews.txt', 'a', encoding='utf-8')
    f.write(title)
    f.close()


# get the details of a single article
def getListPage(NewUrl):
    res = requests.get(NewUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    article = soup.select('.article-template')[0]
    newList = []
    new = {}
    new['题目'] = article.select('h1')[0].text.strip()
    new['作者'] = article.select('a')[0].text
    new['时间'] = article.select('.time')[0].text.strip()
    new['导语'] = article.select('.article-lead')[0].text.strip()
    paragraphs = article.select('p')
    newList.append(new)
    # write the author's name once for every body paragraph
    for i in range(0, len(paragraphs) - 2):
        writeNewsContent(new['作者'])
    return newList


# get the article URLs on one list page
def getNewsUrl(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.select('.list')[0].select('li')
    allnewsList = []
    for news in items:
        NewUrl = news.select('.img')[0].select('a')[1].attrs['href']
        newsList = getListPage(NewUrl)
        print(newsList)
        allnewsList.append(newsList)
    return allnewsList


# get the total number of pages
def getPageN():
    res = requests.get('https://www.leiphone.com/category/sponsor')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    n = int(soup.select('.pages')[0].select('a')[4].text)
    return n


n = getPageN()
allnewList = []
for i in range(1, n + 1):
    newsurl = 'https://www.leiphone.com/category/sponsor/page/{}'.format(i)
    print(newsurl)
    newsList = getNewsUrl(newsurl)
    allnewList.append(newsList)
wordcount.py (the analysis script; it cannot be named jieba.py, or import jieba would load the script itself instead of the library)
import jieba

# read the author names written out by the crawler
f = open('gzccNews.txt', 'r', encoding='utf-8')
text = f.read()
f.close()

# register the author names so jieba keeps them as single tokens
authors = ['归〇', '新智造', '刘芳平', '吕倩', '木子', '李诗', 'Jennings_Zhu', '包永刚',
           '王金许', '李赓', 'Dude', '温晓桦', '李雨晨', '思颖', '李智勇', '咲甜',
           '陈伊莉', '彭赛琼', 'camel', '赵青晖', 'Alter', '聊IT', '大公司日报',
           '又田', '跃斌', '奕欣', '张驰']
for name in authors:
    jieba.add_word(name)

punctuation = ''',。‘’“”:;()!?、 '''
# single characters and name fragments that should not be counted on their own
a = {' ', '子', '张', '秀琴', '李秀', '归', '〇', '亚', '金', '峰', '亮', '恒', '赓',
     '程', '弢', '木子李', '三', '大'}
for i in punctuation:
    text = text.replace(i, '')

tempwords = list(jieba.cut(text))
count = {}
words = list(set(tempwords) - a)
for i in range(0, len(words)):
    count[words[i]] = text.count(str(words[i]))

countList = list(count.items())
countList.sort(key=lambda x: x[1], reverse=True)
print(countList)

# save the 20 most frequent names
f = open('b.txt', 'a', encoding='utf-8')
for i in range(20):
    f.write(countList[i][0] + ':' + str(countList[i][1]) + ' ')
f.close()