1. Choose a topic of personal interest (no two students may pick the same one).
2. Write a crawler in Python to scrape data on that topic from the web.
3. Run text analysis on the scraped data and generate a word cloud.
4. Explain and interpret the results of the text analysis.
5. Write a complete blog post describing the implementation, the problems encountered and their solutions, and the data-analysis approach and conclusions.
6. Finally, submit all of the scraped data plus the crawler and analysis source code.
The topic chosen here is to crawl the authors of all news articles in the industry (业界) section of Leiphone (雷锋网) and find out which author has published the most articles there.
1. First, get the URL of every article on a listing page (requests and BeautifulSoup are imported in the full source listing at the end). The code is as follows:
def getNewsUrl(newsurl):
    # Fetch one listing page and collect the details of every article on it
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.select('.list')[0].select('li')
    allnewsList = []
    for news in items:
        # The second <a> inside the .img block links to the article detail page
        NewUrl = news.select('.img')[0].select('a')[1].attrs['href']
        newsList = getListPage(NewUrl)
        print(newsList)
        allnewsList.append(newsList)
    return allnewsList
2. Using the URLs obtained above, fetch the details of each article. The code is as follows:
def getListPage(NewUrl):
    # Fetch one article page and extract its title, author, time and lead paragraph
    res = requests.get(NewUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    article = soup.select('.article-template')[0]
    newList = []
    new = {}
    new['题目'] = article.select('h1')[0].text.strip()             # title
    new['作者'] = article.select('a')[0].text                       # author
    new['时间'] = article.select('.time')[0].text.strip()           # publication time
    new['导语'] = article.select('.article-lead')[0].text.strip()   # lead paragraph
    newList.append(new)
    # Body paragraphs (collected here but not used for the author statistics)
    paragraphs = article.select('p')
    for p in paragraphs[:len(paragraphs) - 2]:
        content = p.text
    # Write the author name to the text file for the later frequency count
    title = article.select('a')[0].text
    writeNewsContent(title)
    return newList
3. Get the total number of listing pages.
def getPageN():
    # Read the pagination bar on the first listing page to get the total page count
    res = requests.get('https://www.leiphone.com/category/sponsor')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    n = int(soup.select('.pages')[0].select('a')[4].text)
    return n

n = getPageN()
4. Collect the information of all articles, page by page.
allnewList = []
for i in range(1, n + 1):
    newsurl = 'https://www.leiphone.com/category/sponsor/page/{}'.format(i)
    print(newsurl)
    newsList = getNewsUrl(newsurl)
    allnewList.append(newsList)
5. Write the author of every article to a txt file.
def writeNewsContent(title):
    # Append one author name to the text file
    f = open('gzccNews.txt', 'a', encoding='utf-8')
    f.write(title)
    f.close()
6. Run the word-frequency statistics on the txt file.
import jieba
f = open('gzccNews.txt', 'r', encoding='utf-8')
text = f.read()
f.close()
# Register the author names as custom words so jieba does not split them apart
jieba.add_word('归〇')
jieba.add_word('新智造')
jieba.add_word('刘芳平')
jieba.add_word('吕倩')
jieba.add_word('木子')
jieba.add_word('李诗')
jieba.add_word('Jennings_Zhu')
jieba.add_word('包永刚')
jieba.add_word('王金许')
jieba.add_word('李赓')
jieba.add_word('Dude')
jieba.add_word('温晓桦')
jieba.add_word('李雨晨')
jieba.add_word('思颖')
jieba.add_word('李智勇')
jieba.add_word('咲甜')
jieba.add_word('陈伊莉')
jieba.add_word('彭赛琼')
jieba.add_word('camel')
jieba.add_word('赵青晖')
jieba.add_word('Alter')
jieba.add_word('聊IT')
jieba.add_word('大公司日报')
jieba.add_word('又田')
jieba.add_word('跃斌')
jieba.add_word('奕欣')
jieba.add_word('张驰')
punctuation = ''',。‘’“”:;()!?、 '''
# Single characters and fragments that show up as noise after segmentation
a = {'\n', '子', '张', '秀琴', '李秀', '归', '〇', '亚', '金', '峰', '亮', '恒', '赓', '程', '弢', '木子李', '三', '大'}
# Strip punctuation, segment the text, and drop the noise fragments
for i in punctuation:
    text = text.replace(i, '')
tempwords = list(jieba.cut(text))
count = {}
words = list(set(tempwords) - a)
for w in words:
    # Count how often each remaining word occurs in the raw text
    count[w] = text.count(str(w))
countList = list(count.items())
countList.sort(key=lambda x: x[1], reverse=True)
print(countList)
# Write the 20 most frequent words (i.e. authors) to b.txt
f = open('b.txt', 'a', encoding='utf-8')
for i in range(20):
    f.write(countList[i][0] + ':' + str(countList[i][1]) + '\n')
f.close()
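As a side note, text.count() counts substrings, so a short name that is contained in a longer one (for example 木子 inside 木子李) gets over-counted. A minimal alternative sketch, under the assumption that writeNewsContent were changed to write one author name per line, could count whole names directly with collections.Counter:

# Alternative counting sketch (assumption: the crawler writes one author name per line,
# i.e. f.write(title + '\n') instead of f.write(title))
from collections import Counter

with open('gzccNews.txt', 'r', encoding='utf-8') as f:
    authors = [line.strip() for line in f if line.strip()]

author_counts = Counter(authors)
for name, cnt in author_counts.most_common(20):
    print(name, cnt)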
Problems encountered and solutions
1. Crawling the data was not particularly difficult; the only snag was that the listing pages sometimes contained site notices mixed in with the article items. The fix was to gather all items of the same kind into a list and then drop the elements that were notices (a possible filtering sketch is shown after this list).
2. Because I was not sure how to install the word cloud library, and the guides found online were all over the place, in this assignment I again used jieba to pick out the 20 authors with the most published articles instead of drawing a word cloud (a possible word-cloud sketch is also shown after this list).
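One possible way to do the notice filtering (a sketch only, reusing the same selectors as getNewsUrl; the function name getNewsUrlFiltered is made up here) is to skip any list item that lacks the .img block or the second link an article item is expected to have:

# Sketch of a listing-page parser that skips site notices
def getNewsUrlFiltered(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    allnewsList = []
    for news in soup.select('.list')[0].select('li'):
        img = news.select('.img')
        links = img[0].select('a') if img else []
        if len(links) < 2:
            continue  # no .img block or no second link: treat the item as a notice and skip it
        allnewsList.append(getListPage(links[1].attrs['href']))
    return allnewsList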
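For completeness, if the wordcloud package were installed, the author frequencies computed above could be rendered as a word cloud roughly as follows (a minimal sketch; the font path simhei.ttf is an assumption and must point to a font that contains Chinese glyphs):

# Minimal word-cloud sketch (assumptions: "pip install wordcloud" has been run,
# and simhei.ttf is a font file with Chinese glyphs available locally)
from wordcloud import WordCloud

wc = WordCloud(font_path='simhei.ttf', background_color='white', width=800, height=600)
wc.generate_from_frequencies(dict(countList[:20]))  # countList comes from the analysis above
wc.to_file('authors_wordcloud.png')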
Conclusion:
Analysing the statistics shows that 木子 published the most articles. From this we can conclude that this author probably has many distinctive insights into the industry, so reading his articles is a good way to get to know those insights.
All of the scraped data, the crawler, and the data-analysis source code:
new.py
import requests
import re
from datetime import datetime
from bs4 import BeautifulSoup
import openpyxl
import pandas

# print(NewUrl)
# getNewsUrl('https://www.leiphone.com/category/sponsor/page/1')

# Append one author name to the text file
def writeNewsContent(title):
    f = open('gzccNews.txt', 'a', encoding='utf-8')
    f.write(title)
    f.close()

# Get the details of one article
def getListPage(NewUrl):
    res = requests.get(NewUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    article = soup.select('.article-template')[0]
    newList = []
    new = {}
    new['题目'] = article.select('h1')[0].text.strip()             # title
    new['作者'] = article.select('a')[0].text                       # author
    new['时间'] = article.select('.time')[0].text.strip()           # publication time
    new['导语'] = article.select('.article-lead')[0].text.strip()   # lead paragraph
    newList.append(new)
    # Body paragraphs (collected here but not used for the author statistics)
    paragraphs = article.select('p')
    for p in paragraphs[:len(paragraphs) - 2]:
        content = p.text
    # Write the author name to the text file for the later frequency count
    title = article.select('a')[0].text
    writeNewsContent(title)
    return newList

# Get the detail-page URLs on one listing page
def getNewsUrl(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.select('.list')[0].select('li')
    allnewsList = []
    for news in items:
        NewUrl = news.select('.img')[0].select('a')[1].attrs['href']
        newsList = getListPage(NewUrl)
        print(newsList)
        allnewsList.append(newsList)
    return allnewsList

# Get the total number of listing pages
def getPageN():
    res = requests.get('https://www.leiphone.com/category/sponsor')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    n = int(soup.select('.pages')[0].select('a')[4].text)
    return n

n = getPageN()
allnewList = []
for i in range(1, n + 1):
    newsurl = 'https://www.leiphone.com/category/sponsor/page/{}'.format(i)
    print(newsurl)
    newsList = getNewsUrl(newsurl)
    allnewList.append(newsList)
jieba.py (note: a script saved literally as jieba.py will shadow the jieba library when it runs import jieba, so it should be saved under a different file name before running)
import jieba
f = open('gzccNews.txt', 'r', encoding='utf-8')
text = f.read()
f.close()
# Register the author names as custom words so jieba does not split them apart
jieba.add_word('归〇')
jieba.add_word('新智造')
jieba.add_word('刘芳平')
jieba.add_word('吕倩')
jieba.add_word('木子')
jieba.add_word('李诗')
jieba.add_word('Jennings_Zhu')
jieba.add_word('包永刚')
jieba.add_word('王金许')
jieba.add_word('李赓')
jieba.add_word('Dude')
jieba.add_word('温晓桦')
jieba.add_word('李雨晨')
jieba.add_word('思颖')
jieba.add_word('李智勇')
jieba.add_word('咲甜')
jieba.add_word('陈伊莉')
jieba.add_word('彭赛琼')
jieba.add_word('camel')
jieba.add_word('赵青晖')
jieba.add_word('Alter')
jieba.add_word('聊IT')
jieba.add_word('大公司日报')
jieba.add_word('又田')
jieba.add_word('跃斌')
jieba.add_word('奕欣')
jieba.add_word('张驰')
punctuation = ''',。‘’“”:;()!?、 '''
# Single characters and fragments that show up as noise after segmentation
a = {'\n', '子', '张', '秀琴', '李秀', '归', '〇', '亚', '金', '峰', '亮', '恒', '赓', '程', '弢', '木子李', '三', '大'}
# Strip punctuation, segment the text, and drop the noise fragments
for i in punctuation:
    text = text.replace(i, '')
tempwords = list(jieba.cut(text))
count = {}
words = list(set(tempwords) - a)
for w in words:
    # Count how often each remaining word occurs in the raw text
    count[w] = text.count(str(w))
countList = list(count.items())
countList.sort(key=lambda x: x[1], reverse=True)
print(countList)
# Write the 20 most frequent words (i.e. authors) to b.txt
f = open('b.txt', 'a', encoding='utf-8')
for i in range(20):
    f.write(countList[i][0] + ':' + str(countList[i][1]) + '\n')
f.close()