1. Save the body text of each news article to a text file.
    # Write the article body to a file, then close it
    f = open('123.txt', 'w', encoding='utf-8')
    f.write(content)
    f.close()
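Equivalently, a with statement closes the file automatically even if the write fails; a minimal sketch, assuming content already holds the article text:

    # Same effect as above, but the file is closed automatically
    with open('123.txt', 'w', encoding='utf-8') as f:
        f.write(content)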
2. Structure the news data as a list of dictionaries:
- The details of a single news item --> a dictionary news
- All news items on one list page --> a list: newsls.append(news)
- All news items across every list page --> a combined list: newstotal.extend(newsls) (see the sketch below)
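The intended nesting, as a minimal sketch (the field names and values are placeholders for illustration):

    news = {'title': '...', 'source': '...', 'clicktimes': 0}  # one article
    newsls = []
    newsls.append(news)        # every article on one list page
    newstotal = []
    newstotal.extend(newsls)   # every article across all list pages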
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import pandas

    # Get the click count of one article
    def getNewsId(url):
        # Extract the news ID from the article URL with a regular expression
        newsId = re.findall(r'\_(.*).html', url)[0][-4:]
        # Build the request URL that returns the click count
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
        clickRes = requests.get(clickUrl)
        # Pull the click count out of the JavaScript in the response
        clickCount = int(re.search(r"hits'\)\.html\('(.*)'\);", clickRes.text).group(1))
        return clickCount

    # Read the details of one article into a dictionary
    def getNewDetail(newsurl):
        resDescript = requests.get(newsurl)
        resDescript.encoding = 'utf-8'
        soupDescript = BeautifulSoup(resDescript.text, 'html.parser')
        title = soupDescript.select('.show-title')[0].text
        info = soupDescript.select('.show-info')[0].text
        news = {}
        if info.find('作者') > 0:
            author = re.search(r'作者:((.{2,20}\s|.{2,20}、|.{2,20},){1,5})', info).group(1)
        else:
            author = 'none'
        if info.find('审核') > 0:
            right = re.search(r'审核:((.{2,20}\s|.{2,20}、|.{2,20},){1,5})', info).group(1)
        else:
            right = 'none'
        if info.find('来源') > 0:
            source = re.search(r'来源:((.{2,50}\s|.{2,50}、|.{2,50},){1,5})', info).group(1)
        else:
            source = 'none'
        if info.find('摄影') > 0:
            video = re.search(r'摄影:((.{2,50}\s|.{2,50}、|.{2,50},){1,5})', info).group(1)
        else:
            video = 'none'
        dt = datetime.strptime(info.lstrip('发布时间:')[0:19], '%Y-%m-%d %H:%M:%S')
        # Key names here match the columns used by the analysis in step 5
        news['title'] = title
        news['dt'] = dt
        news['author'] = author
        news['right'] = right
        news['source'] = source
        news['video'] = video
        news['content'] = soupDescript.select('.show-content')[0].text.strip()
        news['clicktimes'] = getNewsId(newsurl)
        news['newsurl'] = newsurl
        return news

    # Parse one list page and collect the details of every article on it
    def getListPage(listPageUrl):
        res1 = requests.get(listPageUrl)
        res1.encoding = 'utf-8'
        soup = BeautifulSoup(res1.text, 'html.parser')
        newsList = []
        for news in soup.select('li'):
            if len(news.select('.news-list-title')) > 0:
                a = news.select('a')[0].attrs['href']
                newsList.append(getNewDetail(a))
        return newsList

    resn = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    resn.encoding = 'utf-8'
    soupn = BeautifulSoup(resn.text, 'html.parser')

    # Total number of news items
    listcount = int(soupn.select('.a1')[0].text.rstrip('条'))
    print(listcount)
    # Total number of list pages (10 items per page)
    n = listcount // 10 + 1

    newsTotal = []
    # First page
    firstPage = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    newsTotal.extend(getListPage(firstPage))
    # Last page
    for i in range(n, n + 1):
        pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        newsTotal.extend(getListPage(pageUrl))

    for news in newsTotal:
        print(news)

    df = pandas.DataFrame(newsTotal)
    df.to_excel('gzccnews.xlsx')
3. Install pandas and create a DataFrame object df with pandas.DataFrame(newstotal).
    df = pandas.DataFrame(newsTotal)
4. Use df to save the extracted data to a CSV or Excel file.
    df.to_excel('gzccnews.xlsx')
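The CSV path mentioned in step 4 works the same way. A minimal sketch; note that to_excel itself needs an Excel engine such as openpyxl installed, and the utf_8_sig encoding is an assumption that helps Excel display Chinese text:

    # CSV alternative; utf_8_sig adds a BOM so Excel renders Chinese correctly
    df.to_csv('gzccnews.csv', encoding='utf_8_sig')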
5. Use the functions and methods provided by pandas to analyse the data:
- Extract the first 6 rows of the click count, title, and source columns:

      print(df[['clicktimes', 'title', 'source']].head(6))
- Extract news published by '学校综合办' whose click count exceeds 3000:

      print(df[(df['clicktimes'] > 3000) & (df['source'] == '学校综合办')])
- Extract news published by '国际学院' and '学生工作处':

      soulist = ['国际学院', '学生工作处']
      print(df[df['source'].isin(soulist)])
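A natural follow-up, shown only as a sketch and not part of the original assignment, is to rank such a selection by click count:

    # Hypothetical extra step: sort the selected news by click count, highest first
    print(df[df['source'].isin(soulist)].sort_values('clicktimes', ascending=False))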