import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import jieba
def getNewsDetail(newsurl):
    """Fetch one news page, print its details, and save the body text.

    Parameters:
        newsurl: URL of the news detail page to scrape.

    Side effects: performs an HTTP GET, prints the parsed fields to stdout,
    and writes the article body to C:python/pachong.txt.
    """
    resd = requests.get(newsurl)
    resd.encoding = 'utf-8'  # force UTF-8 so Chinese text decodes correctly
    soupd = BeautifulSoup(resd.text, 'html.parser')

    # NOTE(review): these selectors assume the gdufe.edu.cn page layout —
    # .like holds "<count> ...", h1 the title, .pdate the publish time.
    click = soupd.select('.like')[0].text.split(" ")[0]
    title = soupd.select('h1')[0].text

    # BUG FIX: str.lstrip takes a *character set*, not a prefix, so
    # lstrip('发布时间:') could also eat leading date digits/colons.
    # Remove the literal label prefix explicitly instead.
    info = soupd.select('.pdate')[0].text
    label = '发布时间:'
    if info.startswith(label):
        info = info[len(label):]
    dt = datetime.strptime(info[0:19], '%Y-%m-%d %H:%M:%S')

    author = soupd.select('p')[0].text.split(" ")[1].strip('<p>')

    # The first <p> (author line) is repeated at the start of .maintext;
    # remove it as a *prefix* (same lstrip-vs-prefix bug fixed here too).
    delcontent = soupd.select('p')[0].text
    newscontent = soupd.select('.maintext')[0].text
    if newscontent.startswith(delcontent):
        newscontent = newscontent[len(delcontent):]

    keyWords = getKeyWords(newscontent)

    print(dt)
    print(title)
    print(click)
    print(author)
    print(newscontent)
    print(keyWords)

    # Use a context manager so the file is closed even if write() raises.
    with open("C:python/pachong.txt", 'w', encoding='utf8') as f:
        f.write(newscontent)
def getKeyWords(newscontent):
    """Return up to 20 keywords from Chinese text, most frequent first.

    Parameters:
        newscontent: raw article text; non-Chinese characters are discarded.

    Returns:
        list[str]: at most 20 words (length >= 2), ordered by how often each
        occurs as a substring of the cleaned text, descending.
    """
    # BUG FIX: the original pattern '[u4e00-u9fa5]' lacked backslashes, so it
    # matched literal ASCII characters (including the range 0-u) instead of
    # the CJK Unified Ideographs block \u4e00-\u9fa5.
    newscontent = ''.join(re.findall(r'[\u4e00-\u9fa5]', newscontent))

    # jieba.lcut is the public segmentation API (jieba._lcut is private).
    wordSet = set(jieba.lcut(newscontent))

    # Count substring occurrences of each word, skipping single characters.
    wordDict = {}
    for word in wordSet:
        if len(word) >= 2:
            wordDict[word] = newscontent.count(word)

    dictList = sorted(wordDict.items(), key=lambda item: item[1], reverse=True)

    # BUG FIX: slicing never raises IndexError, unlike indexing range(20)
    # into a list that may hold fewer than 20 entries.
    return [word for word, _count in dictList[:20]]
# Guard the script entry point so importing this module does not trigger
# a network request; the scrape runs only when executed directly.
if __name__ == "__main__":
    newsurl = "http://news.gdufe.edu.cn/11499"
    getNewsDetail(newsurl)