import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime


# Get the click count for a news article (the count API is keyed by the id taken from the URL)
def getNewsId(newsUrl):
    # The article id is the last four characters captured from the "_....html" part of the URL
    newsId = re.findall(r'\_(.*).html', newsUrl)[0][-4:]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    clickRes = requests.get(clickUrl)
    # Use a regular expression to get the news click count
    clickCount = int(re.search(r"hits'\).html\('(.*)'\);", clickRes.text).group(1))
    return clickCount
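
# A minimal standalone sketch of the click-count extraction above, using a
# made-up response body (the real payload format of the oa.gzcc.cn count API
# is an assumption inferred from the regex in getNewsId()). The helper name is
# hypothetical and it is never called; it is here for illustration only.
def _demoClickCountRegex():
    sample = "$('#hits').html('817');"  # hypothetical response text
    # Same escaped pattern as in getNewsId(): capture the digits inside .html('...')
    return int(re.search(r"hits'\).html\('(.*)'\);", sample).group(1))  # -> 817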

# Get the details of a single news article
def getNewsDetail(newsUrl):
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('#content')[0].text
    info = soupd.select('.show-info')[0].text
    # Call getNewsId() to get the click count
    count = getNewsId(newsUrl)
    print(info)
    # Match the timestamp format
    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    # Each of the following fields may list one to three names
    author = re.search(r'作者:((.{3}\s){1,3})', info).group(1)
    check = re.search(r'审核:((.{3}\s){1,3})', info).group(1)
    sources = re.search(r'来源:((.{3}\s){1,3})', info).group(1)
    # Use datetime to convert the time string into a datetime object
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    # Use format() to assemble the output string
    print('发布时间:{0}\n作者:{1}\n审核:{2}\n来源:{3}\n点击次数:{4}'.format(dateTime, author, check, sources, count))
    print(content)

# Fetch the campus news list page
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# print(soup)
d = soup.select('li')
for news in d:
    if len(news.select('.news-list-title')) > 0:
        # Print the title and link of each news item
        t = news.select('.news-list-title')[0].text
        print(t)
        a = news.select('a')[0].attrs
        print(a['href'])
        # Fetch and parse the detail page
        strd = requests.get(a['href'])
        strd.encoding = 'utf-8'
        soupd = BeautifulSoup(strd.text, 'html.parser')
        cont = soupd.select('#content')
        timet = soupd.select('.show-info')
        # Slice the .show-info text at fixed offsets to print each field
        print(timet[0].text[0:25])
        print(timet[0].text[30:38])
        print(timet[0].text[38:45])
        print(timet[0].text[46:56])
        print(timet[0].text[62:])
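
# A minimal alternative sketch, assuming the two helpers above are in scope:
# reuse getNewsDetail() for each article link instead of slicing the .show-info
# text at hard-coded offsets, which breaks whenever a field's length changes.
# crawlWithDetail is a hypothetical helper name and is not called by default.
def crawlWithDetail():
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            href = news.select('a')[0].attrs['href']
            print(news.select('.news-list-title')[0].text, href)
            getNewsDetail(href)  # prints time, author, reviewer, source, clicks and body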