import requests
import re
from bs4 import BeautifulSoup
# First listing page of the campus-news section; its markup is parsed once
# here and reused below for both the item list and the total item count.
url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(url, timeout=10)
# Force UTF-8 instead of relying on requests' encoding detection.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# Every <li> on the page; entries that are not news items are filtered
# out later by sss().
li = soup.select('li')
def get(gzcc):
    """Return the click count for the news article at URL ``gzcc``.

    The article id is the path component between the last ``/`` and the
    ``.html`` suffix of a URL shaped like ``..._<something>/<id>.html``.
    That id is sent to the ``oa.gzcc.cn`` counter endpoint and the number in
    the last ``html('<n>')`` call of the response is returned.

    Args:
        gzcc: URL of a single news article.

    Returns:
        The click count as an ``int``.

    Raises:
        ValueError: if no article id can be extracted from ``gzcc`` or the
            counter response contains no ``html('<n>')`` value.
    """
    # Raw string with an escaped dot so ".html" is matched literally.
    id_match = re.search(r'_.*/(.*)\.html', gzcc)
    if id_match is None:
        raise ValueError('cannot extract news id from URL: {!r}'.format(gzcc))
    dj = id_match.group(1)

    # A timeout keeps one stuck counter request from stalling the whole crawl.
    response = requests.get(
        'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(dj),
        timeout=10,
    )

    # NOTE(review): the response is presumably a run of jQuery-style
    # ``$('#x').html('<n>');`` statements with the hit count last -- confirm
    # against the live endpoint. Matching the digits directly avoids the
    # character-set semantics of str.lstrip()/str.rstrip().
    counts = re.findall(r"html\('(\d+)'\)", response.text)
    if not counts:
        raise ValueError(
            'no click count found in counter response for id {!r}'.format(dj))
    return int(counts[-1])
def sss(label):
    """Print time, title, article URL and click count for each news item.

    Elements without a ``.news-list-title`` child are skipped. For the
    others, the click count is obtained by calling ``get`` with the href of
    the element's first ``<a>``.

    Args:
        label: iterable of parsed elements (the result of
            ``soup.select('li')`` in this script).

    Returns:
        None. One line per news item is written to stdout.
    """
    for news in label:
        titles = news.select('.news-list-title')
        if not titles:
            # Not a news entry (no title node), so nothing to report.
            continue

        title = titles[0].text  # title
        # First child of the info node holds the time. The second child
        # (department) and the '.news-list-description' node (description)
        # are available too but are not part of the printed output.
        info = news.select('.news-list-info')[0]
        published = info.contents[0].text  # time
        url1 = news.select('a')[0]['href']  # article URL
        cs = get(url1)

        # Print the article's own URL, not the module-level listing URL.
        print(published, title, url1, cs)
# Report the items of the first listing page fetched above.
sss(li)

# The '.a1' element text is the total item count followed by '条'.
# Ceiling division avoids requesting one page too many when the total is an
# exact multiple of the page size (10 is the page size this script assumes).
total_items = int(soup.select('.a1')[0].text.rstrip('条'))
pages = (total_items + 9) // 10

# Page 1 was handled above; the remaining pages are numbered from 2.
# ``page_no`` replaces the former loop variable ``list``, which shadowed the
# builtin for the rest of the module.
for page_no in range(2, pages + 1):
    pageurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(page_no)
    # A timeout keeps the crawl from hanging on an unresponsive server.
    pageres = requests.get(pageurl, timeout=10)
    pageres.encoding = 'utf-8'
    pagesoup = BeautifulSoup(pageres.text, 'html.parser')
    pagelist = pagesoup.select('li')
    sss(pagelist)
    # NOTE(review): stops after the first extra page -- presumably a
    # deliberate limit to keep the number of requests small; remove this
    # ``break`` to crawl every page.
    break