import requests
from bs4 import BeautifulSoup
import re


def get_soup(url):
    """Fetch a page and return it as a parsed BeautifulSoup document."""
    req = requests.get(url)
    req.encoding = 'utf-8'
    return BeautifulSoup(req.text, 'html.parser')


def getDownNum(url):
    """Fetch an article's live click count from the site's counter API."""
    # The article id is the trailing number of the article URL,
    # e.g. .../html/2018/xiaoyuanxinwen_1005/10253.html -> 10253.
    html_id = re.search(r'/(\d+)\.html', url).group(1)
    down_url = 'http://oa.gzcc.cn/api.php?op=count&id=' + html_id + '&modelid=80'
    reqd = requests.get(down_url)
    # The endpoint answers with a jQuery snippet such as
    #   $('#hits').html('1234');
    # so capture the number inside .html('...').
    return re.search(r"\$\('#hits'\)\.html\('(\d+)'\);", reqd.text).group(1)


def getNewInfo(pageurl):
    """Print the title, link, metadata, and body of every news item on a list page."""
    soup = get_soup(pageurl)
    for item in soup.select('li'):
        # Only <li> elements containing a .news-list-text block are news entries.
        if len(item.select('.news-list-text')) > 0:
            title = item.select('.news-list-text')[0].select('.news-list-title')[0].text
            link = item.a.attrs['href']
            con_soup = get_soup(link)
            content = con_soup.select('#content')[0].text
            # The .show-info metadata line separates its fields with pairs of
            # non-breaking spaces (\xa0\xa0).
            info = con_soup.select('.show-info')[0].text.split('\xa0\xa0')
            down_num = getDownNum(link)
            print('Title: ' + title)
            print('Link: ' + link)
            for j in range(len(info)):
                if len(info[j]) > 0 and info[j] != ' ':
                    if j != len(info) - 1:
                        print(info[j])
                    else:
                        # The last field is the click count; strip the stale
                        # on-page number ('次' is the count's unit suffix) and
                        # print the live one from the counter API instead.
                        print(info[j].rstrip('次'), down_num, '次')
            print(content)


def getPageNum(url):
    """Derive the number of list pages from the total item count (10 per page)."""
    newsoup = get_soup(url)
    # The .a1 element reads like '1234条'; '条' is the item-count suffix.
    return int(newsoup.select('.a1')[0].text.rstrip('条')) // 10


n = getPageNum('http://news.gzcc.cn/html/xiaoyuanxinwen/')
for i in range(0, n + 2):
    if i == 0:
        # The first list page has no page number in its URL.
        getNewInfo('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    else:
        getNewInfo('http://news.gzcc.cn/html/xiaoyuanxinwen/' + str(i) + '.html')
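
# --- Sanity check for the counter-response parsing above ---
# A minimal sketch, assuming the counter endpoint answers with a jQuery
# snippet of exactly this shape; the sample body below is made up, so adjust
# the regex in getDownNum() if the live response differs.
_sample_response = "$('#hits').html('1234');"
assert re.search(r"\$\('#hits'\)\.html\('(\d+)'\);", _sample_response).group(1) == '1234'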