import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas
import sqlite3


def getclick(url):
    # Use a regex match on the article URL to look up its click count dynamically.
    newsid = re.search('_(.*).html', url).group(1).split('/')[1]
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    # The counter API returns a JS snippet ending in "...html('NNN');";
    # strip the wrapper to keep only the number.
    click = int(requests.get(clickurl).text.split('.')[-1].lstrip("html('").rstrip("');"))
    return click


def getdetail(url):
    # Fetch one article page and collect its metadata into a dict.
    res = requests.get(url)
    res.encoding = 'utf-8'
    soupd = BeautifulSoup(res.text, 'html.parser')
    news = {}
    news['url'] = url
    news['title'] = soupd.select('.show-title')[0].text
    info = soupd.select('.show-info')[0].text
    news['dt'] = datetime.strptime(info.lstrip('发表时间:')[0:19], '%Y-%m-%d %H:%M:%S')
    news['source'] = re.search('来源:(.*)点击', info).group(1).strip()
    # news['content'] = soupd.select('.show-content')[0].text.strip()
    news['click'] = getclick(url)
    return news


def onepage(pageurl):
    # Fetch one list page and return the details of every article linked on it.
    res = requests.get(pageurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsls = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsls.append(getdetail(news.select('a')[0]['href']))
    return newsls


newstotal = []
gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

# Entry point: read the index page to find out how many items and pages there are.
res = requests.get(gzccurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

n = int(soup.select('.a1')[0].text.rstrip('条'))
pages = n // 10 + 1  # number of pages, at 10 items per page

# Crawls only page 2 here; use range(2, pages + 1) to crawl every list page.
for i in range(2, 3):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newstotal.extend(onepage(listurl))
# print(len(newstotal))

df = pandas.DataFrame(newstotal)
# print(df.head())
# print(df['title'])

df.to_excel('gzccnews.xlsx')
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df.to_sql('gzccnewsdb', con=db)
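
# A quick, self-contained sanity check of the id extraction used in getclick().
# The sample URL below is hypothetical, assuming article URLs on news.gzcc.cn
# follow the "..._MMDD/ID.html" pattern that the regex above relies on.
sample_url = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'
print(re.search('_(.*).html', sample_url).group(1).split('/')[1])  # expected: '9183'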