1. Extract every news item from one news list page, wrapped up as a function.
2. Get the total number of news items and compute the total number of list pages (see the sketch after this list).
3. Fetch the full details of every news item on every list page.
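Step 2 boils down to ceiling division: any remainder after dividing by the page size still needs one more page. A minimal sketch, assuming 10 items per list page as in the script below (total_pages is an illustrative helper, not part of the assignment code):

def total_pages(newsnum, per_page=10):
    # ceiling division: a partial final page still counts as a full page
    return (newsnum + per_page - 1) // per_page

# e.g. total_pages(237) == 24 and total_pages(240) == 24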
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

def getNewDetail(pageUrl):
    res = requests.get(pageUrl)  # returns a Response object
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            t = news.select('.news-list-title')[0].text    # title
            a = news.select('a')[0].attrs['href']          # link to the detail page
            description = news.select('.news-list-description')[0].text
            resd = requests.get(a)                         # fetch the detail page once
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            content = soupd.select('#content')[0].text
            info = soupd.select('.show-info')[0].text
            # lstrip strips a character set here; it works because the string
            # starts with '发布时间:' followed immediately by the timestamp
            d = info.lstrip('发布时间:')[:19]
            dt = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
            author = info[info.find('作者:'):].split()[0].lstrip('作者:')
            source = info[info.find('来源:'):].split()[0].lstrip('来源:')
            photo = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
            print("新闻标题:", t)
            print("链接:", a)
            print("发布时间:", dt)
            print("作者:", author)
            print("来源:", source)
            print("摄影:", photo)
            print("描述:", description)
            getClickCount(a)
            print("正文:", content)

def getClickCount(a):
    newsid = re.search(r'_(.*)\.html', a).group(1)[-4:]
    clickcounturl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    clickcount = int(requests.get(clickcounturl).text.split(".html(")[-1].lstrip("'").rstrip("');"))
    print('点击次数:', clickcount)

def getpagelist(path):
    res = requests.get(path)  # returns a Response object
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsnum = int(soup.select('.a1')[0].text.rstrip('条'))  # total number of news items
    if newsnum % 10 == 0:
        totalpage = newsnum // 10
    else:
        totalpage = newsnum // 10 + 1  # total number of pages (10 items per page)
    getNewDetail(path)  # the first list page is the index page itself
    for i in range(2, totalpage + 1):  # pages 2..totalpage are 2.html, 3.html, ...
        pageUrl = path + '{}.html'.format(i)
        getNewDetail(pageUrl)

getpagelist(newsurl)
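The click count never appears in the article HTML itself; getClickCount pulls it from the separate counter endpoint on oa.gzcc.cn and digs the number out with a split/lstrip/rstrip chain, which misparses silently if the payload shifts. A hedged alternative sketch, assuming the endpoint returns a jQuery snippet whose last statement looks like .html('<count>'); (which is exactly what the split(".html(")[-1] logic implies; parse_click_count is a hypothetical helper, not part of the original script):

import re
import requests

def parse_click_count(jsonp_text):
    # Assumes the counter endpoint responds with jQuery calls such as
    #   $('#hits').html('5678');
    # and that the last .html('...') holds the click count, mirroring
    # the script's split(".html(")[-1] behaviour.
    nums = re.findall(r"\.html\('(\d+)'\)", jsonp_text)
    return int(nums[-1]) if nums else 0

# e.g. parse_click_count(requests.get(clickcounturl).text)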
4. Pick a topic you are interested in, scrape its data, and run a word-segmentation analysis on it. The topic must not duplicate another student's.
# Scrape news articles from Huanqiu Tech (tech.huanqiu.com)
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import jieba

newsurl = 'http://tech.huanqiu.com/internet/'

def sort(text):
    # replace punctuation with spaces before segmentation
    punct = '''一!“”,。?;’"',.、: '''
    for s in punct:
        text = text.replace(s, ' ')
    wordlist = list(jieba.cut(text))
    # '\u3000' (ideographic space) and '\xa0' (non-breaking space) need the
    # escape syntax; the bare strings 'u3000'/'xa0' would never match a token
    exclude = {'这', '\u3000', ' ', '\xa0', '的', '_', '将', '在', '是', '了',
               '一', '还', '也', '《', '》', '(', ')'}
    set2 = set(wordlist) - exclude
    counts = {}
    for key in set2:
        counts[key] = wordlist.count(key)
    countlist = list(counts.items())
    countlist.sort(key=lambda x: x[1], reverse=True)
    print("top5关键词:")
    for i in range(5):
        print(countlist[i])

def getContent(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'html.parser')
    for news in soup2.select('.l_a'):
        if len(news.select('.author')) > 0:
            author = news.select('.author')[0].text
            print("作者", author)
    # drop the injected ad-script call; str.replace removes the exact
    # substring, where rstrip would strip a character set instead
    content = soup2.select('.la_con')[0].text.replace('AD_SURVEY_Add_AdPos("7000531");', '')
    print("正文:", content)
    sort(content)

def getNewDetails(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.item'):
        # print(news)
        title = news.select('a')[0].attrs['title']
        a = news.select('a')[0].attrs['href']
        brief = news.select('h5')[0].text.rstrip('[详细]')
        time = news.select('h6')[0].text
        dt = datetime.strptime(time, '%Y-%m-%d %H:%M')
        print("新闻标题:", title)
        print("链接:", a)
        print("内容简介:", brief)
        print("时间:", dt)
        getContent(a)
        print(' ')
        # break

res = requests.get(newsurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')  # kept for the pagination block below
getNewDetails(newsurl)

# Pagination over every list page, left commented out:
# for total in soup.select('#pages'):
#     all = int(total.select('a')[0].text.rstrip('条'))  # total item count, to compute page count
#     # print(all)
#     if all % 60 == 0:
#         totalpages = all // 60
#     else:
#         totalpages = all // 60 + 1
#     print(totalpages)
#     for i in range(1, totalpages + 1):  # news on every list page
#         PageUrl = newsurl + '{}.html'.format(i)
#         getNewDetails(PageUrl)
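One design note on sort(): building the tally with wordlist.count(key) for every distinct word rescans the whole list each time, which is quadratic in the text length. collections.Counter does the same job in a single pass. A sketch using the same jieba segmentation and the same idea of an exclude set (top_keywords is an illustrative helper, not part of the script above):

from collections import Counter
import jieba

def top_keywords(text, exclude, n=5):
    # one-pass tally; Counter.most_common replaces the manual sort
    words = [w for w in jieba.cut(text) if w.strip() and w not in exclude]
    return Counter(words).most_common(n)

# e.g. top_keywords(content, exclude) returns the five most frequent (word, count) pairs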