1. Extract all the news items from a single news list page, wrapped in a function.
2. Get the total number of news articles and compute the total number of list pages.
3. Fetch the full details of every news item on every list page.
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

# Get the click count of one article from the site's counter API.
def getclick(link):
    newId = re.search(r'_(.*).html', link).group(1).split('/')[1]
    click = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newId))
    # The API returns a JSONP-style wrapper; strip it down to the bare number.
    return click.text.split('.html')[-1].lstrip("('").rstrip("');")

# Fetch and print the full details of one news article.
def getnewsdetail(link):
    resd = requests.get(link)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('.show-content')[0].text
    info = soupd.select('.show-info')[0].text
    clickcount = getclick(link)
    time = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    if '作者' in info:
        author = re.search(r'作者:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', info).group(1)
    else:
        author = 'none'
    if '审核' in info:
        auditing = re.search(r'审核:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', info).group(1)
    else:
        auditing = 'none'
    if '来源:' in info:
        source = re.search(r'来源:(.*?)\s*[摄点]', info).group(1)
    else:
        source = 'none'
    dateTime = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    print('Published: {0}  Author: {1}  Audited by: {2}  Source: {3}  Clicks: {4}'.format(dateTime, author, auditing, source, clickcount))
    print(content)

# Task 1: extract all the news items from one list page.
def getlistpage(listlink):
    res = requests.get(listlink)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text
            description = news.select('.news-list-description')[0].text
            link = news.a.attrs['href']
            print('Title: {0}  Description: {1}  Link: {2}'.format(title, description, link))
            getnewsdetail(link)

# Tasks 2 and 3: count the articles, compute the page count, crawl every page.
listlink = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
getlistpage(listlink)
res = requests.get(listlink)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
total = int(soup.select('.a1')[0].text.rstrip('条'))
listCount = (total + 9) // 10  # ceil division: 10 news items per list page
for i in range(2, listCount + 1):
    listlink = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getlistpage(listlink)
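Task 2's page arithmetic is easy to get wrong: the naive `total // 10 + 1` over-counts by one page whenever the total is an exact multiple of 10, which is why the script uses the ceil-division form. A minimal standalone sketch of that idiom (the per-page constant of 10 is an assumption about the GZCC list layout, not something the API reports):

# page_count is a hypothetical helper illustrating the math used above.
PER_PAGE = 10  # assumption: the list page shows 10 items

def page_count(total, per_page=PER_PAGE):
    # Ceil division without floats: rounds up only when there is a remainder.
    return (total + per_page - 1) // per_page

assert page_count(100) == 10  # exact multiple: 10 pages, not 11
assert page_count(101) == 11  # one extra item spills onto page 11
assert page_count(9) == 1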
4. Pick a topic of your own interest, crawl its data, and run word-segmentation analysis on it. The topic must not duplicate another student's.
# -*- coding: UTF-8 -*-
# -*- author: yjw -*-
import requests
import jieba
from bs4 import BeautifulSoup

# Stop words (function words and punctuation) excluded from the frequency count.
delword = ['我', '你', '他', '她', '我们', '你们', '他们', '我的', '你的', '他的', '了', '的', '是', '在',
           '那', '又', '但', '中', '这', '也', '都', '而', '呀', '和', '啊', '说', '去', '与', '不', '着',
           '儿', '到', '就', '有', '上', '便', '只', '要', '小', '罢', '那里', '一个', '人', '把', '被',
           '道', '好', '还', '呢', '来', '得', '才', '们', '-', ' ', ' ',
           ',', '。', '?', '!', ':', ';', '、', '“', '”', '‘', '’', '(', ')',
           ',', '.', '?', '!', '(', ')', '【', '】', '…']

# Fetch one article page, segment its text with jieba, print the top 20 words.
def getnewdetail(link):
    res = requests.get(link)
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    paragraphs = soup.select('.text')
    content = ''
    for p in paragraphs:
        content += p.text + ' '
    if len(paragraphs) > 0:
        print(content + ' Word frequency:')
        word = {}
        newscontent = list(jieba.cut(content))
        wordfit = set(newscontent) - set(delword)
        for w in wordfit:
            word[w] = newscontent.count(w)
        text = sorted(word.items(), key=lambda x: x[1], reverse=True)
        for item in text[:20]:
            print(item)
    else:
        print('picture')

# Extract every news item from one list page.
def getnewlist(link):
    res = requests.get(link)
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    for newsList in soup.select('.listInfo')[0].select('li'):
        title = newsList.select('a')[0].text
        time = newsList.select('.info')[0].select('p')
        link = newsList.select('a')[0]['href']
        print('Title: {0}  Time: {1}  Link: {2}'.format(title, time, link))
        getnewdetail(link)

# Crawl page 1 and the "_2" .. "_19" continuation pages of the article.
link = 'http://sports.qq.com/a/20180411/020544.htm'
for i in range(1, 20):
    if i > 1:
        link = 'http://sports.qq.com/a/20180411/020544_{}.htm'.format(i)
    getnewlist(link)
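The frequency logic above (segment with jieba, subtract stop words, count, sort) is exactly what `collections.Counter` does in one step. A minimal standalone sketch of the same technique, using a hard-coded sample sentence so it runs without network access (the sample text and the small stop-word set are illustrative only, not the crawled data):

import jieba
from collections import Counter

STOPWORDS = set('的了是在和与不也都这那,。:')  # illustrative stop-word set

def word_freq(text, topn=20):
    # Segment with jieba, drop stop words and blank tokens, count with Counter.
    words = [w for w in jieba.cut(text) if w.strip() and w not in STOPWORDS]
    return Counter(words).most_common(topn)

sample = '腾讯体育的新闻正文在这里:对新闻内容分词,再统计词频。'
for w, n in word_freq(sample):
    print(w, n)

Compared with the manual dict-plus-sort version, `Counter.most_common` handles the sorting and the top-N cutoff in one call, and cannot raise an IndexError when fewer than 20 distinct words remain.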