1. Wrap the extraction of all news items from a single news list page into a function.
2. Get the total number of news articles and compute the total number of list pages (see the sketch after this list).
3. Fetch the full details of every news item on every list page.
4. Pick a topic of personal interest, crawl its data, and run a word-segmentation analysis; the topic must not duplicate another student's.
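A quick sketch of the page-count arithmetic from step 2, assuming the pager text starts with the total item count (e.g. "980条") and each list page shows 10 news items:

import math

pager_text = '980条'                     # hypothetical pager text
count = int(pager_text.split('条')[0])   # total number of news items -> 980
total_pages = math.ceil(count / 10)      # 10 items per page -> 98 list pages

The first script implements tasks 1 and 2 in full and demonstrates task 3 on the last list page; a loop over all pages follows the script.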
import math
import requests
from bs4 import BeautifulSoup
from datetime import datetime

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(newsurl)   # returns a Response object
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Total number of news items, parsed from the pager text (e.g. "980条..."),
# then the total number of list pages at 10 items per page.
count = int(soup.select('#pages')[0].a.text.split('条')[0])
total_pages = math.ceil(count / 10)

def select_pager_news(current_url):
    """Print every news item on one list page, following each link to its detail page."""
    res = requests.get(current_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text
            descript = news.select('.news-list-description')[0].text
            a = news.a.attrs['href']   # link to the detail page
            p = news.span.text         # publication date
            time = datetime.strptime(p, '%Y-%m-%d')
            print("Published:", time)
            print("Title:", title)
            print("Description:", descript)
            # Fetch the detail page and pull the fields out of the info line.
            resd = requests.get(a)
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            info = soupd.select('.show-info')[0].text
            # Slice off the Chinese labels (作者 author, 来源 source, 摄影 photographer);
            # lstrip() strips a character set rather than a prefix, so it is wrong here.
            author = info[info.find('作者:'):].split()[0][len('作者:'):]
            source = info[info.find('来源:'):].split()[0][len('来源:'):]
            photo = info[info.find('摄影:'):].split()[0][len('摄影:'):]
            content = soupd.select('#content')[0].text
            print("Author:", author)
            print("Source:", source)
            print("Photographer:", photo)
            print("Body:", content)

# Crawl only the last list page as a demonstration.
select_pager_news('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(total_pages))
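To cover task 3 completely, loop over every list page instead. A sketch, assuming page 1 lives at index.html and the remaining pages follow the N.html pattern used above:

select_pager_news('http://news.gzcc.cn/html/xiaoyuanxinwen/index.html')   # page 1
for i in range(2, total_pages + 1):
    select_pager_news('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i))

For task 4 I chose the cnblogs front page: the second script collects the first post body from each of the first seven list pages, then segments the text with jieba and prints the ten most frequent words.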
import re
import jieba
import requests
from bs4 import BeautifulSoup

def GetPageContent(ContentUrl):
    """Return the body text of one cnblogs post."""
    res = requests.get(ContentUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup.select('#cnblogs_post_body')[0].text

def GetPageInfo(PageUrl):
    """Return the body text of the first post on one cnblogs list page."""
    res = requests.get(PageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for post_item in soup.select('.post_item'):
        if len(post_item.select('.titlelnk')) > 0:   # check the list, not its first element
            item_title = post_item.select('.titlelnk')[0].text   # post title (unused)
            item_url = post_item.select('a')[0].attrs['href']    # post URL
            return GetPageContent(item_url)
    return ''   # no post found on this page

def PrintWordsCount(Text, Top):
    # Stop words to drop before counting.
    miss_word = "了|他|说|我|你|就|着|又|的|在|是|有|把|到|也|不|都|她|这|便|去|们|还|但|一个|和|却|里|来|要|没|很|那|么|一|将|呢|起|于|上|只|得|而|而且|对|所以|见|些|才|从|过|被|并|时|且|给|道|虽然|可以|出"
    # Strip whitespace plus ASCII and full-width punctuation.
    Text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“《;》”!\-:,。?、~@#¥%……&*();{}=]+", "", Text)
    Text = re.sub(miss_word, "", Text)
    words = list(jieba.cut(Text))
    key_words = {}
    for w in set(words):   # count each word's frequency
        key_words[w] = words.count(w)
    sort_word = sorted(key_words.items(), key=lambda d: d[1], reverse=True)   # sort by frequency
    for j in range(Top):   # print the Top most frequent words
        print(sort_word[j])

all_text = ''
url = 'https://www.cnblogs.com/'
res = requests.get(url)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
page = int(soup.select('.pager')[0].select('a')[-8].text)   # total page count (parsed but not used below)
for i in range(1, 8):
    # A '#p{}' fragment is resolved client-side and always returns page 1;
    # cnblogs serves paginated list pages at /sitehome/p/<n>.
    all_text += GetPageInfo('https://www.cnblogs.com/sitehome/p/{}'.format(i))
PrintWordsCount(all_text, 10)
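The dict-plus-words.count loop above recounts the whole list once per distinct word, which is quadratic. A sketch of an equivalent counter built on collections.Counter; dropping one-character tokens is an assumption standing in for the hand-written stop-word list:

from collections import Counter
import jieba

def print_top_words(text, top=10):
    # keep only tokens longer than one character (assumed stop-word filter)
    words = [w for w in jieba.cut(text) if len(w) > 1]
    for word, freq in Counter(words).most_common(top):
        print(word, freq)

Called as print_top_words(all_text, 10), it prints the same kind of (word, frequency) ranking.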
The results are as follows: