zoukankan      html  css  js  c++  java
  • 获取全部校园新闻

    1.取出一个新闻列表页的全部新闻 包装成函数。

    2.获取总的新闻篇数,算出新闻总页数。

    3.获取全部新闻列表页的全部新闻详情。

    4.找一个自己感兴趣的主题,进行数据爬取,并进行分词分析。不能与其它同学雷同。

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re

    # Landing page of the campus-news list; fetched once to learn the total
    # number of news items from the pager widget.
    newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    res = requests.get(newsurl)  # response object
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Total news count, taken from the pager's first <a> text.
    # BUG FIX: the original called .split('') which raises
    # "ValueError: empty separator" — the separator character was lost when the
    # page was scraped. Extract the first run of digits instead.
    pager_text = soup.select('#pages')[0].a.text
    count = int(re.search(r'\d+', pager_text).group())
    # Ten items per list page -> number of full pages.
    # NOTE(review): the name `sum` shadows the builtin but is kept because the
    # loop further down reads it.
    sum = int(count / 10)
    
    def select_pager_news(current_url):
        """Fetch one news-list page and print details of every item on it.

        current_url: URL of a list page like .../xiaoyuanxinwen/N.html.
        Side effects: issues HTTP requests (one per item for the detail page)
        and prints to stdout; returns None.
        """
        res = requests.get(current_url)  # response object
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        for news in soup.select('li'):
            # Only <li> elements carrying a news title are real items.
            if len(news.select('.news-list-title')) > 0:
                title = news.select('.news-list-title')[0].text
                descript = news.select('.news-list-description')[0].text
                a = news.a.attrs['href']
                p = news.span.text
                # The list page shows the publish date as YYYY-MM-DD.
                pub_time = datetime.strptime(p, '%Y-%m-%d')
                print("发布时间:", pub_time)
                print("标题:", title)
                print("描述:", descript)
                # Follow the item link for author / source / photographer / body.
                resd = requests.get(a)
                resd.encoding = 'utf-8'
                soupd = BeautifulSoup(resd.text, 'html.parser')
                info = soupd.select('.show-info')[0].text
                # BUG FIX: str.lstrip() removes a *set* of characters, not a
                # prefix, so it could also eat leading characters of the value
                # itself. Slice off the literal label instead.
                author = info[info.find('作者:'):].split()[0][len('作者:'):]
                source = info[info.find('来源:'):].split()[0][len('来源:'):]
                photo = info[info.find('摄影:'):].split()[0][len('摄影:'):]
                content = soupd.select('#content')[0].text
                print("作者:", author)
                print("来源:", source)
                print("摄影:", photo)
                print("正文:", content)
    
    # NOTE(review): range(sum+1, sum+2) yields exactly one value (sum+1), so
    # only a single list page — presumably the last partial page — is fetched.
    # Confirm whether crawling *all* pages (range(2, sum+2)) was intended.
    for i in range(sum+1,sum+2):
        select_pager_news('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i))
    import requests
    import re
    import jieba
    from bs4 import BeautifulSoup
    def GetPageContent(ContentUrl):
        """Download a cnblogs post page and return its body text."""
        response = requests.get(ContentUrl)
        response.encoding = 'utf-8'
        parsed = BeautifulSoup(response.text, 'html.parser')
        return parsed.select('#cnblogs_post_body')[0].text
    
    def GetPageInfo(PageUrl):
        """Fetch a cnblogs index page and return the body of its FIRST post.

        PageUrl: URL of a cnblogs list/index page.
        Returns: the first post's content via GetPageContent, or None when the
        page contains no .post_item elements (the loop never runs).
        """
        Request=requests.get(PageUrl)
        Request.encoding='utf-8'
        Soup = BeautifulSoup(Request.text, 'html.parser')
        for post_item in Soup.select('.post_item'):
            # NOTE(review): len() of the first .titlelnk Tag counts its
            # children, so this check is effectively "title link has content";
            # it raises IndexError if .titlelnk is absent from the item.
            if len(post_item.select('.titlelnk')[0])>0:
                # Post title (currently unused — only the content is returned).
                item_title=post_item.select('.titlelnk')[0].text
                # First <a> inside the item; its href is the post's URL.
                item_url=post_item.select('a')[0].attrs['href']
                # Returns inside the loop: only the FIRST post is processed.
                return GetPageContent(item_url)
                #print(item_title)
    
    def PrintWordsCount(Text, Top):
        """Tokenise Text with jieba and print the Top most frequent words.

        Text: raw text to analyse.
        Top: number of (word, count) pairs to print, most frequent first.
        Side effects: prints to stdout; returns None.
        Raises IndexError if fewer than Top distinct words remain after cleanup.
        """
        # Stop words stripped before counting.
        # BUG FIX: the original literal was broken across two lines with no
        # continuation (a syntax error); rejoined with implicit concatenation
        # inside parentheses.
        miss_word = ("了|他|说|我|你|就|着|又|的|在|是|有|把|到|也|不|都|她|这|便|去|们|还|但|一个|和|却|里|来|要|没|很"
                     "|那|么|一|将|呢|起|于|上|只|得|而|而且|对|所以|见|些|才|从|过|被|并|时|且|给|道|虽然|可以|出")
        # Strip whitespace, punctuation and symbols in one pass.
        # BUG FIX: the original pattern contained an unescaped double quote
        # that terminated the string literal; use a single-quoted raw string.
        Text = re.sub(r'[\s+.!/_",$%^*+—()?【】“《;》”!-:,。?、~@#¥%……&*();{}=]+', "", Text)
        Text = re.sub(miss_word + '+', "", Text)
        words = list(jieba.cut(Text))
        # Word -> frequency over the tokenised text.
        key_words = {}
        for w in set(words):
            key_words[w] = words.count(w)
        # Sort by frequency, highest first, and print the top entries.
        sort_word = sorted(key_words.items(), key=lambda d: d[1], reverse=True)
        for j in range(Top):
            print(sort_word[j])
    
    # Accumulate post bodies from seven cnblogs index pages, then print the
    # ten most frequent words.
    count = ''
    url = 'https://www.cnblogs.com/'
    request = requests.get(url)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text, 'html.parser')
    # Total page count from the pager.
    # NOTE(review): computed but never used below — confirm if the crawl was
    # meant to run to `page` instead of the hard-coded 8.
    page = int(soup.select('.pager')[0].select('a')[-8].text)

    for i in range(1, 8):
        # NOTE(review): '#p{}' is a URL fragment — servers ignore it, so every
        # iteration presumably fetches the same front page; verify the real
        # pagination endpoint.
        text = GetPageInfo('https://www.cnblogs.com/#p{}'.format(i))
        # BUG FIX: GetPageInfo returns None for a page with no posts; skip it
        # instead of raising TypeError on str + None.
        if text:
            count += text
    PrintWordsCount(count, 10)

    结果如下:

     

  • 相关阅读:
    xgqfrms™, xgqfrms® : xgqfrms's official website of GitHub!
    xgqfrms™, xgqfrms® : xgqfrms's official website of GitHub!
    年轻人如何创业?有什么好的建议?
    青年创新创业大赛北京赛区总决赛举行
    90后瘫痪女孩靠创业改变人生
    怎么利用淘宝赚钱?具体方法有哪些?
    淘宝赚钱的方法有哪些?做淘宝要注意哪些?
    现在做淘宝赚钱吗?要注意哪些?
    2019年开淘宝店赚钱吗?需要注意什么?
    淘宝赚钱软件有哪些?具体怎么赚钱?
  • 原文地址:https://www.cnblogs.com/1103a/p/8798116.html
Copyright © 2011-2022 走看看