  • Fetching all campus news

    1. Extract all the news items from one news list page, wrapped up as a function.

    2. Get the total number of news articles and compute the total number of list pages.

    3. Fetch the full details of every news item on every list page.

    import requests
    import re
    from bs4 import BeautifulSoup
    from datetime import datetime
    
    newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    res = requests.get(newsurl)  # returns a Response object
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    
    
    def getNewDetail(pageUrl):
        res = requests.get(pageUrl)  # returns a Response object
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        for news in soup.select('li'):
            if len(news.select('.news-list-title')) > 0:
                t = news.select('.news-list-title')[0].text  # title
                a = news.select('a')[0].attrs['href']  # link to the detail page
                description = news.select('.news-list-description')[0].text  # summary shown in the list
                resd = requests.get(a)  # fetch the detail page once and reuse it
                resd.encoding = 'utf-8'
                soupd = BeautifulSoup(resd.text, 'html.parser')
                content = soupd.select('#content')[0].text  # article body
                info = soupd.select('.show-info')[0].text  # publish time / author / source line
                d = info.lstrip('发布时间:')[:19]  # strip the label, keep "YYYY-MM-DD HH:MM:SS"
                dt = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
                author = info[info.find('作者:'):].split()[0].lstrip('作者:')
                source = info[info.find('来源:'):].split()[0].lstrip('来源:')
                photo = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
                print("新闻标题:", t)
                print("链接:", a)
                print("发布时间:", dt)
                print("作者:", author)
                print("来源:", source)
                print("摄影:", photo)
                print("描述:", description)
                getClickCount(a)
                print("正文:", content)
    
    
    
    def getClickCount(a):
        newsid = re.search(r"_(.*)\.html", a).group(1)[-4:]  # last 4 characters of the id segment in the URL
        clickcounturl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newsid)
        # the response embeds the count inside something like ".html('2423');", so peel off the wrapper
        clickcount = int(requests.get(clickcounturl).text.split(".html(")[-1].lstrip("'").rstrip("');"))
        print('点击次数:', clickcount)
    
    
    def getpagelist(path):
        res = requests.get(path)  # returns a Response object
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        newsnum = int(soup.select('.a1')[0].text.rstrip('条'))  # total number of news articles
        if newsnum % 10 == 0:
            totalpage = newsnum // 10
        else:
            totalpage = newsnum // 10 + 1  # total number of list pages (10 articles per page)

        getNewDetail(path)  # the first list page has no page number in its URL
        for i in range(2, totalpage + 1):  # the remaining pages are 2.html ... totalpage.html
            pageUrl = path + '{}.html'.format(i)
            getNewDetail(pageUrl)


    getpagelist(newsurl)
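
    The lstrip/split chains above work on the current page markup, but they are easy to break (lstrip, for example, strips a character set rather than a prefix). A slightly more defensive alternative is to pull the fields out with regular expressions. The sketch below is an optional variant, not part of the assignment code; it assumes the .show-info text contains a "YYYY-MM-DD HH:MM:SS" timestamp and that the click-count API response has the ".html('N');" shape the code above already relies on.

    import re
    from datetime import datetime


    def parse_publish_time(info):
        # look for an explicit datetime pattern instead of stripping a prefix character set
        m = re.search(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', info)
        return datetime.strptime(m.group(1), '%Y-%m-%d %H:%M:%S') if m else None


    def parse_click_count(text):
        # the count API wraps the number in something like ".html('2423');"
        m = re.search(r"\.html\('(\d+)'\)", text)
        return int(m.group(1)) if m else 0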
    


    4. Pick a topic you are interested in, crawl its data, and run word-segmentation analysis on it. It must not duplicate any other student's work.

    # Crawl news articles from the Huanqiu Tech site (tech.huanqiu.com)
    
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import jieba
    
    newsurl = 'http://tech.huanqiu.com/internet/'
    
    
    def sort(text):
        # replace common punctuation with spaces before segmentation
        punctuation = '一!“”,。?;’"\',.、:\n'
        for s in punctuation:
            text = text.replace(s, ' ')
        wordlist = list(jieba.cut(text))
        # stop words and whitespace characters to ignore
        exclude = {'这', '\u3000', '\n', '\xa0', '的', '_', ' ', '将', '在', '是', '了', '一', '还', '也', '《', '》', '(', ')'}
        set2 = set(wordlist) - exclude
        counts = {}
        for key in set2:
            counts[key] = wordlist.count(key)
        countlist = list(counts.items())
        countlist.sort(key=lambda x: x[1], reverse=True)
        print("top5关键词:")
        for i in range(min(5, len(countlist))):
            print(countlist[i])
    
    
    def getContent(url):
        res = requests.get(url)
        res.encoding = 'utf-8'
        soup2 = BeautifulSoup(res.text, 'html.parser')
        for news in soup2.select('.l_a'):
            if len(news.select('.author')) > 0:
                author = news.select('.author')[0].text
                print("作者:", author)
        # drop the ad-injection script call appended to the article body
        content = soup2.select('.la_con')[0].text.replace('AD_SURVEY_Add_AdPos("7000531");', '')
        print("正文:", content)
        sort(content)
    
    
    def getNewDetails(newsurl):
        res = requests.get(newsurl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        for news in soup.select('.item'):
            #  print(news)
            title = news.select('a')[0].attrs['title']
            a = news.select('a')[0].attrs['href']
            brief = news.select('h5')[0].text.rstrip('[详细]')  # strip the trailing "[详细]" marker
            time = news.select('h6')[0].text
            dt = datetime.strptime(time, '%Y-%m-%d %H:%M')
            print("新闻标题:", title)
            print("链接:", a)
            print("内容简介:", brief)
            print("时间:", dt)
            getContent(a)
            print('\n')
        # break
    
    
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    getNewDetails(newsurl)
    # for total in soup.select('#pages'):
    #     all = int(total.select('a')[0].text.rstrip('条'))  # total article count, used to compute the page count
    #     # print(all)
    #     if all % 60 == 0:
    #         totalpages = all // 60
    #     else:
    #         totalpages = all // 60 + 1
    #     print(totalpages)
    #     for i in range(1, totalpages + 1):  # crawl the news on every list page
    #         PageUrl = newsurl + '{}.html'.format(i)
    #         getNewDetails(PageUrl)
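
    For the word-frequency step, the manual set/dict counting in sort() can also be written with collections.Counter from the standard library. This is only an alternative sketch, using the same jieba segmentation and the same kind of stop-word set as the code above.

    import jieba
    from collections import Counter


    def top_keywords(text, stopwords, n=5):
        # segment the text, drop stop words and pure-whitespace tokens, then count
        words = [w for w in jieba.cut(text) if w.strip() and w not in stopwords]
        return Counter(words).most_common(n)

    # example: print(top_keywords("...article text...", {'的', '了', '是', '在'}))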
    