zoukankan      html  css  js  c++  java
  • 获取全部校园新闻

    1.取出一个新闻列表页的全部新闻 包装成函数。

    2.获取总的新闻篇数,算出新闻总页数。

    3.获取全部新闻列表页的全部新闻详情。

    4.找一个自己感兴趣的主题,进行数据爬取,并进行分词分析。不能与其它同学雷同。

    1-3:

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    # Base URL of the campus-news list section; numbered list pages are built
    # from it as "<n>.html".
    listPageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
    # Module-level value overwritten by getListPage(..., 0) with the number it
    # parses from the first list page.
    page = 1
    # Browser-like User-Agent passed to the requests.get calls below.
    # The value must be the bare UA string: repeating the "User-Agent:" header
    # name inside the value sends a malformed header.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # Fetch the click (hit) count of one news article
    def getNewsId(url):
        """Return the click count of the news article at ``url`` as an int.

        The article id is the last four characters of the text found between
        an underscore and the trailing ``.html`` of ``url``. That id is sent
        to the ``oa.gzcc.cn`` counter endpoint, and the number is parsed out
        of the ``hits`` snippet in the response text.

        Raises:
            ValueError: if ``url`` contains no ``_....html`` part, or the
                counter response does not contain the expected ``hits``
                snippet.
        """
        idParts = re.findall(r'_(.*)\.html', url)
        if not idParts:
            raise ValueError("cannot extract news id from url: " + url)
        newsId = idParts[0][-4:]
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
        # headers must be passed by keyword: the second positional argument of
        # requests.get is `params`, which would put them in the query string.
        clickRes = requests.get(clickUrl, headers=headers)
        # The literal parentheses and dot of the response snippet must be
        # escaped, otherwise the pattern fails to compile (unbalanced ")").
        found = re.search(r"hits'\)\.html\('(\d+)'\);", clickRes.text)
        if found is None:
            raise ValueError("click count not found in response of " + clickUrl)
        return int(found.group(1))
    # Read the details of one news article
    def getNewDetail(detail,title,description):
        """Download one news article, print its details and return them.

        Args:
            detail: URL of the article page.
            title: article title taken from the list page.
            description: article summary taken from the list page.

        Returns:
            A dict with the keys ``title``, ``description``, ``url``,
            ``content``, ``time`` (a ``datetime`` or ``None``), ``author``,
            ``auditor``, ``source``, ``photographer`` and ``clicks``. A
            metadata field that cannot be found in the page is ``None``.
            Earlier versions returned ``None``; existing callers that ignore
            the result are unaffected.
        """
        # headers must be a keyword argument; positionally it would be `params`.
        resDescript = requests.get(detail, headers=headers)
        resDescript.encoding = "utf-8"
        soupDescript = BeautifulSoup(resDescript.text, 'html.parser')

        content = soupDescript.select(".show-content")[0].text  # article body
        info = soupDescript.select(".show-info")[0].text  # metadata line

        def field(label, nextLabel):
            # Return the text between "<label>:" and "<nextLabel>:", or None.
            # \s also matches the non-breaking spaces (\xa0) that separate the
            # fields. Both ASCII and full-width colons are accepted.
            # NOTE(review): confirm which colon the live page uses.
            found = re.search(label + r"[::](.*?)\s*" + nextLabel + r"[::]", info)
            return found.group(1).strip() if found else None

        print('标题' + ': ' + title)
        print('概要' + ': ' + description)
        print('链接' + ': ' + detail)
        print('正文' + ' :' + content)

        time = field("发布时间", "作者")
        author = field("作者", "审核")
        if author is not None:
            print("作者:" + author)
        right = field("审核", "来源")
        resource = field("来源", "摄影")
        video = field("摄影", "点击")

        count = getNewsId(detail)
        # Only parse the timestamp when it was actually found in the page.
        dateTime = datetime.strptime(time, '%Y-%m-%d %H:%M:%S') if time else None

        return {
            'title': title,
            'description': description,
            'url': detail,
            'content': content,
            'time': dateTime,
            'author': author,
            'auditor': right,
            'source': resource,
            'photographer': video,
            'clicks': count,
        }
    
    
    
    
    # Fetch one news list page and process every news item on it
    def getListPage(listPageUrl,n):
        """Download a news list page and call getNewDetail for each item.

        Args:
            listPageUrl: URL of the list page to download.
            n: page index. When it is 0, the module-level ``page`` is
                updated from the text of the first ``.a1`` element (minus its
                last character) and printed.

        Every ``li`` element that contains a ``.news-list-title`` node is
        treated as a news item. Its title, description and first link are
        passed to ``getNewDetail``.
        """
        global page
        # headers must be a keyword argument; positionally it would be `params`.
        resListPage = requests.get(listPageUrl, headers=headers)
        resListPage.encoding = "utf-8"
        soup = BeautifulSoup(resListPage.text, 'html.parser')
        if n == 0:
            # NOTE(review): the ".a1" text presumably holds the total number
            # of articles followed by a one-character unit, not the number of
            # list pages. Confirm, and divide by the page size if so.
            page = int(soup.select(".a1")[0].text[:-1])
            print(page)
        for s in soup.select("li"):
            titleNodes = s.select(".news-list-title")
            if not titleNodes:
                continue  # not a news entry
            title = titleNodes[0].text  # news title
            description = s.select(".news-list-description")[0].text  # news summary
            detail = s.a.attrs["href"]  # link to the article page
            getNewDetail(detail, title, description)
    
    # Crawl the section's entry page first; with n == 0 this call also stores
    # the parsed number in the module-level `page`.
    getListPage(listPageUrl, 0)
    print(page)
    # Then walk the numbered list pages "<n>.html" for n = 1 .. page - 1.
    for n in range(1, page):
        PageUrl = "{}{}.html".format(listPageUrl, n)
        getListPage(PageUrl, n)

     4(爬慕课网):

    import requests,jieba
    from bs4 import BeautifulSoup
    import re
    # Target site: imooc (www.imooc.com) course list
    url = "https://www.imooc.com/course/list"
    resDescript = requests.get(url)
    resDescript.encoding = "utf-8"
    soupDescript = BeautifulSoup(resDescript.text, 'html.parser')

    # Characters to exclude from the frequency table.
    # NOTE(review): the empty-string entries can never equal a single
    # character; they presumably were whitespace or escape characters lost
    # when this listing was copied. Confirm against the original script.
    delete_word={'使','D','b','e','f','t','C','','o','','a','A','n','S','y','i','',''}

    # Print the selected direction / type / category filters, in page order
    for label, s in zip(("方向:", "类型:", "类别:"), soupDescript.select(".course-nav-item.on")):
        print(label + s.text)

    # Print the active sort options: the first is the popularity ordering,
    # every following one is labelled as difficulty
    for position, q in enumerate(soupDescript.select(".sort-item.active")):
        if position == 0:
            print("受欢迎类型:" + q.text)
        else:
            print("难度:" + q.text)

    # Collect and print the information of every course card
    noteParts = []
    for content in soupDescript.select(".course-card-container"):
        img = content.select(".course-banner.lazy")[0].attrs["src"][2:]  # drop the first two characters of src
        name = content.select(".course-card-name")[0].text
        desc = content.select(".course-card-desc")[0].text
        noteParts.append(desc)
        noteParts.append(name)
        print("图片链接:"+img)
        print("课程名字:"+name)
        print("简介:"+desc)
        print("\n")
    # Join once instead of growing a string inside the loop
    notelist = "".join(noteParts)

    # Count how often each character occurs, in a single pass.
    # NOTE(review): jieba is imported above but not used, so this is a
    # per-character count, not a word count. Confirm that is intended.
    NoteDic = {}
    for ch in notelist:
        NoteDic[ch] = NoteDic.get(ch, 0) + 1

    # Drop the unwanted characters
    for i in delete_word:
        NoteDic.pop(i, None)

    # Sort by count, highest first
    sort_word = sorted(NoteDic.items(), key = lambda d:d[1], reverse = True)
    # Show the result
    print(sort_word)

  • 相关阅读:
    10. Regular Expression Matching
    9. Palindrome Number
    6. ZigZag Conversion
    5. Longest Palindromic Substring
    4. Median of Two Sorted Arrays
    3. Longest Substring Without Repeating Characters
    2. Add Two Numbers
    链式表的按序号查找
    可持久化线段树——区间更新hdu4348
    主席树——树链上第k大spoj COT
  • 原文地址:https://www.cnblogs.com/qazwsx833/p/8796308.html
Copyright © 2011-2022 走看看