zoukankan      html  css  js  c++  java
  • 课程作业——获取全部校园新闻

    作业要求:

    1. 取出一个新闻列表页的全部新闻 包装成函数。
    2. 获取总的新闻篇数,算出新闻总页数。
    3. 获取全部新闻列表页的全部新闻详情。
    4. 找一个自己感兴趣的主题,进行数据爬取,并进行分词分析。不能与其它同学雷同。

    前三个要求代码如下:

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    
    # 设置 locale 是为了解决处理包含中文的时间格式(%Y年%m月%d日)时的报错:
    # UnicodeEncodeError: 'locale' codec can't encode character '\u5e74'
    # import locale
    # locale.setlocale(locale.LC_CTYPE, 'chinese')
    
    
    def crawlOnePageSchoolNews(page_url):
        """Print every news entry of one list page, then each entry's details.

        The page at ``page_url`` is downloaded, decoded as UTF-8 and parsed
        with BeautifulSoup.  For each ``.news-list > li`` item the link,
        title, description and the first two ``.news-list-info`` spans
        (printed under the time and source labels) are written to stdout,
        after which getNewDetail() is called on the item's link.
        """
        response = requests.get(page_url)
        response.encoding = 'UTF-8'
        page = BeautifulSoup(response.text, 'html.parser')
        for item in page.select('.news-list > li'):
            anchor = item.a
            print('**' * 5 + '列表页信息' + '**' * 10)
            link = anchor.attrs['href']
            print('新闻链接:' + link)
            print('新闻标题:' + item.select('.news-list-title')[0].text)
            print('新闻描述:' + anchor.select('.news-list-description')[0].text)
            # Both the time and the source live in the same span list.
            info_spans = anchor.select('.news-list-info > span')
            print('新闻时间:' + info_spans[0].text)
            print('新闻来源:' + info_spans[1].text)
            getNewDetail(link)
    
    def getNewDetail(href):
        """Fetch one news detail page and print its metadata and body text.

        The ``.show-info`` block of the page is split into fields; every
        field that mentions one of the known labels (source, release time,
        clicks, author, reviewer, photographer) is printed in page order.
        The release time is parsed into a ``datetime`` and the click count
        is delegated to getClickCount().  Finally the ``#content`` text and
        a separator line are printed.

        Args:
            href: URL of the news detail page.

        Returns:
            None.  Returns early (after printing the header) when the page
            has no ``.show-info`` block.
        """
        print('**' * 5 + '详情页信息' + '**' * 10)
        res1 = requests.get(href)
        res1.encoding = 'UTF-8'
        soup1 = BeautifulSoup(res1.text, 'html.parser')
        show_info = soup1.select('.show-info')
        if not show_info:  # older pages may not have a .show-info block
            return
        info_list = ['来源', '发布时间', '点击', '作者', '审核', '摄影']  # labels to extract
        # The fields are separated by &nbsp;, which the parser yields as
        # '\xa0'.  dict.fromkeys() removes duplicates like the original set
        # did, but keeps the fields in page order so output is deterministic.
        fields = dict.fromkeys(
            part.strip() for part in show_info[0].text.split('\xa0'))
        for n_i in fields:
            if not n_i:
                continue
            # The original note says the time field uses an ASCII colon, so
            # the other fields presumably use the full-width one; accept
            # either and split on the first separator only.
            parts = re.split('[::]', n_i, maxsplit=1)
            value = parts[1] if len(parts) > 1 else ''
            for info_flag in info_list:
                if info_flag not in n_i:
                    continue
                if info_flag == '发布时间':
                    # Parse to datetime so it can be stored in a database
                    # later; fall back to the raw text on an unknown format.
                    try:
                        release_time = datetime.strptime(
                            value.strip(), '%Y-%m-%d %H:%M:%S')
                    except ValueError:
                        release_time = value
                    print(info_flag + ':', release_time)
                elif info_flag == '点击':
                    # The click count is written by JavaScript from a
                    # separate endpoint, so it is fetched on its own.
                    getClickCount(href)
                else:
                    print(info_flag + ':' + value)
        content = soup1.select('#content')
        if content:  # guard against pages without an article body
            print(content[0].text)
        print('————' * 40)
    
    
    def getClickCount(news_url):
        """Fetch and print the click count of the article at ``news_url``.

        The article id is taken from the last path component of the URL
        (``.../xxx_yyyy/<id>.html``) and substituted into the counter
        endpoint ``http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80``.
        The endpoint answers with a jQuery snippet such as::

            $('#todaydowns').html('5');...;$('#hits').html('399');

        from which the ``#hits`` value is extracted and printed.

        Args:
            news_url: URL of the news detail page.

        Returns:
            None.  Prints ``点击:未知`` instead of raising when either the
            article id or the hit count cannot be found.
        """
        click_num_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
        # Raw string with an escaped dot so only a literal ".html" matches.
        id_match = re.search(r'_(.*)/(.*)\.html', news_url)
        if id_match is None:
            print('点击:未知')
            return
        res2 = requests.get(click_num_url.format(id_match.group(2)))
        res2.encoding = 'UTF-8'
        # "$", "(", ")" and "." are regex metacharacters and must be escaped
        # to match the literal jQuery call; \d* captures the number.
        hits_match = re.search(r"\$\('#hits'\)\.html\('(\d*)'\)", res2.text)
        if hits_match is None:
            print('点击:未知')
            return
        print('点击:' + hits_match.group(1))
    
    
    # Crawl the first list page, which has no page number in its URL.
    crawlOnePageSchoolNews('http://news.gzcc.cn/html/xiaoyuanxinwen/')

    # Read the total number of news items from the pager element on the
    # first page and derive the number of list pages (10 items per page).
    pageURL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # \d+ (raw string) captures the digits in front of "条"; a bare "d*"
    # would only match the letter "d".
    newsSum = int(re.search(r'(\d+)条', soup.select('a.a1')[0].text).group(1))
    # Integer ceiling division: a partially filled last page still counts.
    pageSum = (newsSum + 9) // 10

    # Pages 2..pageSum follow the numbered URL pattern.
    for i in range(2, pageSum + 1):
        crawlOnePageSchoolNews(pageURL.format(i))
    
    

    结果截图:

    第四个要求中,我爬取了校园所有的新闻描述,分析大概学校这些年干了些什么,在哪干,强调些什么,统计出词云。
    主要代码如下:

    import requests
    from bs4 import BeautifulSoup
    import re
    import jieba
    
    editors = []
    descriptions = ''
    
    
    def crawlOnePageSchoolNews(page_url):
        """Collect the descriptions and sources of one news list page.

        Every ``.news-list > li`` item of the page at ``page_url`` has its
        description and source printed.  The description is then appended
        (space-separated) to the module-level ``descriptions`` string, and
        the part of the source text before its first space is appended to
        the module-level ``editors`` list.
        """
        global descriptions
        response = requests.get(page_url)
        response.encoding = 'UTF-8'
        page = BeautifulSoup(response.text, 'html.parser')
        for item in page.select('.news-list > li'):
            anchor = item.a
            description_text = anchor.select('.news-list-description')[0].text
            print('新闻描述:' + description_text)
            source_text = anchor.select('.news-list-info > span')[1].text
            print('新闻来源:' + source_text)
            # Mutate the globals only after both values were read and printed.
            descriptions = descriptions + ' ' + description_text
            editors.append(source_text.split(' ')[0])
    
    
    # Crawl the first list page, which has no page number in its URL.
    crawlOnePageSchoolNews('http://news.gzcc.cn/html/xiaoyuanxinwen/')

    # Read the total number of news items from the pager element on the
    # first page and derive the number of list pages (10 items per page).
    pageURL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # \d+ (raw string) captures the digits in front of "条"; a bare "d*"
    # would only match the letter "d".
    newsSum = int(re.search(r'(\d+)条', soup.select('a.a1')[0].text).group(1))
    # Integer ceiling division: a partially filled last page still counts.
    pageSum = (newsSum + 9) // 10

    # Pages 2..pageSum follow the numbered URL pattern.
    for i in range(2, pageSum + 1):
        crawlOnePageSchoolNews(pageURL.format(i))
    
    # Replace every punctuation character (the first character of each line
    # of punctuation.txt) with a space so it does not end up in a token.
    with open('punctuation.txt', 'r', encoding='UTF-8') as punctuationFile:
        for punctuation in punctuationFile:
            descriptions = descriptions.replace(punctuation[0], ' ')

    # Load the stop words, one per line; a single space is also a stop word.
    with open('meaningless.txt', 'r', encoding='UTF-8') as meaninglessFile:
        mLessSet = set(meaninglessFile.read().split('\n'))
    mLessSet.add(' ')

    # Load the reserved words and register them with jieba so they are kept
    # as whole tokens during segmentation.
    with open('reservedWord.txt', 'r', encoding='UTF-8') as reservedWordFile:
        reservedWordSet = set(reservedWordFile.read().split('\n'))
        for reservedWord in reservedWordSet:
            if reservedWord:  # skip the empty entry left by a trailing newline
                jieba.add_word(reservedWord)


    keywordList = list(jieba.cut(descriptions))
    keywordSet = set(keywordList) - mLessSet  # drop the stop words

    # Build the frequency dictionary in a single pass over the token list
    # instead of calling list.count() once per distinct word.
    keywordDict = dict.fromkeys(keywordSet, 0)
    for word in keywordList:
        if word in keywordDict:
            keywordDict[word] += 1

    # Sort by frequency, most frequent first.
    keywordListSorted = list(keywordDict.items())
    keywordListSorted.sort(key=lambda e: e[1], reverse=True)
    # Write every word once per occurrence to word.txt as input for a word
    # cloud tool.  The file is opened once (still in append mode) rather
    # than once per word.
    with open('word.txt', 'a+', encoding='UTF-8') as wordFile:
        for topWordTup in keywordListSorted:
            print(topWordTup)
            wordFile.write((topWordTup[0] + '\n') * topWordTup[1])
    
    
    

    经过以上处理之后将结果通过 https://wordsift.org/ 生成词云如下:

    有些保留字没有处理好,所以词云中有些是无意义词,请选择性忽略。
    上面用到的文件已上传到这里。

  • 相关阅读:
    索引压缩
    拼写校正
    词典(词汇表)
    Text Relatives II
    Text Relatives
    CoreText
    Quartz2D Text
    PDF Document Creation, Viewing
    Core Graphics Layer Drawing
    Bitmap Images and Image Masks
  • 原文地址:https://www.cnblogs.com/lger/p/8795312.html
Copyright © 2011-2022 走看看