  • Web scraping assignment

    1. Pick a topic you are interested in.

    2. Write a crawler in Python that scrapes data related to that topic from the web.

    3. Run text analysis on the scraped data and generate a word cloud.

    4. Interpret and explain the results of the text analysis.

    5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, the data-analysis approach, and the conclusions.

    6. Finally, submit all of the scraped data together with the crawler and analysis source code.

    I scraped the campus news site and the news section of the Duowan LOL portal.

    Problems encountered while doing the assignment:

    Installing the wordcloud package.

    Mainly, I did not know how to export the generated word cloud; a minimal sketch of installing and exporting is shown below.
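
    Installing wordcloud is a normal pip install, and exporting is done with WordCloud.to_file. The snippet below is only a sketch; the sample text, font path and output filename are placeholders, not the ones used later in the scripts.

    # install first: pip install wordcloud jieba matplotlib
    from wordcloud import WordCloud

    sample_text = "RNG UZI Letme Karsa Ming Xiaohu Mlxg"       # assumed sample keywords
    wc = WordCloud(background_color='white',
                   font_path='C:/Windows/Fonts/simhei.ttf')    # a Chinese-capable font path (assumption)
    wc.generate(sample_text)         # build the word cloud from a space-separated string
    wc.to_file('sample_cloud.png')   # export the rendered image to disk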

    Scraping the campus news and generating a word cloud

    
    
    import requests
    import string
    import re
    import jieba
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    from datetime import datetime
    from bs4 import BeautifulSoup

    newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    res = requests.get(newsurl)   # returns a Response object
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    def getKeynews(content):
        content = ''.join(re.findall('[\u4e00-\u9fa5]', content))  # keep only Chinese characters, joined into one punctuation-free string
        newSet = set(jieba.lcut(content))   # segment the content and drop duplicate tokens via a set
        newDict = {}                        # token -> frequency
        for i in newSet:
            newDict[i] = content.count(i)
        deleteList, keynews = [], []
        for i in newDict.keys():
            if len(i) < 2:
                deleteList.append(i)        # collect single-character, meaningless tokens
        for i in deleteList:
            del newDict[i]
        dictList = list(newDict.items())
        dictList.sort(key=lambda item: item[1], reverse=True)  # sort by frequency, descending
        for item in dictList:
            keynews.append(item[0])
        return keynews
    def writeFilekeynews(keywords):
        # written to keyword.txt so the word-cloud step below reads the same file
        f = open('keyword.txt', 'a', encoding='utf-8')
        for word in keywords:
            f.write(" " + word)
        f.close()

    def writeNewsDetail(content):
        f = open('gzccNews.txt', 'a', encoding='utf-8')
        f.write(" " + content)
        f.close()

    def getNewsDetail(newsUrl):
        resd = requests.get(newsUrl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        content = soupd.select('.show-content')[0].text.strip()  # article body
        writeNewsDetail(content)
        keynews = getKeynews(content)
        writeFilekeynews(keynews)
    # def getWordCloud():
    #     keynewsTowordcloud = open('keyword.txt', 'r', encoding='utf-8').read()
    #     print(keynewsTowordcloud)
    #     backgroud_Image = plt.imread('bg.jpg')
    #     wc = WordCloud(background_color='white',       # background colour
    #                    mask=backgroud_Image,            # background image used as the mask
    #                    stopwords=STOPWORDS,
    #                    max_words=80,                    # maximum number of words displayed
    #                    font_path='C:/Windows/Fonts/AdobeKaitiStd-Regular.otf',  # a Chinese font is required, otherwise Chinese cannot be displayed
    #                    max_font_size=80,                # maximum font size
    #                    random_state=30,                 # number of random layouts / colour schemes
    #                    )
    #     wc.generate(keynewsTowordcloud)
    #     image_colors = ImageColorGenerator(backgroud_Image)
    #     wc.recolor(color_func=image_colors)
    #     plt.imshow(wc)
    #     plt.axis('off')
    #     plt.show()

    def getListPage(listPageUrl):
        res = requests.get(listPageUrl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        for news in soup.select('li'):
            if len(news.select('.news-list-title')) > 0:
                a = news.select('a')[0].attrs['href']
                getNewsDetail(a)

    firstPage = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    getListPage(firstPage)
    for i in range(2, 3):
        listpageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        getListPage(listpageUrl)   # crawl the remaining list pages as well

    f = open('keyword.txt', 'r', encoding='utf-8').read()
    wordcloud = WordCloud(background_color="white", width=1000, height=860, margin=2,
                          font_path='C:/Windows/Fonts/AdobeKaitiStd-Regular.otf').generate(f)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    wordcloud.to_file('111.png')
     

    Scraping the Duowan LOL news section

    # -*- coding: UTF-8 -*-
    import requests
    import string
    import re
    import jieba
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    from datetime import datetime
    from bs4 import BeautifulSoup
    
    # def getKeynews(content):
    #     content = ''.join(re.findall('[\u4e00-\u9fa5]', content))  # keep only Chinese characters, joined into one punctuation-free string
    #     newSet = set(jieba.lcut(content))    # segment the content and drop duplicates via a set
    #     newDict = {}                         # token -> frequency
    #     for i in newSet:
    #         newDict[i] = content.count(i)
    #     deleteList, keynews = [], []
    #     for i in newDict.keys():
    #         if len(i) < 2:
    #             deleteList.append(i)         # collect single-character, meaningless tokens
    #     for i in deleteList:
    #         del newDict[i]
    #     dictList = list(newDict.items())
    #     dictList.sort(key=lambda item: item[1], reverse=True)  # sort by frequency, descending
    #     for item in dictList:
    #         keynews.append(item[0])
    #     return keynews
    # def writeFilekeynews(keywords):
    #     f = open('Filekeynews6', 'a', encoding='utf-8')
    #     for word in keywords:
    #         f.write(" " + word)
    #     f.close()
    # def writeNewsDetail(content):
    #     f = open('duowanNews.txt', 'a', encoding='utf-8')
    #     f.write("\n" + content)
    #     f.close()
    # def getNewsDetail(newsUrl):
    #     resd = requests.get(newsUrl)
    #     resd.encoding = 'utf-8'
    #     soupd = BeautifulSoup(resd.text, 'html.parser')
    #     content = soupd.select('.show-content')[0].text.strip()
    #     writeNewsDetail(content)
    #     keynews = getKeynews(content)
    #     writeFilekeynews(keynews)
    
    
    # firstPage = 'http://lol.duowan.com/tag/172578469745.html'
    # for i in range(2, 3):
    #     listpageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    # f = open('keyword.txt', 'r').read()
    # wordcloud = WordCloud(background_color="white", width=1000, height=860, margin=2,
    #                       font_path='C:/Windows/Fonts/AdobeKaitiStd-Regular.otf').generate(f)
    # plt.imshow(wordcloud)
    # plt.axis("off")
    # plt.show()
    # wordcloud.to_file('111.png')

    # f = open('keyword2.txt', 'r').read()
    # wordcloud = WordCloud(background_color="white", width=1000, height=860, margin=2,
    #                       font_path='C:/Windows/Fonts/AdobeKaitiStd-Regular.otf').generate(f)
    # plt.imshow(wordcloud)
    # plt.axis("off")
    # plt.show()
    def write_news_to_document(filename, content):
        # append, so that every list page's articles accumulate in the same file
        f = open(filename, 'a', encoding='utf-8')
        for detail in content:
            f.write(detail['content'])
        f.close()


    # write the extracted keywords to a file
    def write_keywords_to_document(filename, keywords):
        f = open(filename, 'w', encoding='utf-8')
        for word in keywords:
            f.write('  ' + word)
        f.close()


    # extract keywords with jieba word segmentation
    def get_keywords(filename):
        f = open(filename, 'r', encoding='utf-8')
        content = f.read()
        f.close()
        # keep Chinese characters, letters and digits, join them into one punctuation-free string,
        # segment it with jieba and turn the tokens into a set
        word_set = set(jieba.lcut(''.join(re.findall("[\u4e00-\u9fa5_a-zA-Z0-9]", content))))
        word_dict = {}
        delete_list = []
        keywords = []
        for a in word_set:
            word_dict[a] = content.count(a)  # token -> frequency
        for j in word_dict.keys():
            if len(j) < 2:
                delete_list.append(j)  # collect single-character, meaningless tokens
        for k in delete_list:
            del word_dict[k]  # remove them from the frequency dict
        dict_list = list(word_dict.items())
        dict_list.sort(key=lambda item: item[1], reverse=True)
        for item in dict_list:
            keywords.append(item[0])
        print(keywords)
        write_keywords_to_document("NewsKeyword.txt", keywords)
    
    
    # fetch the detailed article content (earlier version that used gbk encoding)
    # def get_news_detail(news_url):
    #     res_d = requests.get(news_url)
    #     res_d.encoding = 'gbk'
    #     soup_d = BeautifulSoup(res_d.text, 'html.parser')
    #     content = ''
    #     for p in range(0, len(soup_d.select(".text"))):
    #         content += soup_d.select('.text')[p].text + '\n'
    #     detail = {'content': content}
    #     return detail

    def get_news_detail(news_url):
        res_d = requests.get(news_url)
        res_d.encoding = 'UTF-8'
        soup_d = BeautifulSoup(res_d.text, 'html.parser')
        content = ''
        for i in range(3, 15):
            # paragraphs 3-14 hold the article body on these pages
            content += soup_d.select('p')[i].text + '\n'
        detail = {'content': content}
        return detail
    
    
    
    
    # fetch a news list page and collect every article's detail
    def get_news_list(list_url):
        res = requests.get(list_url)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        page_detail = []
        for newsList in soup.select('.m-list')[0].select('li'):
            a = newsList.select('a')[0].attrs['href']
            b = 'http://lol.duowan.com/' + a
            news_detail = get_news_detail(b)
            page_detail.append(news_detail)
        return page_detail
    
    
    # main routine
    url = "http://lol.duowan.com/tag/172578469745.html"
    Page_detail = get_news_list(url)
    print(Page_detail)
    write_news_to_document("News.txt", Page_detail)
    for i in range(2, 9):
        news_url = "http://lol.duowan.com/tag/172578469745_{}.html".format(i)
        Page_detail = get_news_list(news_url)   # fetch the i-th list page
        write_news_to_document("News.txt", Page_detail)
    get_keywords("News.txt")

    f = open('NewsKeyword.txt', 'r', encoding='utf-8').read()
    wordcloud = WordCloud(background_color="white", width=1000, height=860, margin=2,
                          font_path='C:/Windows/Fonts/AdobeKaitiStd-Regular.otf').generate(f)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    wordcloud.to_file('777.png')

    This was scraped right after RNG won the MSI mid-season final, so the RNG players UZI, Letme, Karsa, Ming, Xiaohu and Mlxg all show up prominently; UZI, Letme and Karsa in particular had standout performances, which is why their word frequencies are so high.
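
    To double-check the ranking behind this observation, the token frequencies can be inspected directly. The following is only a sketch, assuming the News.txt produced by the script above and using collections.Counter instead of the manual dictionary in get_keywords:

    import collections
    import jieba

    # segment the raw scraped text and count token frequencies
    text = open('News.txt', 'r', encoding='utf-8').read()
    tokens = [w for w in jieba.lcut(text) if len(w) >= 2]   # drop single-character tokens, as get_keywords does
    for word, count in collections.Counter(tokens).most_common(10):
        print(word, count)                                  # the ten most frequent tokens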
