zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import requests, re, jieba,pandas
    from bs4 import BeautifulSoup
    from datetime import datetime
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    
    # 获取新闻细节
    # Fetch a single article page and extract its metadata and body text.
    def getNewsDetail(newsUrl):
        """Return a dict with title, newsUrl, time, source and content for one article.

        Fix vs. original: the regexes had lost their backslashes (blog
        mangling) — 'd{4}' matched the literal letter 'd', never a digit.
        Restored as raw strings so \\d and \\s work as intended.
        """
        res = requests.get(newsUrl)
        res.encoding = 'gb2312'  # article pages are GB-encoded
        soupd = BeautifulSoup(res.text, 'html.parser')
        # Fetch the time/source line once instead of re-selecting it twice.
        time_source = soupd.select('.post_time_source')[0].text
        detail = {
            'title': soupd.select('#epContentLeft')[0].h1.text,
            'newsUrl': newsUrl,
            # '.' tolerates the '-'/':' separators; strptime then enforces
            # the exact '%Y-%m-%d %H:%M:%S' layout.
            'time': datetime.strptime(
                re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', time_source).group(1),
                '%Y-%m-%d %H:%M:%S'),
            # NOTE(review): assumes the page uses an ASCII colon after 来源 —
            # confirm against a live page (full-width '：' is also common).
            'source': re.search('来源:(.*)', time_source).group(1),
            'content': soupd.select('#endText')[0].text,
        }
        return detail
    
    # 通过jieba分词,获取新闻关键词
    # Segment the scraped text with jieba and hand the frequency-sorted
    # keyword list to writekeyword().
    def getKeyWords():
        """Build a frequency-sorted keyword list from news.txt.

        Fixes vs. original: jieba.lcut is the public API (jieba._lcut is
        private); the character-class regex had lost its backslashes, so
        '[u4e00-u9fa5]' matched literal letters instead of the CJK range;
        the file is now closed via a context manager; the loop variable no
        longer shadows the builtin 'dict'.
        """
        with open('news.txt', 'r', encoding='utf-8') as f:
            content = f.read()
        # Keep only CJK characters, join them into one punctuation-free
        # string, segment it, and dedupe the tokens.
        wordSet = set(jieba.lcut(''.join(re.findall(r'[\u4e00-\u9fa5]', content))))
        # NOTE: str.count counts substring occurrences in the raw text, not
        # token frequency — behavior kept from the original implementation.
        wordDict = {w: content.count(w) for w in wordSet}
        # Drop single-character tokens, which are rarely meaningful keywords.
        for w in [w for w in wordDict if len(w) < 2]:
            del wordDict[w]
        dictList = sorted(wordDict.items(), key=lambda item: item[1], reverse=True)
        keyWords = [word for word, _ in dictList]
        writekeyword(keyWords)
    
    
    # 将新闻内容写入到文件
    # Append every article body from one page of details to the text dump.
    def writeNews(pagedetail):
        """Append each detail['content'] to news.txt (UTF-8).

        Fixes vs. original: the target file is 'news.txt' — the file that
        getKeyWords() actually reads — instead of the orphaned 'text1.txt',
        and the handle is closed via a context manager.
        """
        with open('news.txt', 'a', encoding='utf-8') as f:
            for detail in pagedetail:
                f.write(detail['content'])
    
    
    # 将词云写入到文件
    # Append the keyword list to the word-cloud input file.
    def writekeyword(keywords):
        """Write the keywords, two-space separated, to keywords.txt (UTF-8).

        Fixes vs. original: the loop iterated an undefined name 'text'
        (NameError on every call) — it now iterates the 'keywords' argument;
        it writes to 'keywords.txt', the file getWordCloud() reads, instead
        of 'text.txt'; and the handle is closed via a context manager.
        """
        with open('keywords.txt', 'a', encoding='utf-8') as f:
            for word in keywords:
                f.write('  ' + word)
    
    # 获取一页的新闻
    # Scrape one list page and collect the detail dict for every news item on it.
    def getListPage(listUrl):
        """Return a list of article-detail dicts for one news list page."""
        resp = requests.get(listUrl)
        resp.encoding = 'utf-8'
        page = BeautifulSoup(resp.text, 'html.parser')
        items = page.select('#news-flow-content')[0].select('li')
        # One detail per <li>, following each item's first link.
        return [getNewsDetail(item.select('a')[0]['href']) for item in items]
    
    
    # Render the keyword file as a word cloud, save it, and display it.
    def getWordCloud():
        """Read keywords.txt, save the cloud as kwords.png, and show it.

        Fix vs. original: the Windows font path had lost its backslashes
        ('C:WindowsFontssimfang.ttf'), so WordCloud could never load the
        CJK-capable font; restored as a raw path. The file handle is now
        closed via a context manager.
        """
        with open('keywords.txt', 'r', encoding='utf-8') as f:
            keywords = f.read()
        # simfang.ttf renders Chinese glyphs; the default font cannot.
        wc = WordCloud(font_path=r'C:\Windows\Fonts\simfang.ttf',
                       background_color='white',
                       max_words=100).generate(keywords)
        wc.to_file('kwords.png')
        plt.imshow(wc)
        plt.axis('off')
        plt.show()
    
    # Driver: crawl the NetEase tech channel, dump article bodies, then
    # build the keyword file and render the word cloud.
    pagedetail = getListPage('http://tech.163.com/internet/')  # front page of the channel
    writeNews(pagedetail)
    for i in range(2, 20):  # the channel keeps only 20 pages, so 20 is hard-coded
        listUrl = 'http://tech.163.com/special/tele_2016_%02d/' % i  # archive pages use two-digit suffixes
        pagedetail = getListPage(listUrl)
        writeNews(pagedetail)
    getKeyWords()  # segment the dumped text and write the keyword file
    getWordCloud()  # read the keyword file back and render the cloud image
    

      

  • 相关阅读:
    Chrome技巧
    jQuery中.bind() .live() .delegate() .on()的区别
    BRAVEHEART勇敢的心威廉姆华莱士战场演讲
    CSS3小模块hover左右交替互换动画
    sublime text 3
    百度图片搜索页的图片展示列表模块jquery效果
    出埃及记:摩西劈开红海
    用位数组计算整数中1的个数
    Using the XPath Wrappers
    【转】为Xcode 4挑选自己喜欢的字体和颜色(Panic Sans)
  • 原文地址:https://www.cnblogs.com/onlythisone/p/8973928.html
Copyright © 2011-2022 走看看