zoukankan      html  css  js  c++  java
  • 【Python】统计个人新浪微博词频并给出相应的柱状图

    Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门

    https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=564564604865

    本文介绍如何进行个人新浪微博词频统计,并给出相应的柱状图分析,编程环境为Python 2.7。该文主要包括三个部分:新浪微博API的使用、文本过滤及分词和词频统计。

        一、新浪微博API的使用
        首先在新浪微博开放平台http://open.weibo.com/development/上申请开发者账号,获取个人APP_KEY和APP_SECRET,下载并安装Python SDK。本文介绍的方法无需每次验证,直接运行即可。
    # -*- coding: UTF-8 -*-
    from weibo import APIClient
    from re import split
    import urllib,httplib
    import webbrowser
    import operator
    import numpy as np
    import matplotlib.pyplot as plt
     
    class iWInsightor(object):
        """Helper around the Sina Weibo SDK client for a single account.

        Creating an instance builds the SDK client and immediately runs the
        authorization flow (``get_Authorization``), so the API methods below
        can be used as soon as ``__init__`` returns.
        """

        def __init__(self, ID, PW):
            """Store the credentials, build the SDK client and authorize it.

            ID -- account name submitted with the login form.
            PW -- password submitted with the login form.
            """
            self.ACCOUNT = ID
            self.PASSWORD = PW
            self.CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'
            self.APP_KEY = 'XXXXXXX'  # Yours
            self.APP_SECRET = 'XXXXXX'  # Yours
            self.client = APIClient(app_key=self.APP_KEY,
                                    app_secret=self.APP_SECRET,
                                    redirect_uri=self.CALLBACK_URL)
            self.url = self.client.get_authorize_url()
            self.get_Authorization()

        def get_code(self):
            """POST the login form to /oauth2/authorize and return the code.

            The code is the segment that follows the first ``=`` in the
            ``location`` header of the response.

            Raises ValueError when the response has no ``location`` header or
            the header contains no ``=`` (previously this surfaced as an
            AttributeError/IndexError with no hint about the cause).
            """
            form = {
                'client_id': self.APP_KEY,
                'response_type': 'code',
                'redirect_uri': self.CALLBACK_URL,
                'action': 'submit',
                'userId': self.ACCOUNT,
                'passwd': self.PASSWORD,
                'isLoginSina': 0,
                'from': '',
                'regCallback': '',
                'state': '',
                'ticket': '',
                'withOfficalFlag': 0,
            }
            headers = {
                'Referer': self.url,
                'Content-Type': 'application/x-www-form-urlencoded',
            }
            conn = httplib.HTTPSConnection('api.weibo.com')
            try:
                conn.request('POST', '/oauth2/authorize',
                             urllib.urlencode(form), headers)
                location = conn.getresponse().getheader('location')
            finally:
                # Close the connection even when the request itself fails.
                conn.close()
            if not location or '=' not in location:
                raise ValueError(
                    'authorization failed: no code in redirect location %r'
                    % (location,))
            return location.split('=')[1]

        def get_Authorization(self):
            """Exchange the authorization code for a token and install it."""
            token = self.client.request_access_token(self.get_code())
            self.client.set_access_token(token.access_token, token.expires_in)

        def post_weibo(self, message):
            """Post ``message`` as a new status; it is decoded from GBK first."""
            self.client.post.statuses__update(status=message.decode('gbk'))

        def getCurrentUid(self):
            """Return the uid of the authorized user.

            On any failure prints 'get userid failed' and returns None.
            """
            try:
                return self.client.account.get_uid.get()['uid']
            except Exception:
                print('get userid failed')
                return

        def getFocus(self, userid):
            """Return ``(screen_name, gender)`` tuples for the accounts that
            ``userid`` follows (requested with count=200).

            If an entry cannot be read, prints 'get focus failed' and returns
            None.
            """
            focuses = self.client.get.friendships__friends(uid=userid, count=200)
            Resfocus = []
            for focus in focuses["users"]:
                try:
                    Resfocus.append((focus["screen_name"], focus["gender"]))
                except Exception:
                    print('get focus failed')
                    return
            return Resfocus

        def getTags(self, userid):
            """Return the tag values of ``userid``, ordered by descending
            ``weight``.

            Every entry of a tag other than its 'weight' entry is collected.
            If the request fails, prints 'get tags failed' and returns None.
            """
            try:
                tags = self.client.tags.get(uid=userid)
            except Exception:
                print('get tags failed')
                return
            ranked = sorted(tags, key=operator.attrgetter('weight'), reverse=True)
            return [tag[item]
                    for tag in ranked
                    for item in tag
                    if item != 'weight']

        def getWeibo(self, uesrid, infile):
            """Append the text of the user's posts to ``infile``.

            The timeline is requested with count=100. Each post's text is
            written UTF-8 encoded and followed by one space. The file is
            opened once in binary append mode so that non-ASCII text is
            encoded explicitly instead of going through an implicit ASCII
            conversion. Failures are reported by printing 'get text failed'.
            """
            contents = self.client.get.statuses__user_timeline(uid=uesrid, count=100)
            try:
                out = open(infile, 'ab')
            except IOError:
                print('get text failed')
                return
            with out:
                for content in contents.statuses:
                    try:
                        out.write(content.text.encode('utf-8') + b' ')
                    except Exception:
                        # Best effort: skip this post and keep going.
                        print('get text failed')

        def autolabel(self, rects):
            """Write each bar's height, as a float, slightly above the bar."""
            for rect in rects:
                height = rect.get_height()
                plt.text(rect.get_x() + rect.get_width() / 2., 1.03 * height,
                         '%s' % float(height))

        def getSexplot(self, userid, m, f, n):
            """Show a bar chart of the male / female / unknown counts.

            userid  -- user whose screen name is shown in the legend.
            m, f, n -- heights of the "Male", "Female" and "Unknown" bars.
            """
            res = self.client.get.users__show(uid=userid)
            ind = np.arange(1, 4)
            width = 0.25
            plt.subplot(111)
            # Bar positions are passed positionally: the keyword for the first
            # argument was renamed from `left` to `x` in newer Matplotlib.
            rects1 = plt.bar(ind, (m, f, n), width=width, align='center')

            plt.ylabel('The Focus Number')
            plt.title('Sex Analysis(effective samples:%d)' % (m + f + n))

            plt.xticks(ind, ("Male", "Female", "Unknown"))
            self.autolabel(rects1)
            plt.legend((rects1,), ("User:%s" % res["screen_name"],))
            plt.show()
            
    if __name__ == '__main__':
        # Local import: only this interactive entry point needs it.
        import getpass

        usrID = raw_input('请输入新浪微博用户名:')
        # Read the password without echoing it to the terminal.
        usrPW = getpass.getpass('请输入新浪微博密码:')
        AppClient = iWInsightor(usrID, usrPW)

        userid = AppClient.getCurrentUid()
        if userid is None:
            # getCurrentUid returns None when the lookup fails; there is no
            # timeline to fetch without a uid.
            raise SystemExit(1)
        infile = "E://data/weibo.dat"  # path and file name the post text is saved to
        AppClient.getWeibo(userid,infile)

        #Focus = AppClient.getFocus(userid)
        #m = 0
        #f = 0
        #n = 0
        #for i in Focus:
            #if i[1] == "m":
                #m = m+1
            #elif i[1] == "f":
                #f = f+1
            #else:
                #n = n+1
        #AppClient.getSexplot(userid,m,f,n)
        二、文本过滤及分词
        微博中常常含有一些对词频统计没有任何作用的内容,例如英文字母、数字、汉语标点符号以及其他个性符号,这些都需要在分词前滤除。此外,你还可以添加自己想滤除的符号或者字词。
        中文与英文句子比较而言,有一个非常有趣的现象,那就是英文单词之间是有空格的,而中文则不然。因此,分词也成了中文信息处理中的一个基本步骤。我用的是结巴分词,可以添加自定义词典(因为分词字典很多词可能没涉及到),下载地址为https://github.com/fxsjy/jieba
    # -*- coding: UTF-8 -*-
    import string
    import jieba
     
    # Load a user-supplied dictionary into jieba before any segmentation runs.
    extra_dict = 'F://NLP/iWInsightor/jieba/mydict.dict'#custom dictionary
    jieba.load_userdict(extra_dict)
     
    def filter_str(instr):
        """Remove characters that carry no meaning for word-frequency counting.

        Dropped characters: ASCII punctuation, the space, ASCII digits, ASCII
        letters and the Chinese punctuation marks listed in ``deCstr``.

        instr -- UTF-8 encoded byte string, or already-decoded text.
        Returns the remaining characters as a text (unicode) string.
        """
        deEstr = string.punctuation + ' ' + string.digits + string.ascii_letters
        # Text literal (not bytes): the input is compared character by
        # character after decoding, so the reject list must be text as well.
        deCstr = u',。《》【】()!?★”“、:…'
        # A set gives constant-time membership tests inside the loop below.
        destr = frozenset(deEstr + deCstr)
        if isinstance(instr, bytes):
            instr = instr.decode('utf-8')
        return u''.join(char for char in instr if char not in destr)
     
    # Input text is read as raw bytes ('rb' suffices: it is never written to)
    # and each line is passed to filter_str. The segmented result is appended
    # to the output file as UTF-8 bytes, encoded explicitly rather than through
    # an implicit ASCII conversion. `with` closes both files even if a line
    # fails part-way through.
    with open('F://NLP/iWInsightor/weibo.dat', 'rb') as fp_in, \
            open('F://NLP/iWInsightor/weibo_filter.dat', 'ab') as fp_out:
        for line in fp_in:
            str_delete = filter_str(line)
            # cut_all=True is passed through to jieba unchanged.
            seg_list = jieba.cut(str_delete, cut_all=True)
            str_join = ' '.join(seg_list)
            fp_out.write(str_join.encode('utf-8'))
        三、词频统计
        词频统计就是指统计出某个文本中各个词出现的次数,使用Python中的字典(dict)数据结构很容易实现。我用的是matplotlib画柱状图,画出top-K个高频词。这里需要注意的是图中的中文显示问题,在使用之前,需要修改相应的设置,具体方法不妨去google一下,我就不详细介绍了。
    # -*- coding: UTF-8 -*-
    import string
    import numpy
    import pylab
     
    def getstr(word, count):
        """Return ``word`` and ``count`` joined by a comma, e.g. ``abc,3``."""
        return ','.join((word, str(count)))
     
    def get_wordlist(infile):
        """Return the words found in ``infile``, in file order.

        Each line is split on single spaces; only pieces whose length is
        greater than 1 are kept. The line terminator is stripped before
        splitting so that the last word of a line is not returned with a
        newline attached to it.
        """
        wordlist = []
        with open(infile) as src:
            for line in src:
                pieces = line.rstrip('\r\n').split(' ')
                wordlist.extend(word for word in pieces if len(word) > 1)
        return wordlist
        
    def get_wordcount(wordlist, outfile):
        """Count the words in ``wordlist`` and write the ranking to ``outfile``.

        The file is overwritten. Entries are written most frequent first, each
        as ``word,count`` followed by one space, encoded as GBK. Words given
        as byte strings are decoded as UTF-8 before being re-encoded, so no
        implicit ASCII conversion is involved.

        Returns a dict mapping each word, exactly as given, to its count.
        """
        wordcnt = {}
        for word in wordlist:
            wordcnt[word] = wordcnt.get(word, 0) + 1
        # Descending by count; the sort is stable, so ties keep dict order.
        ranking = sorted(wordcnt.items(), key=lambda pair: -pair[1])
        with open(outfile, 'wb') as out:
            for word, cnt in ranking:
                if isinstance(word, bytes):
                    word = word.decode('utf-8')
                out.write((u'%s,%d ' % (word, cnt)).encode('gbk'))
        return wordcnt
     
    def barGraph(wcDict):
        """Show a bar chart of the frequent words in ``wcDict``.

        wcDict -- dict mapping a word to its count.

        Only words with a count above 5 and ``len(word) > 3`` are plotted
        (for byte-string keys that length is a byte count). Byte-string keys
        are decoded as UTF-8 for the tick labels; keys that are already text
        are used as they are. Bars are ordered by word.
        """
        wordlist = []
        for key, val in wcDict.items():
            if val > 5 and len(key) > 3:
                if isinstance(key, bytes):
                    key = key.decode('utf-8')
                wordlist.append((key, val))
        wordlist.sort()
        keylist = [key for key, val in wordlist]
        vallist = [val for key, val in wordlist]
        barwidth = 0.5
        xVal = numpy.arange(len(keylist))
        # Ticks sit half a bar to the right of xVal, i.e. at the bar centres
        # only when bars start at xVal. align='edge' states that explicitly
        # instead of relying on the library default.
        pylab.xticks(xVal + barwidth / 2.0, keylist, rotation=45)
        pylab.bar(xVal, vallist, width=barwidth, color='y', align='edge')
        pylab.title(u'微博词频分析图')
        pylab.show()
         
    if __name__ == '__main__':
        # Read the segmented text, write the ranked counts, then plot them.
        myfile = 'F://NLP/iWInsightor/weibo_filter.dat'
        outfile = 'F://NLP/iWInsightor/result.dat'
        barGraph(get_wordcount(get_wordlist(myfile), outfile))
        
        至此,我们的工作就完成了。下面是我的微博词频的一个柱状图。这些仅是业余时间之作,尚有诸多不足之处。
        【Python】统计个人新浪微博词频并给出相应的柱状图
  • 相关阅读:
    使用cd回到上次编辑的目录
    自动机编程
    python日常题目小练习
    python中的循环结构等相关知识
    python中的数学类型及操作
    小白艰难的Python图像的绘制
    小白的第二天之计算机基础及软件安装
    小白的日常练习
    小白的第一天
    协程
  • 原文地址:https://www.cnblogs.com/webRobot/p/5399413.html
Copyright © 2011-2022 走看看