【Python】统计个人新浪微博词频并给出相应的柱状图

zoukankan html css js c++ java

【Python】统计个人新浪微博词频并给出相应的柱状图

Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门

https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=564564604865

本文介绍如何进行个人新浪微博词频统计，并给出相应的柱状图分析，编程环境为Python 2.7。该文主要包括三个部分：新浪微博API的使用、文本过滤及分词和词频统计。

    一、新浪微博API的使用

    首先在新浪微博开放平台http://open.weibo.com/development/上申请开发者账号，获取个人APP_KEY和APP_SECRET，下载并安装Python SDK。本文介绍的方法无需每次验证，直接运行即可。

# -*- coding: UTF-8 -*-

from weibo import APIClient

from re import split

import urllib,httplib

import webbrowser

import operator

import numpy as np

import matplotlib.pyplot as plt

class iWInsightor(object):
    """Small helper around the Sina Weibo ``APIClient`` for one account.

    Creating an instance runs the whole OAuth2 flow immediately: it builds
    the client, posts the account credentials to the authorize endpoint and
    stores the resulting access token on ``self.client``.
    """

    def __init__(self, ID, PW):
        """Build the API client and authorize it.

        ID -- login name, sent as ``userId`` to the authorize endpoint.
        PW -- password, sent as ``passwd`` to the authorize endpoint.
        """
        self.ACCOUNT = ID
        self.PASSWORD = PW
        self.CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'
        self.APP_KEY = 'XXXXXXX'  # Yours
        self.APP_SECRET = 'XXXXXX'  # Yours
        self.client = APIClient(app_key=self.APP_KEY, app_secret=self.APP_SECRET, redirect_uri=self.CALLBACK_URL)
        self.url = self.client.get_authorize_url()
        self.get_Authorization()

    def get_code(self):
        """Submit the login form and return the OAuth2 authorization code.

        The code is the text between the first and second '=' of the
        response's ``Location`` header.

        Raises ValueError when the response carries no ``Location`` header
        or the header contains no '='.
        """
        conn = httplib.HTTPSConnection('api.weibo.com')
        try:
            postdata = urllib.urlencode({'client_id':self.APP_KEY,'response_type':'code','redirect_uri':self.CALLBACK_URL,'action':'submit','userId':self.ACCOUNT,'passwd':self.PASSWORD,'isLoginSina':0,'from':'','regCallback':'','state':'','ticket':'','withOfficalFlag':0})
            conn.request('POST','/oauth2/authorize',postdata,{'Referer':self.url,'Content-Type': 'application/x-www-form-urlencoded'})
            res = conn.getresponse()
            location = res.getheader('location')
        finally:
            # Close the connection even when the request itself fails.
            conn.close()
        if not location or '=' not in location:
            # Without this check the failure surfaces as an opaque
            # AttributeError / IndexError on the split below.
            raise ValueError('no authorization code in redirect location: %r' % (location,))
        return location.split('=')[1]

    def get_Authorization(self):
        """Exchange the authorization code for a token and install it on the client."""
        code = self.get_code()
        r = self.client.request_access_token(code)
        access_token = r.access_token
        expires_in = r.expires_in
        self.client.set_access_token(access_token, expires_in)

    def post_weibo(self,message):
        """Post a new status.

        message -- decoded with the 'gbk' codec before sending, so it is
        presumably a GBK-encoded byte string (e.g. Windows console input).
        """
        self.client.post.statuses__update(status=message.decode('gbk'))

    def getCurrentUid(self):
        """Return the uid of the authorized user, or None if the call fails."""
        try:
            uid = self.client.account.get_uid.get()['uid']
            return uid
        except Exception:
            print('get userid failed')
            return None

    def getFocus(self,userid):
        """Return ``(screen_name, gender)`` for up to 200 accounts followed by *userid*.

        Returns None as soon as one entry cannot be read (after printing a
        message); callers must check for None before iterating.
        """
        focuses = self.client.get.friendships__friends(uid=userid,count=200)
        Resfocus = []
        for focus in focuses["users"]:
            try:
                Resfocus.append((focus["screen_name"],focus["gender"]))
            except Exception:
                print('get focus failed')
                return None
        return Resfocus

    def getTags(self,userid):
        """Return the tag values of *userid*, highest ``weight`` first.

        Returns None (after printing a message) when the API call fails.
        Each tag is read both by attribute (``weight``) and by iterating
        its keys -- assumes the SDK returns dict-like objects that also
        allow attribute access; confirm against the SDK.
        """
        try:
            tags = self.client.tags.get(uid=userid)
        except Exception:
            print('get tags failed')
            return None
        userTags = []
        sortedT = sorted(tags,key=operator.attrgetter('weight'),reverse=True)
        for tag in sortedT:
            for item in tag:
                # Every key except 'weight' holds a tag value.
                if item != 'weight':
                    userTags.append(tag[item])
        return userTags

    def getWeibo(self,uesrid,infile):
        """Append the text of the user's latest statuses (count=100) to *infile*.

        Each status text is written as UTF-8 bytes followed by one space.
        The file is opened once in binary append mode; text is encoded
        explicitly so the write does not depend on the interpreter's
        default (ASCII) codec. A status that cannot be written is reported
        and skipped.
        """
        contents = self.client.get.statuses__user_timeline(uid=uesrid, count=100)
        try:
            out = open(infile,'ab')
        except (IOError, OSError):
            print('get text failed')
            return
        with out:
            for content in contents.statuses:
                try:
                    text = content.text
                    if not isinstance(text, bytes):
                        text = text.encode('utf-8')
                    out.write(text + b' ')
                except Exception:
                    # Best effort: keep going with the remaining statuses.
                    print('get text failed')

    def autolabel(self,rects):
        """Write each bar's height (as a float) slightly above the bar."""
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x()+rect.get_width()/2., 1.03*height, '%s' % float(height))

    def getSexplot(self,userid,m,f,n):
        """Show a bar chart of the gender split among followed accounts.

        m, f, n -- heights of the "Male", "Female" and "Unknown" bars.
        The legend shows the ``screen_name`` of *userid*.
        """
        res = self.client.get.users__show(uid=userid)
        ind = np.arange(1,4)
        width = 0.25
        plt.subplot(111)
        # Positions are passed positionally: the first parameter was called
        # ``left`` in old matplotlib and ``x`` in newer releases.
        rects1 = plt.bar(ind, (m,f,n), width, align='center')
        plt.ylabel('The Focus Number')
        plt.title('Sex Analysis(effective samples:%d)' % (m+f+n))

        plt.xticks(ind, ("Male","Female","Unknown") )
        self.autolabel(rects1)
        plt.legend((rects1,),("User:%s" % res["screen_name"],))
        plt.show()



if __name__ == '__main__':
    # Script entry point: log in, then dump the user's statuses to a file.
    # Local import: getpass is only needed here.
    import getpass

    usrID = raw_input('请输入新浪微博用户名：')
    # Read the password without echoing it to the terminal.
    usrPW = getpass.getpass('请输入新浪微博密码:')
    AppClient = iWInsightor(usrID, usrPW)

    userid = AppClient.getCurrentUid()
    if userid is None:
        # getCurrentUid() has already printed the failure; there is no
        # user to fetch statuses for, so stop instead of querying uid=None.
        raise SystemExit(1)
    infile = "E://data/weibo.dat"  # path and file name where the status text is saved
    AppClient.getWeibo(userid,infile)

    # Optional: gender split of followed accounts (disabled by default).
    #Focus = AppClient.getFocus(userid)
    #m = 0
    #f = 0
    #n = 0
    #for i in Focus:
        #if i[1] == "m":
            #m = m+1
        #elif i[1] == "f":
            #f = f+1
        #else:
            #n = n+1
    #AppClient.getSexplot(userid,m,f,n)

    二、文本过滤及分词

    微博中常常含有一些对词频统计没有任何作用的内容，例如英文字母、数字、汉语标点符号以及其他个性符号，这些都需要在分词前滤除。此外，你还可以添加自己想滤除的符号或者字词。

    中文与英文句子比较而言，有一个非常有趣的现象，那就是英文单词之间是有空格的，而中文则不然。因此，分词也成了中文信息处理中的一个基本步骤。我用的是结巴分词，可以添加自定义词典（因为分词字典很多词可能没涉及到），下载地址为https://github.com/fxsjy/jieba。

# -*- coding: UTF-8-*-

import string

import jieba

# Path of the user-defined jieba dictionary; it is loaded once at import
# time, before any text is segmented.
extra_dict = 'F://NLP/iWInsightor/jieba/mydict.dict'#custom dictionary

jieba.load_userdict(extra_dict)

def filter_str(instr):
    """Strip characters that carry no meaning for word-frequency counting.

    Removed: ASCII punctuation, the space character, ASCII digits and
    letters, and a fixed set of Chinese punctuation marks. Everything else
    (including line breaks) is kept.

    instr -- UTF-8 encoded byte string, or already-decoded text.
    Returns the remaining characters as a unicode string.
    """
    # ascii_letters instead of the locale-dependent string.letters.
    deEstr = string.punctuation + ' ' + string.digits + string.ascii_letters
    # Unicode literal: the membership test below compares decoded
    # characters, so the Chinese marks must be characters, not UTF-8 bytes.
    deCstr = u'，。《》【】（）！？★”“、：…'
    destr = frozenset(deEstr + deCstr)
    if isinstance(instr, bytes):
        instr = instr.decode('utf-8')
    return u''.join(char for char in instr if char not in destr)

# Filter and segment the raw text line by line.
# Input is opened read-only in binary mode (each line reaches filter_str
# as UTF-8 bytes); output is appended as UTF-8 bytes. The with-statement
# closes both files even if filtering or segmentation raises.
with open('F://NLP/iWInsightor/weibo.dat', 'rb') as fp_in, \
        open('F://NLP/iWInsightor/weibo_filter.dat', 'ab') as fp_out:
    for line in fp_in:
        str_delete = filter_str(line)
        # Full mode: emit every dictionary word found in the text.
        seg_list = jieba.cut(str_delete, cut_all=True)
        str_join = u' '.join(seg_list)
        # Encode explicitly so the write does not depend on the
        # interpreter's default codec.
        fp_out.write(str_join.encode('utf-8'))

    三、词频统计

    词频统计就是统计某个文本中各个词出现的次数，使用Python中的字典（dict）数据结构很容易实现。我用matplotlib画柱状图，画出top-K个高频词。需要注意的是图中的中文显示问题：在使用之前需要修改matplotlib的字体设置，具体方法可以自行搜索，这里不再详细介绍。

    # -*- coding: UTF-8-*-

import string

import numpy

import pylab

def getstr(word, count):
    """Return ``word`` and ``count`` joined by a comma, e.g. ``'abc,3'``."""
    return ','.join((word, str(count)))

def get_wordlist(infile):
    """Read *infile* and return its space-separated words, in file order.

    Words of length 0 or 1 are dropped. The line terminator is removed
    before splitting so the last word of a line is not stored with a
    trailing newline (which would make it a different dictionary key
    from the same word elsewhere).
    """
    wordlist = []
    # The with-statement closes the file; the original left it open.
    with open(infile) as src:
        for line in src:
            for word in line.rstrip('\r\n').split(' '):
                if len(word) > 1:
                    wordlist.append(word)
    return wordlist



def get_wordcount(wordlist, outfile):
    """Count the words in *wordlist* and write the ranking to *outfile*.

    The file receives ``word,count`` entries separated by single spaces,
    most frequent first, encoded as GBK. Words given as byte strings are
    treated as UTF-8 and decoded before re-encoding, so the result does
    not depend on the interpreter's default codec.

    Returns the ``{word: count}`` dict, keyed by the words exactly as
    they were passed in.
    Raises UnicodeEncodeError for a word that GBK cannot represent.
    """
    wordcnt = {}
    for word in wordlist:
        wordcnt[word] = wordcnt.get(word, 0) + 1
    # sorted() works on both list and view results of items().
    ranked = sorted(wordcnt.items(), key=lambda pair: -pair[1])
    with open(outfile, 'wb') as out:
        for word, cnt in ranked:
            text = word.decode('utf-8') if isinstance(word, bytes) else word
            out.write((u'%s,%d ' % (text, cnt)).encode('gbk'))
    return wordcnt

def barGraph(wcDict):
    """Show a bar chart of the frequent words in *wcDict*.

    wcDict -- ``{word: count}`` mapping. Only entries with a count above 5
    and a key length above 3 are drawn; byte-string keys are decoded as
    UTF-8 for the tick labels. Bars are ordered by label.
    """
    wordlist = []
    for key, val in wcDict.items():
        if val > 5 and len(key) > 3:
            # Keys that are already text need no decoding.
            label = key.decode('utf-8') if isinstance(key, bytes) else key
            wordlist.append((label, val))
    wordlist.sort()
    keylist = [key for key, val in wordlist]
    vallist = [val for key, val in wordlist]
    barwidth = 0.5
    xVal = numpy.arange(len(keylist))
    # Centre the bars on xVal explicitly and put the ticks there too; the
    # old "tick at xVal + barwidth/2" offset was only right while bar()
    # defaulted to edge alignment.
    pylab.xticks(xVal, keylist, rotation=45)
    pylab.bar(xVal, vallist, width=barwidth, color='y', align='center')
    pylab.title(u'微博词频分析图')
    pylab.show()



if __name__ == '__main__':
    # Pipeline: read the segmented text, count the words and write the
    # ranking to the result file, then plot the frequent words.
    myfile, outfile = (
        'F://NLP/iWInsightor/weibo_filter.dat',
        'F://NLP/iWInsightor/result.dat',
    )
    wordlist = get_wordlist(myfile)
    wordcnt = get_wordcount(wordlist, outfile)
    barGraph(wordcnt)



    至此，我们的工作就完成了。下面是我的微博词频的一个柱状图。这些仅是业余时间之作，尚有诸多不足之处。



查看全文

相关阅读:
ORACLE触发器详解
 论文笔记 Interpreting Black-Box Classifiers Using Instance-Level Visual Explanations
Popush迭代2个人总结
 Popush迭代1个人总结
 Popush第5次会议记录
 Xv6代码阅读报告之进程调度
 Popush源代码学习报告
 Popush 第二次小组会议记录及分工
 Popush 用户故事
 “老衲印象”开发团队章程

原文地址：https://www.cnblogs.com/webRobot/p/5399413.html