  • Viterbi

    #!/usr/bin/python
    #coding=utf-8
    
    import urllib2
    import sys, time, re
    import jieba
    jieba.load_userdict("userdict.txt")
    import jieba.analyse
    import jieba.posseg as pseg
    import os
    jieba.initialize()
    import operator
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    
    t1 = time.time()
    
    url = "10.txt"
    content = open(url, "rb").read()
    #print type(content)
    print 'Article length:', len(content)
    strRe = re.sub(r'\s', '', content)   # strip all whitespace with a regex
    print 'Character count after stripping all whitespace:', len(strRe)
    
    '''
    fo = open("foo.txt", "wb")
    fo.write(strRe)
    # close the opened file
    fo.close()
    '''
    
    # Segmentation; out-of-vocabulary (unknown) words are segmented with the Viterbi algorithm
    words = list(jieba.cut(strRe, cut_all=False))
    print "Total number of tokens:", len(words)
    wordset = sorted(set(words))
    print "Number of distinct tokens:", len(wordset)
    
    #TF-IDF
    jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
    tf_idf_tags = jieba.analyse.extract_tags(strRe, topK = 10)
    print "TF-IDF, top 10 keywords, stop words not removed"
    print(",".join(tf_idf_tags))
    
    jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
    jieba.analyse.set_stop_words("extra_dict/cn_stop_words.txt")
    tf_idf_stop_words_tags = jieba.analyse.extract_tags(strRe, topK = 10)
    print "TF-IDF, top 10 keywords, stop words removed"
    print(",".join(tf_idf_stop_words_tags))
    
    
    #TextRank
    #tagswords = jieba.analyse.textrank(content)
    #print(",".join(tagswords))
    
    print "TextRank, top 10 keywords"
    TextRank_words = jieba.analyse.textrank(strRe, topK = 10)
    print(",".join(TextRank_words))
    
    '''
    list = words
    fl = open('list.txt', 'wb')
    
    for i in range(len(list)):
        fl.write(list[i].encode('utf-8')+'--')
        
    fl.close()
    '''
    
    
    # After segmentation, count how many times each token appears
    wordsDict = {}
    DictsMaxWordlen = 0
    singal = ''
    for w in words:
        if wordsDict.get(w) is None:
            wordsDict[w] = 1
        else:
            wordsDict[w] += 1
            
        if DictsMaxWordlen <= wordsDict[w]:
            DictsMaxWordlen = wordsDict[w]
            singal = w
            #print w
    
    print "Highest repetition count for a token:", DictsMaxWordlen, "the token is:", singal
    
    # Sort by dict value (ascending by default); returns a list of (key, value) tuples
    sorted_wordsDict = sorted(wordsDict.iteritems(), key=operator.itemgetter(1))
    #print type(sorted_wordsDict[1])    #tuple
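    
    # Aside: the manual counting and sorting above could also be done with
    # collections.Counter from the standard library (sketch):
    #from collections import Counter
    #counter = Counter(words)
    #print counter.most_common(1)    # [(token, count)] for the most frequent token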
    
    
    classNumWord = {}
    
    for w in sorted_wordsDict:
        if w[1] in classNumWord:
            if w[0] not in classNumWord[w[1]]:  
                classNumWord[w[1]].append(w[0])
        else:
            classNumWord[w[1]] = []
            classNumWord[w[1]].append(w[0])
    # Sort the dict ascending by key (the occurrence count)
    sort_classNumWord = sorted(classNumWord.iteritems(), key=lambda asd:asd[0], reverse = False)
    #print sort_classNumWord[20][1][0].encode('gb2312') 
    
    wordslength = 0     # total number of tokens (words/phrases)
    worldsNum = 0       # number of distinct words/phrases
    wordsFequencelist = {}  # for each frequency level from 1 to N, the number of distinct words at that level
    for w in sort_classNumWord:
        worldsNum += len(w[1])
        wordslength += len(w[1]) * w[0]
        
        wordsFequencelist[w[0]] = []
        wordsFequencelist[w[0]].append(len(w[1]))
            
        #print "============================" 
        #for i in range(len(w[1])):     # print the words/phrases at this frequency level
        #    print w[1][i]
        #print "number of words occurring", w[0], "times:", len(w[1])
        #print "============================"      
    
    sort_wordsFequencelist = sorted(wordsFequencelist.iteritems(), key=lambda asd:asd[0], reverse = False)
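    
    # Aside: the frequency-of-frequencies table above is equivalent to counting how
    # many distinct tokens share each occurrence count (sketch):
    #from collections import Counter
    #freq_of_freq = Counter(wordsDict.values())   # {occurrence count: number of distinct tokens}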
    
    print 'Frequency is the number of times a word occurs; the word count is how many distinct words occur that many times'
    lenWords = 0
    for wordsFequence in sort_wordsFequencelist:
        lenWords += 1
        print 'frequency:{0:<4} words:{1:>6}'.format(wordsFequence[0], wordsFequence[1]), " ",
        if lenWords % 4 == 0:
            print
    
    print 
    print "Number of distinct words/phrases:", worldsNum
    print "Total number of words/phrases:", wordslength
    
    
    print 
    print
    t2 = time.time()
    tm_cost = t2-t1
    print 'Elapsed time:', tm_cost
    

      

    Building prefix dict from C:\Python27\lib\site-packages\jieba-0.36.2-py2.7.egg\jieba\dict.txt ...
    Dumping model to file cache c:\users\og\appdata\local\temp\jieba.cache
    Loading model cost 2.16899991035 seconds.
    Prefix dict has been built succesfully.

  • Original article: https://www.cnblogs.com/hgonlywj/p/4842689.html