  • obtainKeywords

    #!/usr/bin/python
    #coding=utf-8
    
    import urllib2
    import sys, time, re
    import chardet 
    import jieba
    jieba.load_userdict("userdict.txt")
    import jieba.analyse
    import jieba.posseg as pseg
    import os
    jieba.initialize()
    import operator
    reload(sys)
    sys.setdefaultencoding('utf8')
    import divideSentence
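    # Note on the user dictionary (a hedged sketch; the actual userdict.txt is not shown in the post):
    # jieba expects one entry per line as "word [frequency] [POS tag]", with frequency and tag optional, e.g.
    #   中文信息处理 10 n
    #   TextRank 5 eng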
    
    
    #Detect the encoding of a text (string)
    def obtainTextType(ff):
        # import chardet 
        enc = chardet.detect(ff) 
        return enc['encoding']  #return the detected encoding name
    
    #Detect the encoding of a file
    def obtainFileType(filepath):
        # import chardet 
        tt = open(filepath, 'rb') 
        ff = tt.readline()        #read(5) also works here, but readlines() raises an error 
        tt.close() 
        return obtainTextType(ff)  #return the detected encoding name
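    # Minimal usage sketch (the file name is illustrative):
    #   print obtainFileType('sample.txt')   # e.g. 'utf-8', 'GB2312' or 'ascii'
    # chardet.detect() also reports a confidence score, e.g.
    #   chardet.detect('hello')  ->  {'confidence': 1.0, 'encoding': 'ascii'}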
    
    #Read a file and return its content as a string with all whitespace removed 
    def ReadFile(url):      #url: path of the file
        # print obtainFileType(url)
        if obtainFileType(url) == 'GB2312':
            #decode as gbk (the file's actual encoding) and re-encode as utf-8
            content = open(url, "rb").read().decode("gbk").encode('utf-8')
            # print obtainTextType(content)
    
        elif obtainFileType(url) == 'ascii':
            content = open(url, "rb").read().encode('utf-8')
            # print obtainTextType(content)
    
        else:
            # print obtainFileType(url)
            content = open(url, "rb").read()
            # print obtainTextType(content)
    
        strRe = re.sub(r'\s', '', content)   #strip all whitespace with a regex
        return strRe
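    # Quick check of the whitespace stripping (r'\s' matches spaces, tabs and newlines):
    #   re.sub(r'\s', '', 'a b\tc\n')  ->  'abc'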
    
    
    #Segment a Chinese text into words
    def divide_text_words(content):
        #segmentation; unknown words are handled by the Viterbi-based HMM
        words = list(jieba.cut(content, cut_all=False))
        #print "total number of words:", len(words)
        #wordset = sorted(set(words))
        #print "number of distinct words:", len(wordset)
    
        return words
        
        '''
        #write the words to a file
        list = words
        fl = open('list.txt', 'wb')
        for i in range(len(list)):
            fl.write(list[i].encode('utf-8')+'--')
        fl.close()
        '''
    
    # Load the stop-word file and return a list of Chinese stop words
    def stopWords():
        #read the stop words from the file and return them as a list
        cn_stop_words_file = open("extra_dict/cn_stop_words.txt", "rb").readlines()
        cn_stop_word_list = []      # the stop-word list
        for word in cn_stop_words_file:
            word = re.sub(r'\s', '', word)   #strip all whitespace with a regex
            #print word
            cn_stop_word_list.append(word.decode('utf-8'))
    
        return cn_stop_word_list
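    # Assumed format of extra_dict/cn_stop_words.txt (the file itself is not shown in the post):
    # one utf-8 encoded stop word per line, e.g.
    #   的
    #   了
    #   是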
    
    #Remove stop words. Parameters: the word list to filter and the stop-word list; returns the filtered list
    def delStopWords(words, stopWords):
        reWords = []
        for word in words:
            if word in stopWords:
                continue
            else:
                reWords.append(word) 
    
        return reWords
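    # Equivalent sketch (not the original code): a set lookup is faster when the stop list is long.
    # def delStopWords(words, stopWords):
    #     stop_set = set(stopWords)
    #     return [w for w in words if w not in stop_set]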
    
    
    #Extract keywords from the text and return them as a list
    def keywords(content):
        # #TF-IDF
        # jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
        # tf_idf_tags = jieba.analyse.extract_tags(content, topK = 10)
        # # print "TF-IDF without stop-word removal, top 10 keywords"
        # print(",".join(tf_idf_tags))
    
        #TF-IDF with stop words removed, e.g. 语言,研究,汉语,中文信息处理,汉字
        jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
        jieba.analyse.set_stop_words("extra_dict/cn_stop_words.txt")
        tf_idf_stop_words_tags = jieba.analyse.extract_tags(content, topK = 10)
        # print type(tf_idf_stop_words_tags)
        # print "TF-IDF with stop words removed"
        # print(",".join(tf_idf_stop_words_tags))
    
        #TextRank keywords
        # print "TextRank, top 10 keywords"
        #TextRank_words = []
        TextRank_words = jieba.analyse.textrank(content)
        # print type(TextRank_words)
        # print(",".join(TextRank_words))
    
        keywords_list = TextRank_words + tf_idf_stop_words_tags
        keywords = list(set(keywords_list))
        return keywords
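    # Usage sketch (assumes the extra_dict files exist next to the script):
    #   tags = keywords(strReContent)
    # Both extractors can also return weights if scores are needed, e.g.
    #   jieba.analyse.extract_tags(content, topK=10, withWeight=True)   # [(word, weight), ...]
    #   jieba.analyse.textrank(content, withWeight=True)                # [(word, weight), ...]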
    
    
    
    #Count word frequencies. Parameters: words - the segmented word list to count,
    #                       high_frequency_level: threshold for high-frequency words; the smaller the value, the more words are kept.
    def having_high_frequency_vocabulary(words, high_frequency_level):
    # count how many times each segmented word occurs
        
        wordsDict = {}
        DictsMaxWordlen = 0
        singal = ''
        for w in words:
            if wordsDict.get(w) == None:
                wordsDict[w] = 1
            else:
                wordsDict[w] += 1
                
            if DictsMaxWordlen <= wordsDict[w]:
                DictsMaxWordlen = wordsDict[w]
                # global singal 
                singal = w
                #print w
    
        #print "分词最多重复的次数:%d" % DictsMaxWordlen , "分词是: %s" % singal
        #按字典值排序(默认为升序),返回值是字典{key, tuple}
        sorted_wordsDict = sorted(wordsDict.iteritems(), key=operator.itemgetter(1))
        # print sorted_wordsDict[2][0]
    
    #group together the words that share the same count
    
        classNumWord = {}       #dictionary of grouped words, e.g. {1:['1', '2'], 2:['文化', '历史'], }
        for w in sorted_wordsDict:
            if classNumWord.has_key(w[1]) == True:
                if w[0] not in classNumWord[w[1]]:  
                    classNumWord[w[1]].append(w[0])
            else:
                classNumWord[w[1]] = []
                classNumWord[w[1]].append(w[0])
    
        #sort the dictionary by key in ascending order
        sort_classNumWord = sorted(classNumWord.iteritems(), key=lambda asd:asd[0], reverse = False)
        wordsList = []  #list that collects the selected words
        
        #keep only the top frequency levels, according to the chosen threshold
        for num in range(int(len(sort_classNumWord) * high_frequency_level), len(sort_classNumWord)):
            #print sort_classNumWord[num][0]
            wordsList = wordsList + sort_classNumWord[num][1]       
    
        # print "cut-off index", int(len(sort_classNumWord) * high_frequency_level)
        # print len(wordsList)
        return wordsList
        # print type(sort_classNumWord)
        # print type(sort_classNumWord[20])
        # print 'sort_classNumWord[20][1]', sort_classNumWord[20][1]
        # print type(sort_classNumWord[20][1])
    
        # print sort_classNumWord[20][1][0]
        # print sort_classNumWord[20][1][1]
    
        # wordslength = 0             #total number of segmented words
        # worldsNum = 0               #number of distinct words or phrases
        # wordsFequencelist = {}      #frequency levels from 1 to N, each mapped to how many distinct words occur that often
    
        # for w in sort_classNumWord:
        #     worldsNum += w[0]
        #     wordslength += len(w[1]) * w[0]
        #     wordsFequencelist[w[0]] = []
        #     wordsFequencelist[w[0]].append(len(w[1]))
    
        # sort_wordsFequencelist = sorted(wordsFequencelist.iteritems(), key=lambda asd:asd[0], reverse = False)
    
        # print '		frequency = how many times a word occurs, count = how many distinct words occur that often'
        # lenWords = 0
        # for wordsFequence in sort_wordsFequencelist:
        #     lenWords += 1
        #     print 'frequency:{0:<4} word count:{1:>6}'.format(wordsFequence[0], wordsFequence[1]), " ",
        #     if lenWords % 4 == 0:
        #         print
    
        # print 
        # print "in total", worldsNum, 'distinct words or phrases'
        # print "in total", wordslength, 'words or phrases'
    
    
    #Get the high-frequency words, removing only the stop words.
    def having_del_stop_high_frequency_word(strReContent):
        stop_words_list = stopWords()                       #get the stop words
            
        #get the high-frequency words: set a threshold, keep the frequent words, then drop the stop words
        words = divide_text_words(strReContent)                                                # segment the text
        high_frequency_vocabulary = having_high_frequency_vocabulary(words, 0.333)          # count frequencies and keep the upper 2/3 of the frequency levels
        high_frequency_words = delStopWords(high_frequency_vocabulary, stop_words_list)     # word list with the stop words removed
    
        return high_frequency_words
    
    
    
    #Get the high-frequency words after removing both keywords and stop words. Parameter: the text strReContent.
    def having_del_keywords_and_stop_high_frequency_word(strReContent):
        key_words_list = keywords(strReContent)             #get the keywords via TF-IDF and TextRank
        stop_words_list = stopWords()                       #get the stop words
            
        #get the high-frequency words: set a threshold, keep the frequent words, then remove the merged keyword/stop-word list
        words = divide_text_words(strReContent)                                            # segment the text
        high_frequency_vocabulary = having_high_frequency_vocabulary(words, 0.333)      # count frequencies and keep the upper 2/3 of the frequency levels
        stop_words_list = list(set(stop_words_list + key_words_list))                   # merge the keywords and stop words into one stop list
        high_frequency_delstopWords_list = delStopWords(high_frequency_vocabulary, stop_words_list)   #word list with the merged stop list removed
    
        return high_frequency_delstopWords_list
    
    
    #Get keywords by merging the TF-IDF and TextRank results, then removing stop words
    def having_keywords(strReContent):
        key_words_list = keywords(strReContent)             #get the keywords via TF-IDF and TextRank
        stop_words_list = stopWords()                       #get the stop words
        keywords_del_stop_list = delStopWords(key_words_list, stop_words_list)   #keyword list with the stop words removed
        return keywords_del_stop_list
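    # Pipeline sketch tying the helpers together (the file name is illustrative):
    #   content = ReadFile('some_article.txt')
    #   print ",".join(having_keywords(content))
    #   print ",".join(having_del_keywords_and_stop_high_frequency_word(content))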
    
    
    
    
    # ------------------------------------------------------------------------------------------------------------------------#
    if __name__ == '__main__':
    
        #the input file here is expected to be utf-8 or gbk; ascii is not supported yet.
     
        #get keywords by merging the TF-IDF and TextRank results, then remove stop words
        strReContent = ReadFile('DIPS-LY06-15339.txt')                   #the Chinese document as a string with whitespace removed   
        # print obtainTextType(strReContent)
    
        key_words_list = keywords(strReContent)             #get the keywords via TF-IDF and TextRank
        stop_words_list = stopWords()                       #get the stop words
        keywords_del_stop_list = delStopWords(key_words_list, stop_words_list)   #keyword list with the stop words removed
        print 
        print 'keywords_del_stop_list'
        for word in keywords_del_stop_list:
            print word
        print   
        
        # get the high-frequency words: set a threshold, keep the frequent words, remove the merged keyword/stop-word list, and keep what is left.
        words = divide_text_words(strReContent)                                         #segment the text
        high_frequency_vocabulary = having_high_frequency_vocabulary(words, 0.333)      #count frequencies and keep the upper 2/3 of the frequency levels
        stop_words_list = stop_words_list + key_words_list                              #merge the keywords and stop words into one stop list
        # for i in range(0, len(stop_words_list)):
        #     stop_words_list[i] = stop_words_list[i].encode('utf8')
    
        # for i in range(0, len(high_frequency_vocabulary)):
        #     high_frequency_vocabulary[i] = high_frequency_vocabulary[i].encode('utf8')
    
        # print 'high_frequency_vocabulary'
        # for word in high_frequency_vocabulary:
        #     print word
        # print
        high_frequency_delstopWords_list = delStopWords(high_frequency_vocabulary, stop_words_list)   #word list with the merged stop list removed
    
        # print type(stop_words_list[1])
        # print type(high_frequency_vocabulary[1])
        # print 
    
        # high_frequency_delstopWords_list = []
        # for word in high_frequency_vocabulary:
        #     print '111111', word, type(word)
        #     # word = (word).decode('utf8')
        #     if word in stop_words_list:
        #         print 'continue'
        #         continue
        #     else:
        #         high_frequency_delstopWords_list.append(word) 
    
    
        #     # print divideSentence.obtainTextType(stop_words_list[1])
    
    
        print 
        print 'high_frequency_delstopWords_list'
        for word in high_frequency_delstopWords_list:
            print word
    
    
        print 
        print