  • obtainKeywords

    #!/usr/bin/python
    #coding=utf-8
    
    import urllib2
    import sys, time, re
    import chardet 
    import jieba
    jieba.load_userdict("userdict.txt")
    import jieba.analyse
    import jieba.posseg as pseg
    import os
    jieba.initialize()
    import operator
    reload(sys)
    sys.setdefaultencoding('utf8')
    import divideSentence
    
    
    # Detect the encoding of a text (byte string)
    def obtainTextType(ff):
        # import chardet
        enc = chardet.detect(ff)
        return enc['encoding']  # return the detected encoding name

    # Detect the encoding of a file
    def obtainFileType(filepath):
        # import chardet
        tt = open(filepath, 'rb')
        ff = tt.readline()        # read(5) also works here, but readlines() raises an error
        tt.close()
        return obtainTextType(ff)  # return the detected encoding name
    
    # Read a file; return its content as a string with all whitespace removed
    def ReadFile(url):      # url: path to the file
        # print obtainFileType(url)
        if obtainFileType(url) == 'GB2312':
            # read the bytes as GBK (the file's actual encoding) and re-encode to UTF-8 via .decode("gbk").encode('utf-8')
            content = open(url, "rb").read().decode("gbk").encode('utf-8')
            # print obtainTextType(content)
    
        elif obtainFileType(url) == 'ascii':
            content = open(url, "rb").read().encode('utf-8')
            # print obtainTextType(content)
    
        else:
            # print obtainFileType(url)
            content = open(url, "rb").read()
            # print obtainTextType(content)
    
        strRe = re.sub(r'\s', '', content)   # strip all whitespace with a regex
        return strRe
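
    # A minimal usage sketch (illustrative; 'sample_gbk.txt' is a hypothetical
    # GBK-encoded file): ReadFile should hand back a UTF-8 byte string with all
    # whitespace stripped, so re-detecting its type yields 'utf-8'.
    #   content = ReadFile('sample_gbk.txt')
    #   print obtainTextType(content)   # expected: 'utf-8'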
    
    
    # Segment a Chinese article into words
    def divide_text_words(content):
        # segment the text; out-of-vocabulary words are resolved with the Viterbi (HMM) algorithm
        words = list(jieba.cut(content, cut_all=False))
        #print "total number of words:", len(words)
        #wordset = sorted(set(words))
        #print "number of distinct words:", len(wordset)
    
        return words
        
        '''
        # write the word list to a file
        list = words
        fl = open('list.txt', 'wb')
        for i in range(len(list)):
            fl.write(list[i].encode('utf-8')+'--')
        fl.close()
        '''
    
    # Load the stop-word table; returns a list of Chinese stop words
    def stopWords():
        # read the stop words from file and return them as a list
        cn_stop_words_file = open("extra_dict/cn_stop_words.txt", "rb").readlines()
        cn_stop_word_list = []      # the stop-word list
        for word in cn_stop_words_file:
            word = re.sub(r'\s', '', word)   # strip all whitespace with a regex
            #print word
            cn_stop_word_list.append(word.decode('utf-8'))
    
        return cn_stop_word_list
    
    # Remove stop words. Parameters: the word list to filter and the stop-word list; returns the filtered list
    def delStopWords(words, stopWords):
        reWords = []
        for word in words:
            if word not in stopWords:
                reWords.append(word)

        return reWords
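
    # A quick illustrative check (hypothetical words, not from any corpus):
    # only the words absent from the stop-word list survive.
    #   delStopWords([u'历史', u'的', u'文化'], [u'的'])  ->  [u'历史', u'文化']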
    
    
    # Extract keywords from the text and return them as a list
    def keywords(content):
        # #TF-IDF
        # jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
        # tf_idf_tags = jieba.analyse.extract_tags(content, topK = 10)
        # # print "TF-IDF without stop-word removal, top 10 keywords"
        # print(",".join(tf_idf_tags))

        # TF-IDF with stop words removed; sample tags: 语言,研究,汉语,中文信息处理,汉字
        jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
        jieba.analyse.set_stop_words("extra_dict/cn_stop_words.txt")
        tf_idf_stop_words_tags = jieba.analyse.extract_tags(content, topK = 10)
        # print type(tf_idf_stop_words_tags)
        # print "TF-IDF with stop words removed"
        # print(",".join(tf_idf_stop_words_tags))

        # TextRank keywords
        # print "TextRank, top 10 keywords"
        #TextRank_words = []
        TextRank_words = jieba.analyse.textrank(content)
        # print type(TextRank_words)
        # print(",".join(TextRank_words))
    
        keywords_list = TextRank_words + tf_idf_stop_words_tags
        keywords = list(set(keywords_list))
        return keywords
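
    # Sketch of the expected behaviour (assuming the extra_dict files above
    # exist): the result is the deduplicated union of the TextRank tags and
    # the top-10 TF-IDF tags, in arbitrary order because of set().
    #   kw = keywords(strReContent)
    #   print ",".join(kw)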
    
    
    
    # Count word frequencies. Parameters: words - the segmented word list to analyse;
    #                         high_frequency_level - high-frequency cut-off; the smaller the value, the more words are kept.
    def having_high_frequency_vocabulary(words, high_frequency_level):
    # count the occurrences of each word in the segmentation result
        
        wordsDict = {}
        DictsMaxWordlen = 0
        signal = ''
        for w in words:
            if wordsDict.get(w) is None:
                wordsDict[w] = 1
            else:
                wordsDict[w] += 1

            if DictsMaxWordlen <= wordsDict[w]:
                DictsMaxWordlen = wordsDict[w]
                # global signal
                signal = w
                #print w

        #print "highest word count: %d" % DictsMaxWordlen, "word: %s" % signal
        # sort by dict value (ascending); returns a list of (word, count) tuples
        sorted_wordsDict = sorted(wordsDict.iteritems(), key=operator.itemgetter(1))
        # print sorted_wordsDict[2][0]
    
    # group the words that share the same occurrence count

        classNumWord = {}       # dict of grouped words, e.g. {1: ['1', '2'], 2: ['文化', '历史'], }
        for w in sorted_wordsDict:
            if w[1] in classNumWord:
                if w[0] not in classNumWord[w[1]]:
                    classNumWord[w[1]].append(w[0])
            else:
                classNumWord[w[1]] = []
                classNumWord[w[1]].append(w[0])

        # sort the grouped dict by key, ascending
        sort_classNumWord = sorted(classNumWord.iteritems(), key=lambda asd: asd[0], reverse=False)
        wordsList = []  # list that collects the selected words

        # choose how many frequency levels enter the statistics
        for num in range(int(len(sort_classNumWord) * high_frequency_level), len(sort_classNumWord)):
            #print sort_classNumWord[num][0]
            wordsList = wordsList + sort_classNumWord[num][1]

        # print "cut-off index", int(len(sort_classNumWord) * high_frequency_level)
        # print len(wordsList)
        return wordsList
        # print type(sort_classNumWord)
        # print type(sort_classNumWord[20])
        # print 'sort_classNumWord[20][1]', sort_classNumWord[20][1]
        # print type(sort_classNumWord[20][1])

        # print sort_classNumWord[20][1][0]
        # print sort_classNumWord[20][1][1]

        # wordslength = 0             # total number of words
        # worldsNum = 0               # number of distinct words or phrases
        # wordsFequencelist = {}      # frequency levels from 1 to N, each mapped to the number of words at that level

        # for w in sort_classNumWord:
        #     worldsNum += w[0]
        #     wordslength += len(w[1]) * w[0]
        #     wordsFequencelist[w[0]] = []
        #     wordsFequencelist[w[0]].append(len(w[1]))

        # sort_wordsFequencelist = sorted(wordsFequencelist.iteritems(), key=lambda asd: asd[0], reverse=False)

        # print 'frequency is the occurrence count; word count is the number of distinct words with that count'
        # lenWords = 0
        # for wordsFequence in sort_wordsFequencelist:
        #     lenWords += 1
        #     print 'frequency:{0:<4} words:{1:>6}'.format(wordsFequence[0], wordsFequence[1]), " ",
        #     if lenWords % 4 == 0:
        #         print

        # print
        # print "in total", worldsNum, 'distinct words or phrases'
        # print "in total", wordslength, 'words or phrases'
    
    
    # Get the high-frequency words, removing only the stop words
    def having_del_stop_high_frequency_word(strReContent):
        stop_words_list = stopWords()                       # get the stop words

        # take the high-frequency words above the threshold and drop the stop words
        words = divide_text_words(strReContent)                                             # segment the text
        high_frequency_vocabulary = having_high_frequency_vocabulary(words, 0.333)          # count frequencies; keep the top 2/3 of the frequency levels
        high_frequency_words = delStopWords(high_frequency_vocabulary, stop_words_list)     # word list after stop-word removal
    
        return high_frequency_words
    
    
    
    # Get the high-frequency words with both keywords and stop words removed. Parameter: strReContent, the text.
    def having_del_keywords_and_stop_high_frequency_word(strReContent):
        key_words_list = keywords(strReContent)             # get keywords via TF-IDF and TextRank
        stop_words_list = stopWords()                       # get the stop words

        # take the high-frequency words above the threshold, then drop the combined keyword + stop-word table
        words = divide_text_words(strReContent)                                          # segment the text
        high_frequency_vocabulary = having_high_frequency_vocabulary(words, 0.333)       # count frequencies; keep the top 2/3 of the frequency levels
        stop_words_list = list(set(stop_words_list + key_words_list))                    # merge keywords and stop words into one filter table
        high_frequency_delstopWords_list = delStopWords(high_frequency_vocabulary, stop_words_list)   # word list after removing the merged table
    
        return high_frequency_delstopWords_list
    
    
    # Get keywords by combining TF-IDF and TextRank, then remove stop words
    def having_keywords(strReContent):
        key_words_list = keywords(strReContent)             # get keywords via TF-IDF and TextRank
        stop_words_list = stopWords()                       # get the stop words
        keywords_del_stop_list = delStopWords(key_words_list, stop_words_list)   # keyword list after stop-word removal
        return keywords_del_stop_list
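
    # End-to-end sketch tying the helpers together (hypothetical input file):
    #   text = ReadFile('sample.txt')
    #   print ",".join(having_keywords(text))                                   # keywords
    #   print ",".join(having_del_keywords_and_stop_high_frequency_word(text))  # remaining high-frequency words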
    
    
    
    
    # ------------------------------------------------------------------------------------------------------------------------#
    if __name__ == '__main__':
    
        # The input files read here are UTF-8 or GBK; ASCII is not supported yet.

        # Get keywords by combining TF-IDF and TextRank, then remove stop words
        strReContent = ReadFile('DIPS-LY06-15339.txt')                   # the Chinese document as a whitespace-stripped string
        # print obtainTextType(strReContent)

        key_words_list = keywords(strReContent)             # get keywords via TF-IDF and TextRank
        stop_words_list = stopWords()                       # get the stop words
        keywords_del_stop_list = delStopWords(key_words_list, stop_words_list)   # keyword list after stop-word removal
        print 
        print 'keywords_del_stop_list'
        for word in keywords_del_stop_list:
            print word
        print   
        
        # Take the high-frequency words above the threshold, then drop the combined keyword + stop-word table
        words = divide_text_words(strReContent)                                         # segment the text
        high_frequency_vocabulary = having_high_frequency_vocabulary(words, 0.333)      # count frequencies; keep the top 2/3 of the frequency levels
        stop_words_list = stop_words_list + key_words_list                              # merge keywords and stop words into one filter table
        # for i in range(0, len(stop_words_list)):
        #     stop_words_list[i] = stop_words_list[i].encode('utf8')
    
        # for i in range(0, len(high_frequency_vocabulary)):
        #     high_frequency_vocabulary[i] = high_frequency_vocabulary[i].encode('utf8')
    
        # print 'high_frequency_vocabulary'
        # for word in high_frequency_vocabulary:
        #     print word
        # print
        high_frequency_delstopWords_list = delStopWords(high_frequency_vocabulary, stop_words_list)   # word list after removing the merged table
    
        # print type(stop_words_list[1])
        # print type(high_frequency_vocabulary[1])
        # print 
    
        # high_frequency_delstopWords_list = []
        # for word in high_frequency_vocabulary:
        #     print '111111', word, type(word)
        #     # word = (word).decode('utf8')
        #     if word in stop_words_list:
        #         print 'continue'
        #         continue
        #     else:
        #         high_frequency_delstopWords_list.append(word) 
    
    
        #     # print divideSentence.obtainTextType(stop_words_list[1])
    
    
        print 
        print 'high_frequency_delstopWords_list'
        for word in high_frequency_delstopWords_list:
            print word
    
    
        print 
        print
  • Original post: https://www.cnblogs.com/hgonlywj/p/4842686.html