  • gensim natural language processing

    Reference code
    ChineseClean_demo1.py:
    # -*- coding:utf-8 -*-
    import xlrd
    import xlwt
    '''
    python3.4
    '''
    # file is the source file name; edit this as needed
    file="./data/answer_detail_5_15307860968687.xls"
    dirs="./result"
     
    def read_excel(rows_numb,cols_numb):
     
        f = xlwt.Workbook() # create the output workbook
        '''
        Create the output sheets:
        sheet1_1 and sheet1_2
        '''
        sheet1 = f.add_sheet(u'sheet1_1',cell_overwrite_ok=True) # create sheet
        sheet2 = f.add_sheet(u'sheet1_2',cell_overwrite_ok=True) # create sheet
        row0 = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID']
     
        # open the source workbook
        workbook = xlrd.open_workbook(file)
        sheet0 = workbook.sheet_by_index(0) # sheet index starts at 0
        cols = sheet0.col_values(cols_numb)
        rows_list_1=[]
        rows_list_2=[]
        for i in range(1,len(cols)):
            if cols[i] == '0':
                rows_list_1.append(i)
            else:
                rows_list_2.append(i)
     
        for i in range(0,len(row0)):
            sheet1.write(0,i,row0[i])
            sheet2.write(0,i,row0[i])
        '''
        sheet1_1 stores the zero-score rows
        '''
        for j in range(0,len(rows_list_1)):
            rows = sheet0.row_values(rows_list_1[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet1.write(j+1,i,rows[i])
        '''
        sheet1_2 stores the non-zero-score rows
        '''
        for j in range(0,len(rows_list_2)):
            rows = sheet0.row_values(rows_list_2[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet2.write(j+1,i,rows[i])
     
        f.save('./data/demo1.xls') # save the output file
     
     
    if __name__ == '__main__':
        # row and column indices to read
        rows_numb=0
        cols_numb=6
        read_excel(rows_numb,cols_numb)
     
    ChineseClean_demo2.py:
    # -*- coding:utf-8 -*-
    import xlrd
    import xlwt
    '''
    python3.4
      
    '''
    # file is the source file name; edit this as needed
    file="./data/demo1.xls"
      
     
    def read_excel(rows_numb,cols_numb):
     
        f = xlwt.Workbook() # create the output workbook
 
        '''
        Create the output sheets
        '''
        sheet1 = f.add_sheet(u'sheet2_1',cell_overwrite_ok=True) # create sheet
        sheet2 = f.add_sheet(u'sheet2_2',cell_overwrite_ok=True) # create sheet
        sheet3 = f.add_sheet(u'sheet2_3',cell_overwrite_ok=True) # create sheet
        sheet4 = f.add_sheet(u'sheet2_4',cell_overwrite_ok=True) # create sheet
        row0 = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID']
     
        for i in range(0,len(row0)):
            sheet1.write(0,i,row0[i])
            sheet2.write(0,i,row0[i])
            sheet3.write(0,i,row0[i])
            sheet4.write(0,i,row0[i])
     
     
        # open the source workbook
        workbook = xlrd.open_workbook(file)
        sheet0 = workbook.sheet_by_index(0) # sheet index starts at 0
        cols = sheet0.col_values(cols_numb) # get the column contents
        rows_list_1=[]
        rows_list_2=[]
        rows_list_3=[]
        rows_list_4=[]  
        for i in range(1,len(cols)):
     
            if float(cols[i]) < 12.0:
                rows_list_1.append(i)
            if float(cols[i]) >= 12.0 and float(cols[i]) < 16.0:
                rows_list_2.append(i)
            if float(cols[i]) >= 16.0 and float(cols[i]) < 18.0:
                rows_list_3.append(i)
            if float(cols[i]) >= 18.0:
                rows_list_4.append(i)
     
        '''
        sheet2_1 stores "poor": score below 12
        '''
        for j in range(0,len(rows_list_1)):
            rows = sheet0.row_values(rows_list_1[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet1.write(j+1,i,rows[i])
        '''
        sheet2_2 stores "fair": score >= 12 and < 16
        '''
 
        for j in range(0,len(rows_list_2)):
            rows = sheet0.row_values(rows_list_2[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet2.write(j+1,i,rows[i])
     
        '''
        sheet2_3 stores "good": score >= 16 and < 18
        '''
        for j in range(0,len(rows_list_3)):
            rows = sheet0.row_values(rows_list_3[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet3.write(j+1,i,rows[i])
        '''
        sheet2_4 stores "excellent": score >= 18
        '''
 
        for j in range(0,len(rows_list_4)):
            rows = sheet0.row_values(rows_list_4[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet4.write(j+1,i,rows[i])
     
        f.save('./data/demo2.xls')
     
    if __name__ == '__main__':
        # row and column indices to read
        rows_numb=0
        cols_numb=6
        read_excel(rows_numb,cols_numb)
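    The four score bands above can also be expressed as a single if/elif chain. A condensed sketch of the same banding rule (thresholds 12/16/18 as in the script; the sample scores are made up):

    # Equivalent banding logic as an if/elif chain (same thresholds as above).
    def band_index(score):
        score = float(score)
        if score < 12.0:
            return 0   # "poor"      -> sheet2_1
        elif score < 16.0:
            return 1   # "fair"      -> sheet2_2
        elif score < 18.0:
            return 2   # "good"      -> sheet2_3
        return 3       # "excellent" -> sheet2_4
 
    # Quick check with a few hypothetical scores.
    for s in [5, 12, 15.5, 16, 17.9, 18, 20]:
        print(s, band_index(s))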
    ChineseClean_demo3.py:
    # -*- coding:utf-8 -*-
    import xlrd
    import xlwt
    '''
    python3.4
      
    '''
    file="./data/answer_detail_5_15307860968687.xls"
     
    def read_excel(rows_numb,cols_numb):
     
        f = xlwt.Workbook() # create the output workbook
 
        '''
        Create the output sheets, one per AbilityID
        '''
        sheet1 = f.add_sheet(u'sheet1',cell_overwrite_ok=True) # create sheet
        sheet2 = f.add_sheet(u'sheet2',cell_overwrite_ok=True) # create sheet
        sheet3 = f.add_sheet(u'sheet3',cell_overwrite_ok=True) # create sheet
        sheet4 = f.add_sheet(u'sheet4',cell_overwrite_ok=True) # create sheet
        sheet5 = f.add_sheet(u'sheet5',cell_overwrite_ok=True)
        row0 = [u'UserNo',u'Name',u'Question',u'Answer',u'Layer',u'Mark',u'Score',u'AbilityID']
     
        for i in range(0,len(row0)):
            sheet1.write(0,i,row0[i])
            sheet2.write(0,i,row0[i])
            sheet3.write(0,i,row0[i])
            sheet4.write(0,i,row0[i])
            sheet5.write(0,i,row0[i])
     
        # open the source workbook
        workbook = xlrd.open_workbook(file)
        sheet0 = workbook.sheet_by_index(0) # sheet index starts at 0
        cols = sheet0.col_values(cols_numb) # get the column contents
        rows_list_1=[]
        rows_list_2=[]
        rows_list_3=[]
        rows_list_4=[]
        rows_list_5=[]  
        for i in range(1,len(cols)):
     
            if cols[i] == '100012':
                rows_list_1.append(i)
            if cols[i] == '100014':
                rows_list_2.append(i)
            if cols[i] == '100007':
                rows_list_3.append(i)
            if cols[i] == '100016':
                rows_list_4.append(i)
            if cols[i] == '100017':
                rows_list_5.append(i)
        '''
        sheet1 stores rows with AbilityID 100012
        '''
        for j in range(0,len(rows_list_1)):
            rows = sheet0.row_values(rows_list_1[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet1.write(j+1,i,rows[i])
        '''
        sheet2 stores rows with AbilityID 100014
        '''
 
        for j in range(0,len(rows_list_2)):
            rows = sheet0.row_values(rows_list_2[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet2.write(j+1,i,rows[i])
     
        '''
        sheet3 stores rows with AbilityID 100007
        '''
        for j in range(0,len(rows_list_3)):
            rows = sheet0.row_values(rows_list_3[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet3.write(j+1,i,rows[i])
        '''
        sheet4 stores rows with AbilityID 100016
        '''
        for j in range(0,len(rows_list_4)):
            rows = sheet0.row_values(rows_list_4[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet4.write(j+1,i,rows[i])
     
        '''
        sheet5 stores rows with AbilityID 100017
        '''
        for j in range(0,len(rows_list_5)):
            rows = sheet0.row_values(rows_list_5[j]) # get the row contents
            for i in range(0,len(rows)):
                sheet5.write(j+1,i,rows[i])
     
        f.save('./data/demo3.xls') # save the output file
     
    if __name__ == '__main__':
        # row and column indices to read
        rows_numb=0
        cols_numb=7
        read_excel(rows_numb,cols_numb)
    ChineseClean_demo4or5.py:
    Same as ChineseClean_demo3.py
    ChineseClean_answer_QA.py:
    # -*- coding:utf-8 -*-
    import re
    import xlrd
    file="./data/demo5.xls"
    dirs="./result"
     
    def read_excel(rows_numb,cols1_numb):
        number='1'
        f2 = open(dirs+'/demo5_sheet1_%s.csv'%number, 'a', encoding='utf-8')
        # open the source workbook
        workbook = xlrd.open_workbook(file)
        sheet0 = workbook.sheet_by_index(int(number)-1) # sheet index starts at 0
        cols1 = sheet0.col_values(cols1_numb[3])[1:] # get the Answer column, skipping the header row
 
        # keep runs of CJK characters, curly quotes, hyphen and colon; drop everything else
        p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
        pattern1 = re.compile(p1)
        for i in range(len(cols1)):
            matcher1 = re.findall(pattern1, cols1[i])
            str1=str()
            if matcher1:
                str1 = ' '.join(matcher1)
                f2.write(str1)
            f2.write('\n')
     
        f2.close()
     
    if __name__ == '__main__':
        # row and column indices to read
        rows_numb=0
        cols1_numb=[0,1,2,3,4,5,6,7]
        read_excel(rows_numb,cols1_numb)
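    The cleaning step above keeps only runs of CJK characters (\u2E80-\uFFFD), curly quotes, hyphens and colons. A minimal sketch of what the pattern does to a noisy cell value (the sample string is made up):

    # -*- coding:utf-8 -*-
    import re
 
    p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
    pattern1 = re.compile(p1)
 
    sample = 'abc123 我认为:团队合作-沟通 <b>&nbsp;</b> “很重要” xyz'  # hypothetical noisy cell
    pieces = re.findall(pattern1, sample)
    print(pieces)            # the kept runs of characters
    print(' '.join(pieces))  # what ChineseClean_answer_QA.py writes for one row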
     
    qa_test_clean_word.py:
    # -*- coding: utf-8 -*-
     
    import jieba
    # build the stopword list
    def stopwordslist(filepath):
        stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
        return stopwords
      
      
    # segment a sentence with jieba and drop stopwords
    def seg_sentence(sentence):
        sentence_seged = jieba.lcut_for_search(sentence.strip(),HMM=True)
        stopwords = stopwordslist('./test/stopwords.txt')  # path to the stopword file
        outstr = ''
        for word in sentence_seged:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        return(outstr)
     
    inputs = open('./data/demo5_answer_csv/demo5_sheet5_5.csv', 'r', encoding='utf-8')
    outputs = open('./test/demo5_sheet5_5_5.csv', 'w', encoding='utf-8')
    for line in inputs:
        line_seg = seg_sentence(line) 
        try:
            if len(line_seg):
                outputs.write(line_seg + '\n')
        except:
            pass
         
    outputs.close()
    inputs.close()
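    The stopword file is assumed to be UTF-8 with one word per line. A small sketch of the same segmentation step with an inline stopword set instead of the file (sentence and stopwords are made up):

    # Sketch: jieba search-mode segmentation plus stopword filtering.
    import jieba
 
    stopwords = {'的', '和', '都'}                 # hypothetical stopwords
    demo = "我认为团队合作和沟通能力都很重要"      # hypothetical answer text
    tokens = jieba.lcut_for_search(demo.strip(), HMM=True)
    kept = [w for w in tokens if w not in stopwords and w != '\t']
    print(' '.join(kept))  # space-joined tokens, the same shape as seg_sentence output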
    word_fre.py:
    # -*- coding: utf-8 -*-
     
     
    import matplotlib.pyplot as plt
    from matplotlib.font_manager import *
    import numpy as np
     
    def drawStatBarh():
        '''
        Draw a horizontal bar chart of the top-N word frequencies, with a color gradient.
        '''
        fig, ax = plt.subplots()
        myfont = FontProperties(fname='./data/simfang.ttf')
        N = 30
        words = []
        counts = []
        for line in open('./data/word_fre.txt'):
            if line == '\n':
                continue
            line = line.strip('\n')
 
            words.append(line.split(' ')[0])
            counts.append(int(line.split(' ')[1].strip('\n')))
     
        y_pos = np.arange(N)
     
        colors = ['#FA8072'] # base color used to derive a simple gradient for the bars
        for i in range(len(words[:N]) - 1):
            colors.append('#FA' + str(int(colors[-1][3:]) - 1))
     
        rects = ax.barh(y_pos, counts[:N], align='center', color=colors)
     
        ax.set_yticks(np.arange(N))
        ax.set_yticklabels(words[:N],fontproperties=myfont)
        ax.invert_yaxis()  # labels read top-to-bottom
        ax.set_title('报告中的高频词汇',fontproperties=myfont, fontsize=17)
        ax.set_xlabel(u"出现次数",fontproperties=myfont)
     
        autolabel(rects, ax)
        plt.show()
     
     
    def autolabel(rects, ax):
        """
        给条形图加上文字标签
        """
        #fig, ax = plt.subplots()
        for rect in rects:
            width = rect.get_width()
            ax.text(1.03 * width, rect.get_y() + rect.get_height()/2., 
                '%d' % int(width),ha='center', va='center')
     
     
    def wordCount(segment_list):
        '''
        Count word frequencies and save the result to disk.
        Not needed for the word cloud itself; only used when drawing the frequency bar chart.
        '''
        word_lst = []
        word_dict = {}
        with open('./data/word_fre.txt','w') as wf2:
            word_lst.append(segment_list.split(' '))
            for item in word_lst:
                for item2 in item:
                    if item2 not in word_dict:
                        word_dict[item2] = 1
                    else:
                        word_dict[item2] += 1
            # print(type(word_dict))
            # print(word_dict)
            word_dict_sorted = list(sorted(word_dict.items(), key=lambda jj: jj[1], reverse=True)) # list() matters here; sort by frequency, descending
            # word_dict_sorted = dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True)) # sort by frequency, descending
            print(word_dict_sorted)
            # exit()
            for tup in word_dict_sorted:
                # print(type(tup))
                # print(tup)
                # exit()
                if tup[0] != '':
                    wf2.write(tup[0].strip('\n')+' '+str(tup[1])+'\n')
        wf2.close()
     
     
     
     
    if __name__ == "__main__":
        segment_list_remove_stopwords=open('./data/demo5_sheet5_1_1.csv').read()
        wordCount(segment_list_remove_stopwords)
        drawStatBarh()
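    For reference, the same word_fre.txt file can be produced more compactly with collections.Counter from the standard library; this is an alternative sketch, not the script's own method (it assumes the same space-separated input written by qa_test_clean_word.py):

    # Alternative frequency count using collections.Counter
    # (same output format: "word count" per line, sorted by frequency, descending).
    from collections import Counter
 
    text = open('./data/demo5_sheet5_1_1.csv', encoding='utf-8').read()
    freq = Counter(w for w in text.split() if w.strip())
    with open('./data/word_fre.txt', 'w', encoding='utf-8') as wf:
        for word, count in freq.most_common():
            wf.write('%s %d\n' % (word, count))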
    wordcloud_test2.py:
    # -*- coding: utf-8 -*-
     
    from os import path
    from scipy.misc import imread  # removed in SciPy >= 1.2; on newer installs use imageio.imread instead
    import matplotlib.pyplot as plt
    import jieba
    # jieba.load_userdict("txtuserdict.txt")
    # load a user dictionary as the main dictionary (the default dictionary becomes secondary)
    from wordcloud import WordCloud, ImageColorGenerator
     
    # get the path of the current file
    # __file__ is this file; running this line directly in an IDE/console may fail, in which case use
    # d = path.dirname('.')
    d = path.dirname(__file__)
 
    stopwords = {}
    isCN = 1 # enable Chinese segmentation by default
    back_coloring_path = "data/lz1.jpg" # background (mask) image path
    text_path = 'data/demo5_sheet5_1_1.csv' # text file to analyse (convert the source file to ANSI encoding if needed)
    font_path = 'data/simfang.ttf' # Chinese font path for matplotlib
    stopwords_path = 'data/stopwords.txt' # stopword list
    imgname1 = "data/WordCloudDefautColors.png" # output image 1 (shaped by the background image only)
    imgname2 = "data/WordCloudColorsByImg.png" # output image 2 (colors taken from the background image)
     
    # my_words_list = ['CHENGLEI'] # new words to add to jieba's dictionary
 
    back_coloring = imread(path.join(d, back_coloring_path)) # load the background/mask image
 
    # word cloud settings
    wc = WordCloud(font_path=font_path,  # font
                   background_color="white",  # background color
                   max_words=2000,  # maximum number of words shown
                   mask=back_coloring,  # mask image
                   max_font_size=100,  # maximum font size
                   random_state=42,
                   width=1000, height=860, margin=2, # default canvas size; with a mask the saved image follows the mask's size; margin is the spacing around words
                   )
     
    # add your own dictionary words before segmenting
    # def add_word(list):
    #     for items in list:
    #         jieba.add_word(items)
     
    # add_word(my_words_list)
     
    text = open(path.join(d, text_path)).read()
     
    # def jiebaclearText(text):
    #     mywordlist = []
    #     seg_list = jieba.cut(text, cut_all=False)
    #     liststr="/ ".join(seg_list)
    #     f_stop = open(stopwords_path)
    #     try:
    #         f_stop_text = f_stop.read( )
    #         f_stop_text=unicode(f_stop_text,'utf-8')
    #     finally:
    #         f_stop.close( )
    #     f_stop_seg_list=f_stop_text.split('\n')
    #     for myword in liststr.split('/'):
    #         if not(myword.strip() in f_stop_seg_list) and len(myword.strip())>1:
    #             mywordlist.append(myword)
    #     return ''.join(mywordlist)
    #
    # if isCN:
    #     text = jiebaclearText(text)
     
    # Generate the word cloud. generate() can take the raw text (wordcloud's own Chinese handling is poor, so enabling jieba segmentation is recommended); alternatively compute word frequencies first and use generate_from_frequencies()
    wc.generate(text)
    # wc.generate_from_frequencies(text)
    # example frequency data (txt_freq): [('词a', 100), ('词b', 90), ('词c', 80)]
    # generate colors from the background image
    image_colors = ImageColorGenerator(back_coloring)
     
    plt.figure()
    # show the image
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    # draw the word cloud
     
    # save the image
    wc.to_file(path.join(d, imgname1))
     
    image_colors = ImageColorGenerator(back_coloring)
     
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    # show the background image itself in grayscale
    plt.figure()
    plt.imshow(back_coloring, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
    # save the image
    wc.to_file(path.join(d, imgname2))
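    The generate_from_frequencies path mentioned in the comments above can be fleshed out like this; a hedged sketch, with the jieba-based counting and the output file name being assumptions rather than part of the original script:

    # Sketch: count words with jieba + Counter, then feed the dict of
    # word -> count to WordCloud.generate_from_frequencies.
    from collections import Counter
    import jieba
    from wordcloud import WordCloud
 
    raw = open('data/demo5_sheet5_1_1.csv', encoding='utf-8').read()
    freq = Counter(w for w in jieba.cut(raw) if len(w.strip()) > 1)
 
    wc2 = WordCloud(font_path='data/simfang.ttf', background_color="white", max_words=2000)
    wc2.generate_from_frequencies(dict(freq))
    wc2.to_file('data/WordCloudFromFreq.png')  # hypothetical output name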
     
    lda_test_ok.py:
    # coding=utf-8        
     
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.feature_extraction.text import CountVectorizer
    import lda
     
    def doc_topic_word():
        print(doc_topic[:, :3])  # document-topic distribution (first 3 columns)
        print(topic_word[:, :3]) # topic-word distribution (first 3 columns)
     
    # plot the distributions
    def plot_1():
        # word weight distribution within two selected topics
        f, ax= plt.subplots(2, 1, figsize=(6, 6), sharex=True) 
        for i, k in enumerate([0, 9]):         # pick any two topics
            ax[i].stem(topic_word[k,:], linefmt='b-', 
                       markerfmt='bo', basefmt='w-') 
            ax[i].set_xlim(-2,2000) 
            ax[i].set_ylim(0, 1) 
            ax[i].set_ylabel("Prob") 
            ax[i].set_title("topic {}".format(k)) 
           
        ax[1].set_xlabel("word")
        plt.tight_layout() 
        plt.show()
     
    def plot_2():
        # which topics each selected document is distributed over
        f, ax= plt.subplots(2, 1, figsize=(8, 8), sharex=True) 
        for i, k in enumerate([0,9]):  # pick any two documents
            ax[i].stem(doc_topic[k,:], linefmt='r-', 
                       markerfmt='ro', basefmt='w-') 
            ax[i].set_xlim(-1, 20)     # x range: topic indices
            ax[i].set_ylim(0, 1.2)    # y range
            ax[i].set_ylabel("Prob") 
            ax[i].set_title("Document {}".format(k)) 
        ax[1].set_xlabel("Topic")
        plt.tight_layout()
        plt.show() 
     
      
    if __name__ == "__main__":
      
      
        # read the corpus: one line per document
        corpus = []
        for line in open('./data/demo5_sheet5_1_1.csv', 'r').readlines():
            corpus.append(line.strip())
     
        # build a term-frequency matrix: element a[i][j] is the count of word j in document i
        vectorizer = CountVectorizer()
        print (vectorizer)
     
        X = vectorizer.fit_transform(corpus)
        analyze = vectorizer.build_analyzer()
        weight = X.toarray()
        print("type(X): {}".format(type(X))) 
        print("shape: {}
    ".format(X.shape))
        print (len(weight))
        print (weight[:5, :5])
     
        # LDA
        print ('LDA:')
        model = lda.LDA(n_topics=20, n_iter=50, random_state=1)
        # model.fit_transform(X)
        model.fit(np.asarray(weight))     # model.fit_transform(X) is also available?
        topic_word = model.topic_word_    # model.components_ also works
     
        # document-topic distribution
        doc_topic = model.doc_topic_
        print("type(doc_topic): {}".format(type(doc_topic)))
        print("shape: {}".format(doc_topic.shape))
     
        # print the most likely topic for the first 10 documents
        label = []     
        for n in range(10):
            topic_most_pr = doc_topic[n].argmax()
            label.append(topic_most_pr)
            print("doc: {} topic: {}".format(n, topic_most_pr))
     
        # print the top-N keywords for each topic
        word = vectorizer.get_feature_names()
        n = 6 
        for i, topic_dist in enumerate(topic_word): 
            topic_words = np.array(word)[np.argsort(topic_dist)][:-(n+1):-1] 
            print(u'*Topic {}\n- {}'.format(i, ' '.join(topic_words))) 
             
        # doc_topic_word()
        # plot_1()
        plot_2()
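    To tie the two outputs together, a few lines like these could be appended at the end of the __main__ block above; a sketch reusing doc_topic, topic_word and word as defined in this script:

        # Follow-up sketch: for document 0, report its dominant topic and that
        # topic's top words (reuses doc_topic, topic_word and word from above).
        doc_id = 0
        best_topic = doc_topic[doc_id].argmax()
        top_words = np.array(word)[np.argsort(topic_word[best_topic])][:-7:-1]
        print("doc {} -> topic {} ({:.2f}): {}".format(
            doc_id, best_topic, doc_topic[doc_id][best_topic], ' '.join(top_words)))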
     
    gensimTopicTest0803.py:
    # coding=utf-8        
    import re
    import xlrd
    import codecs
    import jieba
    from gensim import corpora, models, similarities
     
    FILE="demo5"#选择要训练的文件
    ID='1'#选择要训练的能力ID
     
    # 读取停用词表
    stopwords = [line.strip() for line in codecs.open('./data/stopwords.txt', 'r', encoding='utf-8').readlines()]
     
    def cleanAnswer(cols_numb):
     
        f1 = open('./result/%s_sheet%s.csv'%(FILE,ID), 'a', encoding='utf-8')
        # open the source workbook
        workbook = xlrd.open_workbook('./data/%s.xls'%FILE)
        # get the sheet by index (or by name)
        sheet0 = workbook.sheet_by_index(int(ID)-1) # sheet index starts at 0
        cols1 = sheet0.col_values(cols_numb[3])[1:] # get the Answer column, skipping the header row
 
        # non-capturing group; keep runs of CJK characters, curly quotes, hyphen and colon
        p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+"
        pattern1 = re.compile(p1)
        for i in range(len(cols1)):
            matcher1 = re.findall(pattern1, cols1[i]) # return all matching substrings as a list
            str1=str()
            if matcher1:
                str1 =''.join(matcher1)
                f1.write(str1.strip())
            f1.write('\n')
        f1.close()
     
    def ldaAnaly():
     
     
        print("构造分词库-----train-----")
        #去停用词,构建分词库
        train = []
        fp = codecs.open('./result/%s_sheet%s.csv'%(FILE,ID),'r',encoding='utf8')
        for line in fp.readlines():
            line = line.strip()
            if not len(line): # skip empty lines
                continue
            outstr = ' '
            seg_list =jieba.cut(line,cut_all=False) # precise-mode segmentation works best here
            for word in seg_list:
                if word not in stopwords:
                    if word != '\t':
                        outstr += word
                        outstr += " "
            train.append(outstr.strip().split(" ")) # string to token list
        fp.close()
     
        print("构造分词库,并保存----“dict_v1.dict”----")
        dic = corpora.Dictionary(train)
        dic.save('./result/dict_v1.dict')
     
        print("保存可读取的分词库----“dic.csv”----")
        fd = codecs.open('./result/dic.csv', 'a',encoding = 'utf-8')
        for word,index in dic.token2id.items():
            fd.write(word +':'+ str(index)+'\n')
        fd.close()
     
        print("生成语料库,并保存-----“corpus.mm”-----")
        corpus = [dic.doc2bow(text) for text in train]
        corpora.MmCorpus.serialize('./result/corpus.mm', corpus)
     
        print("保存tfidf模型-----“corpus.tfidf_model”-----")
        tfidf = models.TfidfModel(corpus)
        tfidf.save('./result/corpus.tfidf_model')
     
        print("进行LDA主题分析,并保存-----“ldaModel.pkl”-----")
        #使用tf-idf模型训练语料库
        corpus_tfidf = tfidf[corpus]
        #设置100个LDA主题,使用500次迭代
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=100, iterations=500 )
        lda.save('./result/ldaModel.pkl')
     
        print("评估文章属于不同主题的概率,一个词对文章的重要性-----“Demo:评估文章1”-----")
        for index, score in sorted(lda[corpus_tfidf[0]], key=lambda tup: -1 * tup[1]):
            print("Score: {}	 Topic: {}".format(score, lda.print_topic(index, 10)))
     
        # print the 100 topics
        # ldaOut = lda.print_topics(100)
        # print("by default, the 10 highest-probability words per topic are returned")
        # print (ldaOut[0])
        # print (ldaOut[1])
        # print (ldaOut[2])
        # corpus_lda = lda[corpus_tfidf]
        # print("per-document topic probability distribution")
        # k = 0
        # for doc in corpus_lda:
        #     print(doc)
        #     k += 1
        #     if k == 3:
        #         break
     
     
    def questionAnswer(cols_numb, questionNumber):
        lda = models.LdaModel.load('./result/ldaModel.pkl')
        dic = corpora.Dictionary.load('./result/dict_v1.dict')
        corpus = corpora.MmCorpus('./result/corpus.mm')
        tfidf = models.TfidfModel.load('./result/corpus.tfidf_model')
     
        # print("输入一个问题------------------")
        f1 = open('./result/%s_sheet%s.csv'%(FILE,ID), 'a', encoding='utf-8')
        # open the source workbook
        workbook = xlrd.open_workbook('./data/%s.xls'%FILE)
        sheet0 = workbook.sheet_by_index(int(ID)-1) # sheet index starts at 0
        cols0 = sheet0.col_values(cols_numb[3])[questionNumber]  # Answer cell for the chosen question row
     
        # strip garbled characters from the question
        p1 = r"(?:[\u2E80-\uFFFD]|[\u201c-\u201d]|[\u002d]|[\u003a])+" # non-capturing group; keep CJK characters, curly quotes, hyphen and colon
        pattern1 = re.compile(p1)
        matcher1 = re.findall(pattern1, cols0) # return all matching substrings as a list
        query=str()
        if matcher1:
            query =''.join(matcher1)
        # print("待预测的问题(去乱码):", query)
     
        # segment the question
        seg_list = jieba.cut(query, cut_all=False)
        outstr = ' '
        for word in seg_list:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        inputTest=list(outstr.strip().split(" "))
        # print("Segmented question (stopwords removed):", inputTest)
     
        # convert the question to a bag of words
        query_bow = dic.doc2bow(inputTest)
        # print("Bag of words:", query_bow)
 
        # the query also needs the tfidf transform
        query_tfidf = tfidf[query_bow]
        lda_vec_tfidf = lda[query_tfidf]
        # print("Topic probabilities for the question (tfidf):", lda_vec_tfidf)
 
        # print("Probabilities of the question belonging to each topic --------------------")
        # code to print the topic probabilities
        # for index, score in sorted(lda_vec_tfidf, key=lambda tup: -1 * tup[1]):
        #     print("Score: {}\t Topic: {}".format(score, lda.print_topic(index, 20)))
     
     
        # print("预测问题与数据库中的哪些问题相似,并给出相似度排序(tfidf)--------------------")
        #进行相似性检索
        similarity = similarities.MatrixSimilarity(corpus)
     
        #在TFIDF的基础上,进行相似性检测。query_lsi需要进行预先处理。先变化为dow2bow,然后tfidf.
        lda_vec = lda[query_bow]
     
        # sims = similarity[lda_vec] #相似度检测的词袋为no-tfidf
        sims = similarity[lda_vec_tfidf] #相似度检测的词袋为tfidf
     
        #先枚举出来,后进行排序输出
        listSims = enumerate(sims)
        sort_sims = sorted(listSims, key=lambda item: -item[1])
        # print(sort_sims[0:6])#前n名效果最好
     
     
        # score prediction -- version 1 --
        sort_sims_list = sort_sims[0:6]
        cols1 = sheet0.col_values(cols_numb[6])[1:] # get the Score column, skipping the header row
        f1.close()
 
        # similarity-weighted average of the neighbours' scores
        sumCore1 = 0
        sumPro = 0
        for i in range(len(sort_sims_list)):
            sumCore1 += float(cols1[sort_sims_list[i][0] - 1]) * sort_sims_list[i][1]
            # print(cols1[sort_sims_list[i][0] - 1])
            sumPro += sort_sims_list[i][1]
     
        preCore1 = sumCore1 / sumPro
        # print("采用加权平均偏差法,预测分数1为:%s,实际分数为%s"%(preCore1, cols1[questionNumber-1]))
     
        print("保存预测结果----“pre.csv”----")
        return preCore1, cols1[questionNumber-1], abs(preCore1 - float(cols1[questionNumber-1]))
     
    if __name__ == '__main__':
     
        cols_numb = [0,1,2,3,4,5,6,7] # column indices to read
        # questionNumber = 124 # question number to test (at most the total number of questions); mainly for testing
        # cleanAnswer(cols_numb) # extract the answer texts from the workbook and strip garbled characters
        # ldaAnaly() # train on the answers and build the topic model
        # questionAnswer(cols_numb, questionNumber) # predict a score for one question
 
        # demo: loop over questions and predict
        fp = codecs.open('./result/pre_v1.csv', 'a', encoding='utf-8')
        sum = 0
        i = 1
        count = 0
        while( i < 8717 ):
            questionNumber = i
            a = questionAnswer(cols_numb, questionNumber)
            sum += a[2]
            # print(a, a[2])
            # exit()
            i += 8
            count += 1
            fp.write(str(i)+":"+str(a) + '\n')
        fp.close()
     
        ave = sum / count
        print(ave)
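    Once ldaAnaly() has saved its artifacts, they can be reused outside the Excel workflow. A minimal hedged sketch that mirrors the doc2bow -> tfidf -> lda steps of questionAnswer() on a made-up query string:

    # Sketch: load the saved model files and get topic probabilities for new text.
    import jieba
    from gensim import corpora, models
 
    dic = corpora.Dictionary.load('./result/dict_v1.dict')
    tfidf = models.TfidfModel.load('./result/corpus.tfidf_model')
    lda = models.LdaModel.load('./result/ldaModel.pkl')
 
    query = "我认为团队合作和沟通能力很重要"        # hypothetical answer text
    tokens = [w for w in jieba.cut(query, cut_all=False) if w.strip()]
    bow = dic.doc2bow(tokens)                      # bag of words
    topics = lda[tfidf[bow]]                       # (topic id, probability) pairs
    print(sorted(topics, key=lambda t: -t[1])[:5]) # top topics for the query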
    

      

  • Original post: https://www.cnblogs.com/smuxiaolei/p/9444271.html