1. Word segmentation and keyword extraction with jieba
Use gensim's corpora, models, and similarities modules to build a corpus, train a TF-IDF model, and run sparse-matrix similarity analysis.
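Before the full script, here is a minimal sketch of the gensim flow (Dictionary → doc2bow → TfidfModel → SparseMatrixSimilarity) on made-up token lists; the full script below applies the same steps to files on disk:

from gensim import corpora, models, similarities

# Toy corpus: two already-tokenized documents (tokens are made up for illustration)
docs = [["machine", "learning", "text"], ["text", "similarity", "analysis"]]

dictionary = corpora.Dictionary(docs)            # token -> integer id
corpus = [dictionary.doc2bow(d) for d in docs]   # sparse bag-of-words vectors
tfidf = models.TfidfModel(corpus)                # learn IDF weights from the corpus

index = similarities.SparseMatrixSimilarity(
    tfidf[corpus], num_features=len(dictionary))

query = dictionary.doc2bow(["text", "analysis"])
print(index[tfidf[query]])                       # cosine similarity to each document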
# -*- coding: utf-8 -*-
import jieba
from collections import defaultdict
from gensim import corpora, models, similarities

# File locations
work_dir = "D:/workspace/PythonSdy/data"
f1 = work_dir + "/t1.txt"
f2 = work_dir + "/t2.txt"

# Read the two source documents
with open(f1, encoding='utf-8') as f:
    c1 = f.read()
with open(f2, encoding='utf-8') as f:
    c2 = f.read()

# Segment each document with jieba (jieba.cut returns a generator of tokens)
texts = [list(jieba.cut(c1)), list(jieba.cut(c2))]

# Count token frequencies across both documents
freq = defaultdict(int)
for doc in texts:
    for token in doc:
        freq[token] += 1

# Keep only tokens that occur at least 3 times
texts = [[token for token in doc if freq[token] >= 3] for doc in texts]
print(texts)

# Build the dictionary (token -> integer id) and persist it
dic1 = corpora.Dictionary(texts)
dic1.save(work_dir + "/yuliaoku.txt")

# The document to compare against the corpus
f3 = work_dir + "/t3.txt"
with open(f3, encoding='utf-8') as f:
    c3 = f.read()
new_doc = list(jieba.cut(c3))
print(new_doc)

# doc2bow turns a token list into a sparse bag-of-words vector
new_vec = dic1.doc2bow(new_doc)

# Bag-of-words corpus for the two source documents, then a TF-IDF model on top
corpus = [dic1.doc2bow(doc) for doc in texts]
tfidf = models.TfidfModel(corpus)

# Number of features = vocabulary size
featurenum = len(dic1.token2id)

# SparseMatrixSimilarity indexes the TF-IDF corpus for cosine similarity
idx = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=featurenum)
sims = idx[tfidf[new_vec]]
print(sims)  # similarity of t3.txt to each of t1.txt and t2.txt
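The heading above also mentions keyword extraction, which the script itself never performs. jieba ships a TF-IDF based extractor in jieba.analyse; a minimal sketch (the sample sentence is made up for illustration):

import jieba.analyse

text = "gensim 是一个用于主题建模和文本相似度计算的 Python 库"

# extract_tags ranks tokens by TF-IDF against jieba's built-in IDF table
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, weight)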
2. Similarity processing for lightweight text data
Train an LsiModel and match incoming queries against the corpus.
import jieba
from gensim import corpora, models, similarities
from settings import MONGO_DB

# Pull the stored titles out of MongoDB (find() returns a cursor)
content_list = []
for i in MONGO_DB.content.find():
    content_list.append(i.get("title"))

# Build the corpus: segment every title with jieba's search-engine mode
l1 = content_list
all_doc_list = [list(jieba.cut_for_search(doc)) for doc in l1]

# Dictionary maps each token to an integer id,
# e.g. {'什么': 0, '你': 1, '名字': 2, '是': 3, '的': 4, '了': 5, '今年': 6}
dictionary = corpora.Dictionary(all_doc_list)

# Bag-of-words corpus; each document becomes a list like [(1, 1), (5, 1), (6, 1), (7, 1)]
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]

# Train an LSI model on the corpus; it maps bow vectors into a latent vector space
lsi = models.LsiModel(corpus)


def my_gensim(ai_msg):
    # ai_msg is the user's voice message as recognized by Baidu AI
    doc_test_list = list(jieba.cut_for_search(ai_msg))   # segment the query
    doc_test_vec = dictionary.doc2bow(doc_test_list)     # query as a bow vector

    # Index the LSI-transformed corpus for sparse-matrix (cosine) similarity
    index = similarities.SparseMatrixSimilarity(
        lsi[corpus], num_features=len(dictionary.keys()))

    # Project the query into the same LSI space and score it against every document
    sim = index[lsi[doc_test_vec]]

    # Sort (index, score) pairs by score, highest first
    cc = sorted(enumerate(sim), key=lambda item: -item[1])
    print(cc)

    # Accept the best match only if it clears the similarity threshold
    if cc[0][1] > 0.58:
        text = l1[cc[0][0]]
    else:
        text = None
    return text


print(my_gensim('xiaoxiao 小的'))
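Rebuilding the dictionary, the LSI model, and the similarity index on every process start (and rebuilding the index on every call to my_gensim) is wasteful. All three are gensim objects that can be saved and reloaded; a minimal sketch, with placeholder file paths:

# Persist after training (paths are placeholders)
dictionary.save("dict.bin")
lsi.save("lsi.bin")
index = similarities.SparseMatrixSimilarity(lsi[corpus], num_features=len(dictionary.keys()))
index.save("index.bin")

# Reload in another process instead of retraining
from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load("dict.bin")
lsi = models.LsiModel.load("lsi.bin")
index = similarities.SparseMatrixSimilarity.load("index.bin")

With the index loaded once at module level, my_gensim only needs to segment the query, run doc2bow, and look up index[lsi[doc_test_vec]].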