I. Based on gensim
1. Model class
import os
import heapq
import jieba
from gensim import corpora, models, similarities
import utils.word_process as word_process
from root_path import root
from pathlib import Path


class TfIdf(object):
    """Compute sentence similarity with a tf-idf model."""

    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.makedirs(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")

    def del_stopwords(self, words):
        """Remove stop words from a tokenized sentence."""
        word_list = []
        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """Segment a list of sentences into words (jieba) or single characters."""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """Train the model."""
        # Build and save the corpus dictionary
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)
        # Build the tf-idf model
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(self.tfidf_model_path)
        # Build the retrieval index
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(self.tfidf_index_path)

    def predict(self, sentence):
        # Vectorize the sentence and query the index directly
        # (retrieval is over the training word_list)
        dic = corpora.Dictionary.load(self.dic_path)
        word_bow = dic.doc2bow(self._seg_word([sentence])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """Read the training file and return the label list and sentence list."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split(" ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].strip().replace(" ", "")
                sentences.append(sentence)
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困难还不了")
        # Indices of the 30 most similar training sentences
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))
        # The corresponding similarity scores
        print(heapq.nlargest(30, score_list))


if __name__ == '__main__':
    TfIdf().main()
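The `predict` method returns one similarity score per training sentence, so a little glue code is needed to turn those scores into readable matches. Below is a minimal sketch, assuming the model has already been trained as in `main`; the helper name `top_matches` is illustrative and not part of the original class.

import heapq

def top_matches(model, query, k=5):
    """Hypothetical helper: return the k training sentences most similar to `query`."""
    labels, sentences = model.get_train_data()
    scores = model.predict(query)  # one similarity score per training sentence
    best = heapq.nlargest(k, range(len(scores)), scores.__getitem__)
    return [(labels[i], sentences[i], float(scores[i])) for i in best]

# Example usage (assumes train() has already been run):
# model = TfIdf()
# for label, sent, score in top_matches(model, "我有困难还不了"):
#     print(label, sent, score)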
2. Utility class
import os
import tqdm
from root_path import root

stop = os.path.join(root, "confusion_detection", "data", "raw_data", "ChineseStopWords.txt")


def get_stop_list():
    """Build the stop-word list from ChineseStopWords.txt."""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description("Processing stop words...")
        for line in data_lines:
            # Strip newlines, tabs and spaces from the entry
            line = line.strip().replace(" ", "")
            if line:  # keep every non-empty stop word
                stop_word_list.append(line)
    return stop_word_list
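Both files import `root` from a `root_path` module that is not shown here. A minimal sketch of what such a module might look like, assuming `root` is simply the absolute path of the project root; the real module may define it differently.

# root_path.py -- illustrative sketch only
import os

# Absolute path of the project root (assumed to be the directory containing this file)
root = os.path.dirname(os.path.abspath(__file__))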
II. Based on sklearn
import os
import jieba
import pickle
from root_path import root
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer


class TfIdf(object):
    """Compute sentence similarity with a tf-idf model."""

    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.makedirs(root_path)
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
        self.model_path = os.path.join(root_path, "tfidf.model")

    def get_train_data(self):
        """Read the training file and return the label list and sentence list."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split(" ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].strip().replace(" ", "")
                sentences.append(sentence)
        return labels, sentences

    def train(self):
        labels, sentences = self.get_train_data()
        # jieba-segment each sentence and join the tokens with spaces,
        # since TfidfVectorizer expects whitespace-separated tokens
        sent_words = [list(jieba.cut(sent0)) for sent0 in sentences]
        document = [" ".join(sent0) for sent0 in sent_words]
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_vectorizer.fit_transform(document)
        # Save the fitted vectorizer
        with open(self.model_path, "wb") as f:
            pickle.dump(tfidf_vectorizer, f)

    def predict(self, sentence):
        # Load the fitted vectorizer
        with open(self.model_path, "rb") as f:
            tfidf_vectorizer = pickle.load(f)
        sen = " ".join(jieba.cut(sentence))
        res = tfidf_vectorizer.transform([sen]).toarray()
        return res[0]

    def main(self):
        self.train()
        sentence = "是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。"
        print(self.predict(sentence))


if __name__ == '__main__':
    TfIdf().main()
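Unlike the gensim version, the sklearn class only returns the tf-idf vector of the query; it does not compute similarity against the training corpus. A possible extension is to rank the training sentences by cosine similarity, sketched below; the function name `rank_by_cosine` and its parameters are illustrative assumptions, not part of the original code.

# A minimal sketch of a similarity lookup on top of the sklearn version.
import pickle
import jieba
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def rank_by_cosine(model_path, corpus_sentences, query, k=5):
    """Return the indices of the k training sentences most similar to `query`, plus all scores."""
    with open(model_path, "rb") as f:
        vectorizer = pickle.load(f)
    corpus = [" ".join(jieba.cut(s)) for s in corpus_sentences]
    corpus_matrix = vectorizer.transform(corpus)              # (n_sentences, n_features)
    query_vec = vectorizer.transform([" ".join(jieba.cut(query))])
    scores = cosine_similarity(query_vec, corpus_matrix)[0]   # one score per sentence
    return np.argsort(scores)[::-1][:k], scores

# Example usage:
# model = TfIdf()
# labels, sentences = model.get_train_data()
# top_idx, scores = rank_by_cosine(model.model_path, sentences, "我有困难还不了")
# for i in top_idx:
#     print(labels[i], sentences[i], scores[i])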