1. Text clustering is usually applied to an existing batch of historical data, with methods such as k-means or DBSCAN. When the requirement is to cluster streaming text (i.e. cluster each document as it arrives), those methods no longer fit. There are of course other approaches for dynamically clustering streaming data, but dynamic clustering brings its own challenges: the number of clusters is not fixed, and the similarity threshold is hard to choose, so these questions are worth studying further. This post implements a simple single-pass (one-scan) clustering method. Similarity between texts is measured with cosine similarity, and texts can be vectorized with tf-idf (the idf statistics can be estimated once on a large document collection and then reused directly for the words of newly arriving texts), or with Chinese pretrained models such as word2vec or BERT.
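The idf-reuse idea can be shown with a minimal sketch (not part of the program in section 2; `history_docs` and `new_doc` are made-up placeholder documents, assumed to be already word-segmented and whitespace-joined): fit sklearn's TfidfVectorizer once on a historical collection, then transform each newly arriving text with the frozen idf statistics before comparing it to existing cluster centers with cosine similarity.

# Minimal sketch, separate from the program below; placeholder documents only.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

history_docs = ["machine learning model training",
                "stock market news today",
                "deep learning text clustering"]
vectorizer = TfidfVectorizer()
vectorizer.fit(history_docs)             # idf estimated on the historical collection only

new_doc = ["text clustering with deep learning"]
new_vec = vectorizer.transform(new_doc)  # reuse the frozen idf for the incoming text

# cosine similarity between the new text and each historical document
sims = cosine_similarity(new_vec, vectorizer.transform(history_docs))
print(sims)

In the streaming setting this avoids re-fitting the vocabulary for every incoming document; words unseen in the historical collection are simply ignored by transform().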
2. Code
import numpy as np
import os
import sys
import pickle
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import corpora, models, matutils
from utils.tokenizer import load_stopwords, load_samples, tokenizer, word_segment, load_data, read_data_to_list
from gensim.models import doc2vec, Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity

'''
Overall flow:
input: doc vectors; threshold
output: clusters
begin
    input doc vectors
    input threshold
    first doc as first cluster and its vector as the center of the cluster
    while(doc vectors){
        while(clusters){
            max_sim, max_cluster = similarity(doc vector, cluster);
        }
        if(max_sim > threshold){
            max_cluster.put(doc vector);
            max_cluster.update_center()
        }
        else{
            build new cluster(doc vector);
        }
    }
end
'''

class SingelPassCluster(object):

    '''
    1. cosine similarity on tf-idf vectors
    '''
    def tfidf_vec(self, corpus, pivot=10, slope=0.25):
        dictionary = corpora.Dictionary(corpus)  # build the word <-> id mapping
        self.dict_size = len(dictionary)
        print('dictionary size:{}'.format(len(dictionary)))
        corpus = [dictionary.doc2bow(text) for text in corpus]  # bag-of-words representation
        tfidf = models.TfidfModel(corpus, pivot=pivot, slope=slope)
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf

    def get_max_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        print('vector:{}'.format(vector))
        for k, core in cluster_cores.items():
            print('core:{}'.format(core))
            similarity = matutils.cossim(vector, core)
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_max_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    text_matrix = matutils.corpus2dense(clusters[max_index], num_terms=self.dict_size,
                                                        num_docs=len(clusters[max_index])).T  # sparse -> dense
                    core = np.mean(text_matrix, axis=0)  # update the cluster center
                    core = matutils.any2sparse(core)     # convert the dense center back to a sparse vector
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # start a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

    def fit_transform(self, corpus, raw_data, theta=0.5):
        tfidf_vec = self.tfidf_vec(corpus)  # tfidf_vec is a list of sparse vectors
        clusters, cluster_text = self.single_pass(tfidf_vec, raw_data, theta)
        return clusters, cluster_text

    '''
    2. cosine similarity on doc2vec vectors
    '''
    def fit(self, doc2vec_model, corpus, raw_data, theta=0.5):
        doc_vec = self.doc_vec(doc2vec_model, corpus)
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, raw_data, theta)
        return clusters, cluster_text

    def fit_2(self, doc_vec, text2index, theta):
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, text2index, theta)
        return clusters, cluster_text

    def doc_vec(self, doc2vec_model, x_train):
        print('doc2vec infered vec...')
        infered_vectors_list = []
        for text, label in x_train:
            vector = doc2vec_model.infer_vector(text)
            infered_vectors_list.append(vector)
            print('infered vector size:{}'.format(len(infered_vectors_list)))
            if len(infered_vectors_list) >= 100:  # only infer the first 100 vectors
                break
        return infered_vectors_list

    def get_doc2vec_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():  # core -> np.ndarray
            similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
            similarity = similarity[0, 0]
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def doc2vec_single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_doc2vec_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    core = np.mean(clusters[max_index], axis=0)  # update the cluster center
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # start a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text


def sim(doc_vec):
    vector = doc_vec[0]
    print('vector:{}'.format(type(vector)))
    for core in doc_vec:
        similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
        similarity = similarity[0, 0]
        print("similarity:{}".format(similarity))


if __name__ == '__main__':
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    process_text = base_path + '/data/process_text.txt'    # path to the preprocessed samples
    doc2vec_path = base_path + '/data/doc2vec.pkl'
    cluster_result = base_path + '/data/cluster_result.txt'
    doc_vec_path = base_path + '/data/doc_vec.vec'          # document vectors inferred by doc2vec

    corpus = load_data(process_text)
    raw_text = load_samples(process_text)

    index2corpus = collections.OrderedDict()
    for index, line in enumerate(raw_text):
        index2corpus[index] = line
    text2index = list(index2corpus.keys())
    print('docs total size:{}'.format(len(text2index)))

    single_cluster = SingelPassCluster()

    cal_vec_type = 'doc2vec'

    if cal_vec_type == 'tfidf':
        clusters, cluster_text = single_cluster.fit_transform(corpus, text2index, theta=0.4)

    if cal_vec_type == 'doc2vec':
        with open(doc_vec_path, 'rb') as file:
            infered_vectors_list = pickle.load(file)
        clusters, cluster_text = single_cluster.fit_2(infered_vectors_list, text2index, theta=0.6)

    '''
    if os.path.exists(doc2vec_path):
        print('doc2vec model loading...')
        doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        clusters, cluster_text = single_cluster.fit(doc2vec_model, x_train, text2index, theta=0.6)
    '''

    if cal_vec_type == 'd2vsim':
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
            x_train = read_data_to_list(process_text)
            doc_vec = single_cluster.doc_vec(doc2vec_model, x_train)
            sim(doc_vec)

    print("............................................................................................")
    print("number of clusters found: {} ...".format(len(clusters)))
    print("............................................................................................")
    # sort clusters by the number of documents they contain, in descending order
    clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True)
    with open(cluster_result, 'w', encoding='utf-8') as file_write:
        for k in clusterTopic_list:
            cluster_text = []
            for index, value in enumerate(k[1], start=1):
                cluster_text.append('(' + str(index) + '): ' + index2corpus[value])
            cluster_text = '\n'.join(cluster_text)
            file_write.write("cluster id: {}  doc count: {}  documents:\n{}".format(k[0], len(k[1]), cluster_text))
            file_write.write('\n')
            file_write.flush()
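The main block above reads pre-computed document vectors from data/doc_vec.vec (and optionally a Doc2Vec model from data/doc2vec.pkl), but the code that produces them is not shown. Below is a rough sketch of one way they could be generated with gensim; `tokenized_docs` (a list of word lists) is a placeholder, and the training parameters are illustrative, not the project's actual settings.

# Rough sketch: train a Doc2Vec model and dump inferred vectors so the script above
# can load them. `tokenized_docs` is placeholder data, not from the original project.
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tokenized_docs = [['文本', '聚类', '示例'], ['股票', '市场', '新闻']]  # placeholder data
tagged = [TaggedDocument(words=words, tags=[i]) for i, words in enumerate(tokenized_docs)]

model = Doc2Vec(vector_size=100, min_count=1, epochs=20)
model.build_vocab(tagged)
model.train(tagged, total_examples=model.corpus_count, epochs=model.epochs)
model.save('data/doc2vec.pkl')                 # loaded via Doc2Vec.load in the script

infered_vectors_list = [model.infer_vector(doc.words) for doc in tagged]
with open('data/doc_vec.vec', 'wb') as f:
    pickle.dump(infered_vectors_list, f)       # loaded by pickle.load / fit_2 in the script

Any other sentence-embedding scheme (averaged word2vec vectors, BERT sentence vectors, etc.) could be pickled in the same format, since doc2vec_single_pass only needs a list of numpy arrays.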