  • Single-pass clustering

    1. Text clustering is usually performed over an existing batch of historical data, with methods such as k-means or DBSCAN. If the requirement is to cluster streaming text (i.e., assign each document to a cluster as it arrives), those methods no longer fit. There are other approaches to dynamic clustering of streaming data, but dynamic clustering brings its own challenges: the number of clusters is not fixed, and the similarity threshold is hard to choose. Both remain open research questions. This post implements a simple single-pass clustering method. Similarity between texts is measured with cosine similarity; texts can be vectorized with tf-idf (the idf statistics can be computed once over a large document collection and then applied directly to the words of each new text), or with Chinese pre-trained models such as word2vec or BERT.
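    As a quick illustration of those two ingredients, here is a minimal sketch (not part of the original program; the three toy documents and the scikit-learn calls are only for illustration) of tf-idf vectorization plus cosine similarity:

    # Minimal sketch: tf-idf vectors plus cosine similarity, the two ingredients above.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    docs = ["the cat sat on the mat",
            "a cat lay on the mat",
            "stock prices fell sharply today"]
    vectorizer = TfidfVectorizer().fit(docs)   # idf statistics from a reference collection
    vecs = vectorizer.transform(docs)          # sparse tf-idf vectors, one row per document
    print(cosine_similarity(vecs[0], vecs[1])[0, 0])  # high: both sentences are about cats
    print(cosine_similarity(vecs[0], vecs[2])[0, 0])  # near 0: unrelated topics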

    2. The program

import numpy as np
import os
import pickle
import collections
from gensim import corpora, models, matutils
from gensim.models import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
from utils.tokenizer import load_samples, load_data, read_data_to_list

'''
Overall flow:
input: doc vectors; threshold
output: clusters
begin
    input doc vectors
    input threshold
    the first doc becomes the first cluster and its vector the cluster center
    while (doc vectors) {
        while (clusters) {
            max_sim, max_cluster = similarity(doc vector, cluster);
        }
        if (max_sim > threshold) {
            max_cluster.put(doc vector);
            max_cluster.update_center();
        }
        else {
            build new cluster(doc vector);
        }
    }
end
'''
class SinglePassCluster(object):

    '''
        1. Cosine similarity over tf-idf vectors
    '''
    def tfidf_vec(self, corpus, pivot=10, slope=0.25):
        dictionary = corpora.Dictionary(corpus)  # build the token -> id mapping
        self.dict_size = len(dictionary)
        print('dictionary size:{}'.format(len(dictionary)))
        corpus = [dictionary.doc2bow(text) for text in corpus]  # bag-of-words vectors
        tfidf = models.TfidfModel(corpus, pivot=pivot, slope=slope)  # pivoted-normalization tf-idf
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf

    def get_max_similarity(self, cluster_cores, vector):
        """Return the id of the most similar cluster center and that similarity."""
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():
            similarity = matutils.cossim(vector, core)
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def single_pass(self, corpus_vec, corpus, theta):
        clusters = {}       # cluster id -> member vectors
        cluster_cores = {}  # cluster id -> center vector
        cluster_text = {}   # cluster id -> member texts
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:  # the first document starts the first cluster
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_max_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    text_matrix = matutils.corpus2dense(clusters[max_index], num_terms=self.dict_size,
                                                        num_docs=len(clusters[max_index])).T  # sparse -> dense
                    core = np.mean(text_matrix, axis=0)  # update the cluster center
                    core = matutils.any2sparse(core)  # convert the dense center back to sparse
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # start a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

    def fit_transform(self, corpus, raw_data, theta=0.5):
        tfidf_vec = self.tfidf_vec(corpus)  # sparse tf-idf vectors
        clusters, cluster_text = self.single_pass(tfidf_vec, raw_data, theta)
        return clusters, cluster_text

    '''
        2. Cosine similarity over doc2vec vectors
    '''
    def fit(self, doc2vec_model, corpus, raw_data, theta=0.5):
        doc_vec = self.doc_vec(doc2vec_model, corpus)
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, raw_data, theta)
        return clusters, cluster_text

    def fit_2(self, doc_vec, text2index, theta):
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, text2index, theta)
        return clusters, cluster_text

    def doc_vec(self, doc2vec_model, x_train):
        print('inferring doc2vec vectors...')
        inferred_vectors_list = []
        for text, label in x_train:
            vector = doc2vec_model.infer_vector(text)
            inferred_vectors_list.append(vector)
            if len(inferred_vectors_list) >= 100:  # cap at 100 documents for a quick test
                break
        print('inferred vector count:{}'.format(len(inferred_vectors_list)))
        return inferred_vectors_list

    def get_doc2vec_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():  # core is an np.ndarray
            similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
            similarity = similarity[0, 0]
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def doc2vec_single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:  # the first document starts the first cluster
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_doc2vec_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    core = np.mean(clusters[max_index], axis=0)  # update the cluster center
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # start a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text


def sim(doc_vec):
    """Sanity check: similarity of the first vector against every vector in the list."""
    vector = doc_vec[0]
    print('vector:{}'.format(type(vector)))
    for core in doc_vec:
        similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
        similarity = similarity[0, 0]
        print("similarity:{}".format(similarity))

if __name__ == '__main__':
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    process_text = base_path + '/data/process_text.txt'  # path to the preprocessed samples
    doc2vec_path = base_path + '/data/doc2vec.pkl'
    cluster_result = base_path + '/data/cluster_result.txt'
    doc_vec_path = base_path + '/data/doc_vec.vec'  # document vectors inferred by doc2vec

    corpus = load_data(process_text)
    raw_text = load_samples(process_text)

    index2corpus = collections.OrderedDict()
    for index, line in enumerate(raw_text):
        index2corpus[index] = line
    text2index = list(index2corpus.keys())  # clusters store indices, mapped back through index2corpus
    print('docs total size:{}'.format(len(text2index)))

    single_cluster = SinglePassCluster()

    cal_vec_type = 'doc2vec'  # 'tfidf', 'doc2vec', or 'd2vsim'

    if cal_vec_type == 'tfidf':
        clusters, cluster_text = single_cluster.fit_transform(corpus, text2index, theta=0.4)

    if cal_vec_type == 'doc2vec':
        with open(doc_vec_path, 'rb') as file:
            inferred_vectors_list = pickle.load(file)
        clusters, cluster_text = single_cluster.fit_2(inferred_vectors_list, text2index, theta=0.6)

        '''
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        clusters, cluster_text = single_cluster.fit(doc2vec_model, x_train, text2index, theta=0.6)
        '''

    if cal_vec_type == 'd2vsim':
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        doc_vec = single_cluster.doc_vec(doc2vec_model, x_train)
        sim(doc_vec)

    print("............................................................................................")
    print("number of clusters found: {} ...".format(len(clusters)))
    print("............................................................................................")
    # sort clusters by the number of documents they contain, descending
    clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True)
    with open(cluster_result, 'w', encoding='utf-8') as file_write:
        for k in clusterTopic_list:
            doc_lines = []
            for index, value in enumerate(k[1], start=1):
                doc_lines.append('(' + str(index) + '): ' + index2corpus[value])
            doc_lines = '\n'.join(doc_lines)
            file_write.write("[cluster id]:{} \n[doc count]:{} \n[documents]:\n{}".format(
                k[0], len(k[1]), doc_lines))
            file_write.write('\n')
            file_write.flush()
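
    The main block above loads doc_vec.vec, a pickled list of doc2vec vectors that the post does not show being built. Below is a plausible way to produce both doc2vec.pkl and doc_vec.vec with gensim; this is a sketch under the assumption that read_data_to_list returns (tokens, label) pairs, as doc_vec() above also assumes, and the hyperparameters are purely illustrative.

    # Assumed preprocessing step, not shown in the original post.
    import pickle
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from utils.tokenizer import read_data_to_list

    x_train = read_data_to_list(process_text)  # assumed: iterable of (tokens, label)
    tagged = [TaggedDocument(words=text, tags=[str(label)]) for text, label in x_train]

    model = Doc2Vec(vector_size=100, min_count=2, epochs=40)  # illustrative settings
    model.build_vocab(tagged)
    model.train(tagged, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(doc2vec_path)  # later loaded via Doc2Vec.load(doc2vec_path)

    vectors = [model.infer_vector(text) for text, label in x_train]
    with open(doc_vec_path, 'wb') as f:
        pickle.dump(vectors, f)  # later loaded by the 'doc2vec' branch above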