zoukankan      html  css  js  c++  java
  • 聚类之k-means附代码

     

     import os
    import sys as sys
    #reload(sys)
    #sys.setdefaultencoding('utf-8')
    from sklearn.cluster import KMeans
    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    import matplotlib.pyplot as plt
    from matplotlib.font_manager import FontProperties
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    import numpy as np

    def tfidf_vector(corpus_path):
        """Build a TF-IDF matrix from a labelled corpus file.

        Each line of the file is stripped and split on a single space; only
        lines that yield exactly two fields (``category`` then ``words``) are
        used, every other line is silently skipped.

        Args:
            corpus_path: Path of the corpus text file to read.

        Returns:
            A ``(tfidf_train, word_dict)`` tuple: the TF-IDF matrix produced by
            ``TfidfTransformer`` and a dict mapping each feature column index
            to its vocabulary word.
        """
        corpus_train=[]
        # Extract features from the training corpus.
        target_train=[]
        # The context manager closes the file even if reading fails.
        # NOTE(review): UTF-8 is assumed for the corpus -- confirm against the
        # actual data file.
        with open(corpus_path, encoding="utf-8") as corpus_file:
            for line in corpus_file:
                fields=line.strip().split(' ')
                if len(fields)==2:
                    category, words = fields
                    target_train.append(category)
                    corpus_train.append(words)
        print ("build train-corpus done!!")
        # Drop terms found in more than 40% or fewer than 1% of the documents.
        count_v1= CountVectorizer(max_df=0.4,min_df=0.01)
        counts_train = count_v1.fit_transform(corpus_train)

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it.
        if hasattr(count_v1, "get_feature_names_out"):
            feature_names = count_v1.get_feature_names_out()
        else:
            feature_names = count_v1.get_feature_names()
        word_dict = dict(enumerate(feature_names))

        print ("the shape of train is ")
        print (repr(counts_train.shape))
        tfidftransformer = TfidfTransformer()
        tfidf_train = tfidftransformer.fit_transform(counts_train)
        return tfidf_train,word_dict

    def best_kmeans(tfidf_matrix,word_dict):
        """Plot an elbow curve to help choose the number of k-means clusters.

        For every k from 1 to 9 a ``KMeans`` model is fitted on
        ``tfidf_matrix`` and the mean Euclidean distance from each sample to
        its nearest cluster centre is recorded; the values are then plotted
        against k and shown with matplotlib.

        Args:
            tfidf_matrix: Sparse feature matrix (must support ``toarray()``
                and ``shape``), one row per document.
            word_dict: Unused here; kept so existing callers keep working.

        Returns:
            None. The figure is displayed via ``plt.show()``.
        """
        K = range(1, 10)
        # Densify once: cdist needs a dense array and the data is the same
        # for every k, so converting inside the loop is wasted work.
        dense_matrix = tfidf_matrix.toarray()
        sample_count = tfidf_matrix.shape[0]
        meandistortions = []
        for k in K:
            # The original `print (k),('****'*5)` is a Python 2 leftover that
            # prints only k on Python 3; emit k and the separator together.
            print (k, '****'*5)
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(tfidf_matrix)
            # Distance from every sample to its closest centre.
            nearest = np.min(cdist(dense_matrix, kmeans.cluster_centers_, 'euclidean'), axis=1)
            meandistortions.append(nearest.sum() / sample_count)
        plt.plot(K, meandistortions, 'bx-')
        plt.grid(True)
        plt.xlabel('Number of clusters')
        # NOTE(review): the plotted value is a mean distance, not a sum of
        # squares; the label is kept unchanged to preserve the figure text.
        plt.ylabel('Average within-cluster sum of squares')
        plt.title('Elbow for Kmeans clustering')
        plt.show()

    # Script driver: vectorise the corpus, show the elbow plot, then cluster.
    # Path of the corpus file read by tfidf_vector.
    corpus_train = "corpus_train.txt"
    # Presumably output paths for the clustering results; they are only
    # passed through to cluster_kmeans -- verify against its definition.
    cluster_docs = "cluster_result_document.txt"
    cluster_keywords = "cluster_result_keyword.txt"
    # Number of clusters handed to cluster_kmeans.
    num_clusters = 7
    # Build the TF-IDF matrix and the feature-index -> word mapping.
    tfidf_train,word_dict=tfidf_vector(corpus_train)
    # Display the elbow curve computed by best_kmeans.
    best_kmeans(tfidf_train,word_dict)
    # NOTE(review): cluster_kmeans is not defined in the visible part of this
    # file; unless it is defined or imported elsewhere this call raises
    # NameError -- confirm.
    cluster_kmeans(tfidf_train,word_dict,cluster_docs,cluster_keywords,num_clusters)

  • 相关阅读:
    java.lang.NoClassDefFoundError: org/apache/commons/fileupload/disk/DiskFileItemFactory
    连续子数组的最大和
    @Scheduled(cron = "* * * * * *")
    BigDecimal加减乘除计算
    04
    作业03
    作业01
    Haar小波的理解
    Matlab画colormap的一种色彩搭配方法
    单自由度系统中质量、阻尼和刚度变化对频率响应函数(FRF)影响图的绘制
  • 原文地址:https://www.cnblogs.com/hrnn/p/13406185.html
Copyright © 2011-2022 走看看