zoukankan      html  css  js  c++  java
  • 聚类之k-means附代码

     

     import os
    import sys as sys
    #reload(sys)
    #sys.setdefaultencoding('utf-8')
    from sklearn.cluster import KMeans
    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    import matplotlib.pyplot as plt
    from matplotlib.font_manager import FontProperties
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    import numpy as np

    def tfidf_vector(corpus_path):
        """Build a TF-IDF matrix from a labelled corpus file.

        Each line of the file is stripped and split on single spaces. Only
        lines that yield exactly two fields are used: the first field is
        treated as the category label and the second as the document text.
        All other lines are skipped silently.

        Args:
            corpus_path: Path of the corpus text file to read.

        Returns:
            A ``(tfidf_train, word_dict)`` tuple where ``tfidf_train`` is the
            TF-IDF weighted document-term matrix produced by scikit-learn and
            ``word_dict`` maps each feature column index to its term.
        """
        # Extract features from the training corpus.
        corpus_train = []
        # The context manager guarantees the file handle is released; the
        # original left it open. UTF-8 is made explicit instead of relying on
        # the platform default (the file's commented-out Python 2 shim also
        # pointed at UTF-8).
        with open(corpus_path, encoding="utf-8") as corpus_file:
            for line in corpus_file:
                fields = line.strip().split(' ')
                # NOTE(review): a document whose text itself contains spaces
                # produces more than two fields and is dropped here -- confirm
                # this matches the corpus format.
                if len(fields) == 2:
                    corpus_train.append(fields[1])
        print("build train-corpus done!!")

        # Ignore terms present in more than 40% or fewer than 1% of documents.
        count_v1 = CountVectorizer(max_df=0.4, min_df=0.01)
        counts_train = count_v1.fit_transform(corpus_train)

        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        if hasattr(count_v1, "get_feature_names_out"):
            feature_names = count_v1.get_feature_names_out()
        else:
            feature_names = count_v1.get_feature_names()
        word_dict = dict(enumerate(feature_names))

        print("the shape of train is ")
        print(repr(counts_train.shape))
        tfidf_train = TfidfTransformer().fit_transform(counts_train)
        return tfidf_train, word_dict

    def best_kmeans(tfidf_matrix, word_dict):
        """Plot an elbow curve to help choose the number of k-means clusters.

        For every k from 1 to 9 a KMeans model is fitted on ``tfidf_matrix``
        and the mean Euclidean distance from each sample to its nearest
        cluster centre is recorded. The values are then plotted against k and
        the figure is shown with ``plt.show()``.

        Args:
            tfidf_matrix: Sparse feature matrix (must provide ``toarray()``
                and ``shape``), one row per sample.
            word_dict: Unused here; kept so existing callers keep working.

        Returns:
            None.
        """
        # Densify once: cdist needs a dense array and the matrix does not
        # change between iterations (the original converted it on every pass).
        dense_matrix = tfidf_matrix.toarray()
        n_samples = tfidf_matrix.shape[0]

        K = range(1, 10)
        meandistortions = []
        for k in K:
            # The original `print (k),('****'*5)` was a Python 2 statement; in
            # Python 3 it printed only k and silently dropped the separator.
            print(k, '****' * 5)
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(tfidf_matrix)
            # Distance of every sample to its closest centre.
            nearest = np.min(
                cdist(dense_matrix, kmeans.cluster_centers_, 'euclidean'),
                axis=1,
            )
            meandistortions.append(nearest.sum() / n_samples)

        plt.plot(K, meandistortions, 'bx-')
        plt.grid(True)
        plt.xlabel('Number of clusters')
        # NOTE(review): the plotted value is a mean (unsquared) Euclidean
        # distance, not a sum of squares; label kept as-is for compatibility.
        plt.ylabel('Average within-cluster sum of squares')
        plt.title('Elbow for Kmeans clustering')
        plt.show()

    # Driver script: vectorise the corpus, show the elbow plot, then cluster.
    # Input corpus file, read by tfidf_vector().
    corpus_train = "corpus_train.txt"
    # File names handed to cluster_kmeans(); presumably output paths for the
    # per-document and per-keyword results -- TODO confirm against its definition.
    cluster_docs = "cluster_result_document.txt"
    cluster_keywords = "cluster_result_keyword.txt"
    # Cluster count passed to cluster_kmeans().
    num_clusters = 7
    tfidf_train,word_dict=tfidf_vector(corpus_train)
    # Displays the elbow plot via plt.show() before clustering continues.
    best_kmeans(tfidf_train,word_dict)
    # NOTE(review): cluster_kmeans is not defined in the visible part of this
    # file; unless it is provided elsewhere this call raises NameError.
    cluster_kmeans(tfidf_train,word_dict,cluster_docs,cluster_keywords,num_clusters)

  • 相关阅读:
    JSON--List集合转换成JSON对象
    某些项目因位于工作空间目录中而被隐藏
    Target runtime Apache Tomcat v6.0 is not defined.错误解决方法
    SQLSERVER2008 18456错误
    android捕获ListView中每个item点击事件
    Android中Toast的用法简介
    android ListView详解
    Android调试工具及方法
    免费卫星图像下载网站
    ArcScene三维制作
  • 原文地址:https://www.cnblogs.com/hrnn/p/13406185.html
Copyright © 2011-2022 走看看