zoukankan      html  css  js  c++  java
  • 聚类之k-means附代码

     

     import os
    import sys as sys
    #reload(sys)
    #sys.setdefaultencoding('utf-8')
    from sklearn.cluster import KMeans
    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    import matplotlib.pyplot as plt
    from matplotlib.font_manager import FontProperties
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    import numpy as np

    def tfidf_vector(corpus_path):
        """Build a TF-IDF matrix from a labelled corpus file.

        Each usable line of the file has exactly two space-separated fields:
        ``<category> <words>``. Lines that do not split into exactly two
        fields are skipped. Only the ``words`` field is vectorised; the
        category is read but not used.

        Args:
            corpus_path: Path to the corpus text file (read as UTF-8).

        Returns:
            A ``(tfidf_train, word_dict)`` tuple where ``tfidf_train`` is the
            TF-IDF matrix produced by ``TfidfTransformer`` and ``word_dict``
            maps each feature column index to its vocabulary term.
        """
        corpus_train = []
        # A context manager guarantees the file is closed even if reading
        # fails; the encoding is explicit so decoding does not depend on the
        # platform locale.
        with open(corpus_path, encoding="utf-8") as corpus_file:
            for raw_line in corpus_file:
                fields = raw_line.strip().split(' ')
                if len(fields) == 2:
                    # fields[0] is the category label, which is not needed
                    # for the unsupervised features built here.
                    corpus_train.append(fields[1])
        print ("build train-corpus done!!")

        # Drop terms appearing in more than 40% or fewer than 1% of documents.
        count_v1 = CountVectorizer(max_df=0.4, min_df=0.01)
        counts_train = count_v1.fit_transform(corpus_train)

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it.
        if hasattr(count_v1, "get_feature_names_out"):
            feature_names = count_v1.get_feature_names_out()
        else:
            feature_names = count_v1.get_feature_names()
        word_dict = dict(enumerate(feature_names))

        print ("the shape of train is ")
        print (repr(counts_train.shape))
        tfidftransformer = TfidfTransformer()
        tfidf_train = tfidftransformer.fit_transform(counts_train)
        return tfidf_train, word_dict

    def best_kmeans(tfidf_matrix,word_dict):
        """Plot an elbow curve to help choose the number of k-means clusters.

        For every k in 1..9 a ``KMeans`` model is fitted and the mean
        Euclidean distance from each sample to its nearest cluster centre is
        recorded; the values are then plotted against k and shown with
        matplotlib.

        Args:
            tfidf_matrix: Sparse feature matrix (must support ``toarray()``
                and ``shape``), one row per document.
            word_dict: Unused; kept so existing callers keep working.

        Returns:
            None. The plot is displayed via ``plt.show()``.
        """
        K = range(1, 10)
        # The dense copy and the sample count do not change between
        # iterations, so compute them once instead of once per k.
        dense_matrix = tfidf_matrix.toarray()
        n_samples = tfidf_matrix.shape[0]
        meandistortions = []
        for k in K:
            # The original `print (k),('****'*5)` is a Python 2 print
            # statement; under Python 3 it printed only k and silently
            # discarded the separator string.
            print(k, '****' * 5)
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(tfidf_matrix)
            # Distance from every sample to its closest centre.
            nearest = np.min(
                cdist(dense_matrix, kmeans.cluster_centers_, 'euclidean'),
                axis=1,
            )
            meandistortions.append(np.sum(nearest) / n_samples)
        plt.plot(K, meandistortions, 'bx-')
        plt.grid(True)
        plt.xlabel('Number of clusters')
        # NOTE(review): the plotted value is a mean Euclidean distance, not a
        # sum of squares; the label text is kept unchanged here.
        plt.ylabel('Average within-cluster sum of squares')
        plt.title('Elbow for Kmeans clustering')
        plt.show()

    # Script driver: build TF-IDF features, show the elbow plot, then cluster.
    # Input corpus path, passed to tfidf_vector().
    corpus_train = "corpus_train.txt"
    # File names handed to cluster_kmeans(); presumably output paths for the
    # per-document and per-keyword results — confirm against cluster_kmeans.
    cluster_docs = "cluster_result_document.txt"
    cluster_keywords = "cluster_result_keyword.txt"
    # Number of clusters passed to cluster_kmeans().
    num_clusters = 7
    tfidf_train,word_dict=tfidf_vector(corpus_train)
    # Displays the elbow plot; plt.show() is called inside best_kmeans().
    best_kmeans(tfidf_train,word_dict)
    # NOTE(review): cluster_kmeans is not defined in the visible part of this
    # file — confirm it is defined elsewhere, otherwise this raises NameError.
    cluster_kmeans(tfidf_train,word_dict,cluster_docs,cluster_keywords,num_clusters)

  • 相关阅读:
    移动端应用rem定义相对长度单位
    ionic4(angular) 生成browser平台的(webApp)在手机QQ浏览器不更新页面
    解决 git bash命令行执行git命令一直报错 segmentation fault
    MACBOOK OSX升级到10.15.3 Catalina 后 photoshop CS6(32位)不能用了
    自制操作系统笔记-第三章
    自制操作系统笔记-第2章
    自制操作系统笔记-第一章
    Vue学习笔记
    解决 MAC 终端上每次打开新窗口手动执行source ~/.bash_profile导出环境变量
    HTTPS的安全性
  • 原文地址:https://www.cnblogs.com/hrnn/p/13406185.html
Copyright © 2011-2022 走看看