zoukankan      html  css  js  c++  java
  • python聚类分析

    #!/usr/bin/env python
    #-*- coding:utf-8 -*-
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    from sklearn import preprocessing
    from scipy.spatial.distance import cdist
    from sklearn import metrics
    
    # Load the raw data: one comma-separated record per line. The first field
    # is parsed as an integer and later used as the ground-truth label; every
    # remaining field is parsed as a float feature.
    X = []
    labels = []

    # `with` guarantees the file handle is closed even if parsing raises.
    with open('data/wina.data') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of crashing on int('').
                continue
            fields = line.split(",")
            labels.append(int(fields[0]))
            X.append([float(item) for item in fields[1:]])
    # Convert to NumPy arrays
    X = np.array(X)
    y_true = np.array(labels)
    
    
    # Rescale every feature with MinMaxScaler (default range [0, 1]) before
    # clustering, so all features contribute on a comparable scale to the
    # Euclidean distances computed below.
    min_max_scaler = preprocessing.MinMaxScaler()
    X = min_max_scaler.fit_transform(X)


    # Elbow method: for k = 1..9 fit k-means and record the mean distance from
    # each sample to its nearest cluster centre, then plot that against k.
    K = range(1, 10)
    meandistortions = []
    for k in K:
        # Fixed seed so the elbow curve is reproducible between runs.
        kmeans = KMeans(n_clusters=k, random_state=0)
        kmeans.fit(X)
        # Distance of every sample to its closest centre, computed in NumPy
        # rather than with the Python-level builtin sum().
        nearest = cdist(X, kmeans.cluster_centers_, 'euclidean').min(axis=1)
        meandistortions.append(nearest.mean())
    plt.plot(K, meandistortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('meandistortions')
    plt.title('best K of the model')
    plt.show()
    # Number of clusters used for the final model below
    # (presumably read off the elbow plot -- confirm).
    n_clusters = 3
    
    
    # Fit the final k-means model and assign every sample to a cluster.
    cls = KMeans(n_clusters=n_clusters)
    cls.fit(X)
    y_pre = cls.predict(X)

    # Total number of samples and number of features.
    n_samples, n_features = X.shape
    # Sum of squared distances of samples to their closest cluster centre.
    inertias = cls.inertia_

    # External scores compare the clustering with the labels in y_true;
    # the silhouette coefficient only needs the data and the assignment.
    silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')  # mean silhouette coefficient
    homogeneity_s = metrics.homogeneity_score(y_true, y_pre)               # homogeneity score
    adjusted_rand_s = metrics.adjusted_rand_score(y_true, y_pre)           # adjusted Rand index

    for caption, score in (
        ("兰德指数ART", adjusted_rand_s),
        ("同质化得分homo", homogeneity_s),
        ("平均轮廓系数", silhouette_s),
    ):
        print(caption, score)
    
    # Scatter-plot the clustering result on the first two (scaled) features,
    # one colour per cluster, with each cluster centre marked by a star.
    centers = cls.cluster_centers_  # centre of each cluster

    colors = ['#ff0000', '#00ff00', '#0000ff']  # one colour per cluster
    plt.figure()  # new canvas
    for i in range(n_clusters):
        # Cycle through the palette so a larger n_clusters cannot raise
        # IndexError.
        color = colors[i % len(colors)]
        # Samples assigned to cluster i.
        cluster = X[y_pre == i]
        # Plot feature 0 against feature 1; the original plotted feature 0
        # against itself, which puts every point on the diagonal.
        plt.scatter(cluster[:, 0], cluster[:, 1], c=color, marker='.')
        plt.plot(centers[i][0], centers[i][1], '*', markerfacecolor=color, markeredgecolor='k', markersize=6)
    plt.show()

     

  • 相关阅读:
    50.2 Django 连接MySQL,django orm 数据库(models)操作
    50.1 Django 静态资源配置 static && form表单和 request对象
    JS备忘
    Python 单元测试 生产HTML测试报告
    python 单元测试 执行测试
    pycharm 安装插件
    Python 单元测试 实战演练
    Python 单元测试
    Pycharm 使用备忘
    Python 异常
  • 原文地址:https://www.cnblogs.com/msdog/p/12638965.html
Copyright © 2011-2022 走看看