zoukankan      html  css  js  c++  java
  • 机器学习笔记16-----聚类实践

    1.谱聚类

    谱聚类过程:

    上图说明:

    对 m 个样本,两两计算相似度 $s_{ij}$,由 $s_{ij}$ 构成 m×m 的相似度矩阵 $W$,再由 $W$ 得到度矩阵 $D$(对角矩阵,$d_{ii} = \sum_j w_{ij}$)。

    实际使用时,优先考虑随机游走拉普拉斯矩阵 $L_{rw} = D^{-1}L = I - D^{-1}W$(其中 $L = D - W$)。

    2.代码案例

    kmeans算法

    # !/usr/bin/python
    # -*- coding:utf-8 -*-
    
    import numpy as np
    import matplotlib.pyplot as plt
    import sklearn.datasets as ds
    import matplotlib.colors
    from sklearn.cluster import KMeans
    
    
    def expand(a, b):
        """Widen the interval [a, b] by 10% of its length on each side.

        Used to leave a small margin around the data when setting axis limits.

        :param a: lower bound of the interval
        :param b: upper bound of the interval
        :return: tuple ``(a - margin, b + margin)`` with ``margin = (b - a) * 0.1``
        """
        margin = (b - a) * 0.1
        lower = a - margin
        upper = b + margin
        return lower, upper
    
    
    if __name__ == "__main__":
        # Demo script: cluster four variants of the same synthetic blob data with
        # K-Means++ and plot ground truth (left column) next to the predicted
        # clusters (right column) in a 4x2 grid of subplots.
        N = 400
        centers = 4

        # Baseline blobs and a second set whose clusters have different spreads.
        data, y = ds.make_blobs(N, n_features=2, centers=centers, random_state=2)
        data2, y2 = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1, 2.5, 0.5, 2), random_state=2)

        # Unbalanced set: all of cluster 0, then 50 / 20 / 5 points of clusters 1-3.
        # The hard-coded 100 in y3 relies on make_blobs splitting N=400 evenly
        # across the 4 centers.
        data3 = np.vstack((data[y == 0][:], data[y == 1][:50], data[y == 2][:20], data[y == 3][:5]))
        y3 = np.array([0] * 100 + [1] * 50 + [2] * 20 + [3] * 5)

        # One estimator is re-fitted for every data set. No random_state is passed,
        # so the fit order below is kept unchanged and results may vary per run.
        cls = KMeans(n_clusters=centers, init='k-means++')
        y_hat = cls.fit_predict(data)
        y2_hat = cls.fit_predict(data2)
        y3_hat = cls.fit_predict(data3)

        # Linear transform of the baseline data. NOTE: m is not orthogonal, so this
        # stretches/shears the blobs rather than purely rotating them.
        m = np.array(((1, 1), (1, 3)))
        data_r = data.dot(m)
        y_r_hat = cls.fit_predict(data_r)

        # SimHei provides the CJK glyphs used in the titles; unicode_minus=False
        # keeps the minus sign renderable with that font.
        matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
        matplotlib.rcParams['axes.unicode_minus'] = False
        cm = matplotlib.colors.ListedColormap(list('rgbm'))

        # One entry per subplot row:
        # (points, true labels, title for truth, predicted labels, title for prediction)
        panels = (
            (data, y, u'原始数据', y_hat, u'KMeans++聚类'),
            (data_r, y, u'旋转后数据', y_r_hat, u'旋转后KMeans++聚类'),
            (data2, y2, u'方差不相等数据', y2_hat, u'方差不相等KMeans++聚类'),
            (data3, y3, u'数量不相等数据', y3_hat, u'数量不相等KMeans++聚类'),
        )

        plt.figure(figsize=(9, 10), facecolor='w')
        for row, (points, truth, truth_title, predicted, predicted_title) in enumerate(panels):
            # Both plots in a row share axis limits: the data range plus a 10% margin.
            x1_min, x2_min = np.min(points, axis=0)
            x1_max, x2_max = np.max(points, axis=0)
            x1_min, x1_max = expand(x1_min, x1_max)
            x2_min, x2_max = expand(x2_min, x2_max)

            for col, (labels, title) in enumerate(((truth, truth_title), (predicted, predicted_title))):
                # Subplot indices run 1..8, row by row (equivalent to 421 ... 428).
                plt.subplot(4, 2, 2 * row + col + 1)
                plt.title(title)
                plt.scatter(points[:, 0], points[:, 1], c=labels, s=30, cmap=cm, edgecolors='none')
                plt.xlim((x1_min, x1_max))
                plt.ylim((x2_min, x2_max))
                plt.grid(True)

        # pad must be passed by keyword: current matplotlib rejects a positional
        # argument to tight_layout with a TypeError.
        plt.tight_layout(pad=2)
        plt.suptitle(u'数据分布对KMeans聚类的影响', fontsize=18)
        # Leave room for the suptitle, which tight_layout does not account for:
        # https://github.com/matplotlib/matplotlib/issues/829
        plt.subplots_adjust(top=0.92)
        plt.show()

    运行结果:

  • 相关阅读:
    git命令记录
    JS实现iframe自适应高度
    js生成某个范围内的随机数
    jquery实现按钮翻转动画
    删除ELK的索引
    ELK故障处理,不知道成功否
    软件开发的SOLID原则
    阿里云的远程桌面问题
    Zabbix增加邮箱后Server宕处理
    201811招投标培训要点
  • 原文地址:https://www.cnblogs.com/luckyplj/p/12703096.html
Copyright © 2011-2022 走看看