zoukankan      html  css  js  c++  java
  • 机器学习笔记16-----聚类实践

    1.谱聚类

    谱聚类过程:

    上图说明:

    对 m 个样本,计算两两之间的相似度 $s_{ij}$,由 $s_{ij}$ 构成相似度矩阵 $W$,再由 $W$ 得到度矩阵 $D$(对角矩阵,$d_{ii}=\sum_j w_{ij}$)。

    实际使用时,优先考虑随机游走拉普拉斯矩阵。

    2.代码案例

    kmeans算法

    # !/usr/bin/python
    # -*- coding:utf-8 -*-
    
    import numpy as np
    import matplotlib.pyplot as plt
    import sklearn.datasets as ds
    import matplotlib.colors
    from sklearn.cluster import KMeans
    
    
    def expand(a, b):
        """Pad the interval from a to b by 10% of its length on both ends.

        Returns a tuple (lower, upper): a moved down and b moved up by
        one tenth of (b - a). Used to leave a margin around plotted data.
        """
        margin = (b - a) * 0.1
        lower = a - margin
        upper = b + margin
        return lower, upper
    
    
    def _draw_pair(first_index, points, truth, predicted, titles, cmap):
        """Draw true labels and predicted labels side by side in a 4x2 grid.

        first_index is the 1-based subplot position of the left panel; the
        right panel goes in the next position. Both panels share the same
        axis limits, computed from points and padded with expand().
        titles is a (left_title, right_title) pair.
        """
        x1_min, x2_min = np.min(points, axis=0)
        x1_max, x2_max = np.max(points, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        for offset, (labels, title) in enumerate(zip((truth, predicted), titles)):
            plt.subplot(4, 2, first_index + offset)
            plt.title(title)
            plt.scatter(points[:, 0], points[:, 1], c=labels, s=30, cmap=cmap, edgecolors='none')
            plt.xlim((x1_min, x1_max))
            plt.ylim((x2_min, x2_max))
            plt.grid(True)


    if __name__ == "__main__":
        # Demo: how the shape of the data distribution affects KMeans results.
        N = 400
        centers = 4
        # Baseline: four blobs with equal spread.
        data, y = ds.make_blobs(N, n_features=2, centers=centers, random_state=2)
        # Same centers, but each blob has a different standard deviation.
        data2, y2 = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1,2.5,0.5,2), random_state=2)
        # Unbalanced sample: all of class 0, then 50 / 20 / 5 points of classes 1-3.
        parts = (data[y == 0], data[y == 1][:50], data[y == 2][:20], data[y == 3][:5])
        data3 = np.vstack(parts)
        # Derive the labels from the actual slice lengths so they stay aligned
        # with data3 even if N or centers is changed.
        y3 = np.concatenate([np.full(len(part), label) for label, part in enumerate(parts)])

        # The same estimator is refitted from scratch by every fit_predict call.
        cls = KMeans(n_clusters=4, init='k-means++')
        y_hat = cls.fit_predict(data)
        y2_hat = cls.fit_predict(data2)
        y3_hat = cls.fit_predict(data3)

        # Linear transform of the baseline data. The panel titles call it a
        # "rotation", but this matrix is not orthogonal, so it also stretches
        # and shears the blobs.
        m = np.array(((1, 1), (1, 3)))
        data_r = data.dot(m)
        y_r_hat = cls.fit_predict(data_r)

        # SimHei is needed to render the Chinese titles; with it selected the
        # unicode minus sign must be disabled so negative ticks display properly.
        matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
        matplotlib.rcParams['axes.unicode_minus'] = False
        cm = matplotlib.colors.ListedColormap(list('rgbm'))

        plt.figure(figsize=(9, 10), facecolor='w')
        _draw_pair(1, data, y, y_hat, (u'原始数据', u'KMeans++聚类'), cm)
        _draw_pair(3, data_r, y, y_r_hat, (u'旋转后数据', u'旋转后KMeans++聚类'), cm)
        _draw_pair(5, data2, y2, y2_hat, (u'方差不相等数据', u'方差不相等KMeans++聚类'), cm)
        _draw_pair(7, data3, y3, y3_hat, (u'数量不相等数据', u'数量不相等KMeans++聚类'), cm)

        # pad must be passed by keyword: recent matplotlib releases reject a
        # positional argument to tight_layout.
        plt.tight_layout(pad=2)
        plt.suptitle(u'数据分布对KMeans聚类的影响', fontsize=18)
        # Leave room for the suptitle, see
        # https://github.com/matplotlib/matplotlib/issues/829
        plt.subplots_adjust(top=0.92)
        plt.show()

    运行结果:

  • 相关阅读:
    QAbstractItemModel使用样例与解析(Model::index使用了createIndex,它会被销毁吗?被销毁了,因为栈对象出了括号就会被销毁)
    更多的人为了追求自己真正热爱的事,甚至会在职业生涯刚开始时拒绝许多高薪工作,这样的人最终都成了真正的赢家。
    MYSQL分库分表之sharding-jdbc第四篇
    MYSQL分库分表之 Sharding-JDBC第三篇
    MySQL分库分表之Sharding-JDBC第二篇
    MySQL分库分表之Sharding-JDBC第一篇
    增加复杂度的12危险信号
    ASP.NET-Core-Web-API-Best-Practices-Guide
    聚合
    浏览器输入www.baidu.com后干啥了-web性能优化指南
  • 原文地址:https://www.cnblogs.com/luckyplj/p/12703096.html
Copyright © 2011-2022 走看看