zoukankan      html  css  js  c++  java
  • 聚类K-Means和大数据集的Mini Batch K-Means算法

    import numpy as np
    
    from sklearn.datasets import make_blobs
    from sklearn.cluster import KMeans
    from sklearn.metrics import pairwise_distances
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    from cycler import cycler
    
    from .tools import discrete_scatter
    from .plot_2d_separator import plot_2d_classification
    from .plot_helpers import cm3
    
    
    def plot_kmeans_algorithm():
        """Illustrate k-means step by step on a synthetic blob dataset.

        Draws a 3x3 grid of panels: the raw data, the initialization, and
        then alternating "assign points" / "recompute centers" panels for
        three iterations.  Several panels deliberately pair labels from one
        iteration with centers from the next, so the reader can see *why*
        the centers move — the exact order of the statements below matters.
        """

        X, y = make_blobs(random_state=1)
        # we don't want cyan in there
        with mpl.rc_context(rc={'axes.prop_cycle': cycler('color', ['#0000aa',
                                                                    '#ff2020',
                                                                    '#50ff50'])}):
            fig, axes = plt.subplots(3, 3, figsize=(10, 8), subplot_kw={'xticks': (), 'yticks': ()})
            axes = axes.ravel()
            axes[0].set_title("Input data")
            # Panel 0: unlabeled input, drawn in white ('o' markers only).
            discrete_scatter(X[:, 0], X[:, 1], ax=axes[0], markers=['o'], c='w')

            axes[1].set_title("Initialization")
            # Deterministic init: the first three samples act as the
            # initial cluster centers (triangles).
            init = X[:3, :]
            discrete_scatter(X[:, 0], X[:, 1], ax=axes[1], markers=['o'], c='w')
            discrete_scatter(init[:, 0], init[:, 1], [0, 1, 2], ax=axes[1],
                             markers=['^'], markeredgewidth=2)

            axes[2].set_title("Assign Points (1)")
            km = KMeans(n_clusters=3, init=init, max_iter=1, n_init=1).fit(X)
            # centers after one update step; reused in panels 3 and 4 below
            centers = km.cluster_centers_
            # need to compute labels by hand. scikit-learn does two e-steps for max_iter=1
            # (and it's totally my fault)
            labels = np.argmin(pairwise_distances(init, X), axis=0)
            discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
                             ax=axes[2])
            # still showing the *initial* centers alongside the first assignment
            discrete_scatter(init[:, 0], init[:, 1], [0, 1, 2],
                             ax=axes[2], markers=['^'], markeredgewidth=2)

            axes[3].set_title("Recompute Centers (1)")
            # same labels as panel 2, but now with the recomputed centers
            discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
                             ax=axes[3])
            discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
                             ax=axes[3], markers=['^'], markeredgewidth=2)

            axes[4].set_title("Reassign Points (2)")
            # refit so labels_ reflects assignment to the iteration-1 centers
            km = KMeans(n_clusters=3, init=init, max_iter=1, n_init=1).fit(X)
            labels = km.labels_
            discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
                             ax=axes[4])
            discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
                             ax=axes[4], markers=['^'], markeredgewidth=2)

            km = KMeans(n_clusters=3, init=init, max_iter=2, n_init=1).fit(X)
            axes[5].set_title("Recompute Centers (2)")
            # iteration-2 centers with the iteration-1 labels from panel 4
            centers = km.cluster_centers_
            discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
                             ax=axes[5])
            discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
                             ax=axes[5], markers=['^'], markeredgewidth=2)

            axes[6].set_title("Reassign Points (3)")
            # labels from the max_iter=2 fit, paired with its own centers;
            # the return value supplies artists for the shared legend below
            labels = km.labels_
            discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
                             ax=axes[6])
            markers = discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
                                       ax=axes[6], markers=['^'],
                                       markeredgewidth=2)

            axes[7].set_title("Recompute Centers (3)")
            km = KMeans(n_clusters=3, init=init, max_iter=3, n_init=1).fit(X)
            centers = km.cluster_centers_
            discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
                             ax=axes[7])
            discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
                             ax=axes[7], markers=['^'], markeredgewidth=2)
            # last grid cell carries only the legend
            axes[8].set_axis_off()
            axes[8].legend(markers, ["Cluster 0", "Cluster 1", "Cluster 2"], loc='best')
    
    
    def plot_kmeans_boundaries():
        """Show k-means cluster regions: points colored by assignment,
        center markers, and the induced decision boundaries shaded on top."""
        data, _ = make_blobs(random_state=1)
        # deterministic init: the first three samples seed the centers
        initial_centers = data[:3, :]
        model = KMeans(n_clusters=3, init=initial_centers, max_iter=2, n_init=1)
        model.fit(data)
        discrete_scatter(data[:, 0], data[:, 1], model.labels_, markers=['o'])
        final_centers = model.cluster_centers_
        discrete_scatter(final_centers[:, 0], final_centers[:, 1],
                         [0, 1, 2], markers=['^'], markeredgewidth=2)
        # translucent shading so the scatter stays visible underneath
        plot_2d_classification(model, data, cm=cm3, alpha=.4)
    
    
    def plot_kmeans_faces(km, pca, X_pca, X_people, y_people, target_names):
        """For each of 10 clusters, show the (PCA-reconstructed) center image
        followed by the five faces closest to and the five farthest from it.

        Parameters are a fitted KMeans (`km`), the PCA used to embed the
        faces (`pca`), the embedded data (`X_pca`), the raw images and
        labels (`X_people`, `y_people`), and the label-name lookup
        (`target_names`).
        """
        n_clusters = 10
        image_shape = (87, 65)
        fig, axes = plt.subplots(n_clusters, 11, subplot_kw={'xticks': (), 'yticks': ()},
                                 figsize=(10, 15), gridspec_kw={"hspace": .3})

        for row in range(n_clusters):
            center = km.cluster_centers_[row]
            in_cluster = km.labels_ == row
            sq_dists = np.sum((X_pca - center) ** 2, axis=1)
            # closest five members: push non-members to +inf so they sort last
            sq_dists[~in_cluster] = np.inf
            chosen = np.argsort(sq_dists)[:5]
            # farthest five members: push non-members to -inf so they sort first
            sq_dists[~in_cluster] = -np.inf
            chosen = np.r_[chosen, np.argsort(sq_dists)[-5:]]
            # column 0: the cluster center mapped back to pixel space
            axes[row, 0].imshow(pca.inverse_transform(center).reshape(image_shape), vmin=0, vmax=1)
            for face, name_idx, ax in zip(X_people[chosen], y_people[chosen],
                                          axes[row, 1:]):
                ax.imshow(face.reshape(image_shape), vmin=0, vmax=1)
                # surname only, to keep the tiny titles readable
                ax.set_title("%s" % (target_names[name_idx].split()[-1]), fontdict={'fontsize': 9})

        # add some boxes to illustrate which are similar and which dissimilar
        for col, width, caption in [(0, 73, "Center"),
                                    (1, 385, "Close to center"),
                                    (6, 385, "Far from center")]:
            box = plt.Rectangle([-5, -30], width, 1295, fill=False, lw=2)
            box = axes[0, col].add_patch(box)
            box.set_clip_on(False)
            axes[0, col].text(0, -40, caption)

    过程解析:

    在大数据集的情况下,还可以使用 scikit-learn 提供的 MiniBatchKMeans 算法。其大致思想是对数据进行抽样,每次不使用所有的数据来计算,因此会带来一定的准确率损失。

    MiniBatchKMeans 继承自 KMeans,因为 MiniBatchKMeans 本质上还是利用了 KMeans 的思想。从构造方法和文档大致能看到这些参数的含义,了解了这些参数会对使用的时候有很大的帮助。batch_size 是每次选取的用于计算的数据的样本量,默认为 100。Mini Batch K-Means 算法是 K-Means 算法的变种,采用小批量的数据子集减小计算时间,同时仍试图优化目标函数。这里所谓的小批量是指每次训练算法时所随机抽取的数据子集,采用这些随机产生的子集进行训练,大大减小了计算时间。与标准算法相比,小批量 k-均值减少了收敛时间,其产生的结果一般只略差于标准算法。

    代码只需要修改一行:

    clf = MiniBatchKMeans(n_clusters = 3)
  • 相关阅读:
    Java基础知识强化80:Math类random()方法的小扩展(控制产生目的数字的概率)
    Java基础知识强化79:被遗忘的Java Math类
    Java基础知识强化78:正则表达式之获取功能(案例)
    Java基础知识强化77:正则表达式之获取功能(Pattern 和 Matcher类的使用)
    Java基础知识强化76:正则表达式之替换功能
    Java基础知识强化75:正则表达式之分割功能(字符串中的数字排序案例)
    asp.net页面过滤所有换行符和多余空格
    nvelocity学习笔记
    ASP.NET session时间的设置
    Server的Transfer和Response的Redirect
  • 原文地址:https://www.cnblogs.com/starcrm/p/11690865.html
Copyright © 2011-2022 走看看