zoukankan      html  css  js  c++  java
  • 【ML-7】聚类算法-实例代码

    目录

    1. K-Means算法和Mini Batch K-Means算法比较
    2. 层次聚类(BIRCH)算法参数比较
    3. DBSCAN算法

    一、K-Means算法和Mini Batch K-Means算法比较

    1

    2

    3

    4

    5

    6

    7

    8

    9

    10

    11

    12

    13

    14

    15

    16

    17

    18

    19

    20

    21

    22

    23

    24

    25

    26

    27

    28

    29

    30

    31

    32

    33

    34

    35

    36

    37

    38

    39

    40

    41

    42

    43

    44

    45

    46

    47

    48

    49

    50

    51

    52

    53

    54

    55

    56

    57

    58

    59

    60

    61

    62

    63

    64

    65

    66

    67

    68

    69

    70

    71

    72

    73

    74

    75

    76

    77

    78

    79

    80

    81

    82

    83

    84

    85

    86

    87

    88

    89

    90

    91

    92

    93

    94

    95

    96

    97

    98

    99

    # Author:yifan

    import time

    import numpy as np

    import matplotlib.pyplot as plt

    import matplotlib as mpl

    from sklearn.cluster import MiniBatchKMeans,KMeans

    from sklearn.metrics.pairwise import pairwise_distances_argmin

    from sklearn.datasets.samples_generator import make_blobs

    ## 设置属性防止中文乱码

    mpl.rcParams['font.sans-serif'] = [u'SimHei']

    mpl.rcParams['axes.unicode_minus']=False

    #初始化三个中心

    centers = [[1,1],[-1,-1],[1,-1]]

    clusters = len(centers) #聚类的数目为3

    #产生3000组二维的数据,中心是意思三个中心点,标准差是.7

    X,Y = make_blobs(n_samples=28000,centers=centers,cluster_std=0.7,random_state=28)

    #构建kmeans算法

    k_means = KMeans(init='k-means++',n_clusters=clusters,random_state=28)

    t0 = time.time() #current time

    k_means.fit(X) #trainning mode

    km_batch = time.time() - t0 #the spend of trainning mode

    print("K-means算法所需要的时间:%.4fs" % km_batch)

    #构建MiniBatchKMeans算法

    batch_size = 100

    mbk = MiniBatchKMeans(init='k-means++',n_clusters=clusters,batch_size=batch_size,random_state=28)

    t0=time.time()

    mbk.fit(X)

    mbk_batch = time.time()-t0 ##the spend of trainning mode

    print ("Mini Batch K-Means算法模型训练消耗时间:%.4fs" % mbk_batch)

       

    #预测结果

    km_y_hat = k_means.predict(X)

    mbkm_y_hat = mbk.predict(X)

    print(km_y_hat[:10])

    print(mbkm_y_hat[:10])

    print(k_means.cluster_centers_)

    print(mbk.cluster_centers_)

    ##获取聚类中心点并聚类中心点进行排序

    k_means_cluster_centers = k_means.cluster_centers_ #输出kmeans聚类中心点

    mbk_means_cluster_centers = mbk.cluster_centers_ #输出mbk聚类中心点

    print ("K-Means算法聚类中心点: center=", k_means_cluster_centers)

    print ("Mini Batch K-Means算法聚类中心点: center=", mbk_means_cluster_centers)

    order = pairwise_distances_argmin(k_means_cluster_centers, mbk_means_cluster_centers) #array([1, 2, 0], dtype=int64)

    #方便后面画图

       

    #画图

    plt.figure(figsize=(12,6),facecolor='w')

    plt.subplots_adjust(left=0.05,right=0.95,bottom=0.05,top=0.9)

    cm = mpl.colors.ListedColormap(['#FFC2CC', '#C2FFCC', '#CCC2FF'])

    cm2 = mpl.colors.ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    #子图1:原始数据

    plt.subplot(221)

    plt.scatter(X[:,0],X[:,1],c=Y,s=6,cmap = cm,edgecolors='none')

    plt.title(u'原始数据分布图')

    plt.xticks(())

    plt.yticks(())

    plt.grid(True)

    #子图2K-Means算法聚类结果图

    plt.subplot(222)

    plt.scatter(X[:,0],X[:,1],c=km_y_hat,s=6,cmap=cm,edgecolors='none')

    plt.scatter(k_means_cluster_centers[:,0], k_means_cluster_centers[:,1],c=range(clusters),s=60,cmap=cm2,edgecolors='none')

    plt.title(u'K-Means算法聚类图')

    plt.xticks(())

    plt.yticks(())

    plt.text(-3.8, 3, 'train time: %.2fms' % (km_batch*1000))

    plt.grid(True)

    #子图3:Mini Batch K-Means算法聚类结果图

    plt.subplot(223)

    plt.scatter(X[:,0], X[:,1], c=km_y_hat, s=6, cmap=cm,edgecolors='none')

    plt.scatter(mbk_means_cluster_centers[:,0],mbk_means_cluster_centers[:,1],c=range(clusters),s=60,cmap=cm2,edgecolors='none')

    plt.title(u'Mini Batch K-Means算法聚类结果图')

    plt.xticks(())

    plt.yticks(())

    plt.text(-3.8, 3, 'train time: %.2fms' % (mbk_batch*1000))

    plt.grid(True)

    #统计不同的数据个数

    different = list(map(lambda x:(x!=0) & (x!=1) &(x!=2),mbkm_y_hat))

    for k in range(clusters):

    different += ((km_y_hat == k) != (mbkm_y_hat == order[k]))

    idendic = np.logical_not(different)

    different_nodes = len(list(filter(lambda x:x, different)))

    #根据上面的统计,画出预测异同点

    plt.subplot(224)

    #相同部分;

    plt.plot(X[idendic,0],X[idendic,1],'w',markerfacecolor = '#bbbbbb',marker='.')

    #不相同部分;

    plt.plot(X[different, 0], X[different, 1], 'w', markerfacecolor='m', marker='.')

    plt.title(u'Mini Batch K-MeansK-Means算法预测结果不同的点')

    plt.xticks(())

    plt.yticks(())

    plt.text(-3.8, 2, 'different nodes: %d' % (different_nodes))

    plt.show()

    结果:

    K-means算法所需要的时间:0.1945s

    Mini Batch K-Means算法模型训练消耗时间:0.0728s

    [2 1 2 0 0 1 2 2 0 0]

    [1 0 1 2 2 0 1 1 2 2]

    [[ 1.05236122 -1.06341323]

    [ 1.00290173 1.03159141]

    [-1.03255552 -1.00256646]]

    [[ 0.93636615 1.04956555]

    [-0.96042372 -1.04840814]

    [ 1.18516308 -1.00675752]]

    K-Means算法聚类中心点:

    center= [[ 1.05236122 -1.06341323]

    [ 1.00290173 1.03159141]

    [-1.03255552 -1.00256646]]

    Mini Batch K-Means算法聚类中心点:

    center= [[ 0.93636615 1.04956555]

    [-0.96042372 -1.04840814]

    [ 1.18516308 -1.00675752]]

    Mini Batch K-Mean效果差一点,但是速度快。

    二、层次聚类(BIRCH)算法参数比较

    1

    2

    3

    4

    5

    6

    7

    8

    9

    10

    11

    12

    13

    14

    15

    16

    17

    18

    19

    20

    21

    22

    23

    24

    25

    26

    27

    28

    29

    30

    31

    32

    33

    34

    35

    36

    37

    38

    39

    40

    41

    42

    43

    44

    45

    46

    47

    48

    49

    50

    51

    52

    53

    54

    55

    56

    57

    58

    59

    60

    61

    62

    63

    64

    65

    66

    67

    68

    69

    70

    71

    72

    73

    # Author:yifan

    from itertools import cycle

    from time import time

    import numpy as np

    import matplotlib as mpl

    import matplotlib.pyplot as plt

    import matplotlib.colors as colors

    from sklearn.preprocessing import StandardScaler

    from sklearn.cluster import Birch

    from sklearn.datasets.samples_generator import make_blobs

       

    ## 设置属性防止中文乱码

    mpl.rcParams['font.sans-serif'] = [u'SimHei']

    mpl.rcParams['axes.unicode_minus'] = False

       

    ## 产生模拟数据

    xx = np.linspace(-22, 22, 10)

    yy = np.linspace(-22, 22, 10)

    xx, yy = np.meshgrid(xx, yy)

    n_centres = np.hstack((np.ravel(xx)[:, np.newaxis],np.ravel(yy)[:, np.newaxis]))

    #产生10万条特征属性是2,类别是100,符合高斯分布的数据集

    X, y = make_blobs(n_samples=100000,n_features=2, centers=n_centres, random_state=28)

       

    #创建不同的参数(簇直径)Birch层次聚类

    birch_models = [

    Birch(threshold=1.7, n_clusters=None), #运行的函数

    Birch(threshold=0.5, n_clusters=None),

    Birch(threshold=1.7, n_clusters=100)]

    #threshold:簇直径的阈值, branching_factor:大叶子个数

       

    #我们也可以加参数来试一下效果,比如加入分支因子branching_factor,给定不同的参数值,看聚类的结果

    ## 画图

    final_step = [u'直径=1.7;n_lusters=None', u'直径=0.5;n_clusters=None', u'直径=1.7;n_lusters=100']

       

    plt.figure(figsize=(12, 8), facecolor='w')

    plt.subplots_adjust(left=0.02, right=0.98, bottom=0.1, top=0.9)

    colors_ = cycle(colors.cnames.keys())

    cm = mpl.colors.ListedColormap(colors.cnames.keys())

       

    for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)):

    t = time()

    birch_model.fit(X)

    time_ = time() - t

    # 获取模型结果(label和中心点)

    labels = birch_model.labels_

    centroids = birch_model.subcluster_centers_

    n_clusters = len(np.unique(centroids))

    print("Birch算法,参数信息为:%s;模型构建耗时为:%.3f秒;聚类中心数目:%d" % (info, time_, len(np.unique(labels))))

    ## 画图

    subinx = 222 + ind

    plt.subplot(subinx)

    for this_centroid, k, col in zip(centroids, range(n_clusters), colors_):

    mask = labels == k

    plt.plot(X[mask, 0], X[mask, 1], 'w', markerfacecolor=col, marker='.')

    if birch_model.n_clusters is None:

    plt.plot(this_centroid[0], this_centroid[1], '*', markerfacecolor=col, markeredgecolor='k', markersize=2)

    plt.ylim([-25, 25])

    plt.xlim([-25, 25])

    plt.title(u'Birch算法%s,耗时%.3fs' % (info, time_))

    plt.grid(False)

       

    ## 原始数据集显示

    plt.subplot(221)

    plt.scatter(X[:, 0], X[:, 1], c=y, s=1, cmap=cm, edgecolors='none')

    plt.ylim([-25, 25])

    plt.xlim([-25, 25])

    plt.title(u'原始数据')

    plt.grid(False)

    plt.show()

    结果:

    Birch算法,参数信息为:直径=1.7;n_lusters=None;模型构建消耗时间为:3.250秒;聚类中心数目:171

    Birch算法,参数信息为:直径=0.5;n_clusters=None;模型构建消耗时间为:8.347秒;聚类中心数目:3205

    Birch算法,参数信息为:直径=1.7;n_lusters=100;模型构建消耗时间为:3.333秒;聚类中心数目:100

    三、DBSCAN算法

    1

    2

    3

    4

    5

    6

    7

    8

    9

    10

    11

    12

    13

    14

    15

    16

    17

    18

    19

    20

    21

    22

    23

    24

    25

    26

    27

    28

    29

    30

    31

    32

    33

    34

    35

    36

    37

    38

    39

    40

    41

    42

    43

    44

    45

    46

    47

    48

    49

    50

    51

    52

    53

    54

    55

    56

    57

    58

    59

    60

    61

    62

    63

    64

    65

    66

    67

    68

    69

    70

    71

    72

    73

    74

    75

    76

    77

    78

    79

    80

    81

    82

    # Author:yifan

    import numpy as np

    import matplotlib as mpl

    import matplotlib.pyplot as plt

    import sklearn.datasets as ds

    import matplotlib.colors

    from sklearn.cluster import DBSCAN

    from sklearn.preprocessing import StandardScaler

       

    ## 设置属性防止中文乱码及拦截异常信息

    mpl.rcParams['font.sans-serif'] = [u'SimHei']

    mpl.rcParams['axes.unicode_minus'] = False

    ### 创建模拟数据

    N = 1000

    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]

    data1, y1 = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1,0.75, 0.5,0.25), random_state=0)

    data1 = StandardScaler().fit_transform(data1)

    params1 = ((0.15, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15)) #六种参数,后面画出六个图例

       

    t = np.arange(0, 2 * np.pi, 0.1)

    data2_1 = np.vstack((np.cos(t), np.sin(t))).T

    data2_2 = np.vstack((2*np.cos(t), 2*np.sin(t))).T

    data2_3 = np.vstack((3*np.cos(t), 3*np.sin(t))).T

    data2 = np.vstack((data2_1, data2_2, data2_3))

    y2 = np.vstack(([0] * len(data2_1), [1] * len(data2_2), [2] * len(data2_3))).reshape(-1,)

    params2 = ((0.5, 3), (0.5, 5), (0.5, 10), (1., 3), (1., 10), (1., 20))

       

    datasets = [(data1, y1,params1), (data2, y2,params2)]

       

    def expandBorder(a, b):

    d = (b - a) * 0.1

    return a-d, b+d

    colors = ['r', 'g', 'b', 'y', 'c', 'k']

    cm = mpl.colors.ListedColormap(colors)

       

    for i, (X, y, params) in enumerate(datasets):

    x1_min, x2_min = np.min(X, axis=0)

    x1_max, x2_max = np.max(X, axis=0)

    x1_min, x1_max = expandBorder(x1_min, x1_max)

    x2_min, x2_max = expandBorder(x2_min, x2_max)

       

    plt.figure(figsize=(12, 8), facecolor='w')

    plt.suptitle(u'DBSCAN聚类-数据%d' % (i + 1), fontsize=18)

    plt.subplots_adjust(top=0.9, hspace=0.35)

       

    for j, param in enumerate(params):

    eps, min_samples = param

    model = DBSCAN(eps=eps, min_samples=min_samples)

    # eps 半径,控制邻域的大小,值越大,越能容忍噪声点,值越小,相比形成的簇就越多

    # min_samples 原理中所说的M,控制哪个是核心点,值越小,越可以容忍噪声点,越大,就更容易把有效点划分成噪声点

    model.fit(X)

    y_hat = model.labels_

       

    unique_y_hat = np.unique(y_hat)

    n_clusters = len(unique_y_hat) - (1 if -1 in y_hat else 0)

    print("类别:", unique_y_hat, ";聚类簇数目:", n_clusters)

       

    core_samples_mask = np.zeros_like(y_hat, dtype=bool)

    core_samples_mask[model.core_sample_indices_] = True

       

    ## 开始画图

    plt.subplot(3, 3, j + 1)

    for k, col in zip(unique_y_hat, colors):

    if k == -1:

    col = 'k'

    class_member_mask = (y_hat == k)

    xy = X[class_member_mask & core_samples_mask]

    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]

    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)

    plt.xlim((x1_min, x1_max))

    plt.ylim((x2_min, x2_max))

    plt.grid(True)

    plt.title('$epsilon$ = %.1f m = %d,聚类簇数目:%d' % (eps, min_samples, n_clusters), fontsize=12)

    ## 原始数据显示

    plt.subplot(3, 3, 7)

    plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cm, edgecolors='none')

    plt.xlim((x1_min, x1_max))

    plt.ylim((x2_min, x2_max))

    plt.title('原始数据,聚类簇数目:%d' % len(np.unique(y)))

    plt.grid(True)

    plt.show()

    类别: [-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] ;聚类簇数目: 16

    类别: [-1 0 1 2 3 4] ;聚类簇数目: 5

    类别: [-1 0 1 2 3 4] ;聚类簇数目: 5

    类别: [-1 0 1 2] ;聚类簇数目: 3

    类别: [-1 0] ;聚类簇数目: 1

    类别: [-1 0] ;聚类簇数目: 1

       

    类别: [0 1 2] ;聚类簇数目: 3

    类别: [-1 0 1] ;聚类簇数目: 2

    类别: [-1 0] ;聚类簇数目: 1

    类别: [0] ;聚类簇数目: 1

    类别: [-1 0] ;聚类簇数目: 1

    类别: [-1 0] ;聚类簇数目: 1

       

       

  • 相关阅读:
    如何快速搞定websocket
    websocket断网消息补发
    div嵌套多个点击事件,点击后如何阻止多次事件触发冒泡
    仿照 MediatR实现了一个中介者模式Publish功能,使用同MediatR
    git提交指南(超级详细)
    删除github中的文件夹
    抽取进程集成模式注册报错,OGG-08221,OCI Error ORA-44004 invalid qualified SQL Name
    Oracle-参数学习_no_or_expansion
    OGG19版本源端新增字段,目标端复制进程不报错,使用MAPALLCOLUMNS进行测试
    Oracle存储过程如何定位慢SQL?
  • 原文地址:https://www.cnblogs.com/yifanrensheng/p/12354894.html
Copyright © 2011-2022 走看看