zoukankan      html  css  js  c++  java
  • 无监督学习

    数据准备

     1 %matplotlib notebook
     2 import numpy as np
     3 import pandas as pd
     4 import seaborn as sn
     5 import matplotlib.pyplot as plt
     6 from sklearn.datasets import load_breast_cancer
     7 
     8 # Breast cancer dataset
     9 cancer = load_breast_cancer()
    10 (X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)
    11 
    12 # Our sample fruits dataset
    13 fruits = pd.read_table('fruit_data_with_colors.txt')
    14 X_fruits = fruits[['mass','width','height', 'color_score']]
    15 y_fruits = fruits[['fruit_label']] - 1

    降维和流形学习

    PCA

    使用PCA查找乳腺癌数据集的前两个主要组成部分

     1 from sklearn.preprocessing import StandardScaler
     2 from sklearn.decomposition import PCA
     3 from sklearn.datasets import load_breast_cancer
     4 
     5 cancer = load_breast_cancer()
     6 (X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)
     7 
     8 # Before applying PCA, each feature should be centered (zero mean) and with unit variance
     9 X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)  
    10 #n_components设置主成分个数
    11 pca = PCA(n_components = 2).fit(X_normalized)
    12 
    13 X_pca = pca.transform(X_normalized)
    14 print(X_cancer.shape, X_pca.shape)
    (569, 30) (569, 2)


    绘制乳腺癌数据集的PCA转换版本
    1 from adspy_shared_utilities import plot_labelled_scatter
    2 plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])
    3 
    4 plt.xlabel('First principal component')
    5 plt.ylabel('Second principal component')
    6 plt.title('Breast Cancer Dataset PCA (n_components = 2)');

    画出各个因素对两个主成分的影响大小

     1 fig = plt.figure(figsize=(8, 4))
     2 plt.imshow(pca.components_, interpolation = 'none', cmap = 'plasma')
     3 feature_names = list(cancer.feature_names)
     4 
     5 plt.gca().set_xticks(np.arange(-.5, len(feature_names)));
     6 plt.gca().set_yticks(np.arange(0.5, 2));
     7 plt.gca().set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12);
     8 plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12);
     9 
    10 plt.colorbar(orientation='horizontal', ticks=[pca.components_.min(), 0, 
    11                                               pca.components_.max()], pad=0.65);

    PCA用于水果数据集(用于比较)

     1 from sklearn.preprocessing import StandardScaler
     2 from sklearn.decomposition import PCA
     3 
     4 # each feature should be centered (zero mean) and with unit variance
     5 X_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)  
     6 
     7 pca = PCA(n_components = 2).fit(X_normalized)
     8 X_pca = pca.transform(X_normalized)
     9 
    10 from adspy_shared_utilities import plot_labelled_scatter
    11 plot_labelled_scatter(X_pca, y_fruits, ['apple','mandarin','orange','lemon'])
    12 
    13 plt.xlabel('First principal component')
    14 plt.ylabel('Second principal component')
    15 plt.title('Fruits Dataset PCA (n_components = 2)');

    多种学习方法

    水果数据集上的多维缩放(MDS)

     1 from adspy_shared_utilities import plot_labelled_scatter
     2 from sklearn.preprocessing import StandardScaler
     3 from sklearn.manifold import MDS
     4 
     5 # each feature should be centered (zero mean) and with unit variance
     6 X_fruits_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)  
     7 
     8 mds = MDS(n_components = 2)
     9 
    10 X_fruits_mds = mds.fit_transform(X_fruits_normalized)
    11 
    12 plot_labelled_scatter(X_fruits_mds, y_fruits, ['apple', 'mandarin', 'orange', 'lemon'])
    13 plt.xlabel('First MDS feature')
    14 plt.ylabel('Second MDS feature')
    15 plt.title('Fruit sample dataset MDS');

     

    乳腺癌上的多维缩放

    t-SNE on the fruit dataset

    (结果不是很好)

     1 from sklearn.manifold import TSNE
     2 
     3 tsne = TSNE(random_state = 0)
     4 
     5 X_tsne = tsne.fit_transform(X_fruits_normalized)
     6 
     7 plot_labelled_scatter(X_tsne, y_fruits, 
     8     ['apple', 'mandarin', 'orange', 'lemon'])
     9 plt.xlabel('First t-SNE feature')
    10 plt.ylabel('Second t-SNE feature')
    11 plt.title('Fruits dataset t-SNE');

    t-SNE on the breast cancer dataset

    1 tsne = TSNE(random_state = 0)
    2 
    3 X_tsne = tsne.fit_transform(X_normalized)
    4 
    5 plot_labelled_scatter(X_tsne, y_cancer, 
    6     ['malignant', 'benign'])
    7 plt.xlabel('First t-SNE feature')
    8 plt.ylabel('Second t-SNE feature')
    9 plt.title('Breast cancer dataset t-SNE');

    聚类算法

    k-means

    #分成三个聚类中心
    1
    from sklearn.datasets import make_blobs 2 from sklearn.cluster import KMeans 3 from adspy_shared_utilities import plot_labelled_scatter 4 5 X, y = make_blobs(random_state = 10) 6 7 kmeans = KMeans(n_clusters = 3) 8 kmeans.fit(X) 9 10 plot_labelled_scatter(X, kmeans.labels_, ['Cluster 1', 'Cluster 2', 'Cluster 3'])

    #分成4个聚类中心
    1
    from sklearn.datasets import make_blobs 2 from sklearn.cluster import KMeans 3 from adspy_shared_utilities import plot_labelled_scatter 4 from sklearn.preprocessing import MinMaxScaler 5 6 fruits = pd.read_table('fruit_data_with_colors.txt') 7 X_fruits = fruits[['mass','width','height', 'color_score']].as_matrix() 8 y_fruits = fruits[['fruit_label']] - 1 9 10 X_fruits_normalized = MinMaxScaler().fit(X_fruits).transform(X_fruits) 11 12 kmeans = KMeans(n_clusters = 4, random_state = 0) 13 kmeans.fit(X_fruits_normalized) 14 15 plot_labelled_scatter(X_fruits_normalized, kmeans.labels_, 16 ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4'])

    Agglomerative clustering

     1 from sklearn.datasets import make_blobs
     2 from sklearn.cluster import AgglomerativeClustering
     3 from adspy_shared_utilities import plot_labelled_scatter
     4 
     5 X, y = make_blobs(random_state = 10)
     6 
     7 cls = AgglomerativeClustering(n_clusters = 3)
     8 cls_assignment = cls.fit_predict(X)
     9 
    10 plot_labelled_scatter(X, cls_assignment, 
    11         ['Cluster 1', 'Cluster 2', 'Cluster 3'])

    创建树状图(使用scipy)

    此树形图是基于上一步中使用make_blobs创建的数据集,但为了清楚起见,本示例仅选择了10个样本,如下所示:

    1 X, y = make_blobs(random_state = 10, n_samples = 10)
    2 plot_labelled_scatter(X, y, 
    3         ['Cluster 1', 'Cluster 2', 'Cluster 3'])
    4 print(X)

    [[  5.69192445  -9.47641249]
     [  1.70789903   6.00435173]
     [  0.23621041  -3.11909976]
     [  2.90159483   5.42121526]
     [  5.85943906  -8.38192364]
     [  6.04774884 -10.30504657]
     [ -2.00758803  -7.24743939]
     [  1.45467725  -6.58387198]
     [  1.53636249   5.11121453]
     [  5.4307043   -9.75956122]]

    这里的树状图对应于使用Ward方法的10点以上的凝聚聚类。 点的索引0..9对应于上面X数组中点的索引。
    例如,点0(5.69,-9.47)和点9(5.43,-9.76)是最接近的两个点,并且首先聚集在一起。
    1 from scipy.cluster.hierarchy import ward, dendrogram
    2 plt.figure()
    3 dendrogram(ward(X))
    4 plt.show()

    DBSCAN集群(基于密度的聚类)

     1 from sklearn.cluster import DBSCAN
     2 from sklearn.datasets import make_blobs
     3 
     4 X, y = make_blobs(random_state = 9, n_samples = 25)
     5 
     6 dbscan = DBSCAN(eps = 2, min_samples = 2)
     7 
     8 cls = dbscan.fit_predict(X)
     9 print("Cluster membership values:
    {}".format(cls))
    10 
    11 plot_labelled_scatter(X, cls + 1, 
    12         ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2'])
    Cluster membership values:
    [ 0  1  0  2  0  0  0  2  2 -1  1  2  0  0 -1  0  0  1 -1  1  1  2  2  2  1]



  • 相关阅读:
    波段是金牢记六大诀窍
    zk kafka mariadb scala flink integration
    Oracle 体系结构详解
    图解 Database Buffer Cache 内部原理(二)
    SQL Server 字符集介绍及修改方法演示
    SQL Server 2012 备份与还原详解
    SQL Server 2012 查询数据库中所有表的名称和行数
    SQL Server 2012 查询数据库中表格主键信息
    SQL Server 2012 查询数据库中所有表的索引信息
    图解 Database Buffer Cache 内部原理(一)
  • 原文地址:https://www.cnblogs.com/zhengzhe/p/8567726.html
Copyright © 2011-2022 走看看