zoukankan html css js c++ java

无监督学习

数据准备

 1 %matplotlib notebook
 2 import numpy as np
 3 import pandas as pd
 4 import seaborn as sn
 5 import matplotlib.pyplot as plt
 6 from sklearn.datasets import load_breast_cancer
 7 
 8 # Breast cancer dataset
 9 cancer = load_breast_cancer()
10 (X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)
11 
12 # Our sample fruits dataset
13 fruits = pd.read_table('fruit_data_with_colors.txt')
14 X_fruits = fruits[['mass','width','height', 'color_score']]
15 y_fruits = fruits[['fruit_label']] - 1

降维和流形学习

PCA

使用PCA查找乳腺癌数据集的前两个主要组成部分

 1 from sklearn.preprocessing import StandardScaler
 2 from sklearn.decomposition import PCA
 3 from sklearn.datasets import load_breast_cancer
 4 
 5 cancer = load_breast_cancer()
 6 (X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)
 7 
 8 # Before applying PCA, each feature should be centered (zero mean) and with unit variance
 9 X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)  
10 #n_components设置主成分个数
11 pca = PCA(n_components = 2).fit(X_normalized)
12 
13 X_pca = pca.transform(X_normalized)
14 print(X_cancer.shape, X_pca.shape)

(569, 30) (569, 2)


绘制乳腺癌数据集的PCA转换版本

1 from adspy_shared_utilities import plot_labelled_scatter
2 plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])
3 
4 plt.xlabel('First principal component')
5 plt.ylabel('Second principal component')
6 plt.title('Breast Cancer Dataset PCA (n_components = 2)');

画出各个因素对两个主成分的影响大小

 1 fig = plt.figure(figsize=(8, 4))
 2 plt.imshow(pca.components_, interpolation = 'none', cmap = 'plasma')
 3 feature_names = list(cancer.feature_names)
 4 
 5 plt.gca().set_xticks(np.arange(-.5, len(feature_names)));
 6 plt.gca().set_yticks(np.arange(0.5, 2));
 7 plt.gca().set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12);
 8 plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12);
 9 
10 plt.colorbar(orientation='horizontal', ticks=[pca.components_.min(), 0, 
11                                               pca.components_.max()], pad=0.65);

PCA用于水果数据集(用于比较)

 1 from sklearn.preprocessing import StandardScaler
 2 from sklearn.decomposition import PCA
 3 
 4 # each feature should be centered (zero mean) and with unit variance
 5 X_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)  
 6 
 7 pca = PCA(n_components = 2).fit(X_normalized)
 8 X_pca = pca.transform(X_normalized)
 9 
10 from adspy_shared_utilities import plot_labelled_scatter
11 plot_labelled_scatter(X_pca, y_fruits, ['apple','mandarin','orange','lemon'])
12 
13 plt.xlabel('First principal component')
14 plt.ylabel('Second principal component')
15 plt.title('Fruits Dataset PCA (n_components = 2)');

多种学习方法

水果数据集上的多维缩放（MDS）

 1 from adspy_shared_utilities import plot_labelled_scatter
 2 from sklearn.preprocessing import StandardScaler
 3 from sklearn.manifold import MDS
 4 
 5 # each feature should be centered (zero mean) and with unit variance
 6 X_fruits_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)  
 7 
 8 mds = MDS(n_components = 2)
 9 
10 X_fruits_mds = mds.fit_transform(X_fruits_normalized)
11 
12 plot_labelled_scatter(X_fruits_mds, y_fruits, ['apple', 'mandarin', 'orange', 'lemon'])
13 plt.xlabel('First MDS feature')
14 plt.ylabel('Second MDS feature')
15 plt.title('Fruit sample dataset MDS');

乳腺癌上的多维缩放

t-SNE on the fruit dataset

（结果不是很好)

 1 from sklearn.manifold import TSNE
 2 
 3 tsne = TSNE(random_state = 0)
 4 
 5 X_tsne = tsne.fit_transform(X_fruits_normalized)
 6 
 7 plot_labelled_scatter(X_tsne, y_fruits, 
 8     ['apple', 'mandarin', 'orange', 'lemon'])
 9 plt.xlabel('First t-SNE feature')
10 plt.ylabel('Second t-SNE feature')
11 plt.title('Fruits dataset t-SNE');

t-SNE on the breast cancer dataset

1 tsne = TSNE(random_state = 0)
2 
3 X_tsne = tsne.fit_transform(X_normalized)
4 
5 plot_labelled_scatter(X_tsne, y_cancer, 
6     ['malignant', 'benign'])
7 plt.xlabel('First t-SNE feature')
8 plt.ylabel('Second t-SNE feature')
9 plt.title('Breast cancer dataset t-SNE');

聚类算法

k-means

#分成三个聚类中心
 1 from sklearn.datasets import make_blobs
 2 from sklearn.cluster import KMeans
 3 from adspy_shared_utilities import plot_labelled_scatter
 4 
 5 X, y = make_blobs(random_state = 10)
 6 
 7 kmeans = KMeans(n_clusters = 3)
 8 kmeans.fit(X)
 9 
10 plot_labelled_scatter(X, kmeans.labels_, ['Cluster 1', 'Cluster 2', 'Cluster 3'])

#分成4个聚类中心
 1 from sklearn.datasets import make_blobs
 2 from sklearn.cluster import KMeans
 3 from adspy_shared_utilities import plot_labelled_scatter
 4 from sklearn.preprocessing import MinMaxScaler
 5 
 6 fruits = pd.read_table('fruit_data_with_colors.txt')
 7 X_fruits = fruits[['mass','width','height', 'color_score']].as_matrix()
 8 y_fruits = fruits[['fruit_label']] - 1
 9 
10 X_fruits_normalized = MinMaxScaler().fit(X_fruits).transform(X_fruits)  
11 
12 kmeans = KMeans(n_clusters = 4, random_state = 0)
13 kmeans.fit(X_fruits_normalized)
14 
15 plot_labelled_scatter(X_fruits_normalized, kmeans.labels_, 
16                       ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4'])

Agglomerative clustering

 1 from sklearn.datasets import make_blobs
 2 from sklearn.cluster import AgglomerativeClustering
 3 from adspy_shared_utilities import plot_labelled_scatter
 4 
 5 X, y = make_blobs(random_state = 10)
 6 
 7 cls = AgglomerativeClustering(n_clusters = 3)
 8 cls_assignment = cls.fit_predict(X)
 9 
10 plot_labelled_scatter(X, cls_assignment, 
11         ['Cluster 1', 'Cluster 2', 'Cluster 3'])

创建树状图（使用scipy）

此树形图是基于上一步中使用make_blobs创建的数据集，但为了清楚起见，本示例仅选择了10个样本，如下所示：

1 X, y = make_blobs(random_state = 10, n_samples = 10)
2 plot_labelled_scatter(X, y, 
3         ['Cluster 1', 'Cluster 2', 'Cluster 3'])
4 print(X)

[[  5.69192445  -9.47641249]
 [  1.70789903   6.00435173]
 [  0.23621041  -3.11909976]
 [  2.90159483   5.42121526]
 [  5.85943906  -8.38192364]
 [  6.04774884 -10.30504657]
 [ -2.00758803  -7.24743939]
 [  1.45467725  -6.58387198]
 [  1.53636249   5.11121453]
 [  5.4307043   -9.75956122]]

这里的树状图对应于使用Ward方法的10点以上的凝聚聚类。 点的索引0..9对应于上面X数组中点的索引。 
例如，点0（5.69，-9.47）和点9（5.43，-9.76）是最接近的两个点，并且首先聚集在一起。

1 from scipy.cluster.hierarchy import ward, dendrogram
2 plt.figure()
3 dendrogram(ward(X))
4 plt.show()

DBSCAN集群(基于密度的聚类)

 1 from sklearn.cluster import DBSCAN
 2 from sklearn.datasets import make_blobs
 3 
 4 X, y = make_blobs(random_state = 9, n_samples = 25)
 5 
 6 dbscan = DBSCAN(eps = 2, min_samples = 2)
 7 
 8 cls = dbscan.fit_predict(X)
 9 print("Cluster membership values:
{}".format(cls))
10 
11 plot_labelled_scatter(X, cls + 1, 
12         ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2'])

Cluster membership values:
[ 0  1  0  2  0  0  0  2  2 -1  1  2  0  0 -1  0  0  1 -1  1  1  2  2  2  1]

查看全文

相关阅读:
.NET Framework 3.0 和 Windows SDK
用 C# 开发 SQL Server 2005 的自定义聚合函数
 IronPython 源码剖析系列（2）：IronPython 引擎的运作流程
 IronPython 个人网站样例宝藏挖掘
 SetRenderMethodDelegate 方法
 使用 Castle ActiveRecord 开发发现的一些问题
 IronPython for ASP.NET CTP WhitePaper 摘要翻译
 关于 IE 模态对话框的两个问题
 逐步改用 IronPython 开发你的 ASP.NET 应用程序
 使用 Flash 和 C# WinForm 配合打造界面漂亮的应用程序（摘要）

原文地址：https://www.cnblogs.com/zhengzhe/p/8567726.html