谱聚类通常会先对两两样本求相似度,然后依据相似度矩阵求出拉普拉斯矩阵,再将每一个样本映射到拉普拉斯矩阵的特征向量空间中,最后使用 k-means 聚类。
scikit-learn 开源包中已经有现成的接口能够使用,详细见 sklearn.cluster.spectral_clustering 的官方文档。
写了一个测试样例:
构造二维空间样本点。
#!/usr/bin/env python
"""Build a 2-D toy data set and its pairwise similarity list.

Running this script writes two text files into the current directory:

* ``points.txt``   -- one ``index x y`` line per sample point.
* ``sim_pnts.txt`` -- one ``index1 index2 similarity`` line for every
  ordered pair of points, including each point paired with itself.

The points lie on three rectangular grids: one with x > 0 and y >= 0, one
with x < 0 and y >= 0, and one with x < 0 and y < 0.
"""
import random
import numpy as np
import math

POINTS_PATH = "points.txt"
SIM_PATH = "sim_pnts.txt"

# Two points closer than this are treated as the same point.
MIN_DIST = 0.00001
# Similarity written for such pairs, where 1 / dist is not usable.
SAME_POINT_SIM = "10"


def build_points():
    """Return every sample point as an ``(index, (x, y))`` tuple.

    Indices are assigned consecutively from 0 in generation order: grid by
    grid, and within a grid x-major, y-minor.
    """
    grids = (
        (np.arange(0.1, 10., 0.5), np.arange(0., 10., 0.1)),
        (np.arange(-10.0, -0.1, 0.5), np.arange(0., 10., 0.1)),
        (np.arange(-10.0, -0.1, 0.5), np.arange(-10.0, 0., 0.1)),
    )
    pointlist = []
    for xs, ys in grids:
        for x in xs:
            for y in ys:
                # len(pointlist) is the next free index.
                pointlist.append((len(pointlist), (x, y)))
    return pointlist


def get_dist(pnt1, pnt2):
    """Return the Euclidean distance between two ``(index, (x, y))`` points."""
    dx = pnt1[1][0] - pnt2[1][0]
    dy = pnt1[1][1] - pnt2[1][1]
    return math.sqrt(dx ** 2 + dy ** 2)


def format_similarity(dist):
    """Return the similarity text for a pair of points ``dist`` apart.

    The similarity is ``1 / dist``; pairs closer than ``MIN_DIST`` get the
    fixed value ``SAME_POINT_SIM`` instead of dividing by (nearly) zero.
    """
    if dist <= MIN_DIST:
        return SAME_POINT_SIM
    return str(1.0 / dist)


def write_points(pointlist, path):
    """Write one ``index x y`` line per point to ``path``."""
    with open(path, 'w') as fd:
        for index, (x, y) in pointlist:
            fd.write("%s %s %s\n" % (index, x, y))


def write_similarities(pointlist, path):
    """Write one ``index1 index2 similarity`` line per ordered pair to ``path``.

    Every point is paired with every point (itself included), so the file
    has ``len(pointlist) ** 2`` lines.
    """
    with open(path, 'w') as simfd:
        for pnt1 in pointlist:
            for pnt2 in pointlist:
                sim = format_similarity(get_dist(pnt1, pnt2))
                simfd.write("%s %s %s\n" % (pnt1[0], pnt2[0], sim))


def main():
    """Generate the points and write both output files."""
    pointlist = build_points()
    write_points(pointlist, POINTS_PATH)
    write_similarities(pointlist, SIM_PATH)


if __name__ == "__main__":
    main()
使用谱聚类:
#!/usr/bin/env python
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
#          Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
"""Spectral clustering of the points in ``points.txt``.

Reads the point coordinates from ``points.txt`` (``index x y`` per line)
and the pairwise similarities from ``sim_pnts.txt`` (``index1 index2
similarity`` per line), clusters the points into three groups and writes
the coordinates of each group to ``cluster_0``, ``cluster_1``, ...
"""
import sys

import numpy as np
from sklearn.cluster import spectral_clustering
from scipy.sparse import coo_matrix

POINTS_PATH = "points.txt"
SIM_PATH = "sim_pnts.txt"
N_CLUSTERS = 3


def load_points(path):
    """Return a dict mapping point index -> ``(x, y)`` read from ``path``.

    If an index occurs more than once, the first occurrence is kept.
    """
    fid2fname = {}
    with open(path) as fd:
        for line in fd:
            fields = line.strip().split(' ')
            fid2fname.setdefault(
                int(fields[0]), (float(fields[1]), float(fields[2])))
    return fid2fname


def load_affinity(path, fid2fname):
    """Return the N x N similarity matrix as a ``coo_matrix``.

    ``path`` holds ``index1 index2 similarity`` lines; lines with fewer
    than three fields are skipped. ``N`` is ``len(fid2fname)``.
    """
    rowlist = []
    collist = []
    datalist = []
    with open(path) as fd:
        for line in fd:
            fields = line.strip().split(' ')
            if len(fields) < 3:
                continue
            f1, f2, sim = fields[:3]
            rowlist.append(int(f1))
            collist.append(int(f2))
            datalist.append(float(sim))

    # Add 1.0 on the diagonal for every known point. Duplicate (row, col)
    # entries of a COO matrix are summed on conversion, so this adds to any
    # self-similarity already listed in the file rather than replacing it.
    for fid in fid2fname:
        rowlist.append(int(fid))
        collist.append(int(fid))
        datalist.append(1.0)

    n = len(fid2fname)
    row = np.array(rowlist)
    col = np.array(collist)
    data = np.array(datalist)
    return coo_matrix((data, (row, col)), shape=(n, n))


def write_clusters(labels, fid2fname):
    """Write the coordinates of each cluster to its own ``cluster_<k>`` file.

    ``labels[i]`` is the cluster label of point ``i``. Files are numbered by
    the position of the label in the grouping dict, not by the label value.
    Each line is the ``(x, y)`` tuple of one point.
    """
    cluster2fid = {}
    for index, lab in enumerate(labels):
        cluster2fid.setdefault(lab, []).append(index)

    for index, lab in enumerate(cluster2fid):
        with open("cluster_%s" % index, "w") as fd:
            for fid in cluster2fid[lab]:
                fd.write("%s\n" % (fid2fname[fid],))


def main():
    """Load the data, run spectral clustering and write the cluster files."""
    fid2fname = load_points(POINTS_PATH)
    graph = load_affinity(SIM_PATH, fid2fname)

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(
        graph, n_clusters=N_CLUSTERS, eigen_solver='arpack')

    write_clusters(labels, fid2fname)


if __name__ == "__main__":
    main()
将聚类后的样本可视化:
#!/usr/bin/env python
"""Plot the clusters stored in ``cluster_0``, ``cluster_1`` and ``cluster_2``.

Each input line is expected to look like ``(x, y)``; every cluster is drawn
with its own marker and colour on a single figure.
"""
import matplotlib.pyplot as plt

# (cluster file, matplotlib format string) in plotting order.
CLUSTER_STYLES = (
    ("cluster_0", 'or'),
    ("cluster_1", 'xb'),
    ("cluster_2", '+g'),
)


def load_cluster(path):
    """Return ``(xs, ys)`` coordinate lists parsed from a cluster file.

    Every line is split on ``,``; the first character of the first field
    and the last character of the second field (the surrounding
    parentheses of an ``(x, y)`` line) are dropped before conversion.
    """
    xs = []
    ys = []
    with open(path) as fd:
        for line in fd:
            fields = line.strip().split(',')
            xs.append(float(fields[0][1:].strip()))
            ys.append(float(fields[1][:-1].strip()))
    return xs, ys


def main():
    """Draw all clusters on one figure and show it."""
    plt.figure(figsize=(12, 6))
    for path, style in CLUSTER_STYLES:
        xs, ys = load_cluster(path)
        plt.plot(xs, ys, style)
    plt.show()


if __name__ == "__main__":
    main()
不同颜色代表不同的聚类, 能够看到聚类效果还是不错的。