"""Minimal k-means clustering over the first four columns of ``data.csv``.

Distances are plain (squared) Euclidean distances between samples.  If the
feature columns are on different scales, normalise the data before
clustering, otherwise the widest-ranged column dominates the distance.
"""
import numpy as np
import pandas as pd


def _feature_columns(frame):
    """Return the columns of *frame* used as features (all except 'near')."""
    return [column for column in frame.columns if column != 'near']


def _squared_distances(points, centers):
    """Return the (n_points, n_centers) matrix of squared Euclidean distances.

    Both arguments are 2-D float arrays with the same number of columns.
    """
    diffs = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    return (diffs * diffs).sum(axis=2)


def nearest(traini, center):
    """Return the position (row number) in *center* closest to one sample.

    Args:
        traini: one sample, a Series (aligned to ``center.columns`` by label)
            or any 1-D sequence in the same column order as *center*.
        center: DataFrame with one centroid per row.

    Returns:
        The 0-based row position of the centroid with the smallest squared
        Euclidean distance; ties resolve to the first such row.

    Raises:
        ValueError: if *center* has no rows.
    """
    if isinstance(traini, pd.Series):
        # Align by column label, as the original Series arithmetic did.
        traini = traini.reindex(center.columns)
    point = np.asarray(traini, dtype=float).reshape(1, -1)
    distances = _squared_distances(point, center.to_numpy(dtype=float))
    return int(distances[0].argmin())


def zhidian(x):
    """Return the centroid of *x*: the sum of its rows divided by the row count."""
    return x.sum() / len(x)


def shoulian(train, center):
    """Return the convergence measure: total squared distance to assigned centroids.

    Args:
        train: DataFrame of samples holding a 'near' column whose values are
            index labels of *center*; every other column is a feature.
        center: DataFrame of centroids containing (at least) the feature
            columns of *train*.

    Returns:
        The sum over all samples of the squared Euclidean distance between
        the sample and the centroid named by its 'near' value.

    Raises:
        KeyError: if a 'near' value is not a label of ``center.index``.
    """
    features = _feature_columns(train)
    labels = train['near'].to_numpy()
    assigned = center.loc[labels, features].to_numpy(dtype=float)
    diffs = train[features].to_numpy(dtype=float) - assigned
    return float((diffs * diffs).sum())


def _update_centers(features, labels, center):
    """Return per-cluster means, keeping the old centroid of any empty cluster."""
    means = features.groupby(labels).mean()
    updated = means.reindex(center.index)
    empty = center.index.difference(means.index)
    if len(empty):
        # No sample chose these centroids; keep them where they were so the
        # number of clusters (and the label <-> row correspondence) is stable.
        updated.loc[empty, :] = center.loc[empty, :].to_numpy()
    return updated


def kmeans(train, center, julihe):
    """Run k-means until the total squared distance stops decreasing.

    Each pass assigns every sample to its nearest centroid, recomputes the
    centroids as cluster means and measures the total squared distance of the
    samples to the new centroids.  While that value is strictly smaller than
    the previous one, the new centroids are accepted and the pass repeats.

    Args:
        train: DataFrame of samples; every column except 'near' is a feature.
            The caller's frame is not modified.
        center: DataFrame of initial centroids with the same feature columns.
        julihe: initial threshold for the total squared distance; the first
            pass must beat it for any update to be accepted.

    Returns:
        ``(train, center)``: a copy of *train* with a 'near' column holding
        the 0-based row of the nearest returned centroid, and the last
        accepted centroids (re-indexed 0..k-1).

    Raises:
        ValueError: if *center* has no rows.
    """
    if len(center) == 0:
        raise ValueError("kmeans requires at least one initial centroid")

    features = _feature_columns(train)
    result = train.copy()  # never write the 'near' column into the caller's frame
    points = result[features].to_numpy(dtype=float)
    # Labels produced below are row positions, so make index == position.
    center = center[features].reset_index(drop=True)

    while True:
        print('return')
        # Nearest cluster for every sample.
        labels = _squared_distances(
            points, center.to_numpy(dtype=float)
        ).argmin(axis=1)
        result['near'] = labels
        # Recompute the centroids.
        candidate = _update_centers(result[features], labels, center)
        # Convergence condition: stop as soon as there is no strict improvement.
        newjulihe = shoulian(result, candidate)
        if not newjulihe < julihe:
            print(center)
            return result, center
        center, julihe = candidate, newjulihe


def sdasd(train, julihe, k=4):
    """Cluster *train* using its first *k* rows as the initial centroids.

    Args:
        train: DataFrame of samples.
        julihe: initial total-squared-distance threshold passed to ``kmeans``.
        k: number of initial centroids.  The default of 4 reproduces the
            original label-inclusive slice ``0:3``.
            NOTE(review): the original comment mentioned choosing 3 centroids
            at random; confirm whether ``k=3`` was intended.

    Returns:
        The ``(train, center)`` pair returned by ``kmeans``.
    """
    center = train.iloc[0:k, :]
    train, center = kmeans(train, center, julihe)
    return train, center


if __name__ == "__main__":
    data = pd.read_csv(r'data.csv')
    train = data.iloc[:, 0:4]
    julihe = 100000
    train, center = sdasd(train, julihe)