K-均值算法的基本思想是首先从含有N个数据对象的数据集中随机选择K个数据对象作为初始中心,然后计算每个数据对象到各中心的距离,根据最近邻原则,所有数据对象将会被划分到离它最近的那个中心所代表的簇中,接着分别计算新生成的各个簇中数据对象的均值作为各簇新的中心,比较新的中心和上一次得到的中心,如果新的中心没有发生变化,则算法收敛,输出结果,如果新的中心和上一次的中心相比发生变化,则要根据新的中心对所有数据对象重新进行划分,直到满足算法的收敛条件为止。
K-means算法的过程可以描述为:
算法:划分并计算基于簇中对象的平均值。
输入:簇的数目K和包含N个对象的数据库。
输出:平方误差总和最小条件下的K个簇。
方法:
1) 任意选择K个对象作为初始的簇中心;
2) 分别计算数据集中每个元素与各簇中心的距离(一般采用欧氏距离),根据最近邻原则,将元素划分到相应的簇中;
3) 计算每个簇中对象的平均值,更新簇的中心;
4) 重复上面的步骤,直至更新的簇的中心与原簇的中心的差值在预定范围内或者达到预设的迭代次数;
5) 输出K个簇中心。
K-means 方法的时间复杂度为O(NKT),N代表总元素个数,K代表簇中心个数,T代表迭代次数。K-means算法是一种硬性划分的聚类,即每个数据点唯一地分配给一个聚类,由于事先不知道实际的聚类情况,因此这可能是一种严重的局限。该算法对初始中心的选取非常敏感,初始中心随机选取,导致结果波动较大,稳定性较差。同时该算法对噪声数据和孤立点数据较为敏感。该算法通常采用欧氏距离作为数据样本之间的度量方式,导致该算法对球状的簇有比较好的聚类效果,但是很难发现其他形状的簇。
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
#ifndef KMEANS_H
#define KMEANS_H

#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <time.h>
#include <stdlib.h>

using namespace std;

// NOTE: helpers are defined before their callers. The arguments are pointers
// to fundamental types, so argument-dependent lookup cannot find functions
// declared later in the file; a conforming compiler requires them to be
// visible at the point where the calling template is defined.

/**
 * Squared Euclidean distance between a data point and a cluster center.
 *
 * @param data_i     Data point, an array of `dimension` values.
 * @param centers_i  Cluster center, an array of `dimension` values.
 * @param dimension  Number of components compared.
 * @return Sum of squared component differences, or -1 converted to DataType
 *         when either pointer is NULL.
 */
template <typename DataType>
DataType calculate_dist(DataType* &data_i, DataType* &centers_i, int dimension)
{
    if (data_i == NULL || centers_i == NULL)
    {
        return static_cast<DataType>(-1);
    }

    DataType dist = 0;
    for (int j = 0; j < dimension; j++)
    {
        const DataType delta = data_i[j] - centers_i[j];
        dist += delta * delta;
    }
    return dist;
}

/**
 * Finds the index of the center nearest to a data point.
 *
 * @param data_i     Data point, an array of `dimension` values.
 * @param centers    K cluster centers, each an array of `dimension` values.
 * @param K          Number of centers.
 * @param dimension  Number of components per point.
 * @return Index in [0, K) of the nearest center (the lowest index wins ties),
 *         or -1 when K is not positive.
 */
template <typename DataType>
int select_center(DataType* &data_i, DataType** &centers, int K, int dimension)
{
    // K == 0 must be rejected as well: centers[0] is read unconditionally below.
    if (K <= 0)
        return -1;

    int label = 0;
    DataType min_dist = calculate_dist(data_i, centers[0], dimension);
    for (int i = 1; i < K; i++)
    {
        const DataType dist = calculate_dist(data_i, centers[i], dimension);
        if (min_dist > dist)
        {
            min_dist = dist;
            label = i;
        }
    }
    return label;
}

/**
 * Initializes the K centers by copying data points into `centers`.
 *
 * The data is split into K consecutive strata of data_number / K points and
 * one random point is taken from each stratum. When there are fewer points
 * than centers, the points are reused cyclically.
 *
 * @param data         Input points, data_number arrays of `dimension` values.
 * @param data_number  Number of input points.
 * @param dimension    Number of components per point.
 * @param centers      Pre-allocated K x dimension output array.
 * @param K            Number of centers to initialize.
 */
template <typename DataType>
void rand_init_centers(DataType** &data, int data_number, int dimension,
                       DataType** &centers, int K)
{
    if (data_number <= 0 || K <= 0)
        return;

    // Seed once per call. Re-seeding with the same clock value inside the
    // loop would restart the sequence and give every center the same offset.
    srand(static_cast<unsigned int>(time(NULL)));

    const int step = data_number / K;
    for (int i = 0; i < K; i++)
    {
        // step == 0 means data_number < K; avoid the modulo by zero.
        const int index = (step > 0) ? (i * step + rand() % step)
                                     : (i % data_number);
        for (int j = 0; j < dimension; j++)
        {
            centers[i][j] = data[index][j];
        }
    }
}

/**
 * Reads whitespace-separated values from a text file into a newly allocated
 * 2-D array with `dimension` values per row.
 *
 * Trailing values that do not fill a complete row are dropped. The caller
 * owns `data` and must release every row and the row table with delete[].
 *
 * @param file_path  Path of the input file.
 * @param data       Receives the allocated rows; left untouched on failure.
 * @param dimension  Number of values per row, must be positive.
 * @return Number of rows read, or -1 when `dimension` is not positive or the
 *         file cannot be opened.
 */
template <typename DataType>
int readData(const char* file_path, DataType** &data, int dimension)
{
    if (file_path == NULL || dimension <= 0)
        return -1;

    ifstream infile(file_path);
    if (!infile.is_open())
        return -1;

    // First pass: count the values. Testing the extraction itself (instead
    // of eof()) avoids counting one extra value after the last successful
    // read and terminates on any stream failure.
    int value_count = 0;
    DataType datum;
    while (infile >> datum)
    {
        value_count++;
    }
    const int data_number = value_count / dimension;

    // Second pass: rewind and read the values into place.
    infile.clear();
    infile.seekg(0, ios::beg);

    data = new DataType*[data_number];
    for (int i = 0; i < data_number; i++)
    {
        data[i] = new DataType[dimension];
        for (int j = 0; j < dimension; j++)
        {
            infile >> data[i][j];
        }
    }
    return data_number;
}

/**
 * Clusters the points with K-means.
 *
 * Each round assigns every point to its nearest center, then moves each
 * non-empty cluster's center to the mean of its points. Iteration stops when
 * the summed squared movement of the centers is at most `threshold` or after
 * `iterations` rounds, whichever comes first. One progress line
 * ("<round> <movement>") is printed to standard output per round.
 *
 * `centers` (K arrays of `dimension` values) and `labels` (data_number
 * entries) are allocated here; the caller must release them with delete[].
 * Both are left untouched when the arguments are invalid.
 *
 * @param data         Input points, data_number arrays of `dimension` values.
 * @param data_number  Number of input points.
 * @param dimension    Number of components per point.
 * @param centers      Receives the K cluster centers.
 * @param K            Number of clusters.
 * @param labels       Receives the cluster index of every point.
 * @param iterations   Maximum number of rounds.
 * @param threshold    Convergence bound on the summed squared center movement.
 */
template <typename DataType>
void kmeans(DataType** &data, int data_number, int dimension,
            DataType** &centers, int K, int* &labels,
            int iterations, DataType threshold)
{
    if (data == NULL || data_number <= 0 || dimension <= 0 || K <= 0)
        return;

    centers = new DataType*[K];
    // Value-initialized so the labels are defined even if no round runs.
    labels = new int[data_number]();

    DataType** sum = new DataType*[K];
    int* counts = new int[K];
    for (int i = 0; i < K; i++)
    {
        centers[i] = new DataType[dimension];
        sum[i] = new DataType[dimension];
    }

    rand_init_centers(data, data_number, dimension, centers, K);

    int iteration_time = 0;
    DataType difference = numeric_limits<DataType>::max();
    // Both conditions must hold to keep going; with "||" the iteration cap
    // would never stop a run that fails to converge.
    while (iteration_time < iterations && difference > threshold)
    {
        // Reset the per-cluster accumulators.
        for (int i = 0; i < K; i++)
        {
            counts[i] = 0;
            for (int j = 0; j < dimension; j++)
            {
                sum[i][j] = 0;
            }
        }

        // Assignment step.
        for (int i = 0; i < data_number; i++)
        {
            const int label = select_center(data[i], centers, K, dimension);
            labels[i] = label;
            counts[label]++;
            for (int j = 0; j < dimension; j++)
            {
                sum[label][j] += data[i][j];
            }
        }

        // Update step; an empty cluster keeps its previous center.
        difference = 0;
        for (int i = 0; i < K; i++)
        {
            if (counts[i] <= 0)
                continue;
            for (int j = 0; j < dimension; j++)
            {
                sum[i][j] /= counts[i];
                const DataType delta = sum[i][j] - centers[i][j];
                difference += delta * delta;
                centers[i][j] = sum[i][j];
            }
        }

        cout << iteration_time << ' ' << difference << endl;
        iteration_time++;
    }

    for (int i = 0; i < K; i++)
    {
        delete[] sum[i];
    }
    delete[] sum;
    delete[] counts;
}

/**
 * Writes the cluster centers to a text file, one center per line, each value
 * left-aligned in a 10-character column.
 *
 * @param file_path  Path of the output file.
 * @param centers    K arrays of `dimension` values.
 * @param K          Number of centers.
 * @param dimension  Number of components per center.
 */
template <typename DataType>
void save_centers(const char* file_path, DataType** centers, int K, int dimension)
{
    ofstream outfile;
    outfile.open(file_path, ios::out);
    outfile.setf(ios::left);
    for (int i = 0; i < K; i++)
    {
        for (int j = 0; j < dimension; j++)
        {
            outfile.width(10);
            outfile << centers[i][j];
        }
        outfile << endl;
    }
    outfile.close();
}

/**
 * Writes every point with its cluster label to a text file: one line per
 * point, the label first, then the components, each left-aligned in a
 * 10-character column.
 *
 * @param file_path    Path of the output file.
 * @param data         data_number arrays of `dimension` values.
 * @param labels       Cluster index of every point.
 * @param data_number  Number of points.
 * @param dimension    Number of components per point.
 */
template <typename DataType>
void save_labels(const char* file_path, DataType** data, int* labels,
                 int data_number, int dimension)
{
    ofstream outfile;
    outfile.open(file_path, ios::out);
    outfile.setf(ios::left);
    for (int i = 0; i < data_number; i++)
    {
        outfile.width(10);
        outfile << labels[i];
        for (int j = 0; j < dimension; j++)
        {
            outfile.width(10);
            outfile << data[i][j];
        }
        outfile << endl;
    }
    outfile.close();
}

#endif // KMEANS_H
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
#include <iostream> using namespace std; #include "Kmeans.h" int main(int argc, char* argv[]) { float** data_source; float** clusters; int* labels; data_source = NULL; clusters = NULL; labels = NULL; int K = 5; int iterations = 50; float threshold = 0.001; int dimension = 1764; int data_number = readData("D:/Users/Surge/Desktop/test.txt",data_source,dimension); kmeans(data_source,data_number,dimension,clusters,K,labels,iterations,threshold); save_centers("D:/Users/Surge/Desktop/test_centers.txt",clusters,K,dimension); save_labels("D:/Users/Surge/Desktop/test_labels.txt",data_source,labels,data_number,dimension); for(int i = 0; i < data_number; i++) { if(data_source[i] != NULL) { delete[] data_source[i]; } } for(int i = 0; i < K; i++) { if(clusters[i] != NULL) { delete[] clusters[i]; } } if(data_source != NULL) { delete[] data_source; } if(clusters != NULL) { delete[] clusters; } if(labels != NULL) { delete[] labels; } system("pause"); return 0; }