zoukankan      html  css  js  c++  java
  • k-means算法的Python实现

      1 #coding=utf-8
      2 import codecs
      3 import numpy
      4 from numpy import *
      5 import pylab
      6 
      def loadDataSet(fileName):
          """Load a tab-separated numeric text file into a list of rows.

          Every non-blank line is stripped, split on tab characters, and each
          field is converted with float().

          Args:
              fileName: path of the text file to read.

          Returns:
              A list holding one list of floats per non-blank line.

          Raises:
              ValueError: if a field cannot be parsed as a float.
          """
          # The context manager closes the file even when parsing fails.
          with open(fileName) as fr:
              # Build real lists (a bare map() is a lazy iterator on Python 3)
              # and skip blank lines, which cannot be parsed by float().
              return [
                  [float(field) for field in line.strip().split('\t')]
                  for line in fr
                  if line.strip()
              ]
     15     
     def distMeasure(vecA, vecB):
         """Return the Euclidean distance between two vectors.

         Args:
             vecA: first vector (list, ndarray, or single-row matrix).
             vecB: second vector, broadcast-compatible with vecA.

         Returns:
             The square root of the summed squared element differences.
         """
         # Coerce both operands so plain Python sequences are accepted as well
         # as NumPy arrays and matrices.
         diff = numpy.asarray(vecA, dtype=float) - numpy.asarray(vecB, dtype=float)
         return numpy.sqrt(numpy.sum(numpy.power(diff, 2)))
     20     
     def kMeansInitCentroids(X, K):
         """Create K random initial centroids inside the bounding box of X.

         Each coordinate of each centroid is drawn uniformly between the
         minimum and maximum of the corresponding column of X.

         Args:
             X: dataset with one example per row (2-D array-like).
             K: number of centroids to generate.

         Returns:
             A K x n numpy matrix of centroids, n being the column count of X.
         """
         data = numpy.asarray(X, dtype=float)
         n = data.shape[1]
         colMin = data.min(axis=0)
         colRange = data.max(axis=0) - colMin
         # rand(n, K).T hands each column K consecutive draws, so a seeded run
         # yields the same centroids as filling one column at a time.
         # asmatrix is used because numpy.mat no longer exists in NumPy 2.0.
         return numpy.asmatrix(colMin + colRange * numpy.random.rand(n, K).T)
     36     
     def findClosestCentroids(X, centroids):
         """Assign every example in X to its nearest centroid.

         Args:
             X: dataset with one example per row (2-D array-like).
             centroids: K x n array-like or matrix, one centroid per row.

         Returns:
             An m x 2 numpy matrix. Column 0 holds the 0-based index of the
             closest centroid (ties go to the lowest index) and column 1 the
             squared Euclidean distance to it. A row is (-1, inf) when no
             centroid lies at a finite, non-NaN distance from the example.
         """
         data = numpy.asarray(X, dtype=float)
         cents = numpy.asarray(centroids, dtype=float)
         m = data.shape[0]
         K = cents.shape[0]
         # Start every row at the "nothing found" sentinel.
         nearest = numpy.full(m, -1.0)
         sqError = numpy.full(m, numpy.inf)
         if m > 0 and K > 0:
             # Pairwise squared distances, shape (m, K). The centroids are fixed
             # during this call, so a single pass is sufficient.
             diffs = data[:, numpy.newaxis, :] - cents[numpy.newaxis, :, :]
             sqDist = numpy.sum(diffs ** 2, axis=2)
             # A NaN distance (e.g. a NaN centroid) must never be selected;
             # argmin would otherwise treat NaN as the minimum.
             sqDist[numpy.isnan(sqDist)] = numpy.inf
             best = numpy.argmin(sqDist, axis=1)
             bestDist = sqDist[numpy.arange(m), best]
             found = bestDist < numpy.inf
             nearest[found] = best[found]
             sqError[found] = bestDist[found]
         # asmatrix is used because numpy.mat no longer exists in NumPy 2.0.
         return numpy.asmatrix(numpy.column_stack((nearest, sqError)))
     63     
     def computeCentroids(X, clusterAssment, K):
         """Recompute each centroid as the mean of the points assigned to it.

         Args:
             X: dataset with one example per row (2-D array-like).
             clusterAssment: m x 2 array-like or matrix whose first column holds
                 the 0-based centroid index of every example.
             K: number of centroids.

         Returns:
             A K x n numpy matrix whose row c is the mean of the rows of X
             assigned to centroid c. A centroid with no assigned points gets a
             row of NaN.
         """
         data = numpy.asarray(X, dtype=float)
         labels = numpy.asarray(clusterAssment)[:, 0]
         n = data.shape[1]
         # asmatrix is used because numpy.mat no longer exists in NumPy 2.0.
         centroids = numpy.asmatrix(numpy.zeros((K, n)))
         for centroid in range(K):
             # Rows of X whose assignment equals this centroid index.
             ptsInClust = data[labels == centroid]
             if len(ptsInClust) == 0:
                 # The mean of an empty set is NaN; set it explicitly rather
                 # than trigger NumPy's mean-of-empty-slice RuntimeWarning.
                 centroids[centroid, :] = numpy.nan
             else:
                 centroids[centroid, :] = ptsInClust.mean(axis=0)
         return centroids
     84 
     def show(dataSet, k, centroids, clusterAssment):
         """Scatter-plot the first two columns of the data, styled by cluster.

         Args:
             dataSet: 2-D array of samples; columns 0 and 1 are plotted.
             k: number of centroids to draw.
             centroids: indexable as centroids[i, 0] / centroids[i, 1].
             clusterAssment: 2-D array-like or matrix whose first column holds
                 the cluster index of every sample.

         Side effects:
             Prints type(dataSet) to stdout and opens a matplotlib window via
             plt.show().
         """
         from matplotlib import pyplot as plt
         pointMarks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
         centroidMarks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
         print(type(dataSet))
         points = numpy.asarray(dataSet)
         labels = numpy.asarray(clusterAssment)[:, 0].astype(int)
         # One plot call per cluster instead of one per sample. The modulo
         # cycles the marker styles so more than ten clusters cannot raise
         # IndexError.
         for label in numpy.unique(labels):
             members = points[labels == label]
             plt.plot(members[:, 0], members[:, 1],
                      pointMarks[label % len(pointMarks)])
         for i in range(k):
             plt.plot(centroids[i, 0], centroids[i, 1],
                      centroidMarks[i % len(centroidMarks)], markersize=12)
         plt.show()
     97     
     def runkMeans(X, initial_centroids, max_iters, plot_progress=False):
         """Run K-Means on X starting from the given centroids.

         Alternates findClosestCentroids and computeCentroids for at most
         max_iters iterations and stops early once the assignments repeat.

         Args:
             X: dataset with one example per row.
             initial_centroids: K x n matrix of starting centroids.
             max_iters: maximum number of iterations to execute.
             plot_progress: accepted for interface compatibility; this
                 implementation does not plot intermediate progress.

         Returns:
             A tuple (centroids, clusterAssment): the centroids after the last
             iteration and the m x 2 assignment matrix produced by
             findClosestCentroids. With max_iters == 0 the initial centroids
             and an all-zero m x 2 matrix are returned.
         """
         K = numpy.shape(initial_centroids)[0]
         centroids = initial_centroids
         clusterAssment = numpy.asmatrix(numpy.zeros((numpy.shape(X)[0], 2)))
         previous = None
         for _ in range(max_iters):
             clusterAssment = findClosestCentroids(X, centroids)
             centroids = computeCentroids(X, clusterAssment, K)
             current = numpy.asarray(clusterAssment)[:, 0]
             # Unchanged assignments reproduce the same centroids, so every
             # further iteration would return exactly this result.
             if previous is not None and numpy.array_equal(previous, current):
                 break
             previous = current
         return centroids, clusterAssment
    123 
    def main(fileName='E://PythonSpace//TextClustering//data//test2.txt',
             K=5, max_iters=10):
        """Load a dataset, cluster it with K-Means and plot the result.

        Args:
            fileName: tab-separated data file passed to loadDataSet.
            K: number of clusters.
            max_iters: maximum number of K-Means iterations.
        """
        dataSet = loadDataSet(fileName)
        X = numpy.array(dataSet)
        # Standardise with the mean and standard deviation taken over all
        # entries of X (not per column).
        X = (X - numpy.mean(X)) / numpy.std(X)
        initial_centroids = kMeansInitCentroids(X, K)
        myCentroids, clusterAssment = runkMeans(X, initial_centroids, max_iters, False)
        print("-------------------------------------")
        show(X, K, myCentroids, clusterAssment)
    135     
    # Run the demo only when executed as a script, so importing this module
    # does not start clustering and plotting.
    if __name__ == '__main__':
        main()

    参考了Andrew Ng的Machine Learning Assignment(https://github.com/rieder91/MachineLearning/blob/master/Exercise%207/ex7/runkMeans.m)

    以及博文http://www.cnblogs.com/MrLJC/p/4127553.html

    运行结果:

  • 相关阅读:
    艾伟_转载:你知道吗?——ASP.NET的Session会导致的性能问题 狼人:
    艾伟_转载:一次挂死(hang)的处理过程及经验 狼人:
    艾伟也谈项目管理,微型项目实践感悟 狼人:
    艾伟_转载:[原创]再谈IIS与ASP.NET管道 狼人:
    艾伟_转载:企业库缓存依赖的实现基于文件依赖 狼人:
    艾伟也谈项目管理,我也发软件开发团队的思考(侧重点是人员) 狼人:
    MYSQL用户名:root
    map 和 unordered_map以char * 为key
    设计模式单例模式(singleton)
    Android允许其他应用程序启动你的Activity
  • 原文地址:https://www.cnblogs.com/gui0901/p/5526936.html
Copyright © 2011-2022 走看看