  转:Python K-means代码

    #coding: UTF-8
    import pearson_distance
    from pearson_distance import pearson_distance
    from math import sqrt
    import random 
    def print_matchs(matchs):
        """Print a clustering result, one cluster per line.

        Each line has the form ``<cluster index> ----> <member> <member> ...``
        and the listing is terminated by a rule of 20 dashes.

        Args:
            matchs: A sequence of clusters; each cluster is an iterable of
                items (kmeans() produces lists of row indices).
        """
        for i, members in enumerate(matchs):
            fields = [str(i), '---->'] + [str(item) for item in members]
            # A single-argument print(...) behaves identically under the
            # Python 2 print statement and the Python 3 print function.
            print(' '.join(fields))
        print('-' * 20)
    def kmeans(blogwords, k, max_rounds=100, distance=None):
        """Cluster the rows of ``blogwords`` into ``k`` groups with k-means.

        Initial centroids are drawn uniformly at random between the minimum
        and maximum of every column. Each round assigns every row to its
        nearest centroid and then moves each centroid to the mean of its
        members. Iteration stops when the assignment is the same as in the
        previous round or after ``max_rounds`` rounds. The number of
        remaining rounds is printed at the start of every round.

        Args:
            blogwords: Sequence of equal-length numeric rows; one row is one
                data point and one column is one attribute.
            k: Number of clusters; must be at least 1.
            max_rounds: Upper bound on the number of rounds.
            distance: Callable ``distance(centroid, row)`` returning a number
                that is smaller for closer vectors. Defaults to the
                ``pearson_distance`` function this file imports.

        Returns:
            A list of ``k`` lists. Entry ``j`` holds the row indices assigned
            to cluster ``j``, e.g. ``[[1, 3], [2, 8], [0, 6]]`` means rows 1
            and 3 are in cluster 0, rows 2 and 8 in cluster 1, and so on.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError('k must be at least 1')
        if not blogwords:
            return [[] for _ in range(k)]
        if distance is None:
            # Looked up at call time so the file-level import is the default.
            distance = pearson_distance

        dims = len(blogwords[0])
        # (min, max) of every column: the box the initial centroids live in.
        column_ranges = [
            (min(row[c] for row in blogwords), max(row[c] for row in blogwords))
            for c in range(dims)
        ]

        # Draw k random centroids inside that box.
        clusters = [
            [random.random() * (high - low) + low for low, high in column_ranges]
            for _ in range(k)
        ]

        matchs = [[] for _ in range(k)]
        lastmatchs = [[] for _ in range(k)]
        rounds = max_rounds
        while rounds > 0:
            print('round \t %s' % rounds)
            matchs = [[] for _ in range(k)]

            # Assignment step: attach every row to its closest centroid.
            for i, row in enumerate(blogwords):
                bestmatch_cluster = 0
                min_distance = float('inf')
                for j in range(k):
                    dis = distance(clusters[j], row)
                    if dis < min_distance:
                        min_distance = dis
                        bestmatch_cluster = j
                matchs[bestmatch_cluster].append(i)

            # Converged: nothing moved since the previous round.
            if matchs == lastmatchs:
                break
            lastmatchs = [list(members) for members in matchs]

            # Update step: move each centroid to the mean of its members.
            for j, members in enumerate(matchs):
                if not members:
                    # An empty cluster has no mean; keep its old centroid
                    # rather than dividing by zero.
                    continue
                size = float(len(members))
                clusters[j] = [
                    sum(blogwords[m][c] for m in members) / size
                    for c in range(dims)
                ]
            rounds -= 1
        return matchs


    #pearson distance
    from math import sqrt
    def pearson_distance(vector1, vector2):
        """Return the Pearson-correlation distance between two vectors.

        The result is ``1 - r`` where ``r`` is the Pearson correlation
        coefficient, so perfectly correlated vectors give 0.0 and perfectly
        anti-correlated vectors give 2.0.

        Args:
            vector1: Sequence of numbers; its length sets how many elements
                are paired up.
            vector2: Sequence of numbers with at least ``len(vector1)``
                elements.

        Returns:
            The distance as a float. 0.0 is returned when ``vector1`` is
            empty or when the denominator is zero (for example when one of
            the vectors is constant).
        """
        count = len(vector1)
        if count == 0:
            return 0.0
        # Divide by a float so integer inputs are not floor-divided under
        # Python 2.
        n = float(count)

        sum1 = sum(vector1)
        sum2 = sum(vector2)
        sum1Sq = sum(v * v for v in vector1)
        sum2Sq = sum(v * v for v in vector2)
        pSum = sum(vector1[i] * vector2[i] for i in range(count))

        num = pSum - sum1 * sum2 / n
        den_sq = (sum1Sq - sum1 * sum1 / n) * (sum2Sq - sum2 * sum2 / n)
        # Floating-point rounding can leave the product at or slightly below
        # zero; treat that like the zero-denominator case instead of letting
        # sqrt() raise.
        if den_sq <= 0:
            return 0.0
        return 1.0 - num / sqrt(den_sq)

    ***注意:修改 py 文件(例如添加一个函数)之后,必须 Restart Shell 才能调用新的函数。另外,可以根据需要修改距离函数或者迭代终止条件。

    ***注意:如果要加中文注释,需要在最开头一行加入 #coding: UTF-8

    ***数据输入格式 [[123, 312, 434, 4325, 345345], [23124, 141241, 434234, 9837489, 34743], [128937, 127, 12381, 424, 8945], [323, 4348, 5040, 8189, 2348], [51249, 42190, 2713, 2319, 4328], [13957, 1871829, 8712847, 34589, 30945], [1234, 45094, 23409, 13495, 348052], [49853, 3847, 4728, 4059, 5389]] 一行代表一个数据,列代表一个数据的一个属性值







