  • Fuzzy c-means (FCM) clustering in R (repost)

    FCM <- function(x, K, mybeta = 2, nstart = 1, iter_max = 100, eps = 1e-06) {
      ## FCM
    
      ## INPUTS
      ##   x: input matrix n*d (n samples, each d-dimensional)
      ##   K: number of desired clusters
      ##   Optional:
      ##       mybeta: fuzzifier exponent applied to u (default 2)
      ##       nstart: how many random initializations to try (default 1)
      ##       iter_max: maximum number of iterations allowed (default 100)
      ##       eps: small constant added to the distances to avoid division by zero (default 1e-06)
      ##
      ## OUTPUTS
      ##   u: fuzzy membership matrix of size n*K
      ##   g: matrix of size K*d holding the cluster centers
      ##   J: final value of the objective function
      ##   histJ: objective function values recorded during the iterations
      ##   cluster_id: hard assignment of each sample (cluster with the largest membership)
    
      ## modified time: 2015-02-07
    
        FCM_onetime <- function(x, init_centers, mybeta = 2, iter_max = 100, eps = 1e-06) {
            n = dim(x)[1]
            d = dim(x)[2]
            g = init_centers
            K = dim(g)[1]
            histJ = c()
            pasfini = 1
            Jold = Inf
            D = matrix(0, n, K)
            for (j in 1:K) {
                D[, j] = rowSums(sweep(x, 2, g[j, ], "-")^2)
            }
            iter = 1
            while (pasfini) {
                # membership update: u_ij is proportional to (1/D_ij)^(1/(mybeta-1)),
                # normalized so that each row of u sums to 1
                s = (1/(D + eps))^(1/(mybeta - 1))
                u = s/(s %*% matrix(1, K, K))
                # center update: each center is the mean of the samples weighted by u^mybeta
                t1 = t(u^mybeta) %*% x
                t2 = t(u^mybeta) %*% matrix(1, n, d)
                V = t1/t2
                g = V
                D = matrix(0, n, K)
                for (j in 1:K) {
                    D[, j] = rowSums(sweep(x, 2, g[j, ], "-")^2)
                }
                J = sum(u^mybeta * D)
                # stop when J changes by less than 0.001 or iter_max is reached
                pasfini = abs(J - Jold) > 0.001 && (iter < iter_max)
                Jold = J
                histJ = c(histJ, J)
                iter = iter + 1
            }
            cluster_id = apply(u, 1, which.max)
            re = list(u, J, histJ, g, cluster_id)
            names(re) = c("u", "J", "histJ", "g", "cluster_id")
            return(re)
        }
        x = as.matrix(x)
        seeds = 1:nrow(x)
        id = sample(seeds, K)
        g = as.matrix(x[id, ])
        re_best = FCM_onetime(x = x, init_centers = g, mybeta = mybeta, iter_max = iter_max, eps = eps)
        if (nstart > 1) {
            i = 2
            while (i <= nstart) {
                init_centers_id = sample(seeds, K)
                init_centers = as.matrix(x[init_centers_id, ])
                run = FCM_onetime(x, init_centers = init_centers, mybeta = mybeta, iter_max = iter_max, eps = eps)
                if (run$J <= re_best$J) {
                    re_best = run
                }
                i = i + 1
            }
        }
        return(re_best)
    } 
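    # Usage sketch (not part of the original post): run FCM on the numeric columns
    # of the built-in iris data with K = 3; the data set and parameter values here
    # are illustrative only.
    fcm_x   <- as.matrix(iris[, 1:4])            # 150 samples, 4 features
    fcm_res <- FCM(fcm_x, K = 3, mybeta = 2, nstart = 5)
    round(head(fcm_res$u), 3)                    # fuzzy memberships of the first samples
    fcm_res$g                                    # 3 x 4 matrix of cluster centers
    table(fcm_res$cluster_id, iris$Species)      # hard assignments vs. true species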
    # The fuzzy c-means objective function and its update formulas are derived roughly as follows:
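    The original image with the derivation is not reproduced here; the standard FCM
    objective and update equations are restated below (a textbook reconstruction; the
    fuzzifier m corresponds to mybeta in FCM above and to m in fuzzy.cmeans below, with m > 1):

        J_m = \sum_{i=1}^{n} \sum_{j=1}^{K} u_{ij}^{m} \, \lVert x_i - c_j \rVert^2,
        \qquad \text{subject to } \sum_{j=1}^{K} u_{ij} = 1 \text{ for every } i

    Minimizing J_m alternately over the centers and the memberships (with Lagrange
    multipliers for the constraint) gives the two updates implemented in the code:

        c_j = \frac{\sum_{i=1}^{n} u_{ij}^{m} x_i}{\sum_{i=1}^{n} u_{ij}^{m}},
        \qquad
        u_{ij} = \frac{\lVert x_i - c_j \rVert^{-2/(m-1)}}{\sum_{l=1}^{K} \lVert x_i - c_l \rVert^{-2/(m-1)}}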
    # The main code is below (kmeans is used for comparison, and an SVM classifier is trained and tested on the same data).
    # set the pseudo-random seed
    set.seed(100)
    
    # generate sample data: nclass Gaussian clusters with n points each
    simple.data = function (n=200, nclass=2) 
    {
        require(clusterGeneration)
        require(mvtnorm)
        # Center of Gaussians
        xpos = seq(-nclass*2, nclass*2, length=nclass)
        ypos = runif(nclass, min=-2*nclass, max=2*nclass)
        
        func = function(i,xpos,ypos,n) {
            # Create a random covariance matrix
            cov = genPositiveDefMat(2, covMethod="eigen",
                                rangeVar=c(1, 10), lambdaLow=1, ratioLambda=10)
            # draw n random samples from this class's Gaussian
            data = rmvnorm(n=n, mean=c(xpos[i], ypos[i]), sigma=cov$Sigma)
            # collect this class's center, covariance, samples, and labels
            list(means=cbind(xpos[i], ypos[i]), covars=cov$Sigma, data=data,class=rep(i*1.0, n))
        }
        # do.call(rbind, ...) combines the per-class results
        strL=do.call(rbind,lapply(1:nclass,func,xpos,ypos,n))
        data=list()
        data$means=do.call(rbind,strL[,1])
        data$covars = as.list(strL[,2])
        data$data=do.call(rbind,strL[,3])
        data$class=do.call(c,strL[,4])
        # return the combined data set
        data
    }
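    # Quick sanity check of the generator (sketch; assumes the clusterGeneration and
    # mvtnorm packages are installed, and the argument values are illustrative only):
    toy <- simple.data(n = 50, nclass = 3)
    dim(toy$data)     # 150 x 2: 50 two-dimensional points per class
    table(toy$class)  # 50 points labelled 1, 2 and 3
    # restore the seed so the main script below reproduces the original run
    set.seed(100)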
    
    # randomly initialize the membership matrix u; k = number of clusters, nr = number of points
    random.uij = function(k,nr)
    {
        # 
        u = matrix(data=round(runif(k*nr,10,20)),nrow=k,ncol=nr,
                   dimnames=list(paste('u',1:k,sep=""),paste('x',1:nr,sep='')))
        tempu = function(x)
        {
            ret = round(x/sum(x),4)
            # make sure each column sums exactly to 1
            ret[1] = 1-sum(ret[-1])
            ret
        }
        apply(u,2,tempu)
    }
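    # Sanity check (sketch with illustrative values): every column of the initial
    # membership matrix sums to 1, one column per sample point.
    u0 <- random.uij(k = 3, nr = 5)
    dim(u0)        # 3 x 5: k rows, nr columns
    colSums(u0)    # each column sums to 1 (up to floating point)
    # restore the seed so the main script below reproduces the original run
    set.seed(100)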
    
    # compute the distances from the sample points to the cluster centers
    dist_cc_dd = function(cc,dd)
    {
        # cc holds the centers (one per row); dd holds the sample points (one per column)
        temp = function(cc,dd)
        {
            # distance from one center to every sample point
            temp1 = function(index)
            {
                sqrt(sum(index^2))
            }
            # apply() returns the distances column-wise; the final result is transposed below so it is stored row-wise
            apply(dd-cc,2,temp1)
        }
        # transpose so that each row corresponds to one center
        t(apply(cc,1,temp,dd))
    }
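    # Small illustration of the calling convention (values are made up): centers are
    # the ROWS of cc, sample points are the COLUMNS of dd, and the result is a k x n
    # matrix of Euclidean distances.
    cc_demo <- matrix(c(0, 0, 1, 1), nrow = 2, byrow = TRUE)            # two 2-d centers
    dd_demo <- t(matrix(c(0, 0, 1, 1, 2, 2), ncol = 2, byrow = TRUE))   # three points, one per column
    dist_cc_dd(cc_demo, dd_demo)   # 2 x 3 matrix; e.g. entry [1, 3] is sqrt(8)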
    
    # fuzzy c-means: one update step (centers, objective J, memberships)
    fuzzy.cmeans = function(data,u,m=3)
    {
        # simple type check; can be omitted
        if (is.array(data) || is.matrix(data))
        {
            data = as.data.frame(data)
        }
        
    #     nr = nrow(data)
    #     nc = ncol(data)
        
    #     while (J > lim && step < steps)
    #     {
    #         step = step + 1
            # u_ij raised to the power m
            um = u^m
            rowsum = apply(um,1,sum)
            # compute the centers c_i as weighted means of the data
            cc = as.matrix(um/rowsum) %*% as.matrix(data)
            #     rownames(cc)=paste('c',1:k,sep='')
            #     colnames(cc)=paste('x',1:nc,sep='')
            # compute the objective value J
            distance = dist_cc_dd(cc,t(data))
            J = sum(distance^2 * um)
            #     cc_temp = matrix(rep(cc,each=nr),ncol=2)
            #     dd_temp = NULL
            #     lapply(1:k,function(i){dd_temp <<- rbind(dd_temp,data)})
            #     dist = apply((dd_temp-cc_temp)^2,1,sum)
            #     um_temp = as.vector(t(um))
            #     J = um_temp %*% dist
            
            
            # exponent for the membership update; requires m != 1
            t = -2 / (m - 1)
            # update the memberships according to the FCM formula
            tmp = distance^t
            colsum = apply(tmp,2,sum)
            mat = rep(1,nrow(cc)) %*% t(colsum)
            # compute u_ij so that each column of u sums to 1
            u = tmp / mat
    #     }
    #     u
        # return the result of this single iteration
        list(U = u,C = cc,J = J)
    }
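    # Each call to fuzzy.cmeans performs a single FCM update (centers, objective J,
    # then memberships); the driver loop below repeats it until J stops changing.
    # One-step sketch on throwaway toy data (names and sizes are illustrative only):
    toy_x <- matrix(rnorm(20), ncol = 2)    # 10 points in 2 dimensions
    toy_u <- random.uij(k = 2, nr = 10)     # random initial memberships
    one_step <- fuzzy.cmeans(toy_x, toy_u, m = 3)
    str(one_step)   # $U: 2 x 10 memberships, $C: 2 x 2 centers, $J: objective value
    # restore the seed so the main script below reproduces the original run
    set.seed(100)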
    
    # set the initial parameters
    n = 100
    k = 4
    dat = simple.data(n,k)
    nr = nrow(dat$data)
    m = 3
    limit = 1e-4
    max_itr=50
    # randomly initialize u_ij
    u = random.uij(k,nr)
    results = list()
    data=dat$data
    
    # iterate until the objective value converges
    for (i in 1 : max_itr)
    {
        results[[i]] = fuzzy.cmeans(dat$data,u,m)
        if (i != 1 && abs((results[[i]]$J - results[[i-1]]$J)) < limit)
        {
            break
        }
        u = results[[i]]$U
    }
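    # A quick look at how the objective J decreases over the iterations that actually
    # ran (sketch; the plotting call is an addition, not part of the original post):
    J_hist <- sapply(results, function(r) r$J)
    plot(J_hist, type = "b", xlab = "iteration", ylab = "objective J")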
    
    # scatter plot of the generated data, coloured by true class
    require(ggplot2)
    data=as.data.frame(dat$data,stringsAsFactors=FALSE)
    data=cbind(data,dat$class)
    nc = ncol(data)
    colnames(data)=paste('x',1:nc,sep='')
    # par(mar=rep(2,4))
    p=ggplot(data,aes(x=x1,y=x2,color=factor(x3)))
    p+geom_point()+xlab('x axis')+ylab('y axis')+ggtitle('scatter points')
    
    # plot(dat$data,col=factor(dat$class))
    # points(results[[i]]$C,pch=19,col=1:length(unique(dat$class)))
    # Sys.sleep(1)
    
    # compute the error rate of the clustering against the true classes;
    # the cluster labels are arbitrary, so they are first remapped by hand
    # (based on inspecting this particular run) to line up with the true labels
    tclass=apply(results[[i]]$U,2,function(x){which(x==max(x))})
    tclass[tclass==2]=5
    tclass[tclass==3]=6
    tclass[tclass==4]=7
    tclass[tclass==5]=4
    tclass[tclass==6]=2
    tclass[tclass==7]=3
    
    freq=table(dat$class,tclass)
    (sum(freq)-sum(diag(freq))) / sum(freq)
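    # The manual relabelling above was worked out by hand for one particular random
    # seed. A more general sketch (assumes the combinat package, which provides
    # permn(), is installed): try every permutation of the k cluster labels and
    # report the lowest error rate.
    library(combinat)
    raw_labels <- apply(results[[i]]$U, 2, which.max)      # raw cluster label per point
    perm_errs <- sapply(permn(1:k), function(p) {
        freq_p <- table(dat$class, p[raw_labels])          # relabel clusters via permutation p
        (sum(freq_p) - sum(diag(freq_p))) / sum(freq_p)
    })
    min(perm_errs)   # best achievable error rate over all label permutations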
    
    # train an SVM model (supervised baseline on the same data); with a numeric class
    # vector svm() fits a regression, so the predictions are rounded back to labels below
    svm_test = function()
    {
        library(e1071)
        svm.fit = svm(dat$data,dat$class)
        r.fit = predict(svm.fit, dat$data)
        diff.class = round(as.numeric(r.fit)) - as.numeric(dat$class)
        i.misclass = which(abs(diff.class) > 0)
        n.misclass = length(i.misclass)
        f.misclass = n.misclass/length(dat$class)
        # return the fraction of misclassified points
        f.misclass
    }
    # cluster the same data with kmeans (labels remapped by hand, as above)
    kmeans_test = function()
    {
        
        k.fit = kmeans(x=dat$data,4)
        tclass=k.fit$cluster
        tclass[tclass==2]=5
        tclass[tclass==3]=6
        tclass[tclass==4]=7
        tclass[tclass==5]=3
        tclass[tclass==6]=4
        tclass[tclass==7]=2
        freq=table(dat$class,tclass)
        (sum(freq)-sum(diag(freq))) / sum(freq)
    }
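    # Run the two baselines for comparison (sketch). Note that the hard-coded
    # relabelling inside kmeans_test, like the one above, was tuned for one
    # particular run, so the rate it reports is only meaningful for that run.
    svm_test()      # fraction of points misclassified by the supervised SVM
    kmeans_test()   # error rate of hard k-means after the manual relabelling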
    
    # comparison of kmeans and fuzzy c-means
  • Original post: https://www.cnblogs.com/payton/p/5491605.html