zoukankan      html  css  js  c++  java
  • 吴裕雄--天生自然 R语言开发学习:聚类分析(续一)

    #-------------------------------------------------------#
    # R in Action (2nd ed): Chapter 16                      #
    # Cluster analysis                                      #
    # requires packages NbClust, flexclust, rattle          #
    # install.packages(c("NbClust", "flexclust", "rattle")) #
    #-------------------------------------------------------#
    
    # Ask for confirmation before each new plot page is drawn.
    par(ask=TRUE)
    # Snapshot only the settable graphics parameters. A snapshot taken with
    # no.readonly=FALSE also contains read-only entries (cin, cra, csi, cxy,
    # din, page), and every later par(opar) would then warn that they
    # cannot be set.
    opar <- par(no.readonly=TRUE)
    
    # Calculating Distances
    # Load the nutrient data set from flexclust and show its first two rows.
    data(nutrient, package="flexclust")
    head(nutrient, n=2)
    # Pairwise distances between rows (dist() default metric); print the
    # top-left 4 x 4 corner of the full distance matrix.
    d <- dist(nutrient)
    as.matrix(d)[seq_len(4), seq_len(4)]
    
    
    # Listing 16.1 - Average linkage clustering of nutrient data
    data(nutrient, package="flexclust")
    # Lower-case the row labels; they become the leaf labels of the dendrogram.
    rownames(nutrient) <- tolower(rownames(nutrient))
    # Center and scale every column (scale() defaults) before measuring
    # distances between rows.
    nutrient.scaled <- scale(nutrient)
    d <- dist(nutrient.scaled)
    # Agglomerative clustering with average linkage.
    fit.average <- hclust(d, method="average")
    # hang=-1 draws all leaf labels at the same baseline.
    plot(fit.average, hang=-1, cex=.8, main="Average Linkage Clustering")
    
    
    # Listing 16.2 - Selecting the number of clusters
    library(NbClust)
    # Evaluate candidate solutions with 2 to 15 clusters on the scaled data,
    # using Euclidean distance and average linkage.
    nc <- NbClust(nutrient.scaled, distance="euclidean",
                  min.nc=2, max.nc=15, method="average")
    # Restore the graphics parameters saved in opar before drawing our own plot.
    par(opar)
    # Row 1 of Best.nc is tabulated to count how many criteria chose each
    # number of clusters. The component is spelled out in full: `nc$Best.n`
    # only works through partial matching of `$`.
    table(nc$Best.nc[1,])
    barplot(table(nc$Best.nc[1,]),
            xlab="Number of Clusters", ylab="Number of Criteria",
            main="Number of Clusters Chosen by 26 Criteria")
    
    
    # Listing 16.3 - Obtaining the final cluster solution
    # Cut the average-linkage tree into 5 groups and show the group sizes.
    clusters <- cutree(fit.average, k=5)
    table(clusters)
    # Per-cluster medians, first in the original units, then standardized.
    aggregate(nutrient, by=list(cluster=clusters), median)
    aggregate(as.data.frame(nutrient.scaled), by=list(cluster=clusters),
              median)
    # Use an explicit "\n" for the two-line title: a literal line break inside
    # the string would carry the source indentation into the second title line.
    plot(fit.average, hang=-1, cex=.8,
         main="Average Linkage Clustering\n5 Cluster Solution")
    # Outline the 5 clusters on the dendrogram.
    rect.hclust(fit.average, k=5)
    
    # Plot the within-groups sum of squares against the number of clusters.
    #
    # data: numeric matrix or data frame with one observation per row.
    # nc:   largest number of clusters to try (default 15); must be >= 1.
    # seed: RNG seed re-applied before every kmeans() run (default 1234).
    #
    # Draws the plot and invisibly returns the vector of sums of squares,
    # where element i belongs to the i-cluster solution.
    wssplot <- function(data, nc=15, seed=1234){
      stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
      wss <- numeric(nc)
      # One cluster: sum of the column variances times (n - 1).
      wss[1] <- (nrow(data)-1)*sum(apply(data,2,var))
      # seq_len(nc)[-1] is empty when nc is 1, whereas 2:nc would count down
      # (2, 1) and leave wss longer than the x values handed to plot().
      for (i in seq_len(nc)[-1]){
        set.seed(seed)
        wss[i] <- sum(kmeans(data, centers=i)$withinss)}
      plot(seq_len(nc), wss, type="b", xlab="Number of Clusters",
           ylab="Within groups sum of squares")
      invisible(wss)}
    
    
    # Listing 16.4 - K-means clustering of wine data
    data(wine, package="rattle")
    head(wine)
    # Drop the first column and standardize the remaining ones.
    df <- scale(wine[-1])
    # Within-groups sum of squares for 1 to 15 clusters.
    wssplot(df)
    library(NbClust)
    set.seed(1234)
    nc <- NbClust(df, min.nc=2, max.nc=15, method="kmeans")
    # Restore the graphics parameters saved in opar before drawing our own plot.
    par(opar)
    # The component is spelled out in full: `nc$Best.n` only works through
    # partial matching of `$`.
    table(nc$Best.nc[1,])
    barplot(table(nc$Best.nc[1,]),
            xlab="Number of Clusters", ylab="Number of Criteria",
            main="Number of Clusters Chosen by 26 Criteria")
    # Final 3-cluster k-means fit, keeping the best of 25 random starts.
    set.seed(1234)
    fit.km <- kmeans(df, 3, nstart=25)
    fit.km$size
    fit.km$centers
    # Cluster means expressed in the original (unscaled) units.
    aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean)
    
    
    # evaluate clustering
    # flexclust supplies randIndex().
    library(flexclust)
    # Cross-tabulate wine$Type against the k-means cluster assignments.
    ct.km <- table(wine$Type, fit.km$cluster)
    ct.km
    # Agreement index between the two partitions, computed from the table.
    randIndex(ct.km)
    
    
    # Listing 16.5 - Partitioning around medoids for the wine data
    library(cluster)
    set.seed(1234)
    # Three-cluster PAM fit on every column but the first; stand=TRUE asks
    # pam() to standardize the measurements itself.
    fit.pam <- pam(x=wine[-1], k=3, stand=TRUE)
    fit.pam$medoids
    clusplot(fit.pam, main="Bivariate Cluster Plot")

    # evaluate clustering
    # Cross-tabulate wine$Type against the PAM assignments, then compute the
    # agreement index with randIndex().
    ct.pam <- table(wine$Type, fit.pam$clustering)
    ct.pam
    randIndex(ct.pam)
    
    ## Avoiding non-existent clusters
    # Simulate 1000 draws from a bivariate normal with correlation 0.5.
    library(fMultivar)
    set.seed(1234)
    df <- rnorm2d(1000, rho=.5)
    df <- as.data.frame(df)
    plot(df, main="Bivariate Normal Distribution with rho=0.5")

    wssplot(df)
    library(NbClust)
    nc <- NbClust(df, min.nc=2, max.nc=15, method="kmeans")
    # Restore the graphics parameters saved in opar before drawing our own plot.
    par(opar)
    # The component is spelled out in full: `nc$Best.n` only works through
    # partial matching of `$`.
    barplot(table(nc$Best.nc[1,]),
            xlab="Number of Clusters", ylab="Number of Criteria",
            main  ="Number of Clusters Chosen by 26 Criteria")

    library(ggplot2)
    library(cluster)
    # Two-cluster PAM solution; the assignment is stored as a factor so that
    # ggplot maps it to discrete colors and shapes.
    fit <- pam(df, k=2)
    df$clustering <- factor(fit$clustering)
    ggplot(data=df, aes(x=V1, y=V2, color=clustering, shape=clustering)) +
      geom_point() + ggtitle("Clustering of Bivariate Normal Data")

    # Plot CCC against the actual number of clusters. The x values 2:15 match
    # min.nc/max.nc above; plotting the column alone would place the 2-cluster
    # value at x = 1. The column is selected by name rather than by position.
    plot(2:15, nc$All.index[,"CCC"], type="o", ylab="CCC",
         xlab="Number of clusters", col="blue")
  • 相关阅读:
    正则表达式match方法和search方法
    正则表达式(基础篇1)
    动画
    重绘和重排(回流)
    数组常用的10个方法
    css3只需一招,将网站变成灰色的
    Python-类的几种调用方法
    Codeforces Global Round 8 C. Even Picture(构造)
    Codeforces Global Round 8 D. AND, OR and square sum(位运算)
    Codeforces Round #650 (Div. 3) C. Social Distance
  • 原文地址:https://www.cnblogs.com/tszr/p/11176454.html
Copyright © 2011-2022 走看看