zoukankan      html  css  js  c++  java
  • 吴裕雄--天生自然 R语言开发学习:回归(续四)

    #------------------------------------------------------------#
    # R in Action (2nd ed): Chapter 8                            #
    # Regression                                                 #
    # requires packages car, gvlma, MASS, leaps to be installed  #
    # install.packages(c("car", "gvlma", "MASS", "leaps"))       #
    #------------------------------------------------------------#
    
    par(ask=TRUE)
    opar <- par(no.readonly=TRUE)
    
    # Listing 8.1 - Simple linear regression
    fit <- lm(weight ~ height, data=women)
    summary(fit)
    women$weight
    fitted(fit)
    residuals(fit)
    plot(women$height,women$weight,
         main="Women Age 30-39", 
         xlab="Height (in inches)", 
         ylab="Weight (in pounds)")
    # add the line of best fit
    abline(fit)
    
    
    # Listing 8.2 - Polynomial regression
    fit2 <- lm(weight ~ height + I(height^2), data=women)
    summary(fit2)
    plot(women$height,women$weight,
         main="Women Age 30-39",
         xlab="Height (in inches)",
         ylab="Weight (in lbs)")
    lines(women$height,fitted(fit2))
    
    
    # Enhanced scatterplot for women data
    library(car)
    library(car)
    scatterplot(weight ~ height, data=women,
                spread=FALSE, smoother.args=list(lty=2), pch=19,
                main="Women Age 30-39",
                xlab="Height (inches)",
                ylab="Weight (lbs.)")
    
    
    # Listing 8.3 - Examining bivariate relationships
    states <- as.data.frame(state.x77[,c("Murder", "Population", 
                                         "Illiteracy", "Income", "Frost")])
    cor(states)
    library(car)
    scatterplotMatrix(states, spread=FALSE, smoother.args=list(lty=2),
                      main="Scatter Plot Matrix")
    
    
    # Listing 8.4 - Multiple linear regression
    states <- as.data.frame(state.x77[,c("Murder", "Population", 
                                         "Illiteracy", "Income", "Frost")])
    fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
    summary(fit)
    
    
    # Listing 8.5 - Mutiple linear regression with a significant interaction term
    fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
    summary(fit)
    
    library(effects)
    plot(effect("hp:wt", fit,, list(wt=c(2.2, 3.2, 4.2))), multiline=TRUE)
    
    # simple regression diagnostics
    fit <- lm(weight ~ height, data=women)
    par(mfrow=c(2,2))
    plot(fit)
    newfit <- lm(weight ~ height + I(height^2), data=women)
    par(opar)
    par(mfrow=c(2,2))
    plot(newfit)
    par(opar)
    
    
    # basic regression diagnostics for states data
    opar <- par(no.readonly=TRUE)
    fit <- lm(weight ~ height, data=women)
    par(mfrow=c(2,2))
    plot(fit)
    par(opar)
    
    fit2 <- lm(weight ~ height + I(height^2), data=women)
    opar <- par(no.readonly=TRUE)
    par(mfrow=c(2,2))
    plot(fit2)
    par(opar)
    
    
    # Assessing normality
    library(car)
    states <- as.data.frame(state.x77[,c("Murder", "Population",
                                         "Illiteracy", "Income", "Frost")])
    fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
    qqPlot(fit, labels=row.names(states), id.method="identify",
           simulate=TRUE, main="Q-Q Plot")
    
    
    # Listing 8.6 - Function for plotting studentized residuals
    residplot <- function(fit, nbreaks=10) {
      z <- rstudent(fit)
      hist(z, breaks=nbreaks, freq=FALSE,
           xlab="Studentized Residual",
           main="Distribution of Errors")
      rug(jitter(z), col="brown")
      curve(dnorm(x, mean=mean(z), sd=sd(z)),
            add=TRUE, col="blue", lwd=2)
      lines(density(z)$x, density(z)$y,
            col="red", lwd=2, lty=2)
      legend("topright",
             legend = c( "Normal Curve", "Kernel Density Curve"),
             lty=1:2, col=c("blue","red"), cex=.7)
    }
    
    residplot(fit)
    
    
    # Assessing linearity
    library(car)
    crPlots(fit)
    
    
    # Listing 8.7 - Assessing homoscedasticity
    library(car)
    ncvTest(fit)
    spreadLevelPlot(fit)
    
    
    # Listing 8.8 - Global test of linear model assumptions
    library(gvlma)
    gvmodel <- gvlma(fit) 
    summary(gvmodel)
    
    # Listing 8.9 - Evaluating multi-collinearity
    library(car)
    vif(fit) 
    sqrt(vif(fit)) > 2 # problem?
    
    # Assessing outliers
    library(car)
    outlierTest(fit)
    
    #  Identifying high leverage points
    hat.plot <- function(fit) {
      p <- length(coefficients(fit))
      n <- length(fitted(fit))
      plot(hatvalues(fit), main="Index Plot of Hat Values")
      abline(h=c(2,3)*p/n, col="red", lty=2)
      identify(1:n, hatvalues(fit), names(hatvalues(fit)))
    }
    hat.plot(fit)
    
    # Identifying influential observations
    
    # Cooks Distance D
    # identify D values > 4/(n-k-1) 
    cutoff <- 4/(nrow(states)-length(fit$coefficients)-2)
    plot(fit, which=4, cook.levels=cutoff)
    abline(h=cutoff, lty=2, col="red")
    
    # Added variable plots
    # add id.method="identify" to interactively identify points
    library(car)
    avPlots(fit, ask=FALSE, id.method="identify")
    
    # Influence Plot
    library(car)
    influencePlot(fit, id.method="identify", main="Influence Plot", 
                  sub="Circle size is proportial to Cook's Distance" )
    
    
    # Listing 8.10 - Box-Cox Transformation to normality
    library(car)
    summary(powerTransform(states$Murder))
    
    # Box-Tidwell Transformations to linearity
    library(car)
    boxTidwell(Murder~Population+Illiteracy,data=states)
    
    
    # Listing 8.11 - Comparing nested models using the anova function
    states <- as.data.frame(state.x77[,c("Murder", "Population",
                                         "Illiteracy", "Income", "Frost")])
    fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost,
               data=states)
    fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
    anova(fit2, fit1)
    
    
    # Listing 8.12 - Comparing models with the AIC
    fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost,
               data=states)
    fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
    AIC(fit1,fit2)
    
    
    # Listing 8.13 - Backward stepwise selection
    library(MASS)
    states <- as.data.frame(state.x77[,c("Murder", "Population",
                                         "Illiteracy", "Income", "Frost")])
    fit <- lm(Murder ~ Population + Illiteracy + Income + Frost,
              data=states)
    stepAIC(fit, direction="backward")
    
    
    # Listing 8.14 - All subsets regression
    library(leaps)
    states <- as.data.frame(state.x77[,c("Murder", "Population",
                                         "Illiteracy", "Income", "Frost")])
    leaps <-regsubsets(Murder ~ Population + Illiteracy + Income +
                         Frost, data=states, nbest=4)
    plot(leaps, scale="adjr2")
    library(car)
    subsets(leaps, statistic="cp",
            main="Cp Plot for All Subsets Regression")
    abline(1,1,lty=2,col="red")
    
    
    # Listing 8.15 - Function for k-fold cross-validated R-square
    shrinkage <- function(fit,k=10){
      require(bootstrap)
      
      # define functions 
      theta.fit <- function(x,y){lsfit(x,y)}
      theta.predict <- function(fit,x){cbind(1,x)%*%fit$coef} 
      
      # matrix of predictors
      x <- fit$model[,2:ncol(fit$model)]
      # vector of predicted values
      y <- fit$model[,1]
      
      results <- crossval(x,y,theta.fit,theta.predict,ngroup=k)
      r2 <- cor(y, fit$fitted.values)**2 # raw R2 
      r2cv <- cor(y,results$cv.fit)**2 # cross-validated R2
      cat("Original R-square =", r2, "
    ")
      cat(k, "Fold Cross-Validated R-square =", r2cv, "
    ")
      cat("Change =", r2-r2cv, "
    ")
    }
    
    # using it
    states <- as.data.frame(state.x77[,c("Murder", "Population",
                                         "Illiteracy", "Income", "Frost")])
    fit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=states)
    shrinkage(fit)
    fit2 <- lm(Murder~Population+Illiteracy,data=states)
    shrinkage(fit2)
    
    #  Calculating standardized regression coefficients
    states <- as.data.frame(state.x77[,c("Murder", "Population",
                                         "Illiteracy", "Income", "Frost")])
    zstates <- as.data.frame(scale(states))
    zfit <- lm(Murder~Population + Income + Illiteracy + Frost, data=zstates)
    coef(zfit)
    
    # Listing 8.16 rlweights function for clculating relative importance of predictors
    relweights <- function(fit,...){
      R <- cor(fit$model)
      nvar <- ncol(R)
      rxx <- R[2:nvar, 2:nvar]
      rxy <- R[2:nvar, 1]
      svd <- eigen(rxx)
      evec <- svd$vectors
      ev <- svd$values
      delta <- diag(sqrt(ev))
      lambda <- evec %*% delta %*% t(evec)
      lambdasq <- lambda ^ 2
      beta <- solve(lambda) %*% rxy
      rsquare <- colSums(beta ^ 2)
      rawwgt <- lambdasq %*% beta ^ 2
      import <- (rawwgt / rsquare) * 100
      import <- as.data.frame(import)
      row.names(import) <- names(fit$model[2:nvar])
      names(import) <- "Weights"
      import <- import[order(import),1, drop=FALSE]
      dotchart(import$Weights, labels=row.names(import),
               xlab="% of R-Square", pch=19,
               main="Relative Importance of Predictor Variables",
               sub=paste("Total R-Square=", round(rsquare, digits=3)),
               ...)
      return(import)
      }
      
      # Listing 8.17 - Applying the relweights function
      states <- as.data.frame(state.x77[,c("Murder", "Population",
                                           "Illiteracy", "Income", "Frost")])
      fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
      relweights(fit, col="blue")
  • 相关阅读:
    异步-promise、async、await
    node
    node基础 day1
    gulp的简介以及使用方法
    web前端安全——常见的web攻击方法
    Linux修改IP地址
    在linux下批量删除文件
    常用内容的正则表达式
    Oracle 数据库自带用户有哪些
    统计Oracle数据库当前User下各表的记录数
  • 原文地址:https://www.cnblogs.com/tszr/p/11175560.html
Copyright © 2011-2022 走看看