zoukankan      html  css  js  c++  java
  • R语言学习笔记(六):OLS回归

    OSL回归

    简单的线性回归

    > fit<-lm(weight~height,women)
    > summary(fit)
    
    Call:
    lm(formula = weight ~ height, data = women)
    
    Residuals:
    Min 1Q Median 3Q Max 
    -1.7333 -1.1333 -0.3833 0.7417 3.1167
    
    Coefficients:
    Estimate Std. Error t value Pr(>|t|) 
    (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
    height 3.45000 0.09114 37.85 1.09e-14 ***
    ---
    Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    Residual standard error: 1.525 on 13 degrees of freedom
    Multiple R-squared: 0.991,	Adjusted R-squared: 0.9903 
    F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
    
    
    > residuals(fit)
    1 2 3 4 5 
    2.41666667 0.96666667 0.51666667 0.06666667 -0.38333333 
    6 7 8 9 10 
    -0.83333333 -1.28333333 -1.73333333 -1.18333333 -1.63333333 
    11 12 13 14 15 
    -1.08333333 -0.53333333 0.01666667 1.56666667 3.11666667
    
    
    > fitted(fit)  #列出拟合模型的预测值
    1 2 3 4 5 6 7 8 9 
    112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333 140.1833 
    10 11 12 13 14 15 
    143.6333 147.0833 150.5333 153.9833 157.4333 160.8833
    
    
    > residuals(fit) #列出拟合模型的残差值
    1 2 3 4 5 6 7 
    2.41666667 0.96666667 0.51666667 0.06666667 -0.38333333 -0.83333333 -1.28333333 
    8 9 10 11 12 13 14 
    -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333 0.01666667 1.56666667 
    15 
    3.11666667
    
    
    > plot(women$height~women$weight,xlab="Height (in inches)", ylab="Weight (in pounds)")
    > abline(fit)
    

      

    得到预测回归公式:Weight=-87.52+3.45*Height

    多项式回归

    fit2<-lm(weight~height+I(height^2), data=women)
    > summary(fit2)
    
    Call:
    lm(formula = weight ~ height + I(height^2), data = women)
    
    Residuals:
    Min 1Q Median 3Q Max 
    -0.50941 -0.29611 -0.00941 0.28615 0.59706
    
    Coefficients:
    Estimate Std. Error t value Pr(>|t|) 
    (Intercept) 261.87818 25.19677 10.393 2.36e-07 ***
    height -7.34832 0.77769 -9.449 6.58e-07 ***
    I(height^2) 0.08306 0.00598 13.891 9.32e-09 ***
    ---
    Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    Residual standard error: 0.3841 on 12 degrees of freedom
    Multiple R-squared: 0.9995,	Adjusted R-squared: 0.9994 
    F-statistic: 1.139e+04 on 2 and 12 DF, p-value: < 2.2e-16
    
    > plot(women$height,women$weight,xlab="Height (in inches)",ylab="Weight (in lbs)")
    > lines(women$height,fitted(fit2))
    
    回归公式:Weight=261.88-7.35*Height+0.083*Height^2
    

      

    三次方线性回归

    fit3<-lm(weight~height+I(height^2)+I(height^3),data=women)
    
    summary(fit3)
    
    Call:
    lm(formula = weight ~ height + I(height^2) + I(height^3), data = women)
    
    Residuals:
    Min 1Q Median 3Q Max 
    -0.40677 -0.17391 0.03091 0.12051 0.42191
    
    Coefficients:
    Estimate Std. Error t value Pr(>|t|) 
    (Intercept) -8.967e+02 2.946e+02 -3.044 0.01116 * 
    height 4.641e+01 1.366e+01 3.399 0.00594 **
    I(height^2) -7.462e-01 2.105e-01 -3.544 0.00460 **
    I(height^3) 4.253e-03 1.079e-03 3.940 0.00231 **
    ---
    Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    Residual standard error: 0.2583 on 11 degrees of freedom
    Multiple R-squared: 0.9998,	Adjusted R-squared: 0.9997 
    F-statistic: 1.679e+04 on 3 and 11 DF, p-value: < 2.2e-16
    
    
    plot(women$height,women$weight,xlab="height",ylab="weight")
    lines(women$height,fitted(fit3))
    
    回归公式:Weight=-8.967e+02 + 4.641e+01*Height -7.462e-01 * Height^2 + 4.253e-03 *Height^3
     
    

      

    多元线性回归

    fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
    summary(fit)
    
    Call:
    lm(formula = Murder ~ Population + Illiteracy + Income + Frost, 
    data = states)
    
    Residuals:
    Min 1Q Median 3Q Max 
    -4.7960 -1.6495 -0.0811 1.4815 7.6210
    
    Coefficients:
    Estimate Std. Error t value Pr(>|t|) 
    (Intercept) 1.235e+00 3.866e+00 0.319 0.7510 
    Population 2.237e-04 9.052e-05 2.471 0.0173 * 
    Illiteracy 4.143e+00 8.744e-01 4.738 2.19e-05 ***
    Income 6.442e-05 6.837e-04 0.094 0.9253 
    Frost 5.813e-04 1.005e-02 0.058 0.9541 
    ---
    Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    Residual standard error: 2.535 on 45 degrees of freedom
    Multiple R-squared: 0.567,	Adjusted R-squared: 0.5285 
    F-statistic: 14.73 on 4 and 45 DF, p-value: 9.133e-08
     
    

      

    有交互项的多元线性回归

    fit<-lm(mpg~hp+wt+hp:wt,data=mtcars)
    summary(fit)
    
    Call:
    lm(formula = mpg ~ hp + wt + hp:wt, data = mtcars)
    
    Residuals:
    Min 1Q Median 3Q Max 
    -3.0632 -1.6491 -0.7362 1.4211 4.5513
    
    Coefficients:
    Estimate Std. Error t value Pr(>|t|) 
    (Intercept) 49.80842 3.60516 13.816 5.01e-14 ***
    hp -0.12010 0.02470 -4.863 4.04e-05 ***
    wt -8.21662 1.26971 -6.471 5.20e-07 ***
    hp:wt 0.02785 0.00742 3.753 0.000811 ***
    ---
    Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    Residual standard error: 2.153 on 28 degrees of freedom
    Multiple R-squared: 0.8848,	Adjusted R-squared: 0.8724 
    F-statistic: 71.66 on 3 and 28 DF, p-value: 2.981e-13
    
     
    
    library(effects)
    plot(effect("hp:wt",fit,,list(wt=c(2.2,3.2,4.2))),multiline=TRUE)
     
    

      

    回归判断

    fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
    confint(fit)
    
                    2.5 %                  97.5 %
    (Intercept) -6.552191e+00     9.0213182149
    Population 4.136397e-05      0.0004059867
    Illiteracy 2.381799e+00        5.9038743192 #文盲改变1%,谋杀率就在改区间变动,它的置信区间为95%
    Income -1.312611e-03         0.0014414600
    Frost -1.966781e-02            0.0208304170
    
     
    
    fit<-lm(weight~height,data=women)
    par(mfrow=c(2,2))
    plot(fit)
    
     
    
    Residuals vs Fitted: 线性
    
    Normal Q-Q: 正态性
    
    Scale-Location:同方差性
     
    

      

    fit2<-lm(weight~height+I(height^2),data=women)
    par(nfrow=c(2,2))
    plot(fit2)
    

      

    newfit2<-lm(weight~height+I(height^2),data=women[-c(13,15),])  #排除异常点
    par(nfrow=c(2,2))
    plot(newfit2)
    

      

    检测异常值

    #离群点
    library(car)
    outlierTest(fit)
    
    rstudent unadjusted p-value Bonferonni p
    Nevada 3.542929 0.00095088 0.047544
    
     
    
    #高杠杆值
    hat.plot<-function(fit){
    p<-length(coefficients(fit))
    n<-length(fitted(fit))
    plot(hatvalues(fit),main="Index Plot of Hat Values")
    abline(h=c(2,3)*p/n,col="red",lty=2)
    identify(1:n,hatvalues(fit),names(hatvalues(fit)))
    }
    
    hat.plot(fit)
    

      

    #强影响点
    cutoff<-4/(nrow(states)-length(fit$coefficients)-2)
    plot(fit,which=4,cook.levels=cutoff)
    abline(h=cutoff,lty=2,col="red")

    library(car)
    avPlots(fit,ask=FALSE,id.method="identify")

    library(car)
    influencePlot(fit,id.method = "identify",main="Influence Plot",sub="Circle size is proportional to Cook's distance")

    改进措施

    #变量变换
    library(car)
    summary(powerTransform(states$Murder))
    
    bcPower Transformation to Normality 
    Est Power Rounded Pwr Wald Lwr bnd Wald Upr Bnd
    states$Murder 0.6055 1 0.0884 1.1227  #可以使用 0.6来正态化变量Murder, 例如 Murder ^ 0.6
    
    Likelihood ratio tests about transformation parameters
    LRT df pval
    LR test, lambda = (0) 5.665991 1 0.01729694
    LR test, lambda = (1) 2.122763 1 0.14512456

    选择最佳回归模型

    #排除不需要的随机变量
    fit1<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
    fit2<-lm(Murder~Population+Illiteracy,data=states)
    anova(fit2,fit1)
    
    AIC(fit1,fit2) #AIF 返回值越小,模型效果越佳,精简变量
    
    df AIC
    fit1 6 241.6429
    fit2 4 237.6565
    
     
    
    逐步回归
    
    模型通过每次添加或删除一个变量来探索最佳的回归模型所需要的变量
    
    library(MASS)
    fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
    stepAIC(fit,direction="backward")
    
     
    
    Start: AIC=97.75
    Murder ~ Population + Illiteracy + Income + Frost
    
    Df Sum of Sq RSS AIC
    - Frost 1 0.021 289.19 95.753
    - Income 1 0.057 289.22 95.759
    <none> 289.17 97.749
    - Population 1 39.238 328.41 102.111
    - Illiteracy 1 144.264 433.43 115.986
    
    Step: AIC=95.75
    Murder ~ Population + Illiteracy + Income
    
    Df Sum of Sq RSS AIC
    - Income 1 0.057 289.25 93.763
    <none> 289.19 95.753
    - Population 1 43.658 332.85 100.783
    - Illiteracy 1 236.196 525.38 123.605
    
    Step: AIC=93.76
    Murder ~ Population + Illiteracy
    
    Df Sum of Sq RSS AIC
    <none> 289.25 93.763
    - Population 1 48.517 337.76 99.516
    - Illiteracy 1 299.646 588.89 127.311
    
    Call:
    lm(formula = Murder ~ Population + Illiteracy, data = states) 最佳模型公式
    
    Coefficients:
    (Intercept) Population Illiteracy 
    1.6515497 0.0002242 4.0807366
    
     
    
    变量选择-全子集回归
    install.packages("leaps")
    library(leaps)
    states<-as.data.frame(state.x77[,c("Murder","Population","Illiteracy","Income","Frost")])
    leaps<-regsubsets(Murder~Population+Illiteracy+Income+Frost,data=states,nbest=4)
    plot(leaps,scale="adjr2")

    library(car)
    subsets(leaps,statistic="cp",main="Cp Plot for All Subsets Regression")
    abline(1,1,lty=2,col="red")  #离线越近,效果约好

    #R平方的K重交叉验证
    install.packages("bootstrap")
    shrinkage<-function(fit,k=10)
    {
    require(bootstrap)
    theta.fit<-function(x,y){lsfit(x,y)}
    theta.predict<-function(fit,x){cbind(1,x)%*%fit$coef}
    
    x<-fit$model[,2:ncol(fit$model)]
    y<-fit$model[,1]
    
    results<-crossval(x,y,theta.fit,theta.predict,ngroup=k)
    r2<-cor(y,fit$fitted.values)^2
    r2cv<-cor(y,results$cv.fit)^2
    cat("Original R-square=",r2,"
    ")
    cat(k,"Fold Cross-Validated R-square=",r2cv,"
    ")
    cat("Change=",r2-r2cv,"
    ")
    
    }
    fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
    shrinkage(fit)
    
    Original R-square= 0.5669502 
    10 Fold Cross-Validated R-square= 0.4474345 
    Change= 0.1195158
    
    
    fit2<-lm(Murder~Population+Illiteracy,data=states)
    shrinkage(fit2)
    
    Original R-square= 0.5668327 
    10 Fold Cross-Validated R-square= 0.5182764 
    Change= 0.0485563
    
     
    
     
    
    相对重要性
    
    检测模型中的变量重要性程度
    zstates<-as.data.frame(scale(states))
    zfit<-lm(Murder~Population+Income+Illiteracy+Frost,data=zstates)
    coef(zfit)
    
     
    
     
    
    (Intercept) Population Income Illiteracy Frost 
    -2.054026e-16 2.705095e-01 1.072372e-02 6.840496e-01 8.185407e-03 文盲率值最大,所以它在模型变量中的权重最大
    
     
    
     
    
    变量权重
    
    
    relweights <- function(fit,...){
    R <- cor(fit$model)
    nvar <- ncol(R)
    rxx <- R[2:nvar, 2:nvar]
    rxy <- R[2:nvar, 1]
    svd <- eigen(rxx)
    evec <- svd$vectors
    ev <- svd$values
    delta <- diag(sqrt(ev))
    lambda <- evec %*% delta %*% t(evec)
    lambdasq <- lambda ^ 2
    beta <- solve(lambda) %*% rxy
    rsquare <- colSums(beta ^ 2)
    rawwgt <- lambdasq %*% beta ^ 2
    import <- (rawwgt / rsquare) * 100
    import <- as.data.frame(import)
    row.names(import) <- names(fit$model[2:nvar])
    names(import) <- "Weights"
    import <- import[order(import),1, drop=FALSE]
    dotchart(import$Weights, labels=row.names(import),
    xlab="% of R-Square", pch=19,
    main="Relative Importance of Predictor Variables",
    sub=paste("Total R-Square=", round(rsquare, digits=3)),
    ...)
    return(import)
    }
    
    fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
    relweights(fit,col="blue")

  • 相关阅读:
    opencv-learnopencv-Facial Landmark Detection
    【OpenCV】Learn OpenCV
    leetcode-1-TwoNums
    【c++基础】判断是否到文件末尾-eof函数
    【c++基础】int转string自动补零
    【linux】如何退出shell终端
    【机器学习算法】bagging算法
    【机器学习算法】Boostrapping算法
    【python基础】如何注释代码块
    【机器学习算法】cascade classifier级联分类器
  • 原文地址:https://www.cnblogs.com/GhostBear/p/7724686.html
Copyright © 2011-2022 走看看