zoukankan      html  css  js  c++  java
  • 泰坦尼克号生还者概率

    加载数据 

    赋值新数据集

    字段值之间的关系

    探索缺失值的方法

    填补缺失值

    使用决策树或随机森林处理缺失值

    使用逻辑回归 

    #加载数据

    test <- read.csv(file.choose())
    train <- read.csv(file.choose())

    #注意:字符串要带有双引号

    #“Sex”字段类型转换/生成新的字段值(因子类型字段处理)

    train$sex[train$Sex == "female"] <- 0
    train$sex[train$Sex == "male"] <- 1
    
    test$sex[test$Sex == "female"] <- 0
    test$sex[test$Sex == "male"] <- 1

    #“Embarked“字段转换/生成新的字段值(因子类型字段处理)

    train$embarked[train$Embarked == "C"] <-1
    train$embarked[train$Embarked == "Q"] <-2
    train$embarked[train$Embarked == "S"] <-3
    
    test$embarked[test$Embarked == "C"] <-1
    test$embarked[test$Embarked == "Q"] <-2
    test$embarked[test$Embarked == "S"] <-3

    #拆分字段"Name",获取称谓

    title_train <- as.character(train$Name)
    title_train <- strsplit(title_train," ")
    
    title_test <- as.character(test$Name)
    title_test <- strsplit(title_test," ")

    #生成新字段title

    train$title <- train$Survived
    train$title <- as.character(train$title)
    
    test$title <- test$PassengerId
    test$title <- as.character(test$PassengerId)

    #提取姓名中的称呼字段"Name" Warning

    for(i in 1:length(train$title)){
    temp_num <- grep("\\.",title_train[[i]]);
    #if else 完善结构,无意义
    if(is.integer(temp_num)) {
    train$title[i] <- title_train[[i]][temp_num];
    }
    else {
    train$title[i] <- NA;
    }
    }

    #定义需要删除的字段

    temp_name <- names(train)
    temp_delete <- c("Embarked","Name","Sex")

    #循环删除字段

    for(i in 1:length(temp_delete))
    temp_name <- temp_name[-grep(temp_delete[i],temp_name)]
    

    #赋值新数据集 train_set

    train_set <- train[temp_name]
    
    
    for(i in 1:length(test$title)){
    temp_num <- grep("\\.",title_test[[i]]);
    #if else 完善结构,无意义
    if(is.integer(temp_num)) {
    test$title[i] <- title_test[[i]][temp_num];
    }
    else {
    test$title[i] <- NA;
    }
    }

    #定义需要删除的字段

    temp_name <- names(test)
    temp_delete <- c("Embarked","Name","Sex")

    #循环删除字段

    for(i in 1:length(temp_delete))
    temp_name <- temp_name[-grep(temp_delete[i],temp_name)]

    #赋值新数据集 test_set

    test_set <- test[temp_name]
    
    #剔除多余列的优化方案
    # n<-names(train)
    # head(train_ [n [n !="title" & n!="sex" & n!="Fare" ] ] )

    #2017-08-06
    #数据处理的目标:字段是否可用,如何转化为可用字段
    #Ticket 可以根据字段特征(长度+前几位数字、相同字幕),确定字段是否属于同类型(级别)
    #缺失值处理,可以对train和test数据集合并后进行处理

    #作图

    #导出图片

    #savePlot(filename = "",
    # type = c("wmf", "emf", "png", "jpg", "jpeg", "bmp",
    # "tif", "tiff", "ps", "eps", "pdf"),
    # device = dev.cur(),
    # restoreConsole = TRUE)

    #字段值之间的关系

    #性别和生存关系

    #savePlot(filename = "",
    # type = c("wmf", "emf", "png", "jpg", "jpeg", "bmp",
    # "tif", "tiff", "ps", "eps", "pdf"),
    # device = dev.cur(),
    # restoreConsole = TRUE)


    #登船位置与生存关系

    #barplot(table(train_set$Survived,train_set$embarked),col=c("red","green"),args.legend = list(x = "topleft"),legend.text = c("0", "1"))

     #年龄的统计量

    summary(train$Age)
    #Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
    #0.42 20.12 28.00 29.70 38.00 80.00 177

    #箱线图查看生存和年龄的关系

    #boxplot(Age~Survived, data=train)

    #查看年龄变量缺失值对应title统计量

    table(train_set$title[is.na(train_set$Age)])


     #查看train数据集中的title变量年龄的分布

    #boxplot(Age~title,data=train_set)

     

    #合并test和train数据集

    train_age <- data.frame(train$Age,train$title)
    names(train_age) <- c("age","title")
    test_age <- data.frame(test$Age,test$title)
    names(test_age) <- c("age","title")
    combin_age <- rbind(train_age,test_age)
    head(combind_age)

    #剔除title出现次数较少行

    temp <- combin_age[which(combin_age$title !="Capt."
    & combin_age$title !="Countess."
    & combin_age$title !="Don."
    & combin_age$title !="Jonkheer."
    & combin_age$title !="Lady."
    & combin_age$title !="Mme."
    & combin_age$title !="Sir."
    & combin_age$title !="Dona."
    & combin_age$title !="Ms."
    & combin_age$title !="Mlle."
    & combin_age$title !="Major."
    & combin_age$title !="Col."
    ),]


    #作图,合并后的age和title

    #boxplot(age~title,data=temp)

    #备份数据

    train_ <- train_set
    test_ <- test_set

    #由于和年龄相关变量较少,故此处仅使用中位数作为缺失值

    #获取年龄中位数

    summary(temp[which(temp$title=="Dr."),]$age)
    #Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
    #23.00 38.00 49.00 43.57 51.50 54.00 1
    summary(temp[which(temp$title=="Master."),]$age)
    #Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
    #0.330 2.000 4.000 5.483 9.000 14.500 8
    summary(temp[which(temp$title=="Miss."),]$age)
    #Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
    #0.17 15.00 22.00 21.77 30.00 63.00 50
    summary(temp[which(temp$title=="Mr."),]$age)
    #Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
    #11.00 23.00 29.00 32.25 39.00 80.00 176
    summary(temp[which(temp$title=="Mrs."),]$age)
    #Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
    #14.00 27.00 35.50 36.99 46.50 76.00 27

    #填补缺失值

    train_$Age[train_$title =="Miss." & is.na(train_$Age)]<-median(temp[which(temp$title=="Miss."),]$age,na.rm=TRUE)
    train_$Age[train_$title =="Master." & is.na(train_$Age)]<-median(temp[which(temp$title=="Master."),]$age,na.rm=TRUE)
    train_$Age[train_$title =="Mr." & is.na(train_$Age)]<-median(temp[which(temp$title=="Mr."),]$age,na.rm=TRUE)
    train_$Age[train_$title =="Mrs." & is.na(train_$Age)]<-median(temp[which(temp$title=="Mrs."),]$age,na.rm=TRUE)
    train_$Age[train_$title =="Dr." & is.na(train_$Age)]<-median(temp[which(temp$title=="Dr."),]$age,na.rm=TRUE)
    
    
    test_$Age[test_$title =="Miss." & is.na(test_$Age)]<-median(temp[which(temp$title=="Miss."),]$age,na.rm=TRUE)
    test_$Age[test_$title =="Master." & is.na(test_$Age)]<-median(temp[which(temp$title=="Master."),]$age,na.rm=TRUE)
    test_$Age[test_$title =="Mr." & is.na(test_$Age)]<-median(temp[which(temp$title=="Mr."),]$age,na.rm=TRUE)
    test_$Age[test_$title =="Mrs." & is.na(test_$Age)]<-median(temp[which(temp$title=="Mrs."),]$age,na.rm=TRUE)
    test_$Age[test_$title =="Dr." & is.na(test_$Age)]<-median(temp[which(temp$title=="Dr."),]$age,na.rm=TRUE)

    #剔除embarked为NA的行

    train_ <- train_[!is.na(train_$embarked),]
    test_ <- test_[!is.na(test_$embarked),]

    #剔除多余的列

    train_ <- train_[names(train_)[(names(train_) !="title")]]
    test_ <- test_[names(test_)[(names(test_) !="title")]]

    #票价有为0的  

    #从下图可以看出,船舱等级和票价之间的联系,然后对0票价进行处理

    #summary(train_$Fare) 
    #发现存在票价为0的情况,查看为0的数量
    #length(train_$Fare[train_$Fare == 0])
    

    #剔除(未分析的)干扰项目

    train_ <- train_[names(train_)[(names(train_) !="Cabin")]]
    train_ <- train_[names(train_)[(names(train_) !="Ticket")]]
    
    test_ <- test_[names(test_)[(names(test_) !="Cabin")]]
    test_ <- test_[names(test_)[(names(test_) !="Ticket")]]

    #查看变量直接的关联强度(剔除NA值)

    #此处应该先把非定量序数变量变成定量序数变量后,采用Speaman进行相关分析

    heatmap(cor(as.matrix(na.omit(train_))),symm = TRUE, Rowv=NA, Colv=NA, col = cm.colors(256))

     

    #查看票价和船仓等级之间的关系

    boxplot(Fare~Pclass,data=subset(train_,Fare !=  "512.3292"),horizontal=TRUE)

     

    #剔除0值后再观察

    boxplot(Fare~Pclass,data=subset(train_,Fare !=  "512.3292" & Fare !="0"),horizontal=TRUE)


     #探索缺失值的方法

    library(mice)
    md.pattern(train_)

    #使用决策树或随机森林处理缺失值

    #用于熟悉分类方法,目前掌握的资料暂未发现使用该方法进行缺失值处理

    #采用随机抽取的方法,把train_分为训练集和测试集

    #分析两个模型

    #获取到如下数据 head(newData)

    #注意数据集中不能出现字符串,可以更改为因子

    #指定百分比抽样数据,分作训练集和测试集

    sampleData <- sample(2,replace= TRUE, nrow(newData), prob=c(0.7, 0.3)) 
    trainData <- newData[sampleData==1,]
    testData <- newData[sampleData==2,]

    #条件推断树

    library(party)
    f1 <- emba ~ Survived + Pclass + Age + SibSp + Parch + Fare + sex 
    c_tree <- ctree(f1, data=trainData)

    #绘制决策树

    plot(c_tree)

    #对测试数据进行预测

    testPred <- predict(c_tree, newdata = testData)
    table(testPred,testData$emba)

    #随机森林方法进行预测

     library(randomForest)
     rf <- randomForest(f1, data=trainData, ntree=100, proximity=TRUE)
     table(predict(rf), trainData$emba)

    #由于预测效果极差,放弃该方法

    #处理test中的NA的情况

    #此处忽略操作......

    #使用逻辑回归

    f1<-glm(Survived~Pclass+Age+SibSp+Parch+Fare+sex+embarked,data=train_,family=binomial())
    
    f2<-glm(Survived~Pclass+Age+SibSp+sex+embarked,data=train_,family=binomial())
    
    anova(f1,f2,test="Chisq")
    
    #Analysis of Deviance Table
    
    #Model 1: Survived ~ Pclass + Age + SibSp + Parch + Fare + sex + embarked
    #Model 2: Survived ~ Pclass + Age + SibSp + sex + embarked
    #  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
    #1       881     781.22                     
    #2       883     782.29 -2  -1.0685   0.5861
    f3<-glm(Survived~Pclass+Age+SibSp+sex+embarked,data=train_,family=binomial())
    
    test_p<-test_[,c("Pclass","Age","Fare","sex","embarked")]
    
    pred<-predict(f3,test_p)
    
    perf[perf>0]<-1
    
    perf[perf<=0]<-0
  • 相关阅读:
    setup-nginx
    Sql Server
    第一次找工作
    JGroups 初探
    游戏与彩票
    MINA 网络黏包处理代码
    powershell遍历文件夹设置权限,解决文件无法删除的问题。
    c# 异步任务队列(可选是否使用单线程执行任务,以及自动取消任务)
    c#注册表对象映射
    最小安装centos 7 无GUI静默安装 oracle 12c,打造轻量linux化服务器
  • 原文地址:https://www.cnblogs.com/aongao/p/7284186.html
Copyright © 2011-2022 走看看