install.packages("pROC") install.packages("randomForest") library(pROC) library(randomForest)
ここから入手可能なソースデータを読み取ります:
titanic3.csv
data <- read.csv('~/habr/unit_weights/titanic3.csv')
従属変数が
survived
data$survived <- as.factor(data$survived)
通常の分析には異なる値が多すぎる変数を除外します
data$name <- NULL data$ticket <- NULL data$cabin <- NULL data$home.dest <- NULL data$embarked <- NULL
将来の変数を使用しないでください
data$boat <- NULL data$body <- NULL
欠損値を平均で置き換えます
data$age[is.na(data$age)] <- mean(data$age, na.rm=TRUE) data$fare[is.na(data$fare)] <- mean(data$fare, na.rm=TRUE)
性別を指標変数に変換する
data$female <- 0 data$female[which(data$sex == 'female')] <- 1 data$sex <- NULL
残っているものを見ます:
survived pclass age sibsp 0:809 Min. :1.000 Min. : 0.1667 Min. :0.0000 1:500 1st Qu.:2.000 1st Qu.:22.0000 1st Qu.:0.0000 Median :3.000 Median :29.8811 Median :0.0000 Mean :2.295 Mean :29.8811 Mean :0.4989 3rd Qu.:3.000 3rd Qu.:35.0000 3rd Qu.:1.0000 Max. :3.000 Max. :80.0000 Max. :8.0000 parch fare female Min. :0.000 Min. : 0.000 Min. :0.000 1st Qu.:0.000 1st Qu.: 7.896 1st Qu.:0.000 Median :0.000 Median : 14.454 Median :0.000 Mean :0.385 Mean : 33.295 Mean :0.356 3rd Qu.:0.000 3rd Qu.: 31.275 3rd Qu.:1.000 Max. :9.000 Max. :512.329 Max. :1.000
実験を数千回繰り返します
im.gini = NULL pm.gini = NULL lr.gini = NULL rf.gini = NULL set.seed(42) for (i in 1:1000) {
乗客を2つのサンプルに分割します-70%がトレーニングサンプルに送られ、残りの30%でモデルの品質が測定されます。
data$random_number <- runif(nrow(data),0,1) development <- data[ which(data$random_number > 0.3), ] holdout <- data[ which(data$random_number <= 0.3), ] development$random_number <- NULL holdout$random_number <- NULL
シングルウェイトモデル
beta_pclass <- -1/sd(development$pclass) beta_age <- -1/sd(development$age ) beta_female <- 1/sd(development$female) im.score <- beta_pclass*holdout$pclass + beta_age*holdout$age + beta_female*holdout$female im.roc <- roc(holdout$survived, im.score) im.gini[i] <- 2*im.roc$auc-1
通常のモデルは、同じ独立変数を使用したロジスティック回帰です。
pm.model = glm(survived~pclass+age+female, family=binomial(logit), data=development) pm.score <- predict(pm.model, holdout, type="response") pm.roc <- roc(holdout$survived, pm.score) pm.gini[i] <- 2*pm.roc$auc-1
すべての変数を含むロジスティック回帰
lr.model = glm(survived~., family=binomial(logit), data=development) lr.score <- predict(lr.model, holdout, type="response") lr.roc <- roc(holdout$survived, lr.score) lr.gini[i] <- 2*lr.roc$auc-1
みんなのお気に入り(理由もなく)RandomForest
rf.model <- randomForest(survived~., development) rf.score <- predict(rf.model, holdout, type = "prob") rf.roc <- roc(holdout$survived, rf.score[,1]) rf.gini[i] <- 2*rf.roc$auc-1 }
結果を出力する
bpd<-data.frame(ImproperModel=im.gini, ProperModel=pm.gini, LogisticRegression=lr.gini, RandomForest=rf.gini) png('~/habr/unit_weights/auc_comparison.png', height=700, width=400, res=120, units='px') boxplot(bpd, las=2, ylab="Gini", ylim=c(0,1), par(mar=c(9,5,1,1)+ 0.1), col=c("red","green","royalblue2","brown")) dev.off() mean(im.gini) mean(pm.gini) mean(lr.gini) mean(rf.gini) mean(im.gini)/mean(rf.gini) mean(pm.gini)/mean(rf.gini) mean(lr.gini)/mean(rf.gini) mean(rf.gini)/mean(rf.gini)