R 이진분류 모델 비교 구현

1 개요

# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(42)

#### Data preparation
# Download binary.csv; the columns used by this script are admit, gre, gpa, rank.
df <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
# Copy the response `admit` into a new last column `y`, then drop the original.
df[["y"]] <- df[["admit"]]
df[["admit"]] <- NULL

# Train/test split: createDataPartition() selects the training rows (p = 0.5),
# and every remaining row goes to Test.
library(caret, quietly = TRUE)
idx <- createDataPartition(df$y, p = 0.5, list = FALSE)
Train <- df[idx, ]
Test <- df[-idx, ]

# Model fitting: train five binary classifiers on the same three predictors.
library(rpart)
library(xgboost)
library(neuralnet)
library(randomForest)

# Predictor names and the formula shared by the formula-based learners.
x_columns <- c("gre", "gpa", "rank")
formula <- as.formula("y ~ gre + gpa + rank")

# randomForest is fitted on a copy whose response is a factor; the evaluation
# code later asks this model for class probabilities (type = "prob").
Train_forest <- Train
Train_forest$y <- as.factor(as.character(Train$y))

# xgboost takes a numeric predictor matrix and a separate label vector.
# Select the predictors by name from Train itself rather than through a
# logical mask computed from another data frame's column names.
Train_xgboost <- as.matrix(Train[, x_columns])

model_tree <- rpart(formula, Train)
model_forest <- randomForest(formula, Train_forest)
# `nrounds` is spelled out in full instead of relying on partial argument
# matching of `nround`.
model_xgboost <- xgboost(
  data = Train_xgboost,
  label = Train$y,
  nrounds = 2,
  verbose = 0,
  objective = "binary:logistic"
)
model_logistic <- glm(formula, Train, family = "binomial")
model_neuralnet <- neuralnet(formula, Train, hidden = 2)

#### Model evaluation 1 - ROC curves
library(ROCR)

# Predictor-only views of the test set. xgboost needs a matrix.
Test_xgboost <- as.matrix(Test[, names(df) %in% x_columns])
# NOTE(review): Test_neuralnet is not used by the code visible here; kept in
# case other code relies on it.
Test_neuralnet <- Test[, names(df) %in% x_columns]

# Score the test set with every model. Each pred_* column is meant to be a
# score where larger means "more likely y == 1"; a 0.5 cutoff is applied to
# these columns later in the script.
TestResult <- Test
TestResult$pred_tree <- predict(model_tree, Test)
TestResult$pred_forest <- predict(model_forest, Test, type = "prob")[, 2]
TestResult$pred_xgboost <- predict(model_xgboost, Test_xgboost)
# predict.glm() defaults to type = "link" (log-odds). Request probabilities
# explicitly so that the 0.5 cutoff used later means what it says.
TestResult$pred_logistic <- predict(model_logistic, Test, type = "response")
# Flatten the neuralnet prediction so it is stored as an ordinary numeric
# column, like the other scores.
TestResult$pred_neuralnet <- as.numeric(predict(model_neuralnet, Test))

# ROCR prediction objects: scores paired with the true labels.
pred_tree <- prediction(TestResult$pred_tree, Test$y)
pred_forest <- prediction(TestResult$pred_forest, Test$y)
pred_xgboost <- prediction(TestResult$pred_xgboost, Test$y)
pred_logistic <- prediction(TestResult$pred_logistic, Test$y)
pred_neuralnet <- prediction(TestResult$pred_neuralnet, Test$y)

# True-positive rate against false-positive rate, i.e. the ROC curve.
perf_tree <- performance(pred_tree, "tpr", "fpr")
perf_forest <- performance(pred_forest, "tpr", "fpr")
perf_xgboost <- performance(pred_xgboost, "tpr", "fpr")
perf_logistic <- performance(pred_logistic, "tpr", "fpr")
perf_neuralnet <- performance(pred_neuralnet, "tpr", "fpr")

# Draw all five curves on one plot; the first call opens the plot and the
# others are overlaid on it.
methods <- c("tree", "forest", "xgboost", "logistic", "neuralnet")
colors <- c("black", "green", "blue", "orange", "red")
plot(perf_tree, lwd = 2, add = FALSE, col = colors[1])
plot(perf_forest, lwd = 2, add = TRUE, col = colors[2])
plot(perf_xgboost, lwd = 2, add = TRUE, col = colors[3])
plot(perf_logistic, lwd = 2, add = TRUE, col = colors[4])
plot(perf_neuralnet, lwd = 2, add = TRUE, col = colors[5])
title("ROC curve")
legend("bottomright", methods, lwd = 2, col = colors)


#### Model evaluation 2 - Confusion matrix

# Turn each model's score column pred_<model> into a hard 0/1 label column
# y_<model> using a 0.5 cutoff.
for (m in c("tree", "forest", "xgboost", "logistic", "neuralnet")) {
  TestResult[[paste0("y_", m)]] <-
    ifelse(TestResult[[paste0("pred_", m)]] > 0.5, 1, 0)
}

# Confusion matrices: predicted label (rows) against true label (columns).
table(TestResult$y_tree, Test$y)
table(TestResult$y_forest, Test$y)
table(TestResult$y_xgboost, Test$y)
table(TestResult$y_logistic, Test$y)
table(TestResult$y_neuralnet, Test$y)


#### Model evaluation 3 - Summary metrics

# Build one row per model with Accuracy, Precision, Recall and F1 score,
# computed from the 0/1 label columns y_<model> in TestResult and the true
# labels in Test$y.
model_names <- c("tree", "forest", "xgboost", "logistic", "neuralnet")
evaluation <- data.frame(Name = model_names)

# Logical "is positive" vectors. Labels are numeric 0/1, so compare against 1
# explicitly rather than against T.
actual_pos <- Test$y == 1
predicted_pos <- lapply(model_names, function(m) {
  as.vector(TestResult[[paste0("y_", m)]]) == 1
})

# Accuracy: share of test rows whose predicted label matches the true label.
evaluation$Accuracy <- vapply(
  predicted_pos,
  function(p) sum(p == actual_pos) / nrow(Test),
  numeric(1)
)

# Precision: true positives / predicted positives.
# This is NaN for a model that predicts no positives at all.
evaluation$Precision <- vapply(
  predicted_pos,
  function(p) sum(p & actual_pos) / sum(p),
  numeric(1)
)

# Recall: true positives / actual positives.
evaluation$Recall <- vapply(
  predicted_pos,
  function(p) sum(p & actual_pos) / sum(actual_pos),
  numeric(1)
)

# F1 score: harmonic mean of precision and recall.
evaluation$F1_Score <- 2 * evaluation$Precision * evaluation$Recall /
  (evaluation$Precision + evaluation$Recall)

# cat() cannot output a data frame (it raises an error), so print the title
# with cat() and the table itself with print().
cat("성능평가표\n")
print(evaluation)
2 같이 보기

3 참고