Implementing binary classification model comparison in R

1 Overview

Comparison of binary classification models in R: five classifiers (decision tree, random forest, xgboost, logistic regression, neural network) are fitted to the same data and compared using ROC curves, confusion matrices, and summary metrics.
Also known as: R classification model comparison, R binary classification model comparison.

2 Example 1

set.seed(42) # fix the random seed for reproducibility

#### Prepare the data
df = read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
df$y = df$admit # rename the target column admit to y
df$admit = NULL

# Split the data into training and test sets (50:50)
library(caret)
idx = createDataPartition(df$y, p=0.5, list=F)
Train = df[ idx,]
Test  = df[-idx,]

# Fit the models
library(rpart)
library(xgboost)
library(neuralnet)
library(randomForest, warn.conflicts=F)

x_columns = c("gre","gpa","rank")
formula = as.formula("y ~ gre + gpa + rank")
Train_forest = Train
Train_forest$y = as.factor(as.character(Train$y)) # randomForest needs a factor target for classification
Train_xgboost = as.matrix(Train[,names(df) %in% x_columns]) # xgboost needs a numeric matrix

model_tree      = rpart       (formula, Train)
model_forest    = randomForest(formula, Train_forest)
model_xgboost   = xgboost     (data=Train_xgboost, label=Train$y, nrounds=2, verbose=0, objective="binary:logistic")
model_logistic  = glm         (formula, Train, family="binomial")
model_neuralnet = neuralnet   (formula, Train, hidden=2)

#### Model evaluation 1 - ROC curves
library(ROCR, warn.conflicts=F)
Test_xgboost   = as.matrix(Test[,names(df) %in% x_columns])
Test_neuralnet = Test[,names(df) %in% x_columns]

TestResult = Test
TestResult$pred_tree      = predict(model_tree     , Test)
TestResult$pred_forest    = predict(model_forest   , Test, type="prob")[,2]
TestResult$pred_xgboost   = predict(model_xgboost  , Test_xgboost)
TestResult$pred_logistic  = predict(model_logistic , Test, type="response") # probabilities rather than log-odds, so the 0.5 cutoff below is on the right scale
TestResult$pred_neuralnet = predict(model_neuralnet, Test_neuralnet)

pred_tree      = prediction( TestResult$pred_tree     , Test$y )
pred_forest    = prediction( TestResult$pred_forest   , Test$y )
pred_xgboost   = prediction( TestResult$pred_xgboost  , Test$y )
pred_logistic  = prediction( TestResult$pred_logistic , Test$y )
pred_neuralnet = prediction( TestResult$pred_neuralnet, Test$y )

perf_tree      = performance(pred_tree     , "tpr", "fpr")
perf_forest    = performance(pred_forest   , "tpr", "fpr")
perf_xgboost   = performance(pred_xgboost  , "tpr", "fpr")
perf_logistic  = performance(pred_logistic , "tpr", "fpr")
perf_neuralnet = performance(pred_neuralnet, "tpr", "fpr")

methods = c("tree" , "forest", "xgboost", "logistic", "neuralnet")
colors  = c("black", "green" , "blue"   , "orange"  , "red"      )
plot(perf_tree     , lwd=2, add=F, col=colors[1])
plot(perf_forest   , lwd=2, add=T, col=colors[2])
plot(perf_xgboost  , lwd=2, add=T, col=colors[3])
plot(perf_logistic , lwd=2, add=T, col=colors[4])
plot(perf_neuralnet, lwd=2, add=T, col=colors[5])
title("ROC curve")
legend("bottomright", methods, lwd=2, col=colors)


#### Model evaluation 2 - Confusion matrix

# classify using a 0.5 cutoff
TestResult$y_tree      = ifelse(TestResult$pred_tree     >0.5, 1, 0)
TestResult$y_forest    = ifelse(TestResult$pred_forest   >0.5, 1, 0)
TestResult$y_xgboost   = ifelse(TestResult$pred_xgboost  >0.5, 1, 0)
TestResult$y_logistic  = ifelse(TestResult$pred_logistic >0.5, 1, 0)
TestResult$y_neuralnet = ifelse(TestResult$pred_neuralnet>0.5, 1, 0)

cat( "분류표" )
table(TestResult$y_tree     , Test$y)
table(TestResult$y_forest   , Test$y)
table(TestResult$y_xgboost  , Test$y)
table(TestResult$y_logistic , Test$y)
table(TestResult$y_neuralnet, Test$y)
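
# For a single model, caret's confusionMatrix() reports the same table together
# with accuracy and related statistics (Example 2 below does this for all five);
# a minimal sketch for the tree model, reusing the objects above:
confusionMatrix(table(TestResult$y_tree, Test$y), positive="1")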


#### Model evaluation 3 - Various metrics

evaluation = data.frame(Name=c("tree","forest","xgboost","logistic","neuralnet"))
# Compute accuracy
evaluation$Accuracy = c(
  sum(TestResult$y_tree     ==Test$y)/nrow(Test),
  sum(TestResult$y_forest   ==Test$y)/nrow(Test),
  sum(TestResult$y_xgboost  ==Test$y)/nrow(Test),
  sum(TestResult$y_logistic ==Test$y)/nrow(Test),
  sum(TestResult$y_neuralnet==Test$y)/nrow(Test)
)

# Compute precision: of the cases predicted 1, the fraction that are actually 1
evaluation$Precision = c(
  sum(TestResult$y_tree     ==1 & Test$y==1)/sum(TestResult$y_tree     ==1),
  sum(TestResult$y_forest   ==1 & Test$y==1)/sum(TestResult$y_forest   ==1),
  sum(TestResult$y_xgboost  ==1 & Test$y==1)/sum(TestResult$y_xgboost  ==1),
  sum(TestResult$y_logistic ==1 & Test$y==1)/sum(TestResult$y_logistic ==1),
  sum(TestResult$y_neuralnet==1 & Test$y==1)/sum(TestResult$y_neuralnet==1)
)

# Compute recall: of the actual 1s, the fraction predicted 1
evaluation$Recall = c(
  sum(TestResult$y_tree     ==1 & Test$y==1)/sum(Test$y==1),
  sum(TestResult$y_forest   ==1 & Test$y==1)/sum(Test$y==1),
  sum(TestResult$y_xgboost  ==1 & Test$y==1)/sum(Test$y==1),
  sum(TestResult$y_logistic ==1 & Test$y==1)/sum(Test$y==1),
  sum(TestResult$y_neuralnet==1 & Test$y==1)/sum(Test$y==1)
)

# Compute the F1 score (harmonic mean of precision and recall)
evaluation$F1_Score = 2 * evaluation$Precision * evaluation$Recall / ( evaluation$Precision + evaluation$Recall )

cat( "\n평가지표\n" )
print( evaluation )
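
# AUC (area under the ROC curve) condenses each ROC curve into a single number.
# A minimal sketch that appends it to the table, reusing the ROCR prediction
# objects created above (the AUC column is an addition, not in the original):
evaluation$AUC = c(
  performance(pred_tree     , "auc")@y.values[[1]],
  performance(pred_forest   , "auc")@y.values[[1]],
  performance(pred_xgboost  , "auc")@y.values[[1]],
  performance(pred_logistic , "auc")@y.values[[1]],
  performance(pred_neuralnet, "auc")@y.values[[1]]
)
print( evaluation )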

3 Example 2

library(caret)        # createDataPartition()
library(hash)         # hash()
library(rpart)        # rpart()
library(xgboost)      # xgboost()
library(neuralnet)    # neuralnet()
library(randomForest) # randomForest()
library(ROCR)         # performance()

set.seed(42) # fix the random seed for reproducibility

#### Prepare the data
df = read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
df$y = df$admit # rename the target column admit to y
df$admit = NULL

# Split the data into training and test sets (50:50)
idx = createDataPartition(df$y, p=0.5, list=F)
Train = df[ idx,]
Test  = df[-idx,]

# Fit the models
x_columns = c("gre","gpa","rank")
formula = as.formula("y ~ gre + gpa + rank")
Train_forest = Train
Train_forest$y = as.factor(as.character(Train$y)) # randomForest needs a factor target for classification
Train_xgboost  = as.matrix(Train[,names(df) %in% x_columns]) # xgboost needs a numeric matrix
Test_xgboost   = as.matrix(Test[,names(df) %in% x_columns])
Test_neuralnet = Test[,names(df) %in% x_columns]

model_names = c("tree","forest","xgboost","logistic","neuralnet")
lst = list()
for(i in 1:5) {
  lst[[i]] = hash(name=model_names[i]) # one hash per model; filled below with model, predictions, and metrics
}

lst[[1]]$model = rpart       (formula, Train)
lst[[2]]$model = randomForest(formula, Train_forest)
lst[[3]]$model = xgboost     (data=Train_xgboost, label=Train$y, nrounds=2, verbose=0, objective="binary:logistic")
lst[[4]]$model = glm         (formula, Train, family="binomial")
lst[[5]]$model = neuralnet   (formula, Train, hidden=2)

lst[[1]]$predict = predict(lst[[1]]$model, Test)
lst[[2]]$predict = predict(lst[[2]]$model, Test, type="prob")[,2]
lst[[3]]$predict = predict(lst[[3]]$model, Test_xgboost)
lst[[4]]$predict = predict(lst[[4]]$model, Test, type="response") # probabilities rather than log-odds
lst[[5]]$predict = predict(lst[[5]]$model, Test_neuralnet)

for (i in 1:5) {
  lst[[i]]$predictedY      = ifelse(lst[[i]]$predict>.5, 1, 0)
  lst[[i]]$performance     = performance(prediction(lst[[i]]$predict, Test$y), "tpr", "fpr")
  lst[[i]]$confusionMatrix = confusionMatrix(table(lst[[i]]$predictedY, Test$y), positive="1") # class 1 as the positive class, matching Example 1
}
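
# Each lst entry now bundles the model name, fitted model, predictions, and
# evaluation objects; e.g. inspect a single model's confusion matrix:
lst[[4]]$confusionMatrix # logistic regression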

## ROC curves
colors = c("black", "green", "blue", "orange", "red")
plot(lst[[1]]$performance, lwd=2, add=F, col=colors[1])
plot(lst[[2]]$performance, lwd=2, add=T, col=colors[2])
plot(lst[[3]]$performance, lwd=2, add=T, col=colors[3])
plot(lst[[4]]$performance, lwd=2, add=T, col=colors[4])
plot(lst[[5]]$performance, lwd=2, add=T, col=colors[5])
title("ROC curve")
legend("bottomright", legend=model_names, lwd=2, col=colors)

## Model evaluation table
evaluation = data.frame(Name=model_names)
evaluation$Accuracy  = sapply(lst, function(x){x$confusionMatrix$overall['Accuracy']})
evaluation$Precision = sapply(lst, function(x){x$confusionMatrix$byClass['Precision']})
evaluation$Recall    = sapply(lst, function(x){x$confusionMatrix$byClass['Recall']})
evaluation
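
# Example 1 also reports an F1 score. caret's confusionMatrix() exposes the same
# statistic via byClass, so a matching column can be added the same way (a sketch,
# assuming a caret version whose byClass includes 'F1'):
evaluation$F1_Score = sapply(lst, function(x){x$confusionMatrix$byClass['F1']})
evaluation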

4 See also
