Implementing Binary Classification Model Comparison in R

1 Overview

Comparing classification models in R; comparing binary classification models in R.

2 Example 1

set.seed(42) # fix the random seed for reproducibility

#### Prepare the data
df = read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
df$y = df$admit
df$admit = NULL

# Split the data
library(caret)
idx = createDataPartition(df$y, p=0.5, list=F)
Train = df[ idx,]
Test  = df[-idx,]

# Fit the models
library(rpart)
library(xgboost)
library(neuralnet)
library(randomForest, warn.conflicts=F)

x_columns = c("gre","gpa","rank")
formula = as.formula("y ~ gre + gpa + rank")
Train_forest = Train
Train_forest$y = as.factor(as.character(Train$y)) # randomForest needs a factor response for classification
Train_xgboost = as.matrix(Train[,names(df) %in% x_columns]) # xgboost needs a numeric predictor matrix

model_tree      = rpart       (formula, Train)
model_forest    = randomForest(formula, Train_forest)
model_xgboost   = xgboost     (data=Train_xgboost, label=Train$y, nrounds=2, verbose=0, objective="binary:logistic")
model_logistic  = glm         (formula, Train, family="binomial")
model_neuralnet = neuralnet   (formula, Train, hidden=2)

#### Model evaluation 1 - ROC curves
library(ROCR, warn.conflicts=F)
Test_xgboost   = as.matrix(Test[,names(df) %in% x_columns])
Test_neuralnet = Test[,names(df) %in% x_columns]

TestResult = Test
TestResult$pred_tree      = predict(model_tree     , Test)
TestResult$pred_forest    = predict(model_forest   , Test, type="prob")[,2]
TestResult$pred_xgboost   = predict(model_xgboost  , Test_xgboost)
TestResult$pred_logistic  = predict(model_logistic , Test, type="response") # probabilities, not log-odds
TestResult$pred_neuralnet = predict(model_neuralnet, Test_neuralnet)[,1]

pred_tree      = prediction( TestResult$pred_tree     , Test$y )
pred_forest    = prediction( TestResult$pred_forest   , Test$y )
pred_xgboost   = prediction( TestResult$pred_xgboost  , Test$y )
pred_logistic  = prediction( TestResult$pred_logistic , Test$y )
pred_neuralnet = prediction( TestResult$pred_neuralnet, Test$y )

perf_tree      = performance(pred_tree     , "tpr", "fpr")
perf_forest    = performance(pred_forest   , "tpr", "fpr")
perf_xgboost   = performance(pred_xgboost  , "tpr", "fpr")
perf_logistic  = performance(pred_logistic , "tpr", "fpr")
perf_neuralnet = performance(pred_neuralnet, "tpr", "fpr")

methods = c("tree" , "forest", "xgboost", "logistic", "neuralnet")
colors  = c("black", "green" , "blue"   , "orange"  , "red"      )
plot(perf_tree     , lwd=2, add=F, col=colors[1])
plot(perf_forest   , lwd=2, add=T, col=colors[2])
plot(perf_xgboost  , lwd=2, add=T, col=colors[3])
plot(perf_logistic , lwd=2, add=T, col=colors[4])
plot(perf_neuralnet, lwd=2, add=T, col=colors[5])
title("ROC curve")
legend("bottomright", methods, lwd=2, col=colors)
#### Model evaluation 2 - Confusion matrices

# classify with a 0.5 cutoff
TestResult$y_tree      = ifelse(TestResult$pred_tree     >0.5, 1, 0)
TestResult$y_forest    = ifelse(TestResult$pred_forest   >0.5, 1, 0)
TestResult$y_xgboost   = ifelse(TestResult$pred_xgboost  >0.5, 1, 0)
TestResult$y_logistic  = ifelse(TestResult$pred_logistic >0.5, 1, 0)
TestResult$y_neuralnet = ifelse(TestResult$pred_neuralnet>0.5, 1, 0)

cat( "분류표" )
table(TestResult$y_tree     , Test$y)
table(TestResult$y_forest   , Test$y)
table(TestResult$y_xgboost  , Test$y)
table(TestResult$y_logistic , Test$y)
table(TestResult$y_neuralnet, Test$y)
#### Model evaluation 3 - Summary metrics

evaluation = data.frame(Name=c("tree","forest","xgboost","logistic","neuralnet"))
# Compute accuracy
evaluation$Accuracy = c(
  sum(TestResult$y_tree     ==Test$y)/nrow(Test),
  sum(TestResult$y_forest   ==Test$y)/nrow(Test),
  sum(TestResult$y_xgboost  ==Test$y)/nrow(Test),
  sum(TestResult$y_logistic ==Test$y)/nrow(Test),
  sum(TestResult$y_neuralnet==Test$y)/nrow(Test)
)
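# (Each accuracy above is just the share of matching labels; for a single
# model the same value can be written as mean(TestResult$y_tree == Test$y).)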

# Compute precision
evaluation$Precision = c(
  sum(TestResult$y_tree     ==1 & Test$y==1)/sum(TestResult$y_tree     ==1),
  sum(TestResult$y_forest   ==1 & Test$y==1)/sum(TestResult$y_forest   ==1),
  sum(TestResult$y_xgboost  ==1 & Test$y==1)/sum(TestResult$y_xgboost  ==1),
  sum(TestResult$y_logistic ==1 & Test$y==1)/sum(TestResult$y_logistic ==1),
  sum(TestResult$y_neuralnet==1 & Test$y==1)/sum(TestResult$y_neuralnet==1)
)

# Compute recall
evaluation$Recall = c(
  sum(TestResult$y_tree     ==1 & Test$y==1)/sum(Test$y==1),
  sum(TestResult$y_forest   ==1 & Test$y==1)/sum(Test$y==1),
  sum(TestResult$y_xgboost  ==1 & Test$y==1)/sum(Test$y==1),
  sum(TestResult$y_logistic ==1 & Test$y==1)/sum(Test$y==1),
  sum(TestResult$y_neuralnet==1 & Test$y==1)/sum(Test$y==1)
)

# Compute the F1 score
evaluation$F1_Score = 2 * evaluation$Precision * evaluation$Recall / ( evaluation$Precision + evaluation$Recall )

cat( "\n평가지표\n" )
print( evaluation )

3 Example 2

library(caret)        # createDataPartition()
library(hash)         # hash()
library(e1071)        # svm()
library(rpart)        # rpart()
library(xgboost)      # xgboost()
library(neuralnet)    # neuralnet()
library(randomForest) # randomForest()
library(ROCR)         # performance()

set.seed(42) # fix the random seed for reproducibility

#### Prepare the data
df = read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
df$y = df$admit
df$admit = NULL

# Split the data
idx = createDataPartition(df$y, p=0.5, list=F)
Train = df[ idx,]
Test  = df[-idx,]

# Fit the models
x_columns = c("gre","gpa","rank")
formula = as.formula("y ~ gre + gpa + rank")
Train_forest = Train
Train_forest$y = as.factor(as.character(Train$y))
Train_xgboost  = as.matrix(Train[,names(df) %in% x_columns])
Test_xgboost   = as.matrix(Test[,names(df) %in% x_columns])
Test_neuralnet = Test[,names(df) %in% x_columns]

names = c("svm","tree","forest","xgboost","logistic","neuralnet")
lst = list()
for(i in 1:length(names)) {
  lst[[i]] = hash(name=names[i])
}

lst[[1]]$model = svm         (formula, Train)
lst[[2]]$model = rpart       (formula, Train)
lst[[3]]$model = randomForest(formula, Train_forest)
lst[[4]]$model = xgboost     (data=Train_xgboost, label=Train$y, nrounds=2, verbose=0, objective="binary:logistic")
lst[[5]]$model = glm         (formula, Train, family="binomial")
lst[[6]]$model = neuralnet   (formula, Train, hidden=2)

lst[[1]]$predict = predict(lst[[1]]$model, Test)
lst[[2]]$predict = predict(lst[[2]]$model, Test)
lst[[3]]$predict = predict(lst[[3]]$model, Test, type="prob")[,2]
lst[[4]]$predict = predict(lst[[4]]$model, Test_xgboost)
lst[[5]]$predict = predict(lst[[5]]$model, Test, type="response") # probabilities, not log-odds
lst[[6]]$predict = predict(lst[[6]]$model, Test_neuralnet)[,1]

for (i in 1:length(names)) {
  lst[[i]]$predictedY      = ifelse(lst[[i]]$predict>.5, 1, 0)
  lst[[i]]$performance     = performance(prediction(lst[[i]]$predict, Test$y), "tpr", "fpr")
  lst[[i]]$confusionMatrix = confusionMatrix(factor(lst[[i]]$predictedY), factor(Test$y), positive='1')
}
## Model evaluation table
evaluation = data.frame(Name=names)
evaluation$Accuracy    = sapply(lst, function(x){x$confusionMatrix$overall['Accuracy']})
evaluation$Sensitivity = sapply(lst, function(x){x$confusionMatrix$byClass['Sensitivity']})
evaluation$Specificity = sapply(lst, function(x){x$confusionMatrix$byClass['Specificity']})
evaluation$Precision   = sapply(lst, function(x){x$confusionMatrix$byClass['Precision']})
evaluation
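# Optionally, caret's confusionMatrix also reports an F1 score in byClass,
# so the table can be extended the same way (a small sketch):
evaluation$F1 = sapply(lst, function(x){x$confusionMatrix$byClass['F1']})
evaluation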
## ROC curves
colors = c("black", "green", "blue", "orange", "red", "gray")
for (i in 1:length(names)) {
  plot(lst[[i]]$performance, lwd=2, add=(i!=1), col=colors[i])  
}
title("ROC curve")
legend("bottomright", legend=names, lwd=2, col=colors)

4 Example 3

library(caret)        # createDataPartition()
library(hash)         # hash()
library(rpart)        # rpart()
library(randomForest) # randomForest()
library(e1071)        # svm()

set.seed(42) # fix the random seed for reproducibility

# Prepare the data
df <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
df$y <- df$admit
df$admit <- NULL

# Split the data
idx <- createDataPartition(df$y, p=.5, list=F)
Train <- df[ idx,]
Test  <- df[-idx,]

# Create the list of model slots
names <- c("logistic","tree","forest","svm")
lst <- list()
for(i in 1:length(names)) { lst[[i]] <- hash() }

# Fit the models
lst[[1]]$model <- glm         (y ~ ., Train, family="binomial")
lst[[2]]$model <- rpart       (y ~ ., Train)
lst[[3]]$model <- randomForest(y ~ ., Train)
lst[[4]]$model <- svm         (y ~ ., Train)

# Compute predictions (0.5 cutoff)
lst[[1]]$predictedY <- ifelse(predict(lst[[1]]$model, Test, type="response") > .5, 1, 0) # probabilities, not log-odds
lst[[2]]$predictedY <- ifelse(predict(lst[[2]]$model, Test) > .5, 1, 0)
lst[[3]]$predictedY <- ifelse(predict(lst[[3]]$model, Test, type="response") > .5, 1, 0)
lst[[4]]$predictedY <- ifelse(predict(lst[[4]]$model, Test) > .5, 1, 0)

# Build the confusion matrices
for(i in 1:length(names)) { lst[[i]]$confusionMatrix = confusionMatrix(factor(lst[[i]]$predictedY), factor(Test$y), positive='1') }

# Build the model evaluation table
evaluation <- data.frame(Name=names)
evaluation$Accuracy    <- sapply(lst, function(x){x$confusionMatrix$overall['Accuracy']})
evaluation$Sensitivity <- sapply(lst, function(x){x$confusionMatrix$byClass['Sensitivity']})
evaluation$Specificity <- sapply(lst, function(x){x$confusionMatrix$byClass['Specificity']})
evaluation$Precision   <- sapply(lst, function(x){x$confusionMatrix$byClass['Precision']})
evaluation
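# Optionally, sort the table so the strongest model by Accuracy comes first:
evaluation[order(-evaluation$Accuracy),]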
