R XGBoost

1 개요

R XGBoost

2 2 분류

2.1 공부시간-합격확률 자료

# Binary classification example: predict pass/fail (0/1) from hours studied.
set.seed(42) # fix the random seed so the train/test split is reproducible

# Prepare the data: hours studied and the 0/1 pass indicator
df <- data.frame(
  Hours = c(0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
            2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50),
  Pass = c(0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1)
)

# Split the data: p = 0.8 requests 80% of the rows for training
library(caret, quietly = TRUE)
idx <- createDataPartition(df$Pass, list = FALSE, p = 0.8)
Train <- df[idx, ]
Test  <- df[-idx, ]

# drop = FALSE keeps the single remaining predictor as a one-column data
# frame, so the resulting matrices retain the "Hours" column name.
feature_cols <- names(df) != "Pass"
train.data  <- as.matrix(Train[, feature_cols, drop = FALSE])
test.data   <- as.matrix(Test[, feature_cols, drop = FALSE])
train.label <- Train$Pass
test.label  <- Test$Pass

# Fit the model.
# objective = "binary:logistic" makes the booster output probabilities;
# without it the default regression objective is used, and the 0.5
# threshold below would be applied to unbounded regression scores.
library(xgboost)
model <- xgboost(data = train.data, label = train.label, max_depth = 2,
                 eta = 1, nthread = 2, nrounds = 2,
                 objective = "binary:logistic")

# Model summary
model

# Predict on the test set and threshold the probabilities at 0.5
pred <- predict(model, test.data)
pred <- ifelse(pred > 0.5, 1, 0)
str(pred)

# Confusion table (rows: predicted, columns: actual)
table(pred, test.label)

# Accuracy: share of test rows whose predicted label equals the true label
sum(pred == test.label) / length(pred)

2.2 agaricus 자료

# Binary classification example on the agaricus data shipped with xgboost.

# Prepare the data: each data set is a list with $data (features) and
# $label (target), as used below.
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
train.data  <- agaricus.train$data
test.data   <- agaricus.test$data
train.label <- agaricus.train$label
test.label  <- agaricus.test$label

# Fit the model.
# objective = "binary:logistic" makes the booster output probabilities;
# without it the default regression objective is used, and the 0.5
# threshold below would be applied to unbounded regression scores.
library(xgboost)
model <- xgboost(data = train.data, label = train.label, max_depth = 2,
                 eta = 1, nthread = 2, nrounds = 2,
                 objective = "binary:logistic")

# Model summary
model

# Predict on the test set and threshold the probabilities at 0.5
pred <- predict(model, test.data)
pred <- ifelse(pred > 0.5, 1, 0)
str(pred)

# Confusion table (rows: predicted, columns: actual)
table(pred, test.label)

# Accuracy: share of test rows whose predicted label equals the true label
sum(pred == test.label) / length(pred)

3 3 분류

3.1 iris 자료

# Multi-class classification example: predict the iris species.
set.seed(42) # fix the random seed so the train/test split is reproducible

# Prepare the data
df <- iris
# Named class_names (not `names`) so base::names(), used below, is not masked.
class_names <- levels(df$Species)
num_class <- length(class_names) # derived from the data instead of hard-coded

# Split the data: p = 0.8 requests 80% of the rows for training
library(caret, quietly = TRUE)
idx <- createDataPartition(df$Species, list = FALSE, p = 0.8)
Train <- df[idx, ]
Test  <- df[-idx, ]

feature_cols <- names(df) != "Species"
train.data  <- as.matrix(Train[, feature_cols, drop = FALSE])
test.data   <- as.matrix(Test[, feature_cols, drop = FALSE])
train.label <- as.integer(Train$Species) - 1L # shift factor codes to start at 0
test.label  <- as.integer(Test$Species) - 1L  # shift factor codes to start at 0

# Fit the model
library(xgboost)
dtrain <- xgb.DMatrix(data = train.data, label = train.label)
dtest  <- xgb.DMatrix(data = test.data, label = test.label)
watchlist <- list(train = dtrain, eval = dtest)
param <- list(max_depth = 2, eta = 1, nthread = 2,
              objective = "multi:softprob", eval_metric = "mlogloss",
              num_class = num_class)
# verbose is an argument of xgb.train(), not a booster parameter, so it is
# passed here rather than inside `param`.
model <- xgb.train(params = param, data = dtrain, nrounds = 2,
                   watchlist = watchlist, verbose = 0)

# Model summary
model

# Predict on the test set: reshape = TRUE requests one row per observation
# and one column per class.
prob <- predict(model, test.data, reshape = TRUE)
pred <- as.data.frame(prob)
colnames(pred) <- class_names
# Pick the highest-probability class per row (first one on ties, as
# which.max() does). Storing it as a factor with all class levels keeps the
# confusion table square even if some class is never predicted.
pred$prediction <- factor(
  class_names[max.col(prob, ties.method = "first")],
  levels = class_names
)
pred$class <- Test$Species
pred

# Confusion table (rows: predicted, columns: actual)
table(pred$prediction, pred$class)

# Accuracy: share of test rows whose predicted class equals the true class
sum(pred$prediction == pred$class) / nrow(pred)

4 같이 보기

5 참고

https://www.rdocumentation.org/packages/xgboost/versions/0.4-4/topics/xgboost