"R 랜덤 포레스트"의 두 판 사이의 차이

52번째 줄: 52번째 줄:


# 모델 적합
# 모델 적합
library(party, warn.conflicts=F, quietly=T)
library(party, warn.conflicts=F, quietly=T, mask.ok)
model = cforest(ploidy ~ ., trainData)
model = cforest(ploidy ~ ., trainData)



2020년 5월 10일 (일) 13:12 판

1 개요

R Random Forest
R 랜덤 포레스트

2 randomForest()

R
CPU
3.2s
MEM
216M
3.6s
Copy
set.seed(42)  # fix the RNG seed so the partition and forest are reproducible
data(stagec, package='rpart')
df <- stagec
df <- na.omit(df)  # drop rows with missing values

# Split into training / test sets (70% / 30%), stratified on the response
library(caret, quietly = TRUE)
idx <- createDataPartition(df$ploidy, p = 0.7, list = FALSE)
trainData <- df[ idx, ]
testData  <- df[-idx, ]

# Fit the model
# BUG FIX: the argument was misspelled `warn.conflict` and only worked via
# silent partial matching; the correct name is `warn.conflicts`.
library(randomForest, warn.conflicts = FALSE)
model <- randomForest(ploidy ~ ., trainData, ntree = 100, proximity = TRUE)

options(echo = TRUE)
# Model summary
model
model$importance

# Visualization: error-rate curves and variable importance
plot(model)
varImpPlot(model)

# Evaluate on the hold-out set
pred <- predict(model, testData)
# Confusion matrix
table(pred, testData$ploidy)
# Accuracy (proportion of correctly classified rows)
sum(pred == testData$ploidy) / nrow(testData)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
> # 모델 정보
> model

Call:
 randomForest(formula = ploidy ~ ., data = trainData, ntree = 100,      proximity = T) 
               Type of random forest: classification
                     Number of trees: 100
No. of variables tried at each split: 2

        OOB estimate of  error rate: 7.37%
Confusion matrix:
           diploid tetraploid aneuploid class.error
diploid         44          1         1  0.04347826
tetraploid       1         44         0  0.02222222
aneuploid        3          1         0  1.00000000
> model$importance
        MeanDecreaseGini
pgtime          5.338281
pgstat          1.673620
age             4.631630
eet             1.001239
g2             32.856804
grade           1.410163
gleason         2.773629
> 
> # 시각화
> plot(model)
> varImpPlot(model)
> 
> # 테스트
> pred = predict(model, testData)
> # 분류표
> table(pred, testData$ploidy)
            
pred         diploid tetraploid aneuploid
  diploid         19          0         1
  tetraploid       0         19         0
  aneuploid        0          0         0
> # 정분류율
> sum(pred==testData$ploidy)/nrow(testData)
[1] 0.974359
> 

3 cforest()

R
Copy
set.seed(42)  # fix the RNG seed so the partition and forest are reproducible
data(stagec, package='rpart')
df <- stagec
df <- na.omit(df)  # drop rows with missing values

# Split into training / test sets (70% / 30%), stratified on the response
library(caret, quietly = TRUE)
idx <- createDataPartition(df$ploidy, p = 0.7, list = FALSE)
trainData <- df[ idx, ]
testData  <- df[-idx, ]

# Fit the model
# BUG FIX: the original call passed a dangling bare `mask.ok` with no value;
# as an unnamed positional argument it would be matched to `library()`'s
# unrelated `help` parameter, so it has been removed.
library(party, warn.conflicts = FALSE, quietly = TRUE)
model <- cforest(ploidy ~ ., trainData)

options(echo = TRUE)
# Model summary
model

# Evaluate on the hold-out set; OOB = TRUE uses out-of-bag predictions
pred <- predict(model, newdata = testData, OOB = TRUE, type = "response")
# Confusion matrix
table(pred, testData$ploidy)
# Accuracy (proportion of correctly classified rows)
sum(pred == testData$ploidy) / nrow(testData)
Loading

4 같이 보기