# Optimal kNN
# Misclassification rate of a k-NN classifier.
#
# Args:
#   x: training predictors (matrix / data frame).
#   y: true class labels for the rows of x.
#   numNeighbors: k, the number of neighbours to use.
#   z: predictors to classify; defaults to x (training error).
#   y_z: true labels for the rows of z; defaults to y, which is correct
#     whenever z is x. Pass the labels of z explicitly when z differs.
#
# Returns: the proportion of rows of z that are misclassified.
knn_error_rate <- function(x, y, numNeighbors, z = x, y_z = y) {
  y_hat <- knn(train = x, test = z, cl = y, k = numNeighbors)
  # Bug fix: the original compared predictions on z against y (the labels
  # of x) and divided by nrow(x), which is wrong whenever z != x. Compare
  # against the labels of the evaluated set and average over its rows.
  mean(y_hat != y_z)
}
# Candidate neighbourhood sizes to compare.
ks <- c(1:10, 15, 20, 25, 30)
# Training-set misclassification rate for each k (z defaults to train_q
# inside knn_error_rate). vapply pins the return type to one double per k,
# unlike sapply, whose output type silently depends on its input.
train_rates <- vapply(
  ks,
  FUN = knn_error_rate,
  FUN.VALUE = numeric(1),
  x = train_q,
  y = train$SleepTrouble
)
knn_error_rates <- data.frame(k = ks, train_rate = train_rates)
# Training error vs. k: small k fits the training data almost perfectly,
# so the curve typically rises as k grows.
ggplot(data = knn_error_rates, aes(x = k, y = train_rate)) +
  geom_point() +
  geom_line() +
  ylab("Misclassification Rate")
# kNN on the held-out test set for several candidate k values.
# Each prediction vector is kept under its original global name
# (SleepTrouble_knn<k>) so any downstream code can still reference it.
ks_test <- c(30, 25, 20, 10, 7, 5, 3)
for (k in ks_test) {
  assign(
    paste0("SleepTrouble_knn", k),
    knn(train_q, test = test_q, cl = train$SleepTrouble, k = k)
  )
}
# Performance of kNN on the test set: one confusion matrix per k.
# print() is required because auto-printing is suppressed inside a loop.
for (k in ks_test) {
  print(confusionMatrix(
    table(test$SleepTrouble, get(paste0("SleepTrouble_knn", k)))
  ))
}
# Refit on the training data itself (test = train_q) to measure the
# training error for the same candidate k values. The per-k prediction
# objects keep their original global names (tr_knn<k>).
ks_train <- c(30, 25, 20, 10, 7, 5, 3)
for (k in ks_train) {
  assign(
    paste0("tr_knn", k),
    knn(train_q, test = train_q, cl = train$SleepTrouble, k = k)
  )
}
# Performance of kNN on the training set: one confusion matrix per k.
# print() is required because auto-printing is suppressed inside a loop.
for (k in ks_train) {
  print(confusionMatrix(
    table(train$SleepTrouble, get(paste0("tr_knn", k)))
  ))
}
| k  | Accuracy_test | Accuracy_train | Gap   |
|----|---------------|----------------|-------|
| 3  | 76.13         | 87.19          | 11.06 |
| 5  | 75.2          | 83.23          | 8.03  |
| 7  | 74.2          | 81.03          | 6.83  |
| 10 | 74.13         | 79.3           | 5.17  |
| 20 | 74.4          | 76.53          | 2.13  |
| 25 | 75.17         | 76.36          | 1.19  |
| 30 | 74.93         | 76.13          | 1.2   |
Finding the Optimal k for kNN
Accuracy는 test, train 모두 k=3에서 가장 높고, 두 집단 간의 gap은 k=25에서 가장 낮다. 그러나 k-NN에서 k를 너무 키우는 것은 추천하지 않는다. k가 커질수록 결정 경계가 지나치게 단순해져 과소적합(underfitting)의 위험이 있고, 알고리즘상 유사한 데이터의 범주를 할당하는 데 이웃을 20개 이상씩 계산해 보고 할당하는 것은 계산 시간으로 보나 복잡도 차원으로 보더라도 그리 추천할 만하지 않기 때문이다. 따라서 이 경우 optimal k는 k=3이다.
▼ R 프로그래밍 학습용 추천도서
|
▼ Python 파이썬 프로그래밍 학습용 추천도서
|
댓글