I'm trying to build two kNN image classifiers in R. In the first case, I scaled the data with min-max scaling; the other model is fit on the non-scaled data. For each model, I ran 4-fold cross-validation over the neighbor range k = 1–20.
Since every feature of this image data (a CSV of pixel intensities) is on the same scale, there should be no difference in accuracy between the scaled and the non-scaled model; min-max scaling should only matter when the features are heterogeneous.
However, I got a best accuracy of 0.819375 (k = 6) with the scaled model and 0.8188542 (k = 6) with the non-scaled one...
Where did I make a mistake, or what is my misconception?
PS: This is one of my midterm projects, so I'd really appreciate a quick answer.
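For reference on the scaling step used below: caret's preProcess(method = "range") rescales each column by that column's own observed minimum and maximum, not by one global constant such as 255, so pixels with a narrower observed range get stretched by a larger factor. A minimal sketch of that behavior (the toy columns p1/p2 are hypothetical):

library(caret)
toy = data.frame(p1 = c(0, 128, 255),  # pixel spanning the full 0-255 range
                 p2 = c(0, 10, 20))    # pixel with a narrow observed range
pp = preProcess(toy, method = "range")
predict(pp, toy)
# p1 -> 0, 0.502, 1
# p2 -> 0, 0.5, 1   (a global /255 would instead give 0, 0.039, 0.078)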
library(FNN)    # knn()
library(caret)  # preProcess() for min-max scaling
data = read.csv("mnist_fashion.csv", header = TRUE)
head(data)
sum(is.na(data))            # check for missing values
colnames(data)[1] = "label" # restore the target column name
#1.
##min-max scaling
preproc = preProcess(data[, -1], method = c("range"))  # per-column min-max scaling
data[, -1] = predict(preproc, newdata = data[, -1])
data[, -1] = round(data[, -1], 3)  # note: rounding quantizes the scaled features slightly
##random shuffle & splitting the data
set.seed(777)
data_idx = sample(1:nrow(data), round(0.8*nrow(data)))
data_train = data[data_idx, ]
data_test = data[-data_idx, ]
##k-fold preparation
k_max = 20  # largest neighborhood size to try
f = 4       # number of folds
folds = cut(seq(1, nrow(data_train)), breaks = f, labels = FALSE)
acc.kfold = rep(NA, f)
acc.list = data.frame(k = seq(1, k_max), accuracy = rep(NA, k_max))
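# Note: cut() on a plain sequence assigns contiguous blocks of rows to folds,
# which is only safe here because data_train was already shuffled by sample().
# A stratified alternative (a sketch, not used below) would be caret's:
# folds_list = createFolds(data_train$label, k = f)  # list of validation indices per fold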
for(h in 1:k_max){
  for(i in 1:f){
    data.tr = data_train[folds != i, ]
    data.vd = data_train[folds == i, ]
    # classify the validation fold (kNN has no separate training step)
    nn = knn(train = data.tr[, 2:785], test = data.vd[, 2:785], cl = data.tr[, 1], k = h, prob = TRUE)
    data.vd$pclass = nn
    acc.kfold[i] = mean(data.vd$pclass == data.vd$label)
  }
  acc.list[h, 2] = mean(acc.kfold)
}
acc.list
plot(acc.list$k, acc.list$accuracy, type = "b", xlab = "k", ylab = "accuracy")
best_k = acc.list$k[which.max(acc.list$accuracy)]  # k = 6
acc.list$accuracy[which.max(acc.list$accuracy)]    # accuracy 0.819375
#test data prediction
nn = knn(train = data_train[, 2:785], test = data_test[, 2:785], cl = data_train[, 1], k = best_k, prob = TRUE)
data_test$pclass = nn
mean(data_test$pclass == data_test$label)
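One quick sanity check on the premise above: if any pixel's observed range in the raw data differs from 0–255, the per-column rescaling is not a uniform transformation and can reorder nearest neighbors. A minimal check on the raw file (a sketch; it reloads the CSV so it does not depend on the already-scaled data above):

raw = read.csv("mnist_fashion.csv", header = TRUE)
rng = apply(raw[, -1], 2, function(x) max(x) - min(x))
table(rng)     # any value other than 255 means a non-global scaling factor
sum(rng == 0)  # constant pixels (e.g. image borders) are a special case for range scaling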
###########################################################################################
###########################################################################################
#2.
datar = read.csv("mnist_fashion.csv", header = TRUE)  # reload the raw, non-scaled data
head(datar)
sum(is.na(datar))            # check for missing values
colnames(datar)[1] = "label" # restore the target column name
##random shuffle & splitting the data
set.seed(777)
datar_idx = sample(1:nrow(datar), round(0.8*nrow(datar)))
datar_train = datar[datar_idx, ]
datar_test = datar[-datar_idx, ]
##k-fold preparation
k_max = 20  # largest neighborhood size to try
f = 4       # number of folds
folds = cut(seq(1, nrow(datar_train)), breaks = f, labels = FALSE)
racc.kfold = rep(NA, f)
racc.list = data.frame(k = seq(1, k_max), accuracy = rep(NA, k_max))
for(h in 1:k_max){
  for(i in 1:f){
    datar.tr = datar_train[folds != i, ]
    datar.vd = datar_train[folds == i, ]
    # classify the validation fold (kNN has no separate training step)
    nn = knn(train = datar.tr[, 2:785], test = datar.vd[, 2:785], cl = datar.tr[, 1], k = h, prob = TRUE)
    datar.vd$pclass = nn
    racc.kfold[i] = mean(datar.vd$pclass == datar.vd$label)
  }
  racc.list[h, 2] = mean(racc.kfold)
}
racc.list
plot(racc.list$k, racc.list$accuracy, type = "b", xlab = "k", ylab = "accuracy")
best_kr = racc.list$k[which.max(racc.list$accuracy)]  # k = 6
racc.list$accuracy[which.max(racc.list$accuracy)]     # accuracy 0.8188542
#test data prediction
nn = knn(train = datar_train[, 2:785], test = datar_test[, 2:785], cl = datar_train[, 1], k = best_kr, prob = TRUE)
datar_test$pclass = nn
mean(datar_test$pclass == datar_test$label)
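To see how much the two final models actually disagree, the two sets of test predictions can be compared directly (assuming both parts ran in the same session with the same seed, so the test rows line up):

mean(data_test$pclass == datar_test$pclass)           # share of identical predictions
table(scaled = data_test$pclass, raw = datar_test$pclass)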