kNN image classifier: discrepancy in accuracy scores between the scaled and non-scaled models

I'm trying to build two kNN image classifiers in R. In the first case, I scaled the data with min-max scaling; the other model is fitted on non-scaled data. After fitting each model, I performed 4-fold cross-validation over the neighbor range k = 1 to 20.

When applying kNN to image data (a CSV of pixel values), there should be no difference in accuracy between the scaled and non-scaled models, since every feature is a pixel intensity on the same 0-255 scale. Min-max scaling only matters when the features are heterogeneous.
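To illustrate what I mean, here is a toy sketch (made-up data, not my real script): dividing every feature by the same constant should leave the Euclidean neighbor ordering, and hence the kNN predictions, unchanged.

library(FNN)
set.seed(1)
X  = matrix(runif(100 * 10, 0, 255), nrow = 100)  # fake "pixel" features
Xs = X / 255                                      # one uniform rescale for every column
# neighbor indices of the first 5 rows should be identical either way
nn_raw    = get.knnx(X,  X[1:5, , drop = FALSE],  k = 3)$nn.index
nn_scaled = get.knnx(Xs, Xs[1:5, , drop = FALSE], k = 3)$nn.index
identical(nn_raw, nn_scaled)   # expected TRUE for a uniform rescale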

However, I got a best accuracy of 0.819375 (k = 6) with the scaled model and 0.8188542 (k = 6) with the non-scaled one...
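The only extra steps in my scaled pipeline are caret's per-column rescaling and the rounding to 3 decimals, so maybe those are relevant? A toy check on fake vectors (not my real data) of whether the quantization alone perturbs distances:

set.seed(2)
x = runif(784, 0, 255)
y = runif(784, 0, 255)
d_uniform = sqrt(sum((x/255 - y/255)^2))                       # uniform rescale only
d_rounded = sqrt(sum((round(x/255, 3) - round(y/255, 3))^2))   # rescale + round to 3 decimals
c(d_uniform, d_rounded)   # the two distances differ slightly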

At what point did I make a mistake, or what is my misconception?

P.S. This is one of my midterm projects, so I'd really appreciate a quick answer.

library(FNN)
library(caret)
data = read.csv("mnist_fashion.csv", header = TRUE)
head(data)
sum(is.na(data))                # check for missing values
colnames(data)[1] = "label"     # restore the label column name

#1. min-max scaled model
##min-max scaling (caret rescales each column by its own min and max)
preproc = preProcess(data[, -1], method = c("range"))
data[, -1] = predict(preproc, newdata = data[, -1])
data[, -1] = round(data[, -1], 3)  # round scaled values to 3 decimals
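# Quick sanity check (my assumption: columns 2:785 are the pixel features):
# everything should now lie in [0, 1]; a NaN here would mean a constant
# pixel column was divided by zero (max == min) during the range transform.
range(as.matrix(data[, -1]))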
##random shuffle & splitting the data
set.seed(777)
data_idx = sample(1:nrow(data), round(0.8*nrow(data)))
data_train = data[data_idx, ]
data_test  = data[-data_idx, ]

##k-fold preparation
k = 20   # largest neighbor count to try
f = 4    # number of CV folds
folds = cut(seq(1, nrow(data_train)), breaks = f, labels = FALSE)  # rows were already shuffled by sample()
acc.kfold = rep(NA, f)
acc.list = data.frame(k = seq(1, k, 1), accuracy = rep(NA, k))


for(h in 1:k){
  for(i in 1:f){
    data.tr = data_train[folds!=i,]
    data.vd = data_train[folds==i,]

    # predict the validation fold with h neighbors (columns 2:785 are pixels)
    nn = knn(train = data.tr[, 2:785], test = data.vd[, 2:785], cl = data.tr[, 1], k = h, prob = TRUE)

    data.vd$pclass = nn
    acc.kfold[i] = mean(data.vd$pclass == data.vd$label)
  }
  acc.list[h, 2] = mean(acc.kfold)  # average accuracy over the f folds for k = h
}
acc.list

plot(acc.list[,2], type="b", xlab="k", ylab="accuracy")

which.max(acc.list[,2]) #k = 6, accuracy 0.819375
acc.list[,2][which.max(acc.list[,2])]

#test data prediction with the best k
nn = knn(train = data_train[, 2:785], test = data_test[, 2:785], cl = data_train[, 1], k = which.max(acc.list[, 2]), prob = TRUE)
data_test$pclass = nn
mean(data_test$pclass == data_test$label)

###########################################################################################
###########################################################################################

#2. non-scaled model
datar = read.csv("mnist_fashion.csv", header = TRUE)
head(datar)
sum(is.na(datar))
colnames(datar)[1] = "label"    # restore the label column name

##random shuffle & splitting the data
set.seed(777)
datar_idx = sample(1:nrow(datar), round(0.8*nrow(datar)))
datar_train = datar[datar_idx, ]
datar_test  = datar[-datar_idx, ]

##k-fold preparation
k = 20
f = 4
folds = cut(seq(1,nrow(datar_train)), breaks = f, labels=FALSE)
racc.kfold = rep(NA, f)
racc.list = data.frame(k=seq(1,k,1), accuracy=rep(NA,k))


for(h in 1:k){
  for(i in 1:f){
    datar.tr = datar_train[folds!=i,]
    datar.vd = datar_train[folds==i,]
    
    # predict the validation fold with h neighbors
    nn = knn(train = datar.tr[, 2:785], test = datar.vd[, 2:785], cl = datar.tr[, 1], k = h, prob = TRUE)
    
    datar.vd$pclass = nn
    racc.kfold[i] = mean(datar.vd$pclass == datar.vd$label)
  }
  racc.list[h, 2] = mean(racc.kfold)
}
racc.list
plot(racc.list[,2], type="b", xlab="k", ylab="accuracy")

which.max(racc.list[,2]) #k = 6, accuracy 0.8188542
racc.list[,2][which.max(racc.list[,2])]


#test data prediction
nn = knn(train=datar_train[,2:785], test=datar_test[,2:785], cl=datar_train[,1], k=which.max(racc.list[,2]), prob=TRUE)
datar_test$pclass = nn
mean(datar_test$pclass == datar_test$label)
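For completeness, a small plotting sketch to overlay the two CV curves (assumes both sections above have been run):

plot(acc.list$k, acc.list$accuracy, type = "b", col = "blue",
     xlab = "k", ylab = "CV accuracy",
     ylim = range(c(acc.list$accuracy, racc.list$accuracy)))
lines(racc.list$k, racc.list$accuracy, type = "b", col = "red")
legend("bottomright", legend = c("min-max scaled", "non-scaled"),
       col = c("blue", "red"), lty = 1)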