1

Here is part of my code:

# Run k-nearest-neighbour classification for k = 3, ..., 15 and record the
# sum of squared errors obtained for each k.
k <- seq(3, 15, 1)
sse <- numeric(length(k))  # preallocated: one slot per candidate k

for (idx in seq_along(k)) {
  y_pred <- knn(train = newdata.training, test = newdata.test,
                cl = newdata.trainLabels, k = k[idx])

  # Convert the factor of predicted classes back to its numeric label values.
  pred_y <- as.numeric(levels(y_pred)[y_pred])

  # Store by position rather than by the value of k, so that sse and k line
  # up element-for-element and contain no leading NA entries.
  # NOTE(review): this compares test-set predictions with the training
  # labels -- confirm that is intended and that the labels are numeric.
  sse[idx] <- sum((newdata.trainLabels - pred_y)^2)
}

On each iteration, pred_y is a vector of predictions (one column per value of i). I want to collect all 13 of these columns into a single data frame. Can this be done inside the for loop? If not, how else can it be accomplished? Any suggestions are welcome.

R. hacker
  • 11
  • 1

1 Answers1

0

You can use foreach, which has the added advantage that it can run in parallel if your CPU has multiple cores. Here is the non-parallel code:

library("iterators")
library("foreach")
library("FNN")

# Build a train/test split from iris3: rows 1-25 of each of the three species
# form the training set and rows 26-50 of each species form the test set.
data(iris3)
newdata.training <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])
newdata.test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])
newdata.trainlabels <- factor(rep(1:3, each = 25))
# Classes of the test rows, in the same species order as newdata.test.
newdata.testlabels <- factor(rep(1:3, each = 25))

k.values <- seq(3, 15, 1)

# Each iteration returns the numeric predicted labels for the test set;
# .combine = cbind binds them into a matrix with one column per value of k.
results <- foreach(i = iter(k.values), .combine = cbind) %do% {
  y_pred <- knn(train = newdata.training, test = newdata.test,
                cl = newdata.trainlabels, k = i)

  # Convert the factor of predicted classes back to its numeric label values.
  as.numeric(levels(y_pred)[y_pred])
}

# Sum of squared errors per k, computed from the returned predictions instead
# of being assigned as a side effect inside the loop body. The label vector is
# recycled down each column of the prediction matrix.
test_y <- as.numeric(levels(newdata.testlabels)[newdata.testlabels])
sse <- unname(colSums((test_y - results)^2))

# One column of predictions per k, named after the k that produced it.
results1 <- data.frame(results)
colnames(results1) <- k.values

Here is the parallel version:

# Parallel version
library("iterators")
library("foreach")
library("parallel")
library("doParallel")
library("FNN")


# Build a train/test split from iris3: rows 1-25 of each of the three species
# form the training set and rows 26-50 of each species form the test set.
data(iris3)
newdata.training <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])
newdata.test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])
newdata.trainlabels <- factor(rep(1:3, each = 25))
# Classes of the test rows, in the same species order as newdata.test.
newdata.testlabels <- factor(rep(1:3, each = 25))

# detectCores() can return NA when the core count cannot be determined;
# fall back to a single worker in that case.
num.cores <- detectCores()
if (is.na(num.cores)) {
  num.cores <- 1L
}
clusters <- makeCluster(num.cores)
registerDoParallel(clusters)

k.values <- seq(3, 15, 1)

# Each iteration returns the numeric predicted labels for the test set;
# .combine = cbind binds them into a matrix with one column per value of k.
# Values needed afterwards must be *returned* from the body: assignments made
# inside %dopar% happen on the workers and are not visible here.
# The cluster is stopped in `finally` so it is released even if a task fails.
results <- tryCatch(
  foreach(i = iter(k.values), .combine = cbind,
          .packages = c("FNN")) %dopar% {
    y_pred <- knn(train = newdata.training, test = newdata.test,
                  cl = newdata.trainlabels, k = i)

    # Convert the factor of predicted classes back to its numeric label values.
    as.numeric(levels(y_pred)[y_pred])
  },
  finally = stopCluster(clusters)
)

# Sum of squared errors per k, computed on the master from the returned
# predictions. The label vector is recycled down each column of the matrix.
test_y <- as.numeric(levels(newdata.testlabels)[newdata.testlabels])
sse <- unname(colSums((test_y - results)^2))

# One column of predictions per k, named after the k that produced it.
results1 <- data.frame(results)
colnames(results1) <- k.values

There are only a few differences between the non-parallel code and the parallel code. First, there are additional libraries to load. Second, you need to create and register the cluster of workers that will do the parallel computation (and stop the cluster when you are done). Third, foreach uses the %dopar% infix operator instead of %do%. Fourth, the foreach call needs the .packages parameter so that the FNN package, which provides knn, is loaded on each worker.

MrMeritology
  • 171
  • 9