0

I would like to predict cluster membership for a new data set using a SOM model that was fitted in an unsupervised way, and I am not confident that I am on the right path. I would appreciate your guidance.

Objective: to classify a new data set using the cluster groups previously fitted on the training set.

What I am uncertain about:

a. Whether the cluster groups assigned to the test set are consistent with those assigned to the training set, i.e. group 1 in the test set must share the same characteristics as group 1 in the training set.

b. The unsupervised fit on the iris data doesn't appear to be very good.

# Fit an unsupervised SOM on a training split of iris, group the map units
# into clusters, and assign held-out rows to those SAME clusters.
library("kohonen")
set.seed(1)

# Sample 120 row indices for training; the remaining rows form the test set.
idx_n <- sample(nrow(iris), 120)

train <- iris[idx_n, ]
row.names(train) <- NULL

test <- iris[-idx_n, ]
row.names(test) <- NULL

# Preprocess: standardise every column except column 5 (Species).
# scale() stores the centre/scale it used as attributes on the result.
train.sc <- scale(train[, -5])

# Train the unsupervised SOM on a 5 x 5 hexagonal, non-toroidal grid.
som_grid <- somgrid(xdim = 5,
                    ydim = 5,
                    topo = "hexagonal",
                    toroidal = FALSE)
som.iris <- som(train.sc,
                grid = som_grid,
                rlen = 200,
                alpha = c(0.05, 0.01),
                keep.data = TRUE)

set_cluster <- 3

# Hierarchically cluster the codebook vectors. The result has one entry per
# map unit: som.iris.hc[u] is the cluster id of unit u.
som.iris.hc <- cutree(hclust(dist(som.iris$codes[[1]])), set_cluster)

# --------- Predict new dataset ----------
# Scale the test set with the TRAINING centre/scale. Read them from train.sc
# directly so this does not depend on the fitted object keeping the attributes.
test.sc <- scale(test[, -5],
                 center = attr(train.sc, "scaled:center"),
                 scale = attr(train.sc, "scaled:scale"))

# Map every test row onto its best-matching unit of the trained SOM.
test.pred <- predict(som.iris, newdata = test.sc)

# Reuse the unit -> cluster lookup built from the training codebook. Running a
# second hclust() on the test predictions would create new, unrelated cluster
# ids, so "cluster 1" in test would not mean "cluster 1" in train.
som.iris.hc_test <- unname(som.iris.hc[test.pred$unit.classif])

# Attach cluster groups; unname() drops the per-unit names carried over from
# cutree() so they cannot leak into the data frames' row names.
train_final <- cbind(train,
                     cluster = unname(som.iris.hc[som.iris$unit.classif]))
test_final <- cbind(test, cluster = som.iris.hc_test)

# Explore each cluster.
by(train_final, train_final$cluster, summary)
by(test_final, test_final$cluster, summary)

# Compare clusters with the known species on both splits.
table(train_final$Species, train_final$cluster)
table(test_final$Species, test_final$cluster)
Choc_waffles
  • 518
  • 1
  • 4
  • 15

1 Answers1

0

My current workaround is to first train an unsupervised SOM model; once I have decided on the number of clusters, I tag each training observation with its cluster and retrain as a supervised SOM model. I can then be more targeted when predicting the new data set. I would like to hear your thoughts.

# Workaround: fit an unsupervised SOM, derive cluster labels from its codebook,
# then retrain a supervised SOM on those labels and use it to label new data.
library("kohonen")
set.seed(1)

# Sample 120 row indices for training; the remaining rows form the test set.
idx_n <- sample(nrow(iris), 120)

train <- iris[idx_n, ]
row.names(train) <- NULL

test <- iris[-idx_n, ]
row.names(test) <- NULL

# Preprocess: standardise every column except column 5 (Species).
# scale() stores the centre/scale it used as attributes on the result.
train.sc <- scale(train[, -5])

# Train the unsupervised model on a 5 x 5 hexagonal, non-toroidal grid.
som_grid <- somgrid(xdim = 5,
                    ydim = 5,
                    topo = "hexagonal",
                    toroidal = FALSE)
som.iris <- som(train.sc,
                grid = som_grid,
                rlen = 200,
                alpha = c(0.05, 0.01),
                keep.data = TRUE)

set_cluster <- 3

# Hierarchically cluster the codebook vectors (one cluster id per map unit),
# then look up each training row's cluster through its winning unit.
som.iris.hc <- cutree(hclust(dist(som.iris$codes[[1]])), set_cluster)
train_cluster <- as.factor(as.vector(som.iris.hc[som.iris$unit.classif]))

# Pair the scaled features (layer x) with the derived cluster labels (layer y).
train.l.sc <- list(x = train.sc, y = train_cluster)

# Retrain as a supervised SOM on both layers.
mygrid <- somgrid(5, 5, "hexagonal")
som.iris.l <- supersom(train.l.sc, grid = mygrid, maxNA.fraction = .5)

# --------- Predict new dataset ----------
# Scale the test set with the TRAINING centre/scale. Read them from train.sc
# directly so this does not depend on the fitted object keeping the attributes.
# scale() already returns a matrix, so no as.matrix() wrapper is needed.
test.l.sc <- list(x = scale(test[, -5],
                            center = attr(train.sc, "scaled:center"),
                            scale = attr(train.sc, "scaled:scale")))

# Only layer x is supplied; the y layer (cluster) is what gets predicted.
test.pred <- predict(som.iris.l, newdata = test.l.sc)

# Attach cluster groups.
train_final <- cbind(train, cluster = train_cluster)
test_final <- cbind(test, cluster = test.pred$predictions$y)

# Explore each cluster.
by(train_final, train_final$cluster, summary)
by(test_final, test_final$cluster, summary)
Choc_waffles
  • 518
  • 1
  • 4
  • 15