I have the following problem: after splitting my data into training/testing, I am unable to predict the testing with the glmnet fit... Please see the toy example below with the BostonHousing dataset. I think it happens because glmnet (lasso, elnet) are able to shrink data, and so, I don't know which was the data that the model selected. (I think that if there was a way to extract the features that were selected, I could automatically feed my x.test data set only with the selected features..)
#' load the package
require(glmnet)
require(caret)
require(mlbench)
set.seed(123)
#' load data
data("BostonHousing")
#' Create a test/train index, to keep back a validation set for final checking
validationIndex <- createDataPartition(BostonHousing$medv, p= .7, list = FALSE)
dataset <- BostonHousing [validationIndex, ]
validation <- BostonHousing [ -validationIndex, ]
x.train <- data.matrix (dataset [ ,1:length(dataset)])
y.train <- data.matrix (dataset$medv)
colnames (y.train) <- ("MedianValue")
rownames (y.train) <- rownames (dataset)
x.test <- data.matrix (validation [,1:(length(validation)-1)])
y.test <- data.matrix(validation$medv)
rm (validationIndex, dataset, validation )
#' fit model
fit.lasso <- glmnet(x.train, y.train, family = "gaussian", alpha = 1)
fit.ridge <- glmnet(x.train, y.train, family = "gaussian", alpha = 0)
fit.elnet <- glmnet(x.train, y.train, family = "gaussian", alpha = .5)
#' cv validation with 10-folds, for each alpha possibility
fit.lasso.cv <- cv.glmnet(x.train, y.train, type.measure = "mse", alpha = 1, family = "gaussian")
fit.ridge.cv <- cv.glmnet(x.train, y.train, type.measure = "mse", alpha = 0, family="gaussian")
fit.elnet.cv <- cv.glmnet(x.train, y.train, type.measure = "mse", alpha=.5, family = "gaussian")
#' CV plot based on GLMNET's Vignette
par(mfrow=c(1,2))
plot (fit.lasso.cv); plot (fit.ridge.cv); plot (fit.elnet.cv)
plot (log (fit.lasso.cv$lambda), fit.lasso.cv$cvm, pch= 10, col="red", xlab = "log(Lambda)", ylab= fit.lasso.cv$name)
points(log(fit.elnet.cv$lambda), fit.elnet.cv$cvn, pch= 10, col = "grey")
points(log(fit.ridge.cv$lambda), fit.ridge.cv$cvm, pch= 10, col = "blue")
legend ("topleft", legend = c("alpha=1", "alpha= 0.5", "alpha= 0"), pch= 19, col = c("red", "grey", "blue"))
THE PROBLEM
#' Make predictions on validation dataset
yhat0 <- predict(fit.lasso.cv, s=fit.lasso.cv$lambda.1se, newx=x.test)
yhat1 <- predict(fit.ridge.cv, s=fit.ridge.cv$lambda.1se, newx=x.test)
yhat2 <- predict(fit.elnet.cv, s=fit.elnet.cv$lambda.1se, newx=x.test)
mse0 <- mean((y.test - yhat0)^2)
mse1 <- mean((y.test - yhat1)^2)
mse2 <- mean((y.test - yhat2)^2)
#' summarize the accuracy
print (mse0, mse1, mse2)
The error is: Error in cbind2(1, newx) %*% nbeta : Cholmod error 'X and/or Y have wrong dimensions' at file ../MatrixOps/cholmod_sdmult.c, line 90