1

When using KNN to predict price, how do you use K-fold cross-validation? My current code to predict is:

library("tidyverse")
library("FNN")
library("forecast")
library("caret")
library("stats")


# Load the raw data and restrict to realistic sale prices.
houses <- read_csv("data.csv")

# One filter replaces the three subset() calls: price != 0 is already
# implied by price > 99999.
houses <- houses %>%
  filter(price > 99999, price < 1000001)

# Drop free-text location columns that KNN cannot use directly.
houses <- houses %>%
  select(-street, -city, -statezip, -country)

# Extract the listing year from the date and derive the house's age at
# listing time. Inside mutate(), refer to columns directly rather than
# via houses$... (the $ form reads the pre-mutate data frame and defeats
# sequential column evaluation). The original computed age_when_listed
# twice identically; once is enough.
houses <- houses %>%
  mutate(
    date = as.numeric(format(as.Date(date), format = "%Y")),
    yr_built = as.numeric(yr_built),
    age_when_listed = date - yr_built,
    # Binary flag: 1 if the house has ever been renovated, else 0.
    renovated = ifelse(yr_renovated > 0, 1, 0)
  )



# Pre-compute the mean and standard deviation of every predictor so the
# same training statistics can standardise both the training data and
# any new observation. Names b1..m2 are kept: the mutate() calls below
# reference them.
b1 <- mean(houses$bedrooms);        b2 <- sd(houses$bedrooms)
c1 <- mean(houses$bathrooms);       c2 <- sd(houses$bathrooms)
e1 <- mean(houses$sqft_lot);        e2 <- sd(houses$sqft_lot)
f1 <- mean(houses$floors);          f2 <- sd(houses$floors)
g1 <- mean(houses$view);            g2 <- sd(houses$view)
h1 <- mean(houses$waterfront);      h2 <- sd(houses$waterfront)
i1 <- mean(houses$condition);       i2 <- sd(houses$condition)
j1 <- mean(houses$sqft_above);      j2 <- sd(houses$sqft_above)
k1 <- mean(houses$sqft_basement);   k2 <- sd(houses$sqft_basement)
l1 <- mean(houses$age_when_listed); l2 <- sd(houses$age_when_listed)
m1 <- mean(houses$yr_renovated);    m2 <- sd(houses$yr_renovated)


# Standardise each predictor as a z-score using the training statistics
# computed above, then keep only the normalised columns as KNN input.
houses <- houses %>%
  mutate(
    bedrooms_norm        = (bedrooms - b1) / b2,
    bathrooms_norm       = (bathrooms - c1) / c2,
    sqft_lot_norm        = (sqft_lot - e1) / e2,
    floors_norm          = (floors - f1) / f2,
    view_norm            = (view - g1) / g2,
    condition_norm       = (condition - i1) / i2,
    sqft_above_norm      = (sqft_above - j1) / j2,
    sqft_basement_norm   = (sqft_basement - k1) / k2,
    age_when_listed_norm = (age_when_listed - l1) / l2,
    waterfront_norm      = (waterfront - h1) / h2,
    yr_renovated_norm    = (yr_renovated - m1) / m2
  )

houses_input_norm <- houses %>%
  select(
    bedrooms_norm, bathrooms_norm, sqft_lot_norm, floors_norm,
    view_norm, condition_norm, sqft_above_norm, sqft_basement_norm,
    age_when_listed_norm, waterfront_norm, yr_renovated_norm
  )

#New sample observation
# A single new observation to predict. tibble() builds the same one-row
# tibble as as_tibble(list(...)) did.
newdata <- tibble(
  bedrooms = 4, bathrooms = 3, sqft_lot = 2000, floors = 2,
  waterfront = 0, view = 2, condition = 3, sqft_above = 3000,
  sqft_basement = 0, age_when_listed = 20, yr_renovated = 0
)

# Standardise the new observation with the SAME training statistics —
# never recompute mean/sd on new data.
newdata <- newdata %>%
  mutate(
    bedrooms_norm        = (bedrooms - b1) / b2,
    bathrooms_norm       = (bathrooms - c1) / c2,
    sqft_lot_norm        = (sqft_lot - e1) / e2,
    floors_norm          = (floors - f1) / f2,
    view_norm            = (view - g1) / g2,
    condition_norm       = (condition - i1) / i2,
    sqft_above_norm      = (sqft_above - j1) / j2,
    sqft_basement_norm   = (sqft_basement - k1) / k2,
    age_when_listed_norm = (age_when_listed - l1) / l2,
    waterfront_norm      = (waterfront - h1) / h2,
    yr_renovated_norm    = (yr_renovated - m1) / m2
  )

newdata_input_norm <- newdata %>%
  select(
    bedrooms_norm, bathrooms_norm, sqft_lot_norm, floors_norm,
    view_norm, condition_norm, sqft_above_norm, sqft_basement_norm,
    age_when_listed_norm, waterfront_norm, yr_renovated_norm
  )

# Target variable for KNN regression.
houses_output <- houses$price

Then to cross-validate I used this code (based on a textbook example). Does this only work for cross-validating linear regression problems, or is it also valid for K-NN?

set.seed(30)

# Row identifier (kept for compatibility; fold-based CV below uses the
# `fold` column instead).
houses <- houses %>%
  tibble::rowid_to_column("ID")

# Accumulator for per-fold error metrics. tibble() replaces the
# deprecated argument-less as_tibble().
temp <- tibble()

# Assign each row to one of 10 folds at random. Use nrow(houses) rather
# than the hard-coded row count 4202, so the code still works if the
# price filtering above changes.
houses <- houses %>%
  mutate(fold = sample(1:10, nrow(houses), replace = TRUE))

# 10-fold cross-validation of the linear model.
# BUG FIX: the original filtered on ID (== / != obs_num), which does
# leave-one-out over only the first 10 rows. Folds must be selected via
# the `fold` column so every row is validated exactly once.
K <- 10
for (obs_num in 1:K) {
  train <- houses %>%
    filter(fold != obs_num)

  validation <- houses %>%
    filter(fold == obs_num)

  train.mlr <- lm(price ~ bedrooms + bathrooms + sqft_lot + floors +
                    view + condition + sqft_above + sqft_basement +
                    age_when_listed + yr_renovated + waterfront,
                  data = train)

  validation <- validation %>%
    mutate(price_prediction = predict(train.mlr, validation))

  # forecast::accuracy returns a matrix; element 2 is RMSE, 5 is MAPE.
  am <- accuracy(validation$price_prediction, validation$price)

  # Column kept as "RSME" (sic) because the summaries below use that name.
  temp <- temp %>%
    bind_rows(tibble(run = obs_num, RSME = am[2], MAPE = am[5]))

  print(paste("iteration", obs_num, "completed"))
}

# Average cross-validated error metrics across the folds.
temp %>%
  summarise(across(MAPE, list(mean = mean, sd = sd),
                   .names = "{.fn}_{.col}"))

temp %>%
  summarise(across(RSME, list(mean = mean, sd = sd),
                   .names = "{.fn}_{.col}"))

Is this code accurate for K-fold cross-validation, or do I need to change it? It currently outputs error levels, but I am uncertain whether they are correct.

user438383
  • 5,716
  • 8
  • 28
  • 43
Danny Warner
  • 21
  • 1
  • 3

1 Answer

0

It doesn't look quite right to me at the moment. Within your cross validation loop you are assigning to your train and validation tables based on the ID variable. I think if you change this to be based on your fold variable instead, then it should work ok.

e.g.

# Fold-based 10-fold cross-validation of the linear model: the train /
# validation split is driven by the `fold` column, not the row ID.
K <- 10
for (obs_num in 1:K) {
  train <- houses %>%
    filter(fold != obs_num)

  validation <- houses %>%
    filter(fold == obs_num)

  train.mlr <- lm(price ~ bedrooms + bathrooms + sqft_lot + floors +
                    view + condition + sqft_above + sqft_basement +
                    age_when_listed + yr_renovated + waterfront,
                  data = train)

  validation <- validation %>%
    mutate(price_prediction = predict(train.mlr, validation))

  # forecast::accuracy: element 2 is RMSE, element 5 is MAPE.
  am <- accuracy(validation$price_prediction, validation$price)

  temp <- temp %>%
    bind_rows(tibble(run = obs_num, RSME = am[2], MAPE = am[5]))

  # FIX: the original passed sep = " " to print(), where it is silently
  # ignored; paste() already separates with a space by default.
  print(paste("iteration", obs_num, "completed"))
}

It's generally fairly easy to adapt a cross-validation loop like this for different model types. The thing to check is that the predict() function still works the same way. predict is a wrapper for predicting using most model types — sometimes it requires additional input arguments, and sometimes the outputs are in a different format.

Your KNN model is slightly more complicated, as the FNN package doesn't have a simple predict function built into it. It would have to look something like this:

# 10-fold cross-validation of the KNN model.
# BUG FIX: FNN::knn is a CLASSIFIER — given a continuous response like
# price it treats each price as a class label and returns a factor, so
# indexing its result yields nonsense. For a numeric response use
# FNN::knn.reg, whose $pred component holds the averaged neighbour
# prices for each validation row.
K <- 10
for (obs_num in 1:K) {
  train <- houses %>%
    filter(fold != obs_num)

  validation <- houses %>%
    filter(fold == obs_num)

  # Select the predictors and convert to numeric matrices. Train and
  # validation must have identical column order.
  predictor_cols <- c("bedrooms", "bathrooms", "sqft_lot", "floors",
                      "view", "condition", "sqft_above",
                      "sqft_basement", "age_when_listed",
                      "yr_renovated", "waterfront")

  train.matrix <- train %>%
    select(all_of(predictor_cols)) %>%
    as.matrix()

  validation.matrix <- validation %>%
    select(all_of(predictor_cols)) %>%
    as.matrix()

  # Here k is set to 3, but you can tune it.
  train.knn <- FNN::knn.reg(train.matrix, validation.matrix,
                            y = train$price, k = 3)

  validation <- validation %>%
    mutate(price_prediction = train.knn$pred)

  am <- accuracy(validation$price_prediction, validation$price)

  temp <- temp %>%
    bind_rows(tibble(run = obs_num, RSME = am[2], MAPE = am[5]))

  print(paste("iteration", obs_num, "completed"))
}

Of course, it's hard to tell if this works without seeing your data. Can you provide a sample of your data, or a reproducible example using a dataset available in R? With the above code you will have to make sure that the format of train.matrix and validation.matrix match exactly. Also, note that FNN works with numeric data only - if you have any categorical predictors they will have to be one-hot-encoded first.

rw2
  • 1,549
  • 1
  • 11
  • 20