When using k-NN to predict price, how do you use K-fold cross-validation? My current code to prepare the data for prediction is:
library("tidyverse")
library("FNN")
library("forecast")
library("caret")
library("stats")
houses = read_csv("data.csv")
# Keep listings priced between $100,000 and $1,000,000 and drop the location text columns
houses = houses %>%
  filter(price >= 100000, price <= 1000000) %>%
  select(-street, -city, -statezip, -country)
# Extract the listing year from the date, compute the age when listed, and flag renovated houses
houses = houses %>%
  mutate(date = as.numeric(format(as.Date(date), format = "%Y")),
         yr_built = as.numeric(yr_built),
         age_when_listed = date - yr_built,
         renovated = ifelse(yr_renovated > 0, 1, 0))
b1= mean(houses$bedrooms)
b2=sd(houses$bedrooms)
c1= mean(houses$bathrooms)
c2=sd(houses$bathrooms)
e1= mean(houses$sqft_lot)
e2=sd(houses$sqft_lot)
f1= mean(houses$floors)
f2=sd(houses$floors)
g1= mean(houses$view)
g2=sd(houses$view)
h1=mean(houses$waterfront)
h2=sd(houses$waterfront)
i1= mean(houses$condition)
i2=sd(houses$condition)
j1= mean(houses$sqft_above)
j2=sd(houses$sqft_above)
k1= mean(houses$sqft_basement)
k2=sd(houses$sqft_basement)
l1= mean(houses$age_when_listed)
l2=sd(houses$age_when_listed)
m1=mean(houses$yr_renovated)
m2=sd(houses$yr_renovated)
houses = houses %>%
mutate(bedrooms_norm = (bedrooms-b1)/b2,bathrooms_norm = (bathrooms-c1)/c2,
sqft_lot_norm = (sqft_lot-e1)/e2,floors_norm = (floors-f1)/f2,
view_norm = (view-g1)/g2, condition_norm = (condition-i1)/i2,
sqft_above_norm = (sqft_above-j1)/j2, sqft_basement_norm = (sqft_basement-k1)/k2,
age_when_listed_norm = (age_when_listed-l1)/l2, waterfront_norm=(waterfront-h1)/h2,
yr_renovated_norm=(yr_renovated-m1)/m2)
houses_input_norm = houses %>%
select(bedrooms_norm, bathrooms_norm,
sqft_lot_norm, floors_norm, view_norm,condition_norm, sqft_above_norm,
sqft_basement_norm, age_when_listed_norm, waterfront_norm, yr_renovated_norm)
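As an aside, I think the block of mean/sd constants above could be replaced with caret's preProcess, since caret is already loaded. This is just a sketch of the idea, not what I am currently running, and note the scaled columns keep their original names rather than getting a _norm suffix:
# Sketch: centre and scale the predictors with caret instead of manual mean/sd constants
predictor_cols = c("bedrooms", "bathrooms", "sqft_lot", "floors", "view", "waterfront",
                   "condition", "sqft_above", "sqft_basement", "age_when_listed", "yr_renovated")
pre = preProcess(houses[, predictor_cols], method = c("center", "scale"))
houses_input_norm = predict(pre, houses[, predictor_cols])
The same pre object could then be applied to the new observation below with predict(pre, newdata[, predictor_cols]), so the training means and standard deviations are reused.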
#New sample observation
newdata= as_tibble(list(bedrooms=4,bathrooms=3, sqft_lot=2000,floors=2, waterfront= 0,
view=2, condition=3,sqft_above = 3000,sqft_basement=0,age_when_listed=20, yr_renovated=0))
newdata = newdata %>%
mutate(bedrooms_norm = (bedrooms-b1)/b2,bathrooms_norm = (bathrooms-c1)/c2,
sqft_lot_norm = (sqft_lot-e1)/e2,floors_norm = (floors-f1)/f2,
view_norm = (view-g1)/g2, condition_norm = (condition-i1)/i2,
sqft_above_norm = (sqft_above-j1)/j2, sqft_basement_norm = (sqft_basement-k1)/k2,
age_when_listed_norm = (age_when_listed-l1)/l2,waterfront_norm=(waterfront-h1)/h2,
yr_renovated_norm=(yr_renovated-m1)/m2)
newdata_input_norm = newdata %>%
select(bedrooms_norm, bathrooms_norm,
sqft_lot_norm, floors_norm, view_norm,
condition_norm, sqft_above_norm, sqft_basement_norm,
age_when_listed_norm,waterfront_norm, yr_renovated_norm)
houses_output= houses$price
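For completeness, the prediction step itself would then be something along these lines with FNN::knn.reg (k = 5 here is just a placeholder, not a tuned value):
# Sketch: k-NN regression prediction for the new observation (k = 5 is a placeholder)
knn_pred = knn.reg(train = houses_input_norm,
                   test = newdata_input_norm,
                   y = houses_output,
                   k = 5)
knn_pred$pred  # predicted price for the new observation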
Then to cross-validate I used this code (based on a textbook example). Is this only for cross-validating linear regression, or does the same approach work for k-NN as well?
set.seed(30)
houses = houses %>%
  tibble::rowid_to_column("ID")
temp = tibble()
# Randomly assign each row to one of K = 10 folds
K = 10
houses = houses %>%
  mutate(fold = sample(1:K, nrow(houses), replace = TRUE))
for(fold_num in 1:K)
{
  # Train on all folds except the current one, validate on the held-out fold
  train = houses %>%
    filter(fold != fold_num)
  validation = houses %>%
    filter(fold == fold_num)
  train.mlr = lm(price ~ bedrooms + bathrooms + sqft_lot + floors +
                   view + condition + sqft_above + sqft_basement + age_when_listed +
                   yr_renovated + waterfront, train)
  validation = validation %>%
    mutate(price_prediction = predict(train.mlr, validation))
  am = accuracy(validation$price_prediction, validation$price)
  temp = temp %>%
    bind_rows(tibble(run = fold_num, RMSE = am[1, "RMSE"], MAPE = am[1, "MAPE"]))
  print(paste("iteration", fold_num, "completed"))
}
temp %>%
  summarise(mean_MAPE = mean(MAPE), sd_MAPE = sd(MAPE))
temp %>%
  summarise(mean_RMSE = mean(RMSE), sd_RMSE = sd(RMSE))
Is this code an accurate way to do K-fold cross-validation, or do I need to change it? It currently outputs error metrics, but I am not sure whether they are correct.
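For reference, this is a minimal sketch of what I think the loop would look like with knn.reg in place of lm, reusing the fold column created above (k = 5 is again a placeholder). Is this the right structure for k-NN?
# Sketch: K-fold cross-validation for k-NN regression (k = 5 is a placeholder)
knn_results = tibble()
for(fold_num in 1:K)
{
  # Split the standardized predictors and the price vector by fold membership
  train_x = houses_input_norm[houses$fold != fold_num, ]
  valid_x = houses_input_norm[houses$fold == fold_num, ]
  train_y = houses_output[houses$fold != fold_num]
  valid_y = houses_output[houses$fold == fold_num]
  pred = knn.reg(train = train_x, test = valid_x, y = train_y, k = 5)$pred
  am = accuracy(pred, valid_y)
  knn_results = knn_results %>%
    bind_rows(tibble(run = fold_num, RMSE = am[1, "RMSE"], MAPE = am[1, "MAPE"]))
}
knn_results %>%
  summarise(mean_RMSE = mean(RMSE), sd_RMSE = sd(RMSE),
            mean_MAPE = mean(MAPE), sd_MAPE = sd(MAPE))
I think caret::createFolds(houses$price, k = 10, list = FALSE) would give more evenly sized folds than sample(), but I have left the sample() assignment above as is.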