0

I am currently doing a K-fold cross-validation procedure to determine which model (linear or quadratic) is best for this data. My data comes from a CSV dataset called combinedData (loaded into R as `combinedDataset` in the code below); I've pasted its dput output below:

structure(list(Unit.ID = c(925L, 967L, 1054L, 967L, 1054L, 967L, 
1160L, 1054L, 1160L, 967L, 967L, 1054L, 1160L, 967L, 1054L, 1160L, 
967L, 1160L, 1054L, 1054L, 967L, 1160L, 1054L, 967L, 1160L, 1054L, 
967L, 1160L, 1054L, 164L, 967L, 967L, 1160L, 1054L, 164L, 967L, 
164L, 1160L, 164L, 1054L, 967L, 164L, 1054L, 967L, 1054L, 164L, 
967L, 164L, 164L, 1054L, 967L, 164L, 967L, 164L, 1054L, 164L, 
925L, 164L, 967L, 1054L, 1054L, 925L, 925L, 164L, 165L, 164L, 
1054L, 967L, 164L, 165L, 967L, 164L, 164L, 165L, 1054L, 967L, 
967L, 165L, 164L, 1054L, 967L, 165L, 967L, 165L, 164L, 967L, 
164L, 967L, 164L, 967L, 164L, 967L, 164L, 1054L, 164L, 164L, 
164L, 164L, 164L, 164L, 164L), Hour.Meter.Reading = c(34L, 381L, 
532L, 600L, 732L, 783L, 796L, 947L, 1016L, 1038L, 1200L, 1282L, 
1290L, 1388L, 1481L, 1528L, 1579L, 1671L, 1704L, 1728L, 1755L, 
1906L, 1926L, 1936L, 2031L, 2063L, 2136L, 2205L, 2293L, 2321L, 
2342L, 2382L, 2425L, 2505L, 2524L, 2576L, 2704L, 2731L, 2777L, 
2811L, 2873L, 2960L, 2997L, 3080L, 3170L, 3175L, 3264L, 3371L, 
3386L, 3425L, 3485L, 3570L, 3690L, 3740L, 3746L, 3854L, 3863L, 
3976L, 3990L, 3991L, 4078L, 4103L, 4106L, 4138L, 4138L, 4216L, 
4249L, 4253L, 4305L, 4326L, 4353L, 4483L, 4489L, 4489L, 4500L, 
4580L, 4581L, 4652L, 4721L, 4742L, 4784L, 4805L, 4828L, 4943L, 
4947L, 4954L, 4968L, 5298L, 5316L, 5407L, 5533L, 5628L, 5712L, 
5747L, 5951L, 6165L, 6194L, 6439L, 6636L, 6702L, 6918L), Labour.Cost = c(1102.5, 
4270, 542.5, 2730, 682.5, 3097.5, 336, 871.5, 525, 2695, 1837.5, 
1092, 1995, 2572.5, 1092, 924, 840, 1575, 693, 693, 560, 2100, 
7959, 2747.5, 1092, 1764, 2030, 5355, 7434, 315, 1890, 2688, 
504, 3024, 805, 1701, 577.5, 777, 6440, 1281, 588, 4910, 1470, 
1911, 3738, 4140, 9219, 525, 1995, 1239, 1491, 2292.5, 4389, 
2012.5, 1134, 945, 490, 3307.5, 714, 756, 1302, 297.5, 875, 1872.5, 
1435, 1767.5, 2037, 3108, 1645, 1067.5, 3087, 1452.5, 11777.5, 
5670, 4872, 2916, 4158, 5350, 2817.5, 84, 1596, 3865, 714, 910, 
4112.5, 1197, 3622.5, 714, 3675, 4767, 3150, 2142, 2436, 210, 
1974, 3843, 14532, 2373, 2919, 7098, 2205), Parts.Cost = c(657.6733, 
6451.9113, 2235.8885, 6729.7326, 8357.0427, 9224.9012, 1957.0181, 
6890.5315, 3156.4815, 2009.3578, 4555.0977, 3458.6842, 1546.2183, 
6249.232, 4430.8058, 3835.5721, 3415.2062, 4868.2379, 2151.4558, 
2233.2055, 2554.7489, 7433.8141, 2563.289, 3348.7162, 2173.6179, 
1940.2806, 4404.6421, 5626.8595, 10553.4599, 12.62, 11405.5704, 
2554.2787, 1907.3543, 12625.7525, 243.5735, 6104.7416, 405.959, 
3609.1684, 4647.767, 12842.3638, 489.477, 9961.5883, 1706.0572, 
2381.7686, 15177.0692, 5416.7948, 16538.1428, 253.3975, 1390.5058, 
8699.7549, 7759.8042, 5128.0276, 8556.2625, 5760.523, 1923.699, 
628.643, 158.4313, 14481.7111, 3796.3243, 11671.4333, 7140.2504, 
1326.837, 441.0999, 2866.2141, 4229.31, 2935.825, 7452.8686, 
11683.7093, 2644.1532, 418.679, 11665.8066, 523.9236, 18247.2776, 
8115.265, 25011.6846, 13727.0801, 31786.6422, 6064.3123, 10599.0455, 
119.4423, 1228.3541, 3587.7566, 3666.517, 472.1537, 1968.7669, 
1417.8506, 8023.1254, 5831.6884, 14873.8008, 10193.2736, 6442.1719, 
7525.4562, 4378.1336, 1691.4286, 12144.6891, 13094.8609, 20582.1682, 
2544.103, 16934.6748, 17344.5551, 8912.7088), Total.Cost = c(1760.1733, 
10721.9113, 2778.3885, 9459.7326, 9039.5427, 12322.4012, 2293.0181, 
7762.0315, 3681.4815, 4704.3578, 6392.5977, 4550.6842, 3541.2183, 
8821.732, 5522.8058, 4759.5721, 4255.2062, 6443.2379, 2844.4558, 
2926.2055, 3114.7489, 9533.8141, 10522.289, 6096.2162, 3265.6179, 
3704.2806, 6434.6421, 10981.8595, 17987.4599, 327.62, 13295.5704, 
5242.2787, 2411.3543, 15649.7525, 1048.5735, 7805.7416, 983.459, 
4386.1684, 11087.767, 14123.3638, 1077.477, 14871.5883, 3176.0572, 
4292.7686, 18915.0692, 9556.7948, 25757.1428, 778.3975, 3385.5058, 
9938.7549, 9250.8042, 7420.5276, 12945.2625, 7773.023, 3057.699, 
1573.643, 648.4313, 17789.2111, 4510.3243, 12427.4333, 8442.2504, 
1624.337, 1316.0999, 4738.7141, 5664.31, 4703.325, 9489.8686, 
14791.7093, 4289.1532, 1486.179, 14752.8066, 1976.4236, 30024.7776, 
13785.265, 29883.6846, 16643.0801, 35944.6422, 11414.3123, 13416.5455, 
203.4423, 2824.3541, 7452.7566, 4380.517, 1382.1537, 6081.2669, 
2614.8506, 11645.6254, 6545.6884, 18548.8008, 14960.2736, 9592.1719, 
9667.4562, 6814.1336, 1901.4286, 14118.6891, 16937.8609, 35114.1682, 
4917.103, 19853.6748, 24442.5551, 11117.7088), Cumulative.Cost = c(1760.1733, 
12482.0846, 15260.4731, 24720.2057, 33759.7484, 46082.1496, 48375.1677, 
56137.1992, 59818.6807, 64523.0385, 70915.6362, 75466.3204, 79007.5387, 
87829.2707, 93352.0765, 98111.6486, 102366.8548, 108810.0927, 
111654.5485, 114580.754, 117695.5029, 127229.317, 137751.606, 
143847.8222, 147113.4401, 150817.7207, 157252.3628, 168234.2223, 
186221.6822, 186549.3022, 199844.8726, 205087.1513, 207498.5056, 
223148.2581, 224196.8316, 232002.5732, 232986.0322, 237372.2006, 
248459.9676, 262583.3314, 263660.8084, 278532.3967, 281708.4539, 
286001.2225, 304916.2917, 314473.0865, 340230.2293, 341008.6268, 
344394.1326, 354332.8875, 363583.6917, 371004.2193, 383949.4818, 
391722.5048, 394780.2038, 396353.8468, 397002.2781, 414791.4892, 
419301.8135, 431729.2468, 440171.4972, 441795.8342, 443111.9341, 
447850.6482, 453514.9582, 458218.2832, 467708.1518, 482499.8611, 
486789.0143, 488275.1933, 503027.9999, 505004.4235, 535029.2011, 
548814.4661, 578698.1507, 595341.2308, 631285.873, 642700.1853, 
656116.7308, 656320.1731, 659144.5272, 666597.2838, 670977.8008, 
672359.9545, 678441.2214, 681056.072, 692701.6974, 699247.3858, 
717796.1866, 732756.4602, 742348.6321, 752016.0883, 758830.2219, 
760731.6505, 774850.3396, 791788.2005, 826902.3687, 831819.4717, 
851673.1465, 876115.7016, 887233.4104)), class = "data.frame", row.names = c(NA, 
-101L))

So far, I've created all the models I need (K=5) for both the linear and quadratic cases, and I am at the stage where I am trying to calculate the MSE and R-squared values. Here's the code for the process:

#linear model (Model 1) k-validation
#splitting the testing data into 5 k folds

set.seed(123)
idx <- sample(1:nrow(combinedDataset), nrow(combinedDataset))
view(idx)
test_size <- floor(nrow(combinedDataset)*0.2)
test1 <- combinedDataset[idx[1:test_size],]
train1 <- combinedDataset[-idx[1:test_size],]
view(test1)
view(train1)
train_X1 <- train1$Hour.Meter.Reading
train_y1 <- train1$Cumulative.Cost
test_X1 <- test1$Hour.Meter.Reading
test_y1 <- test1$Cumulative.Cost

X1 <- train_X1
y1 <- train_y1

#Create the 5 linear model equations

poly_order <- 1
Model1 <- lm(y1~poly(X1, poly_order))
print(Model1)

#Calculate MSE

test_yhat1 <- predict(Model1, data.frame(X1 = test_X1))
MSE1 <- mean((test_y1-test_yhat1)^2)
print(MSE1)

But for the last part of the code where I am calculating the MSE value for the first model, I keep getting this error:

Error: variable 'poly(X1, poly_order)' was fitted with type "nmatrix.1" but type "nmatrix.2" was supplied In addition: Warning message: In Z/rep(sqrt(norm2[-1L]), each = length(x)) : longer object length is not a multiple of shorter object length

I have no idea what that error message means or how to fix it. I've checked over my code multiple times, but I haven't noticed anything wrong with my Model1.

Edit: Made the code shorter

Amsi
  • 11
  • 2
  • 1
    You might want to revisit the guidance on [mcve] with emphasis on *minimal*. You've posted almost 200 lines of code, many of which seem pretty repetitive. Try to pare down the example to just the part where the error occurs, so folks aren't trying to dig through so much code for you – camille Nov 26 '19 at 22:15
  • I think I've mentioned where the error occurs right after the code. But I understand; I'll edit the code to show only 1 of the 5 folds – Amsi Nov 26 '19 at 22:17
  • Hey, if you run what is in your edited version, it works now :) Previous version was too repetitive, I simply lost track of how many X1s were swimming around – StupidWolf Nov 26 '19 at 22:21
  • That's odd, I am not sure why I am getting an error for the full code then? It's the same thing – Amsi Nov 26 '19 at 22:29

0 Answers0