I would like to clarify my understanding of the result of a trained M5P model. I trained an M5P model that gave me a tree followed by 4 linear models.
M5 unpruned model tree:
(using smoothed linear models)
Value12 <= 2.266 :
| Value2 <= 1111.5 : LM1 (2/0.01%)
| Value2 > 1111.5 : LM2 (4/2.268%)
Value12 > 2.266 :
| Value3 <= 1544650 : LM3 (2/1.652%)
| Value3 > 1544650 : LM4 (2/92.017%)
LM num: 1
Value15 =
-0.0001 * Value2
+ 1.8377
LM num: 2
Value15 =
-0.0001 * Value2
+ 1.8181
LM num: 3
Value15 =
-0 * Value3
+ 1.7212
LM num: 4
Value15 =
-0 * Value3
+ 1.7093
Number of Rules : 4
To make sure that I understood the working principle, I tried to manually replicate the results using the decision tree and the referenced linear model (LM), but the results were not as expected.
I used the tree to determine which linear model (LM) applies to each row and then evaluated that model's equation, but the values I obtained were not the same as the model's predictions. Is that normal?
The dataset I used:
# Training data: 10 observations of 13 variables. Value15 is the response
# used in the model formula; the column names are taken from the list itself.
Data_train <- structure(
  list(
    Value2 = c(610L, 1245L, 978L, 610L, 978L, 610L, 1727L, 1810L, 1805L,
               1805L),
    Value3 = c(1544673L, 2206981L, 2512821L, 1544627L, 2512792L, 1524144L,
               3415598L, 9205162L, 9182166L, 9182089L),
    Value4 = c(12.1260004043579, 17.3250007629395, 19.7259998321533,
               12.125, 19.7250003814697, 11.9650001525879,
               26.8120002746582, 72.2610015869141, 72.0800018310547,
               72.0790023803711),
    Value5 = c(0.0817999988794327, 0.0856000036001205, 0.0828000009059906,
               0.0817999988794327, 0.0828000009059906, 0.09009999781847,
               0.145199999213219, 0.200299993157387, 0.200299993157387,
               0.200200006365776),
    Value6 = c(2L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L),
    # Value7 and Value10 are constant (all zero) in this sample.
    Value7 = rep(0L, 10L),
    Value8 = c(4L, 4L, 4L, 4L, 4L, 4L, 22L, 36L, 36L, 36L),
    Value9 = c(1L, 1L, 2L, 1L, 2L, 1L, 8L, 6L, 6L, 6L),
    Value10 = rep(0L, 10L),
    Value11 = c(0.958189010620117, 1, 0.925986051559448,
                0.958268105983734, 0.926032960414886, 0.971082329750061,
                0.471057742834091, 0.476771682500839, 0.47670641541481,
                0.47671303153038),
    Value12 = c(3.27869, 0.80321, 2.04499, 3.27869, 2.04499, 3.27869,
                2.31616, 2.20994, 2.21607, 2.21607),
    Value13 = c(1L, 0L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L),
    Value15 = c(1.33398258686066, 1.90592515468597, 2.17005920410156,
                1.33387243747711, 2.1699492931366, 1.31627094745636,
                0.353617042303085, 1.93668437004089, 1.93183350563049,
                1.93180668354034)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
Here is the formula I used to train the model:
# RWeka provides the M5P() and Weka_control() functions used below.
library(RWeka)

# Fit an M5P model tree for Value15 on the training data. Value5 is present
# in Data_train but is not part of the formula. N = TRUE is forwarded to Weka
# as the -N option (unpruned tree, matching the "M5 unpruned model tree"
# header in the printed model).
Data_modelUnPruned <- M5P(
  Value15 ~ Value6 + Value3 + Value4 + Value2 + Value7 + Value8 + Value9 +
    Value10 + Value11 + Value12 + Value13,
  data = Data_train,
  control = Weka_control(N = TRUE)
)
Here is the resulting dataset after having added the prediction column:
# Training data with the model's predictions appended as Model_Prediction:
# 10 observations of 14 variables. The column names are taken from the list.
Data_train_Results <- structure(
  list(
    Value2 = c(610L, 1245L, 978L, 610L, 978L, 610L, 1727L, 1810L, 1805L,
               1805L),
    Value3 = c(1544673L, 2206981L, 2512821L, 1544627L, 2512792L, 1524144L,
               3415598L, 9205162L, 9182166L, 9182089L),
    Value4 = c(12.1260004043579, 17.3250007629395, 19.7259998321533,
               12.125, 19.7250003814697, 11.9650001525879,
               26.8120002746582, 72.2610015869141, 72.0800018310547,
               72.0790023803711),
    Value5 = c(0.0817999988794327, 0.0856000036001205, 0.0828000009059906,
               0.0817999988794327, 0.0828000009059906, 0.09009999781847,
               0.145199999213219, 0.200299993157387, 0.200299993157387,
               0.200200006365776),
    Value6 = c(2L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L),
    # Value7 and Value10 are constant (all zero) in this sample.
    Value7 = rep(0L, 10L),
    Value8 = c(4L, 4L, 4L, 4L, 4L, 4L, 22L, 36L, 36L, 36L),
    Value9 = c(1L, 1L, 2L, 1L, 2L, 1L, 8L, 6L, 6L, 6L),
    Value10 = rep(0L, 10L),
    Value11 = c(0.958189010620117, 1, 0.925986051559448,
                0.958268105983734, 0.926032960414886, 0.971082329750061,
                0.471057742834091, 0.476771682500839, 0.47670641541481,
                0.47671303153038),
    Value12 = c(3.27869, 0.80321, 2.04499, 3.27869, 2.04499, 3.27869,
                2.31616, 2.20994, 2.21607, 2.21607),
    Value13 = c(1L, 0L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L),
    Value15 = c(1.33398258686066, 1.90592515468597, 2.17005920410156,
                1.33387243747711, 2.1699492931366, 1.31627094745636,
                0.353617042303085, 1.93668437004089, 1.93183350563049,
                1.93180668354034),
    Model_Prediction = c(1.56039428073199, 1.74959163286097,
                         1.77758972532522, 1.57231876013397,
                         1.77758972532522, 1.57429264935954,
                         1.38009848913172, 1.71850280973615,
                         1.71877793206469, 1.71877793206469)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
Here is the code I used to try to replicate the model results; it is basically a hard-coded version of the M5P model in Visual Basic.
' Hard-coded replica of the printed M5P model tree: walks the two-level
' decision tree and returns the value of the linear model (LM1..LM4) at the
' selected leaf.
'
' Only Value2, Value3 and Value12 are read. The remaining parameters are
' accepted so the signature mirrors the predictors of the trained model, but
' they do not influence the result.
'
' NOTE(review): the coefficients below are copied from the printed model. The
' "-0 * Value3" terms are presumably non-zero coefficients that the printout
' rounded to four decimals, and the printout states that the linear models are
' smoothed, so this function is not expected to reproduce the model's own
' predictions exactly. Confirm against the Weka M5P documentation.
Public Function GetLM(Value2 As Long, Value3 As Long, Value4 As Double, _
                      Value6 As Long, Value7 As Long, Value8 As Long, _
                      Value9 As Long, Value10 As Long, Value11 As Double, _
                      Value12 As Double, Value13 As Long)
    If Value12 <= 2.266 Then
        If Value2 <= 1111.5 Then
            ' LM1 (2/0.019%)
            GetLM = -0.0001 * Value2 + 1.8377
        Else
            ' LM2 (4/2.269%)
            GetLM = -0.0001 * Value2 + 1.8181
        End If
    Else
        If Value3 <= 1544650 Then
            ' LM3 (2/1.652%)
            GetLM = -0 * Value3 + 1.7212
        Else
            ' LM4 (2/92.021%)
            GetLM = -0 * Value3 + 1.7093
        End If
    End If
End Function
Can someone explain to me how this should work?
Thank you very much.