0

I'm creating a multinomial model to predict the outcomes of hockey games.

Packages

library(tools)
library(utils)
library(dplyr)
library(nnet)
library(VGAM)
library(mlogit)
library(foreign)

Dataset (the `dput()` output for 20 rows of `NHL6`; assign it to `NHL6` to run the code below):

structure(list(GID = 1:20, Date = structure(c(17097, 17100, 17102, 
17107, 17109, 17111, 17120, 17122, 17125, 17127, 17130, 17134, 
17142, 17144, 17146, 17162, 17167, 17170, 17172, 17174), class = "Date"), 
totHomeGoals = c(4L, 6L, 0L, 1L, 5L, 4L, 4L, 3L, 2L, 2L, 
2L, 2L, 5L, 3L, 5L, 2L, 3L, 2L, 3L, 1L), totAwayGoals = c(2L, 
1L, 4L, 5L, 1L, 1L, 1L, 2L, 3L, 2L, 3L, 1L, 5L, 2L, 1L, 3L, 
3L, 0L, 2L, 2L), TOIHome = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 1, 0, 1, 0), TOIAway = c(0, 0, 0, 0, 0, 
0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0), DEC = structure(c(3L, 
3L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 2L, 1L, 3L, 2L, 3L, 3L, 2L, 
2L, 3L, 2L, 1L), .Label = c("-1", "0", "1"), class = "factor"), 
totHomeShots = c(37L, 26L, 35L, 33L, 33L, 21L, 27L, 23L, 
30L, 41L, 36L, 38L, 38L, 32L, 32L, 36L, 25L, 24L, 35L, 24L
), totHomePP = c(1L, 3L, 0L, 1L, 1L, 2L, 1L, 0L, 1L, 1L, 
0L, 1L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L), totAwayShots = c(19L, 
29L, 37L, 34L, 22L, 26L, 34L, 29L, 29L, 35L, 25L, 40L, 34L, 
24L, 22L, 25L, 55L, 23L, 23L, 36L), totAwayPP = c(0L, 1L, 
1L, 1L, 0L, 0L, 0L, 0L, 2L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 1L), totHomeSaves = c(17L, 28L, 33L, 29L, 21L, 25L, 
33L, 27L, 26L, 33L, 22L, 39L, 29L, 22L, 21L, 22L, 52L, 23L, 
21L, 34L), totAwaySaves = c(33L, 20L, 35L, 32L, 28L, 17L, 
23L, 20L, 28L, 39L, 34L, 36L, 33L, 29L, 27L, 34L, 22L, 22L, 
32L, 23L), HomeTeam = c("ANA", "ANA", "ANA", "ANA", "ANA", 
"ANA", "ANA", "ANA", "ANA", "ANA", "ANA", "ANA", "ANA", "ANA", 
"ANA", "ANA", "ANA", "ANA", "ANA", "ANA"), AwayTeam = c("VAN", 
"NSH", "CBJ", "PIT", "ARI", "CGY", "EDM", "NJD", "LAK", "NYI", 
"CHI", "MTL", "CAR", "SJS", "OTT", "SJS", "PHI", "DET", "ARI", 
"MIN"), HomeSH = c(0.108108108108108, 0.230769230769231, 
0, 0.0303030303030303, 0.151515151515152, 0.19047619047619, 
0.148148148148148, 0.130434782608696, 0.0666666666666667, 
0.0487804878048781, 0.0555555555555556, 0.0526315789473684, 
0.131578947368421, 0.09375, 0.15625, 0.0555555555555556, 
0.12, 0.0833333333333333, 0.0857142857142857, 0.0416666666666667
), AwaySH = c(0.105263157894737, 0.0344827586206897, 0.108108108108108, 
0.147058823529412, 0.0454545454545455, 0.0384615384615385, 
0.0294117647058824, 0.0689655172413793, 0.103448275862069, 
0.0571428571428571, 0.12, 0.025, 0.147058823529412, 0.0833333333333333, 
0.0454545454545455, 0.12, 0.0545454545454545, 0, 0.0869565217391304, 
0.0555555555555556), HomeSV = c(0.894736842105263, 0.96551724137931, 
0.891891891891892, 0.852941176470588, 0.954545454545455, 
0.961538461538462, 0.970588235294118, 0.931034482758621, 
0.896551724137931, 0.942857142857143, 0.88, 0.975, 0.852941176470588, 
0.916666666666667, 0.954545454545455, 0.88, 0.945454545454545, 
1, 0.91304347826087, 0.944444444444444), AwaySV = c(0.891891891891892, 
0.769230769230769, 1, 0.96969696969697, 0.848484848484849, 
0.80952380952381, 0.851851851851852, 0.869565217391304, 0.933333333333333, 
0.951219512195122, 0.944444444444444, 0.947368421052632, 
0.868421052631579, 0.90625, 0.84375, 0.944444444444444, 0.88, 
0.916666666666667, 0.914285714285714, 0.958333333333333)), .Names = c("GID", 
"Date", "totHomeGoals", "totAwayGoals", "TOIHome", "TOIAway", 
"DEC", "totHomeShots", "totHomePP", "totAwayShots", "totAwayPP",  
"totHomeSaves", "totAwaySaves", "HomeTeam", "AwayTeam", "HomeSH", 
"AwaySH", "HomeSV", "AwaySV"), row.names = c(NA, 20L), class = "data.frame")

Here is my model:

    # Multinomial logit for the game decision DEC (factor levels "-1", "0", "1"),
    # fitted on NHL6 using the per-game statistics listed in the formula.
    # NOTE(review): in the posted sample, totHomeShots - totAwaySaves equals
    # totHomeGoals and totAwayShots - totHomeSaves equals totAwayGoals, so these
    # predictors encode the final score; fitted probabilities near 0/1 follow
    # from that rather than from a coding error.
    Kolzig <- multinom(
      DEC ~ totHomeShots + totHomePP + totAwayShots + totAwayPP +
        totHomeSaves + totAwaySaves + HomeSH * totHomeShots +
        AwaySH * totAwayShots + HomeSV + AwaySV,
      data = NHL6
    )

I then call `predict()` on the fitted model to get the class probabilities.

    # Fitted probabilities for each DEC level ("-1", "0", "1"), one row per
    # game in the data the model was trained on (no newdata is supplied).
    Kolzig.pred <- predict(Kolzig, type = "probs")

The results, however, look clearly wrong:

-1            0            1
1     7.348283e-23 5.738844e-06 9.999943e-01
2     6.908534e-58 2.563978e-23 1.000000e+00
3     1.000000e+00 1.217702e-18 4.799552e-46
4     1.000000e+00 4.093737e-19 1.608055e-46
5     4.937595e-46 2.689526e-17 1.000000e+00

Many of the games show a probability of nearly 100% for a single outcome, and that outcome matches what actually occurred. What should I change here?

  • Please provide a small data sample (paste in the output of `dput(data_sample)`) and add code to load whatever package you're using. Also, in the `multinom` call, `NHL6$DEC` should be just `DEC`. – eipi10 Jun 20 '17 at 18:02
  • 1
    Please add it to your question, rather than in a comment, along with a data sample, so that we can easily copy and paste a fully working reproducible example into an R script. – eipi10 Jun 20 '17 at 18:05
  • Sorry, I'm fairly new to R. I hope what I added suffices. – Arian Modarres Jun 20 '17 at 18:18
  • We need the full output of `dput`. It should start with `structure(...`. You don't need to provide all the data, but we do need a sample that demonstrates the behavior you're seeing. For example, to provide 20 rows of data, do `dput(NHL6[1:20, ])`. Just copy the output from the R console and paste it into your question. – eipi10 Jun 20 '17 at 18:19
  • Hope that helps – Arian Modarres Jun 20 '17 at 18:37

0 Answers