2

I try to estimate multinomial logistic regression by using mlogit package, but I got the error. I have 205109 observations and below I attempted the code and error.

> Data <- read.csv("NWEScsv.csv",header=T)
> colnames(Data)
 [1] "NWSE"  "W.S"   "W.E"   "W.N"   "W.W"   "U.S"   "U.E"   "U.N"   "U.W"  
[10] "W.Not" "U.Not" "BW"    "BU" 
> movement = mlogit.data(Data, shape ="wide", varying = 2:11, choice = "NWSE")
> head(movement,15)
       NWSE     BW BU alt        W           U chid
1.E   FALSE 194.42  0   E 198.3434 2.47404e-10    1
1.N   FALSE 194.42  0   N 194.4160 1.41319e-10    1
1.Not  TRUE 194.42  0 Not 194.4200 0.00000e+00    1
1.S   FALSE 194.42  0   S 212.7249 2.08726e-10    1
1.W   FALSE 194.42  0   W 198.3434 1.37143e-10    1
2.E    TRUE 257.24  0   E 202.3502 8.05144e-10    2
2.N   FALSE 257.24  0   N 200.3368 4.59906e-10    2
> result.movement = mlogit(NWSE ~ 0 |BW+BU | W+U, movement)
 solve.default(H, g[!fixed]) error: 
  Lapack routine dgesv: system is exactly singular: U[20,20] = 0 

How can I solve the problem?

========

Sorry for long question. In my knowledge, when we try to estimate multiple logistic regression, we need to separate explanatory variables into three types. (ex: U_{ij}=α_{j}+β*X_{ij}+γ_{j} Z_{i}+δ_{j} W_{ij}+ε_{ij}, X_{ij},Z_{i},W_{ij}are three types) In the mlogit package, we need to separate the variables by using "|" and the equation will become like X_{ij}|Z_{i}|W_{ij}.

No matter how many times I try, I couldn't get the results when I put the variables into the W_{ij}. Below I put part of data structure.

> dput(head(movement, n = 25))
structure(list(NWSE = c(`1.E` = FALSE, `1.N` = FALSE, `1.Not` = TRUE, 
`1.S` = FALSE, `1.W` = FALSE, `2.E` = TRUE, `2.N` = FALSE, `2.Not` = FALSE, 
`2.S` = FALSE, `2.W` = FALSE, `3.E` = TRUE, `3.N` = FALSE, `3.Not` = FALSE, 
`3.S` = FALSE, `3.W` = FALSE, `4.E` = TRUE, `4.N` = FALSE, `4.Not` = FALSE,
`4.S` = FALSE, `4.W` = FALSE, `5.E` = TRUE, `5.N` = FALSE, `5.Not` = FALSE, 
`5.S` = FALSE, `5.W` = FALSE), BW = c(`1.E` = 194.42, `1.N` = 194.42,
`1.Not` = 194.42, `1.S` = 194.42, `1.W` = 194.42, `2.E` = 257.24, 
`2.N` = 257.24, `2.Not` = 257.24, `2.S` = 257.24, `2.W` = 257.24, 
`3.E` = 262.43, `3.N` = 262.43, `3.Not` = 262.43, `3.S` = 262.43, 
`3.W` = 262.43, `4.E` = 183.09, `4.N` = 183.09, `4.Not` = 183.09, 
`4.S` = 183.09, `4.W` = 183.09, `5.E` = 311.06, `5.N` = 311.06, 
`5.Not` = 311.06, `5.S` = 311.06, `5.W` = 311.06), BU = c(`1.E` = 0, 
`1.N` = 0, `1.Not` = 0, `1.S` = 0, `1.W` = 0, `2.E` = 0, `2.N` = 0, 
`2.Not` = 0, `2.S` = 0, `2.W` = 0, `3.E` = 0, `3.N` = 0, `3.Not` = 0, 
`3.S` = 0, `3.W` = 0, `4.E` = 0, `4.N` = 0, `4.Not` = 0, `4.S` = 0, 
`4.W` = 0, `5.E` = 0, `5.N` = 0, `5.Not` = 0, `5.S` = 0, `5.W` = 0
), alt = c(`1.E` = "E", `1.N` = "N", `1.Not` = "Not", `1.S` = "S", 
`1.W` = "W", `2.E` = "E", `2.N` = "N", `2.Not` = "Not", `2.S` = "S", 
`2.W` = "W", `3.E` = "E", `3.N` = "N", `3.Not` = "Not", `3.S` = "S", 
`3.W` = "W", `4.E` = "E", `4.N` = "N", `4.Not` = "Not", `4.S` = "S", 
`4.W` = "W", `5.E` = "E", `5.N` = "N", `5.Not` = "Not", `5.S` = "S", 
`5.W` = "W"), W = c(`1.E` = 198.3434254, `1.N` = 194.4159624, 
`1.Not` = 194.42, `1.S` = 212.7249464, `1.W` = 198.3434254, `2.E` = 
202.3502284, 
`2.N` = 200.33681, `2.Not` = 257.24, `2.S` = 219.2033856, `2.W` = 
204.383882, 
`3.E` = 208.5127103, `3.N` = 206.4379742, `3.Not` = 262.43, `3.S` = 
225.8791225, 
`3.W` = 210.6082979, `4.E` = 198.3434254, `4.N` = 194.4159624, 
`4.Not` = 183.09, `4.S` = 212.7249464, `4.W` = 198.3434254, `5.E` = 
 244.6919323, 
`5.N` = 239.8467074, `5.Not` = 311.06, `5.S` = 262.4340992, `5.W` = 
244.6919323
), U = c(`1.E` = 2.47e-10, `1.N` = 1.41e-10, `1.Not` = 0, `1.S` = 2.09e- 
  10, 
`1.W` = 1.37e-10, `2.E` = 8.05e-10, `2.N` = 4.6e-10, `2.Not` = 0, 
`2.S` = 6.73e-10, `2.W` = 4.46e-10, `3.E` = 6.53e-10, `3.N` = 3.73e-10, 
`3.Not` = 0, `3.S` = 5.51e-10, `3.W` = 3.62e-10, `4.E` = 2.47e-10, 
`4.N` = 1.41e-10, `4.Not` = 0, `4.S` = 2.09e-10, `4.W` = 1.37e-10, 
`5.E` = 2.65e-10, `5.N` = 1.52e-10, `5.Not` = 0, `5.S` = 2.24e-10, 
`5.W` = 1.47e-10), chid = c(`1.E` = 1L, `1.N` = 1L, `1.Not` = 1L, 
`1.S` = 1L, `1.W` = 1L, `2.E` = 2L, `2.N` = 2L, `2.Not` = 2L, 
`2.S` = 2L, `2.W` = 2L, `3.E` = 3L, `3.N` = 3L, `3.Not` = 3L, 
`3.S` = 3L, `3.W` = 3L, `4.E` = 4L, `4.N` = 4L, `4.Not` = 4L, 
`4.S` = 4L, `4.W` = 4L, `5.E` = 5L, `5.N` = 5L, `5.Not` = 5L, 
`5.S` = 5L, `5.W` = 5L)), reshapeLong = list(varying = structure(list(
W = c("W.S", "W.E", "W.N", "W.W", "W.Not"), U = c("U.S", 
"U.E", "U.N", "U.W", "U.Not")), v.names = c("W", "U"), times = c("S", 
"E", "N", "W", "Not")), v.names = c("W", "U"), idvar = "chid", 
timevar = "alt"), index = structure(list(chid = structure(c(1L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 
4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L), .Label = c("1", "2", "3", "4", 
"5"), class = "factor"), alt = structure(c(1L, 2L, 3L, 4L, 5L, 
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 
2L, 3L, 4L, 5L), .Label = c("E", "N", "Not", "S", "W"), class = "factor")), 
class = "data.frame", row.names = c("1.E", 
"1.N", "1.Not", "1.S", "1.W", "2.E", "2.N", "2.Not", "2.S", "2.W", 
"3.E", "3.N", "3.Not", "3.S", "3.W", "4.E", "4.N", "4.Not", "4.S", 
"4.W", "5.E", "5.N", "5.Not", "5.S", "5.W")), choice = "NWSE", row.names = 
c("1.E", 
"1.N", "1.Not", "1.S", "1.W", "2.E", "2.N", "2.Not", "2.S", "2.W", 
"3.E", "3.N", "3.Not", "3.S", "3.W", "4.E", "4.N", "4.Not", "4.S", 
"4.W", "5.E", "5.N", "5.Not", "5.S", "5.W"), class = c("mlogit.data", 
"data.frame"))
SEIKA UTA
  • 21
  • 3
  • Do you have perfect separation or only one observation per category? – MDEWITT Nov 25 '19 at 01:37
  • I added the head of data. BW and BU only depend on individual and W and U depend on individual and their choices. – SEIKA UTA Nov 25 '19 at 07:43
  • Can you do `table(movement$BU)` and see if there is variation. Other option is `lapply( movement[,sapply(movement,is.numeric)] , sd)`. This will test to see if you have any variation in your columns. This might be causing your fit issues. – MDEWITT Nov 26 '19 at 12:56
  • @MDEWITT Thank you for your comment. I tried to see the variation. > table(movement$BU) 0 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1 0.11 0.12 0.13 526520 155455 64380 35180 24465 19310 18560 13700 14055 9865 9765 8350 8295 7230 > lapply( movement[,sapply(movement,is.numeric)] , sd) $`BW` [1] 93.83399 $BU [1] 0.1472054 $W [1] 89.31311 $U [1] 0.1319652 $chid [1] 59209.61 But, how can I evaluate the results? – SEIKA UTA Nov 26 '19 at 13:56
  • the hypothesis was that you did not have enough variation in your predictors to make a stable fit due to co-linearity (e.g. your matrix of predictors wasn't full rank, had linear combinations of one another or did not vary and acted like a second intercept and thus was co-linear with the intercept). It still looks like that is the case. Could you try `caret::nearZeroVar(movement)`? – MDEWITT Nov 26 '19 at 14:16
  • @MDEWITT I got below result > caret::nearZeroVar(movement) integer(0) – SEIKA UTA Nov 27 '19 at 00:29
  • try `caret::findLinearCombos(movement)` – MDEWITT Nov 27 '19 at 00:57
  • @MDEWITTqr.default(object)Error in qr.default(.swts * attr(rhs, "gradient")) :NA/NaN/Inf in foreign function call (arg 1) – SEIKA UTA Nov 27 '19 at 03:36
  • I think I have troubleshoot as much as I can without some more data (eg. `dput(head(movement, n = 25))` and put that in the post if you can). I think you have some collinear predictors which is why it you are getting a singular matrix that is cannot be inverted. – MDEWITT Nov 27 '19 at 13:53
  • @MDEWITT I edited my question and add the results of "eg. dput(head(movement, n = 25))" . – SEIKA UTA Nov 27 '19 at 16:01

1 Answers1

1

So it looks like your data are not full rank, e.g. high collinearity. We can test this by examining the rank of the data vs the number of predictors, if the rank is not equal to the number of predictors you will have a non-unique fit which is the error you are getting

For example:

library(dplyr)
movement %>% 
    select_if(is.numeric) %>% 
    {
        cat("num predictors", ncol(.), "\n")
        qr(.) %>% 
            .$rank
    } 

Yields:

num predictors 5 
[1] 4

You can also see this by looking at the correlation of predictors:

movement %>% 
    select_if(is.numeric) %>% 
    cor()

Which yields:

           BW BU          W          U       chid
BW   1.0000000 NA  0.7360761  0.2079872  0.4765022
BU          NA  1         NA         NA         NA
W    0.7360761 NA  1.0000000 -0.2686001  0.4933215
U    0.2079872 NA -0.2686001  1.0000000 -0.1964596
chid 0.4765022 NA  0.4933215 -0.1964596  1.0000000

This indicates for example that BW and W provide basically the same information. I don't know your subject matter, but you should probably not include both of these predictors in your model. BU also does not have any variation in it, so that will throw errors.

MDEWITT
  • 2,338
  • 2
  • 12
  • 23