3

I want to multiply many of my binary variables into new columns, so called interactive variables. My dataset is structured like this:

YearCountry <- data.frame( Time = c("2000","2001", "2002", "2003",
                           "2000","2001", "2002", "2003",
                           "2000","2001", "2002", "2003"),
                  AL = c(1,1,1,1,0,0,0,0,0,0,0,0),
                  FR = c(0,0,0,0,1,1,1,1,0,0,0,0),
                  UK = c(0,0,0,0,0,0,0,0,1,1,1,1),
                  Y2000d = c(1,0,0,0,1,0,0,0,1,0,0,0),
                  Y2001d = c(0,1,0,0,0,1,0,0,0,1,0,0),
                  Y2002d = c(0,0,1,0,0,0,1,0,0,0,1,0),
                  Y2003d = c(0,0,0,1,0,0,0,1,0,0,0,1))
YearCountry

 Time AL FR UK Y2000d Y2001d Y2002d Y2003d
1  2000  1  0  0      1      0      0      0
2  2001  1  0  0      0      1      0      0
3  2002  1  0  0      0      0      1      0
4  2003  1  0  0      0      0      0      1
5  2000  0  1  0      1      0      0      0
6  2001  0  1  0      0      1      0      0
7  2002  0  1  0      0      0      1      0
8  2003  0  1  0      0      0      0      1
9  2000  0  0  1      1      0      0      0
10 2001  0  0  1      0      1      0      0
11 2002  0  0  1      0      0      1      0
12 2003  0  0  1      0      0      0      1

I need to multiply the binary variable for each of the countries (AL,FR,UK) with each of the binary variables for a given year so that I get #country x #year new variables. In this case I have three countries and four years which gives 12 new variables. My full data contains 105 countries/regions and stretches over twenty years. I therefore need a general formula. I want data that looks like this

Interact <- data.frame(Time = c("2000","2001", "2002", "2003",
                                "2000","2001", "2002", "2003",
                                "2000","2001", "2002", "2003"),
                       Y2000xAL = c(1,0,0,0,0,0,0,0,0,0,0,0),
            Y2001xAL = c(0,1,0,0,0,0,0,0,0,0,0,0),
            Y2002xAL = c(0,0,1,0,0,0,0,0,0,0,0,0),
            Y2003xAL = c(0,0,0,1,0,0,0,0,0,0,0,0),
            Y2000xFR = c(0,0,0,0,1,0,0,0,0,0,0,0),
            Y2001xFR = c(0,0,0,0,0,1,0,0,0,0,0,0),
            Y2002xFR = c(0,0,0,0,0,0,1,0,0,0,0,0),
            Y2003xFR = c(0,0,0,0,0,0,0,1,0,0,0,0),
            Y2000xUk = c(0,0,0,0,0,0,0,0,1,0,0,0),
            Y2001xUK = c(0,0,0,0,0,0,0,0,0,1,0,0),
            Y2002xUK = c(0,0,0,0,0,0,0,0,0,0,1,0),
            Y2003xUK = c(0,0,0,0,0,0,0,0,0,0,0,1))
Interact 

 Time Y2000xAL Y2001xAL Y2002xAL Y2003xAL Y2000xFR Y2001xFR Y2002xFR Y2003xFR Y2000xUk Y2001xUK Y2002xUK Y2003xUK
1  2000        1        0        0        0        0        0        0        0        0        0        0        0
2  2001        0        1        0        0        0        0        0        0        0        0        0        0
3  2002        0        0        1        0        0        0        0        0        0        0        0        0
4  2003        0        0        0        1        0        0        0        0        0        0        0        0
5  2000        0        0        0        0        1        0        0        0        0        0        0        0
6  2001        0        0        0        0        0        1        0        0        0        0        0        0
7  2002        0        0        0        0        0        0        1        0        0        0        0        0
8  2003        0        0        0        0        0        0        0        1        0        0        0        0
9  2000        0        0        0        0        0        0        0        0        1        0        0        0
10 2001        0        0        0        0        0        0        0        0        0        1        0        0
11 2002        0        0        0        0        0        0        0        0        0        0        1        0
12 2003        0        0        0        0        0        0        0        0        0        0        0        1

2 Answers2

2

Here's an approach with dplyr::across. We can make the final result into a plain data.frame with purrr:invoke as demonstrated in this answer.

library(dplyr)
library(purrr)
YearCountry %>% 
    mutate(across(AL:UK, ~ . * select(cur_data(), Y2000d:Y2003d))) %>%
    select(-(Y2000d:Y2003d)) %>% 
    invoke(.f = data.frame) %>%
    rename_with(~str_replace(.,"\\.",""))
   Time ALY2000d ALY2001d ALY2002d ALY2003d FRY2000d FRY2001d FRY2002d FRY2003d UKY2000d UKY2001d UKY2002d UKY2003d
1  2000         1         0         0         0         0         0         0         0         0         0         0         0
2  2001         0         1         0         0         0         0         0         0         0         0         0         0
3  2002         0         0         1         0         0         0         0         0         0         0         0         0
4  2003         0         0         0         1         0         0         0         0         0         0         0         0
5  2000         0         0         0         0         1         0         0         0         0         0         0         0
6  2001         0         0         0         0         0         1         0         0         0         0         0         0
7  2002         0         0         0         0         0         0         1         0         0         0         0         0
8  2003         0         0         0         0         0         0         0         1         0         0         0         0
9  2000         0         0         0         0         0         0         0         0         1         0         0         0
10 2001         0         0         0         0         0         0         0         0         0         1         0         0
11 2002         0         0         0         0         0         0         0         0         0         0         1         0
12 2003         0         0         0         0         0         0         0         0         0         0         0         1
Ian Campbell
  • 23,484
  • 14
  • 36
  • 57
  • This works but I think there is an issue with the column names because when I try to write this with library(haven) to a DTA-file (stata) I get the error "Error: Columns of type list not supported yet" – Emil Krabbe May 06 '21 at 14:36
  • Using `purrr::invoke` should take care of the problem. – Ian Campbell May 06 '21 at 14:48
  • purrr installed but new error when writing with haven is : Error: Failed to create column `AL.Y2000d`: A provided name contains an illegal character. Is there a way to just remove all the "." from column names? – Emil Krabbe May 06 '21 at 15:26
  • Sure, you can use `rename_with`. Check the edit. – Ian Campbell May 06 '21 at 15:32
2

1) model.matrix We split the names by the number of characters in them (the countries have 2 characters in their names and the years have 6) and paste pluses in each. (Alternately use Plus(grep("^..$", nms, value = TRUE)) to get the country names and use that in place of spl["2"] and similarly Plus(grep("^Y....d$", nms, value = TRUE)) in place of spl["6"].)

c(`2` = "AL+FR+UK", `6` = "Y2000d+Y2001d+Y2002d+Y2003d")

and from that the formula:

~(AL + FR + UK):(Y2000d + Y2001d + Y2002d + Y2003d) + 0

and then compute its model matrix.

The formula could also be expanded to one accepted by lm by modifying the sprintf format so we might not even need to create the model matrix. For example, if we had a response vector R then we could write: s <- sprintf("R ~ (%s)*(%s)", spl["2"], spl["4"]); fo <- formula(s); lm(fo, YearCountry) to include all variables and the interactions of countries and year as well as an intercept.

Plus <- function(x) paste(x, collapse = "+")
nms <- names(YearCountry)[-1]
spl <- sapply(split(nms, nchar(nms)), Plus)

s <- sprintf("~ (%s):(%s)+0", spl["2"], spl["6"])
fo <- formula(s)

model.matrix(fo, YearCountry)

giving this matrix:

   AL:Y2000d AL:Y2001d AL:Y2002d AL:Y2003d FR:Y2000d FR:Y2001d FR:Y2002d FR:Y2003d UK:Y2000d UK:Y2001d UK:Y2002d UK:Y2003d
1          1         0         0         0         0         0         0         0         0         0         0         0
2          0         1         0         0         0         0         0         0         0         0         0         0
3          0         0         1         0         0         0         0         0         0         0         0         0
4          0         0         0         1         0         0         0         0         0         0         0         0
5          0         0         0         0         1         0         0         0         0         0         0         0
6          0         0         0         0         0         1         0         0         0         0         0         0
7          0         0         0         0         0         0         1         0         0         0         0         0
8          0         0         0         0         0         0         0         1         0         0         0         0
9          0         0         0         0         0         0         0         0         1         0         0         0
10         0         0         0         0         0         0         0         0         0         1         0         0
11         0         0         0         0         0         0         0         0         0         0         1         0
12         0         0         0         0         0         0         0         0         0         0         0         1
attr(,"assign")
 [1]  1  2  3  4  5  6  7  8  9 10 11 12

Alternately we can write it compactly like this:

Plus <- function(x) paste(x, collapse = "+")
nms <- names(YearCountry)
s <- sprintf("~ (%s):(%s)+0", Plus(nms[2:4]), Plus(nms[5:8]))
fo <- formula(s)
model.matrix(fo, YearCountry)

2) eList Another approach is to use list comprehensions. With the eList package we can do this:

library(eList)
DF(for(i in YearCountry[2:4]) for(j in YearCountry[5:8]) i*j)

giving this data frame. Use as.matrix(...) on it if you want a matrix.

   AL.Y2000d AL.Y2001d AL.Y2002d AL.Y2003d FR.Y2000d FR.Y2001d FR.Y2002d FR.Y2003d UK.Y2000d UK.Y2001d UK.Y2002d UK.Y2003d
1          1         0         0         0         0         0         0         0         0         0         0         0
2          0         1         0         0         0         0         0         0         0         0         0         0
3          0         0         1         0         0         0         0         0         0         0         0         0
4          0         0         0         1         0         0         0         0         0         0         0         0
5          0         0         0         0         1         0         0         0         0         0         0         0
6          0         0         0         0         0         1         0         0         0         0         0         0
7          0         0         0         0         0         0         1         0         0         0         0         0
8          0         0         0         0         0         0         0         1         0         0         0         0
9          0         0         0         0         0         0         0         0         1         0         0         0
10         0         0         0         0         0         0         0         0         0         1         0         0
11         0         0         0         0         0         0         0         0         0         0         1         0
12         0         0         0         0         0         0         0         0         0         0         0         1

3) listcompr listcompr is another list comprehension package. Note that the development version of this package is needed in order to use bycol=. Replace gen.named.matrix with gen.named.data.frame if you want a data frame.

# devtools::github_github("patrickroocks/listcompr")
library(listcompr)

nms <- names(YearCountry)
gen.named.matrix("{nms[i]}.{nms[j]}", YearCountry[[i]] * YearCountry[[j]],
  i = 2:4, j = 5:8, bycol = TRUE)
G. Grothendieck
  • 254,981
  • 17
  • 203
  • 341