I have a dataframe in R that looks something like this:
library(tibble)
sample <- tribble(~subj, ~session,
"A", 1,
"A", 2,
"A", 3,
"B", 1,
"B", 2,
"C", 1,
"C", 2,
"C", 3,
"C", 4)
As you can see from this example, there are a number of sessions for each subject, but subjects do not all have the same number of sessions. There are 94 rows in my real dataset (5 subjects, between 15 and 20 different sessions each).
I have another script that takes my main dataset (a set of linguistic data with detailed phonetic features for each subject in each session, with almost 200,000 rows) and filters by subject and session to create a distance matrix showing Euclidean distances between the different words. I can't replicate it here for practical reasons, but have created an example script here:
library(tibble)
library(dplyr) # needed below for %>%, filter, select, group_by, summarize
data <- tribble(~subj, ~session, ~Target, ~S1C1_target, # S1C1 = syllable 1, consonant 1
~S1C1_T.Sonorant, ~S1C1_T.Consonantal, # _T. = target consonant of S1C1
~S1C1_T.Voice, ~S1C1_T.Nasal, ~S1C1_T.Degree, # .Voice/.Nasal/etc are phonetic
# properties of the target word
"A", 1, "electricity", "i", 0, 0, 0, 0, 0,
"A", 1, "hectic", "h", 0.8, 0, 1, 0, 0,
"A", 1, "pillow", "p", -1, 1, -1, 0, 0,
"A", 2, "hello", "h", -0.5, 1, 0, -1, 0,
"A", 2, "cup", "k", 0.8, 0, 1, 0, 0,
"A", 2, "exam", "e", 0, 0, 0, 0, 0,
"B", 1, "wug", "w", 0.8, 0, 1, 0, 0,
"B", 1, "wug", "w", 0.8, 0, 1, 0, 0,
"B", 1, "hug", "h", 0.8, 0, 1, 0, 0,
"B", 2, "wug", "w", -0.5, 1, 0, -1, 0,
"B", 2, "well", "w", 0.8, 0, 1, 0, 0,
"B", 2, "what", "w", 0.8, 0, 1, 0, 0)
I want to start by creating a subset of the data for each subject in each session. Sometimes a participant has more than one token of the same word in `Target`, so I also average over repeated tokens here:
matrixA1 <- data %>% # name the data after the subj and session name/number
filter(subj == "A" & session == 1) %>%
dplyr::select(-subj, -session) %>% # leave only the numeric values + `Target`
group_by(Target) %>%
summarize(across(everything(), mean)) # average across targets with more than one token
##### Calculate the (squared) Euclidean distance between each phonetic property of each S1C1 target consonant
ones <- rep(1, nrow(matrixA1)) # column of ones, used to build pairwise difference matrices via outer products
Son.mat.S1C1_T <- matrixA1$S1C1_T.Sonorant %*% t(ones) - ones %*% t(matrixA1$S1C1_T.Sonorant)
rownames(Son.mat.S1C1_T) <- matrixA1$Target
colnames(Son.mat.S1C1_T) <- matrixA1$Target
colnames(Son.mat.S1C1_T) <- paste(colnames(Son.mat.S1C1_T), "Son.S1C1_T", sep = "_")
Son.mat.S1C1_T <- Son.mat.S1C1_T^2
Con.mat.S1C1_T <- matrixA1$S1C1_T.Consonantal %*% t(ones) - ones %*% t(matrixA1$S1C1_T.Consonantal)
rownames(Con.mat.S1C1_T) <- matrixA1$Target
colnames(Con.mat.S1C1_T) <- matrixA1$Target
colnames(Con.mat.S1C1_T) <- paste(colnames(Con.mat.S1C1_T), "Con.S1C1_T", sep = "_")
Con.mat.S1C1_T <- Con.mat.S1C1_T^2
Voice.mat.S1C1_T <- matrixA1$S1C1_T.Voice %*% t(ones) - ones %*% t(matrixA1$S1C1_T.Voice)
rownames(Voice.mat.S1C1_T) <- matrixA1$Target
colnames(Voice.mat.S1C1_T) <- matrixA1$Target
colnames(Voice.mat.S1C1_T) <- paste(colnames(Voice.mat.S1C1_T), "Voice.S1C1_T", sep = "_")
Voice.mat.S1C1_T <- Voice.mat.S1C1_T^2
Nasal.mat.S1C1_T <- matrixA1$S1C1_T.Nasal %*% t(ones) - ones %*% t(matrixA1$S1C1_T.Nasal)
rownames(Nasal.mat.S1C1_T) <- matrixA1$Target
colnames(Nasal.mat.S1C1_T) <- matrixA1$Target
colnames(Nasal.mat.S1C1_T) <- paste(colnames(Nasal.mat.S1C1_T), "Nasal.S1C1_T", sep = "_")
Nasal.mat.S1C1_T <- Nasal.mat.S1C1_T^2 # square this one too, matching the other features
S1C1.1A <- Son.mat.S1C1_T +
Con.mat.S1C1_T +
Voice.mat.S1C1_T +
Nasal.mat.S1C1_T
colnames(S1C1.1A) <- gsub("_Son.S1C1_T", "", colnames(S1C1.1A), fixed = TRUE)
This creates a matrix that looks something like this:
electricity hectic pillow
electricity 0.00 1.64 3.00
hectic 1.64 0.00 8.24
pillow 3.00 8.24 0.00
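For reference, I believe the same matrix can be reproduced much more compactly with base R's `dist()`, by squaring its Euclidean output over the numeric feature columns. A minimal sketch using the subject-A, session-1 values from above (rebuilt here in base R so it runs on its own):

```r
# Stand-in for matrixA1 above (subject A, session 1), base R only
matrixA1 <- data.frame(
  Target             = c("electricity", "hectic", "pillow"),
  S1C1_T.Sonorant    = c(0, 0.8, -1),
  S1C1_T.Consonantal = c(0, 0, 1),
  S1C1_T.Voice       = c(0, 1, -1),
  S1C1_T.Nasal       = c(0, 0, 0)
)

# dist() computes Euclidean distances across the numeric feature columns;
# squaring reproduces the per-feature sums of squared differences above
feat <- as.matrix(matrixA1[, -1])
rownames(feat) <- matrixA1$Target
sq <- as.matrix(dist(feat, method = "euclidean"))^2
round(sq, 2)
#             electricity hectic pillow
# electricity        0.00   1.64   3.00
# hectic             1.64   0.00   8.24
# pillow             3.00   8.24   0.00
```

This avoids building one matrix per feature by hand, though it assumes all and only the numeric columns should enter the distance.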
As you can see, this code is already quite long, and the real code is a great deal longer. I know that a loop of some kind will be the best way to deal with it, but I can't figure out how to write it. What I would like it to do is this:

- For each row in `sample`, create a dataframe named with `subj` and `session` as identifiers.
- For each of these dataframes, run the script above from `#####` onward, to create a matrix for each subject and each session, like the one shown above.

To do this, I think the best way is to embed the script in a for-loop and run it once for each row in `sample`.
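In case it helps clarify what I'm after, here is a rough sketch of the kind of thing I have in mind. The function name `build_matrix` and the toy columns are made up, and the distance step is sketched with `dist()` rather than my full per-feature script; the idea is just to split by subject and session, average repeated tokens, and collect one matrix per combination:

```r
# Hypothetical wrapper for everything from the ##### line onward: takes the
# summarized per-subject/per-session data and returns the squared-distance matrix
build_matrix <- function(m) {
  feat <- as.matrix(m[, sapply(m, is.numeric), drop = FALSE])
  rownames(feat) <- m$Target
  as.matrix(dist(feat))^2
}

# Toy stand-in for `data` above, base R only (column names are illustrative)
data <- data.frame(
  subj            = c("A", "A", "A", "B", "B", "B"),
  session         = c(1, 1, 1, 2, 2, 2),
  Target          = c("electricity", "hectic", "pillow", "wug", "wug", "well"),
  S1C1_T.Sonorant = c(0, 0.8, -1, 0.8, 0.8, 0.8),
  S1C1_T.Voice    = c(0, 1, -1, 1, 0, 1)
)

# One dataframe per subject-session combination ("A1", "B2", ...);
# average repeated tokens of the same Target, then build each matrix
pieces <- split(data, paste0(data$subj, data$session))
matrices <- lapply(pieces, function(d) {
  summarized <- aggregate(d[, -(1:3)], by = list(Target = d$Target), FUN = mean)
  build_matrix(summarized)
})
matrices[["A1"]] # matrix for subject A, session 1
#             electricity hectic pillow
# electricity        0.00   1.64   2.00
# hectic             1.64   0.00   7.24
# pillow             2.00   7.24   0.00
```

Storing the results in a named list (`matrices[["A1"]]`, `matrices[["B2"]]`, ...) rather than creating loose variables like `matrixA1` with `assign()` seems like it would be easier to work with downstream, but I'm not sure how to adapt my full script into this shape.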