0

I am new to R and I don't quite know what structures to use and the correct syntax for them.

I have lists (that are more like tables with columns and column names). I would like to apply the same operations to multiple lists. I assumed for loops would be reasonable to use.

My functions are

1) use a column to calculate a new column. (calculate fold change from log2foldchange)

2) make a new list using a subset of the old list and name it adjusting the name of the original list name

Here are the lines of code that worked for these tables individually.

# Aim 1: derive the linear fold change from the log2FoldChange column.
resCondition_anno$FoldChange <- 2^resCondition_anno$log2FoldChange

# Aim 2: keep only the rows whose adjusted p-value (padj) is at most 0.05.
# The condition uses the bare column name, so it is evaluated inside
# resCondition_anno itself instead of against the separate `resCondition`
# object; the filter therefore cannot be misaligned with the rows being
# subset. subset() also drops rows where the condition is NA.
resCondition_anno_padj05 <- subset(resCondition_anno, padj <= 0.05)

I would like to do these functions to multiple tables.

When I tried to do it in a for loop

# Attempt to run both steps over several tables in one loop (does not work).
# NOTE: c() concatenates the tables as lists, producing one flat list of
# their columns rather than a list of three tables; list() keeps each table
# as a single element.
resfiles1 <- c(resCondition_anno,resVirus_anno,resInter_anno)

# Because of the c() call above, each `i` is a single column (an atomic
# vector), not a table. `i` is also a copy, so assigning to it never changes
# the original objects.
for (i in resfiles1){

 # Fails with "$ operator is invalid for atomic vectors" since `i` is a column.
 i$FoldChange <- 2^i$log2FoldChange # I was trying to calculate a new column based on log2FoldChange column

  # paste() joins the *values* held in `i` with "_padj05" (separated by a
  # space); it does not build a variable name, and the result is overwritten
  # by the next line anyway.
  i_with_padj05 <- paste(i,"_padj05") # I was trying to create a new name like resCondition_anno_padj05
  # Unquoted `padj` is looked up as an object, hence "object 'padj' not found";
  # a column name must be given as a string, e.g. i[["padj"]].
  i_with_padj05 <- subset(i, i[[padj]] <= 0.05) # I was trying to subset my dataset based on values in the padj column
}

I tried to access the columns of my tables with $ and that gave me

Error: $ operator is invalid for atomic vectors

I tried to access the columns of my tables with [padj], I get

Error in subset.default(i, i[padj] <= 0.05) : object 'padj' not found

When I tried to access the columns of my table with `[[padj]]`, I got the following error

Error in subset.default(i, i[[padj]] <= 0.05) : object 'padj' not found

Am I going about this completely the wrong way? Are for loops a reasonable way to approach my goals? I know the apply functions exist, but I had such a hard time getting output files out of them when I tried to input multiple files that I wanted to give for loops a try.

I would appreciate some code that works for an arbitrary table and does these things; then I can figure out whether my tables are weird.

 dput(head(resCondition_anno))
structure(list(ensembl = c("ENSMUSG00000051951", "ENSMUSG00000102331", 
"ENSMUSG00000025902", "ENSMUSG00000104238", "ENSMUSG00000102269", 
"ENSMUSG00000096126"), baseMean = c(2.34691358937965, 0.169507902147731, 
49.4591642836684, 0.253911076708937, 3.27439052075304, 0.258178295608587
), log2FoldChange = c(1.04699290132002, 1.89907052894015, 0.629095304499277, 
0.0597400040882164, -0.291997327218544, 1.97984690635658), lfcSE = c(1.09309963258445, 
4.36961772602319, 0.291712394209747, 4.37647193807779, 1.21524080418346, 
4.3263845102792), stat = c(0.95782019324678, 0.434607933236415, 
2.15656008104662, 0.0136502655411644, -0.240279396654017, 0.457621577937096
), pvalue = c(0.338153434807336, 0.66384703564954, 0.0310399577136823, 
0.989109002094381, 0.810113666298446, 0.647224338296786), padj = c(NA, 
NA, 0.106540309680362, NA, 0.911344697137259, NA), mgi_symbol = c("Xkr4", 
"Gm19938", "Sox17", "Gm37587", "Gm7357", "Gm22307"), gene_biotype = c("protein_coding", 
"sense_intronic", "protein_coding", "processed_transcript", "processed_pseudogene", 
"snRNA")), class = c("data.table", "data.frame"), row.names = c(NA, 
-6L), .internal.selfref = <pointer: 0x0000027bef7e1ef0>)

Expected results for the aim 1

> dput(head(resCondition_anno))
structure(list(ensembl = c("ENSMUSG00000051951", "ENSMUSG00000102331", 
"ENSMUSG00000025902", "ENSMUSG00000104238", "ENSMUSG00000102269", 
"ENSMUSG00000096126"), baseMean = c(2.34691358937965, 0.169507902147731, 
49.4591642836684, 0.253911076708937, 3.27439052075304, 0.258178295608587
), log2FoldChange = c(1.04699290132002, 1.89907052894015, 0.629095304499277, 
0.0597400040882164, -0.291997327218544, 1.97984690635658), lfcSE = c(1.09309963258445, 
4.36961772602319, 0.291712394209747, 4.37647193807779, 1.21524080418346, 
4.3263845102792), stat = c(0.95782019324678, 0.434607933236415, 
2.15656008104662, 0.0136502655411644, -0.240279396654017, 0.457621577937096
), pvalue = c(0.338153434807336, 0.66384703564954, 0.0310399577136823, 
0.989109002094381, 0.810113666298446, 0.647224338296786), padj = c(NA, 
NA, 0.106540309680362, NA, 0.911344697137259, NA), mgi_symbol = c("Xkr4", 
"Gm19938", "Sox17", "Gm37587", "Gm7357", "Gm22307"), gene_biotype = c("protein_coding", 
"sense_intronic", "protein_coding", "processed_transcript", "processed_pseudogene", 
"snRNA"), FoldChange = c(2.0662186086592, 3.72972827627808, 1.54659483966075, 
1.0422779093498, 0.816770504282921, 3.94451221821964)), class = c("data.table", 
"data.frame"), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x0000027bef7e1ef0>)

Expected results for aim2

> dput(head(resCondition_anno_padj05))
structure(list(ensembl = c("ENSMUSG00000103922", "ENSMUSG00000025907", 
"ENSMUSG00000061024", "ENSMUSG00000025911", "ENSMUSG00000025935", 
"ENSMUSG00000025937"), baseMean = c(7.45083924607695, 1035.42915800337, 
756.089939474399, 1510.50670239711, 2014.55644970672, 5206.99654662079
), log2FoldChange = c(3.31157886392159, -0.345358245876914, 0.340037961752993, 
-0.637902858828505, 0.592795289538968, 0.59912370697665), lfcSE = c(0.984296895396084, 
0.131191642000487, 0.0967702378760271, 0.120687031774959, 0.114283891072725, 
0.161639505766009), stat = c(3.36441055479404, -2.63247140298489, 
3.51386923517349, -5.28559572181691, 5.18704153292907, 3.70654255676794
), pvalue = c(0.000767073434065771, 0.00847661586751943, 0.000441630160084079, 
1.25296333033368e-07, 2.13661093734535e-07, 0.000210107944374613
), padj = c(0.00522376704325313, 0.0385092726153939, 0.00325683272694307, 
2.17721401368104e-06, 3.51690667040699e-06, 0.00168321660710376
), mgi_symbol = c("Gm6123", "Rb1cc1", "Rrs1", "Adhfe1", "Tram1", 
"Lactb2"), gene_biotype = c("processed_pseudogene", "protein_coding", 
"protein_coding", "protein_coding", "protein_coding", "protein_coding"
), FoldChange = c(9.92852128160573, 0.787112498791522, 1.26578990036559, 
0.642646438673565, 1.5081660610658, 1.51479619975327)), class = c("data.table", 
"data.frame"), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x0000027bef7e1ef0>)
  • Please add data using `dput` and show the expected output for the same. Please read the info about [how to ask a good question](http://stackoverflow.com/help/how-to-ask) and how to give a [reproducible example](http://stackoverflow.com/questions/5963269). – Ronak Shah May 26 '20 at 06:54
  • The question is truly unclear. Regarding your statement 'I would like to do the same functions to multiple lists.', consider `lapply`. To access elements of a list use `[[ ]]`. – efz May 26 '20 at 06:55
  • I did head() instead of dput() and hopefully that is sufficient since the dataset is large. I tried to input them as code but I think I need to practice how to do that better. – lara.bideyan May 26 '20 at 07:33
  • l did not have luck with lapply and I didn't know whether I was asking it to do something it was not designed to do or I just didn't know how to do it. When I input multiple tables, I had a hard time returning each output table individually. The only way I figured it out was to specify the output for each file line by line. I wanted to give for-loops a try but I will learn more about lapply if for-loops are not intended to do my aims. I tried the [[ ]] and posted the error. – lara.bideyan May 26 '20 at 07:38
  • What is `padj` in `subset(i, i[[padj]] <= 0.05)`? As far as I can see, `i` can be one of `resCondition_anno,resVirus_anno,resInter_anno`. By doing `i[[padj]]` you are supposing that `i` is a list, right? And is `padj` an index? – efz May 26 '20 at 14:12
  • @efz `padj` is a column name common across my datasets (resCondition_anno, resVirus etc) Yes I was trying to use the padj column in my dataset to subset the data into a new list/dataset. I was able to accomplish this outside of a loop with the following code. `resCondition_anno_padj05 <- subset(resCondition_anno, resCondition$padj <= 0.05)` – lara.bideyan May 26 '20 at 16:46
  • would you be able to provide an example code for how I would accomplish these goals using lapply? @efz – lara.bideyan May 26 '20 at 17:00
  • I see now. if `padj` is a column name, then you need to use quotes: `i[['padj']]`. see if it works – efz May 26 '20 at 17:57
  • @efz I got `Error in i[["padj"]] : subscript out of bounds` although I put ' rather than ". But I tried the same line without variables outside of a for-loop. It works `i_with_padj05 <- subset(resCondition_anno, resCondition_anno[['padj']] <= 0.05)` – lara.bideyan May 26 '20 at 18:45
  • maybe you can try some variation on the theme:`resfiles1 <- list(resCondition_anno,resVirus_anno,resInter_anno)` and then try `lapply(resfiles1, function(x){i_with_padj05 = subset(x, x[['padj']] <= 0.05) })` . sorry but it's difficult without actual data to work on. – efz May 26 '20 at 19:57
  • @efz `resfiles1 <- list(resCondition_anno,resVirus_anno,resInter_anno)` using list instead of c made a big difference. It actually created a new file names i_with_padj with the subsetted information. How can I make the name change for each table I am running this for. for resCondition_anno, I want the new subsetted file new to be resCondition_anno_with_padj and similar for resVirus_anno and resInter_anno. Currently, it keeps updating the i_with_padj for each table in the for loop. I tried "i"_with_anno but got the error `Error: unexpected input in: "for (i in resfiles1){ "i"_"` – lara.bideyan May 26 '20 at 21:19
  • @efz lapply function works in performing the action but I don't know how to get the individual results files back. I can save the lapply into something like `a <- lapply(...)` but then I only know to take the individual tables out by specifying each one as one line. Is there a straightforward way to do this? – lara.bideyan May 26 '20 at 21:24
  • @efz added my data in dput format. Is that helpful? – lara.bideyan May 26 '20 at 21:34

1 Answers1

0

for aim 1

library(dplyr)
# Build a second, deliberately different table so there is more than one data
# set to iterate over (its log2 fold changes are tripled).
resCondition_anno_dumb <- resCondition_anno
resCondition_anno_dumb$log2FoldChange <- resCondition_anno$log2FoldChange * 3

# Put your data frames in a *named* list: lapply() carries the names over to
# its result, so every processed table can be retrieved by name afterwards,
# e.g. new_list$resCondition_anno or new_list[["resCondition_anno_dumb"]].
list_t <- list(
  resCondition_anno = resCondition_anno,
  resCondition_anno_dumb = resCondition_anno_dumb
)

# lapply() calls the function once per list element and returns a list of the
# results; mutate() adds the FoldChange column to each table.
new_list <- lapply(list_t, function(x) {
  mutate(x, FoldChange = 2^log2FoldChange)
})

for aim 2 something like

# Keep, for every table in list_t, only the rows with padj of at most 0.05.
new_list <- lapply(list_t, function(tbl) {
  filter(tbl, padj <= 0.05)
})

or you can pipe them together:

# For every table in list_t: add the FoldChange column, then keep only the
# rows with padj of at most 0.05.
new_list <- lapply(list_t, function(tbl) {
  with_fold_change <- mutate(tbl, FoldChange = 2^log2FoldChange)
  filter(with_fold_change, padj <= 0.05)
})
Ruben Helsloot
  • 12,582
  • 6
  • 26
  • 49
efz
  • 425
  • 4
  • 9
  • I have not run the code yet but I understand what it does mostly. How do I get the individual dataframes that we modified/created? I saw someone do it as `new_list[1] -> resCondition_anno new_list[2] -> resVirus_anno ...` Is there a better way to get the each individual data frame at the end? – lara.bideyan May 27 '20 at 21:28
  • There are several ways to achieve that. First set the names of your list: `names(new_list) <- c('resCondition_anno', 'resCondition_anno_dumb')`, then `lapply(seq_along(new_list), function(i) assign(names(new_list)[i], new_list[[i]], envir = .GlobalEnv))` or `split(new_list, names(new_list))`; see for example [link](https://stackoverflow.com/questions/48238039/extracting-a-dataframe-from-a-list-over-many-objects) – efz May 28 '20 at 07:17