0

I am working on a drake workflow that is defined as follows:

# Branching inputs. The three vectors are meant to be read position by
# position: the i-th project name goes with the i-th normalize / logTransform
# flag.
projectName <- c(
  "lake_2018_CER_lib_norm_log2",
  "lake_2018_CER_lib_norm",
  "lake_2018_CER_raw_counts"
)
# Whether the expression matrix is normalized in each variant.
normalize <- c(TRUE, TRUE, FALSE)
# Whether the expression matrix is log2-transformed in each variant.
logTransform <- c(TRUE, FALSE, FALSE)

#' Scale each column of an expression table by its column total
#'
#' @param datExpr A numeric matrix or data frame; columns are divided by
#'   their own sums.
#' @return An object of the same shape as `datExpr` in which every column
#'   has been divided by `colSums(datExpr)`.
normalize_fxn <- function(datExpr) {
  column_totals <- colSums(datExpr)
  sweep(datExpr, MARGIN = 2, STATS = column_totals, FUN = "/")
}

# Static-branching drake plan: read the expression table, optionally
# normalize it, optionally log2-transform it, derive a minimum-cell threshold,
# and call realVsPermCor() once per resulting combination.
plan <- drake_plan(
  
  # Read the tab-separated file at `filePath` and drop its first column.
  # `.id = FALSE` keeps the target name as plain `datExpr`.
  datExpr = target(fread(file_in(filePath), sep = "\t") %>% select(-1), transform = map(filePath = !!filePath, .id = FALSE)),
  # One target per element of `normalize`: when TRUE, divide by column sums
  # (normalize_fxn), multiply by 1e6 and add 1; otherwise pass datExpr through.
  datExprNorm = target(if(normalize == TRUE) {normalize_fxn(datExpr)*1e6 + 1} else {datExpr}, transform = map(datExprNorm = datExpr, normalize = !!normalize)),
  # map() pairs each datExprNorm target with the matching element of
  # `logTransform` (element-wise, not a full cross).
  # NOTE(review): the normalized branch above already applies `*1e6 + 1`, and
  # this branch applies `*1e6 + 1` again inside log2() — confirm that the
  # double scaling is intended.
  datExprLog = target(if(logTransform == TRUE) {log2(datExprNorm*1e6 + 1)} else {datExprNorm}, transform = map(datExprNorm, logTransform = !!logTransform)),
  # cross() pairs every datExprLog target with every value of `percentCells`.
  # A numeric value yields round(ncol * percentCells); anything else yields NULL.
  filterGenesMinCells = target(if(is.numeric(percentCells)) {round(ncol(datExprLog)*percentCells)} else {NULL}, transform = cross(datExprLog, percentCells = !!percentCells)),
  # Final step: one realVsPermCor() call per filterGenesMinCells target and
  # per `featureType` value; `.id` restricts the target-name suffix to
  # featureType and percentCells.
  makePlots = target(realVsPermCor(datExpr = datExprLog,
                                   # `projectName` is not a grouping variable here, so every branch
                                   # receives the same symbol rather than a per-branch name.
                                   projectName = projectName,
                                   featureType = featureType,
                                   nPerms = 100,
                                   subsampleReal = NULL,
                                   resampleReal = NULL,
                                   # subsamplePerm, fdrSubsample and outDir are passed positionally and
                                   # are not defined in the visible code — presumably globals; verify.
                                   subsamplePerm,
                                   filterGenesMinCells = filterGenesMinCells,
                                   filterCellsMinGenes = NULL,
                                   fdrSubsample,
                                   futureThreads = NULL,
                                   openBlasThreads = 10,
                                   outDir),
                     transform = cross(filterGenesMinCells, featureType = !!featureType, .id = c(featureType, percentCells)))
)

The target output looks like this:

> plan$target
 [1] "datExpr"                                                              "datExprLog_TRUE_datExprNorm_TRUE_datExpr"                            
 [3] "datExprLog_FALSE_datExprNorm_TRUE_datExpr_2"                          "datExprLog_FALSE_datExprNorm_FALSE_datExpr"                          
 [5] "datExprNorm_TRUE_datExpr"                                             "datExprNorm_TRUE_datExpr_2"                                          
 [7] "datExprNorm_FALSE_datExpr"                                            "filterGenesMinCells_NULL_datExprLog_TRUE_datExprNorm_TRUE_datExpr"   
 [9] "filterGenesMinCells_0.01_datExprLog_TRUE_datExprNorm_TRUE_datExpr"    "filterGenesMinCells_0.02_datExprLog_TRUE_datExprNorm_TRUE_datExpr"   
[11] "filterGenesMinCells_NULL_datExprLog_FALSE_datExprNorm_TRUE_datExpr_2" "filterGenesMinCells_0.01_datExprLog_FALSE_datExprNorm_TRUE_datExpr_2"
[13] "filterGenesMinCells_0.02_datExprLog_FALSE_datExprNorm_TRUE_datExpr_2" "filterGenesMinCells_NULL_datExprLog_FALSE_datExprNorm_FALSE_datExpr" 
[15] "filterGenesMinCells_0.01_datExprLog_FALSE_datExprNorm_FALSE_datExpr"  "filterGenesMinCells_0.02_datExprLog_FALSE_datExprNorm_FALSE_datExpr" 
[17] "makePlots_gene_NULL"                                                  "makePlots_cell_NULL"                                                 
[19] "makePlots_gene_0.01"                                                  "makePlots_cell_0.01"                                                 
[21] "makePlots_gene_0.02"                                                  "makePlots_cell_0.02"                                                 
[23] "makePlots_gene_NULL_2"                                                "makePlots_cell_NULL_2"                                               
[25] "makePlots_gene_0.01_2"                                                "makePlots_cell_0.01_2"                                               
[27] "makePlots_gene_0.02_2"                                                "makePlots_cell_0.02_2"                                               
[29] "makePlots_gene_NULL_3"                                                "makePlots_cell_NULL_3"                                               
[31] "makePlots_gene_0.01_3"                                                "makePlots_cell_0.01_3"                                               
[33] "makePlots_gene_0.02_3"                                                "makePlots_cell_0.02_3"                                               

This is very close to what I want, but what I'm stuck on is the projectName: I want one of three project names to be used for the final target, depending on whether the input, produced in earlier steps, was normalized and/or log transformed.

Currently, I produce 18 targets, so I want each project name to be mapped to 6 of the targets.

Is there some way I can accomplish this?

1 Answer

1

Seems like you could write a function to accept normalization and log transform settings and output the name of the project. Sketch below.

Static branching in drake is hard. In drake's successor, targets, I try to make both kinds of branching easier. (Might not be feasible to make the switch mid-project though.)

library(drake)

# Placeholder inputs so the example plan can be built without real data.
filePath <- "file_path.txt"
percentCells <- "percent_cells"
featureType <- "feature_type"

# Branching flags, read position by position: the i-th normalize value goes
# with the i-th logTransform value.
normalize <- c(TRUE, TRUE, FALSE)
logTransform <- c(TRUE, FALSE, FALSE)
#' Divide every column of `datExpr` by that column's sum
#'
#' @param datExpr A numeric matrix or data frame.
#' @return `datExpr` with each column divided by its column sum.
normalize_fxn <- function(datExpr) {
  sweep(datExpr, MARGIN = 2, STATS = colSums(datExpr), FUN = "/")
}

#' Map normalization / log-transform settings to a project name
#'
#' @param normalize Length-one value whose text form is "TRUE" or "FALSE"
#'   (a logical or its deparsed string).
#' @param log_transform Length-one value whose text form is "TRUE" or "FALSE".
#' @return The project name (a single string) for the given combination.
#' @details Only three combinations are mapped. Any other combination (for
#'   example log-transforming without normalizing) raises an error instead of
#'   silently returning `NULL`, which would otherwise be passed on as the
#'   project name.
name_project <- function(normalize, log_transform) {
  key <- paste0(normalize, "_", log_transform)
  project <- switch(
    key,
    TRUE_TRUE = "lake_2018_CER_lib_norm_log2",
    TRUE_FALSE = "lake_2018_CER_lib_norm",
    FALSE_FALSE = "lake_2018_CER_raw_counts"
  )
  # switch() yields NULL when no branch matches; surface that explicitly.
  if (is.null(project)) {
    stop(
      "No project name defined for normalize = ", normalize,
      ", log_transform = ", log_transform,
      call. = FALSE
    )
  }
  project
}

# Static-branching drake plan: read the expression table, optionally
# normalize it, optionally log2-transform it, derive a minimum-cell threshold,
# and call realVsPermCor() once per resulting combination, with a project name
# derived from the branch's normalize / logTransform values.
plan <- drake_plan(
  # Read the tab-separated file at `filePath` and drop its first column.
  # `.id = FALSE` keeps the target name as plain `datExpr`.
  datExpr = target(fread(file_in(filePath), sep = "\t") %>% select(-1), transform = map(filePath = !!filePath, .id = FALSE)),
  # One target per element of `normalize`: when TRUE, divide by column sums
  # (normalize_fxn), multiply by 1e6 and add 1; otherwise pass datExpr through.
  datExprNorm = target(if(normalize == TRUE) {normalize_fxn(datExpr)*1e6 + 1} else {datExpr}, transform = map(datExpr, normalize = !!normalize)),
  # map() pairs each datExprNorm target with the matching element of
  # `logTransform` (element-wise, not a full cross).
  # NOTE(review): the normalized branch above already applies `*1e6 + 1`, and
  # this branch applies `*1e6 + 1` again inside log2() — confirm that the
  # double scaling is intended.
  datExprLog = target(if(logTransform == TRUE) {log2(datExprNorm*1e6 + 1)} else {datExprNorm}, transform = map(datExprNorm, logTransform = !!logTransform)),
  # cross() pairs every datExprLog target with every value of `percentCells`.
  # A numeric value yields round(ncol * percentCells); anything else yields NULL.
  filterGenesMinCells = target(if(is.numeric(percentCells)) {round(ncol(datExprLog)*percentCells)} else {NULL}, transform = cross(datExprLog, percentCells = !!percentCells)),
  # Final step: one realVsPermCor() call per filterGenesMinCells target and
  # per `featureType` value.
  makePlots = target(
    realVsPermCor(
      datExpr = datExprLog,
      # The project name is a function of normalization and log transform.
      # Presumably drake substitutes the branch's `normalize` / `logTransform`
      # values into the command before `!!` is evaluated, so name_project()
      # receives their deparsed text ("TRUE" / "FALSE") — verify with the
      # generated commands.
      projectName = !!name_project(deparse(substitute(normalize)), deparse(substitute(logTransform))),
      featureType = featureType,
      nPerms = 100,
      subsampleReal = NULL,
      resampleReal = NULL,
      # subsamplePerm, fdrSubsample and outDir are passed positionally and are
      # not defined in the visible code — presumably globals; verify.
      subsamplePerm,
      filterGenesMinCells = filterGenesMinCells,
      filterCellsMinGenes = NULL,
      fdrSubsample,
      futureThreads = NULL,
      openBlasThreads = 10,
      outDir
    ),
    # `.id` restricts the target-name suffix to featureType and percentCells.
    transform = cross(filterGenesMinCells, featureType = !!featureType, .id = c(featureType, percentCells))
  )
)

# Show only the commands generated for the makePlots targets.
plan$command[grepl("makePlots", plan$target)]
#> [[1]]
#> realVsPermCor(datExpr = datExprLog_TRUE_datExprNorm_TRUE_datExpr, 
#>     projectName = "lake_2018_CER_lib_norm_log2", featureType = "feature_type", 
#>     nPerms = 100, subsampleReal = NULL, resampleReal = NULL, 
#>     subsamplePerm, filterGenesMinCells = filterGenesMinCells_percent_cells_datExprLog_TRUE_datExprNorm_TRUE_datExpr, 
#>     filterCellsMinGenes = NULL, fdrSubsample, futureThreads = NULL, 
#>     openBlasThreads = 10, outDir)
#> 
#> [[2]]
#> realVsPermCor(datExpr = datExprLog_FALSE_datExprNorm_TRUE_datExpr_2, 
#>     projectName = "lake_2018_CER_lib_norm", featureType = "feature_type", 
#>     nPerms = 100, subsampleReal = NULL, resampleReal = NULL, 
#>     subsamplePerm, filterGenesMinCells = filterGenesMinCells_percent_cells_datExprLog_FALSE_datExprNorm_TRUE_datExpr_2, 
#>     filterCellsMinGenes = NULL, fdrSubsample, futureThreads = NULL, 
#>     openBlasThreads = 10, outDir)
#> 
#> [[3]]
#> realVsPermCor(datExpr = datExprLog_FALSE_datExprNorm_FALSE_datExpr, 
#>     projectName = "lake_2018_CER_raw_counts", featureType = "feature_type", 
#>     nPerms = 100, subsampleReal = NULL, resampleReal = NULL, 
#>     subsamplePerm, filterGenesMinCells = filterGenesMinCells_percent_cells_datExprLog_FALSE_datExprNorm_FALSE_datExpr, 
#>     filterCellsMinGenes = NULL, fdrSubsample, futureThreads = NULL, 
#>     openBlasThreads = 10, outDir)

Created on 2021-01-12 by the reprex package (v0.3.0)

landau
  • 5,636
  • 1
  • 22
  • 50
  • Beautiful, thank you so much! One question: how can it be that the name_project output maps to the correct datExprLog transformation? In other words, I just want to clarify that (I think) it's up to me to configure the output produced by the function to be in the same order that the variations of datExprLog are being fed into the function for the final target (as opposed to the workflow "knowing" the order these variations were produced)? – Rebecca Eliscu Jan 13 '21 at 00:55
  • 1
    If I am tracking, I think it is up to the user to make sure `datExprLog` matches up with `normalize` and `logTransform`. Even if you are reasonably confident, I always recommend checking the correctness of your branching using `vis_drake_graph(plan)` or `plot(plan)`. – landau Jan 13 '21 at 01:04
  • Gotcha. Thanks again! – Rebecca Eliscu Jan 13 '21 at 01:07