-1

I've written a custom function to read in a JSON file and extract all of the relevant info that I need, with the goal of running it over all files in a directory. I've created a character vector of all the files and using sapply/lapply, I've been able to run the function like below.

setwd("/directory/")
file.list=dir()
sapply(file.list,function)

For some reason during execution, it reaches a point where it outputs the following and stops for no reason, shown below. I have warnings suppressed, and the only warnings that I've been getting are ones that I expect. Running the parser individually on all the failing files works, giving me the table I want to see.

#expected output
#expected output

$'filename'
[1]FALSE
$'filename'
[1]NULL

I've attached my parser here if it helps, I'm sure it's not optimized and there's better ways to do it, but speed is not a primary concern here. Thanks in advance!

library(jsonlite)
library(data.table)
library(dplyr)
library(plyr)
library(stringr)
library(tidyr)

trialParse=function(filename){
  options(warn=-1)
  options(max.print=99999)
  parsefile=read_json(filename)
  title=str_remove(basename(filename),".json")
  #cat(sprintf("Schema Version is %s\n", parsefile$metadata$schemaVersion))
    if (parsefile$report$workflow$reportType!="DNA"){
      #cat(sprintf("%s report is not DNA, moved\n",title))
      from=paste("~/JSON_parsing/workingFiles/",title,".json",sep='')
      to=paste("~/JSON_parsing/failedFiles/",title,".json",sep='')
      file.rename(from=from,to=to)
    }else{
      #cat(sprintf("%s\n",title))
      
      #Extract report info
      if (exists("report",parsefile)&&length(parsefile$report)!=0){
        reportData=rbindlist(list(parsefile$report,parsefile$report$workflow),fill=TRUE)
        reportData$workflow=NULL
        reportData[is.na(reportData)]=""
        reportData=reportData%>%
          unique()%>%
          summarize_all(funs(trimws(paste(.,collapse=''))))
        #assign(paste("reportData_",title,sep=''),reportData,envir=.GlobalEnv)
      }else{
        print("No report info")
      }
      
      #Extract patient info  
      if(exists("patient",parsefile)&&length(parsefile$patient)!=0){
        patientData=as.data.frame(t(unlist(parsefile$patient)))
        #assign(paste("patientData_",title,sep=''),patientData,envir=.GlobalEnv)
      }else{
        print("No patient info")
      }
      
      #Extract order info
      if(exists("report",parsefile)&&length(parsefile$report)!=0){
        orderData=rbindlist(list(parsefile$order,parsefile$order$test),fill=TRUE)
        orderData$test=NULL
        orderData[is.na(orderData)]=""
        orderData=orderData%>%
          unique()%>%
          summarize_all(funs(trimws(paste(.,collapse=''))))
        #assign(paste("orderData_",title,sep=''),orderData,envir=.GlobalEnv)
      }else{
        print("No order info")
      }
      
      #Extract specimens info
      if(exists("specimens",parsefile)&&length(parsefile$specimens)!=0){
        specimens=list()
        for(i in 1:length(parsefile$specimens)){
          specimens[[i]]=as.data.frame(t(unlist(parsefile$specimens[[i]])))
        }
        specimensData=do.call(rbind.fill,specimens)%>%
          unique()
        #assign(paste("specimensData_",title,sep=''),specimensData,envir=.GlobalEnv)
      }else{
        print("No specimens info")
      }
      
      #Extract   mutations info
      if (exists("results",parsefile)&&length(parsefile$results)!=0){
        #Tumor Mutational Burden
        if (length(parsefile$results$tumorMutationalBurden!=0)){
          tmbdata=as.data.frame(t(unlist(c(parsefile$results[1],parsefile$results[2],parsefile$results[3]))))
          #assign(paste("tmbData_",title,sep=''),tmbdata,envir=.GlobalEnv)
        }
        #Somatic Potentially Actionable Mutations
        if (exists("somaticPotentiallyActionableMutations",parsefile$results)&&length(parsefile$results$somaticPotentiallyActionableMutations)!=0){
          #Remove Therapies
          for (i in 1:length(parsefile$results$somaticPotentiallyActionableMutations)){
            for (j in 1:length(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants)){
              parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[[j]]$therapies=NULL
              }
            }
          mutations=list()
          variants=list()
          #Extract Somatic Potentially Actionable Mutations data
            #Per Entry
          for(i in 1:length(parsefile$results$somaticPotentiallyActionableMutations)){
              #Per Variants in Entry
            for(j in 1:length(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants)){
              variants[[j]]=rbindlist(list(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[j],parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[[j]]),fill=TRUE)
            }
              #Per Variants in Entry
            for (j in 1:length(variants)){
              variantsData=do.call(rbind,variants[j])
              mutationsList=parsefile$results$somaticPotentiallyActionableMutations[[i]]
              mutationsList$variants=NULL
              mutationTable=rbindlist(list(mutationsList,variantsData),fill=TRUE)
              mutations=append(mutations,list(rbindlist(list(mutationsList,variantsData),fill=TRUE)))
            }
          }
          #Build SPAMS table
          SPAMsData=do.call(rbind,mutations)
          SPAMsData$mutationEffect=NULL
          SPAMsData=SPAMsData%>%
            unique()%>%
            fill(c(1:4),.direction=c("down"))%>%
            fill(c(5:ncol(SPAMsData)),.direction=c("up"))%>%
            unique()%>%
            mutate(mutation_type="Somatic Potentially Actionable Mutation")
        }else{
          print("No SPAMS")
        }
        
        #Somatic Potentially Actionable Copy Number Variants
        if (exists("somaticPotentiallyActionableCopyNumberVariants",parsefile$results)&&length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)!=0){
          #Remove Therapies
          for (i in 1:length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)){
            parsefile$results$somaticPotentiallyActionableCopyNumberVariants[[i]]$therapies=NULL
          } 
          #Extract Somatic Potentially Actionable Copy Number Variants Data
          variants=list()
          for(i in 1:length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)){
            variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticPotentiallyActionableCopyNumberVariants[[i]])))
          }
          SPACNVsData=do.call(rbind,variants)%>%
            mutate(mutation_type="Somatic Potentially Actionable Copy Number Variants")
          rm(variants)
        }else{
          print("No SPACNVs")
        }
        
        #Somatic Biologically Relevant Variants
        if (exists("somaticBiologicallyRelevantVariants",parsefile$results)&&length(parsefile$results$somaticBiologicallyRelevantVariants)!=0){
          #Extract Data
          variants=list()
          for(i in 1:length(parsefile$results$somaticBiologicallyRelevantVariants)){
            variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticBiologicallyRelevantVariants[[i]])))
          }
          SBRVsData=do.call(rbind,variants)%>%
            mutate(mutation_type="Somatic Biologically Relevant Variants")
          rm(variants)
        }else{
          print("No SBRVs")
        }
        
        #Somatic Variants of Unknown Significance
        if (exists("somaticVariantsOfUnknownSignificance",parsefile$results)&&length(parsefile$results$somaticVariantsOfUnknownSignificance)!=0){
          #Extract Data
          variants=list()
          for(i in 1:length(parsefile$results$somaticVariantsOfUnknownSignificance)){
            variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticVariantsOfUnknownSignificance[[i]])))
          }
          SVUSsData=do.call(rbind,variants)%>%
            mutate(mutation_type="Somatic Variants of Unknown Significance")
          rm(variants)
        }else{
          print("No SVUSs")
        }
        
        #Fusion Variants
        if (exists("fusionVariants",parsefile$results)&&length(parsefile$results$fusionVariants)!=0){
          #Remove Therapies
          for (i in 1:length(parsefile$results$fusionVariants)){
            parsefile$results$fusionVariants[[i]]$therapies=NULL
          }
          #Extract Data
          variants=list()
          for(i in 1:length(parsefile$results$fusionVariants)){
            variants[[i]]=as.data.frame(t(unlist(parsefile$results$fusionVariants[[i]])))
          }
          FVsData=do.call(rbind,variants)%>%
            mutate(mutation_type="Fusion Variants")%>%
            dplyr::rename(gene5display=gene5Display)%>%
            dplyr::rename(gene3display=gene3Display)
          rm(variants)
        }else{
          #print("No FVs")
        }
        
        #Inherited Relevant Variants
        if (exists("inheritedRelevantVariants",parsefile$results)&&length(parsefile$results$inheritedRelevantVariants)!=0){
          #Extract Data
          if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
          variants=list()
          for(i in 1:length(parsefile$results$inheritedRelevantVariants)){
            variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedRelevantVariants[[i]])))
          }
          IRVsData=do.call(rbind,variants)%>%
            mutate(mutation_type="Inherited Relevant Variants")
          rm(variants)
          }else{
            print("No IRVs")
          }
        }else{
          print("No IRVs")
        }
        
        #Inherited Incidental Findings
        if (exists("inheritedIncidentalFindings",parsefile$results)&&length(parsefile$results$inheritedIncidentalFindings)!=0){
          #Extract Data
          if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
          variants=list()
          for(i in 1:length(parsefile$results$inheritedIncidentalFindings)){
            variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedIncidentalFindings[[i]])))
          }
          IIFsData=do.call(rbind,variants)%>%
            mutate(mutation_type="Inherited Incidental Findings")
          rm(variants)
          }else{
            print("No IIFs")
          }
        }else{
          print("No IIFs")
        }
        
        #Inherited Variants of Unknown Significance
        if (exists("inheritedVariantsOfUnknownSignificance",parsefile$results)&&length(parsefile$results$inheritedVariantsOfUnknownSignificance)!=0){
          #Extract Data
          if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
          variants=list()
          for(i in 1:length(parsefile$results$inheritedVariantsOfUnknownSignificance)){
            variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedVariantsOfUnknownSignificance[[i]])))
          }
          IVUSsData=do.call(rbind,variants)%>%
            mutate(mutation_type="Inherited Variants of Unknown Significance")
          rm(variants)
          }else{
            print("No IVUSs")
          }
        }else{
          print("No IVUSs")
        }
        
        #Merge and Output data tables
        
        if (exists(c("SPAMsData","SBRVsData","SVUSsData","FVsData","IRVsData","IIFsData","IVUSsData"))){
        mergedMutations=rbind.fill(get0("SPAMsData"),get0("SPACNVsData"),get0("SBRVsData"),get0("SVUSsData"),get0("FVsData"),get0("IRVsData"),get0("IIFsData"),get0("IVUSsData"))%>%
          select(mutation_type,everything())
        
        outMutations=merge(patientData,mergedMutations)
        write.csv(outMutations,"~/JSON/mutations.csv",append=TRUE)
        #assign(paste("mergedMutations_",title,sep=''),mergedMutations,envir=.GlobalEnv)
        #assign(paste("patientMutations_",title,sep=''),merge(patientData,mergedMutations),envir=.GlobalEnv)
        }else{
          print("No mutations info")
        }
        
      }else{
        print("No mutations info")
      } 
      
      if (exists(c("orderData","reportData","specimensData","tmbData"))){        outPatients=rbind.fill(get0("patientData"),get0("orderData"),get0("reportData"),get0("specimensData"),get0("tmbData"))        
        write.csv(outPatients,"~/JSON/patients.csv",append=TRUE)        
      }else{
        print("Missing patient info")
      }        
    }  
}



scp010
  • 1
  • 2
    This code is simply too massive for anyone to compose a sensible answer. In your position i would try to debug the code. Carefully read [rstudios guide to debugging](https://support.rstudio.com/hc/en-us/articles/205612627-Debugging-with-RStudio) and [chapter 22](https://adv-r.hadley.nz/debugging.html) in hadley's book [Advanced R](https://adv-r.hadley.nz/). Use this knowledge to find your problem and isolate it, possibly by tracking the value throughout the function. Once you find the problem, and if you can't fix it. Make a smaller example replicating the problem, and post a new question. – Oliver Sep 14 '20 at 19:10
  • Debugging is an essential skill for anyone who'd like to program, data scientist, analyst and so forth. It is how the vast majority of programming problems are solved in practice. – Oliver Sep 14 '20 at 19:11
  • Additional notes: even with a reduced function, this is not reproducible since we don't know the files it is operating on when this failure occurs. And your summary of *"it prints `#expected output`"* doesn't help us narrow down where things break, since that string does not occur in your function. I understand (and generally appreciate) the attempt to reduce the size of your question, but (1) the function blew through that recommendation already; and (2) I suggest you provide unambiguous (not necessarily all) context. Good luck! – r2evans Sep 14 '20 at 19:30

1 Answers1

0

Solved this problem, there were a few files that returned NULL that the code didn't know how to handle and hidden in the thousands of files I had to parse through. This code is for working with health records, so sorry I couldn't post too many specifics! Thanks for the help!

scp010
  • 1