I've written a custom function to read in a JSON file and extract all of the relevant info that I need, with the goal of running it over all files in a directory. I've created a character vector of all the files and using sapply/lapply, I've been able to run the function like below.
setwd("/directory/")
file.list=dir()
sapply(file.list,function)
For some reason during execution, it reaches a point where it outputs the following and stops for no reason, shown below. I have warnings suppressed, and the only warnings that I've been getting are ones that I expect. Running the parser individually on all the failing files works, giving me the table I want to see.
#expected output
#expected output
$'filename'
[1]FALSE
$'filename'
[1]NULL
I've attached my parser here if it helps, I'm sure it's not optimized and there's better ways to do it, but speed is not a primary concern here. Thanks in advance!
library(jsonlite)
library(data.table)
library(dplyr)
library(plyr)
library(stringr)
library(tidyr)
trialParse=function(filename){
options(warn=-1)
options(max.print=99999)
parsefile=read_json(filename)
title=str_remove(basename(filename),".json")
#cat(sprintf("Schema Version is %s\n", parsefile$metadata$schemaVersion))
if (parsefile$report$workflow$reportType!="DNA"){
#cat(sprintf("%s report is not DNA, moved\n",title))
from=paste("~/JSON_parsing/workingFiles/",title,".json",sep='')
to=paste("~/JSON_parsing/failedFiles/",title,".json",sep='')
file.rename(from=from,to=to)
}else{
#cat(sprintf("%s\n",title))
#Extract report info
if (exists("report",parsefile)&&length(parsefile$report)!=0){
reportData=rbindlist(list(parsefile$report,parsefile$report$workflow),fill=TRUE)
reportData$workflow=NULL
reportData[is.na(reportData)]=""
reportData=reportData%>%
unique()%>%
summarize_all(funs(trimws(paste(.,collapse=''))))
#assign(paste("reportData_",title,sep=''),reportData,envir=.GlobalEnv)
}else{
print("No report info")
}
#Extract patient info
if(exists("patient",parsefile)&&length(parsefile$patient)!=0){
patientData=as.data.frame(t(unlist(parsefile$patient)))
#assign(paste("patientData_",title,sep=''),patientData,envir=.GlobalEnv)
}else{
print("No patient info")
}
#Extract order info
if(exists("report",parsefile)&&length(parsefile$report)!=0){
orderData=rbindlist(list(parsefile$order,parsefile$order$test),fill=TRUE)
orderData$test=NULL
orderData[is.na(orderData)]=""
orderData=orderData%>%
unique()%>%
summarize_all(funs(trimws(paste(.,collapse=''))))
#assign(paste("orderData_",title,sep=''),orderData,envir=.GlobalEnv)
}else{
print("No order info")
}
#Extract specimens info
if(exists("specimens",parsefile)&&length(parsefile$specimens)!=0){
specimens=list()
for(i in 1:length(parsefile$specimens)){
specimens[[i]]=as.data.frame(t(unlist(parsefile$specimens[[i]])))
}
specimensData=do.call(rbind.fill,specimens)%>%
unique()
#assign(paste("specimensData_",title,sep=''),specimensData,envir=.GlobalEnv)
}else{
print("No specimens info")
}
#Extract mutations info
if (exists("results",parsefile)&&length(parsefile$results)!=0){
#Tumor Mutational Burden
if (length(parsefile$results$tumorMutationalBurden!=0)){
tmbdata=as.data.frame(t(unlist(c(parsefile$results[1],parsefile$results[2],parsefile$results[3]))))
#assign(paste("tmbData_",title,sep=''),tmbdata,envir=.GlobalEnv)
}
#Somatic Potentially Actionable Mutations
if (exists("somaticPotentiallyActionableMutations",parsefile$results)&&length(parsefile$results$somaticPotentiallyActionableMutations)!=0){
#Remove Therapies
for (i in 1:length(parsefile$results$somaticPotentiallyActionableMutations)){
for (j in 1:length(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants)){
parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[[j]]$therapies=NULL
}
}
mutations=list()
variants=list()
#Extract Somatic Potentially Actionable Mutations data
#Per Entry
for(i in 1:length(parsefile$results$somaticPotentiallyActionableMutations)){
#Per Variants in Entry
for(j in 1:length(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants)){
variants[[j]]=rbindlist(list(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[j],parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[[j]]),fill=TRUE)
}
#Per Variants in Entry
for (j in 1:length(variants)){
variantsData=do.call(rbind,variants[j])
mutationsList=parsefile$results$somaticPotentiallyActionableMutations[[i]]
mutationsList$variants=NULL
mutationTable=rbindlist(list(mutationsList,variantsData),fill=TRUE)
mutations=append(mutations,list(rbindlist(list(mutationsList,variantsData),fill=TRUE)))
}
}
#Build SPAMS table
SPAMsData=do.call(rbind,mutations)
SPAMsData$mutationEffect=NULL
SPAMsData=SPAMsData%>%
unique()%>%
fill(c(1:4),.direction=c("down"))%>%
fill(c(5:ncol(SPAMsData)),.direction=c("up"))%>%
unique()%>%
mutate(mutation_type="Somatic Potentially Actionable Mutation")
}else{
print("No SPAMS")
}
#Somatic Potentially Actionable Copy Number Variants
if (exists("somaticPotentiallyActionableCopyNumberVariants",parsefile$results)&&length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)!=0){
#Remove Therapies
for (i in 1:length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)){
parsefile$results$somaticPotentiallyActionableCopyNumberVariants[[i]]$therapies=NULL
}
#Extract Somatic Potentially Actionable Copy Number Variants Data
variants=list()
for(i in 1:length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticPotentiallyActionableCopyNumberVariants[[i]])))
}
SPACNVsData=do.call(rbind,variants)%>%
mutate(mutation_type="Somatic Potentially Actionable Copy Number Variants")
rm(variants)
}else{
print("No SPACNVs")
}
#Somatic Biologically Relevant Variants
if (exists("somaticBiologicallyRelevantVariants",parsefile$results)&&length(parsefile$results$somaticBiologicallyRelevantVariants)!=0){
#Extract Data
variants=list()
for(i in 1:length(parsefile$results$somaticBiologicallyRelevantVariants)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticBiologicallyRelevantVariants[[i]])))
}
SBRVsData=do.call(rbind,variants)%>%
mutate(mutation_type="Somatic Biologically Relevant Variants")
rm(variants)
}else{
print("No SBRVs")
}
#Somatic Variants of Unknown Significance
if (exists("somaticVariantsOfUnknownSignificance",parsefile$results)&&length(parsefile$results$somaticVariantsOfUnknownSignificance)!=0){
#Extract Data
variants=list()
for(i in 1:length(parsefile$results$somaticVariantsOfUnknownSignificance)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticVariantsOfUnknownSignificance[[i]])))
}
SVUSsData=do.call(rbind,variants)%>%
mutate(mutation_type="Somatic Variants of Unknown Significance")
rm(variants)
}else{
print("No SVUSs")
}
#Fusion Variants
if (exists("fusionVariants",parsefile$results)&&length(parsefile$results$fusionVariants)!=0){
#Remove Therapies
for (i in 1:length(parsefile$results$fusionVariants)){
parsefile$results$fusionVariants[[i]]$therapies=NULL
}
#Extract Data
variants=list()
for(i in 1:length(parsefile$results$fusionVariants)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$fusionVariants[[i]])))
}
FVsData=do.call(rbind,variants)%>%
mutate(mutation_type="Fusion Variants")%>%
dplyr::rename(gene5display=gene5Display)%>%
dplyr::rename(gene3display=gene3Display)
rm(variants)
}else{
#print("No FVs")
}
#Inherited Relevant Variants
if (exists("inheritedRelevantVariants",parsefile$results)&&length(parsefile$results$inheritedRelevantVariants)!=0){
#Extract Data
if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
variants=list()
for(i in 1:length(parsefile$results$inheritedRelevantVariants)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedRelevantVariants[[i]])))
}
IRVsData=do.call(rbind,variants)%>%
mutate(mutation_type="Inherited Relevant Variants")
rm(variants)
}else{
print("No IRVs")
}
}else{
print("No IRVs")
}
#Inherited Incidental Findings
if (exists("inheritedIncidentalFindings",parsefile$results)&&length(parsefile$results$inheritedIncidentalFindings)!=0){
#Extract Data
if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
variants=list()
for(i in 1:length(parsefile$results$inheritedIncidentalFindings)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedIncidentalFindings[[i]])))
}
IIFsData=do.call(rbind,variants)%>%
mutate(mutation_type="Inherited Incidental Findings")
rm(variants)
}else{
print("No IIFs")
}
}else{
print("No IIFs")
}
#Inherited Variants of Unknown Significance
if (exists("inheritedVariantsOfUnknownSignificance",parsefile$results)&&length(parsefile$results$inheritedVariantsOfUnknownSignificance)!=0){
#Extract Data
if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
variants=list()
for(i in 1:length(parsefile$results$inheritedVariantsOfUnknownSignificance)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedVariantsOfUnknownSignificance[[i]])))
}
IVUSsData=do.call(rbind,variants)%>%
mutate(mutation_type="Inherited Variants of Unknown Significance")
rm(variants)
}else{
print("No IVUSs")
}
}else{
print("No IVUSs")
}
#Merge and Output data tables
if (exists(c("SPAMsData","SBRVsData","SVUSsData","FVsData","IRVsData","IIFsData","IVUSsData"))){
mergedMutations=rbind.fill(get0("SPAMsData"),get0("SPACNVsData"),get0("SBRVsData"),get0("SVUSsData"),get0("FVsData"),get0("IRVsData"),get0("IIFsData"),get0("IVUSsData"))%>%
select(mutation_type,everything())
outMutations=merge(patientData,mergedMutations)
write.csv(outMutations,"~/JSON/mutations.csv",append=TRUE)
#assign(paste("mergedMutations_",title,sep=''),mergedMutations,envir=.GlobalEnv)
#assign(paste("patientMutations_",title,sep=''),merge(patientData,mergedMutations),envir=.GlobalEnv)
}else{
print("No mutations info")
}
}else{
print("No mutations info")
}
if (exists(c("orderData","reportData","specimensData","tmbData"))){ outPatients=rbind.fill(get0("patientData"),get0("orderData"),get0("reportData"),get0("specimensData"),get0("tmbData"))
write.csv(outPatients,"~/JSON/patients.csv",append=TRUE)
}else{
print("Missing patient info")
}
}
}