I am trying to compare survival of all patients with a mutation in a gene to all patients without the mutation. I have a list of 66 data frames for different batches of patients in different cancer types and I want to isolate the patients with the mutation and compare them against all other patients.
This is an example of the data frame
A tibble:6 × 36
Sample ID Start End mutation Patient ID OS
TCGA-OR-A5J1-01 3 42265106 42265106 C-T TCGA-OR-A5J1 1
OS.time
1355
TCGA-OR-A5J2-01 X 39913284 39913284 C-A TCGA-OR-A5J2 1 1677
TCGA-OR-A5J3-01 17 10608472 10608472 G-A TCGA-OR-A5J3 0 2091
TCGA-OR-A5J3-01 17 10608472 10608472 G-A TCGA-OR-A5J3 0 2091
TCGA-OR-A5J4-01 5 38427121 38427121 G-C TCGA-OR-A5J4 1 423
TCGA-OR-A5J4-01 11 66272099 66272099 G-A TCGA-OR-A5J4 1 423
6 rows | 1-10, 33-36 of 36 columns
I want to get the log rank test and put it in a pdf with the title of the cancer type and gene name
this is the code I am running but it keeps saying it "Error: cannot find df1_OS"
survdif_KM = function(x, y) {
#x is the name of your data.frame (cancer type)
#y is you gene id you're interested in
read_file = paste0("C:/Users/cian/OneDrive/Desktop/Thesis/mergerd_data",
"/", x)
df <- read.table(read_file, sep = "\t", header=TRUE)
df2 <- df[grep(y, df[[14]]), ]
df3 <- df[!grepl(y, df[[14]]), ]
if (nrow(df2) > 20)
{
df4 <- df2[df2$X_PATIENT %in% df3$X_PATIENT, ]
if(nrow(df4) > 0)
{
df3_1 <- df3[!df3$V5 %in% df2$V5, ]
df3 <- df3_1
}
df2$group <- "A"
df3$group <- "B"
df1_OS <- rbind(df2,df3)
x = gsub("\\..*", "", x)
name_of_file = paste("Log-Rank", "Test", x, y, sep="_")
surv_diff <- survdiff(Surv(OS.time, OS) ~ group, data = df1_OS)
pval_sav <- surv_pvalue(surv_diff)
if (pval_sav$pval >= 0.05)
{
filename= paste0('C:/Users/cian/OneDrive/Desktop/Thesis/tables/',
name_of_file, ".pdf", sep="")
pdf(filename)
a <- surv_diff
print(a)
dev.off()
}}}
as requested the head(df). The only columns of note are V5, V1, V2 V3, V4, X_PATIENT OS and OS.time
V5 V1 V2 V3 V4 V6 V7 V8 V9 V10 V11 V12 V13
1 TCGA-2E-A9G8-01A chr15 44536637 44536637 G-A chr15 HAVANA exon 44536166 44536681 NA - NA
2 TCGA-2E-A9G8-01A chr15 44536637 44536637 G-A chr15 HAVANA exon 44536043 44536933 NA - NA
3 TCGA-2E-A9G8-01A chr15 44536637 44536637 G-A chr15 HAVANA exon 44535760 44536913 NA - NA
4 TCGA-2E-A9G8-01A chr15 44536637 44536637 G-A chr15 HAVANA exon 44536043 44536640 NA - NA
5 TCGA-2E-A9G8-01A chr15 99131259 99131259 G-A chr15 HAVANA exon 99130803 99131806 NA - NA
6 TCGA-2E-A9G8-01A chr17 74261981 74261981 C-T chr17 HAVANA exon 74261589 74262020 NA - NA
gene_ID transcript_ID gene_type gene_name
1 ENSG00000179523 transcript_id ENST00000702400 gene_type lncRNA gene_name EIF3J-DT
2 ENSG00000179523 transcript_id ENST00000313807 gene_type lncRNA gene_name EIF3J-DT
3 ENSG00000179523 transcript_id ENST00000657789 gene_type lncRNA gene_name EIF3J-DT
4 ENSG00000179523 transcript_id ENST00000560750 gene_type lncRNA gene_name EIF3J-DT
5 ENSG00000261054 transcript_id ENST00000566974 gene_type lncRNA gene_name SYNM-AS2
6 ENSG00000264272 transcript_id ENST00000583018 gene_type lncRNA gene_name ENSG00000264272
transcript_type transcript_name exon_number exon_ID level
1 transcript_type lncRNA transcript_name EIF3J-DT-208 exon_number 1 exon_id ENSE00003985681 level 2
2 transcript_type lncRNA transcript_name EIF3J-DT-201 exon_number 1 exon_id ENSE00001213385 level 2
3 transcript_type lncRNA transcript_name EIF3J-DT-207 exon_number 1 exon_id ENSE00003985691 level 2
4 transcript_type lncRNA transcript_name EIF3J-DT-205 exon_number 1 exon_id ENSE00002562703 level 2
5 transcript_type lncRNA transcript_name SYNM-AS2-201 exon_number 1 exon_id ENSE00002601413 level 2
6 transcript_type lncRNA transcript_name ENST00000583018 exon_number 1 exon_id ENSE00002709179 level 2
tag1 tag2 havana_gene
1 hgnc_id HGNC:48616 tag TAGENE havana_gene OTTHUMG00000171875
2 transcript_support_level 1 hgnc_id HGNC:48616 tag basic
3 hgnc_id HGNC:48616 tag TAGENE havana_gene OTTHUMG00000171875
4 transcript_support_level 3 hgnc_id HGNC:48616 tag basic
5 transcript_support_level 2 hgnc_id HGNC:56228 tag basic
6 transcript_support_level 3 tag basic tag Ensembl_canonical
havana_transcript OS X_PATIENT OS.time
1 0 TCGA-2E-A9G8 1249
2 tag TAGENE 0 TCGA-2E-A9G8 1249
3 havana_transcript OTTHUMT00000526744 0 TCGA-2E-A9G8 1249
4 tag Ensembl_canonical 0 TCGA-2E-A9G8 1249
5 tag Ensembl_canonical 0 TCGA-2E-A9G8 1249
6 havana_gene OTTHUMG00000178577 0 TCGA-2E-A9G8 1249
This is an example of an x and y with x being the name of the dataframe and Y being the gene
x = "UCEC_TCGA_ms.txt"
y = "ENSG00000240498"
I am running the loop using mapply
on a crossed list of all the dataframes and genes.
I have already done a very similar function to get kaplan-meier curves using the survfit
function but that also did not work until I changed the survfit function to surv_fit. I tried this with survdiff but it did not work.
Is there another type of survdiff I can use like in survfit that will run over a list of dataframes?
the example with survfit is attached below for reference as a function that did work
new_create_KM = function(x, y) {
#x is the name of your data.frame (cancer type)
#y is you gene id you're interested in
read_file = paste0("C:/Users/cian/OneDrive/Desktop/Thesis/mergerd_data",
"/", x)
df <- read.table(read_file, sep = "\t", header=TRUE)
df2 <- df[grep(y, df[[14]]), ]
df3 <- df[!grepl(y, df[[14]]), ]
if (nrow(df2) > 150)
{
df4 <- df2[df2$X_PATIENT %in% df3$X_PATIENT, ]
if(nrow(df4) > 0)
{
df3_1 <- df3[!df3$V5 %in% df2$V5, ]
df3 <- df3_1
}
df2$group <- "A"
df3$group <- "B"
df1_OS <- rbind(df2,df3)
x = gsub("\\..*", "", x)
name_of_file = paste("KM", "Curve", x, y, sep="_")
survival_function <- surv_fit(Surv(df1_OS$OS.time, df1_OS$OS)~group,
data=df1_OS)
pval_sav <- surv_pvalue(survival_function)
if (pval_sav$pval <= 0.05)
{
filename= paste0('C:/Users/cian/OneDrive/Desktop/Thesis/Graphs2/',
name_of_file, ".pdf", sep="")
# I changed destination to graphs 2 so I could use the new all_genes
pdf(filename)
a <- ggsurvplot(survival_function, conf.int=TRUE, pval=TRUE,
risk.table=TRUE,
legend.labs=c("mutation", "no mutation"),
legend.title="lncRNA status",
palette=c("orangered1", "royalblue"),
title=name_of_file)
print(a)
dev.off()
}}}
Edit by @jared_mamrot
I'm not sure if this is enough data to reproduce the problem, but here is a dput()
of the head(df)
above:
structure(list(rn = c("TCGA-2E-A9G8-01A", "TCGA-2E-A9G8-01A",
"TCGA-2E-A9G8-01A", "TCGA-2E-A9G8-01A", "TCGA-2E-A9G8-01A", "TCGA-2E-A9G8-01A"
), V5 = c("chr15", "chr15", "chr15", "chr15", "chr15", "chr17"
), V1 = c(44536637L, 44536637L, 44536637L, 44536637L, 99131259L,
74261981L), V2 = c(44536637L, 44536637L, 44536637L, 44536637L,
99131259L, 74261981L), V3 = c("G-A", "G-A", "G-A", "G-A", "G-A",
"C-T"), V4 = c("chr15", "chr15", "chr15", "chr15", "chr15", "chr17"
), V6 = c("HAVANA", "HAVANA", "HAVANA", "HAVANA", "HAVANA", "HAVANA"
), V7 = c("exon", "exon", "exon", "exon", "exon", "exon"), V8 = c(44536166L,
44536043L, 44535760L, 44536043L, 99130803L, 74261589L), V9 = c(44536681L,
44536933L, 44536913L, 44536640L, 99131806L, 74262020L), V10 = c(NA,
NA, NA, NA, NA, NA), V11 = c("-", "-", "-", "-", "-", "-"), V12 = c(NA,
NA, NA, NA, NA, NA), V13 = c("ENSG00000179523", "ENSG00000179523",
"ENSG00000179523", "ENSG00000179523", "ENSG00000261054", "ENSG00000264272"
), gene_ID = c("transcript_id", "transcript_id", "transcript_id",
"transcript_id", "transcript_id", "transcript_id"), transcript_ID = c("ENST00000702400",
"ENST00000313807", "ENST00000657789", "ENST00000560750", "ENST00000566974",
"ENST00000583018"), gene_type = c("gene_type", "gene_type", "gene_type",
"gene_type", "gene_type", "gene_type"), gene_name = c("lncRNA",
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA"), NA. = c("gene_name",
"gene_name", "gene_name", "gene_name", "gene_name", "gene_name"
), NA..1 = c("EIF3J-DT", "EIF3J-DT", "EIF3J-DT", "EIF3J-DT",
"SYNM-AS2", "ENSG00000264272"), OS = c(0L, 0L, 1L, 1L, 0L, 0L
), X_PATIENT = c("TCGA-2E-A9G8", "TCGA-2E-A9G8", "TCGA-2E-A9G9",
"TCGA-2E-A9G9", "TCGA-2E-A9G1", "TCGA-2E-A9G9"), OS.time = c(1249L,
1249L, 999L, 999L, 1300L, 1300L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))