0

Please Help!!

I have the following dataframe (named Final_APOL1). I need to generate a summary table like the second dataframe shown. Once generated is it possible to save this as a separate output csv that will be saved to the same directory?

The summary table runs through the risk allele count variables and places them into categories so population frequencies can be calculated for each mutation.

Code for risk allele numbers 1, 2 or no

 "no APOL1 Risk Alleles" = ifelse(`Final genotype of APOL1` == "G0/G0", 1, NA),

    "1 APOL1 Risk Alleles" = 
      ifelse(`Final genotype of APOL1` %in% c("G0/G2", "G1^{GM}/G0", "G1^{G+}/G0"), 1, NA),

    "2 APOL1 Risk Alleles" =
      ifelse(`Final genotype of APOL1` %in% c("G1^{GM}/G1^{GM}", "G1^{GM}/G2", "G2/G2"), 1, NA))

The genotypes within Final_APOL1 fit into the following categories for Summary_table

G1^{GM}/G2 and G1^{G+}/G2 = G1/G2 for Summary_table
G1^{GM}/G1^{GM} = G1/G1 for Summary_table
G1^{GM}/G0 and G1^{G+}/G0 = G1/G0 for Summary_table
G2/G2 = G2/G2 for Summary_table
G0/G2 = G0/G2 for Summary_table

dataframe (Final_APOL1)

structure(list(Well = structure(1:96, .Label = c("A01", "A02", 
"A03", "A04", "A05", "A06", "A07", "A08", "A09", "A10", "A11", 
"A12", "B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", 
"B09", "B10", "B11", "B12", "C01", "C02", "C03", "C04", "C05", 
"C06", "C07", "C08", "C09", "C10", "C11", "C12", "D01", "D02", 
"D03", "D04", "D05", "D06", "D07", "D08", "D09", "D10", "D11", 
"D12", "E01", "E02", "E03", "E04", "E05", "E06", "E07", "E08", 
"E09", "E10", "E11", "E12", "F01", "F02", "F03", "F04", "F05", 
"F06", "F07", "F08", "F09", "F10", "F11", "F12", "G01", "G02", 
"G03", "G04", "G05", "G06", "G07", "G08", "G09", "G10", "G11", 
"G12", "H01", "H02", "H03", "H04", "H05", "H06", "H07", "H08", 
"H09", "H10", "H11", "H12"), class = "factor"), G1_1_1 = c("Blank", 
"Blank", "+", "+", "+", "+", "G1^{S342G}", "G1^{S342G}", "+", 
"+", "G1^{S342G}", "G1^{S342G}", "Blank", "Blank", "+", "+", 
"+", "+", "+", "+", "G1^{S342G}", "G1^{S342G}", "+", "+", "Blank", 
"Blank", "+", "+", "G1^{S342G}", "G1^{S342G}", "G1^{S342G}", 
"G1^{S342G}", "G1^{S342G}", "G1^{S342G}", "G1^{S342G}", "G1^{S342G}", 
"Blank", "Blank", "+", "+", "G1^{S342G}", "G1^{S342G}", "G1^{S342G}", 
"G1^{S342G}", "+", "+", "G1^{S342G}", "G1^{S342G}", "G1^{S342G}", 
"G1^{S342G}", "+", "+", "+", "+", "+", "+", "G1^{S342G}", "G1^{S342G}", 
"+", "+", "G1^{S342G}", "G1^{S342G}", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "G1^{S342G}", "G1^{S342G}", 
"G1^{S342G}", "G1^{S342G}", "+", "+", "+", "+", "+", "+", "G1^{S342G}", 
"G1^{S342G}", "G1^{S342G}", "G1^{S342G}", "+", "+", "+", "+", 
"+", "+", "G1^{S342G}", "G1^{S342G}"), G1_1_2 = c("Blank", "Blank", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "Blank", "Blank", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "Blank", "Blank", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "Blank", "Blank", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "G1^{S342G}", "G1^{S342G}", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "G1^{S342G}", 
"G1^{S342G}", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+"
), G1_2_1 = c("Blank", "Blank", "+", "+", "+", "+", "G1^{I384M}", 
"G1^{I384M}", "+", "+", "G1^{I384M}", "G1^{I384M}", "Blank", 
"Blank", "+", "+", "+", "+", "+", "+", "G1^{I384M}", "G1^{I384M}", 
"+", "+", "Blank", "Blank", "+", "+", "G1^{I384M}", "G1^{I384M}", 
"G1^{I384M}", "G1^{I384M}", "G1^{I384M}", "G1^{I384M}", "G1^{I384M}", 
"G1^{I384M}", "Blank", "Blank", "+", "+", "G1^{I384M}", "G1^{I384M}", 
"G1^{I384M}", "G1^{I384M}", "+", "+", "G1^{I384M}", "G1^{I384M}", 
"G1^{I384M}", "G1^{I384M}", "+", "+", "+", "+", "+", "+", "G1^{I384M}", 
"G1^{I384M}", "+", "+", "G1^{I384M}", "G1^{I384M}", "+", "+", 
"Blank", "+", "+", "+", "+", "+", "+", "+", "+", "+", "G1^{I384M}", 
"G1^{I384M}", "G1^{I384M}", "G1^{I384M}", "+", "+", "+", "+", 
"+", "+", "G1^{I384M}", "G1^{I384M}", "G1^{I384M}", "G1^{I384M}", 
"+", "+", "+", "+", "+", "+", "G1^{I384M}", "G1^{I384M}"), G1_2_2 = c("Blank", 
"Blank", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "Blank", 
"Blank", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "Blank", 
"Blank", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "Blank", 
"Blank", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "G1^{I384M}", "G1^{I384M}", 
"+", "+", "+", "+", "+", "+", "Blank", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "G1^{I384M}", "G1^{I384M}", "+", "+", "+", 
"+", "+", "+", "+", "+", "G1^{I384M}", "G1^{I384M}", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "+"), G2_1 = c("Blank", "Blank", 
"+", "+", "+", "+", "G2", "G2", "+", "+", "G2", "G2", "Blank", 
"Blank", "+", "+", "G2", "G2", "G2", "G2", "+", "+", "+", "+", 
"Blank", "Blank", "G2", "G2", "+", "+", "+", "+", "+", "+", "+", 
"+", "Blank", "Blank", "G2", "G2", "G2", "G2", "+", "+", "+", 
"+", "+", "+", "G2", "G2", "G2", "G2", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "G2", "G2", "G2", 
"G2", "G2", "G2", "G2", "G2", "+", "+", "+", "+", "+", "+", "+", 
"+", "G2", "G2", "+", "+", "+", "+", "G2", "G2", "+", "+", "+", 
"+", "+", "+"), G2_2 = c("Blank", "Blank", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "Blank", "Blank", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "Blank", "Blank", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "+", "Blank", "Blank", "+", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", 
"+", "+", "+", "+", "+"), `Final genotype of APOL1` = c("NA", 
"NA", "G0/G0", "G0/G0", "G0/G0", "G0/G0", "G1^{GM}/G2", "G1^{GM}/G2", 
"G0/G0", "G0/G0", "G1^{GM}/G2", "G1^{GM}/G2", "NA", "NA", "G0/G0", 
"G0/G0", "G0/G2", "G0/G2", "G0/G2", "G0/G2", "G1^{GM}/G0", "G1^{GM}/G0", 
"G0/G0", "G0/G0", "NA", "NA", "G0/G2", "G0/G2", "G1^{GM}/G0", 
"G1^{GM}/G0", "G1^{GM}/G0", "G1^{GM}/G0", "G1^{GM}/G0", "G1^{GM}/G0", 
"G1^{GM}/G0", "G1^{GM}/G0", "NA", "NA", "G0/G2", "G0/G2", "G1^{GM}/G2", 
"G1^{GM}/G2", "G1^{GM}/G0", "G1^{GM}/G0", "G0/G0", "G0/G0", "G1^{GM}/G0", 
"G1^{GM}/G0", "G1^{GM}/G2", "G1^{GM}/G2", "G0/G2", "G0/G2", "G0/G0", 
"G0/G0", "G0/G0", "G0/G0", "G1^{GM}/G1^{GM}", "G1^{GM}/G1^{GM}", 
"G0/G0", "G0/G0", "G1^{GM}/G0", "G1^{GM}/G0", "G0/G0", "G0/G0", 
NA, "G0/G0", "G0/G2", "G0/G2", "G0/G2", "G0/G2", "G0/G2", "G0/G2", 
"G0/G2", "G0/G2", "G1^{GM}/G0", "G1^{GM}/G0", "G1^{GM}/G0", "G1^{GM}/G0", 
"G0/G0", "G0/G0", "G0/G0", "G0/G0", "G0/G2", "G0/G2", "G1^{GM}/G1^{GM}", 
"G1^{GM}/G1^{GM}", "G1^{GM}/G0", "G1^{GM}/G0", "G0/G2", "G0/G2", 
"G0/G0", "G0/G0", "G0/G0", "G0/G0", "G1^{GM}/G0", "G1^{GM}/G0"
), `no APOL1 Risk Alleles` = c(NA, NA, 1, 1, 1, 1, NA, NA, 1, 
1, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, 1, 1, 1, 1, NA, NA, 1, 
1, NA, NA, 1, 1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 1, 1, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 
1, NA, NA), `1 APOL1 Risk Alleles` = c(NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 1, 1, 1, NA, 
NA, NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, 1, 1, NA, NA, 
1, 1, NA, NA, 1, 1, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, 
NA, 1, 1, NA, NA, NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
NA, NA, NA, NA, 1, 1, NA, NA, 1, 1, 1, 1, NA, NA, NA, NA, 1, 
1), `2 APOL1 Risk Alleles` = c(NA, NA, NA, NA, NA, NA, 1, 1, 
NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
1, 1, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, 1, 
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA), Sample_Flag = c(FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE)), row.names = c(NA, -96L), class = "data.frame")

dataframe (Summary_table)

structure(list(X__1 = c("Total number", "Percentage", "G1/G1 %", 
"G2/G2 %", "G1/G2 %", "G0/G1 %", "G0/G2 %"), `no APOL1 Risk Alleles` = c(29, 
33.33, NA, NA, NA, NA, NA), `1 APOL1 Risk Alleles` = c(46, 52.88, 
NA, NA, NA, 27.59, 25.29), `2 APOL1 Risk Alleles` = c(12, 13.79, 
4.6, NA, 9.19, NA, NA), `Total Number` = c(87, NA, NA, NA, NA, 
NA, NA)), row.names = c(NA, -7L), class = c("tbl_df", "tbl", 
"data.frame"))
  • (1) Please learn to edit your question better. Look at my edit for some hints, but the way you structured it initially, some of your data was being hidden rendering the sample data unusable. https://stackoverflow.com/editing-help#syntax-highlighting. (2) Please *read* about tags you attach to a question (don't just "add many"), neither [tag:datatables] (a plug-in for jQuery) nor [tag:data-structures] (organizing data for queries) are particular necessary or applicable to this question. (3) Please keep to one question at a time: reshaping data. Once reshaped, `write.csv` will save your data. – r2evans Feb 01 '20 at 01:28
  • Why do you have both literal `"NA"` and symbolic `NA` in your data? Are `Total number` (both the row and the column) marginal sums? What are `G1_1_` through `G2_2`? When I try to come up with counts/sums, I get nowhere near your expected output totals of 108 or 161. Is this summary structure based on the sample data you provided? – r2evans Feb 01 '20 at 01:39
  • Hi, @r2evans yes the summary table is from a previous run, that my work colleague generated manually. The numbers shouldn't match its just the final APOL1 genotype variable that is needed for the summary table. I added the code context for this –  Feb 03 '20 at 03:57
  • @r2evans did you get my previous comment and update? –  Feb 04 '20 at 15:13
  • JordanBrowne, this is a difficult question to parse through. I've asked some questions about how the summary table is generated, but it is still unclear in the question and you haven't answered them. Regardless, when you have "expected output" in the quesiton, it really helps when it is based on the sample data you give us, otherwise there is nothing to use for verification. – r2evans Feb 04 '20 at 15:42
  • @r2evans sorry about the question not being clear. I have edited and updated summary table data so it matched with sample data –  Feb 04 '20 at 17:55

1 Answers1

0

I'm generally not a fan of storing row summaries (e.g., total) within the frame itself, as any follow-on row summarizing will include that number; I'm assuming that this is purely for aesthetics, and so including it is not horrible

library(dplyr)

tmp1 <- as_tibble(Final_APOL_1) %>%
  # discard G* fields, they seem unnecessary?
  select(-starts_with("G")) %>%
  # remove an reference to NA or the literal string "NA", they seem distracting?
  filter(
    !is.na(`Final genotype of APOL1`),
    `Final genotype of APOL1` != "NA"
  ) %>%
  mutate(
    `X__1` = case_when(
      `Final genotype of APOL1` %in% c("G1^{GM}/G2", "G1^{G+}/G2") ~ "G1/G2",
      `Final genotype of APOL1` == "G1^{GM}/G1^{GM}"               ~ "G1/G1",
      `Final genotype of APOL1` %in% c("G1^{GM}/G0", "G1^{G+}/G0") ~ "G1/G0",
      TRUE ~ `Final genotype of APOL1`
    )
  ) %>%
  select(-Well, -`Final genotype of APOL1`, -Sample_Flag) %>%
  group_by(X__1) %>%
  summarize_all(~ sum(.))

totals <- tmp1 %>%
  summarize_at(vars(-X__1), ~ sum(., na.rm = TRUE)) %>%
  mutate(
    `Total Number` = sum(.),
    X__1 = "Total number"
  )
percents <- totals %>%
  mutate_at(vars(-X__1, -`Total Number`), ~ 100 * . / `Total Number`) %>%
  mutate(
    `Total Number` = NA_real_,
    X__1 = "Percentage"
  )

Final steps and output:

tmp1 %>%
  mutate_at(vars(-X__1), ~ 100 * . / totals$`Total Number`) %>%
  bind_rows(., totals, percents)
# # A tibble: 7 x 5
#   X__1         `no APOL1 Risk Allele~ `1 APOL1 Risk Allele~ `2 APOL1 Risk Allele~ `Total Number`
#   <chr>                         <dbl>                 <dbl>                 <dbl>          <dbl>
# 1 G0/G0                          33.3                  NA                   NA                NA
# 2 G0/G2                          NA                    25.3                 NA                NA
# 3 G1/G0                          NA                    27.6                 NA                NA
# 4 G1/G1                          NA                    NA                    4.60             NA
# 5 G1/G2                          NA                    NA                    9.20             NA
# 6 Total number                   29                    46                   12                87
# 7 Percentage                     33.3                  52.9                 13.8              NA
r2evans
  • 141,215
  • 6
  • 77
  • 149