2

One question regarding the time axis for Swimmer waterfall plot.

I generated the swimmer plot using the code below>

enter image description here

However, I would like to have the bars for each subject colored by the responseType (rather than "Stage") based on the responseStartTime and responseEndTime duration from the dataset. Please suggest how I can define the color by responseType for the duration of that response.

Thanks!

Source: (http://rpubs.com/alexiswl/swimmer).

Data code

    library(magrittr)
library(stringi)
library(readr)   # Reading in the dataset
library(ggplot2) # Viewing the dataset
library(forcats) # Sorting factors
library(RColorBrewer) # Plot colours
library(dplyr, warn.conflicts=FALSE)   # Manipulating the dataframes
library(purrr, warn.conflicts=FALSE)   # Manipulating dataframe metadata
library(zoo, warn.conflicts=FALSE)     # Filling in  NA values
library(reshape2) # Reformmating dataframes 

library(editData)
df.data <- df
swimmer_file = "https://blogs.sas.com/content/graphicallyspeaking/files/2014/06/Swimmer_93.txt"
col.names = c("subjectID", "stage", "startTime", "endTime", 
              "isContinued", "responseType", "responseStartTime", "responseEndTime", "Durable")
df <- readr::read_lines(swimmer_file) %>%
  # Split by line recursion (\r\n)
  stringi::stri_split(fixed="\r\n", simplify=TRUE) %>%
  # Take only lines starting with a number (sample id)
  .[grepl("^[0-9]+", .)] %>%
  # Remove spaces from response column
  gsub(pattern="\\sresponse", replacement="_response") %>%
  # Remove spaces from stage column
  gsub(pattern="Stage\\s",  replacement="Stage_") %>%
  # Some lines missing 'Stage' and 'isContinued' column. 
  # Replace any set of 8 or more spaces with ' . '
  gsub(pattern="\\s{8,}", replacement=' . ') %>%
  # Split strings by spaces, do not include empty strings as columns
  stringi::stri_split(fixed=" ", simplify=TRUE, omit_empty=TRUE) %>%
  # Convert to dataframe
  as.data.frame(stringsAsFactors=FALSE) %>%
  # Set the column names
  purrr::set_names(col.names) %>%
  # We need to do some more cleaning up of the dataframe
  # Convert all . to NAs
  dplyr::na_if(".") %>%
  # Fill NAs in Stage column
  dplyr::mutate(stage=zoo::na.locf(stage)) %>%
  # Turn isContinued into boolean
  dplyr::mutate(isContinued=dplyr::if_else(isContinued=="FilledArrow", TRUE, FALSE, missing=FALSE)) %>%
  # Convert stage variable to factor, remove underscore
  dplyr::mutate(stage = as.factor(gsub(pattern="_", replacement=" ", x=stage))) %>%
  # Remove underscore from response types 
  dplyr::mutate(responseType = gsub("_", " ", responseType)) %>%
  # Change Durable from character to numeric
  dplyr::mutate(Durable = as.numeric(Durable)) %>%
  # Change Time variables from character to numeric
  dplyr::mutate_at(vars(dplyr::ends_with("Time")), as.numeric)

df.shapes <- df %>%
  # Get just the subject and response time columns
  dplyr::select(subjectID, responseType, responseStartTime) %>%
  # Melt the data frame, so one row per response value.
  reshape2::melt(id.vars=c("subjectID", "responseType"), value.name="time") %>%
  # Remove na values
  dplyr::filter(!is.na(time)) %>%
  # Remove response variable column
  dplyr::select(-variable) %>%
  # Add 'start' to the end of the response type
  dplyr::mutate(responseType=paste(responseType, "start", sep=" "))

# Add the end time for each 
df.shapes %<>%
  dplyr::bind_rows(df %>%
                     dplyr::select(subjectID, endTime, responseEndTime, isContinued) %>%
                     # Place endtime as response endtime if not continuing and responseEndTime is NA
                     dplyr::mutate(responseEndTime=dplyr::if_else(!isContinued & is.na(responseEndTime),
                                                                  endTime, responseEndTime)) %>%
                     dplyr::select(-endTime, -isContinued) %>%
                     # Remove other existing NA responseEndTimes
                     dplyr::filter(!is.na(responseEndTime)) %>%
                     dplyr::mutate(responseType="Response end") %>%
                     dplyr::rename(time=responseEndTime))

# Append on the durable column
df.shapes %<>% 
  dplyr::bind_rows(df %>% 
                     dplyr::select(subjectID, Durable) %>%
                     dplyr::filter(!is.na(Durable)) %>%
                     dplyr::mutate(responseType="Durable") %>%
                     dplyr::rename(time=Durable))
# Add on the arrow sets
df.shapes %<>% 
  dplyr::bind_rows(df %>%
                     dplyr::select(subjectID, endTime, isContinued) %>%
                     dplyr::filter(isContinued) %>%
                     dplyr::select(-isContinued) %>%
                     dplyr::mutate(responseType="Continued Treatment") %>%
                     dplyr::mutate(endTime=endTime+0.25) %>%
                     dplyr::rename(time=endTime))

responseLevels = c("Complete response start", "Partial response start", 
                   "Response end", "Durable", "Continued Treatment")

# Convert responseType to factor and set the levels
df.shapes %<>% 
  dplyr::mutate(responseType = factor(responseType, levels=responseLevels)) %>%
  # Order by response type
  dplyr::arrange(desc(responseType))

unicode = list(triangle=sprintf('\u25B2'),
               circle=sprintf('\u25CF'),
               square=sprintf('\u25A0'),
               arrow=sprintf('\u2794'))

Plot code:

  df %>% 
  # Get just the variables we need for the base of the plot
  dplyr::select(subjectID, endTime, stage) %>%
  # Remove duplicate rows
  dplyr::distinct() %>%
  # Order subject ID by numeric value
  dplyr::mutate(subjectID=forcats::fct_reorder(.f=subjectID, .x=as.numeric(subjectID), .desc = TRUE)) %>%
  # Pipe into ggplot
  ggplot(aes(subjectID, endTime)) + # Base axis
  geom_bar(stat="identity", aes(fill=factor(stage))) + # Bar plot. Colour by stage
  geom_point(data=df.shapes, # Use df.shapes to add reponse points
             aes(subjectID, time, colour=responseType, shape=responseType), size=5) +
  coord_flip() + # Flip to horizonal bar plot.
  scale_colour_manual(values=c(RColorBrewer::brewer.pal(3, "Set1")[1:2], # Add colours
                               rep("black", 3))) + # min of brewerpal is three but we only need 2.
  scale_shape_manual(values=c(rep(unicode[["triangle"]], 2), # Add shapes
                              unicode[["circle"]], unicode[["square"]], unicode[["arrow"]])) +
  scale_y_continuous(limits=c(-0.5, 20), breaks=0:20) + # Set time limits
  labs(fill="Disease Stage", colour="Symbol Key", shape="Symbol Key",  # Add labels
       x="Subject ID ", y="Months since diagnosis",
       title="Swimmer Plot",
       caption="Durable defined as subject with six months or more of confirmed response") +
  theme(plot.title = element_text(hjust = 0.5), # Put title in the middle of plot
        plot.caption = element_text(size=7, hjust=0)) # Make caption size smaller
rk123
  • 67
  • 1
  • 8
  • 1
    The code runs fine on my machine. Are you sure you have the latest updated versions of all packages and no other errors? – Roman Oct 21 '18 at 15:09
  • I've tried it with R version 3.5.1& 3.3.2 on macOS v10.13.6, it works in both case... – godot Oct 21 '18 at 15:14
  • @Roman I don't think it's a problem of packages, as the first part of the Data code is failing but has no library reference... – godot Oct 21 '18 at 16:12
  • 1
    @godot, I suspect OP missed an error message above the one he posted. **rk123**: Please resart R / Rstudio, run the code exactly as is and post the full output of the console. – Roman Oct 21 '18 at 16:39
  • Thanks a lot for your help. It seems that packages were outdated. It is working fine with updated packages. – rk123 Oct 21 '18 at 19:40
  • I have an additional question on the ggplot (posted above). Look forward to guidance from the group. – rk123 Oct 21 '18 at 20:26
  • `However, I would like to have the bars for each subject colored by the responseType (rather than "Stage") based on the responseStartTime and responseEndTime duration from the dataset. Please suggest how I can define the color by responseType for the duration of that response.` The question is answered in your code. I would really suggest reading the comments. – Roman Oct 21 '18 at 21:46
  • Sorry, if it's not clear to me. the response is colored, however, it currently appears as shape color. I want to have a bar for each subject where the color changes by response so for eg. subject 5, complete response start at 3.5 and end at 4.5, another complete response start at 6.5 and end at 8.5, and partial response start at 10.5, so I need a bar plot with color showing the duration of complete response and partial response. Any help would be appreciated. – rk123 Oct 21 '18 at 21:53

1 Answers1

3

Sorry for the late answer but I think it's a quite interesting question, so even if you don't need anymore, I post a solution. I hope I understood what you wanted.

Basically, you have to have a different approach and use geom_segment(). If you do that the solution is quite straightforward. The only problem is that you didn't make your goal explicit: for example, what do you want to do in case you have a NA for the responseEndTime value or do you want to keep or not the info you're showing in your bar graph so I had to make arbitrary choices but you should be able to figure out how to get what you want from this solution:

df %>% 
  # Add a few variables to your df
  dplyr::select(subjectID, stage, responseStartTime, responseEndTime, 
                endTime, responseType) %>%
  # Remove duplicate rows
  dplyr::distinct() %>%
  # Order subject ID by numeric value
  dplyr::mutate(
          subjectID=forcats::fct_reorder(.f=subjectID, 
                                         .x=as.numeric(subjectID),
                                         .desc = TRUE)) %>%
  # Pipe into ggplot
  ggplot(aes(subjectID, endTime)) + # Base axis
  # substitute geom_bar by a geom_segment
  geom_segment(aes(x = 0, xend = endTime, y=subjectID, 
                   yend=subjectID, color = factor(stage)),
               size = 12) +
  # Substitue geom_point with another geom_segment for the responseTime part
  geom_segment(aes(x = responseStartTime, xend = responseEndTime,
                   y=subjectID, yend=subjectID, color = responseType),
               size =8) +
  # don't need coord_flip anymore
  # probably could improve this part but you got the idea
  scale_colour_manual(values=c("#FFFFFF", "#000000",
                               "#F8766D", "#C49A00", "#53B400", 
                               "#00C094")) +
  # the y scale is now the x scale...
  scale_x_continuous(limits=c(-0.5, 20), breaks=0:20) + # Set time limits
  labs(fill="Disease Stage", colour="Symbol Key", 
       shape="Symbol Key",  # Add labels
       y="Subject ID ", x="Months since diagnosis",
       title="Swimmer Plot",
       caption="Durable defined as subject with six months or more of confirmed response") +
  theme(plot.title = element_text(hjust = 0.5),
        plot.caption = element_text(size=7, hjust=0))

result of the code above

godot
  • 1,550
  • 16
  • 33