3

I have a data table of patient clusters before treatment (consensus) and after treatment (single drug), and I want to show how patients flow between clusters before and after treatment. In this case the actual cluster numbers don't mean much; the important bit is that most patients who cluster together before treatment also end up together after the treatment. Some move around.

Here is a screenshot of the data enter image description here

dummy dataset 

# Dummy dataset in dput() form: a 69-row tibble with one row per patient.
# Columns:
#   Stimulation                    - stimulation label ("3S" on every row)
#   Patient.ID                     - patient identifier
#   `Cluster assigned consensus`   - cluster before treatment (numeric)
#   `Cluster assigned single drug` - cluster after treatment (character)
#   count                          - constant 1, used as the flow weight
# The plotting code refers to this object as CLL3S, so assign the value of
# this expression to CLL3S before running it.
structure(list(Stimulation = c("3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S"), Patient.ID =       c("S3077497", 
"S1041120", "S162465", "S563275", "S2911623", "S3117192", "S2859024", 
"S2088278", "S3306185", "S190789", "S12146451", "S2170842", "S115594", 
"S2024203", "S1063872", "S2914138", "S303984", "S570813", "S2176683", 
"S820460", "S1235729", "S3009401", "S2590229", "S629309", "S1208256", 
"S2572773", "S3180483", "S3032079", "S3217608", "S5566943",     "S5473728", 
"S104259", "S2795346", "S2848989", "S2889801", "S2813983", "S2528246", 
"S3151923", "S2592908", "S2603793", "S5565867", "S3127064", "S675629", 
"S834679", "S3011944", "S5011583", "S2687896", "S2998620", "S651963", 
"S2104595", "S2433454", "S2565220", "S3307762", "S294778", "S995510", 
"S2476822", "S140868", "S1018263", "S2990223", "S5524130", "S1042529", 
"S999706", "S363003", "S2303087", "S868213", "S5568359", "S3174542", 
"S521782", "S3294727"), `Cluster assigned consensus` = c(2, 2, 
2, 2, 2, 5, 5, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 4, 3, 7, 4, 4, 4, 
4, 4, 4, 8, 8, 4, 7, 4, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 7, 7, 
7, 7, 7, 3, 7, 6, 6, 6, 6, 6, 8, 7, 7, 5, 7, 5, 7, 7, 7, 8, 8, 
4, 7, 4, 7), `Cluster assigned single drug` = c("1", "1", "1", 
"1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "3", "3", "3", "3", "3", "3", "3", "4", "4", 
"4", "4", "5", "5", "5", "5", "5", "5", "5", "6", "6", "6", "6", 
"6", "6", "6", "6", "6", "6", "6", "7", "7", "7", "7", "7", "7", 
"7", "7", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", 
"8"), count = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -69L), class =     c("tbl_df", 
"tbl", "data.frame"))

This is my first time making a Sankey plot, so I'm no expert. I added the count column so that each patient has a count of 1; the thickness of each flow is then the sum of the counts.

I modified from R tutorial and the code to visualise is here

library(ggplot2)
library(ggalluvial)

# Two-axis alluvial plot of CLL3S: the left axis is the consensus cluster,
# the right axis is the single-drug cluster, and `count` weights each flow.
ggplot(
  CLL3S,
  aes(
    y = count,
    axis1 = `Cluster assigned consensus`,
    axis2 = `Cluster assigned single drug`
  )
) +
  # Flows are filled by the cluster each patient starts in.
  geom_alluvium(aes(fill = `Cluster assigned consensus`)) +
  # Default strata (the boxes on each axis), labelled with the stratum value.
  geom_stratum() +
  geom_text(aes(label = after_stat(stratum)), stat = "stratum") +
  # Name the two axes on the discrete x scale.
  scale_x_discrete(
    limits = c("Consensus cluster", "Single-drug cluster"),
    expand = c(.1, .1)
  ) +
  labs(
    x = "Clusters",
    title = "Patient flow between the Consensus clusters and Single-drug treated clusters",
    subtitle = "3S stimulated patients"
  ) +
  theme_minimal()

This kind of works but the figure isn't pretty:

enter image description here

You can see that the cluster numbers are surrounded by huge, empty white boxes. How can I make those boxes smaller? And how do I colour the boxes differently, so that the fill used in geom_alluvium (the flows) matches the colours of the consensus boxes?

ML33M
  • 341
  • 2
  • 19
  • Do you have any data you can share to make this reproducible? Most folks have better things to do on a Friday night than transcribe a picture of your data into their r console. Could you do `dput(CLL3S)` into your console and paste the result as text into your question? Thanks. – Allan Cameron Oct 02 '20 at 21:28
  • 1
    @AllanCameron Thank you for the comment and now I learn I can use the dput() to do that. At first I thought this was more of a figure formatting problem. Sorry for the sloppiness, still learning my way around coding... I will update the question right now. – ML33M Oct 03 '20 at 04:40

1 Answers1

4

You control that in geom_stratum. Try this

library(ggplot2)
library(ggalluvial)
library(RColorBrewer)

# One colour per cluster label, shared by the flows and the stratum boxes.
# The consensus column is numeric and the single-drug column is character,
# so both are converted to character labels before collecting the unique ids.
cluster_ids <- sort(unique(c(
  as.character(CLL3S$`Cluster assigned consensus`),
  as.character(CLL3S$`Cluster assigned single drug`)
)))

# Named palette: the names are the cluster labels, so colours are matched by
# label rather than by position. colorRampPalette() interpolates the 8 "Set2"
# colours if there are ever more than 8 clusters.
# (An earlier version also defined `mycolor2` via brewer.pal(2, "Set2"), which
# is below the palette's minimum of 3 colours and was never used, and patched
# the legend with guides(override.aes = ...). Neither is needed once the fill
# is discrete.)
mycolor1 <- setNames(
  colorRampPalette(brewer.pal(8, "Set2"))(length(cluster_ids)),
  cluster_ids
)

ggplot(
  data = CLL3S,
  aes(
    y = count,
    axis1 = `Cluster assigned consensus`,
    axis2 = `Cluster assigned single drug`
  )
) +
  scale_x_discrete(
    limits = c("Consensus cluster", "Single-drug cluster"),
    expand = c(.1, .1)
  ) +
  labs(x = "Clusters", fill = "Cluster") +
  # factor() makes the fill discrete, so the legend lists every cluster
  # instead of showing a continuous gradient.
  geom_alluvium(
    aes(fill = factor(`Cluster assigned consensus`)),
    width = 1 / 4
  ) +
  # `width` controls the size of the boxes. Filling by the computed stratum
  # label gives each box the colour of its cluster, so the consensus boxes
  # match the flows leaving them.
  geom_stratum(
    aes(fill = after_stat(stratum)),
    width = 1 / 4,
    color = "red"
  ) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_fill_manual(values = mycolor1) +
  theme_minimal() +
  ggtitle(
    "Patient flow between the Consensus clusters and Single-drug treated clusters",
    "3S stimulated patients"
  )

output

YBS
  • 19,324
  • 2
  • 9
  • 27
  • thank you for the teaching! This is exactly what I wanted in my head! – ML33M Oct 03 '20 at 16:18
  • Hi sorry I'm playing around with the code, and I wonder what does the mycolor2 and the guides() inside the ggplot do – ML33M Oct 04 '20 at 12:52
  • That is just modifying the legend. Try commenting out the guides() line and run it. You should get your original legend in a gradient scale. You should also test by giving one color and so on. – YBS Oct 04 '20 at 13:57
  • yep, I'm playing with the legends like you suggested. Since the current legend only shows clusters 2, 4, 6 and 8, I'm aiming to have all 8 listed. Commenting it out seems to give a blurry gradient. – ML33M Oct 04 '20 at 15:15