0

I have a problem handling many identical observations in `geom_dotplot()`: the dot stacks of neighbouring groups overlap each other:

# Example data: v1 is mapped to the x axis, v2 to the fill, v3 to the y axis.
v1 <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)
v2 <- c(0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)
v3 <- c(13,67,89,280,40,1,23,99,32,1,75,280,270,200,196,300,320,277,23,4,1,2,5,89,45,23,11,1,3,23,100,100,100,100,100,200,100,11,6,6,123,100,100,100,100,100,12,86,11,300,75,100,110,19,299,100,100,100,100,100,100,100,100,11,100,120,110,100,100,300,300,250,100,100,100,12,100,100,75,5,10,10,10,10,10)

# Build the data frame with both grouping columns converted to factors
# in the same step.
summary <- data.frame(
  v1 = as.factor(v1),
  v2 = as.factor(v2),
  v3 = v3
)

# Box plot and centred dot plot share the same dodge width so that the
# boxes and the dot stacks of one v2 level are drawn at the same x position.
ggplot(summary, aes(x = v1, y = v3, fill = v2)) +
  geom_boxplot(width = 0.5, position = position_dodge(0.75)) +
  geom_dotplot(
    binaxis = "y",
    stackdir = "center",
    binwidth = 3.25,
    position = position_dodge(0.75)
  )

Example as image

I thought about manually changing the data so that no more than 5 observations share the same value (like v3 <- (... 100, 100, 100, 100, 100, 110, 110, 110, 110, 110, 120, 120, 120, 120, 120, 130, ...)). However, that would also change the results shown by the boxplots (median, interquartile range).

I could not find any option that automatically breaks a stack after 5 dots so that there is no overlap. Maybe there is a simple and clever solution. All your help is appreciated. Thank you in advance!

Allan Cameron
  • 147,086
  • 7
  • 49
  • 87

2 Answers

2

You can make a smaller dataset that keeps at most five observations for each identical value within a group (summary2 below). You can use the original data to make the boxes and the smaller data to make the points.

library(dplyr)
library(ggplot2)  
# Example data: v1 is mapped to the x axis, v2 to the fill, v3 to the y axis.
v1 <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)
v2 <- c(0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)
v3 <- c(13,67,89,280,40,1,23,99,32,1,75,280,270,200,196,300,320,277,23,4,1,2,5,89,45,23,11,1,3,23,100,100,100,100,100,200,100,11,6,6,123,100,100,100,100,100,12,86,11,300,75,100,110,19,299,100,100,100,100,100,100,100,100,11,100,120,110,100,100,300,300,250,100,100,100,12,100,100,75,5,10,10,10,10,10)

summary <- data.frame(v1, v2, v3)

summary$v1 <- as.factor(summary$v1)
summary$v2 <- as.factor(summary$v2)

# Reduced copy used only for the dots: within every identical
# (v1, v2, v3) combination keep at most the first five rows, so identical
# values contribute at most five dots.
# row_number() replaces the 1:n() idiom and keeps the original row order;
# ungroup() drops the grouping so later code receives a plain tibble.
summary2 <- summary %>%
  group_by(v1, v2, v3) %>%
  filter(row_number() <= 5) %>%
  ungroup()

# The boxes are drawn from the full data, so their statistics are unchanged;
# only the dot layer uses the reduced data.
ggplot() +
  geom_boxplot(
    data = summary,
    aes(x = v1, y = v3, fill = v2),
    width = 0.5,
    position = position_dodge(0.75)
  ) +
  geom_dotplot(
    data = summary2,
    aes(x = v1, y = v3, fill = v2),
    binaxis = "y",
    stackdir = "center",
    binwidth = 3.25,
    position = position_dodge(0.75)
  )

Created on 2023-01-22 by the reprex package (v2.0.1)

DaveArmstrong
  • 18,377
  • 2
  • 13
  • 25
  • Thank you for your useful answer. The approach works. However, each observation must be displayed as a single point so as not to underestimate the total number. – StatNewbie Jan 29 '23 at 18:45
1

One option is to find out where the counts are going to be high, then add some random noise to just those observations. If you do this only in the data passed to the layer with the points, it will not affect the box plots.

library(dplyr)

# Box plots use the unmodified plot data; only the dot layer gets a
# jittered copy, so the box statistics are not affected.
ggplot(summary, aes(x = v1, y = v3, fill = v2)) +
  geom_boxplot(width = 0.5, position = position_dodge(0.75)) +
  geom_dotplot(
    # Layer-level data function applied to the plot data: rows are grouped
    # by every column, and any group of more than six identical rows has
    # uniform noise in [-5, 5] added to v3. Smaller groups are left as is.
    # group_by(across(everything())) replaces the superseded group_by_all();
    # ungroup() hands a plain tibble to the layer.
    # The noise is random, so call set.seed() beforehand for a
    # reproducible figure.
    data = . %>%
      group_by(across(everything())) %>%
      mutate(v3 = if (n() > 6) v3 + runif(n(), -5, 5) else v3) %>%
      ungroup(),
    binaxis = "y",
    stackdir = "center",
    binwidth = 3.25,
    position = position_dodge(0.75),
    dotsize = 1.2
  )

enter image description here

Another option is to use geom_beeswarm rather than geom_dotplot, applying the same approach to the data:

library(ggbeeswarm)

# Box plots use the unmodified plot data; only the beeswarm layer gets a
# jittered copy, so the box statistics are not affected.
ggplot(summary, aes(x = v1, y = v3, fill = v2)) +
  geom_boxplot(width = 0.5, position = position_dodge(0.75)) +
  geom_beeswarm(
    # Layer-level data function applied to the plot data: rows are grouped
    # by every column, and any group of more than six identical rows has
    # uniform noise in [-5, 5] added to v3. Smaller groups are left as is.
    # group_by(across(everything())) replaces the superseded group_by_all();
    # ungroup() hands a plain tibble to the layer.
    data = . %>%
      group_by(across(everything())) %>%
      mutate(v3 = if (n() > 6) v3 + runif(n(), -5, 5) else v3) %>%
      ungroup(),
    shape = 21,
    dodge.width = 0.75,
    priority = "density"
  )

enter image description here

Allan Cameron
  • 147,086
  • 7
  • 49
  • 87