1

I am plotting the same data once as geom_point() and once as geom_boxplot(), but the width of my boxplots seems to be off. The largest x is at 292, but the corresponding box is smaller than 285. How can i get this to the correct size?

enter image description here

Here is a minimal example of my data:

x = data.frame(cluster = c("c1","c2","c3","c4","c5","c6"), 
               elements = c(292,277,170,160,153,141), 
               divergence = c(0.08344564,0.12130600,0.05564219,0.12826086,0.05386341,0.09620389))

x.160 = x[x$elements >= 160,]
x.160$Size = "160+"

x.60 = x[x$elements >= 60 & x$elements < 160,]
x.60$Size = "60-160"

binnedX = rbind(x.60,x.160)


p1a = ggplot(data = binnedX, aes(x = elements, y =  divergence, group = Size))+
  geom_hline(yintercept = mean(binnedX$divergence), color = "black", linetype=2)+
  geom_point(aes(color = Size))+ 
  scale_x_continuous(breaks = c(seq(0,300,15))) +
  scale_y_continuous(breaks = seq(0.00,0.25,0.05))+
  scale_color_brewer(palette = "Spectral") +  
  ggtitle("element sequence divergence by cluster [clustalO, 100bp]") +
  labs(x="Elements per cluster", y="Divergence")+
  theme_linedraw(base_size = 18)+
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(margin = margin(15,15,15,15,"pt")), 
        axis.title.y = element_text(margin = margin(15,15,15,15,"pt")))

p2a = ggplot(data = binnedX, aes(x = elements, y =  divergence, group = Size))+
  geom_hline(yintercept = mean(binnedX$divergence), color = "Red", linetype=2)+
  geom_boxplot(aes(fill = Size)) +
  scale_x_continuous(breaks = c(seq(0,300,15)))+
  scale_y_continuous(breaks = seq(0.00,0.25,0.05))+
  scale_fill_brewer(palette = "Spectral") +
  labs(x="Elements per cluster", y="Divergence")+
  theme_linedraw(base_size = 18)+
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(margin = margin(15,15,15,15,"pt")), 
        axis.title.y = element_text(margin = margin(15,15,15,15,"pt")),
        axis.text.x = element_text(angle=0))

multiplot(p1a,p2a)
voiDnyx
  • 975
  • 1
  • 11
  • 24
  • Can you provide `dput(binnedEleData)` and `dput(eleData)` in your question necessary to create those two plots? – acylam Jun 18 '18 at 16:02
  • 1
    I edited my post with a minimal example for my problem, instead of my actual data. – voiDnyx Jun 19 '18 at 10:07

1 Answers1

2

When we create box plots by group, the width of each box is hard coded as 90% of the group's data range. We can see this from the compute_group() function in StatBoxplot:

# reproducing lines 87-88 of stat-boxplot.r
if (length(unique(data$x)) > 1)
  width <- diff(range(data$x)) * 0.9

We can override this by defining a modified compute_group() function based on StatBoxplot$compute_group:

modified.function <- function(data, scales, width = NULL, na.rm = FALSE, coef = 1.5) {
  qs <- c(0, 0.25, 0.5, 0.75, 1)
  if (!is.null(data$weight)) {
    mod <- quantreg::rq(y ~ 1, weights = weight, data = data, tau = qs)
    stats <- as.numeric(stats::coef(mod))
  }
  else {
    stats <- as.numeric(stats::quantile(data$y, qs))
  }
  names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
  iqr <- diff(stats[c(2, 4)])
  outliers <- data$y < (stats[2] - coef * iqr) | data$y > (stats[4] + coef * iqr)
  if (any(outliers)) {
    stats[c(1, 5)] <- range(c(stats[2:4], data$y[!outliers]), na.rm = TRUE)
  }
  if (length(unique(data$x)) > 1)
    width <- diff(range(data$x)) * 1 # instead of 0.9
  df <- as.data.frame(as.list(stats))
  df$outliers <- list(data$y[outliers])
  if (is.null(data$weight)) {
    n <- sum(!is.na(data$y))
  }
  else {
    n <- sum(data$weight[!is.na(data$y) & !is.na(data$weight)])
  }
  df$notchupper <- df$middle + 1.58 * iqr/sqrt(n)
  df$notchlower <- df$middle - 1.58 * iqr/sqrt(n)
  df$x <- if (is.factor(data$x))
    data$x[1]
  else mean(range(data$x))
  df$width <- width
  df$relvarwidth <- sqrt(n)
  df}

Create a modified stat class based off StatBoxplot, as well as the corresponding layer function for it based off stat_boxplot:

StatBoxplot2 <- ggproto(`_class` = "StatBoxplot2",          # class name
                        `_inherit` = StatBoxplot,           # inherit from StatBoxplot
                        compute_group = modified.function)  # override its compute_group function

stat_boxplot2 <- function(mapping = NULL, data = NULL, geom = "boxplot", position = "dodge2", ...,
                           coef = 1.5, na.rm = FALSE, show.legend = NA, inherit.aes = TRUE){
  layer(data = data, mapping = mapping, 
        stat = StatBoxplot2,                           # use StatBoxplot2 rather than StatBoxplot
        geom = geom, position = position, 
        show.legend = show.legend, inherit.aes = inherit.aes, 
        params = list(na.rm = na.rm, coef = coef, ...))
}

Compare a boxplot that uses the default stat = "boxplot", with one that uses our modified stat = "boxplot2":

# Base plot with vertical dashed lines to indicate each point's position, & theme_classic for a
# less cluttered background. I also changed the palette as Spectral's yellow was really hard to see.
p <- ggplot(data = binnedX, 
            aes(x = elements, y =  divergence, fill = Size))+
  geom_point(aes(color = Size), size = 3) +
  geom_vline(aes(xintercept = elements), linetype = "dashed") +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1", guide = FALSE) +
  theme_classic()

gridExtra::grid.arrange(p + ggtitle("original") + geom_boxplot(alpha = 0.5),
                        p + ggtitle("modified") + geom_boxplot(alpha = 0.5, stat = "boxplot2"))

plot

Z.Lin
  • 28,055
  • 6
  • 54
  • 94