1

Looking for a way to graph density ridges on pre aggregated data. Is there a way to feed pre aggregated data to geom_density_ridges or something similar? Particularly looking for a solution that doesn't entail blowing out replicates/rows from the count totals.

Example:

library(dplyr)
library(ggridges)
library(zoo)
library(magrittr)
library(ggplot2)

dat <- data.frame(qtr = rep(c(as.yearqtr(Sys.Date()),
                              as.yearqtr(Sys.Date()-94),
                              as.yearqtr(Sys.Date()-185),
                              as.yearqtr(Sys.Date()-280)), 1000) %>%
                    factor(ordered = TRUE),
                  age = rbeta(4000, 8, 8) %>% 
                    multiply_by(100) %>% 
                    ceiling)

with subject-level records

dat %>%
  ggplot(aes(x = age, y = qtr, fill = qtr)) +
  geom_density_ridges(quantile_lines=TRUE,
                      quantile_fun=function(x,...) quantile(x, .5),
                      alpha = .5)

pre-summarised count data looking to use

dat_summy <- dat %>%
  group_by(qtr, age) %>%
  summarise(count = n()) 
Z.Lin
  • 28,055
  • 6
  • 54
  • 94
  • I presume by "doesn't entail blowing out replicates/rows from the count totals" you mean "don't use `tidyr::uncount(count)`" or some variation with sampling? – Jon Spring Mar 31 '23 at 23:16

1 Answers1

0

The underlying stats::density() function used by ggridges for density estimation is able to accept a weights parameter, so this is certainly possible. I took a look at the package's GH repository, where this has been an open request for a while now, so it's unlikely that it will be changed just yet.

But if you are comfortable with hacking the underlying ggproto objects, this is certainly possible for the use case you outlined here.

p.s. Do note that the data still needs to be temporarily "blown up" during density estimation in order to make appropriate calculations for bandwidth, standard deviation, etc., because the functions used for these do not accept a weight parameter, and it seems more robust to blow up the dataset than to find weighted alternatives from other, probably less known R packages.

Plotting code

# first, run the code at the end of this answer to define StatDensityRidges2

p1 <- dat %>%
  ggplot(aes(x = age, y = qtr, fill = qtr)) +
  geom_density_ridges(quantile_lines=TRUE,
                      quantile_fun=function(x,...) quantile(x, .5),
                      stat = StatDensityRidges2,
                      alpha = .5); p1

p2 <- dat_summy %>%
  ggplot(aes(x = age, y = qtr, fill = qtr, weight = count)) +
  geom_density_ridges(quantile_lines=TRUE,
                      quantile_fun=function(x,...) quantile(x, .5),
                      alpha = .5,
                      stat = StatDensityRidges2); p2

Comparison of results

> print(p1); print(p2) # look identical
> sapply(c(p1, p2), object.size) # p2's stored data is much smaller
       data      layers      scales     mapping       theme coordinates       facet    plot_env      labels 
      49552         536         424        2704          48         480         480          56        1112 
       data      layers      scales     mapping       theme coordinates       facet    plot_env      labels 
       6072         536         424        3440          48         480         480          56        1312 

2 plots side by side

Data

set.seed(123) # set code for reproducibility
dat <- data.frame(qtr = rep(c(as.yearqtr(Sys.Date()),
                              as.yearqtr(Sys.Date()-94),
                              as.yearqtr(Sys.Date()-185),
                              as.yearqtr(Sys.Date()-280)), 1000) %>%
                    factor(ordered = TRUE),
                  age = rbeta(4000, 8, 8) %>% 
                    magrittr::multiply_by(100) %>% 
                    ceiling)
dat_summy <- dat %>%
  group_by(qtr, age) %>%
  summarise(count = n(),
            .groups = "drop") 

Code used to define StatDensityRidges2

StatDensityRidges2 <- ggproto(
  "StatDensityRidges2",
  StatDensityRidges,
  dropped_aes = "weight",
  non_missing_aes = "weight",
  setup_data = function(data, params) {
    # add weight to dataset if it's not already defined
    if(is.null(data$weight)) data$weight <- 1
    data
  },
  calc_panel_params = function(data, params) {
    # need to "blow out" data here in order to calculate bandwidth appropriately
    if (is.null(params$bandwidth)) {
      if(!is.null(data$weight)) {
        xdata <- na.omit(data.frame(x=data$x, group=data$group, weight=data$weight))
        xdata <- xdata[rep(seq_along(xdata$weight), xdata$weight), ]
      } else {
        xdata <- na.omit(data.frame(x=data$x, group=data$group))
      }
      xs <- split(xdata$x, xdata$group)
      xs_mask <- vapply(xs, length, numeric(1)) > 1
      bws <- vapply(xs[xs_mask], bw.nrd0, numeric(1))
      bw <- mean(bws, na.rm = TRUE)
      message("Picking joint bandwidth of ", signif(bw, 3))
      
      params$bandwidth <- bw
    }
    
    if (is.null(params$from)) {
      params$from <- min(data$x, na.rm=TRUE) - 3 * params$bandwidth
    }
    
    if (is.null(params$to)) {
      params$to <- max(data$x, na.rm=TRUE) + 3 * params$bandwidth
    }
    
    data.frame(
      bandwidth = params$bandwidth,
      from = params$from,
      to = params$to
    )
  },
  compute_group = function (data, scales, from, to, bandwidth = 1, calc_ecdf = FALSE, 
                            jittered_points = FALSE, quantile_lines = FALSE, quantiles = 4, 
                            quantile_fun = quantile, n = 512) {
    # add weight as one of the parameters to be passed to stats::density;
    # also need to temporarily "blow up" data for calculation of qx
    if(sum(data$weight) < 3)
      return(data.frame())
    if (is.null(calc_ecdf)) 
      calc_ecdf <- FALSE
    if (is.null(jittered_points)) 
      jittered_points <- FALSE
    if (is.null(quantile_lines)) 
      quantile_lines <- FALSE
    if (quantile_lines) 
      calc_ecdf <- TRUE
    panel <- unique(data$PANEL)
    if (length(panel) > 1) {
      stop("Error: more than one panel in compute group; something's wrong.")
    }
    panel_id <- as.numeric(panel)
    d <- stats::density(data$x, bw = bandwidth[panel_id], 
                        from = from[panel_id], to = to[panel_id], 
                        na.rm = TRUE, n = n,
                        weights = data$weight / sum(data$weight))
    d$n <- sum(data$weight)
    maxdens <- max(d$y, na.rm = TRUE)
    densf <- approxfun(d$x, d$y, rule = 2)
    if (jittered_points) {
      df_jittered <- data.frame(x = data$x, density = densf(data$x), 
                                ndensity = densf(data$x)/maxdens, datatype = "point", 
                                stringsAsFactors = FALSE)
      df_points <- data[grepl("point_", names(data))]
      if (ncol(df_points) == 0) {
        df_points <- NULL
        df_points_dummy <- NULL
      }
      else {
        df_jittered <- cbind(df_jittered, df_points)
        df_points_dummy <- na.omit(df_points)[1, , drop = FALSE]
      }
    }
    else {
      df_jittered <- NULL
      df_points_dummy <- NULL
    }
    if ((length(quantiles) == 1) && (all(quantiles >= 1))) {
      if (quantiles > 1) {
        probs <- seq(0, 1, length.out = quantiles + 1)[2:quantiles]
      }
      else {
        probs <- NA
      }
    }
    else {
      probs <- quantiles
      probs[probs < 0 | probs > 1] <- NA
    }
    qx <- na.omit(quantile_fun(rep(data$x, times = data$weight), probs = probs))
    df_quantiles <- NULL
    if (quantile_lines && length(qx) > 0) {
      qy <- densf(qx)
      df_quantiles <- data.frame(x = qx, density = qy, ndensity = qy/maxdens, 
                                 datatype = "vline", stringsAsFactors = FALSE)
      if (!is.null(df_points_dummy)) {
        df_quantiles <- data.frame(df_quantiles, as.list(df_points_dummy))
      }
    }
    df_nondens <- rbind(df_quantiles, df_jittered)
    if (calc_ecdf) {
      n <- length(d$x)
      ecdf <- c(0, cumsum(d$y[1:(n - 1)] * (d$x[2:n] - d$x[1:(n - 1)])))
      ecdf_fun <- approxfun(d$x, ecdf, rule = 2)
      ntile <- findInterval(d$x, qx, left.open = TRUE) + 1
      if (!is.null(df_nondens)) {
        df_nondens <- data.frame(df_nondens, ecdf = ecdf_fun(df_nondens$x), 
                                 quantile = findInterval(df_nondens$x, qx, left.open = TRUE) + 
                                   1)
      }
      df_density <- data.frame(x = d$x, density = d$y, ndensity = d$y/maxdens, 
                               ecdf = ecdf, quantile = ntile, datatype = "ridgeline", 
                               stringsAsFactors = FALSE)
    }
    else {
      df_density <- data.frame(x = d$x, density = d$y, ndensity = d$y/maxdens, 
                               datatype = "ridgeline", stringsAsFactors = FALSE)
    }
    if (!is.null(df_points_dummy)) {
      df_density <- data.frame(df_density, as.list(df_points_dummy))
    }
    df_final <- rbind(df_density, df_nondens)
    if ("quantile" %in% names(df_final)) {
      df_final$quantile <- factor(df_final$quantile)
    }
    df_final
  }
)
Z.Lin
  • 28,055
  • 6
  • 54
  • 94