0

I plotted a series of bars that represent a simulation of confidence intervals of a proportion. I want to add a line to each bar representing the proportion of successes.

The proportion I want to plot is in the data frame for the plot. I haven't figured out how to add a line element for that data point inside each individual bar.

The visualization come from page 36 of Harvey Matulsky's Intuitive Biostatistics. This is a simulation of taking samples from a given sample space, recording the proportion of successes, and computing the confidence intervals.

I plotted the bars using geom_segment so I could make the bars start at the lower end of the confidence interval instead of drawing them starting on the x axis. I added one horizontal line to the whole chart showing the true proportion of successes (red balls in a collection of balls colored red and white) in the sample space.

I tried doing stuff with geom_hline and geom_segment mapped to the data point trials_df$proportion. I couldn't get on the right track with that.

Here is the code for my whole visualization. It's factored into some functions, and then the whole simulation runs, printing the data frame for the plot and then running the plot I have so far (lacking the proportion line on each bar).


    library(ggplot2)

    run_trials <- function(sample_space, N) {
            sample(sample_space,
                   size = N,
                   replace = TRUE)
    }

    success_count <- function(trials, success_value) {
            result <- sum(trials == success_value)
            result
    }

    proportion <- function(trials, success_value) {
            success_count(trials, success_value) / length(trials)
    }

    wald_mod <- function(success_count, trial_count) {
            z <- 1.96
            p_prime <- (success_count + (0.5 * z^2)) / (trial_count + z^2)
            W <- z * sqrt((p_prime * (1 - p_prime)) / (trial_count + z^2))
            result <- c((p_prime - W), (p_prime + W))
            result
    }

    get_trial_results <- function(trials, success_value) {
            p <- proportion(trials, success_value)
            successes <- success_count(trials, success_value)
            confidence_interval <- wald_mod(successes, length(trials))

            result <- list(p, confidence_interval)
            result
    }

    run_simulation <- function() {
            sample_space <- c(rep('Red', 25), rep('White', 75))
            N <- 15

            trials_df <- data.frame(trials_index = integer(),
                                    proportion = double(),
                                    ci_min = double(),
                                    ci_max = double())

            for (i in 1:20) {
                    t <- run_trials(sample_space, N)
                    t_results <- get_trial_results(t, "Red")
                    trials_df <- rbind(trials_df, c(i, t_results[[1]][1], t_results[[2]][1], t_results[[2]][2]))
            }
            names(trials_df) <- c("trials_index", "proportion", "ci_min", "ci_max")

            print(trials_df)

            ggplot(trials_df, aes(trials_index, ci_max)) +
                    geom_segment(aes(xend = trials_index, yend = ci_min), size = 4, lineend = "butt",
                                 color = "turquoise4") +
                    geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed")
    }

    run_simulation()

I added @Simon 's solution to my code and improved the labeling of my plot. Developing this little simulation helped me understand Confidence Intervals.


    library(ggplot2)

    run_experiment <- function(sample_space, N) {
            sample(sample_space,
                   size = N,
                   replace = TRUE)
    }

    success_count <- function(experiment, success_value) {
            result <- sum(experiment == success_value)
            result
    }

    proportion <- function(experiment, success_value) {
            success_count(experiment, success_value) / length(experiment)
    }

    wald_mod <- function(success_count, trial_count) {
            z <- 1.96
            p_prime <- (success_count + (0.5 * z^2)) / (trial_count + z^2)
            W <- z * sqrt((p_prime * (1 - p_prime)) / (trial_count + z^2))
            result <- c((p_prime - W), (p_prime + W))
            result
    }

    get_experiment_results <- function(experiment, success_value) {
            p <- proportion(experiment, success_value)
            successes <- success_count(experiment, success_value)
            confidence_interval <- wald_mod(successes, length(experiment))
            p_plot_value <- confidence_interval[1] + p * abs(diff(confidence_interval))

            result <- list(c(p, p_plot_value), confidence_interval)
            result
    }

    run_simulation <- function() {
            sample_space <- c(rep('Red', 25), rep('White', 75))
            N <- 15

            experiments_df <- data.frame()

            for (i in 1:20) {
                    t <- run_experiment(sample_space, N)
                    t_results <- get_experiment_results(t, "Red")

                    experiments_df <- rbind(experiments_df, c(i, t_results[[1]][[1]], t_results[[1]][[2]], t_results[[2]][[1]], t_results[[2]][[2]]))
            }
            names(experiments_df) <- c("experiment_index", "proportion", "proportion_plot_value", "ci_min", "ci_max")

            print(experiments_df)

            # Jaap's answer on SO solves floating bar plot.
            # https://stackoverflow.com/questions/29916770/geom-bar-from-min-to-max-data-value
            # Simon's answer to me on SO solves plotting the proportion.
            # https://stackoverflow.com/questions/29916770/geom-bar-from-min-to-max-data-value
            ggplot(experiments_df, aes(experiment_index)) +
                    geom_segment(aes(xend = experiment_index, yend = ci_min, y = ci_max), size = 4, lineend = "butt",
                                 color = "turquoise4") +
                    geom_segment(aes(xend = experiment_index, yend = proportion_plot_value-.001, y = proportion_plot_value+.001), size = 4, lineend = "butt",
                                 color = "black") +
                    geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed") +
                    coord_cartesian(ylim = c(0, 1)) +
                    labs(x = "Experiment", y = "Probability",
                         title = "Each bar shows 95% CI computed from one
    simulated experiment",
                         subtitle = "Dashed line is true proportion in sample space",
                         caption = "Intuitive Biostatistics. Harvey Mitulsky. p. 36") 
    }

    run_simulation()

My final plot (which my reputation points don't yet permit me to paste)

Jai Jeffryes
  • 511
  • 5
  • 12

1 Answers1

1

First calculate the proportion with respect to the lower end of the bar:

trials_df <- data.frame(trials_index = integer(),
                          proportion = double(),
                          ci_min = double(),
                          ci_max = double())

  for (i in 1:20) {
    t <- run_trials(sample_space, N)
    t_results <- get_trial_results(t, "Red")
    trials_df <- rbind(trials_df, c(i, t_results[[1]][1], t_results[[2]][1], t_results[[2]][2], t_results[[2]][1]+t_results[[1]][1]*asbs(diff(t_results[[2]][2], t_results[[2]][1]))))
  }
  names(trials_df) <- c("trials_index", "proportion", "ci_min", "ci_max", 'proportion_max')

For just a small horizontal line on each of your bars you could do:

  ggplot(trials_df, aes(trials_index, ci_max)) +
    geom_segment(aes(xend = trials_index, yend = ci_min), size = 4, #lineend = "butt",
                 color = "turquoise4") +
    geom_segment(aes(xend = trials_index, yend = proportion_max-.001, y = proportion_max+.001), size = 4, lineend = "butt",
                 color = "turquoise3") +
    geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed")

enter image description here

Is one of these what you want?

To color the lower proportion of each bar can can do:

ggplot(trials_df, aes(trials_index, ci_max)) +
    geom_segment(aes(xend = trials_index, yend = ci_min), size = 4, #lineend = "butt",
                 color = "turquoise4") +
    geom_segment(aes(xend = trials_index, yend = ci_min, y = proportion_max), size = 4, lineend = "butt",
                 color = "turquoise3") +
    geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed")

enter image description here

Simon
  • 577
  • 3
  • 9
  • Simon, your first one is exactly what I’m trying to do. The second one is another cool idea, but that first one is straight out of the book I’m emulating. Thanks. (Am I really not supposed put in “thank you” comments? – Jai Jeffryes Apr 20 '19 at 21:01
  • Glad to help (event though the important part of the code was mssing until edit)! Feel free to express your thanks and mark the answer as correct :) – Simon Apr 20 '19 at 21:17
  • I added your approach to my function that collects the results. You also taught me the diff() function in R. I think in Excel and try to make things happen in R. That one, diff(), is another helpful trick. I improved the labeling of my plot and now it looks just like the one in the book. Developing this little simulation helped me understand Confidence Intervals. – Jai Jeffryes Apr 21 '19 at 16:12