I plotted a series of bars that represent a simulation of confidence intervals of a proportion. I want to add a line to each bar representing the proportion of successes.
The proportion I want to plot is in the data frame for the plot. I haven't figured out how to add a line element for that data point inside each individual bar.
The visualization come from page 36 of Harvey Matulsky's Intuitive Biostatistics. This is a simulation of taking samples from a given sample space, recording the proportion of successes, and computing the confidence intervals.
I plotted the bars using geom_segment so I could make the bars start at the lower end of the confidence interval instead of drawing them starting on the x axis. I added one horizontal line to the whole chart showing the true proportion of successes (red balls in a collection of balls colored red and white) in the sample space.
I tried doing stuff with geom_hline and geom_segment mapped to the data point trials_df$proportion. I couldn't get on the right track with that.
Here is the code for my whole visualization. It's factored into some functions, and then the whole simulation runs, printing the data frame for the plot and then running the plot I have so far (lacking the proportion line on each bar).
library(ggplot2)
run_trials <- function(sample_space, N) {
sample(sample_space,
size = N,
replace = TRUE)
}
success_count <- function(trials, success_value) {
result <- sum(trials == success_value)
result
}
proportion <- function(trials, success_value) {
success_count(trials, success_value) / length(trials)
}
wald_mod <- function(success_count, trial_count) {
z <- 1.96
p_prime <- (success_count + (0.5 * z^2)) / (trial_count + z^2)
W <- z * sqrt((p_prime * (1 - p_prime)) / (trial_count + z^2))
result <- c((p_prime - W), (p_prime + W))
result
}
get_trial_results <- function(trials, success_value) {
p <- proportion(trials, success_value)
successes <- success_count(trials, success_value)
confidence_interval <- wald_mod(successes, length(trials))
result <- list(p, confidence_interval)
result
}
run_simulation <- function() {
sample_space <- c(rep('Red', 25), rep('White', 75))
N <- 15
trials_df <- data.frame(trials_index = integer(),
proportion = double(),
ci_min = double(),
ci_max = double())
for (i in 1:20) {
t <- run_trials(sample_space, N)
t_results <- get_trial_results(t, "Red")
trials_df <- rbind(trials_df, c(i, t_results[[1]][1], t_results[[2]][1], t_results[[2]][2]))
}
names(trials_df) <- c("trials_index", "proportion", "ci_min", "ci_max")
print(trials_df)
ggplot(trials_df, aes(trials_index, ci_max)) +
geom_segment(aes(xend = trials_index, yend = ci_min), size = 4, lineend = "butt",
color = "turquoise4") +
geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed")
}
run_simulation()
I added @Simon 's solution to my code and improved the labeling of my plot. Developing this little simulation helped me understand Confidence Intervals.
library(ggplot2)
run_experiment <- function(sample_space, N) {
sample(sample_space,
size = N,
replace = TRUE)
}
success_count <- function(experiment, success_value) {
result <- sum(experiment == success_value)
result
}
proportion <- function(experiment, success_value) {
success_count(experiment, success_value) / length(experiment)
}
wald_mod <- function(success_count, trial_count) {
z <- 1.96
p_prime <- (success_count + (0.5 * z^2)) / (trial_count + z^2)
W <- z * sqrt((p_prime * (1 - p_prime)) / (trial_count + z^2))
result <- c((p_prime - W), (p_prime + W))
result
}
get_experiment_results <- function(experiment, success_value) {
p <- proportion(experiment, success_value)
successes <- success_count(experiment, success_value)
confidence_interval <- wald_mod(successes, length(experiment))
p_plot_value <- confidence_interval[1] + p * abs(diff(confidence_interval))
result <- list(c(p, p_plot_value), confidence_interval)
result
}
run_simulation <- function() {
sample_space <- c(rep('Red', 25), rep('White', 75))
N <- 15
experiments_df <- data.frame()
for (i in 1:20) {
t <- run_experiment(sample_space, N)
t_results <- get_experiment_results(t, "Red")
experiments_df <- rbind(experiments_df, c(i, t_results[[1]][[1]], t_results[[1]][[2]], t_results[[2]][[1]], t_results[[2]][[2]]))
}
names(experiments_df) <- c("experiment_index", "proportion", "proportion_plot_value", "ci_min", "ci_max")
print(experiments_df)
# Jaap's answer on SO solves floating bar plot.
# https://stackoverflow.com/questions/29916770/geom-bar-from-min-to-max-data-value
# Simon's answer to me on SO solves plotting the proportion.
# https://stackoverflow.com/questions/29916770/geom-bar-from-min-to-max-data-value
ggplot(experiments_df, aes(experiment_index)) +
geom_segment(aes(xend = experiment_index, yend = ci_min, y = ci_max), size = 4, lineend = "butt",
color = "turquoise4") +
geom_segment(aes(xend = experiment_index, yend = proportion_plot_value-.001, y = proportion_plot_value+.001), size = 4, lineend = "butt",
color = "black") +
geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed") +
coord_cartesian(ylim = c(0, 1)) +
labs(x = "Experiment", y = "Probability",
title = "Each bar shows 95% CI computed from one
simulated experiment",
subtitle = "Dashed line is true proportion in sample space",
caption = "Intuitive Biostatistics. Harvey Mitulsky. p. 36")
}
run_simulation()
My final plot (which my reputation points don't yet permit me to paste)