I want to create a facetted area plot showing the cumulative proportion of each category, with a trend line and a Pearson's correlation coefficient.
I've so far used this code (exemplified with the iris data set):
# Create data set ====
ciformat <- function(val) { paste0(sprintf("%.2f", val)) }
pformat_P <- function(val) { paste0("P ", ifelse(val < 0.001, "<.001", paste0("=", sub("^(-?)0.", "\\1.", sprintf("%.3f", val))))) }
figure_data <- iris[,c("Petal.Width", "Species")]
figure_data <- table(figure_data)
figure_data <- as.data.frame(figure_data)
figure_data <- figure_data %>% group_by(Species) %>% mutate(Proportion = Freq / sum(Freq)) %>% ungroup() %>% as.data.frame()
figure_data$Species <- factor(figure_data$Species, levels = c("setosa", "versicolor", "virginica"))
as.data.frame(figure_data)
figure_data$Species_f = factor(figure_data$Species, levels=c("setosa", "versicolor", "virginica"))
# Create correlation values ====
figure_setosa <- figure_data[(figure_data$Species == "setosa"),c("Species", "Petal.Width", "Proportion")]
figure_setosa$Petal.Width <- as.numeric(figure_setosa$Petal.Width)
figure_setosa$Proportion <- as.numeric(figure_setosa$Proportion)
figure_setosa_r <- paste0("r = ", ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$estimate))
figure_setosa_ci1 <- ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$conf.int[1])
figure_setosa_ci2 <- ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$conf.int[2])
figure_setosa_p <- pformat_P(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$p.value)
figure_setosa_ci <- paste0("(95% CI ", figure_setosa_ci1, " to ", figure_setosa_ci2, ")")
figure_versicolor <- figure_data[(figure_data$Species == "versicolor"),c("Species", "Petal.Width", "Proportion")]
figure_versicolor$Petal.Width <- as.numeric(figure_versicolor$Petal.Width)
figure_versicolor$Proportion <- as.numeric(figure_versicolor$Proportion)
figure_versicolor_r <- paste0("r = ", ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$estimate))
figure_versicolor_ci1 <- ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$conf.int[1])
figure_versicolor_ci2 <- ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$conf.int[2])
figure_versicolor_p <- pformat_P(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$p.value)
figure_versicolor_ci <- paste0("(95% CI ", figure_versicolor_ci1, " to ", figure_versicolor_ci2, ")")
figure_virginica <- figure_data[(figure_data$Species == "virginica"),c("Species", "Petal.Width", "Proportion")]
figure_virginica$Petal.Width <- as.numeric(figure_virginica$Petal.Width)
figure_virginica$Proportion <- as.numeric(figure_virginica$Proportion)
figure_virginica_r <- paste0("r = ", ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$estimate))
figure_virginica_ci1 <- ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$conf.int[1])
figure_virginica_ci2 <- ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$conf.int[2])
figure_virginica_p <- pformat_P(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$p.value)
figure_virginica_ci <- paste0("(95% CI ", figure_virginica_ci1, " to ", figure_virginica_ci2, ")")
# Create figure ====
figure_data %>% mutate(Petal.Width = as.numeric(Petal.Width), decimals=1) %>%
mutate(Petal.Width = Petal.Width) %>% mutate(Proportion = as.numeric(Proportion)) %>%
ggplot(aes(x = Petal.Width, y = Proportion, fill = Species)) +
geom_area(position = "identity", color = "black", size = 0, alpha = 0.5) +
geom_smooth(aes(x=Petal.Width, y=Proportion, fill = Species), method=lm,
se=T, color="black", size=0.7, alpha=0.7) +
scale_y_continuous(labels = scales::percent) +
labs(x = "Petal.Width", y="Proportion (%)") +
geom_text(data = (data.frame(label = c(paste0(figure_setosa_r), paste0(figure_versicolor_r), paste0(figure_virginica_r)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.45, .45, .45))), mapping = aes(x = x, y = y, label = label)) +
geom_text(data = (data.frame(label = c(paste0(figure_setosa_ci), paste0(figure_versicolor_ci), paste0(figure_virginica_ci)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.40, .40, .40))), mapping = aes(x = x, y = y, label = label)) +
geom_text(data = (data.frame(label = c(paste0(figure_setosa_p), paste0(figure_versicolor_p), paste0(figure_virginica_p)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.35, .35, .35))), mapping = aes(x = x, y = y, label = label)) +
coord_cartesian(ylim = c(0, 1), clip = "on", expand = F) +
facet_grid(~fct_relevel(Species,"setosa", "versicolor", "virginica"))
However, I see that the Pearson's correlation coefficients are not correct here, as they use the proportion instead of the original iris$Petal.Width vs iris$Species.
How can I correct this so that the trend line and the Pearson's r is correct?
Addition 230410: Thank you @TarJae. So what I want to do is that the trend line and the correlation coefficient is correct. But when I do it from the original data, without the proportions data:
iris$Species = ifelse(iris$Species == "setosa", 1, 0)
cor.test(x=iris$Petal.Width, y=iris$Species, method = "pearson")
Then I get -0.89 (-0.91 to -0.85), P <.001, for setosa for example. While in my example with the proportions I get -0.52 (-0.77 to -0.13), P = .013, as in your example above.
The true correlation between pedal width and setosa must be the -0.89 rather than -0.52, right?
Kind regards