2

I am trying to add a line that extends only within the borders of the plotted area on a density graph. This is similar to the question "Adding summary information to a density plot created with ggplot", but the answers there do not go so far as to add a single vertical line confined to the plotted area (the line extends across the entire plot). I would like each of the vertical mean lines to stop at the top of the density geom, but I can't figure out a way to do this.

In short: I want to capture the value at the top of the density graph where the mean is for each group, and have the line end at that point.

Here is some code to demonstrate:

# Per-species mean of Sepal.Length together with mean -/+ one standard error.
iris <- as.data.table(iris)
iris_summary <- iris[
  ,
  {
    avg <- mean(Sepal.Length)
    se <- sd(Sepal.Length) / sqrt(length(Sepal.Length))
    .(sepal_mean = avg, sepal_se_low = avg - se, sepal_se_high = avg + se)
  },
  by = Species
]
unique(iris$Species)

# Kernel density estimate of Sepal.Length for each species.
x.dens.s <- density(iris[Species == "setosa", Sepal.Length])
x.dens.ve <- density(iris[Species == "versicolor", Sepal.Length])
x.dens.vi <- density(iris[Species == "virginica", Sepal.Length])

# Stack the three density curves into one long table labelled by species.
df.dens <- data.table(
  x = c(x.dens.s$x, x.dens.ve$x, x.dens.vi$x),
  y = c(x.dens.s$y, x.dens.ve$y, x.dens.vi$y)
)
df.dens$Species <- rep(
  c("setosa", "versicolor", "virginica"),
  times = c(length(x.dens.s$y), length(x.dens.ve$y), length(x.dens.vi$y))
)

# Rows of df.dens for one species whose x lies inside that species'
# mean -/+ SE interval.
se_band <- function(sp) {
  se_limits <- iris_summary[Species == sp, c(sepal_se_low, sepal_se_high)]
  df.dens[Species == sp & x %between% se_limits, ]
}

# Density per species, a white band over the SE interval, and a dashed
# vertical line at each mean. geom_vline() spans the full panel height,
# which is the behaviour the question asks how to avoid.
iris_density <- ggplot() +
  geom_density(
    data = iris,
    aes(x = Sepal.Length, fill = Species),
    alpha = 0.5
  ) +
  geom_area(
    data = se_band("setosa"),
    aes(x = x, y = y),
    fill = "white",
    alpha = 0.5
  ) +
  geom_area(
    data = se_band("versicolor"),
    aes(x = x, y = y),
    fill = "white",
    alpha = 0.5
  ) +
  geom_area(
    data = se_band("virginica"),
    aes(x = x, y = y),
    fill = "white",
    alpha = 0.5
  ) +
  geom_vline(
    data = iris_summary,
    aes(xintercept = sepal_mean, color = Species),
    linetype = 2,
    linewidth = 0.7,
    color = "black"
  )
iris_density

enter image description here

2 Answers 2

6

You need to find the y-value corresponding to the means, that requires some rounding. Check and see if that's what you need.

library(data.table)
library(ggplot2)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>
#>     between, first, last
#> The following objects are masked from 'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 'package:base':
#>
#>     intersect, setdiff, setequal, union

# Per-species mean of Sepal.Length together with mean -/+ one standard error.
iris <- as.data.table(iris)
iris_summary <- iris[
  ,
  .(
    sepal_mean = mean(Sepal.Length),
    sepal_se_low = mean(Sepal.Length) -
      sd(Sepal.Length) / sqrt(length(Sepal.Length)),
    sepal_se_high = mean(Sepal.Length) +
      sd(Sepal.Length) / sqrt(length(Sepal.Length))
  ),
  Species
]
unique(iris$Species)
#> [1] setosa     versicolor virginica
#> Levels: setosa versicolor virginica

# Kernel density estimate of Sepal.Length for each species.
x.dens.s <- density(iris[Species == "setosa", Sepal.Length])
x.dens.ve <- density(iris[Species == "versicolor", Sepal.Length])
x.dens.vi <- density(iris[Species == "virginica", Sepal.Length])

# Stack the three density curves into one long table labelled by species.
df.dens <- data.table(
  x = c(x.dens.s$x, x.dens.ve$x, x.dens.vi$x),
  y = c(x.dens.s$y, x.dens.ve$y, x.dens.vi$y)
)
df.dens$Species <- c(
  rep("setosa", length(x.dens.s$y)),
  rep("versicolor", length(x.dens.ve$y)),
  rep("virginica", length(x.dens.vi$y))
)

## Add mean value to density dataset
# The join key is spelled out so the join does not depend on (or announce)
# whichever column names the two tables happen to share.
df.dens <- left_join(df.dens, iris_summary, by = "Species")

## Find y-value at mean value
# Linearly interpolate each density curve at the species mean. Matching on
# rounded x values is fragile: depending on the grid spacing it can hit zero
# grid points (the species is then silently dropped) or several of them.
# approx() always yields exactly one height per species.
yend <- df.dens |>
  group_by(Species) |>
  summarise(
    sepal_mean = sepal_mean[1L],
    yend = approx(x, y, xout = sepal_mean)$y
  )

## Plot
# The segment layer takes its own data frame, so every mean is paired with
# its height by row rather than by the incidental ordering of two separate
# objects referenced with `$` inside aes().
iris_density <- ggplot() +
  geom_density(
    data = iris,
    aes(x = Sepal.Length, fill = Species),
    alpha = 0.5
  ) +
  geom_area(
    data = df.dens[
      Species == "setosa" &
        x %between% c(
          iris_summary[Species == "setosa", sepal_se_low],
          iris_summary[Species == "setosa", sepal_se_high]
        ),
    ],
    aes(x = x, y = y),
    fill = "white",
    alpha = 0.5
  ) +
  geom_area(
    data = df.dens[
      Species == "versicolor" &
        x %between% c(
          iris_summary[Species == "versicolor", sepal_se_low],
          iris_summary[Species == "versicolor", sepal_se_high]
        ),
    ],
    aes(x = x, y = y),
    fill = "white",
    alpha = 0.5
  ) +
  geom_area(
    data = df.dens[
      Species == "virginica" &
        x %between% c(
          iris_summary[Species == "virginica", sepal_se_low],
          iris_summary[Species == "virginica", sepal_se_high]
        ),
    ],
    aes(x = x, y = y),
    fill = "white",
    alpha = 0.5
  ) +
  geom_segment(
    data = yend,
    aes(x = sepal_mean, xend = sepal_mean, y = 0, yend = yend),
    linetype = 2
  )

iris_density

Created on 2024-08-29 with reprex v2.1.0

Sign up to request clarification or add additional context in comments.

1 Comment

Beautiful! If anyone wants a data.table solution, this has the same principles but identifies the row for which x is closest to sepal_mean (no rounding) ## Add mean value to density dataset df.dens <- merge(df.dens, iris_summary) ## Find y-value closest to sepal_mean yend <- df.dens[, .SD[which.min(abs(x - sepal_mean))], by = Species][, .(yend = y), Species]
2

First, you need to decide whether the line should represent the mean or the median of your distributions, because they will not be the same, especially with skewed distributions. Once you have decided which value you want, a simple way to do so would be as follows:

library(dplyr)

# One row per species: the chosen statistic (`Mean`) and the height of that
# species' density curve at the grid point closest to it (`Height`).
Maximums_statistic <- iris %>%
  group_by(Species) %>%
  summarise(
    # Calculate your desired statistic (mean, median)
    Mean = mean(Sepal.Length),
    # Which height value corresponds to the x value closest to the statistic?
    # The density is estimated once per group, and which.min() returns a
    # single index even when two grid points are equally close, so exactly
    # one height is produced per species.
    Height = {
      dens <- density(Sepal.Length)
      dens$y[which.min(abs(dens$x - Mean))]
    },
    # Return an ungrouped table so later verbs are not silently grouped.
    .groups = "drop"
  )

With this dataframe, you can now re-do your plot using geom_segment rather than a full vertical line:

# One white geom_area() layer per species, restricted to the part of that
# species' density curve lying inside its mean -/+ SE interval.
se_band_layers <- lapply(
  c("setosa", "versicolor", "virginica"),
  function(sp) {
    se_limits <- iris_summary[Species == sp, c(sepal_se_low, sepal_se_high)]
    geom_area(
      data = df.dens[Species == sp & x %between% se_limits, ],
      aes(x = x, y = y),
      fill = "white",
      alpha = 0.5
    )
  }
)

# Adding a list of layers to a ggplot appends them in order, so the layer
# stack is: densities, the three SE bands, then the mean segments.
iris_density <- ggplot() +
  geom_density(
    data = iris,
    aes(x = Sepal.Length, fill = Species),
    alpha = 0.5
  ) +
  se_band_layers +
  # This part is new
  geom_segment(
    data = Maximums_statistic,
    aes(x = Mean, xend = Mean, y = Height, yend = 0),
    linetype = 2,
    linewidth = 0.7,
    color = "black"
  )

Hope this helps!

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.