Add regression line equation and R^2 on graph

Question

I wonder how to add regression line equation and R^2 on the ggplot. My code is:

library(ggplot2) df <- data.frame(x = c(1:100)) df$y <- 2 + 3 * df$x + rnorm(100, sd = 40) p <- ggplot(data = df, aes(x = x, y = y)) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + geom_point() p

Any help will be highly appreciated.

@JoshO'Brien Error: 'lmlineq' is not an exported object from 'namespace:latticeExtra' — robertspierre
– robertspierre, Commented Mar 14, 2021 at 17:12

MrFlick · Accepted Answer · 2019-07-02 16:03:08Z

321

+100

Here is one solution

# GET EQUATION AND R-SQUARED AS STRING # SOURCE: https://groups.google.com/forum/#!topic/ggplot2/1TgH-kG5XMA lm_eqn <- function(df){ m <- lm(y ~ x, df); eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, list(a = format(unname(coef(m)[1]), digits = 2), b = format(unname(coef(m)[2]), digits = 2), r2 = format(summary(m)$r.squared, digits = 3))) as.character(as.expression(eq)); } p1 <- p + geom_text(x = 25, y = 300, label = lm_eqn(df), parse = TRUE)

EDIT. I figured out the source from where I picked this code. Here is the link to the original post in the ggplot2 google groups

Output

edited Jul 2, 2019 at 16:03

MrFlick

209k19 gold badges300 silver badges324 bronze badges

answered Sep 26, 2011 at 1:20

Ramnath

55.9k16 gold badges129 silver badges155 bronze badges

Sign up to request clarification or add additional context in comments.

9 Comments

IRTFM Over a year ago

@JonasRaedle's comment about getting better looking texts with annotate was correct on my machine.

PatrickT Over a year ago

This doesn't look anything like the posted output on my machine, where the label is overwritten as many times as the data is called, resulting in a thick and blurry label text. Passing the labels to a data.frame first works (see my suggestion in a comment below.

naught101 Over a year ago

@PatrickT: remove the aes( and the corresponding ). aes is for mapping dataframe variables to visual variables - that's not needed here, since there's just one instance, so you can put it all in the main geom_text call. I'll edit this in to the answer.

Jerry T Over a year ago

for those who wants r and p values instead of R2 and equation: eq <- substitute(italic(r)~"="~rvalue*","~italic(p)~"="~pvalue, list(rvalue = sprintf("%.2f",sign(coef(m)[2])*sqrt(summary(m)$r.squared)), pvalue = format(summary(m)$coefficients[2,4], digits = 2)))

nelliott Over a year ago

By default geom_text will plot for each row in your data frame, resulting in blurring and the performance issues several people mentioned. To fix, wrap the arguments passed to geom_text in aes() and also pass an empty data frame like so: geom_text(aes(x = xpoint, y = ypoint, label = lm(df)), parse = TRUE, data.frame()). See stackoverflow.com/questions/54900695/….

|

Pedro J. Aphalo · Accepted Answer · 2023-03-30 14:07:49Z

Statistic stat_poly_eq() in my package ggpmisc makes it possible to add text labels to plots based on a linear model fit. (Statistics stat_ma_eq() and stat_quant_eq() work similarly and support major axis regression and quantile regression, respectively. Each eq stat has a matching line drawing stat.)

I have updated this answer for 'ggpmisc' (>= 0.5.0) and 'ggplot2' (>= 3.4.0) on 2023-03-30. The main change is the assembly of labels and their mapping using function use_label() added in 'ggpmisc' (==0.5.0). Although use of aes() and after_stat() remains unchanged, use_label() makes coding of mappings and assembly of labels simpler.

In the examples I use stat_poly_line() instead of stat_smooth() as it has the same defaults as stat_poly_eq() for method and formula. I have omitted in all code examples the additional arguments to stat_poly_line() as they are irrelevant to the question of adding labels.

library(ggplot2) library(ggpmisc) #> Loading required package: ggpp #> #> Attaching package: 'ggpp' #> The following object is masked from 'package:ggplot2': #> #> annotate # artificial data df <- data.frame(x = c(1:100)) df$y <- 2 + 3 * df$x + rnorm(100, sd = 40) df$yy <- 2 + 3 * df$x + 0.1 * df$x^2 + rnorm(100, sd = 40) # using default formula, label and methods ggplot(data = df, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq() + geom_point()

 # assembling a single label with equation and R2 ggplot(data = df, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq(use_label(c("eq", "R2"))) + geom_point()

 # assembling a single label with equation, adjusted R2, F-value, n, P-value ggplot(data = df, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq(use_label(c("eq", "adj.R2", "f", "p", "n"))) + geom_point()

 # assembling a single label with R2, its confidence interval, and n ggplot(data = df, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq(use_label(c("R2", "R2.confint", "n"))) + geom_point()

 # adding separate labels with equation and R2 ggplot(data = df, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq(use_label("eq")) + stat_poly_eq(label.y = 0.9) + geom_point()

 # regression through the origin ggplot(data = df, aes(x = x, y = y)) + stat_poly_line(formula = y ~ x + 0) + stat_poly_eq(use_label("eq"), formula = y ~ x + 0) + geom_point()

 # fitting a polynomial ggplot(data = df, aes(x = x, y = yy)) + stat_poly_line(formula = y ~ poly(x, 2, raw = TRUE)) + stat_poly_eq(formula = y ~ poly(x, 2, raw = TRUE), use_label("eq")) + geom_point()

 # adding a hat as asked by @MYaseen208 and @elarry ggplot(data = df, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq(eq.with.lhs = "italic(hat(y))~`=`~", use_label(c("eq", "R2"))) + geom_point()

 # variable substitution as asked by @shabbychef # same labels in equation and axes ggplot(data = df, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq(eq.with.lhs = "italic(h)~`=`~", eq.x.rhs = "~italic(z)", use_label("eq")) + labs(x = expression(italic(z)), y = expression(italic(h))) + geom_point()

 # grouping as asked by @helen.h dfg <- data.frame(x = c(1:100)) dfg$y <- 20 * c(0, 1) + 3 * df$x + rnorm(100, sd = 40) dfg$group <- factor(rep(c("A", "B"), 50)) ggplot(data = dfg, aes(x = x, y = y, colour = group)) + stat_poly_line() + stat_poly_eq(use_label(c("eq", "R2"))) + geom_point()

 # A group label is available, for grouped data ggplot(data = dfg, aes(x = x, y = y, linetype = group, grp.label = group)) + stat_poly_line() + stat_poly_eq(use_label(c("grp", "eq", "R2"))) + geom_point()

 # use_label() makes it easier to create the mappings, but when more # flexibility is needed like different separators at different positions, # as shown here, aes() has to be used instead of use_label(). ggplot(data = dfg, aes(x = x, y = y, linetype = group, grp.label = group)) + stat_poly_line() + stat_poly_eq(aes(label = paste(after_stat(grp.label), "*\": \"*", after_stat(eq.label), "*\", \"*", after_stat(rr.label), sep = ""))) + geom_point()

 # a single fit with grouped data as asked by @Herman ggplot(data = dfg, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq(use_label(c("eq", "R2"))) + geom_point(aes(colour = group))

 # facets ggplot(data = dfg, aes(x = x, y = y)) + stat_poly_line() + stat_poly_eq(use_label(c("eq", "R2"))) + geom_point() + facet_wrap(~group)

^{Created on 2023-03-30 with reprex v2.0.2}

It should be noted that the x and y in the formula refer to the x and y data in the layers of the plot, and not necessarily to those in scope at the time my.formula is constructed. Thus the formula should always use x and y variables?
Good point @elarry! This is related to how R's parse() function works. Through trial and error I found that aes(label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~")) does the job.
@PedroAphalo is it possible to use the stat_poly_eq function for groups in the same plot? i'm specifying the colour= and linetype= in my aes() call then also calling geom_smooth(method="rlm") which is currently giving me a regression line for each group which i'd like to print unique equations for.
@PedroAphalo is there anyway I can display r Pearson? so not R2, perhaps can I just sqrt() the result?
@HermanToothrot Usually R2 is preferred for a regression, so there is no predefined r.label in the data returned by stat_poly_eq(). You can use stat_fit_glance(), also from package 'ggpmisc', which returns R2 as a numeric value. See examples in the help page, and replace stat(r.squared) by sqrt(stat(r.squared)).

kdauria · Accepted Answer · 2016-02-21 01:59:01Z

112

I changed a few lines of the source of stat_smooth and related functions to make a new function that adds the fit equation and R squared value. This will work on facet plots too!

library(devtools) source_gist("524eade46135f6348140") df = data.frame(x = c(1:100)) df$y = 2 + 5 * df$x + rnorm(100, sd = 40) df$class = rep(1:2,50) ggplot(data = df, aes(x = x, y = y, label=y)) + stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) + geom_smooth(method="lm",se=FALSE) + geom_point() + facet_wrap(~class)

enter image description here

I used the code in @Ramnath's answer to format the equation. The stat_smooth_func function isn't very robust, but it shouldn't be hard to play around with it.

https://gist.github.com/kdauria/524eade46135f6348140. Try updating ggplot2 if you get an error.

edited Feb 21, 2016 at 1:59

answered Jan 15, 2015 at 8:34

kdauria

6,7814 gold badges37 silver badges55 bronze badges

17 Comments

Julian Over a year ago

Many Thanks. This one doesn't only work for facets, but even for groups. I find it very useful for piecewise regressions, e.g. stat_smooth_func(mapping=aes(group=cut(x.val,c(-70,-20,0,20,50,130))),geom="text",method="lm",hjust=0,parse=TRUE), in combination with EvaluateSmooths from stackoverflow.com/questions/19735149/…

kdauria Over a year ago

@aelwan, change these lines: gist.github.com/kdauria/… as you like. Then source the entire file in your script.

shiny Over a year ago

@kdauria What if I have several equations in each of facet_wraps and I have different y_values in each of facet_wrap. Any suggestions how to fix the positions of the equations? I tried several options of hjust, vjust and angle using this example dropbox.com/s/9lk9lug2nwgno2l/R2_facet_wrap.docx?dl=0 but I couldn't bring all the equations at the same level in each of the facet_wrap

kdauria Over a year ago

@aelwan, the position of the equation is determined by these lines: gist.github.com/kdauria/…. I made xpos and ypos arguments of the function in the Gist. So if you wanted all the equations to overlap, just set xpos and ypos. Otherwise, xpos and ypos are calculated from the data. If you want something fancier, it shouldn't be too hard to add some logic inside the function. For example, maybe you could write a function to determine what part of the graph has the most empty space and put the function there.

Matifou Over a year ago

I ran into an error with source_gist: Error in r_files[[which]] : invalid subscript type 'closure'. See this post for the solution: stackoverflow.com/questions/38345894/r-source-gist-not-working

|

Ricardo Saporta · Accepted Answer · 2012-12-09 20:14:29Z

80

I've modified Ramnath's post to a) make more generic so it accepts a linear model as a parameter rather than the data frame and b) displays negatives more appropriately.

lm_eqn = function(m) { l <- list(a = format(coef(m)[1], digits = 2), b = format(abs(coef(m)[2]), digits = 2), r2 = format(summary(m)$r.squared, digits = 3)); if (coef(m)[2] >= 0) { eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l) } else { eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l) } as.character(as.expression(eq)); }

Usage would change to:

p1 = p + geom_text(aes(x = 25, y = 300, label = lm_eqn(lm(y ~ x, df))), parse = TRUE)

edited Dec 9, 2012 at 20:14

Ricardo Saporta

55.5k17 gold badges149 silver badges180 bronze badges

answered Nov 19, 2012 at 10:09

Jayden

2,7872 gold badges26 silver badges35 bronze badges

5 Comments

bshor Over a year ago

This looks great! But I'm plotting geom_points on multiple facets, where the df differs based on the facet variable. How do I do that?

Jonas Raedle Over a year ago

Jayden's solution works quite well, but the typeface looks very ugly. I would recommend changing the usage to this: p1 = p + annotate("text", x = 25, y = 300, label = lm_eqn(lm(y ~ x, df)), colour="black", size = 5, parse=TRUE) edit: this also resolves any issues you might have with letters showing up in your legend.

PatrickT Over a year ago

@ Jonas, for some reason I'm getting "cannot coerce class "lm" to a data.frame". This alternative works: df.labs <- data.frame(x = 25, y = 300, label = lm_eqn(df)) and p <- p + geom_text(data = df.labs, aes(x = x, y = y, label = label), parse = TRUE)

Hamy Over a year ago

@PatrickT - That's the error message you would get if you called lm_eqn(lm(...)) with Ramnath's solution. You probably tried this one after trying that one but forgot to ensure that you had redefined lm_eqn

JelenaČuklina Over a year ago

@PatrickT: could you make your answer a separate answer? I would be happy to vote it up!

matmar · Accepted Answer · 2020-05-31 07:43:04Z

35

Here's the most simplest code for everyone

Note: Showing Pearson's Rho and not R^2.

library(ggplot2) library(ggpubr) df <- data.frame(x = c(1:100) df$y <- 2 + 3 * df$x + rnorm(100, sd = 40) p <- ggplot(data = df, aes(x = x, y = y)) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + geom_point()+ stat_cor(label.y = 35)+ #this means at 35th unit in the y axis, the r squared and p value will be shown stat_regline_equation(label.y = 30) #this means at 30th unit regresion line equation will be shown p

edited May 31, 2020 at 7:43

matmar

3473 silver badges13 bronze badges

answered Feb 29, 2020 at 18:46

Sork-kal

3673 silver badges11 bronze badges

8 Comments

matmar Over a year ago

Same problem as above, in your plot it is shown rho and not R² !

Matifou Over a year ago

actually you can add just the R2 with: stat_cor(aes(label = ..rr.label..))

JJGabe Over a year ago

I find this to be the simplest solution with the best control over the location of the labels (I was not able to find a simple way to put the R^2 below the equation using stat_poly_eq) and can be combined with stat_regline_equation() to plot the regression equation

Pedro J. Aphalo Over a year ago

'ggpubr' seems not to be actively maintaine; as it has many open issues in GitHub. Anyway, much of the code in stat_regline_equation() and in stat_cor() was just copied without acknowledgement from my package 'ggpmisc'. It was taken from stat_poly_eq() which is actively maintained and has gained several new features since it was copied. The example code needs minimal edits to work with 'ggpmisc'.

JJGabe Over a year ago

@PedroJ.Aphalo While I agree with you that if code was taken from your package you should get acknowledgement, I still find stat_poly_eq() to be more cumbersome. The fact that I can't easily separate the label=..eq.label.. and label=..rr.label.. onto separate lines and intuitively place them on the grid means I will continue to prefer stat_cor() and stat_regline_equation(). This is certainly my own personal opinion and may not be shared by other users but just some things to consider since you're still actively updating ggpmisc

|

zx8754 · Accepted Answer · 2020-04-28 07:01:00Z

Using ggpubr:

library(ggpubr) # reproducible data set.seed(1) df <- data.frame(x = c(1:100)) df$y <- 2 + 3 * df$x + rnorm(100, sd = 40) # By default showing Pearson R ggscatter(df, x = "x", y = "y", add = "reg.line") + stat_cor(label.y = 300) + stat_regline_equation(label.y = 280)

# Use R2 instead of R ggscatter(df, x = "x", y = "y", add = "reg.line") + stat_cor(label.y = 300, aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))) + stat_regline_equation(label.y = 280) ## compare R2 with accepted answer # m <- lm(y ~ x, df) # round(summary(m)$r.squared, 2) # [1] 0.85

Have you seen a neat programmatic way to specify a number for label.y?
@MarkNeal maybe get the max of y then multiply by 0.8. label.y = max(df$y) * 0.8
@MarkNeal good points, maybe submit issue as feature request at GitHub ggpubr.
@zx8754 , in your plot it is shown rho and not R² ,any easy way to show R² ?

X.X · Accepted Answer · 2018-08-22 20:38:56Z

really love @Ramnath solution. To allow use to customize the regression formula (instead of fixed as y and x as literal variable names), and added the p-value into the printout as well (as @Jerry T commented), here is the mod:

lm_eqn <- function(df, y, x){ formula = as.formula(sprintf('%s ~ %s', y, x)) m <- lm(formula, data=df); # formating the values into a summary string to print out # ~ give some space, but equal size and comma need to be quoted eq <- substitute(italic(target) == a + b %.% italic(input)*","~~italic(r)^2~"="~r2*","~~p~"="~italic(pvalue), list(target = y, input = x, a = format(as.vector(coef(m)[1]), digits = 2), b = format(as.vector(coef(m)[2]), digits = 2), r2 = format(summary(m)$r.squared, digits = 3), # getting the pvalue is painful pvalue = format(summary(m)$coefficients[2,'Pr(>|t|)'], digits=1) ) ) as.character(as.expression(eq)); } geom_point() + ggrepel::geom_text_repel(label=rownames(mtcars)) + geom_text(x=3,y=300,label=lm_eqn(mtcars, 'hp','wt'),color='red',parse=T) + geom_smooth(method='lm')

Unfortunately, this doesn't work with facet_wrap or facet_grid.

Very neat, I've referenced here. A clarification - is your code missing ggplot(mtcars, aes(x = wt, y = mpg, group=cyl))+ before the geom_point()? A semi-related question - if we refer to hp and wt in the aes() for ggplot, can we then grab them to use in the call to lm_eqn, so then we only have to code in one place? I know we could set up xvar = "hp" before the ggplot() call, and use xvar in both locations to replace hp, but this feels like it ought to be unnecessary.

AlexB · Accepted Answer · 2020-10-15 16:55:33Z

Another option would be to create a custom function generating the equation using dplyr and broom libraries:

get_formula <- function(model) { broom::tidy(model)[, 1:2] %>% mutate(sign = ifelse(sign(estimate) == 1, ' + ', ' - ')) %>% #coeff signs mutate_if(is.numeric, ~ abs(round(., 2))) %>% #for improving formatting mutate(a = ifelse(term == '(Intercept)', paste0('y ~ ', estimate), paste0(sign, estimate, ' * ', term))) %>% summarise(formula = paste(a, collapse = '')) %>% as.character } lm(y ~ x, data = df) -> model get_formula(model) #"y ~ 6.22 + 3.16 * x" scales::percent(summary(model)$r.squared, accuracy = 0.01) -> r_squared

Now we need to add the text to the plot:

p + geom_text(x = 20, y = 300, label = get_formula(model), color = 'red') + geom_text(x = 20, y = 285, label = r_squared, color = 'blue')

rvezy · Accepted Answer · 2019-04-05 13:52:14Z

Inspired by the equation style provided in this answer, a more generic approach (more than one predictor + latex output as option) can be:

print_equation= function(model, latex= FALSE, ...){ dots <- list(...) cc= model$coefficients var_sign= as.character(sign(cc[-1]))%>%gsub("1","",.)%>%gsub("-"," - ",.) var_sign[var_sign==""]= ' + ' f_args_abs= f_args= dots f_args$x= cc f_args_abs$x= abs(cc) cc_= do.call(format, args= f_args) cc_abs= do.call(format, args= f_args_abs) pred_vars= cc_abs%>% paste(., x_vars, sep= star)%>% paste(var_sign,.)%>%paste(., collapse= "") if(latex){ star= " \\cdot " y_var= strsplit(as.character(model$call$formula), "~")[[2]]%>% paste0("\\hat{",.,"_{i}}") x_vars= names(cc_)[-1]%>%paste0(.,"_{i}") }else{ star= " * " y_var= strsplit(as.character(model$call$formula), "~")[[2]] x_vars= names(cc_)[-1] } equ= paste(y_var,"=",cc_[1],pred_vars) if(latex){ equ= paste0(equ," + \\hat{\\varepsilon_{i}} \\quad where \\quad \\varepsilon \\sim \\mathcal{N}(0,", summary(MetamodelKdifEryth)$sigma,")")%>%paste0("$",.,"$") } cat(equ) }

The model argument expects an lm object, the latex argument is a boolean to ask for a simple character or a latex-formated equation, and the ... argument pass its values to the format function.

I also added an option to output it as latex so you can use this function in a rmarkdown like this:

 ```{r echo=FALSE, results='asis'} print_equation(model = lm_mod, latex = TRUE) ```

Now using it:

df <- data.frame(x = c(1:100)) df$y <- 2 + 3 * df$x + rnorm(100, sd = 40) df$z <- 8 + 3 * df$x + rnorm(100, sd = 40) lm_mod= lm(y~x+z, data = df) print_equation(model = lm_mod, latex = FALSE)

This code yields: y = 11.3382963933174 + 2.5893419 * x + 0.1002227 * z

And if we ask for a latex equation, rounding the parameters to 3 digits:

print_equation(model = lm_mod, latex = TRUE, digits= 3)

This yields:

How to modify this function for: model = lm(y ~ x * z, data = my_data) ? X and z are categorical variables with two levels.

tassones · Accepted Answer · 2022-07-20 18:06:22Z

Similar to @zx8754 and @kdauria answers except using ggplot2 and ggpubr. I prefer using ggpubr because it does not require custom functions such as the top answer to this question.

library(ggplot2) library(ggpubr) df <- data.frame(x = c(1:100)) df$y <- 2 + 3 * df$x + rnorm(100, sd = 40) ggplot(data = df, aes(x = x, y = y)) + stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + geom_point() + stat_cor(aes(label = paste(..rr.label..)), # adds R^2 value r.accuracy = 0.01, label.x = 0, label.y = 375, size = 4) + stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression label.x = 0, label.y = 400, size = 4)

Could also add p-value to the figure above

ggplot(data = df, aes(x = x, y = y)) + stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + geom_point() + stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")), # adds R^2 and p-value r.accuracy = 0.01, p.accuracy = 0.001, label.x = 0, label.y = 375, size = 4) + stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression label.x = 0, label.y = 400, size = 4)

Also works well with facet_wrap() when you have multiple groups

df$group <- rep(1:2,50) ggplot(data = df, aes(x = x, y = y)) + stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + geom_point() + stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")), r.accuracy = 0.01, p.accuracy = 0.001, label.x = 0, label.y = 375, size = 4) + stat_regline_equation(aes(label = ..eq.label..), label.x = 0, label.y = 400, size = 4) + theme_bw() + facet_wrap(~group)

Collectives™ on Stack Overflow

Add regression line equation and R^2 on graph

10 Answers 10

9 Comments

42 Comments

17 Comments

5 Comments

8 Comments

7 Comments

1 Comment

Comments

1 Comment

Comments

Linked

Hot Network Questions

Collectives™ on Stack Overflow

10 Answers 10

9 Comments

42 Comments

17 Comments

5 Comments

8 Comments

7 Comments

1 Comment

Comments

1 Comment

Comments

Linked

Related