bootstrap within groups in R

Question

I have a data frame with response ratios for multiple locations and each location is assigned to a group (region). I want to generate a regression for each group (region) that uses Response Ratio (RR) as the response, location as the unit of replication, and each soil type as a predictor. I would like to use bootstrap resampling to generate confidence intervals around the coefficients for each soil type but I am not sure how to generate this.

#sample data df <- data.frame( group=rep(c('region1','region2'), 100), subgroup=rep(c('location1','location2', 'location2', 'location1'), 25), predictor = rep(c('soil1','soil2','soil3','soil4'), 25), RR=rnorm(200) )

Adding script from @Rui below. I actually have a multiple regression and so I added an additional predictor. It is still unclear to me how to extract the coefficient CIs for both soil type and temperature.

library(boot) bootfun <- function(data, i) { d <- data[i,] fit <- lm(RR ~ soil_type + temperature, data = d) coef(fit) } set.seed(2022) set.seed(123) df <- data.frame( group=rep(c('region1','region2'), 100), subgroup=rep(c('location1','location2', 'location2', 'location1'), 25), soil_type = rep(c('soil1','soil2','soil3','soil4'), 25), temperature = abs(rnorm(100, 2,1.75)), RR=rnorm(200), stringsAsFactors = TRUE ) R <- 1000 b_list <- by(df, df$group, \(X) { boot(X, bootfun, R, strata = X$subgroup) }) b_list$region1

Rui Barradas · Accepted Answer · 2022-10-25 21:07:51Z

Function boot is base package boot has an argument strata. Split by group and apply a boot function with, for instance, by stratifying by location.

library(boot) bootfun <- function(data, i) { d <- data[i,] fit <- lm(RR ~ predictor, data = d) coef(fit) } set.seed(2022) df <- data.frame( group=rep(c('region1','region2'), 100), subgroup=rep(c('location1','location2', 'location2', 'location1'), 25), predictor = rep(c('soil1','soil2','soil3','soil4'), 25), RR=rnorm(200), stringsAsFactors = TRUE ) R <- 1000 b_list <- by(df, df$group, \(X) { boot(X, bootfun, R, strata = X$subgroup) }) b_list$region1 #> #> STRATIFIED BOOTSTRAP #> #> #> Call: #> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup) #> #> #> Bootstrap Statistics : #> original bias std. error #> t1* -0.2608885 0.000469295 0.1541482 #> t2* 0.3502007 -0.004239248 0.2083503 b_list$region2 #> #> STRATIFIED BOOTSTRAP #> #> #> Call: #> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup) #> #> #> Bootstrap Statistics : #> original bias std. error #> t1* -0.03727332 -0.0001557172 0.1422502 #> t2* 0.11987005 0.0016393125 0.1952310 lapply(b_list, boot.ci) #> Warning in sqrt(tv[, 2L]): NaNs produced #> Warning in sqrt(tv[, 2L]): NaNs produced #> $region1 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> FUN(boot.out = X[[i]]) #> #> Intervals : #> Level Normal Basic Studentized #> 95% (-0.5635, 0.0408 ) (-0.5611, 0.0545 ) (-0.8227, -0.0225 ) #> #> Level Percentile BCa #> 95% (-0.5762, 0.0393 ) (-0.5733, 0.0446 ) #> Calculations and Intervals on Original Scale #> #> $region2 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> FUN(boot.out = X[[i]]) #> #> Intervals : #> Level Normal Basic Studentized #> 95% (-0.3159, 0.2417 ) (-0.3260, 0.2460 ) (-0.3493, 0.1757 ) #> #> Level Percentile BCa #> 95% (-0.3206, 0.2514 ) (-0.3321, 0.2352 ) #> Calculations and Intervals on Original Scale

^{Created on 2022-10-25 with reprex v2.0.2}

Edit

To get the bootstrapped confidence intervals of each coefficient, the code below uses two nested loops. The outer loop is by region, according to the original data partition. The inner loop is on index, meaning, on the matrix t returned by boot, see help("boot"), section Value. The index are the column numbers in any of

b_list$region1$t
b_list$region2$t

each of them with 3 columns.

library(boot) npars <- ncol(b_list$region1$t) ci_list <- lapply(b_list, \(region) { ci <- lapply(seq.int(npars), \(index) { boot.ci(region, index = index, type = c("norm","basic", "perc", "bca")) }) setNames(ci, c("Intercept", "soil", "temperature")) }) ci_list$region1$Intercept #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = region, type = c("norm", "basic", "perc", #> "bca"), index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.2517, 0.6059 ) (-0.2423, 0.6043 ) #> #> Level Percentile BCa #> 95% (-0.2410, 0.6056 ) (-0.2414, 0.6048 ) #> Calculations and Intervals on Original Scale ci_list$region2$temperature #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = region, type = c("norm", "basic", "perc", #> "bca"), index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.2317, 0.0420 ) (-0.2416, 0.0404 ) #> #> Level Percentile BCa #> 95% (-0.2278, 0.0542 ) (-0.2265, 0.0570 ) #> Calculations and Intervals on Original Scale

^{Created on 2022-10-25 with reprex v2.0.2}

Edit 2

Like I say in a comment below, in the new data the soil type uniquely identifies pairs of region and location, unique(df[1:3]) shows it. And it becomes useless to split by group and stratify within groups.

bootfun2 <- function(data, i) { d <- data[i,] fit <- lm(RR ~ temperature + soil_type, data = d) coef(fit) } unique(df[1:3]) # soil type uniquely identifies region/location #> group subgroup soil_type #> 1 region1 location1 soil1 #> 2 region2 location2 soil2 #> 3 region1 location2 soil3 #> 4 region2 location1 soil4 fit <- lm(RR ~ temperature + soil_type, data = df) coef(fit) #> (Intercept) temperature soil_typesoil2 soil_typesoil3 soil_typesoil4 #> 0.25928498 -0.06352205 -0.17739104 -0.05243836 -0.20408527 set.seed(2022) R <- 1000 b_3 <- boot(df, bootfun2, R) b_3 #> #> ORDINARY NONPARAMETRIC BOOTSTRAP #> #> #> Call: #> boot(data = df, statistic = bootfun2, R = R) #> #> #> Bootstrap Statistics : #> original bias std. error #> t1* 0.25928498 0.005724634 0.18033509 #> t2* -0.06352205 -0.002910677 0.05161868 #> t3* -0.17739104 0.004932486 0.18665594 #> t4* -0.05243836 0.005796168 0.19602658 #> t5* -0.20408527 0.004914674 0.20355549 btype <- c("norm","basic", "perc", "bca") ci_list3 <- lapply(seq_len(ncol(b_3$t)), \(index) { boot.ci(b_3, type = btype, index = index) }) names(ci_list3) <- names(coef(fit)) ci_list3 #> $`(Intercept)` #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.0999, 0.6070 ) (-0.0868, 0.6172 ) #> #> Level Percentile BCa #> 95% (-0.0986, 0.6054 ) (-0.0992, 0.6034 ) #> Calculations and Intervals on Original Scale #> #> $temperature #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.1618, 0.0406 ) (-0.1631, 0.0401 ) #> #> Level Percentile BCa #> 95% (-0.1672, 0.0360 ) (-0.1552, 0.0503 ) #> Calculations and Intervals on Original Scale #> #> $soil_typesoil2 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.5482, 0.1835 ) (-0.5541, 0.1955 ) #> #> Level Percentile BCa #> 95% (-0.5503, 0.1994 ) (-0.5542, 0.1927 ) #> Calculations and Intervals on Original Scale #> #> $soil_typesoil3 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.4424, 0.3260 ) (-0.4399, 0.3068 ) #> #> Level Percentile BCa #> 95% (-0.4117, 0.3350 ) (-0.4116, 0.3350 ) #> Calculations and Intervals on Original Scale #> #> $soil_typesoil4 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.6080, 0.1900 ) (-0.6116, 0.2127 ) #> #> Level Percentile BCa #> 95% (-0.6208, 0.2035 ) (-0.6284, 0.1801 ) #> Calculations and Intervals on Original Scale

^{Created on 2022-10-25 with reprex v2.0.2}

b_list <- lapply( unique(df$group), \(X) { eval(bquote(boot(df(df$group==.(X), bootfun, R, strata = df[df$group==.(x),]$subgroup))) }) to preserve the call in each group.
This gets me really close to the solution I need. I actually have more than one predictor in the regression and so I need the CIs for the coefficients for each predictor. I modified the original question to reflect this.
@JoshuaSmith Your new data has a problem, the soil type uniquely identifies pairs of region and location, try unique(df[1:3]) to see it. An it becomes useless to stratify within groups. I will edit with a version of the bootstrapped ci's.

Collectives™ on Stack Overflow

bootstrap within groups in R

1 Answer 1

Edit

Edit 2

4 Comments

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Edit

Edit 2

4 Comments

Related