EDIT: updated based on comments.
You could add a column that indicates if a row is part of your sample. So maybe try the following:
library(dplyr)

# Example population: 12 individuals observed over two years, with ages 7 or 12.
df <- data.frame(
  year = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
  id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  age = c(7, 7, 7, 12, 12, 12, 7, 7, 7, 12, 12, 12)
)

# Number of rows to draw per year in each age stratum.
n_per_year_low_age <- 2
n_per_year_high_age <- 1

# Draw `n` elements from `ids` without replacement.
# Indexing via sample.int() avoids the base-R pitfall where sample(x, n)
# samples from 1:x when `x` is a single number.
sample_ids <- function(ids, n) {
  ids[sample.int(length(ids), n)]
}

# Flag sampled rows with in_sample = 1 (0 otherwise), drawing separately
# within each year from the low-age (age < 8) and high-age (age > 8) strata.
# Rows with age exactly 8 belong to neither stratum and are never sampled.
df <- df %>%
  group_by(year) %>%
  mutate(
    in_sample = as.numeric(
      id %in% c(
        sample_ids(id[age < 8], n_per_year_low_age),
        sample_ids(id[age > 8], n_per_year_high_age)
      )
    )
  )

# Output:
# A tibble: 12 x 4
# Groups:   year [2]
    year    id   age in_sample
   <dbl> <dbl> <dbl>     <dbl>
 1  1.00  1.00  7.00      1.00
 2  1.00  2.00  7.00      1.00
 3  1.00  3.00  7.00      0
 4  1.00  4.00 12.0       1.00
 5  1.00  5.00 12.0       0
 6  1.00  6.00 12.0       0
 7  2.00  7.00  7.00      1.00
 8  2.00  8.00  7.00      0
 9  2.00  9.00  7.00      1.00
10  2.00 10.0  12.0       0
11  2.00 11.0  12.0       0
12  2.00 12.0  12.0       1.00

Further operations are then trivial:
# extracting your sample
df %>%
  filter(in_sample == 1)

# comparing statistics of your sample against the rest of the population
# (mean(id) is only a placeholder statistic; substitute the column of interest)
df %>%
  group_by(year, in_sample) %>%
  summarize(mean(id))