I wrote something in R for my own use, based on the quote from Sherman and Cessie (1997) in StatsStudent's answer.
It implements bootstrap replicates on clustered data with clusters of different sizes.
It makes sure that clusters sampled more than once (due to replacement) are treated as distinct clusters within bootstrap samples (especially important in estimations involving fixed-effects along the cluster dimension).
Finally, note that this approach allows to compute the one-way cluster bootstrap standard error of any statistic computed via a custom, user-provided function. In this respect, it is more flexible than sandwich::vcovBS or sandwich::vcovCL which accept only some fitted model objects as input.
library(boot)
library(sandwich)
## Make some necessary objects
# unbalanced panel data
data("PetersenCL", package = "sandwich")
data <- subset(PetersenCL, !(firm %in% c(1:10) & year == 10))
cluster_var <- "firm"
# get the different cluster sizeS. This is necessary to cluster bootstrapping with clusters of different sizes.
sizes <- table(data[,cluster_var])
u_sizes <- sort(unique(sizes))
# names and numbers of clusters of every sizes
cl_names <- list()
n_clusters <- list()
for(s in u_sizes){
cl_names[[s]] <- names(sizes[sizes == s])
n_clusters[[s]] <- length(cl_names[[s]])
}
par_list <- list(unique_sizes = u_sizes,
cluster_names = cl_names,
number_clusters = n_clusters)
## Design the bootstrap sampling function.
# It will tell boot::boot how to sample data at each replicate
ran.gen_cluster <- function(original_data, arg_list){
cl_boot_dat <- NULL
# non-unique names of clusters (repeated when there is more than one obs. in a cluster)
nu_cl_names <- as.character(original_data[,cluster_var])
for(s in arg_list[["unique_sizes"]]){
# sample, in the vector of names of clusters of size s, as many draws as there are clusters of that size, with replacement
sample_cl_s <- sample(arg_list[["cluster_names"]][[s]],
arg_list[["number_clusters"]][[s]],
replace = TRUE)
# because of replacement, some names are sampled more than once
sample_cl_s_tab <- table(sample_cl_s)
# we need to give them a new cluster identifier, otherwise a cluster sampled more than once
# will be "incorrectly treated as one large cluster rather than two distinct clusters" (by the fixed effects) (Cameron 2015)
for(n in 1:max(sample_cl_s_tab)){ # from 1 to the max number of times a name was sampled bc of replacement
# vector to select obs. that are within the sampled clusters.
names_n <- names(sample_cl_s_tab[sample_cl_s_tab == n])
sel <- nu_cl_names %in% names_n
# select data accordingly to the cluster sampling (duplicating n times observations from clusters sampled n times)
clda <- original_data[sel,][rep(seq_len(sum(sel)), n), ]
#identify row names without periods, and add ".0"
row.names(clda)[grep("\\.", row.names(clda), invert = TRUE)] <- paste0(grep("\\.", row.names(clda), invert = TRUE, value = TRUE),".0")
# add the suffix due to the repetition after the existing cluster identifier.
clda[,cluster_var] <- paste0(clda[,cluster_var], sub(".*\\.","_",row.names(clda)))
# stack the bootstrap samples iteratively
cl_boot_dat <- rbind(cl_boot_dat, clda)
}
}
return(cl_boot_dat)
}
#test that the returned data are the same dimension as input
test_boot_d <- ran.gen_cluster(original_data = data,
arg_list = par_list)
dim(test_boot_d)
dim(data)
# test new clusters are not duplicated (correct if anyDuplicated returns 0)
base::anyDuplicated(test_boot_d[,c("firm","year")])
## Custom ("black-box") estimation function
est_fun <- function(est_data){
est <- lm(as.formula("y ~ x"), est_data)
# statistics we want to evaluate the variance of:
return(est$coefficients[2])
}
# see ?boot::boot for more details on these arguments
boot(data = data,
statistic = est_fun,
ran.gen = ran.gen_cluster,
mle = par_list,
sim = "parametric",
parallel = "no",
R = 200)
Compare with the asymptotic solution (available for unbalanced data, but only for lm or glm objects):
sdw_cl <- vcovCL(lm(as.formula("y ~ x"), data), cluster = ~firm)
sqrt(sdw_cl["x","x"])
NOTE: I have not managed to identify why the output from the code I provide does not equal the output from vcovBS though. Even for balanced data:
## compare with sandwich::vcovBS, on balanced panel data
set.seed(1234)
custom_bs <- boot(data = PetersenCL,
statistic = est_fun,
ran.gen = ran.gen_cluster,
mle = par_list,
sim = "parametric",
parallel = "no",
R = 200)
sd(custom_bs$t[,2])
set.seed(1234)
sdw_bs <- vcovBS(lm(as.formula("y ~ x"), PetersenCL), cluster = ~firm, R=200)#
sqrt(sdw_bs["x","x"])