Suppose we observe panel data $$y_{it} = \alpha_i + \beta_i\,t + \epsilon_{it}$$ where $i$ indexes organizations, $t$ is time, and $\epsilon_{it}$ is i.i.d. noise. The terms $\alpha_i$ and $\beta_i$ are organization-specific constants and organization-specific time trends, respectively.
A simple approach to estimating $\{\alpha_i\}$ and $\{\beta_i\}$ would be to run a simple linear regression for each organization (no pooling). Suppose instead that we want to shrink our estimates of $\alpha_i$ and $\beta_i$ toward some central value (partial pooling). We could solve $$\min_{\hat{\alpha}, \, \hat{\beta}, \, \{\hat{\alpha}_i\}, \{\hat{\beta}_i\}} \sum_{it} \left(y_{it} - \hat{y}_{it}\right)^2 + \lambda^{(\alpha)}\sum_i \left(\hat{\alpha}_i - \hat{\alpha}\right)^2 + \lambda^{(\beta)}\sum_i \left(\hat{\beta}_i - \hat{\beta}\right)^2$$ where the predicted values $\hat{y}_{it}$ are defined as $$\hat{y}_{it} \equiv \hat{\alpha}_i + \hat{\beta}_i \, t$$ and $\lambda^{(\alpha)} > 0$ and $\lambda^{(\beta)} > 0$ are regularization parameters (which could be selected to minimize e.g. root mean squared error on test data).
(Edit: I believe it is always optimal to set $\hat{\alpha}$ equal to the mean of the $\hat{\alpha}_i$, and similarly for $\hat{\beta}$, in which case the minimization can be done with respect to only $\{\hat{\alpha}_i\}$ and $\{\hat{\beta}_i\}$.)
This feels similar to a Bayesian hierarchical model or to a mixed effects model in which the $\{\alpha_i\}$ and $\{\beta_i\}$ are each drawn from a distribution (e.g. $\alpha_i \sim \mathcal{N}(\mu_\alpha, \sigma^2_\alpha)$), and the estimates are shrunk towards a pooled mean. In the minimization problem I wrote above, the shrinkage terms should tend to shrink $\hat{\alpha}_i$ towards $\hat{\alpha}$.
My questions are
- How does this compare to a Bayesian hierarchical model? Is my $\hat{\alpha}$ analogous to $\mu_\alpha$ when $\alpha_i \sim \mathcal{N}(\mu_\alpha, \sigma^2_\alpha)$?
- How does this compare to a mixed effects model?
- Is there a closed form solution to $\hat{\alpha}$ and $\hat{\beta}$? Are they equal to the complete-pooling ordinary least squares estimates?
- Can my minimization problem be reformulated as a standard ridge regression problem (perhaps by making the data zero-mean)?
- Are there any references on using ridge / L2 regularization for hierarchical regressions as I do here? Is this commonly done, or am I doing something unusual?
Here is an R simulation of what I described above:
library(data.table)
library(ggplot2)
library(RColorBrewer)
## Simulation settings: n_orgs organizations observed over n_years periods.
params <- list(
  n_orgs = 100,
  n_years = 8,
  alpha = 100.0,      # Grand-mean constant
  beta = 5.0,         # Grand-mean time trend
  alpha_sd = 10.0,    # Std dev of org-specific alphas (centered around params$alpha)
  beta_sd = 1.0,      # Std dev of org-specific betas (centered around params$beta)
  epsilon_sd = 8.0    # Std dev of iid noise
)
## Organizations indexed by i, time indexed by t
## Observe y_it = alpha_i + beta_i * t + epsilon_it (linear time trend plus iid noise)
## Want to shrink org-specific intercept and slope towards fully-pooled mean (if it improves prediction MSE)
## Simulate panel data y_it = alpha_i + beta_i * t + epsilon_it.
##
## params: list with n_orgs, n_years, alpha, beta, alpha_sd, beta_sd, epsilon_sd.
## Returns a list with:
##   df         - long-format data frame (one row per org/time pair)
##   org_params - the true org-specific (alpha_i, beta_i), to compare to estimates
get_simulation <- function(params) {
  org_ids <- seq_len(params$n_orgs)
  times <- seq_len(params$n_years)  # Time starts at 1 (e.g. years since initial year)
  ## Draw org-specific intercepts/slopes around the grand means.
  ## Keep the rnorm() call order (alphas, betas, then noise) so results are
  ## reproducible under a fixed seed.
  org_params <- data.frame(
    org_id = org_ids,
    alpha = rnorm(n = length(org_ids), mean = params$alpha, sd = params$alpha_sd),
    beta = rnorm(n = length(org_ids), mean = params$beta, sd = params$beta_sd)
  )
  ## One row per (org, time) combination, with the org's true parameters attached.
  df <- merge(expand.grid(org_id = org_ids, time = times), org_params, by = 'org_id')
  df$epsilon <- rnorm(n = nrow(df), mean = 0, sd = params$epsilon_sd)
  df$y <- df$alpha + df$beta * df$time + df$epsilon
  df$org_id_string <- sprintf("org %03i", df$org_id)  # Zero-padded label for plotting
  list(df = df,
       org_params = org_params)
}
## Run the simulation and inspect the data.
simulation <- get_simulation(params)
df <- simulation$df
head(df, 20)
## Spaghetti plot: one colored line per organization.
palette <- colorRampPalette(brewer.pal(8, 'Accent'))(length(unique(df$org_id)))
p <- (ggplot(df, aes(x=time, y=y, color=org_id_string)) +
  scale_colour_manual(values=palette) +
  geom_point(size=2.0) +
  geom_line(size=1.2))
p
## Train/test split in time: hold out the last two years for testing.
## BUG FIX: was `params$n_year`, which only worked via `$` partial matching on
## lists (a silent footgun); use the actual name `n_years`.
df_train <- subset(df, time <= params$n_years - 2)
df_test <- subset(df, time > params$n_years - 2) # Test on two held-out years
stopifnot(nrow(df) == nrow(df_train) + nrow(df_test))
stopifnot(all(sort(unique(df_train$org_id)) == sort(unique(df_test$org_id)))) # Need same orgs in train and test set (estimated params are org-specific)
## Unpack the flat parameter vector used by optim() into named components.
## Layout of x: (alpha, beta, alpha_1..alpha_n, beta_1..beta_n).
##
## BUG FIX: the original used seq(3, 2 + n_orgs), which is the 1:n anti-pattern:
## for n_orgs == 0 it yields c(3, 2) and indexes out of bounds instead of
## returning empty vectors. seq_len() handles the degenerate case correctly.
get_params_hat <- function(x, n_orgs) {
  stopifnot(length(x) == 2 * (n_orgs + 1)) # Overall (alpha, beta), plus org-specific (alpha_i, beta_i)
  alpha <- x[1]
  beta <- x[2]
  org_alphas <- x[2 + seq_len(n_orgs)]
  org_betas <- x[2 + n_orgs + seq_len(n_orgs)]
  stopifnot(length(org_alphas) == length(org_betas))
  return(list(alpha=alpha,
              beta=beta,
              org_alphas=org_alphas,
              org_betas=org_betas))
}
## Penalized least-squares objective:
##   mean((y - y_hat)^2) + lambda_alpha * sum_i (alpha_i - alpha)^2
##                       + lambda_beta  * sum_i (beta_i  - beta)^2
## where y_hat = alpha_i + beta_i * t.
## Note the fit term is the MEAN squared error (divided by nrow), unlike the raw
## SSE in the write-up; this only rescales the lambdas. With both lambdas set
## to 0 the return value is the plain MSE of the predictions.
objective_fn <- function(x, df, params, lambda_alpha, lambda_beta) {
  est <- get_params_hat(x, params$n_orgs)
  org_ids <- sort(unique(df$org_id))
  stopifnot(length(org_ids) == length(est$org_alphas))
  ## Attach each row's org-specific estimates, then form predictions.
  fit <- merge(df,
               data.frame(org_id = org_ids,
                          alpha_hat = est$org_alphas,
                          beta_hat = est$org_betas),
               by = 'org_id')
  y_hat <- fit$alpha_hat + fit$beta_hat * fit$time
  mean((fit$y - y_hat) ^ 2) +
    lambda_alpha * sum((est$org_alphas - est$alpha) ^ 2) +
    lambda_beta * sum((est$org_betas - est$beta) ^ 2)
}
## Baseline: no-pooling OLS with org-specific intercepts and slopes
## (org dummies plus org-by-time interactions).
model_unregularized <- lm(y ~ org_id_string + org_id_string:time, data=df_train)
summary(model_unregularized)
rmse_test_unregularized <- sqrt(mean((df_test$y - predict(model_unregularized, newdata=df_test)) ^ 2))
## Warm start for optim(): the true grand means, repeated for every org.
x <- c(params$alpha, params$beta,
       rep(params$alpha, params$n_orgs),
       rep(params$beta, params$n_orgs))
## Grid of regularization strengths to search over (log-spaced).
cv_grid <- expand.grid(lambda_alpha=10^seq(2, -6, -1),
                       lambda_beta=10^seq(2, -6, -1))
cv_grid$mse_test <- 0
cv_grid$rmse_test <- 0
## Grid search: fit the penalized model for each (lambda_alpha, lambda_beta)
## pair on the training years, score it on the held-out years.
params_hat_list <- list()
for (i in seq_len(nrow(cv_grid))) {
  fit <- optim(par=x,
               fn=objective_fn,
               gr=NULL, # TODO Gradient
               method='BFGS',
               control=list(maxit=200),
               df=df_train,
               params=params,
               lambda_alpha=cv_grid$lambda_alpha[i],
               lambda_beta=cv_grid$lambda_beta[i])
  params_hat_list[[i]] <- get_params_hat(fit$par, params$n_orgs)
  ## With both lambdas at 0 the objective reduces to the plain test MSE.
  mse_test <- objective_fn(fit$par,
                           df=df_test,
                           params=params,
                           lambda_alpha=0,
                           lambda_beta=0)
  cv_grid$mse_test[i] <- mse_test
  cv_grid$rmse_test[i] <- sqrt(mse_test)
}
summary(cv_grid)
## RMSE curves: one line per lambda_beta; dashed line = unregularized baseline.
p <- (ggplot(cv_grid, aes(x=log10(lambda_alpha),
                          y=rmse_test,
                          color=log10(lambda_beta),
                          group=lambda_beta)) +
  geom_point() +
  geom_line() +
  geom_hline(yintercept=rmse_test_unregularized, linetype=2) +
  scale_color_gradient2(midpoint=median(log10(cv_grid$lambda_beta))))
p
## Heat map of test RMSE over the whole lambda grid.
p <- (ggplot(cv_grid, aes(x=log10(lambda_alpha),
                          y=log10(lambda_beta),
                          fill=rmse_test)) +
  geom_raster() +
  scale_fill_gradientn(colours=terrain.colors(10)))
p # When one lambda is large, the other should be small
## Pick the (lambda_alpha, lambda_beta) pair with the lowest test RMSE.
best_idx <- which.min(cv_grid$rmse_test)
cv_grid[best_idx, ]
params_hat <- params_hat_list[[best_idx]]