I am trying to recover the true (simulated) effect of a treatment Z on an outcome Y, which is set to ATE = 5 (the csv file for the data is located here: https://www.dropbox.com/s/92obn9hsu3tqy92/synthetic_data_2.csv?dl=0). I am able to recover the true effect using a linear model; however, for some reason, I am unable to get the same effect using MatchIt (or optmatch). As the main confounder (variable name “C_p”) is a binary variable, I have tried converting it to numeric, integer, and factor, but the same problem persists. I have also tried the “cem” and “nearest” methods, but without progress.
Suspecting that something was wrong with the original simulated file, I simulated some new data (see below). Using these data, I can recover the true effect using lm. With matching, the effect is closer to the truth than in the original problem, but still biased. Using t.test, we see that the estimated ATE is -4.15 – (-2.55) = -1.6, yet it should be equal to 5.
Any ideas why matching is not recovering the true effect in synthetic_data_2.csv?
# Analysis of the original simulated data set (synthetic_data_2.csv).
# Goal: estimate the effect of treatment Z on outcome Y after matching on the
# covariates C1, C2, C3 and the binary confounder C_p.

# Locate the data relative to this script; relies on rstudioapi, so this only
# works when the script is run from an RStudio session.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../../")
library(MatchIt)
library(dplyr)
#setwd("path/to/dir")
imf.meta <- read.csv("synthetic_data_2.csv", sep = ",")
# The treatment indicator is stored in column "T"; expose it as Z.
imf.meta$Z <- imf.meta$'T'
#imf.meta$C_p <- as.numeric(imf.meta$C_p) AD: tried numeric, integer, and factor

# Keep only the variables used below and drop incomplete rows.
# NOTE: the confounder must be spelled `C_p` everywhere. The original code
# selected and matched on `Cp` but adjusted for `C_p` in the final model, so
# one of the two calls could never find its column.
imf.meta_nomiss <- imf.meta %>%
  select(C1, C2, C3, C_p, Y, Z) %>%
  na.omit()

# Executing a matching algorithm (tried different approaches)
mod_match <- matchit(Z ~ C1 + C2 + C3 + C_p,
                     method = "nearest", data = imf.meta_nomiss)
# mod_match <- matchit(Z ~ C1 + C2 + C3 + C_p,
# method = "cem", data = imf.meta_nomiss)
# mod_match <- matchit(Z ~ C_p,
# method = "nearest", data = imf.meta_nomiss)
# mod_match <- matchit(Z ~ C_p,
# method = "cem", data = imf.meta_nomiss)

# Extract the matched sample.
dta_m <- match.data(mod_match)

# Estimating treatment effects: difference in mean Y between the matched groups
with(dta_m, t.test(Y ~ Z))

# Recover the treatment effect without additional adjustment
lm_treat1 <- lm(Y ~ Z, data = dta_m)
summary(lm_treat1)

# Recover the treatment effect with covariate adjustment on the matched sample
lm_treat2 <- lm(Y ~ Z + C1 + C2 + C3 + C_p, data = dta_m)
summary(lm_treat2)
### Simulated data ----

# Scenario 1: all ten covariates are continuous (standard normal).
# These objects are overwritten by scenario 2 below; the draws are kept so the
# sequence of random numbers consumed by the script is unchanged.
n <- 2000
p <- 10
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
W <- rbinom(n, size = 1, prob = 0.4 + 0.2 * (X[, 1] > 0))
Y <- pmax(X[, 1], 0) * W + X[, 2] + pmin(X[, 3], 0) + rnorm(n)

# Scenario 2: the first covariate is replaced by a binary confounder.
n <- 2000
p <- 10
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
X[, 1] <- rbinom(n, size = 1, prob = 0.6)
# Treatment probability is 0.8 when the confounder is 1 and 0.1 otherwise.
W <- rbinom(n, size = 1, prob = 0.1 + 0.7 * (X[, 1] > 0.5))
# Outcome: the confounder shifts Y by -10, treatment shifts Y by +5, plus noise.
# (Earlier variant: Y <- pmax(X[, 1], 0) * W + X[, 2] + pmin(X[, 3], 0) + rnorm(n))
Y <- -10 * pmax(X[, 1], 0) + 5 * W + rnorm(n)

# Assemble the analysis data frame: covariates V1..V10, outcome Y, treatment W.
# No na.omit() step is applied to the simulated data.
imf.meta_nomiss <- cbind(as.data.frame(X), Y = Y, W = W)
# Benchmark 1: causal forest estimate of the average treatment effect (grf)
library(grf)
tau.forest <- causal_forest(X, Y, W)
average_treatment_effect(tau.forest, target.sample = "all")

# Benchmark 2: linear regression of Y on treatment and all ten covariates
lm_treat1 <- lm(
  Y ~ W + V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
  data = imf.meta_nomiss
)
summary(lm_treat1)

# Matching: nearest-neighbour matching of W on all ten covariates
mod_match <- matchit(
  W ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
  method = "nearest",
  data = imf.meta_nomiss
)
dta_m <- match.data(mod_match)

# Treatment effect in the matched sample: difference in group means of Y
t.test(Y ~ W, data = dta_m)

# AD: fitting with or without the matching weights makes no difference
# Unweighted regression on the matched sample
lm_treat3 <- lm(Y ~ W, data = dta_m)
summary(lm_treat3)

# Regression weighted by the `weights` column of the matched data
lm_treat4 <- lm(Y ~ W, data = dta_m, weights = weights)
summary(lm_treat4)