# load libraries
!require(kableExtra)){install.packages("kableExtra")} # tables
if(!require(gtsummary)){install.packages("gtsummary")} # tables
# simulation seed
set.seed(123) # reproducibility
# define the parameters
= 1000 # Number of observations
n = 0.5 # Probability of A = 1 (exposure)
p = 0 # Intercept for L (mediator)
alpha = 2 # Effect of A on L
beta = 1 # Intercept for Y
gamma = 1.5 # Effect of L on Y
delta = 1 # Standard deviation of L
sigma_L = 1.5 # Standard deviation of Y
sigma_Y
# simulate the data: fully mediated effect by L
= rbinom(n, 1, p) # binary exposure variable
A = alpha + beta*A + rnorm(n, 0, sigma_L) # mediator L affect by A
L = gamma + delta*L + rnorm(n, 0, sigma_Y) # Y affected only by L,
Y
# make the data frame
= data.frame(A = A, L = L, Y = Y)
data
# fit regression in which we control for L, a mediator
# (cross-sectional data is consistent with this model)
<- lm( Y ~ A + L, data = data)
fit_1
# fit regression in which L is assumed to be a mediator, not a confounder.
# (cross-sectional data is also consistent with this model)
<- lm( Y ~ A, data = data)
fit_2
# create gtsummary tables for each regression model
<- gtsummary::tbl_regression(fit_1)
table1 <- gtsummary::tbl_regression(fit_2)
table2
# merge the tables for comparison
<- gtsummary::tbl_merge(
table_comparison list(table1, table2),
tab_spanner = c("Model: Exercise assumed confounder",
"Model: Exercise assumed to be a mediator")
)# make latex table (for publication)
<- as_kable_extra(table_comparison,
markdown_table_0 format = "html")
# print table (note, you might prefer "markdown" or another format)
markdown_table_0
Simulation: Population Estimate
Simulation of Population ATE
Methodology
Data Generation: we simulate a dataset for 1,000 individuals, where e.g. degree of religious belief (A) influences wealth (L), which in turn affects charitable donations (Y$). The simulation is based on predefined parameters that establish L as a mediator between A and Y.
Parameter Definitions:
- The probability of access to green space (A) is set at 0.5.
- The effect of A on L (exercise) is given by \beta = 2.
- The effect of L on Y (happiness) is given by \delta = 1.5.
- Standard deviations for L and Y are set at 1 and 1.5, respectively.
Model 1 (Correct Assumption): fits a linear regression model assuming L as a mediator, including both A and L as regressors on Y. This model aligns with the data-generating process, and, by the rules of d-separation, induces mediator bias for the A\to Y path.
Model 2 (Incorrect Assumption): fits a linear regression model including only A as a regressor on Y, omitting the mediator L. This model assesses the direct effect of A on Y without accounting for mediation.
Analysis: We compares the estimated effects of A on Y under each model specification.
The following code is designed to estimate the Average Treatment Effect (ATE) using the clarify
package in R, which is referenced here as (Greifer et al. 2023). The procedure involves two steps: simulating coefficient distributions for regression models and then calculating the ATE based on these simulations. This process is applied to two distinct models to demonstrate the effects of including versus excluding a mediator variable in the analysis.
Steps to Estimate the ATE
Load the
clarify
Package: this package provides functions to simulate regression coefficients and compute average marginal effects (AME), robustly facilitating the estimation of ATE.Set seed:
set.seed(123)
ensures that the results of the simulations are reproducible, allowing for consistent outcomes across different code runs.Simulate the data distribution:
sim_coefs_fit_1
andsim_coefs_fit_2
are generated using thesim
function from theclarify
package, applied to two fitted models (fit_1
andfit_2
). These functions simulate the distribution of coefficients based on the specified models, capturing the uncertainty around the estimated parameters.Calculate ATE:
For both models, the sim_ame
function calculates the ATE as the marginal risk difference (RD) when the treatment variable (A
) is present (A == 1
). This function uses the simulated coefficients to estimate the treatment effect across the simulated distributions, providing a comprehensive view of the ATE under each model.
To streamline the output, the function is set to verbose mode off (verbose = FALSE
).
- Results:
Summaries of these estimates (summary_sim_est_fit_1
and summary_sim_est_fit_2
) are obtained, providing detailed statistics including the estimated ATE and its 95% confidence intervals (CI).
- Presentation: report ATE and CIs:
Using the glue
package, the ATE and its 95% CIs for both models are formatted into a string for easy reporting. This step transforms the statistical output into a more interpretable form, highlighting the estimated treatment effect and its precision.
# use `clarify` package to obtain ATE
if(!require(clarify)){install.packages("clarify")} # clarify package
# simulate fit 1 ATE
set.seed(2025)
<- sim(fit_1)
sim_coefs_fit_1 <- sim(fit_2)
sim_coefs_fit_2
# marginal risk difference ATE, simulation-based: model 1 (L is a confounder)
<-
sim_est_fit_1 sim_ame(
sim_coefs_fit_1,var = "A",
subset = A == 1,
contrast = "RD",
verbose = FALSE
)# marginal risk difference ATE, simulation-based: model 2 (L is a mediator)
<-
sim_est_fit_2 sim_ame(
sim_coefs_fit_2,var = "A",
subset = A == 1,
contrast = "RD",
verbose = FALSE
)# obtain summaries
<- summary(sim_est_fit_1, null = c(`RD` = 0))
summary_sim_est_fit_1 <- summary(sim_est_fit_2, null = c(`RD` = 0))
summary_sim_est_fit_2
# reporting
# ate for fit 1, with 95% CI
<- glue::glue(
ATE_fit_1 "ATE =
{round(summary_sim_est_fit_1[3, 1], 2)},
CI = [{round(summary_sim_est_fit_1[3, 2], 2)},
{round(summary_sim_est_fit_1[3, 3], 2)}]"
)# ate for fit 2, with 95% CI
<-
ATE_fit_2 ::glue(
glue"ATE = {round(summary_sim_est_fit_2[3, 1], 2)},
CI = [{round(summary_sim_est_fit_2[3, 2], 2)},
{round(summary_sim_est_fit_2[3, 3], 2)}]"
)
Upshot of the Simulation and Analysis
Model 1 (L as a Confounder): this analysis assumes that
L
is a confounder in the relationship between the treatment (A
) and the outcome (Y
), and thus, it includesL
in the model. The ATE estimated here reflects the effect ofA
while controlling forL
.Model 2 (L as a Mediator): in contrast, this analysis considers
L
to be a mediator, and the model either includesL
explicitly in its estimation process or excludes it to examine the direct effect ofA
onY
. The approach to mediation analysis here is crucial as it influences the interpretation of the ATE.
By comparing the ATEs from both models, researchers can understand the effect of mediation (or the lack thereof) on the estimated treatment effect. This comparison sheds light on how assumptions about variable roles (confounder vs. mediator) can significantly alter causal inferences drawn from cross-sectional data.
Wherever it is uncertain whether a variable is a confounder or a mediator, we suggest creating two causal diagrams and reporting both analyses.
Appendix D: Simulation of Different Confounding Control Strategies
This appendix outlines the methodology and results of a data simulation designed to compare different strategies for controlling confounding in the context of environmental psychology research. Specifically, the simulation examines the effect of access to open green spaces (treatment, A_1) on happiness (outcome, Y_2) while addressing the challenge of unmeasured confounding. The simulation incorporates baseline measures of exposure and outcome (A_0, Y_0), baseline confounders (L_0), and an unmeasured confounder (U) to evaluate the effectiveness of different analytical approaches.
Methodology
1.Load Libraries kableExtra
, gtsummary
, and grf
.
Target: we simulate data for 10,000 individuals, including baseline exposure to green spaces (A_0), baseline happiness (Y_0), baseline confounders (L_0), and an unmeasured confounder (U). The simulation uses a logistic model for treatment assignment and a linear model for the continuous outcome, incorporating interactions to assess how baseline characteristics modify the treatment effect.
Set seed and simulate the data distribution:
Treatment assignment coefficients: \beta_{A0} = 0.25, \beta_{Y0} = 0.3, \beta_{L0} = 0.2, and \beta_{U} = 0.1. Outcome model coefficients: \delta_{A1} = 0.3, \delta_{Y0} = 0.9, \delta_{A0} = 0.1, \delta_{L0} = 0.3, with an interaction effect (\theta_{A0Y0L0} = 0.5) indicating the combined influence of baseline exposure, outcome, and confounders on the follow-up outcome.
Model comparison:
- No control model: estimates the effect of A_1 on Y_2 without controlling for any confounders.
- Standard covariate control model: controls for baseline confounders (L_0) alongside treatment (A_1).
- Baseline exposure and outcome model: extends the standard model by including baseline treatment and outcome (A_0, Y_0) and their interaction with L_0.
Results: each modelโs effectiveness in estimating the true treatment effect is assessed by comparing regression outputs. The simulation evaluates how well each model addresses the bias introduced by unmeasured confounding and the role of baseline characteristics in modifying treatment effects.
Presentation: the results are synthesised in a comparative table, formatted using the
kableExtra
{Zhu (2021)] andgtsummary
packages (Sjoberg et al. 2021), highlighting the estimated treatment effects and their statistical significance across models.
Overall, we use the simulation to illustrate the importance of incorporating baseline characteristics and their interactions to mitigate the influence of unmeasured confounding.
Here is the simulation/model code:
library(kableExtra)
if(!require(kableExtra)){install.packages("kableExtra")} # causal forest
if(!require(gtsummary)){install.packages("gtsummary")} # causal forest
Loading required package: gtsummary
if(!require(grf)){install.packages("grf")} # causal forest
Loading required package: grf
# r_texmf()eproducibility
set.seed(123)
# set number of observations
<- 10000
n
# baseline covariates
<- rnorm(n) # Unmeasured confounder
U <- rbinom(n, 1, prob = plogis(U)) # Baseline exposure
A_0 <- rnorm(n, mean = U, sd = 1) # Baseline outcome
Y_0 <- rnorm(n, mean = U, sd = 1) # Baseline confounders
L_0
# coefficients for treatment assignment
= 0.25
beta_A0 = 0.3
beta_Y0 = 0.2
beta_L0 = 0.1
beta_U
# simulate treatment assignment
<- rbinom(n, 1, prob = plogis(-0.5 +
A_1 * A_0 +
beta_A0 * Y_0 +
beta_Y0 * L_0 +
beta_L0 * U))
beta_U # coefficients for continuous outcome
= 0.3
delta_A1 = 0.9
delta_Y0 = 0.1
delta_A0 = 0.3
delta_L0 = 0.5 # Interaction effect between A_1 and L_0
theta_A0Y0L0 = 0.05
delta_U # simulate continuous outcome including interaction
<- rnorm(n,
Y_2 mean = 0 +
* A_1 +
delta_A1 * Y_0 +
delta_Y0 * A_0 +
delta_A0 * L_0 +
delta_L0 * Y_0 *
theta_A0Y0L0 * L_0 +
A_0 * U,
delta_U sd = .5)
# assemble data frame
<- data.frame(Y_2, A_0, A_1, L_0, Y_0, U)
data
# model: no control
<- lm(Y_2 ~ A_1, data = data)
fit_no_control
# model: standard covariate control
<- lm(Y_2 ~ A_1 + L_0, data = data)
fit_standard
# model: interaction with baseline confounders, and baseline outcome and exposure
<- lm(Y_2 ~ A_1 * (L_0 + A_0 + Y_0), data = data)
fit_interaction
# create gtsummary tables for each regression model
<- tbl_regression(fit_no_control)
tbl_fit_no_control<- tbl_regression(fit_standard)
tbl_fit_standard <- tbl_regression(fit_interaction)
tbl_fit_interaction
# get only the treatment variable
<- lapply(list(
tbl_list_modified
tbl_fit_no_control,
tbl_fit_standard,
tbl_fit_interaction),function(tbl) {
%>%
tbl modify_table_body(~ .x %>% dplyr::filter(variable == "A_1"))
})# merge tables
<- tbl_merge(
table_comparison tbls = tbl_list_modified,
tab_spanner = c(
"No Control",
"Standard",
"Interaction")
|>
) modify_table_styling(
column = c(p.value_1, p.value_2, p.value_3),
hide = TRUE
)# latex table for publication
<-
markdown_table as_kable_extra(table_comparison, format = "markdown") |>
print(markdown_table)
<table style="NAborder-bottom: 0;">
<thead>
<tr>
<th style="empty-cells: hide;border-bottom:hidden;" colspan="1"></th>
<th style="border-bottom:hidden;padding-bottom:0; padding-left:3px;padding-right:3px;text-align: center; " colspan="2"><div style="border-bottom: 1px solid #ddd; padding-bottom: 5px; ">No Control</div></th>
<th style="border-bottom:hidden;padding-bottom:0; padding-left:3px;padding-right:3px;text-align: center; " colspan="2"><div style="border-bottom: 1px solid #ddd; padding-bottom: 5px; ">Standard</div></th>
<th style="border-bottom:hidden;padding-bottom:0; padding-left:3px;padding-right:3px;text-align: center; " colspan="2"><div style="border-bottom: 1px solid #ddd; padding-bottom: 5px; ">Interaction</div></th>
</tr>
<tr>
<th style="text-align:left;"> Characteristic </th>
<th style="text-align:center;"> Beta </th>
<th style="text-align:center;"> 95% CI </th>
<th style="text-align:center;"> Beta </th>
<th style="text-align:center;"> 95% CI </th>
<th style="text-align:center;"> Beta </th>
<th style="text-align:center;"> 95% CI </th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;"> A_1 </td>
<td style="text-align:center;"> 1.5 </td>
<td style="text-align:center;"> 1.5, 1.6 </td>
<td style="text-align:center;"> 0.86 </td>
<td style="text-align:center;"> 0.80, 0.93 </td>
<td style="text-align:center;"> 0.25 </td>
<td style="text-align:center;"> 0.20, 0.31 </td>
</tr>
</tbody>
<tfoot><tr><td style="padding: 0; " colspan="100%">
<sup></sup> Abbreviation: CI = Confidence Interval</td></tr></tfoot>
</table>
Next, in the following code, we calculate the Average Treatment Effect (ATE) using simulation-based approaches for two distinct models: one with standard covariate control and another incorporating interaction. This approach leverages the clarify
package in R, which facilitates the simulation and interpretation of estimated coefficients from linear models to derive ATEs under different modelling assumptions (Greifer et al. 2023).
First, we use the sim
function from the clarify
package to generate simulated coefficient distributions for the standard model (fit_standard
) and the interaction model (fit_interaction
). This step is crucial for capturing the uncertainty in our estimates arising from sampling variability.
Next, we employ each modelโs sim_ame
function to compute the average marginal effects (AME), focusing on the treatment variable (A_1
). The calculation is done under the assumption that all individuals are treated (i.e., A_1 == 1
), and we specify the contrast type as โRDโ (Risk Difference) to directly obtain the ATE (Average Treatment Effect). The sim_ame
function simulates the treatment effect across the distribution of simulated coefficients, providing a robust estimate of the ATE and its variability.
The summaries of these simulations (summary_sim_est_fit_std
and summary_sim_est_fit_int
) are then extracted to provide concise estimates of the ATE along with 95% confidence intervals (CIs) for both the standard and interaction models. This step is essential for understanding the magnitude and precision of the treatment effects estimated by the models.
Finally, we use the glue
package to format these estimates into a human-readable form, presenting the ATE and its corresponding 95% CIs for each model. This presentation facilitates clear communication of the estimated treatment effects, allowing for direct comparison between the models and highlighting the effect of including baseline characteristics and their interactions on estimating the ATE (Hester and Bryan 2022).
This simulation-based approach to estimating the ATE underscores the importance of considering model complexity and the roles of confounders and mediators in causal inference analyses. By comparing the ATE estimates from different models, we can assess the sensitivity of our causal conclusions to various assumptions and modelling strategies.
# use `clarify` package to obtain ATE
if(!require(clarify)){install.packages("clarify")} # clarify package
Loading required package: clarify
# simulate fit 1 ATE
set.seed(123)
<- sim(fit_no_control)
sim_coefs_fit_no_control<- sim(fit_standard)
sim_coefs_fit_std <- sim(fit_interaction)
sim_coefs_fit_int
# marginal risk difference ATE, no controls
<-
sim_est_fit_no_control sim_ame(
sim_coefs_fit_no_control,var = "A_1",
subset = A_1 == 1,
contrast = "RD",
verbose = FALSE
)# marginal risk difference ATE, simulation-based: model 1 (L is a confounder)
<-
sim_est_fit_std sim_ame(
sim_coefs_fit_std,var = "A_1",
subset = A_1 == 1,
contrast = "RD",
verbose = FALSE
)# marginal risk difference ATE, simulation-based: model 2 (L is a mediator)
<-
sim_est_fit_int sim_ame(
sim_coefs_fit_int,var = "A_1",
subset = A_1 == 1,
contrast = "RD",
verbose = FALSE
)# obtain summaries
<-
summary_sim_coefs_fit_no_control summary(sim_est_fit_no_control, null = c(`RD` = 0))
<-
summary_sim_est_fit_std summary(sim_est_fit_std, null = c(`RD` = 0))
<-
summary_sim_est_fit_int summary(sim_est_fit_int, null = c(`RD` = 0))
# get coefficients for reporting
# ate for fit 1, with 95% CI
<- glue::glue(
ATE_fit_no_control "ATE = {round(summary_sim_coefs_fit_no_control[3, 1], 2)},
CI = [{round(summary_sim_coefs_fit_no_control[3, 2], 2)},
{round(summary_sim_coefs_fit_no_control[3, 3], 2)}]"
)# ate for fit 2, with 95% CI
<- glue::glue(
ATE_fit_std "ATE = {round(summary_sim_est_fit_std[3, 1], 2)},
CI = [{round(summary_sim_est_fit_std[3, 2], 2)},
{round(summary_sim_est_fit_std[3, 3], 2)}]"
)# ate for fit 3, with 95% CI
<-
ATE_fit_int ::glue(
glue"ATE = {round(summary_sim_est_fit_int[3, 1], 2)},
CI = [{round(summary_sim_est_fit_int[3, 2], 2)},
{round(summary_sim_est_fit_int[3, 3], 2)}]"
)# coefs they used in the manuscript
Using the clarify
package, we infer the ATE for the standard model is ATE = 0.86, CI = [0.8, 0.92].
Using the clarify
package, we infer the ATE for the model that conditions on the baseline exposure and baseline outcome to be: ATE = 0.43, CI = [0.39, 0.47], which is close to the values supplied to the data-generating mechanism.
Take-home message:
The baseline exposure and baseline outcome are often the most important variables to include for confounding control. The baseline exposure also allows us to estimate an incident-exposure effect. For this reason, we should endeavour to obtain at least three waves of data such that these variables and other baseline confounders are included at time 0, the exposure is included at time 1, and the outcome is included at time 2.
Appendix E: Non-parametric Estimation of Average Treatment Effects Using Causal Forests
This appendix provides a practical example of estimating average treatment effects (ATE) using a non-parametric approach, specifically applying causal forests. Unlike traditional regression models, causal forests allow for estimating treatment effects without imposing strict assumptions about the form of the relationship between treatment, covariates, and outcomes. This flexibility makes them particularly useful for analysing complex datasets where the treatment effect may vary across observations.
Causal Forest Model Implementation
Libraries: the implementation begins with loading the necessary R libraries:
grf
for estimating conditional and average treatment effects using causal forests andglue
for formatting the results for reporting.- Data generation: the code assumes the presence of a data frame
data
generated from the previous code snippet containing the variables:
A_1
: Treatment indicator.L_0
: A covariate.Y_2
: Outcome of interest.A_0
andY_0
: Baseline exposure and outcome, respectively.
Treatment (
W
) and outcome (Y
) vectors are extracted fromdata
alongside a matrixX
that includes covariates and baseline characteristics.- Data generation: the code assumes the presence of a data frame
Causal Forest model: a causal forest model is fitted using the
causal_forest
function from thegrf
package (Tibshirani et al. 2024). This function takes the covariate matrixX
, the outcome vectorY
, and the treatment vectorW
as inputs, and it returns a model object that can be used for further analysis.Average Treatment Effect estimation: the
average_treatment_effect
function computes the ATE from the fitted causal forest model. This step is crucial as it quantifies the overall effect of the treatment across the population, adjusting for covariates included in the model.Reporting: The estimated ATE and its standard error (se) are extracted and formatted for reporting using the
glue
package (Hester and Bryan 2022). This facilitates clear communication of the results, showing the estimated effect size and its uncertainty.
Key Takeaways
First, causal forests offer a robust way to estimate treatment effects without making parametric solid assumptions. This approach is particularly advantageous in settings where the treatment effect may vary with covariates or across different subpopulations.
Second, the model estimates the ATE as the difference in expected outcomes between treated and untreated units, averaged across the population. This estimate reflects the overall effect of the treatment, accounting for the distribution of covariates in the sample.
Third, we find that the estimated ATE by the causal forest model converges to the actual value used in the data-generating process (assumed to be 0.3). This demonstrates the effectiveness of causal forests in uncovering the true treatment effect from complex data.
This example underscores the utility of semi-parametric and non-parametric methods, such as causal forests, in causal inference analyses.
# load causal forest library
library(grf) # estimate conditional and average treatment effects
library(glue) # reporting
# 'data' is our data frame with columns 'A_1' for treatment, 'L_0' for a covariate, and 'Y_2' for the outcome
# we also have the baseline exposure 'A_0' and 'Y_0'
# ensure W (treatment) and Y (outcome) are vectors
<- as.matrix(data$A_1) # Treatment
W <- as.matrix(data$Y_2) # Outcome
Y <- as.matrix(data[, c("L_0", "A_0", "Y_0")])
X
# fit causal forest model
<- causal_forest(X, Y, W)
fit_causal_forest
# estimate the average treatment effect (ATE)
<- average_treatment_effect(fit_causal_forest)
ate
# make data frame for reporting using "glue'
<- data.frame(ate)
ate
# obtain ate for report
<-
ATE_fit_causal_forest ::glue(
glue"ATE = {round(ate[1, 1], 2)}, se = {round(ate[2, 1], 2)}"
)
Causal forest estimates the average treatment effect as ATE = 0.3, se = 0.01. This approach converges to the true value supplied to the generating mechanism of 0.3