omarsilverman committed May 29, 2024
2 parents ee5dab8 + f96f0e9 commit a92fdf5
Showing 4 changed files with 99 additions and 120 deletions.
25 changes: 22 additions & 3 deletions includes/papers.bib
@@ -389,7 +389,7 @@ @article{knowler_reduction_2002
year = {2002},
pmid = {11832527},
pmcid = {PMC1370926},
keywords = {Adult, Blood Glucose, Body Mass Index, Diabetes Mellitus, Type 2, Double-Blind Method, Energy Intake, Exercise, Female, Humans, Hypoglycemic Agents, Incidence, Life Style, Male, Metformin, Middle Aged, Patient Compliance, Risk Factors, Weight Loss},
keywords = {Male, Adult, Humans, Energy Intake, Female, Middle Aged, Risk Factors, Diabetes Mellitus, Type 2, Life Style, Blood Glucose, Metformin, Weight Loss, Exercise, Body Mass Index, Incidence, Double-Blind Method, Hypoglycemic Agents, Patient Compliance},
pages = {393--403},
file = {Full Text:C\:\\Users\\au302898\\Zotero\\storage\\PQA8KYS7\\Knowler et al. - 2002 - Reduction in the incidence of type 2 diabetes with.pdf:application/pdf},
}
@@ -410,7 +410,7 @@ @article{pan_effects_1997
month = apr,
year = {1997},
pmid = {9096977},
keywords = {Adult, Blood Glucose, Body Mass Index, China, Combined Modality Therapy, Diabetes Mellitus, Diabetes Mellitus, Type 2, Exercise, Female, Follow-Up Studies, Glucose Intolerance, Humans, Incidence, Male, Mass Screening, Middle Aged, Obesity, Proportional Hazards Models, Risk Factors, Time Factors},
keywords = {Male, Adult, Obesity, Humans, Female, Middle Aged, Risk Factors, Diabetes Mellitus, Type 2, Blood Glucose, Exercise, Body Mass Index, Diabetes Mellitus, Proportional Hazards Models, Incidence, Follow-Up Studies, China, Time Factors, Combined Modality Therapy, Glucose Intolerance, Mass Screening},
pages = {537--544},
}

@@ -430,7 +430,7 @@ @article{tuomilehto_prevention_2001
month = may,
year = {2001},
pmid = {11333990},
keywords = {Diabetes Mellitus, Type 2, Diet, Fat-Restricted, Dietary Fiber, Exercise, Female, Glucose Intolerance, Glucose Tolerance Test, Humans, Incidence, Life Style, Male, Middle Aged, Obesity, Risk, Weight Loss},
keywords = {Male, Obesity, Humans, Female, Middle Aged, Diabetes Mellitus, Type 2, Life Style, Diet, Fat-Restricted, Weight Loss, Dietary Fiber, Glucose Tolerance Test, Risk, Exercise, Incidence, Glucose Intolerance},
pages = {1343--1350},
}

@@ -534,3 +534,22 @@ @article{tingley_mediation_2014
year = {2014},
file = {Tingley et al. - 2014 - mediation R Package for Causal Med.pdf:C\:\\Users\\au302898\\Zotero\\storage\\AQ6Z2QNW\\Tingley et al. - 2014 - mediation R Package for Causal Med.pdf:application/pdf},
}

@article{vanderweele_sensitivity_2017,
title = {Sensitivity {Analysis} in {Observational} {Research}: {Introducing} the {E}-{Value}},
volume = {167},
issn = {1539-3704},
shorttitle = {Sensitivity {Analysis} in {Observational} {Research}},
doi = {10.7326/M16-2607},
abstract = {Sensitivity analysis is useful in assessing how robust an association is to potential unmeasured or uncontrolled confounding. This article introduces a new measure called the "E-value," which is related to the evidence for causality in observational studies that are potentially subject to confounding. The E-value is defined as the minimum strength of association, on the risk ratio scale, that an unmeasured confounder would need to have with both the treatment and the outcome to fully explain away a specific treatment-outcome association, conditional on the measured covariates. A large E-value implies that considerable unmeasured confounding would be needed to explain away an effect estimate. A small E-value implies little unmeasured confounding would be needed to explain away an effect estimate. The authors propose that in all observational studies intended to produce evidence for causality, the E-value be reported or some other sensitivity analysis be used. They suggest calculating the E-value for both the observed association estimate (after adjustments for measured confounders) and the limit of the confidence interval closest to the null. If this were to become standard practice, the ability of the scientific community to assess evidence from observational studies would improve considerably, and ultimately, science would be strengthened.},
language = {eng},
number = {4},
journal = {Annals of Internal Medicine},
author = {VanderWeele, Tyler J. and Ding, Peng},
month = aug,
year = {2017},
pmid = {28693043},
keywords = {Confounding Factors, Epidemiologic, Epidemiologic Research Design, Humans, Observational Studies as Topic, Sensitivity and Specificity},
pages = {268--274},
file = {Submitted Version:C\:\\Users\\au302898\\Zotero\\storage\\5RG8AH7Q\\VanderWeele and Ding - 2017 - Sensitivity Analysis in Observational Research In.pdf:application/pdf},
}
176 changes: 68 additions & 108 deletions sessions/causal-mediation-analysis-estimation.qmd
@@ -432,27 +432,27 @@ grViz("
digraph {
graph []
node [shape = plaintext]
X
M
L
U [fontcolor = red]
Y
edge [minlen = 1.5]
X->Y
X->L
L->Y
W [label = 'W']
A [label = 'A']
M [label = 'M']
Y [label = 'Y']
L [label = 'L']
U [label = 'U']
edge [minlen = 2]
A->M
A->L
L->M
U->L
U->Y
X->M
L->Y
M->Y
W->X
W->M
A->Y
W->A
W->Y
{ rank = same; X; M; Y }
{ rank = min; L;}
{ rank = max; W;}
}")
W->M
U->L
U->Y
{rank = same; W; A; M; Y; }
}
")
```

### Time-varying confounding
@@ -478,7 +478,7 @@ digraph {
L2
Lt
Y
edge [minlen = 1.5]
edge [minlen = 2]
X0 -> X1
X1 -> X2
X2 -> Xt
@@ -554,111 +554,71 @@ We will now demonstrate how to conduct g-computation in a simulated
dataset.

```{r}
datasim <- function(n) {
x1 <- rbinom(n, size = 1, prob = 0.5)
datasim <- function(n) {
# This small function simulates a dataset with n rows
  # containing covariates, an action, an outcome, and
# the underlying potential outcomes
x1 <- rbinom(n, size = 1, prob = 0.5)
x2 <- rbinom(n, size = 1, prob = 0.65)
x3 <- rnorm(n, 0, 1)
x4 <- rnorm(n, 0, 1)
A <- rbinom(n, size = 1, prob = plogis(-1.6 + 2.5 * x2 + 2.2 * x3 + 0.6 * x4 + 0.4 * x2 * x4))
M <- 0.5 * A + 0.3 * x1 + rnorm(n)
Y.1 <- rbinom(n, size = 1, prob = plogis(-0.7 + 1 - 0.15 * x1 + 0.45 * x2 + 0.20 * x4 + 0.4 * x2 * x4))
Y.0 <- rbinom(n, size = 1, prob = plogis(-0.7 + 0 - 0.15 * x1 + 0.45 * x2 + 0.20 * x4 + 0.4 * x2 * x4))
Y <- Y.1 * A + Y.0 * (1 - A)
data.frame(x1, x2, x3, x4, A, M, Y, Y.1, Y.0)
}
# The action (independent variable, treatment, exposure...)
# is a function of x2, x3, x4 and
# the product of (ie, an interaction of) x2 and x4
A <- rbinom(n, size = 1, prob = plogis(-1.6 + 2.5*x2 + 2.2*x3 + 0.6*x4 + 0.4*x2*x4))
# Simulate the two potential outcomes
  # as functions of x1, x2, x4 and the product of x2 and x4
# Potential outcome if the action is 1
# note that here, we add 1
Y.1 <- rbinom(n, size = 1, prob = plogis(-0.7 + 1 - 0.15*x1 + 0.45*x2 + 0.20*x4 + 0.4*x2*x4))
# Potential outcome if the action is 0
  # note that here, we do not add 1, so that there
  # is a difference between the two potential outcomes (ie, an effect of A)
Y.0 <- rbinom(n, size = 1, prob = plogis(-0.7 + 0 - 0.15*x1 + 0.45*x2 + 0.20*x4 + 0.4*x2*x4))
  # The observed outcome is the potential outcome (Y.1 or Y.0)
  # corresponding to the action the individual experienced (A)
Y <- Y.1*A + Y.0*(1 - A)
# Return a data.frame as the output of this function
data.frame(x1, x2, x3, x4, A, Y, Y.1, Y.0)
}
set.seed(120110) # for reproducibility
ObsData <- datasim(n = 10000) # really large sample
TRUE_EY.1 <- mean(ObsData$Y.1)
TRUE_EY.1 # mean outcome under A = 1
TRUE_EY.0 <- mean(ObsData$Y.0)
# True average causal effect
TRUE_ACE <- TRUE_EY.1 - TRUE_EY.0
TRUE_EY.1 - TRUE_EY.0
TRUE_MOR <- (TRUE_EY.1 * (1 - TRUE_EY.0)) / ((1 - TRUE_EY.1) * TRUE_EY.0)
TRUE_MOR # true marginal OR
```
ObsData <- datasim(n = 500000) # really large sample
TRUE_EY.1 <- mean(ObsData$Y.1); TRUE_EY.1 # mean outcome under A = 1
Step 1: Fit the models
```{r}
##fit the mediator as a function of A and W
mediator_model <- lm(M ~ A + x1 + x2 + x3 + x4, data = ObsData)
##fit the outcome as a function of A, M and W
outcome_model <- glm(Y ~ A + M + x1 + x2 + x3 + x4, data = ObsData, family = binomial)
```
# Fit a logistic regression model predicting Y from the relevant confounders
Q <- glm(Y ~ A + x2 + x4 + x2*x4, data = ObsData, family=binomial)
Step 2: Duplicate the initial dataset in two counterfactual datasets. In
one world, set A=1; in the other, set A=0. All other variables keep the
original values.
```{r}
# Copy the "actual" data twice
# Copy the "actual" (simulated) data twice
A1Data <- A0Data <- ObsData
# In one world A equals 1 for everyone, in the other one it equals 0 for everyone
# The rest of the data stays as is (for now)
A1Data$A <- 1
A0Data$A <- 0
```
A1Data$A <- 1; A0Data$A <- 0
head(ObsData); head(A1Data); head(A0Data)
Step 3: Apply the models fitted in step 1 to predict each individual's
mediator and outcome in the two counterfactual datasets.

```{r}
# Predict M if A=1
M_A1 <- predict(mediator_model, newdata=A1Data)
# Predict M if A=0
M_A0 <- predict(mediator_model, newdata=A0Data)
# Predict Y if everybody receives the action (A = 1)
Y_A1 <- predict(Q, A1Data, type="response")
# Predict Y if nobody receives the action (A = 0)
Y_A0 <- predict(Q, A0Data, type="response")
# Taking a look at the predictions
data.frame(M_A1 = head(M_A1), M_A0 = head(M_A0))
# Add the predicted mediator values to the respective datasets
A1Data$M <- M_A1
A0Data$M <- M_A0
# Predict Y if A=1, using the predicted mediator values M_A1
Y_A1_M1 <- predict(outcome_model, newdata = A1Data, type = "response")
# Predict Y if A=0, using the predicted mediator values M_A0
Y_A0_M0 <- predict(outcome_model, newdata = A0Data, type = "response")
data.frame(Y_A1=head(Y_A1), Y_A0=head(Y_A0), TRUE_Y=head(ObsData$Y)) |> round(digits = 2)
# Create dataset for A=1, M=0 and A=0, M=1 scenarios
A1Data_M0 <- A1Data
A0Data_M1 <- A0Data
A1Data_M0$M <- M_A0
A0Data_M1$M <- M_A1
# Mean outcomes in the two worlds
pred_A1 <- mean(Y_A1); pred_A0 <- mean(Y_A0)
# Predict the outcomes for these scenarios
Y_A1_M0 <- predict(outcome_model, newdata = A1Data_M0, type = "response")
Y_A0_M1 <- predict(outcome_model, newdata = A0Data_M1, type = "response")
data.frame(Y_A1_M1 = head(Y_A1_M1), Y_A0_M0 = head(Y_A0_M0),Y_A1_M0 = head(Y_A1_M0),Y_A0_M1 = head(Y_A0_M1))
```

Step 4: Compute the average effects

```{r}
# Average outcome when A = 1
mean_Y_A1 <- mean(Y_A1_M1)
# Average outcome when A = 0
mean_Y_A0 <- mean(Y_A0_M0)
# Total effect
total_effect <- mean_Y_A1 - mean_Y_A0
# Total effect
total_effect
# Calculate the average outcomes under counterfactual scenarios
mean_Y_A1 <- mean(Y_A1_M1)
mean_Y_A0 <- mean(Y_A0_M0)
mean_Y_A1_M0 <- mean(Y_A1_M0)
mean_Y_A0_M1 <- mean(Y_A0_M1)
mean_Y_A1_M1 <- mean(Y_A1_M1)
mean_Y_A0_M0 <- mean(Y_A0_M0)
# Calculate the total effect
gcomp_ACE <- mean_Y_A1 - mean_Y_A0
gcomp_ACE
# Calculate the Marginal Odds Ratio using g-computation predictions
gcomp_MOR <- (mean_Y_A1 * (1 - mean_Y_A0)) / ((1 - mean_Y_A1) * mean_Y_A0)
# Marginal odds ratio
MOR_gcomp <- (pred_A1*(1 - pred_A0))/((1 - pred_A1)*pred_A0)
# ATE (risk difference)
RD_gcomp <- pred_A1 - pred_A0
c(MOR_gcomp, RD_gcomp) |> round(digits=3)
```

We recommend taking advantage of R packages to do the work.
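
For example, the `mediation` package @tingley_mediation_2014 automates the estimation of the average causal mediation effect (ACME) and the average direct effect (ADE). The chunk below is only a sketch of the workflow: it reuses the simulated `ObsData` and re-adds the mediator from the earlier version of `datasim()`; in this simulation `M` has no effect on `Y`, so the ACME should be close to zero.

```{r}
library(mediation)

# Re-create a mediator as in the earlier version of datasim()
# (M depends on A and x1 only)
ObsData$M <- 0.5 * ObsData$A + 0.3 * ObsData$x1 + rnorm(nrow(ObsData))

# Mediator and outcome models
med_fit <- lm(M ~ A + x1 + x2 + x3 + x4, data = ObsData)
out_fit <- glm(Y ~ A + M + x1 + x2 + x3 + x4, data = ObsData, family = binomial)

# Quasi-Bayesian estimation of ACME, ADE, total effect and proportion mediated
med_res <- mediate(med_fit, out_fit, treat = "A", mediator = "M", sims = 500)
summary(med_res)
```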
14 changes: 7 additions & 7 deletions sessions/causal-mediation-analysis-sensitivity-analysis.qmd
@@ -252,8 +252,8 @@ levels *A* = *a* and *A* = *a*^\*^.
Once we have calculated the bias term **B~*mult*~(*c*)**, we can
estimate our risk ratio controlling only for *C* (if the outcome is
rare, we fit a logistic regression) and we divide our estimate by
**B~*mult*~(*c*)** to get the corrected estimate for risk ratiothat is,
what we would have obtained if we had adjusted for *U* as well.
**B~*mult*~(*c*)** to get the corrected estimate for risk ratio---that
is, what we would have obtained if we had adjusted for *U* as well.

Under the simplifying assumptions of (A8.1.1) and (A8.1.2b), we can also
obtain corrected confidence intervals by dividing both limits of the
@@ -429,7 +429,7 @@ Once we have calculated the bias term $B^{CDE}_{mult}(m|c)$, we can
estimate the *CDE* risk ratio controlling only for *C* (if the outcome
is rare, we fit a logistic regression) and we divide our estimate and
confidence intervals by the bias factor $B^{CDE}_{mult}(m|c)$ to get the
corrected estimate for CDE risk ratio and its confidence intervalthat
corrected estimate for CDE risk ratio and its confidence interval---that
is, what we would have obtained if we had adjusted for *U* as well.

We have to specify the two prevalences of *U*, namely $P(U = 1|a,m, c)$
@@ -573,14 +573,14 @@ uc_sens
- The E-value is the minimum strength of association, on the risk
ratio scale, that an unmeasured confounder would need to have with
both the treatment and the outcome to fully explain away a specific
treatmentoutcome association, conditional on the measured
covariates.
treatment--outcome association, conditional on the measured
    covariates @vanderweele_sensitivity_2017 (see the short sketch after
    this callout).

- A large E-value implies that considerable unmeasured confounding
would be needed to explain away an effect estimate.

- A small E-value implies little unmeasured confounding would be
needed to explain away an effect estimate.

(Tyler J VanderWeele 2017)
:::
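
As a quick illustration of these points, the sketch below computes the E-value for a hypothetical risk ratio using the closed-form expression from @vanderweele_sensitivity_2017, $E = RR + \sqrt{RR \times (RR - 1)}$ for $RR \geq 1$ (estimates below 1 are inverted first); the input values are made up.

```{r}
# E-value for a point estimate or confidence limit on the risk ratio scale
e_value <- function(rr) {
  rr <- ifelse(rr < 1, 1 / rr, rr)   # protective estimates: invert first
  rr + sqrt(rr * (rr - 1))
}

e_value(1.8)   # hypothetical point estimate
e_value(1.2)   # hypothetical confidence limit closest to the null
```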

## References
4 changes: 2 additions & 2 deletions sessions/causal-mediation-analysis-survival-outcomes.qmd
Expand Up @@ -75,6 +75,8 @@ results with respect to time-to-event outcomes @tein_estimating_2003.
They found that the methods coincide for the accelerated failure time
model but not for the proportional hazards model.

When the outcome is common, we can use a weighting approach (Lange 2012).

To sum up, we can only use the traditional approaches for rare outcomes.
Otherwise, we can use the product method to get an indication of whether
there is mediation, but be aware that the estimate is not accurate.
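
As an illustration, the product method for a time-to-event outcome can be sketched with a linear mediator model and a Cox outcome model. The variable names below (`dat`, `A`, `M`, `C`, `time`, `event`) are placeholders rather than objects defined in these sessions, and the interpretation assumes a rare outcome and no exposure--mediator interaction, so the chunk is not evaluated.

```{r}
#| eval: false
library(survival)

fit_m <- lm(M ~ A + C, data = dat)                          # mediator model
fit_y <- coxph(Surv(time, event) ~ A + M + C, data = dat)   # outcome model

a_path <- coef(fit_m)["A"]   # effect of exposure on mediator
b_path <- coef(fit_y)["M"]   # effect of mediator on the log hazard, given A and C

a_path * b_path       # rough indirect effect on the log hazard ratio scale
coef(fit_y)["A"]      # rough direct effect on the log hazard ratio scale
```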
@@ -116,8 +118,6 @@ have to satisfy below assumptions:
- Additionally, we assume that the mediator is measured for everyone
before the outcome occurs.

When the outcome is common, we can use weight approach (Lange 2012)

## Example

We will continue working on the obesity-CVD example in the Framingham
