From c64eced3f9d86f71fbf179b44cb4ea8efaddbd48 Mon Sep 17 00:00:00 2001 From: elliecurnow Date: Mon, 30 Sep 2024 12:04:14 +0100 Subject: [PATCH] Updated output to message format --- DESCRIPTION | 11 +- R/checkcra.R | 73 +- R/checkmi.R | 42 +- R/checkmodspec.R | 114 +-- R/descMissData.R | 4 +- R/doMImice.R | 54 +- R/exploreDAG.R | 46 +- R/proposeMI.R | 126 ++-- README.md | 332 ++++----- docs/articles/midoc.html | 649 +++++++++++------- docs/index.html | 330 ++++----- docs/pkgdown.yml | 2 +- docs/reference/Rplot001.png | Bin 20903 -> 0 bytes docs/reference/Rplot002.png | Bin 39368 -> 0 bytes docs/reference/checkCRA.html | 170 ++--- docs/reference/checkMI.html | 57 +- docs/reference/checkModSpec.html | 69 +- docs/reference/descMissData.html | 12 +- docs/reference/doMImice.html | 93 ++- docs/reference/exploreDAG.html | 82 ++- .../figures/README-unnamed-chunk-2-2.png | Bin 8241 -> 8236 bytes .../figures/README-unnamed-chunk-2-3.png | Bin 8770 -> 8486 bytes docs/reference/midoc-package.html | 7 +- docs/reference/proposeMI.html | 231 +++---- docs/search.json | 2 +- inst/shinyexamples/midocdemoShiny.rmd | 316 ++++++--- man/checkcra.Rd | 37 +- man/checkmi.Rd | 16 +- man/checkmodspec.Rd | 37 +- man/doMImice.Rd | 38 +- man/exploreDAG.Rd | 5 +- man/figures/README-unnamed-chunk-2-2.png | Bin 8241 -> 8236 bytes man/figures/README-unnamed-chunk-2-3.png | Bin 8770 -> 8486 bytes man/midoc-package.Rd | 2 +- man/proposeMI.Rd | 51 +- tests/testthat/test-checkcra.R | 16 +- tests/testthat/test-checkmi.R | 6 +- tests/testthat/test-checkmodspec.R | 16 +- tests/testthat/test-descMissData.R | 4 +- tests/testthat/test-doMImice.R | 6 +- tests/testthat/test-exploreDAG.R | 12 +- tests/testthat/test-proposeMI.R | 4 +- vignettes/midoc.rmd | 214 ++++-- 43 files changed, 1882 insertions(+), 1404 deletions(-) delete mode 100644 docs/reference/Rplot001.png delete mode 100644 docs/reference/Rplot002.png diff --git a/DESCRIPTION b/DESCRIPTION index 222b7eb..80f081e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,8 +13,10 @@ Description: A guidance system for analysis with missing data. It incorporates expert, up-to-date methodology to help researchers choose the most appropriate analysis approach when some data are missing. You provide the available data and the assumed causal structure, including the likely causes - of missing data. 'midoc' will advise whether multiple imputation is needed, and - if so, how best to perform it. + of missing data. `midoc` will advise which analysis approaches can be used, + and how best to perform them. `midoc` follows the framework for the treatment + and reporting of missing data in observational studies (TARMOS). Lee et al + (2021). . License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) @@ -22,7 +24,7 @@ RoxygenNote: 7.3.1 Suggests: knitr, shiny, - testthat (>= 3.0.0) + testthat Config/testthat/edition: 3 Imports: arm, @@ -35,7 +37,8 @@ Imports: mice (>= 3.16.0), rlang, rmarkdown, - stats + stats, + utils VignetteBuilder: knitr Depends: R (>= 3.6) diff --git a/R/checkcra.R b/R/checkcra.R index 2c4307f..a474122 100644 --- a/R/checkcra.R +++ b/R/checkcra.R @@ -26,73 +26,64 @@ #' the proposed DAG and analysis model outcome and covariate(s) #' @export #' +#' @references Hughes R, Heron J, Sterne J, Tilling K. 2019. Accounting for +#' missing data in statistical analyses: multiple imputation is not always the +#' answer. Int J Epidemiol. +#' +#' Bartlett JW, Harel O, Carpenter JR. 2015. Asymptotically Unbiased +#' Estimation of Exposure Odds Ratios in Complete Records Logistic Regression. +#' Am J Epidemiol. +#' #' @examples #' # Example DAG for which complete records analysis is not valid, but could be -#' ## valid for a different set of covariates +#' ## valid for a different set of covariates #' checkCRA(y="bmi7", covs="matage", r_cra="r", -#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> -#' mated sep_unmeas -> r") -#' ## For the DAG in the example above, complete records analysis is valid -#' ## if a different set of covariates is used +#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 +#' sep_unmeas -> mated sep_unmeas -> r") +#' # For the DAG in the example above, complete records analysis is valid +#' ## if a different set of covariates is used #' checkCRA(y="bmi7", covs="matage mated", r_cra="r", -#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> -#' mated sep_unmeas -> r") +#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 +#' sep_unmeas -> mated sep_unmeas -> r") #' #' # Example DAG for which complete records is not valid, but could be valid -#' ## for a different estimand +#' ## for a different estimand #' checkCRA(y="bmi7", covs="matage mated", r_cra="r", -#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> -#' mated sep_unmeas -> r matage -> bmi3 mated -> bmi3 bmi3 -> bmi7 -#' bmi3 -> r") +#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 +#' sep_unmeas -> mated sep_unmeas -> r matage -> bmi3 +#' mated -> bmi3 bmi3 -> bmi7 bmi3 -> r") #' -#' # Example DAG for which complete records analysis is not valid +#' # Example DAG for which complete records analysis is never valid #' checkCRA(y="bmi7", covs="matage mated", r_cra="r", -#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> -#' mated sep_unmeas -> r bmi7 -> r") +#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 +#' sep_unmeas -> mated sep_unmeas -> r bmi7 -> r") checkCRA <- function(y, covs, r_cra, mdag) { mdagspec <- paste('dag {',mdag,'}') covsvec <- unlist(strsplit(covs," ")) #If r does not depend on y conditional on covariates, then CRA is valid if(dagitty::dseparated(dagitty::dagitty(mdagspec, layout=T), y, r_cra, covsvec)){ - cat(strwrap("Based on the proposed directed acyclic graph (DAG), -the analysis model outcome and complete record indicator are independent -given analysis model covariates. Hence, complete records analysis is valid."),"\n",fill=TRUE) + result <- paste("Based on the proposed directed acyclic graph (DAG), the analysis model outcome and complete record indicator are independent given analysis model covariates. Hence, complete records analysis is valid.", collapse="\n") } else { - cat(strwrap("Based on the proposed directed acyclic graph (DAG), -the analysis model outcome and complete record indicator are not independent -given analysis model covariates. -Hence, in general, complete records analysis is not valid."),"\n", - strwrap("In special cases, depending on the type of analysis model and estimand of interest, complete -records analysis may still be valid. See, for example, Bartlett et al. (2015) -(https://doi.org/10.1093/aje/kwv114) for further details."),"\n", fill=TRUE) + result1 <- paste("Based on the proposed directed acyclic graph (DAG), the analysis model outcome and complete record indicator are not independent given analysis model covariates. Hence, in general, complete records analysis is not valid. \n \nIn special cases, depending on the type of analysis model and estimand of interest, complete records analysis may still be valid. See, for example, Bartlett et al. (2015) (https://doi.org/10.1093/aje/kwv114) for further details.\n",collapse = "\n") adjsets <- dagitty::adjustmentSets(mdagspec,exposure=c(covsvec,r_cra),outcome=y,type = "all") adjsetsfull <- dagitty::adjustmentSets(mdagspec,exposure=r_cra,outcome=y,type = "all") if(length(adjsets)==0 & length(adjsetsfull)==0){ - cat(strwrap("There are no other variables which could be added to the model to make -the analysis model outcome and complete record indicator conditionally independent. Consider -using a different strategy e.g. multiple imputation."),"\n", fill=TRUE) + result2 <- paste("There are no other variables which could be added to the model to make the analysis model outcome and complete record indicator conditionally independent. Consider using a different strategy e.g. multiple imputation.",collapse = "\n") } if(length(adjsets)==0 & length(adjsetsfull)>0){ - cat(strwrap("There are no other variables which could be added to the model to make -the analysis model outcome and complete record indicator conditionally independent, -without changing the estimand of interest."),"\n", - strwrap("Consider using a different strategy e.g. -multiple imputation. Alternatively, consider whether a different estimand could -be of interest. For example, the analysis model outcome and complete record -indicator are independent given each of the following sets of variables:"),"\n", fill=TRUE) - print(adjsetsfull) + result2 <- paste("There are no other variables which could be added to the model to make the analysis model outcome and complete record indicator conditionally independent, without changing the estimand of interest. Consider using a different strategy e.g. multiple imputation. \n \nAlternatively, consider whether a different estimand could be of interest. For example, the analysis model outcome and complete record indicator are independent given each of the following sets of variables:\n \n", + paste0(adjsetsfull, prefix="\n", collapse = "\n"),collapse = "\n") + #print(adjsetsfull) } if(length(adjsets)>0){ - cat(strwrap("Consider using a different analysis model and/or strategy, e.g. multiple imputation. -\nFor example, the analysis model outcome and complete record indicator are independent -if, in addition to the specified covariates, the following sets of variables are included -as covariates in the analysis model (note that this list is not necessarily exhaustive, -particularly if your DAG is complex):"),"\n", fill=TRUE) - print(adjsets) + result2 <- paste("Consider using a different analysis model and/or strategy, e.g. multiple imputation. \n \nFor example, the analysis model outcome and complete record indicator are independent if, in addition to the specified covariates, the following sets of variables are included as covariates in the analysis model (note that this list is not necessarily exhaustive, particularly if your DAG is complex):\n \n", paste0(adjsets, prefix="\n", collapse = "\n"), collapse = "\n") + #print(adjsets) } + result <- paste(result1, "\n", result2, collapse="\n") } + message(paste(strwrap(result),collapse="\n")) } diff --git a/R/checkmi.R b/R/checkmi.R index 37b0a46..1a85661 100644 --- a/R/checkmi.R +++ b/R/checkmi.R @@ -26,44 +26,44 @@ #' proposed DAG and imputation model #' @export #' +#' @references Curnow E, Tilling K, Heron JE, Cornish RP, Carpenter JR. 2023. +#' Multiple imputation of missing data under missing at random: including a +#' collider as an auxiliary variable in the imputation model can induce bias. +#' Frontiers in Epidemiology. +#' #' @examples #' # Example DAG for which multiple imputation is valid #' checkMI(dep="bmi7", preds="matage mated pregsize", r_dep="r", -#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> -#' mated sep_unmeas -> r pregsize -> bmi7 pregsize -> bwt sep_unmeas -> bwt") +#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 +#' sep_unmeas -> mated sep_unmeas -> r pregsize -> bmi7 +#' pregsize -> bwt sep_unmeas -> bwt") #' #' # Example DAG for which multiple imputation is not valid, due to a collider #' checkMI(dep="bmi7", preds="matage mated bwt", r_dep="r", -#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> -#' mated sep_unmeas -> r pregsize -> bmi7 pregsize -> bwt sep_unmeas -> bwt") +#' mdag="matage -> bmi7 mated -> matage mated -> bmi7 +#' sep_unmeas -> mated sep_unmeas -> r pregsize -> bmi7 +#' pregsize -> bwt sep_unmeas -> bwt") checkMI <- function(dep, preds, r_dep, mdag) { mdagspec <- paste('dag {',mdag,'}') predsvec <- unlist(strsplit(preds," ")) #If r_dep does not depend on dep conditional on predictors, then MI is valid if(dagitty::dseparated(dagitty::dagitty(mdagspec, layout=T), dep, r_dep, predsvec)){ - cat(strwrap("Based on the proposed directed acyclic graph (DAG), -the incomplete variable and its missingness indicator are independent -given imputation model predictors. Hence, multiple imputation methods which -assume data are missing at random are valid in principle."), fill=TRUE) + result <- paste("Based on the proposed directed acyclic graph (DAG), the incomplete variable and its missingness indicator are independent given imputation model predictors. Hence, multiple imputation methods which assume data are missing at random are valid in principle.", collapse="\n") } else { - cat(strwrap("Based on the proposed directed acyclic graph (DAG), -the incomplete variable and its missingness indicator are not independent -given imputation model predictors. Hence, multiple imputation methods which -assume data are missing at random are not valid."),"\n", - strwrap("Consider using a different imputation model and/or strategy (e.g. not-at-random -fully conditional specification)."),"\n",fill=TRUE) + result1 <- paste("Based on the proposed directed acyclic graph (DAG), the incomplete variable and its missingness indicator are not independent given imputation model predictors. Hence, multiple imputation methods which assume data are missing at random are not valid. \n \nConsider using a different imputation model and/or strategy (e.g. not-at-random fully conditional specification).", + collapse="\n") adjsets_r <- dagitty::adjustmentSets(mdagspec,exposure=c(predsvec,r_dep),outcome=dep,type = "all") adjsets_dep <- dagitty::adjustmentSets(mdagspec,exposure=c(predsvec,dep),outcome=r_dep,type = "all") if(length(adjsets_r)>0) (adjsets <- adjsets_r) else (adjsets <- adjsets_dep) if(length(adjsets)>0){ - cat(strwrap("For example, the incomplete variable and its missingness indicator are -independent if, in addition to the specified predictors, the following sets of -variables are included as predictors in the imputation model -(note that this list is not necessarily exhaustive, particularly if your DAG is complex):"),"\n",fill=TRUE) - print(adjsets) - } - } + result2 <- paste("For example, the incomplete variable and its missingness indicator are independent if, in addition to the specified predictors, the following sets of variables are included as predictors in the imputation model (note that this list is not necessarily exhaustive, particularly if your DAG is complex):\n \n", + paste0(adjsets, prefix="\n", collapse = "\n"),collapse = "\n") + #print(adjsets) + } + result <- paste(result1, "\n", result2, collapse = "\n") + } + message(paste(strwrap(result),collapse="\n")) } diff --git a/R/checkmodspec.R b/R/checkmodspec.R index b130f97..cce57b3 100644 --- a/R/checkmodspec.R +++ b/R/checkmodspec.R @@ -11,87 +11,87 @@ #' used in the model, specified as a string; family functions that are #' supported are "gaussian(identity)" and "binomial(logit)" #' @param data A data frame containing all the variables stated in the formula -#' @param message If TRUE (the default), displays a message indicating whether the -#' relationships between the dependent variable and covariates are likely to -#' be correctly specified or not; use message = FALSE to suppress the message -#' @param plot If TRUE (the default) and there is evidence of model mis-specification, -#' displays a plot which can be used to explore the functional form of each -#' covariate in the specified model; use plot = FALSE to disable the plot +#' @param plot If TRUE (the default) and there is evidence of model +#' mis-specification, displays a plot which can be used to explore the +#' functional form of each covariate in the specified model; use plot = FALSE +#' to disable the plot +#' @param message If TRUE (the default), displays a message indicating whether +#' the relationships between the dependent variable and covariates are likely +#' to be correctly specified or not; use message = FALSE to suppress the +#' message #' -#' @return An object of type 'miprop' (a list containing the -#' specified formula, family, and dataset name), plus, optionally, messages -#' and plots indicating whether the relationships between the dependent -#' variable and covariates are likely to be correctly specified or not +#' @return An object of type 'mimod' (a list containing the specified formula, +#' family, and dataset name). Optionally, a message indicating whether the +#' relationships between the dependent variable and covariates are likely to +#' be correctly specified or not. If there is evidence of model +#' mis-specification, optionally returns a plot of the model residuals versus +#' the fitted values which can be used to explore the appropriate functional +#' form for the specified model. #' #' @export #' +#' @references Curnow E, Carpenter JR, Heron JE, et al. 2023. Multiple +#' imputation of missing data under missing at random: compatible imputation +#' models are not sufficient to avoid bias if they are mis-specified. J Clin +#' Epidemiol. +#' #' @examples #' # Example (incorrectly) assuming a linear relationship #' checkModSpec(formula="bmi7~matage+mated+pregsize", -#' family="gaussian(identity)", data=bmi) +#' family="gaussian(identity)", data=bmi) #' ## For the example above, (correctly) assuming a quadratic relationship #' checkModSpec(formula="bmi7~matage+I(matage^2)+mated+pregsize", -#' family="gaussian(identity)", data=bmi) -checkModSpec <- function(formula, family, data, message=TRUE, plot=TRUE) { +#' family="gaussian(identity)", data=bmi) +checkModSpec <- function(formula, family, data, plot=TRUE, message=TRUE) { if(family == "gaussian(identity)"){ mod <- stats::glm(stats::as.formula(formula), data=data) modfit <- data.frame(r=mod[["residuals"]],fitvals=mod[["fitted.values"]]) modfittest <- mfp2::mfp2(r ~ fitvals, data=modfit, verbose = FALSE) - pval <- 1-stats::pchisq(modfittest$null.deviance-modfittest$deviance, modfittest$df.null-modfittest$df.residual) - - if (message){ - cat(strwrap("Model mis-specification method: regression of model residuals -on a fractional polynomial of the fitted values"), - "\n", - "P-value:", - pval,"\n",fill=TRUE) - if(pval > 0.1){ - cat(strwrap("A large p-value means there is little evidence of model mis-specification."),"\n",fill=TRUE) - } else { - cat(strwrap("A small p-value means the model may be mis-specified. Check -the specification of each relationship in your model."),"\n",fill=TRUE) + pval <- round(1-stats::pchisq(modfittest$null.deviance-modfittest$deviance, modfittest$df.null-modfittest$df.residual),6) - #Optionally create a plot of residual vs fitted values - if (plot){ - plot(modfit$fitvals,modfit$r,xlab="",ylab="", - main="Residuals versus fitted values", - sub=list("This plot may suggest the appropriate functional form \nfor the specified model",cex=0.8)) - } + result1 <- paste("Model mis-specification method: regression of model residuals on a fractional polynomial of the fitted values \n \nP-value:", + paste0(pval),collapse = "\n") } - } - } else if(family == "binomial(logit)"){ mod <- stats::glm(formula=stats::as.formula(formula),family="binomial", data=data) + modfit <- data.frame(r=mod[["residuals"]],fitvals=mod[["fitted.values"]]) modfittest <- blorr::blr_linktest(mod) - pval <- stats::coef(modfittest)[3,4] - if (message){ - cat(strwrap("Model mis-specification method: Pregibon's link test"), - "\n", - "P-value:", - pval,"\n", fill=TRUE) - if(pval > 0.1){ - cat(strwrap("A large p-value means there is little evidence of model mis-specification."),"\n", fill=TRUE) - } else { - cat(strwrap("A small p-value means the model may be mis-specified. Check the -specification of each relationship in your model."),"\n",fill=TRUE) + pval <- round(stats::coef(modfittest)[3,4],6) - #Optionally create a plot of deviance residual vs binned fitted values - modfit <- data.frame(r=mod[["residuals"]],fitvals=mod[["fitted.values"]]) - if (plot){ - arm::binnedplot(modfit$fitvals,modfit$r,xlab="",ylab="",col.int="white", - main="Residuals versus (binned) fitted values", - sub=list("This plot may suggest the appropriate functional form \nfor the specified model", cex=0.8)) - } - } + result1 <- paste("Model mis-specification method: Pregibon's link test\n \nP-value:", + paste0(pval),collapse = "\n") + } + + if(pval > 0.1){ + result2 <- paste("\nA large p-value means there is little evidence of model mis-specification.", collapse = "\n") + } else { + result2 <- paste("\nA small p-value means the model may be mis-specified. Check the specification of each relationship in your model.", collapse = "\n") + } + result <- paste(result1, "\n", result2, collapse = "\n") + + #Return message with model check results + if(message) {message(paste(strwrap(result),collapse="\n"))} + + #Optionally create a plot of residual vs fitted values when evidence of mis-specification + if (plot & pval <= 0.1){ + if (family == "gaussian(identity)"){ + plot(modfit$fitvals,modfit$r,xlab="",ylab="", + main="Residuals versus fitted values", + sub=list("This plot may suggest the appropriate functional form \nfor the specified model",cex=0.8)) + } + else if (family == "binomial(logit)"){ + arm::binnedplot(modfit$fitvals,modfit$r,xlab="",ylab="",col.int="white", + main="Residuals versus (binned) fitted values", + sub=list("This plot may suggest the appropriate functional form \nfor the specified model", cex=0.8)) } } -#Output an object with formula and family -mimod <- list(formula = formula,family = family, - datalab=deparse(substitute(data))) -invisible(mimod) + #Return an object with formula and family + mimod <- list(formula = formula,family = family, + datalab=deparse(substitute(data))) + invisible(mimod) } diff --git a/R/descMissData.R b/R/descMissData.R index 99941fd..fe4cfcb 100644 --- a/R/descMissData.R +++ b/R/descMissData.R @@ -36,7 +36,9 @@ descMissData <- function(y, covs, data, plot=FALSE) { n <- as.numeric(row.names(mdtab[1:nrow(mdtab)-1,])) pct <- round(n*100/sum(n)) pattern <- 1:(nrow(mdtab)-1) - print(data.frame(pattern,mdtab2,n,pct),row.names=FALSE) + result<-data.frame(pattern,mdtab2,n,pct) + row.names(result) <- NULL + return(result) } } diff --git a/R/doMImice.R b/R/doMImice.R index 6a3019e..55e38ed 100644 --- a/R/doMImice.R +++ b/R/doMImice.R @@ -1,35 +1,44 @@ #' Performs multiple imputation #' -#' Performs multiple imputation using \link[mice]{mice}, based on the 'mice' -#' options and dataset specified by a call to \link[midoc]{proposeMI}. +#' Creates multiple imputations using \link[mice]{mice}, based on the options +#' and dataset specified by a call to \link[midoc]{proposeMI}. If a substantive +#' model is specified, also calculates the pooled estimates using +#' \link[mice]{pool}. #' #' @param mipropobj An object of type 'miprop', created by a call to 'proposeMI' #' @param seed An integer that is used to set the seed of the 'mice' call #' @param substmod Optionally, a symbolic description of the substantive model #' to be fitted, specified as a string; if supplied, the model will be fitted #' to each imputed dataset and the results pooled +#' @param message If TRUE (the default), displays a message summarising the +#' analysis that has been performed; use message = FALSE to suppress the +#' message #' -#' @return A 'mice' object of type 'mids' +#' @return A 'mice' object of class 'mids' (the multiply imputed datasets). +#' Optionally, a message summarising the analysis that has been performed. #' #' @export #' #' @examples -#' # First specify the imputation model as a 'mimod' object (suppressing the -#' ## output) -#' mimod_bmi7 <- checkModSpec( -#' formula="bmi7~matage+I(matage^2)+mated+pregsize", -#' family="gaussian(identity)", data=bmi, message = FALSE) -#' # Save the proposed 'mice' options as a 'miprop' object (suppressing the -#' ## output) -#' miprop <- proposeMI(mimodobj=mimod_bmi7, data=bmi, message = FALSE, -#' plot = FALSE) +#' # First specify the imputation model as a 'mimod' object +#' ## (suppressing the message) +#' mimod_bmi7 <- checkModSpec(formula="bmi7~matage+I(matage^2)+mated+pregsize", +#' family="gaussian(identity)", +#' data=bmi, +#' message=FALSE) +#' # Save the proposed 'mice' options as a 'miprop' object +#' ## (suppressing the message) +#' miprop <- proposeMI(mimodobj=mimod_bmi7, +#' data=bmi, +#' message=FALSE, +#' plot = FALSE) #' # Create the set of imputed datasets using the proposed 'mice' options #' imp <- doMImice(miprop,123) #' #' # Additionally, fit the substantive model to each imputed dataset and display -#' ## the pooled results +#' ## the pooled results #' doMImice(miprop, 123, substmod="lm(bmi7 ~ matage + I(matage^2) + mated)") -doMImice <- function(mipropobj, seed, substmod = " ") { +doMImice <- function(mipropobj, seed, substmod = " ", message = TRUE) { mids <- mice::mice( data = mipropobj$data, @@ -40,15 +49,20 @@ doMImice <- function(mipropobj, seed, substmod = " ") { printFlag = FALSE, seed = seed) - #Optionally return the pooled estimates - #substexpr <- deparse(substitute(substmod)) + #If a substantive model is specified, calculate the pooled estimates if(substmod != " "){ - cat("Given the substantive model:", substmod, "\n", strwrap("multiple imputation estimates are as follows:"),"\n",fill=TRUE) - print(summary(mice::pool(with(mids,parse(text=substmod, keep.source=FALSE))),conf.int=TRUE)) + mipo <- mice::pool(with(mids,parse(text=substmod, keep.source=FALSE))) + result <- paste("Given the substantive model:", + substmod, +"\n, multiple imputation estimates are as follows: \n \n", + paste0(gsub(" ", "@",utils::capture.output(summary(mipo,conf.int=TRUE))),prefix="\n",collapse = "\n"), + collapse = "\n") } else { - cat(strwrap("Now you have created your multiply imputed datasets, you can perform your analysis - and pool the results using the 'mice' functions 'with()' and 'pool()'"),"\n",fill=TRUE)} + result <- paste("Now you have created your multiply imputed datasets, you can perform your analysis and pool the results using the 'mice' functions 'with()' and 'pool()'", collapse = "\n") + } + + if(message) {message(paste(gsub("@", " ",strwrap(result)),collapse="\n"))} invisible(mids) diff --git a/R/exploreDAG.R b/R/exploreDAG.R index dff433e..b9c6939 100644 --- a/R/exploreDAG.R +++ b/R/exploreDAG.R @@ -16,8 +16,9 @@ #' @export #' #' @examples -#' exploreDAG(mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> -#' mated sep_unmeas -> r", data=bmi) +#' exploreDAG(mdag="matage -> bmi7 mated -> matage mated -> bmi7 +#' sep_unmeas -> mated sep_unmeas -> r", +#' data=bmi) exploreDAG <- function(mdag, data) { mod <- dagitty::dagitty(paste('dag {',mdag,'}'), layout=T) @@ -31,38 +32,17 @@ exploreDAG <- function(mdag, data) { comptests <- Filter(function(x) all(x$X %in% compvar) & all(x$Y %in% compvar) & all(x$Z %in% compvar), tests) comptestsres <- dagitty::localTests(x=mod,data=compdata,tests=comptests,type="cis.pillai",abbreviate.names=FALSE) + result1 <- paste("The proposed directed acyclic graph (DAG) implies the following conditional independencies (where, for example, 'X _||_ Y | Z' should be read as 'X is independent of Y conditional on Z'). Note that variable names are abbreviated: \n \n", + paste0(utils::capture.output(tests), prefix="\n",collapse = "\n"),collapse = "\n") + if(nrow(comptestsres)==0){ - cat(strwrap("The proposed directed acyclic graph (DAG) implies the following -conditional independencies -(where, for example, 'X _||_ Y | Z' should be read as -'X is independent of Y conditional on Z'). Note that variable names are abbreviated:"),"\n",fill=TRUE) - print(tests) - cat("\n") - cat(strwrap("None of the fully observed variables are conditionally independent. - \nHence, no consistency checks will be performed. \nConsider whether -it is valid and possible to explore relationships between partially observed variables -using the observed data, e.g. avoiding perfect prediction."),"\n",fill=TRUE) + result2 <- paste("None of the fully observed variables are conditionally independent.\nHence, no consistency checks will be performed. \nConsider whether it is valid and possible to explore relationships between partially observed variables using the observed data, e.g. avoiding perfect prediction.",collapse = "\n") }else{ - cat(strwrap("The proposed directed acyclic graph (DAG) implies that the - following pairs of variables are independent, or conditionally independent. - 'X _||_ Y | Z' should be read as 'X is independent of Y conditional on Z'. - Note that variable names are abbreviated:"),"\n",fill=TRUE) - print(tests) - cat("\n") - cat(strwrap("These (conditional) independence statements are explored below - using the canonical correlations approach for mixed data. See - ??dagitty::localTests for further details."),"\n", - strwrap("Results are shown for variables that are fully observed in the - specified dataset."), - strwrap("The null hypothesis is that the stated variables are (conditionally) independent."),"\n",fill=TRUE) - print(dagitty::localTests(x=mod,data=compdata,tests=comptests,type="cis.pillai",abbreviate.names=FALSE)) - cat("\n", strwrap("Interpretation: A small p-value means the stated variables may not be (conditionally) independent - in the specified dataset: your data may not be consistent with the proposed DAG. - A large p-value means there is little evidence of inconsistency between your data - and the proposed DAG."),"\n",fill=TRUE) - cat(strwrap("Note that these results assume that relationships between variables are linear. -Consider exploring the specification of each relationship in your model. \nAlso consider whether -it is valid and possible to explore relationships between partially observed variables -using the observed data, e.g. avoiding perfect prediction."),"\n",fill=TRUE) + result2 <- paste("These (conditional) independence statements are explored below using the canonical correlations approach for mixed data. See ??dagitty::localTests for further details. \nResults are shown for variables that are fully observed in the specified dataset.\nThe null hypothesis is that the stated variables are (conditionally) independent. \n \n", + paste0(gsub(" ", "@",utils::capture.output(comptestsres)),prefix="\n",collapse = "\n"), +"\n \nInterpretation: A small p-value means the stated variables may not be (conditionally) independent in the specified dataset: your data may not be consistent with the proposed DAG. A large p-value means there is little evidence of inconsistency between your data and the proposed DAG. \n \nNote that these results assume that relationships between variables are linear. Consider exploring the specification of each relationship in your model. \nAlso consider whether it is valid and possible to explore relationships between partially observed variables using the observed data, e.g. avoiding perfect prediction.",collapse = "\n") } + + result <- paste(result1, "\n", result2, collapse = "\n") + message(paste(gsub("@", " ",strwrap(result)),collapse="\n")) } diff --git a/R/proposeMI.R b/R/proposeMI.R index 6936b80..09ec62a 100644 --- a/R/proposeMI.R +++ b/R/proposeMI.R @@ -1,43 +1,50 @@ #' Suggests multiple imputation options #' -#' Suggests the \link[mice]{mice} options to perform multiple -#' imputation, based on the proposed set of imputation models (one for each -#' partially observed variable) and specified dataset. +#' Suggests the \link[mice]{mice} options to perform multiple imputation, based +#' on the proposed set of imputation models (one for each partially observed +#' variable) and specified dataset. #' #' @param mimodobj An object, or list of objects, of type 'mimod', which stands #' for 'multiple imputation model', created by a call to #' \link[midoc]{checkModSpec} #' @param data A data frame containing all the variables required for imputation #' and the substantive analysis -#' @param message If TRUE (the default), displays a message describing the -#' proposed 'mice' options; use message=FALSE to suppress the message -#' @param plot If TRUE (the default), displays diagnostic plots for the -#' proposed 'mice' call; use plot=FALSE to disable the plots +#' @param plot If TRUE (the default), displays diagnostic plots for the proposed +#' 'mice' call; use plot=FALSE to disable the plots #' @param plotprompt If TRUE (the default), the user is prompted before the #' second plot is displayed; use plotprompt=FALSE to remove the prompt +#' @param message If TRUE (the default), displays a message describing the +#' proposed 'mice' options; use message=FALSE to suppress the message #' #' @return An object of type 'miprop', which can be used to run 'mice' using the -#' proposed options, plus, optionally, a message and diagnostic plots describing -#' the proposed 'mice' options +#' proposed options, plus, optionally, a message and diagnostic plots +#' describing the proposed 'mice' options #' #' @export #' #' @examples -#' # First specify each imputation model as a 'mimod' object (suppressing the -#' ## output) -#' mimod_bmi7 <- checkModSpec( -#' formula="bmi7~matage+I(matage^2)+mated+pregsize", -#' family="gaussian(identity)", data=bmi, message = FALSE) +#' # First specify each imputation model as a 'mimod' object +#' ## (suppressing the message) +#' mimod_bmi7 <- checkModSpec(formula="bmi7~matage+I(matage^2)+mated+pregsize", +#' family="gaussian(identity)", +#' data=bmi, +#' message=FALSE) #' mimod_pregsize <- checkModSpec( -#' formula="pregsize~bmi7+matage+I(matage^2)+mated", -#' family="binomial(logit)", data=bmi, message = FALSE) +#' formula="pregsize~bmi7+matage+I(matage^2)+mated", +#' family="binomial(logit)", +#' data=bmi, +#' message=FALSE) #' #' # Display the proposed 'mice' options (suppressing the plot prompt) -#' ## When specifying a single imputation model -#' proposeMI(mimodobj=mimod_bmi7, data=bmi, plotprompt = FALSE) -#' ## When specifying more than one imputation model (suppressing the plot) -#' proposeMI(mimodobj=list(mimod_bmi7,mimod_pregsize), data=bmi, plot = FALSE) -proposeMI <- function(mimodobj, data, message = TRUE, plot = TRUE, plotprompt = TRUE) { +#' ## When specifying a single imputation model +#' proposeMI(mimodobj=mimod_bmi7, +#' data=bmi, +#' plotprompt = FALSE) +#' ## When specifying more than one imputation model (suppressing the plots) +#' proposeMI(mimodobj=list(mimod_bmi7,mimod_pregsize), +#' data=bmi, +#' plot = FALSE) +proposeMI <- function(mimodobj, data, plot = TRUE, plotprompt = TRUE, message = TRUE) { m_min <- ceiling((1-mean(ifelse(apply(data,1,anyNA)==F,1,0)))*100) @@ -78,53 +85,40 @@ Check that the specification of each imputation model was explored using the sam } } - if (message) { - cat(strwrap("Based on your proposed imputation model and dataset, your mice() call should be as follows:"), - "\nmice(data = ", datalab,",", - strwrap("# You may need to specify a subset of the columns in your dataset"),"\n", - "\nm = ", m_min, ",", strwrap("# You should use at least this number of imputations - based on the proportion of complete records in your dataset"),"\n", - "\nmethod = c(",paste(sQuote(method), collapse=", "),")","\n",fill=TRUE) - cat(strwrap("# Specify a method for each incomplete variable. \nIf displayed, the box-and-whisker plots can be used to inform - your choice of method(s): for example, if the imputation model does not predict extreme values appropriately, - consider a different imputation model/method e.g. PMM. Note the distribution of - imputed and observed values is displayed for numeric variables only. The distribution - may differ if data are missing at random. If you suspect data are missing not at random, - the plots can also inform your choice of sensitivity parameter."), - "\n", "\n", - strwrap("\nformulas = formulas_list , # Note that you do not additionally need to specify a 'predmatrix'"),"\n", - strwrap("# The formulas_list specifies the conditional imputation models, as follows:"),"\n", - paste(sQuote(formulas_list), collapse="\n"),"\n", - strwrap("\nmaxit = 10 , \n# If you have more than one incomplete variable, you should check - this number of iterations is sufficient by inspecting the trace plots, if displayed. - Consider increasing the number of iterations if there is a trend that does not - stabilise by the 10th iteration. Note that iteration is not required when only one variable is - partially observed."),"\n","\n", - strwrap("\nprintFlag = FALSE , # Change to printFlag=TRUE - to display the history as imputation is performed"),"\n", - strwrap("\nseed = NA) # It is good practice to choose a seed so your results are reproducible"),"\n",fill=TRUE) - } + result <- paste("Based on your proposed imputation model and dataset, your mice() call should be as follows: \n \nmice(data = ", + datalab, +", # You may need to specify a subset of the columns in your dataset \n \nm = ", + m_min, +", # You should use at least this number of imputations based on the proportion of complete records in your dataset \n \nmethod = c(", + paste(sQuote(method), collapse=", "), +") # Specify a method for each incomplete variable. \nIf displayed, the box-and-whisker plots can be used to inform your choice of method(s): for example, if the imputation model does not predict extreme values appropriately, consider a different imputation model/method e.g. PMM. Note the distribution of imputed and observed values is displayed for numeric variables only. The distribution may differ if data are missing at random or missing not at random. If you suspect data are missing not at random, the plots can also inform your choice of sensitivity parameter. \n \nformulas = formulas_list , # Note that you do not additionally need to specify a 'predmatrix' \n \n# The formulas_list specifies the conditional imputation models, which are as follows: \n \n", + paste(sQuote(formulas_list), prefix="\n", collapse="\n"), +"\n \nmaxit = 10 , \n# If you have more than one incomplete variable, you should check this number of iterations is sufficient by inspecting the trace plots, if displayed. Consider increasing the number of iterations if there is a trend that does not stabilise by the 10th iteration. Note that iteration is not performed when only one variable is partially observed. \n \nprintFlag = FALSE , # Change to printFlag=TRUE to display the history as imputation is performed \n\nseed = NA) # It is good practice to choose a seed so your results are reproducible", collapse = "\n") - #Optionally create a plot of observed versus imputed values and a trace plot for five imputed datasets - if (plot){ - imp <- mice::mice(data = data, method = method, - formulas = formulas_list, maxit = 20, printFlag = FALSE) - #changed from bwplot to densityplot - print(mice::bwplot(imp, - main=list("Plot of imputed (red) values, with distribution of \nobserved (blue) values for comparison",cex=0.9))) - #Optionally prompt for second plot after first plot is displayed - if (plotprompt){ - oask <- grDevices::devAskNewPage(TRUE) - print(plot(imp,main=list("Trace plots across 20 iterations",cex=0.9))) - #Reset original settings - grDevices::devAskNewPage(oask) - } else { - print(plot(imp,main=list("Trace plots across 20 iterations",cex=0.9))) + #Return message with model check results + if(message) {message(paste(strwrap(result),collapse="\n"))} + + #Optionally create a plot of observed versus imputed values and a trace plot for five imputed datasets + if (plot){ + imp <- mice::mice(data = data, method = method, + formulas = formulas_list, maxit = 20, printFlag = FALSE) + #changed from bwplot to densityplot + print(mice::bwplot(imp, + main=list("Plot of imputed (red) values, with distribution of \nobserved (blue) values for comparison",cex=0.9))) + + #Optionally prompt for second plot after first plot is displayed + if (plotprompt){ + oask <- grDevices::devAskNewPage(TRUE) + print(plot(imp,main=list("Trace plots across 20 iterations",cex=0.9))) + #Reset original settings + grDevices::devAskNewPage(oask) + } else { + print(plot(imp,main=list("Trace plots across 20 iterations",cex=0.9))) + } } - } -#Output an object with the proposed specification -miprop <- list(data=data,m= m_min,method=method,formulas = formulas_list) -invisible(miprop) + #Return an object with the proposed specification + miprop <- list(data=data,m= m_min,method=method,formulas = formulas_list) + invisible(miprop) } diff --git a/README.md b/README.md index b290e8c..a6fcab3 100644 --- a/README.md +++ b/README.md @@ -64,188 +64,198 @@ head(bmi) #> 4 NA 0.81459107 1 0 3.103251 0 #> 5 17.97791 -0.55260086 0 0 3.830381 1 #> 6 NA -0.03829346 1 0 2.775282 0 -``` - -``` r descMissData(y="bmi7", covs="matage mated", data=bmi, plot=TRUE) ``` - #> pattern bmi7 matage mated n pct - #> 1 1 1 1 592 59 - #> 2 0 1 1 408 41 - -``` r - -exploreDAG(mdag=" matage -> bmi7 - mated -> matage - mated -> bmi7 - sep_unmeas -> mated - sep_unmeas -> r - pregsize -> bmi7 - pregsize -> bwt - sep_unmeas -> bwt", - data=bmi) -#> The proposed directed acyclic graph (DAG) implies that the following -#> pairs of variables are independent, or conditionally independent. 'X -#> _||_ Y | Z' should be read as 'X is independent of Y conditional on Z'. -#> Note that variable names are abbreviated: -#> -#> bmi7 _||_ bwt | prgs, sp_n -#> bmi7 _||_ bwt | matd, prgs -#> bmi7 _||_ r | sp_n -#> bmi7 _||_ r | matd -#> bmi7 _||_ sp_n | matd -#> bwt _||_ matg | matd -#> bwt _||_ matg | sp_n -#> bwt _||_ matd | sp_n -#> bwt _||_ r | sp_n -#> matg _||_ prgs -#> matg _||_ r | sp_n -#> matg _||_ r | matd -#> matg _||_ sp_n | matd -#> matd _||_ prgs -#> matd _||_ r | sp_n -#> prgs _||_ r -#> prgs _||_ sp_n -#> -#> These (conditional) independence statements are explored below using -#> the canonical correlations approach for mixed data. See -#> ??dagitty::localTests for further details. -#> -#> Results are shown for variables that are fully observed in the -#> specified dataset. -#> The null hypothesis is that the stated variables are (conditionally) -#> independent. -#> -#> estimate p.value 2.5% 97.5% -#> bwt _||_ matage | mated 0.05018898 0.1127099 -0.01184095 0.11183410 -#> matage _||_ pregsize 0.03029139 0.3386080 -0.03176134 0.09211150 -#> matage _||_ r | mated 0.02998323 0.3435470 -0.03206946 0.09180567 -#> mated _||_ pregsize 0.01594976 0.6144181 -0.04608889 0.07786585 -#> pregsize _||_ r 0.01482015 0.6397174 -0.04721631 0.07674273 -#> -#> Interpretation: A small p-value means the stated variables may not be -#> (conditionally) independent in the specified dataset: your data may not -#> be consistent with the proposed DAG. A large p-value means there is -#> little evidence of inconsistency between your data and the proposed DAG. -#> -#> Note that these results assume that relationships between variables are -#> linear. Consider exploring the specification of each relationship in -#> your model. Also consider whether it is valid and possible to explore -#> relationships between partially observed variables using the observed -#> data, e.g. avoiding perfect prediction. -``` - -``` r - -checkCRA(y="bmi7", covs="matage mated", r_cra="r", - mdag=" matage -> bmi7 - mated -> matage - mated -> bmi7 - sep_unmeas -> mated - sep_unmeas -> r - pregsize -> bmi7 - pregsize -> bwt - sep_unmeas -> bwt") -#> Based on the proposed directed acyclic graph (DAG), the analysis model -#> outcome and complete record indicator are independent given analysis -#> model covariates. Hence, complete records analysis is valid. -``` - -``` r - -checkMI(dep="bmi7", preds="matage mated pregsize", r_dep="r", - mdag=" matage -> bmi7 - mated -> matage - mated -> bmi7 - sep_unmeas -> mated - sep_unmeas -> r - pregsize -> bmi7 - pregsize -> bwt - sep_unmeas -> bwt") -#> Based on the proposed directed acyclic graph (DAG), the incomplete -#> variable and its missingness indicator are independent given imputation -#> model predictors. Hence, multiple imputation methods which assume data -#> are missing at random are valid in principle. -``` - -``` r - -mimod_bmi7 <- checkModSpec(formula="bmi7~matage+I(matage^2)+mated+pregsize", - family="gaussian(identity)", data=bmi) -#> Model mis-specification method: regression of model residuals on a -#> fractional polynomial of the fitted values -#> P-value: 1 -#> -#> A large p-value means there is little evidence of model mis-specification. -``` - -``` r - -miprop <- proposeMI(mimodobj=mimod_bmi7, data=bmi) -#> Based on your proposed imputation model and dataset, your mice() call -#> should be as follows: -#> mice(data = bmi , -#> # You may need to specify a subset of the columns in your dataset -#> -#> m = 41 , -#> # You should use at least this number of imputations based on the -#> proportion of complete records in your dataset -#> -#> method = c( 'norm' ) -#> -#> # Specify a method for each incomplete variable. If displayed, the -#> box-and-whisker plots can be used to inform your choice of method(s): -#> for example, if the imputation model does not predict extreme values -#> appropriately, consider a different imputation model/method e.g. PMM. -#> Note the distribution of imputed and observed values is displayed for -#> numeric variables only. The distribution may differ if data are missing -#> at random. If you suspect data are missing not at random, the plots can -#> also inform your choice of sensitivity parameter. -#> -#> -#> formulas = formulas_list , # Note that you do not additionally need to -#> specify a 'predmatrix' -#> -#> # The formulas_list specifies the conditional imputation models, as follows: -#> -#> 'bmi7 ~ matage + I(matage^2) + mated + pregsize' -#> -#> maxit = 10 , # If you have more than one incomplete variable, you -#> should check this number of iterations is sufficient by inspecting the -#> trace plots, if displayed. Consider increasing the number of -#> iterations if there is a trend that does not stabilise by the 10th -#> iteration. Note that iteration is not required when only one variable -#> is partially observed. -#> -#> -#> printFlag = FALSE , # Change to printFlag=TRUE to display the history -#> as imputation is performed -#> -#> seed = NA) # It is good practice to choose a seed so your results are -#> reproducible -``` + #> pattern bmi7 matage mated n pct + #> 1 1 1 1 1 592 59 + #> 2 2 0 1 1 408 41 + + exploreDAG(mdag=" matage -> bmi7 + mated -> matage + mated -> bmi7 + sep_unmeas -> mated + sep_unmeas -> r + pregsize -> bmi7 + pregsize -> bwt + sep_unmeas -> bwt", + data=bmi) + #> The proposed directed acyclic graph (DAG) implies the following + #> conditional independencies (where, for example, 'X _||_ Y | Z' should + #> be read as 'X is independent of Y conditional on Z'). Note that + #> variable names are abbreviated: + #> + #> bmi7 _||_ bwt | prgs, sp_n + #> + #> bmi7 _||_ bwt | matd, prgs + #> + #> bmi7 _||_ r | sp_n + #> + #> bmi7 _||_ r | matd + #> + #> bmi7 _||_ sp_n | matd + #> + #> bwt _||_ matg | matd + #> + #> bwt _||_ matg | sp_n + #> + #> bwt _||_ matd | sp_n + #> + #> bwt _||_ r | sp_n + #> + #> matg _||_ prgs + #> + #> matg _||_ r | sp_n + #> + #> matg _||_ r | matd + #> + #> matg _||_ sp_n | matd + #> + #> matd _||_ prgs + #> + #> matd _||_ r | sp_n + #> + #> prgs _||_ r + #> + #> prgs _||_ sp_n + #> + #> These (conditional) independence statements are explored below using + #> the canonical correlations approach for mixed data. See + #> ??dagitty::localTests for further details. Results are shown for + #> variables that are fully observed in the specified dataset. The null + #> hypothesis is that the stated variables are (conditionally) + #> independent. + #> + #> estimate p.value 2.5% 97.5% + #> + #> bwt _||_ matage | mated 0.05018898 0.1127099 -0.01184095 0.11183410 + #> + #> matage _||_ pregsize 0.03029139 0.3386080 -0.03176134 0.09211150 + #> + #> matage _||_ r | mated 0.02998323 0.3435470 -0.03206946 0.09180567 + #> + #> mated _||_ pregsize 0.01594976 0.6144181 -0.04608889 0.07786585 + #> + #> pregsize _||_ r 0.01482015 0.6397174 -0.04721631 0.07674273 + #> + #> Interpretation: A small p-value means the stated variables may not be + #> (conditionally) independent in the specified dataset: your data may not + #> be consistent with the proposed DAG. A large p-value means there is + #> little evidence of inconsistency between your data and the proposed + #> DAG. + #> + #> Note that these results assume that relationships between variables are + #> linear. Consider exploring the specification of each relationship in + #> your model. Also consider whether it is valid and possible to explore + #> relationships between partially observed variables using the observed + #> data, e.g. avoiding perfect prediction. + + checkCRA(y="bmi7", covs="matage mated", r_cra="r", + mdag=" matage -> bmi7 + mated -> matage + mated -> bmi7 + sep_unmeas -> mated + sep_unmeas -> r + pregsize -> bmi7 + pregsize -> bwt + sep_unmeas -> bwt") + #> Based on the proposed directed acyclic graph (DAG), the analysis model + #> outcome and complete record indicator are independent given analysis + #> model covariates. Hence, complete records analysis is valid. + + checkMI(dep="bmi7", preds="matage mated pregsize", r_dep="r", + mdag=" matage -> bmi7 + mated -> matage + mated -> bmi7 + sep_unmeas -> mated + sep_unmeas -> r + pregsize -> bmi7 + pregsize -> bwt + sep_unmeas -> bwt") + #> Based on the proposed directed acyclic graph (DAG), the incomplete + #> variable and its missingness indicator are independent given imputation + #> model predictors. Hence, multiple imputation methods which assume data + #> are missing at random are valid in principle. + + mimod_bmi7 <- checkModSpec(formula="bmi7~matage+I(matage^2)+mated+pregsize", + family="gaussian(identity)", data=bmi) + #> Model mis-specification method: regression of model residuals on a + #> fractional polynomial of the fitted values + #> + #> P-value: 1 + #> + #> A large p-value means there is little evidence of model + #> mis-specification. + + miprop <- proposeMI(mimodobj=mimod_bmi7, data=bmi) + #> Based on your proposed imputation model and dataset, your mice() call + #> should be as follows: + #> + #> mice(data = bmi , # You may need to specify a subset of the columns in + #> your dataset + #> + #> m = 41 , # You should use at least this number of imputations based on + #> the proportion of complete records in your dataset + #> + #> method = c( 'norm' ) # Specify a method for each incomplete variable. + #> If displayed, the box-and-whisker plots can be used to inform your + #> choice of method(s): for example, if the imputation model does not + #> predict extreme values appropriately, consider a different imputation + #> model/method e.g. PMM. Note the distribution of imputed and observed + #> values is displayed for numeric variables only. The distribution may + #> differ if data are missing at random or missing not at random. If you + #> suspect data are missing not at random, the plots can also inform your + #> choice of sensitivity parameter. + #> + #> formulas = formulas_list , # Note that you do not additionally need to + #> specify a 'predmatrix' + #> + #> # The formulas_list specifies the conditional imputation models, which + #> are as follows: + #> + #> 'bmi7 ~ matage + I(matage^2) + mated + pregsize' + #> + #> maxit = 10 , # If you have more than one incomplete variable, you + #> should check this number of iterations is sufficient by inspecting the + #> trace plots, if displayed. Consider increasing the number of iterations + #> if there is a trend that does not stabilise by the 10th iteration. Note + #> that iteration is not performed when only one variable is partially + #> observed. + #> + #> printFlag = FALSE , # Change to printFlag=TRUE to display the history + #> as imputation is performed + #> + #> seed = NA) # It is good practice to choose a seed so your results are + #> reproducible ``` r doMImice(miprop, 123, substmod="lm(bmi7 ~ matage + I(matage^2) + mated)") -#> Given the substantive model: lm(bmi7 ~ matage + I(matage^2) + mated) -#> -#> multiple imputation estimates are as follows: +#> Given the substantive model: lm(bmi7 ~ matage + I(matage^2) + mated) , +#> multiple imputation estimates are as follows: #> #> term estimate std.error statistic df p.value +#> #> 1 (Intercept) 17.6607324 0.07126548 247.816079 233.1668 2.116834e-284 +#> #> 2 matage 1.1504545 0.05230345 21.995769 184.5081 1.863532e-53 +#> #> 3 I(matage^2) 0.8414975 0.03231752 26.038433 257.1270 4.754845e-74 +#> #> 4 mated1 -1.0026194 0.10787751 -9.294054 159.1101 1.094881e-16 +#> #> 2.5 % 97.5 % +#> #> 1 17.5203258 17.8011389 +#> #> 2 1.0472648 1.2536442 +#> #> 3 0.7778567 0.9051382 +#> #> 4 -1.2156760 -0.7895629 ``` diff --git a/docs/articles/midoc.html b/docs/articles/midoc.html index a6a60ff..fbbdf7d 100644 --- a/docs/articles/midoc.html +++ b/docs/articles/midoc.html @@ -121,8 +121,10 @@

About midoc1. We assume +you are interested in obtaining unbiased estimates of regression coefficients - note that bias is not necessarily a concern if your interest is in prediction (i.e. diagnostic/prognostic modelling).

@@ -156,21 +158,26 @@

matage -> bmi7 mated -> matage mated -> bmi7
+
+matage -> bmi7 
+mated -> matage 
+mated -> bmi7

Next, for each partially observed variable, we will specify the variables related to its probability of being missing (its “missingness”) by adding these relationships to our DAG. This type of -DAG is often referred to as a “missingness” DAG (mDAG) 1, 2.

+DAG is often referred to as a “missingness” DAG (mDAG) 2, 3.

We will first use the midoc function descMissData to identify which variables in our dataset are partially observed, specifying our outcome (y), covariates, i.e. our independent variables, (covs), and dataset (data), as follows.

-descMissData(y="bmi7", covs="matage mated", data=bmi)
-
 pattern bmi7 matage mated   n pct
-       1    1      1     1 592  59
-       2    0      1     1 408  41
+descMissData(y="bmi7", + covs="matage mated", + data=bmi) +
  pattern bmi7 matage mated   n pct
+1       1    1      1     1 592  59
+2       2    0      1     1 408  41

We see that there are two missing data patterns: either all variables are observed, or BMI at age 7 years is missing and all covariates are observed. We will use indicator variable “R” to denote the missingness @@ -187,10 +194,19 @@

matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r
+
+matage -> bmi7 
+mated -> matage 
+mated -> bmi7 
+sep_unmeas -> mated 
+sep_unmeas -> r

Note that if instead you believe maternal education is a direct cause of R, the mDAG would be as follows:

-
matage -> bmi7 mated -> matage mated -> bmi7 mated -> r
+
+matage -> bmi7 
+mated -> matage 
+mated -> bmi7 
+mated -> r

We will now draw our mDAG and visually check that the relationships are specified as we intended:

Note We have used additional commands to specify the @@ -203,42 +219,53 @@

-exploreDAG(mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r", data=bmi)
-
The proposed directed acyclic graph (DAG) implies that the following 
-pairs of variables are independent, or conditionally independent.  'X 
-_||_ Y | Z' should be read as 'X is independent of Y conditional on Z'. 
-Note that variable names are abbreviated: 
+exploreDAG(mdag="matage -> bmi7 
+                  mated -> matage 
+                  mated -> bmi7 
+                  sep_unmeas -> mated 
+                  sep_unmeas -> r", 
+           data=bmi)
+
The proposed directed acyclic graph (DAG) implies the following
+conditional independencies (where, for example, 'X _||_ Y | Z' should
+be read as 'X is independent of Y conditional on Z'). Note that
+variable names are abbreviated:
 
 bmi7 _||_ r | sp_n
+
 bmi7 _||_ r | matd
+
 bmi7 _||_ sp_n | matd
+
 matg _||_ r | sp_n
+
 matg _||_ r | matd
+
 matg _||_ sp_n | matd
+
 matd _||_ r | sp_n
 
-These (conditional) independence statements are explored below using 
-the canonical correlations approach for mixed data. See 
-??dagitty::localTests for further details. 
- 
-Results are shown for variables that are fully observed in the 
-specified dataset. 
-The null hypothesis is that the stated variables are (conditionally) 
-independent. 
+These (conditional) independence statements are explored below using
+the canonical correlations approach for mixed data. See
+??dagitty::localTests for further details.  Results are shown for
+variables that are fully observed in the specified dataset. The null
+hypothesis is that the stated variables are (conditionally)
+independent.
 
                         estimate  p.value        2.5%      97.5%
-matage _||_ r | mated 0.02998323 0.343547 -0.03206946 0.09180567
 
- Interpretation: A small p-value means the stated variables may not be 
-(conditionally) independent in the specified dataset: your data may not 
-be consistent with the proposed DAG.  A large p-value means there is 
-little evidence of inconsistency between your data and the proposed DAG. 
+matage _||_ r | mated 0.02998323 0.343547 -0.03206946 0.09180567
 
-Note that these results assume that relationships between variables are 
-linear. Consider exploring the specification of each relationship in 
-your model.  Also consider whether it is valid and possible to explore 
-relationships between partially observed variables using the observed 
-data, e.g. avoiding perfect prediction. 
+Interpretation: A small p-value means the stated variables may not be +(conditionally) independent in the specified dataset: your data may not +be consistent with the proposed DAG. A large p-value means there is +little evidence of inconsistency between your data and the proposed +DAG. + +Note that these results assume that relationships between variables are +linear. Consider exploring the specification of each relationship in +your model. Also consider whether it is valid and possible to explore +relationships between partially observed variables using the observed +data, e.g. avoiding perfect prediction.

Based on the relationships between fully observed variables maternal age, maternal education, and missingness of BMI at age 7 years, we can see that there is little evidence of inconsistency between our dataset @@ -246,14 +273,18 @@

-checkCRA(y="bmi7", covs="matage", r_cra="r",
-         mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r")
-
Based on the proposed directed acyclic graph (DAG), the analysis model 
-outcome and complete record indicator are not independent given 
-analysis model covariates. Hence, in general, complete records analysis 
-is not valid. 
- 
-In special cases, depending on the type of analysis model and estimand 
-of interest, complete records analysis may still be valid. See, for 
-example, Bartlett et al. (2015) (https://doi.org/10.1093/aje/kwv114) 
-for further details. 
-
-Consider using a different analysis model and/or strategy, e.g. 
-multiple imputation.  
-For example, the analysis model outcome and complete record indicator 
-are independent if, in addition to the specified covariates, the 
-following sets of variables are included as covariates in the analysis 
-model (note that this list is not necessarily exhaustive, particularly 
-if your DAG is complex): 
-
-{ mated }
-{ mated, sep_unmeas }
+checkCRA(y="bmi7", + covs="matage", + r_cra="r", + mdag="matage -> bmi7 + mated -> matage + mated -> bmi7 + sep_unmeas -> mated + sep_unmeas -> r") +
Based on the proposed directed acyclic graph (DAG), the analysis model
+outcome and complete record indicator are not independent given
+analysis model covariates. Hence, in general, complete records analysis
+is not valid.
+
+In special cases, depending on the type of analysis model and estimand
+of interest, complete records analysis may still be valid. See, for
+example, Bartlett et al. (2015) (https://doi.org/10.1093/aje/kwv114)
+for further details.
+
+Consider using a different analysis model and/or strategy, e.g.
+multiple imputation.
+
+For example, the analysis model outcome and complete record indicator
+are independent if, in addition to the specified covariates, the
+following sets of variables are included as covariates in the analysis
+model (note that this list is not necessarily exhaustive, particularly
+if your DAG is complex):
+
+mated
+
+c("mated", "sep_unmeas")

We can see that CRA would not be valid (we can also tell this by inspecting our DAG: there is an open path from bmi7 to r via mated and sep_unmeas if we @@ -333,11 +372,17 @@

-checkCRA(y="bmi7", covs="matage mated", r_cra="r",
-         mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r")
-
Based on the proposed directed acyclic graph (DAG), the analysis model 
-outcome and complete record indicator are independent given analysis 
-model covariates. Hence, complete records analysis is valid. 
+checkCRA(y="bmi7", + covs="matage mated", + r_cra="r", + mdag="matage -> bmi7 + mated -> matage + mated -> bmi7 + sep_unmeas -> mated + sep_unmeas -> r") +
Based on the proposed directed acyclic graph (DAG), the analysis model
+outcome and complete record indicator are independent given analysis
+model covariates. Hence, complete records analysis is valid.

Note If our outcome, BMI at age 7 years, was itself a cause of missingness, CRA would always be invalid, i.e. there would be no other variables we could add to the analysis model to make @@ -345,21 +390,29 @@

-checkCRA(y="bmi7", covs="matage mated", r_cra="r",
-         mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r bmi7 -> r")
-
Based on the proposed directed acyclic graph (DAG), the analysis model 
-outcome and complete record indicator are not independent given 
-analysis model covariates. Hence, in general, complete records analysis 
-is not valid. 
- 
-In special cases, depending on the type of analysis model and estimand 
-of interest, complete records analysis may still be valid. See, for 
-example, Bartlett et al. (2015) (https://doi.org/10.1093/aje/kwv114) 
-for further details. 
-
-There are no other variables which could be added to the model to make 
-the analysis model outcome and complete record indicator conditionally 
-independent. Consider using a different strategy e.g. multiple imputation. 
+checkCRA(y="bmi7", + covs="matage mated", + r_cra="r", + mdag="matage -> bmi7 + mated -> matage + mated -> bmi7 + sep_unmeas -> mated + sep_unmeas -> r + bmi7 -> r") +
Based on the proposed directed acyclic graph (DAG), the analysis model
+outcome and complete record indicator are not independent given
+analysis model covariates. Hence, in general, complete records analysis
+is not valid.
+
+In special cases, depending on the type of analysis model and estimand
+of interest, complete records analysis may still be valid. See, for
+example, Bartlett et al. (2015) (https://doi.org/10.1093/aje/kwv114)
+for further details.
+
+There are no other variables which could be added to the model to make
+the analysis model outcome and complete record indicator conditionally
+independent. Consider using a different strategy e.g. multiple
+imputation.

Step 3 Check whether multiple imputation is likely to be a valid @@ -377,13 +430,13 @@

+
  pattern bmi7 matage mated pregsize bwt   n pct
+1       1    1      1     1        1   1 592  59
+2       2    0      1     1        1   1 408  41

We can see that our auxiliary variables are fully observed.

We assume that pregnancy size is a cause of BMI at age 7 years, but -not its missingness, whereas we assume birth weight is related to both -BMI at 7 years (via pregnancy size) and its missingness (via SEP). We -will now add these variables to our mDAG. Below, we have shown our -updated mDAG.

+not its missingness. We assume birth weight is related to both BMI at 7 +years (via pregnancy size) and its missingness (via SEP). We will now +add these variables to our mDAG. Below, we have shown our updated +mDAG.

We will also once again explore whether relationships in the dataset are consistent with the updated mDAG using exploreDAG, as follows.

-exploreDAG(mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r 
-           pregsize -> bmi7 pregsize -> bwt  sep_unmeas -> bwt", data=bmi)
-
The proposed directed acyclic graph (DAG) implies that the following 
-pairs of variables are independent, or conditionally independent.  'X 
-_||_ Y | Z' should be read as 'X is independent of Y conditional on Z'. 
-Note that variable names are abbreviated: 
+exploreDAG(mdag="matage -> bmi7 
+                  mated -> matage 
+                  mated -> bmi7 
+                  sep_unmeas -> mated 
+                  sep_unmeas -> r 
+                  pregsize -> bmi7 
+                  pregsize -> bwt  
+                  sep_unmeas -> bwt", 
+           data=bmi)
+
The proposed directed acyclic graph (DAG) implies the following
+conditional independencies (where, for example, 'X _||_ Y | Z' should
+be read as 'X is independent of Y conditional on Z'). Note that
+variable names are abbreviated:
 
 bmi7 _||_ bwt | prgs, sp_n
+
 bmi7 _||_ bwt | matd, prgs
+
 bmi7 _||_ r | sp_n
+
 bmi7 _||_ r | matd
+
 bmi7 _||_ sp_n | matd
+
 bwt _||_ matg | matd
+
 bwt _||_ matg | sp_n
+
 bwt _||_ matd | sp_n
+
 bwt _||_ r | sp_n
+
 matg _||_ prgs
+
 matg _||_ r | sp_n
+
 matg _||_ r | matd
+
 matg _||_ sp_n | matd
+
 matd _||_ prgs
+
 matd _||_ r | sp_n
+
 prgs _||_ r
+
 prgs _||_ sp_n
 
-These (conditional) independence statements are explored below using 
-the canonical correlations approach for mixed data. See 
-??dagitty::localTests for further details. 
- 
-Results are shown for variables that are fully observed in the 
-specified dataset. 
-The null hypothesis is that the stated variables are (conditionally) 
-independent. 
+These (conditional) independence statements are explored below using
+the canonical correlations approach for mixed data. See
+??dagitty::localTests for further details.  Results are shown for
+variables that are fully observed in the specified dataset. The null
+hypothesis is that the stated variables are (conditionally)
+independent.
 
                           estimate   p.value        2.5%      97.5%
+
 bwt _||_ matage | mated 0.05018898 0.1127099 -0.01184095 0.11183410
+
 matage _||_ pregsize    0.03029139 0.3386080 -0.03176134 0.09211150
+
 matage _||_ r | mated   0.02998323 0.3435470 -0.03206946 0.09180567
+
 mated _||_ pregsize     0.01594976 0.6144181 -0.04608889 0.07786585
-pregsize _||_ r         0.01482015 0.6397174 -0.04721631 0.07674273
 
- Interpretation: A small p-value means the stated variables may not be 
-(conditionally) independent in the specified dataset: your data may not 
-be consistent with the proposed DAG.  A large p-value means there is 
-little evidence of inconsistency between your data and the proposed DAG. 
+pregsize _||_ r         0.01482015 0.6397174 -0.04721631 0.07674273
 
-Note that these results assume that relationships between variables are 
-linear. Consider exploring the specification of each relationship in 
-your model.  Also consider whether it is valid and possible to explore 
-relationships between partially observed variables using the observed 
-data, e.g. avoiding perfect prediction. 
+Interpretation: A small p-value means the stated variables may not be +(conditionally) independent in the specified dataset: your data may not +be consistent with the proposed DAG. A large p-value means there is +little evidence of inconsistency between your data and the proposed +DAG. + +Note that these results assume that relationships between variables are +linear. Consider exploring the specification of each relationship in +your model. Also consider whether it is valid and possible to explore +relationships between partially observed variables using the observed +data, e.g. avoiding perfect prediction.

Our results suggest that our updated mDAG is plausible.

Note that CRA is still valid for our updated mDAG. We can check this using checkCRA once more:

-checkCRA(y="bmi7", covs="matage mated", r_cra="r",
-         mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r 
-         pregsize -> bmi7 pregsize -> bwt  sep_unmeas -> bwt")
-
Based on the proposed directed acyclic graph (DAG), the analysis model 
-outcome and complete record indicator are independent given analysis 
-model covariates. Hence, complete records analysis is valid. 
+checkCRA(y="bmi7", + covs="matage mated", + r_cra="r", + mdag="matage -> bmi7 + mated -> matage + mated -> bmi7 + sep_unmeas -> mated + sep_unmeas -> r + pregsize -> bmi7 + pregsize -> bwt + sep_unmeas -> bwt") +
Based on the proposed directed acyclic graph (DAG), the analysis model
+outcome and complete record indicator are independent given analysis
+model covariates. Hence, complete records analysis is valid.

We will now use the midoc function checkMI applied to our DAG to check whether MI is valid when the imputation model predictors for BMI at age 7 years include pregnancy size or birth -weight, as well as maternal age and maternal education, specifying the -partially observed variable (dep), predictors +weight, as well as maternal age and maternal education. We will specify +the partially observed variable (dep), predictors (preds), missingness indicator for the partially observed variable (r_dep), and mDAG (mdag).

We will first consider the imputation model including pregnancy size. @@ -493,41 +585,57 @@

-checkMI(dep="bmi7", preds="matage mated pregsize", r_dep="r",
-        mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r 
-        pregsize -> bmi7 pregsize -> bwt  sep_unmeas -> bwt")
-
Based on the proposed directed acyclic graph (DAG), the incomplete 
-variable and its missingness indicator are independent given imputation 
-model predictors. Hence, multiple imputation methods which assume data 
+checkMI(dep="bmi7", 
+        preds="matage mated pregsize", 
+        r_dep="r",
+        mdag="matage -> bmi7 
+              mated -> matage 
+              mated -> bmi7 
+              sep_unmeas -> mated 
+              sep_unmeas -> r 
+              pregsize -> bmi7 
+              pregsize -> bwt  
+              sep_unmeas -> bwt")
+
Based on the proposed directed acyclic graph (DAG), the incomplete
+variable and its missingness indicator are independent given imputation
+model predictors. Hence, multiple imputation methods which assume data
 are missing at random are valid in principle.

We will next consider the imputation model including birth weight. The results are shown below. These suggest that MI would not be valid if we included birth weight as well as the other analysis model variables -in the imputation model for BMI at age 7 years (we can also tell this by +in the imputation model for BMI at age 7 years. We can also tell this by inspecting our mDAG: since bwt shares a common cause with both bmi7 and r, it is a “collider”, and hence conditioning on bwt opens a path from bmi7 to -r via bwt).

+r via bwt.

-checkMI(dep="bmi7", preds="matage mated bwt", r_dep="r",
-        mdag="matage -> bmi7 mated -> matage mated -> bmi7 sep_unmeas -> mated sep_unmeas -> r 
-        pregsize -> bmi7 pregsize -> bwt  sep_unmeas -> bwt")
-
Based on the proposed directed acyclic graph (DAG), the incomplete 
-variable and its missingness indicator are not independent given 
-imputation model predictors. Hence, multiple imputation methods which 
-assume data are missing at random are not valid. 
- 
-Consider using a different imputation model and/or strategy (e.g. 
-not-at-random fully conditional specification). 
-
-For example, the incomplete variable and its missingness indicator are 
-independent if, in addition to the specified predictors, the following 
-sets of variables are included as predictors in the imputation model 
-(note that this list is not necessarily exhaustive, particularly if 
-your DAG is complex): 
-
-{ pregsize }
-{ pregsize, sep_unmeas }
+checkMI(dep="bmi7", + preds="matage mated bwt", + r_dep="r", + mdag="matage -> bmi7 + mated -> matage + mated -> bmi7 + sep_unmeas -> mated + sep_unmeas -> r + pregsize -> bmi7 + pregsize -> bwt + sep_unmeas -> bwt") +
Based on the proposed directed acyclic graph (DAG), the incomplete
+variable and its missingness indicator are not independent given
+imputation model predictors. Hence, multiple imputation methods which
+assume data are missing at random are not valid.
+
+Consider using a different imputation model and/or strategy (e.g.
+not-at-random fully conditional specification).  For example, the
+incomplete variable and its missingness indicator are independent if,
+in addition to the specified predictors, the following sets of
+variables are included as predictors in the imputation model (note that
+this list is not necessarily exhaustive, particularly if your DAG is
+complex):
+
+pregsize
+
+c("pregsize", "sep_unmeas")

Note In theory, and as suggested by the checkMI results shown above, MI would be valid if we added both birth weight and pregnancy size as auxiliary variables in our @@ -536,7 +644,7 @@