Skip to content

Commit

Permalink
Clean up trues, update infants naming
Browse files Browse the repository at this point in the history
  • Loading branch information
delosh653 committed Sep 13, 2023
1 parent 16f9dc5 commit 374aeec
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 35 deletions.
46 changes: 23 additions & 23 deletions R/growth.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
#' @param adult_columns_filename Name of file to save original adult data, with additional output columns to
#' as CSV. Defaults to "", for which this data will not be saved. Useful
#' for post-analysis. For more information on this output, please see README.
#' @param infants TRUE/FALSE. Run the beta-release of the infants algorithm (expands pediatric algorithm to clean 0 - 2). Defaults to FALSE.
#' @param prelim_infants TRUE/FALSE. Run the in-development release of the infants algorithm (expands pediatric algorithm to improve performance for children 0 – 2 years). Not recommended for use in research. For more information regarding the logic of the algorithm, see the vignette 'Preliminary Infants Algorithm.' Defaults to FALSE.
#'
#' @return Vector of exclusion codes for each of the input measurements.
#'
Expand Down Expand Up @@ -157,7 +157,7 @@ cleangrowth <- function(subjid,
adult_cutpoint = 20,
weight_cap = Inf,
adult_columns_filename = "",
infants = FALSE) {
prelim_infants = FALSE) {
# avoid "no visible binding" warnings
N <- age_years <- batch <- exclude <- index <- line <- NULL
newbatch <- sd.median <- sd.orig <- tanner.months <- tbc.sd <- NULL
Expand All @@ -168,7 +168,7 @@ cleangrowth <- function(subjid,
sd.orig_uncorr <- agemonths <- intwt <- fengadays <- pmagedays <- cagedays <-
unmod_zscore <- fen_wt_m <- fen_wt_l <- fen_wt_s <- cwho_cv <- ccdc_cv <-
sd.c_cdc <- sd.c_who <- sd.c <- sd.corr <- seq_win <- sd.corr_abssumdiff <-
sd.orig_abssumdiff <- ..orig_colnames <- ctbc.sd <- sum_sde <- no_sde <-
sd.orig_abssumdiff <- orig_colnames <- ctbc.sd <- sum_sde <- no_sde <-
sum_val <- no_dup_val <- no_outliers <- no_bigdiff <- nottoofar <- nnte <-
nnte_full <- NULL

Expand Down Expand Up @@ -227,7 +227,7 @@ cleangrowth <- function(subjid,

# constants for pediatric
# enumerate the different exclusion levels
if (infants){
if (prelim_infants){
# different for infants
exclude.levels.peds <- c(
'Include',
Expand Down Expand Up @@ -372,7 +372,7 @@ cleangrowth <- function(subjid,
if (is.na(num.batches)) {
num.batches <- getDoParWorkers()
}
if (infants){
if (prelim_infants){
# variables needed for parallel workers
var_for_par <- c("temporary_extraneous", "valid", "swap_parameters",
"na_as_false", "ewma", "read_anthro", "as_matrix_delta",
Expand Down Expand Up @@ -426,7 +426,7 @@ cleangrowth <- function(subjid,
tanner.fields <- colnames(tanner.ht.vel)
tanner.fields <- tanner.fields[!tanner.fields %in% c('sex', 'tanner.months')]

if (!infants){
if (!prelim_infants){
who_max_ht_vel_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata", "who_ht_maxvel_3sd.csv.gz"), package = "growthcleanr"),
Expand Down Expand Up @@ -510,15 +510,15 @@ cleangrowth <- function(subjid,
data.all[param == 'LENGTHCM', param := 'HEIGHTCM']

# calculate z/sd scores
if(infants){
if(prelim_infants){
if (!quietly)
cat(sprintf("[%s] Calculating z-scores...\n", Sys.time()))
# removing z calculations, as they are not used
# for infants, use z and who
measurement.to.z <- read_anthro(ref.data.path, cdc.only = TRUE,
infants = T)
prelim_infants = TRUE)
measurement.to.z_who <- read_anthro(ref.data.path, cdc.only = FALSE,
infants = T)
prelim_infants = TRUE)

# calculate "standard deviation" scores
if (!quietly)
Expand Down Expand Up @@ -589,7 +589,7 @@ cleangrowth <- function(subjid,
# merge with fenton curves
data.all <- merge(
data.all, fentlms_foraga, by = c("sex", "intwt"),
all.x = T)
all.x = TRUE)

data.all[fengadays < 259, pmagedays := agedays + fengadays]
data.all[fengadays < 259, cagedays := pmagedays - 280]
Expand All @@ -599,7 +599,7 @@ cleangrowth <- function(subjid,
# merge with fenton curves
data.all <- merge(
data.all, fentlms_forz, by = c("sex", "fengadays"),
all.x = T)
all.x = TRUE)

# add unmodified zscore using weight in unrounded grams
data.all[, unmod_zscore :=
Expand All @@ -617,11 +617,11 @@ cleangrowth <- function(subjid,
data.all <- merge(data.all, growthfile_who,
by.x = c("sex", "cagedays"),
by.y = c("sex", "agedays"),
all.x = T)
all.x = TRUE)
data.all <- merge(data.all, growthfile_cdc,
by.x = c("sex", "cagedays"),
by.y = c("sex", "agedays"),
all.x = T)
all.x = TRUE)

# adjust WHO and CDC heights based on age
data.all[, cwho_cv := v]
Expand Down Expand Up @@ -759,7 +759,7 @@ cleangrowth <- function(subjid,
if (!is.data.table(sd.recenter)) {
# INFANTS CHANGES:
# use recentering file derived from work, independent of sex
if (infants){
if (prelim_infants){
infants_reference_medians_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata",
Expand Down Expand Up @@ -838,7 +838,7 @@ cleangrowth <- function(subjid,

setkey(data.all, subjid, param, agedays)
data.all[, tbc.sd := sd.orig - sd.median]
if (infants){
if (prelim_infants){
# separate out corrected and noncorrected values
data.all[, ctbc.sd := sd.corr - sd.median]
}
Expand Down Expand Up @@ -871,7 +871,7 @@ cleangrowth <- function(subjid,
# safety check: treat observations where tbc.sd cannot be calculated as missing
data.all[is.na(tbc.sd), exclude := 'Missing']

if (infants){
if (prelim_infants){
# 4: identify subset that don't need to be cleaned (nnte) ----

# identify those meeting all subjects meeting these criteria as "no need
Expand All @@ -894,7 +894,7 @@ cleangrowth <- function(subjid,
data.all[, no_outliers := no_outliers == 1]
# all max - min tbd.sc < 2.5
data.all[, no_bigdiff :=
rep((abs(max(tbc.sd, na.rm = T) - min(tbc.sd, na.rm = T)) < 2.5),
rep((abs(max(tbc.sd, na.rm = TRUE) - min(tbc.sd, na.rm = TRUE)) < 2.5),
.N),
by = c("subjid", "param")]
# the previous value can't be too far from the current value
Expand Down Expand Up @@ -930,7 +930,7 @@ cleangrowth <- function(subjid,
num.batches
))
if (num.batches == 1) {
if (!infants){
if (!prelim_infants){
ret.df <- cleanbatch(data.all,
log.path = log.path,
quietly = quietly,
Expand Down Expand Up @@ -975,7 +975,7 @@ cleangrowth <- function(subjid,
ifelse(!dir.exists(log.path), dir.create(log.path, recursive = TRUE), FALSE)
}

if (!infants){
if (!prelim_infants){
ret.df <- ddply(
data.all,
.(batch),
Expand Down Expand Up @@ -1152,7 +1152,7 @@ cleangrowth <- function(subjid,
#'
#' @param path Path to supplied reference anthro data. Defaults to package anthro tables.
#' @param cdc.only Whether or not only CDC data should be used. Defaults to false.
#' @param infants TRUE/FALSE. Run the beta-release of the infants algorithm (expands pediatric algorithm to clean 0 - 2). Defaults to FALSE.
#' @param prelim_infants TRUE/FALSE. Run the in-development release of the infants algorithm (expands pediatric algorithm to improve performance for children 0 – 2 years). Not recommended for use in research. For more information regarding the logic of the algorithm, see the vignette 'Preliminary Infants Algorithm.' Defaults to FALSE.
#'
#' @return Function for calculating BMI based on measurement, age in days, sex, and measurement value.
#' @export
Expand All @@ -1165,13 +1165,13 @@ cleangrowth <- function(subjid,
#' # Return calculating function while specifying a path and using only CDC data
#' afunc <- read_anthro(path = system.file("extdata", package = "growthcleanr"),
#' cdc.only = TRUE)
read_anthro <- function(path = "", cdc.only = FALSE, infants = FALSE) {
read_anthro <- function(path = "", cdc.only = FALSE, prelim_infants = FALSE) {
# avoid "no visible bindings" warning
src <- param <- sex <- age <- ret <- m <- NULL
csdneg <- csdpos <- s <- NULL

# set correct path based on input reference table path (if any)
if (!infants){
if (!prelim_infants){
weianthro_path <- ifelse(
path == "",
system.file(file.path("extdata", "weianthro.txt.gz"), package = "growthcleanr"),
Expand Down Expand Up @@ -1207,7 +1207,7 @@ read_anthro <- function(path = "", cdc.only = FALSE, infants = FALSE) {
}
growth_cdc_ext <- read.csv(gzfile(growth_cdc_ext_path))

l <- if (!infants){
l <- if (!prelim_infants){
list(
with(
read.table(gzfile(weianthro_path), header = TRUE),
Expand Down
24 changes: 12 additions & 12 deletions R/infants_clean.R
Original file line number Diff line number Diff line change
Expand Up @@ -365,15 +365,15 @@ cleanbatch_infants <- function(data.df,
upd.df <- copy(df)
upd.df <- calc_oob_evil_twins(upd.df)
# count the amount of oobs for each subject/param and distribute it out
upd.df[, `:=` (sum_oob = sum(oob, na.rm = T)), by =.(subjid, param)]
upd.df[, `:=` (sum_oob = sum(oob, na.rm = TRUE)), by =.(subjid, param)]

any_oob <- any(upd.df$sum_oob >= 2)
# while there are multiple oob, we want to remove
while (any_oob){

# 9D
# now calculate the maximum difference from the median tbc.sd
upd.df[, `:=` (sd_med = median(tbc.sd, na.rm = T)), by =.(subjid, param)]
upd.df[, `:=` (sd_med = median(tbc.sd, na.rm = TRUE)), by =.(subjid, param)]
upd.df[, `:=` (med_diff = abs(tbc.sd - sd_med)), by =.(subjid, param)]
upd.df[, `:=` (max_diff = med_diff == max(med_diff)), by =.(subjid, param)]
# for ones with no tbc.sd, mark as false
Expand All @@ -386,7 +386,7 @@ cleanbatch_infants <- function(data.df,
#9E
# reupdate valid (to recalculate OOB -- others are not included)
upd.df <- calc_oob_evil_twins(df[valid(df),])
upd.df[, `:=` (sum_oob = sum(oob, na.rm = T)), by =.(subjid, param)]
upd.df[, `:=` (sum_oob = sum(oob, na.rm = TRUE)), by =.(subjid, param)]

any_oob <- any(upd.df$sum_oob >= 2)

Expand Down Expand Up @@ -445,7 +445,7 @@ cleanbatch_infants <- function(data.df,
"before" = abs(df_sub$agedays - c(NA, df_sub$agedays[1:(nrow(df_sub)-1)])),
"after" = abs(df_sub$agedays - c(df_sub$agedays[2:(nrow(df_sub))], NA))
)
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = T)})
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = TRUE)})
exp_vals <- rep(-1.5, nrow(tmp))
exp_vals[maxdiff > 365.25] <- -2.5
exp_vals[maxdiff > 730.5] <- -3.5
Expand Down Expand Up @@ -652,7 +652,7 @@ cleanbatch_infants <- function(data.df,
idx_roll <- c(embed(1:nrow(subj_df),4)[adjacent,, drop = FALSE])
idx_adj <- which(subj_df$agedays %in% subj_df$agedays[idx_roll])
} else {
adjacent <- F
adjacent <- FALSE
idx_adj <- idx_roll <- c()
}

Expand Down Expand Up @@ -680,7 +680,7 @@ cleanbatch_infants <- function(data.df,
# only exclude sdes that are adjacent
subj_df$agedays %in% subj_df$agedays[idx_roll]
} else {
rep(F, nrow(subj_df))
rep(FALSE, nrow(subj_df))
}
# re-include similar groups
criteria[subj_df$index %in% similar_ids] <- FALSE
Expand Down Expand Up @@ -709,15 +709,15 @@ cleanbatch_infants <- function(data.df,

# check for SDEs by EWMA -- alternate calculate excluding other SDEs
all_sdes <- duplicated(subj_df$agedays) |
duplicated(subj_df$agedays, fromLast = T)
duplicated(subj_df$agedays, fromLast = TRUE)

# first, calculate which exponent we want to put through (pass a different
# on for each exp)
tmp <- data.frame(
"before" = abs(subj_df$agedays - c(NA, subj_df$agedays[1:(nrow(subj_df)-1)])),
"after" = abs(subj_df$agedays - c(subj_df$agedays[2:(nrow(subj_df))], NA))
)
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = T)})
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = TRUE)})
exp_vals <- rep(-1.5, nrow(tmp))
exp_vals[maxdiff > 365.25] <- -2.5
exp_vals[maxdiff > 730.5] <- -3.5
Expand Down Expand Up @@ -792,7 +792,7 @@ cleanbatch_infants <- function(data.df,
if (any(subj_df$extraneous)){
# check for SDEs
all_sdes <- duplicated(subj_df$agedays) |
duplicated(subj_df$agedays, fromLast = T)
duplicated(subj_df$agedays, fromLast = TRUE)

rem_ids <- c()
rem_ids_extreme <- c()
Expand Down Expand Up @@ -930,7 +930,7 @@ cleanbatch_infants <- function(data.df,
"before" = abs(df_sub$agedays - c(NA, df_sub$agedays[1:(nrow(df_sub)-1)])),
"after" = abs(df_sub$agedays - c(df_sub$agedays[2:(nrow(df_sub))], NA))
)
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = T)})
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = TRUE)})
exp_vals <- rep(-1.5, nrow(tmp))
exp_vals[maxdiff > 365.25] <- -2.5
exp_vals[maxdiff > 730.5] <- -3.5
Expand Down Expand Up @@ -1191,7 +1191,7 @@ cleanbatch_infants <- function(data.df,
"before" = abs(df_sub$agedays - c(NA, df_sub$agedays[1:(nrow(df_sub)-1)])),
"after" = abs(df_sub$agedays - c(df_sub$agedays[2:(nrow(df_sub))], NA))
)
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = T)})
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = TRUE)})
exp_vals <- rep(-1.5, nrow(tmp))
exp_vals[maxdiff > 365.25] <- -2.5
exp_vals[maxdiff > 730.5] <- -3.5
Expand Down Expand Up @@ -1556,7 +1556,7 @@ cleanbatch_infants <- function(data.df,
"before" = abs(df$agedays - c(NA, df$agedays[1:(nrow(df)-1)])),
"after" = abs(df$agedays - c(df$agedays[2:(nrow(df))], NA))
)
maxdiff_e <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = T)})
maxdiff_e <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = TRUE)})
exp_vals <- rep(-1.5, nrow(tmp))
exp_vals[maxdiff_e > 365.25] <- -2.5
exp_vals[maxdiff_e > 730.5] <- -3.5
Expand Down

0 comments on commit 374aeec

Please sign in to comment.