diff --git a/R/infants_clean.R b/R/infants_clean.R index 8e5a891..a1c5b0d 100644 --- a/R/infants_clean.R +++ b/R/infants_clean.R @@ -853,6 +853,267 @@ cleanbatch_infants <- function(data.df, return(exclude_all) })(copy(.SD)), by = .(subjid, param), .SDcols = colnames(data.df)] + # 15: moderate EWMA ---- + + # THINK ABOUT NNTE FULL HERE + + # create the valid set + # we only running carried forwards on valid values, non NNTE values, + # and non single values, and non pair + tmp <- table(paste0(data.df$subjid, "_", data.df$param)) + not_single_pairs <- paste0(data.df$subjid, "_", data.df$param) %in% names(tmp)[tmp > 2] + valid_set <- valid(data.df, include.temporary.extraneous = FALSE) & + !data.df$nnte_full & # does not use the "other" + not_single_pairs + + # work in an a function order to encapsulate and not keep all the additional + # columns + + # FUTURE UPDATE: I think I have to make sure I replace the temp-extraneous codes with includes after the first time, they're alone now + + # calculate plus/minus values + + # order just for ease later + data.df <- data.df[order(subjid, param, agedays),] + data.df <- data.df[valid.rows, exclude := (function(df) { + # 15A: calc plus/minus values + df[param == "WEIGHTKG", p_plus := 1.05*v] + df[param == "WEIGHTKG", p_minus := .95*v] + + df[param == "HEIGHTCM", p_plus := v+1] + df[param == "HEIGHTCM", p_minus := v-1] + + df[param == "HEADCM", p_plus := v+1] + df[param == "HEADCM", p_minus := v-1] + + #15B: smooth and recenter + df <- calc_and_recenter_z_scores(df, "p_plus", ref.data.path) + df <- calc_and_recenter_z_scores(df, "p_minus", ref.data.path) + + #15c: exclude agedays = 0 for ht, hc + df <- df[(param != "HEIGHTCM" | param != "HEADCM") & agedays != 0, ] + + #15d: calculate ewma -- run within a subject/parameter + df <- df[, exclude := (function(df_sub) { + # save initial exclusions to keep track + ind_all <- copy(df_sub$index) + exclude_all <- copy(df_sub$exclude) + + testing <- TRUE + + while (testing & nrow(df_sub) > 3){ + df_sub[, (ewma.fields) := as.double(NaN)] + + # first, calculate which exponent we want to put through (pass a different + # on for each exp) + # subset df to only valid rows + + tmp <- data.frame( + "before" = abs(df_sub$agedays - c(NA, df_sub$agedays[1:(nrow(df_sub)-1)])), + "after" = abs(df_sub$agedays - c(df_sub$agedays[2:(nrow(df_sub))], NA)) + ) + maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = T)}) + exp_vals <- rep(-1.5, nrow(tmp)) + exp_vals[maxdiff > 365.25] <- -2.5 + exp_vals[maxdiff > 730.5] <- -3.5 + df_sub[, exp_vals := exp_vals] + + # calculate ewma + df_sub[, (ewma.fields) := ewma(agedays, tbc.sd, exp_vals, TRUE)] + df_sub[, paste0("c.",ewma.fields) := ewma(agedays, ctbc.sd, exp_vals, TRUE)] + + # calculate dewma + df_sub[, `:=`( + dewma.all = tbc.sd - ewma.all, + dewma.before = tbc.sd - ewma.before, + dewma.after = tbc.sd - ewma.after, + + c.dewma.all = ctbc.sd - c.ewma.all + )] + + # 15E: calculate prior and next differences + df_sub[, tbc_diff_next := tbc.sd - c(tbc.sd[2:nrow(df_sub)], NA)] + df_sub[, tbc_diff_prior := tbc.sd - c(NA, tbc.sd[1:(nrow(df_sub)-1)])] + + df_sub[, tbc_diff_plus_next := tbc.p_plus - c(tbc.sd[2:nrow(df_sub)], NA)] + df_sub[, tbc_diff_plus_prior := + tbc.p_plus- c(NA, tbc.sd[1:(nrow(df_sub)-1)])] + + df_sub[, tbc_diff_minus_next := tbc.p_minus - c(tbc.sd[2:nrow(df_sub)], NA)] + df_sub[, tbc_diff_minus_prior := + tbc.p_minus- c(NA, tbc.sd[1:(nrow(df_sub)-1)])] + + # 15F: all exclusion criteria + df_sub[, + addcrithigh := + dewma.before > 1 & dewma.after > 1 & + ((tbc_diff_next > 1 & tbc_diff_plus_next > 1 & tbc_diff_minus_next > 1) | is.na(tbc_diff_next)) & + ((tbc_diff_prior > 1 & tbc_diff_plus_prior > 1 & tbc_diff_minus_prior > 1) | is.na(tbc_diff_prior)) + ] + df_sub[, + addcritlow := + dewma.before < -1 & dewma.after < -1 & + ((tbc_diff_next < -1 & tbc_diff_plus_next < -1 & tbc_diff_minus_next < -1) | is.na(tbc_diff_next)) & + ((tbc_diff_prior < -1 & tbc_diff_plus_prior < -1 & tbc_diff_minus_prior < -1) | is.na(tbc_diff_prior)) + ] + + #15G: find all the values for the DOP + # find the comparison -- making sure to keep only valid rows + compare_df <- data.df[subjid == df_sub$subjid[1] & + param == get_dop(df_sub$param[1]) & + exclude == "Include",] + if (nrow(compare_df) > 0){ + df_sub[compare_df, tbc_dop := i.tbc.sd, + on = c("agedays")] + df_sub[is.na(tbc_dop), tbc_dop := median(compare_df$tbc.sd)] + } else { + df_sub[, tbc_dop := NA] + } + + #15H: all exclusions + # for ease, right now -- REMOVE + df_sub[, exclude := as.character(df_sub$exclude)] + df_sub$rowind <- 1:nrow(df_sub) + + #a + df_sub[-c(1, nrow(df_sub)),][ + dewma.all > 1 & (c.dewma.all > 1 | is.na(c.dewma.all)) & addcrithigh, + exclude := "Exclude-EWMA2-middle"] + df_sub[-c(1, nrow(df_sub)),][ + dewma.all < -1 & (c.dewma.all < -1 | is.na(c.dewma.all)) & addcritlow, + exclude := "Exclude-EWMA2-middle"] + + #b + df_sub[agedays == 0 & agedays[2] < 365.25 & + dewma.all > 3 & (c.dewma.all > 3 | is.na(c.dewma.all)) & + addcrithigh + , exclude := "Exclude-EWMA2-birth-WT"] + df_sub[agedays == 0 & agedays[2] < 365.25 & + dewma.all < -3 & (c.dewma.all < -3 | is.na(c.dewma.all)) & + addcritlow + , exclude := "Exclude-EWMA2-birth-WT"] + + #c + df_sub[agedays == 0 & agedays[2] >= 365.25 & + dewma.all > 4 & (c.dewma.all > 4 | is.na(c.dewma.all)) & + addcrithigh + , exclude := "Exclude-EWMA-2-birth-WT-ext"] + df_sub[agedays == 0 & agedays[2] >= 365.25 & + dewma.all < -4 & (c.dewma.all < -4 | is.na(c.dewma.all)) & + addcritlow + , exclude := "Exclude-EWMA-2-birth-WT-ext"] + + #d + df_sub[rowind == 1 & agedays[1] != 0 & agedays[2]-agedays[1] < 365.25 & + dewma.all > 2 & (c.dewma.all > 2 | is.na(c.dewma.all)) & + addcrithigh + , exclude := "Exclude-EWMA-2-first"] + df_sub[rowind == 1 & agedays[1] != 0 & agedays[2]-agedays[1] < 365.25 & + dewma.all < -2 & (c.dewma.all < -2 | is.na(c.dewma.all)) & + addcritlow + , exclude := "Exclude-EWMA-2-first"] + + # e + df_sub[rowind == 1 & agedays[1] != 0 & agedays[2]-agedays[1] >= 365.25 & + dewma.all > 3 & (c.dewma.all > 3 | is.na(c.dewma.all)) & + addcrithigh + , exclude := "Exclude-EWMA-2-first-ext"] + df_sub[rowind == 1 & agedays[1] != 0 & agedays[2]-agedays[1] >= 365.25 & + dewma.all < -3 & (c.dewma.all < -3 | is.na(c.dewma.all)) & + addcritlow + , exclude := "Exclude-EWMA-2-first-ext"] + + # f + df_sub[rowind == nrow(df_sub) & + agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] < 365.25*2 & + abs(tbc.sd[nrow(df_sub)-1]) < 2 & + dewma.all > 2 & (c.dewma.all > 2 | is.na(c.dewma.all)) & + addcrithigh + , exclude := "Exclude-EWMA2-last"] + df_sub[rowind == nrow(df_sub) & + agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] < 365.25*2 & + abs(tbc.sd[nrow(df_sub)-1]) < 2 & + dewma.all < -2 & (c.dewma.all < -2 | is.na(c.dewma.all)) & + addcritlow + , exclude := "Exclude-EWMA2-last"] + + # g + df_sub[rowind == nrow(df_sub) & + agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] < 365.25*2 & + abs(tbc.sd[nrow(df_sub)-1]) >= 2 & + dewma.all > abs(tbc.sd[nrow(df_sub)-1]) & + (c.dewma.all > 3 | is.na(c.dewma.all)) & + addcrithigh + , exclude := "Exclude-EWMA2-last-high"] + df_sub[rowind == nrow(df_sub) & + agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] < 365.25*2 & + abs(tbc.sd[nrow(df_sub)-1]) >= 2 & + dewma.all < abs(tbc.sd[nrow(df_sub)-1]) & + (c.dewma.all < -3 | is.na(c.dewma.all)) & + addcritlow + , exclude := "Exclude-EWMA2-last-high"] + + # h + df_sub[rowind == nrow(df_sub) & + agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] >= 365.25*2 & + abs(tbc.sd[nrow(df_sub)-1]) < 2 & + dewma.all > 3 & + (c.dewma.all > 3 | is.na(c.dewma.all)) & + (tbc.sd - tbc_dop > 4 | is.na(tbc_dop)) & + addcrithigh + , exclude := "Exclude-EWMA2-last-ext"] + df_sub[rowind == nrow(df_sub) & + agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] >= 365.25*2 & + abs(tbc.sd[nrow(df_sub)-1]) < 2 & + dewma.all < -3 & + (c.dewma.all <- 3 | is.na(c.dewma.all)) & + (tbc.sd - tbc_dop < -4 | is.na(tbc_dop)) & + addcritlow + , exclude := "Exclude-EWMA2-last-ext"] + + # i + df_sub[rowind == nrow(df_sub) & + agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] >= 365.25*2 & + abs(tbc.sd[nrow(df_sub)-1]) >= 2 & + dewma.all > (1+abs(tbc.sd[nrow(df_sub)-1])) & + (c.dewma.all > 3 | is.na(c.dewma.all)) & + (tbc.sd - tbc_dop > 4 | is.na(tbc_dop)) & + addcrithigh + , exclude := "Exclude-EWMA2-last-ext-high"] + df_sub[rowind == nrow(df_sub) & + agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] >= 365.25*2 & + abs(tbc.sd[nrow(df_sub)-1]) >= 2 & + dewma.all < (-1-abs(tbc.sd[nrow(df_sub)-1])) & + (c.dewma.all <- 3 | is.na(c.dewma.all)) & + (tbc.sd - tbc_dop < -4 | is.na(tbc_dop)) & + addcritlow + , exclude := "Exclude-EWMA2-last-ext-high"] + + # figure out if any of the exclusions hit + count_exclude <- sum(df_sub$exclude != "Include") + if (count_exclude > 0){ + df_sub[, abssum := abs(tbc.sd + dewma.all)] + + # choose the highest abssum for exclusion + idx <- df_sub$index[which.max(df_sub[df_sub$exclude != "Include", + abssum])] + + exclude_all[ind_all == idx] <- df_sub[index == idx, exclude] + + #set up to continue on + testing <- TRUE + + df_sub <- df_sub[ind_all != idx, ] + } else { + testing <- FALSE + } + } + return(exclude_all) + })(copy(.SD)), , by = .(subjid, param), .SDcols = colnames(df)] + + return(df$exclude) + })(copy(.SD)), .SDcols = colnames(data.df)] + # end ---- diff --git a/R/infants_support.R b/R/infants_support.R index 20341e6..a6e5232 100644 --- a/R/infants_support.R +++ b/R/infants_support.R @@ -139,3 +139,58 @@ calc_oob_evil_twins <- function(df){ return(df) } + +# moderate ewma ---- + +# function to calculate and recenter z scores for a given column +# df: data frame with parameter, agedays, sex, cn +# cn: column name to calculate, smooth, and recenter +# ref.data.path: reference data path +# +# returns df with additional column, tbc.(cn), which is the recentered z-score +# for the input +calc_and_recenter_z_scores <- function(df, cn, ref.data.path){ + # for infants, use z and who + measurement.to.z <- read_anthro(ref.data.path, cdc.only = TRUE, + infants = T) + measurement.to.z_who <- read_anthro(ref.data.path, cdc.only = FALSE, + infants = T) + + # calculate "standard deviation" scores + if (!quietly) + cat(sprintf("[%s] Calculating SD-scores...\n", Sys.time())) + df[, cn.orig_cdc := measurement.to.z(param, agedays, sex, get(cn), TRUE)] + df[, cn.orig_who := measurement.to.z_who(param, agedays, sex, get(cn), TRUE)] + + # smooth z-scores/SD scores between ages 1 - 3yo using weighted scores + # older uses cdc, younger uses who + who_weight <- 4 - (df$agedays/365.25) + cdc_weight <- (df$agedays/365.25) - 2 + + smooth_val <- df$agedays/365.25 >= 2 & + df$agedays/365.25 <= 4 & + df$param != "HEADCM" + df[smooth_val, + cn.orig := (cn.orig_cdc[smooth_val]*cdc_weight[smooth_val] + + cn.orig_who[smooth_val]*who_weight[smooth_val])/2] + + # otherwise use WHO and CDC for older and younger, respectively + who_val <- df$param == "HEADCM" | + df$agedays/365.25 < 2 + df[who_val | (smooth_val & is.na(df$cn.orig_cdc)), + cn.orig := df$cn.orig_who[who_val | (smooth_val & is.na(df$cn.orig_cdc))]] + + cdc_val <- df$param != "HEADCM" | + df$agedays/365.25 > 4 + df[cdc_val | (smooth_val & is.na(df$cn.orig_who)), + cn.orig := df$cn.orig_cdc[cdc_val | (smooth_val & is.na(df$sd.orig_who))]] + + # now recenter -- already has the sd.median from the original recentering + setkey(df, subjid, param, agedays) + df[, tbc.cn := cn.orig - sd.median] + + # rename ending column + setnames(df, "tbc.cn", paste0("tbc.", cn)) + + return(df) +}