Skip to content

Commit

Permalink
Add first draft of moderate ewma
Browse files Browse the repository at this point in the history
  • Loading branch information
delosh653 committed Jul 26, 2023
1 parent 5de245d commit 87f1942
Show file tree
Hide file tree
Showing 2 changed files with 316 additions and 0 deletions.
261 changes: 261 additions & 0 deletions R/infants_clean.R
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,267 @@ cleanbatch_infants <- function(data.df,
return(exclude_all)
})(copy(.SD)), by = .(subjid, param), .SDcols = colnames(data.df)]

# 15: moderate EWMA ----

# THINK ABOUT NNTE FULL HERE

# create the valid set
# we only running carried forwards on valid values, non NNTE values,
# and non single values, and non pair
tmp <- table(paste0(data.df$subjid, "_", data.df$param))
not_single_pairs <- paste0(data.df$subjid, "_", data.df$param) %in% names(tmp)[tmp > 2]
valid_set <- valid(data.df, include.temporary.extraneous = FALSE) &
!data.df$nnte_full & # does not use the "other"
not_single_pairs

# work in an a function order to encapsulate and not keep all the additional
# columns

# FUTURE UPDATE: I think I have to make sure I replace the temp-extraneous codes with includes after the first time, they're alone now

# calculate plus/minus values

# order just for ease later
data.df <- data.df[order(subjid, param, agedays),]
data.df <- data.df[valid.rows, exclude := (function(df) {
# 15A: calc plus/minus values
df[param == "WEIGHTKG", p_plus := 1.05*v]
df[param == "WEIGHTKG", p_minus := .95*v]

df[param == "HEIGHTCM", p_plus := v+1]
df[param == "HEIGHTCM", p_minus := v-1]

df[param == "HEADCM", p_plus := v+1]
df[param == "HEADCM", p_minus := v-1]

#15B: smooth and recenter
df <- calc_and_recenter_z_scores(df, "p_plus", ref.data.path)
df <- calc_and_recenter_z_scores(df, "p_minus", ref.data.path)

#15c: exclude agedays = 0 for ht, hc
df <- df[(param != "HEIGHTCM" | param != "HEADCM") & agedays != 0, ]

#15d: calculate ewma -- run within a subject/parameter
df <- df[, exclude := (function(df_sub) {
# save initial exclusions to keep track
ind_all <- copy(df_sub$index)
exclude_all <- copy(df_sub$exclude)

testing <- TRUE

while (testing & nrow(df_sub) > 3){
df_sub[, (ewma.fields) := as.double(NaN)]

# first, calculate which exponent we want to put through (pass a different
# on for each exp)
# subset df to only valid rows

tmp <- data.frame(
"before" = abs(df_sub$agedays - c(NA, df_sub$agedays[1:(nrow(df_sub)-1)])),
"after" = abs(df_sub$agedays - c(df_sub$agedays[2:(nrow(df_sub))], NA))
)
maxdiff <- sapply(1:nrow(tmp), function(x){max(tmp[x,], na.rm = T)})
exp_vals <- rep(-1.5, nrow(tmp))
exp_vals[maxdiff > 365.25] <- -2.5
exp_vals[maxdiff > 730.5] <- -3.5
df_sub[, exp_vals := exp_vals]

# calculate ewma
df_sub[, (ewma.fields) := ewma(agedays, tbc.sd, exp_vals, TRUE)]
df_sub[, paste0("c.",ewma.fields) := ewma(agedays, ctbc.sd, exp_vals, TRUE)]

# calculate dewma
df_sub[, `:=`(
dewma.all = tbc.sd - ewma.all,
dewma.before = tbc.sd - ewma.before,
dewma.after = tbc.sd - ewma.after,

c.dewma.all = ctbc.sd - c.ewma.all
)]

# 15E: calculate prior and next differences
df_sub[, tbc_diff_next := tbc.sd - c(tbc.sd[2:nrow(df_sub)], NA)]
df_sub[, tbc_diff_prior := tbc.sd - c(NA, tbc.sd[1:(nrow(df_sub)-1)])]

df_sub[, tbc_diff_plus_next := tbc.p_plus - c(tbc.sd[2:nrow(df_sub)], NA)]
df_sub[, tbc_diff_plus_prior :=
tbc.p_plus- c(NA, tbc.sd[1:(nrow(df_sub)-1)])]

df_sub[, tbc_diff_minus_next := tbc.p_minus - c(tbc.sd[2:nrow(df_sub)], NA)]
df_sub[, tbc_diff_minus_prior :=
tbc.p_minus- c(NA, tbc.sd[1:(nrow(df_sub)-1)])]

# 15F: all exclusion criteria
df_sub[,
addcrithigh :=
dewma.before > 1 & dewma.after > 1 &
((tbc_diff_next > 1 & tbc_diff_plus_next > 1 & tbc_diff_minus_next > 1) | is.na(tbc_diff_next)) &
((tbc_diff_prior > 1 & tbc_diff_plus_prior > 1 & tbc_diff_minus_prior > 1) | is.na(tbc_diff_prior))
]
df_sub[,
addcritlow :=
dewma.before < -1 & dewma.after < -1 &
((tbc_diff_next < -1 & tbc_diff_plus_next < -1 & tbc_diff_minus_next < -1) | is.na(tbc_diff_next)) &
((tbc_diff_prior < -1 & tbc_diff_plus_prior < -1 & tbc_diff_minus_prior < -1) | is.na(tbc_diff_prior))
]

#15G: find all the values for the DOP
# find the comparison -- making sure to keep only valid rows
compare_df <- data.df[subjid == df_sub$subjid[1] &
param == get_dop(df_sub$param[1]) &
exclude == "Include",]
if (nrow(compare_df) > 0){
df_sub[compare_df, tbc_dop := i.tbc.sd,
on = c("agedays")]
df_sub[is.na(tbc_dop), tbc_dop := median(compare_df$tbc.sd)]
} else {
df_sub[, tbc_dop := NA]
}

#15H: all exclusions
# for ease, right now -- REMOVE
df_sub[, exclude := as.character(df_sub$exclude)]
df_sub$rowind <- 1:nrow(df_sub)

#a
df_sub[-c(1, nrow(df_sub)),][
dewma.all > 1 & (c.dewma.all > 1 | is.na(c.dewma.all)) & addcrithigh,
exclude := "Exclude-EWMA2-middle"]
df_sub[-c(1, nrow(df_sub)),][
dewma.all < -1 & (c.dewma.all < -1 | is.na(c.dewma.all)) & addcritlow,
exclude := "Exclude-EWMA2-middle"]

#b
df_sub[agedays == 0 & agedays[2] < 365.25 &
dewma.all > 3 & (c.dewma.all > 3 | is.na(c.dewma.all)) &
addcrithigh
, exclude := "Exclude-EWMA2-birth-WT"]
df_sub[agedays == 0 & agedays[2] < 365.25 &
dewma.all < -3 & (c.dewma.all < -3 | is.na(c.dewma.all)) &
addcritlow
, exclude := "Exclude-EWMA2-birth-WT"]

#c
df_sub[agedays == 0 & agedays[2] >= 365.25 &
dewma.all > 4 & (c.dewma.all > 4 | is.na(c.dewma.all)) &
addcrithigh
, exclude := "Exclude-EWMA-2-birth-WT-ext"]
df_sub[agedays == 0 & agedays[2] >= 365.25 &
dewma.all < -4 & (c.dewma.all < -4 | is.na(c.dewma.all)) &
addcritlow
, exclude := "Exclude-EWMA-2-birth-WT-ext"]

#d
df_sub[rowind == 1 & agedays[1] != 0 & agedays[2]-agedays[1] < 365.25 &
dewma.all > 2 & (c.dewma.all > 2 | is.na(c.dewma.all)) &
addcrithigh
, exclude := "Exclude-EWMA-2-first"]
df_sub[rowind == 1 & agedays[1] != 0 & agedays[2]-agedays[1] < 365.25 &
dewma.all < -2 & (c.dewma.all < -2 | is.na(c.dewma.all)) &
addcritlow
, exclude := "Exclude-EWMA-2-first"]

# e
df_sub[rowind == 1 & agedays[1] != 0 & agedays[2]-agedays[1] >= 365.25 &
dewma.all > 3 & (c.dewma.all > 3 | is.na(c.dewma.all)) &
addcrithigh
, exclude := "Exclude-EWMA-2-first-ext"]
df_sub[rowind == 1 & agedays[1] != 0 & agedays[2]-agedays[1] >= 365.25 &
dewma.all < -3 & (c.dewma.all < -3 | is.na(c.dewma.all)) &
addcritlow
, exclude := "Exclude-EWMA-2-first-ext"]

# f
df_sub[rowind == nrow(df_sub) &
agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] < 365.25*2 &
abs(tbc.sd[nrow(df_sub)-1]) < 2 &
dewma.all > 2 & (c.dewma.all > 2 | is.na(c.dewma.all)) &
addcrithigh
, exclude := "Exclude-EWMA2-last"]
df_sub[rowind == nrow(df_sub) &
agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] < 365.25*2 &
abs(tbc.sd[nrow(df_sub)-1]) < 2 &
dewma.all < -2 & (c.dewma.all < -2 | is.na(c.dewma.all)) &
addcritlow
, exclude := "Exclude-EWMA2-last"]

# g
df_sub[rowind == nrow(df_sub) &
agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] < 365.25*2 &
abs(tbc.sd[nrow(df_sub)-1]) >= 2 &
dewma.all > abs(tbc.sd[nrow(df_sub)-1]) &
(c.dewma.all > 3 | is.na(c.dewma.all)) &
addcrithigh
, exclude := "Exclude-EWMA2-last-high"]
df_sub[rowind == nrow(df_sub) &
agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] < 365.25*2 &
abs(tbc.sd[nrow(df_sub)-1]) >= 2 &
dewma.all < abs(tbc.sd[nrow(df_sub)-1]) &
(c.dewma.all < -3 | is.na(c.dewma.all)) &
addcritlow
, exclude := "Exclude-EWMA2-last-high"]

# h
df_sub[rowind == nrow(df_sub) &
agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] >= 365.25*2 &
abs(tbc.sd[nrow(df_sub)-1]) < 2 &
dewma.all > 3 &
(c.dewma.all > 3 | is.na(c.dewma.all)) &
(tbc.sd - tbc_dop > 4 | is.na(tbc_dop)) &
addcrithigh
, exclude := "Exclude-EWMA2-last-ext"]
df_sub[rowind == nrow(df_sub) &
agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] >= 365.25*2 &
abs(tbc.sd[nrow(df_sub)-1]) < 2 &
dewma.all < -3 &
(c.dewma.all <- 3 | is.na(c.dewma.all)) &
(tbc.sd - tbc_dop < -4 | is.na(tbc_dop)) &
addcritlow
, exclude := "Exclude-EWMA2-last-ext"]

# i
df_sub[rowind == nrow(df_sub) &
agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] >= 365.25*2 &
abs(tbc.sd[nrow(df_sub)-1]) >= 2 &
dewma.all > (1+abs(tbc.sd[nrow(df_sub)-1])) &
(c.dewma.all > 3 | is.na(c.dewma.all)) &
(tbc.sd - tbc_dop > 4 | is.na(tbc_dop)) &
addcrithigh
, exclude := "Exclude-EWMA2-last-ext-high"]
df_sub[rowind == nrow(df_sub) &
agedays[nrow(df_sub)]-agedays[nrow(df_sub)-1] >= 365.25*2 &
abs(tbc.sd[nrow(df_sub)-1]) >= 2 &
dewma.all < (-1-abs(tbc.sd[nrow(df_sub)-1])) &
(c.dewma.all <- 3 | is.na(c.dewma.all)) &
(tbc.sd - tbc_dop < -4 | is.na(tbc_dop)) &
addcritlow
, exclude := "Exclude-EWMA2-last-ext-high"]

# figure out if any of the exclusions hit
count_exclude <- sum(df_sub$exclude != "Include")
if (count_exclude > 0){
df_sub[, abssum := abs(tbc.sd + dewma.all)]

# choose the highest abssum for exclusion
idx <- df_sub$index[which.max(df_sub[df_sub$exclude != "Include",
abssum])]

exclude_all[ind_all == idx] <- df_sub[index == idx, exclude]

#set up to continue on
testing <- TRUE

df_sub <- df_sub[ind_all != idx, ]
} else {
testing <- FALSE
}
}
return(exclude_all)
})(copy(.SD)), , by = .(subjid, param), .SDcols = colnames(df)]

return(df$exclude)
})(copy(.SD)), .SDcols = colnames(data.df)]


# end ----

Expand Down
55 changes: 55 additions & 0 deletions R/infants_support.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,58 @@ calc_oob_evil_twins <- function(df){

return(df)
}

# moderate ewma ----

# function to calculate and recenter z scores for a given column
# df: data frame with parameter, agedays, sex, cn
# cn: column name to calculate, smooth, and recenter
# ref.data.path: reference data path
#
# returns df with additional column, tbc.(cn), which is the recentered z-score
# for the input
calc_and_recenter_z_scores <- function(df, cn, ref.data.path){
# for infants, use z and who
measurement.to.z <- read_anthro(ref.data.path, cdc.only = TRUE,
infants = T)
measurement.to.z_who <- read_anthro(ref.data.path, cdc.only = FALSE,
infants = T)

# calculate "standard deviation" scores
if (!quietly)
cat(sprintf("[%s] Calculating SD-scores...\n", Sys.time()))
df[, cn.orig_cdc := measurement.to.z(param, agedays, sex, get(cn), TRUE)]
df[, cn.orig_who := measurement.to.z_who(param, agedays, sex, get(cn), TRUE)]

# smooth z-scores/SD scores between ages 1 - 3yo using weighted scores
# older uses cdc, younger uses who
who_weight <- 4 - (df$agedays/365.25)
cdc_weight <- (df$agedays/365.25) - 2

smooth_val <- df$agedays/365.25 >= 2 &
df$agedays/365.25 <= 4 &
df$param != "HEADCM"
df[smooth_val,
cn.orig := (cn.orig_cdc[smooth_val]*cdc_weight[smooth_val] +
cn.orig_who[smooth_val]*who_weight[smooth_val])/2]

# otherwise use WHO and CDC for older and younger, respectively
who_val <- df$param == "HEADCM" |
df$agedays/365.25 < 2
df[who_val | (smooth_val & is.na(df$cn.orig_cdc)),
cn.orig := df$cn.orig_who[who_val | (smooth_val & is.na(df$cn.orig_cdc))]]

cdc_val <- df$param != "HEADCM" |
df$agedays/365.25 > 4
df[cdc_val | (smooth_val & is.na(df$cn.orig_who)),
cn.orig := df$cn.orig_cdc[cdc_val | (smooth_val & is.na(df$sd.orig_who))]]

# now recenter -- already has the sd.median from the original recentering
setkey(df, subjid, param, agedays)
df[, tbc.cn := cn.orig - sd.median]

# rename ending column
setnames(df, "tbc.cn", paste0("tbc.", cn))

return(df)
}

0 comments on commit 87f1942

Please sign in to comment.