Library and Data

library(survival)
library(tidyverse)
library(penalized)

source("../R/build_feature.R")
source("../R/lasso_helper.R")
source("../R/cox_helper.R")

library(readxl)
raw_data <- read_excel("../data/Aim3-early-adversity-subjects-F-survival-2020-10-09.xlsx")
raw_data_larger <- read_excel("../data/Aim3-early-adversity-subjects-F-survival-2021-01-07.xlsx")

EDA

Preprocess

data <- preprocess(raw_data)
data_larger <- preprocess(raw_data_larger)
data_larger_sub <- data_larger[!data_larger$age_less_than_1, ]

The process is

  • run full model with all the main effects of six adversities
  • run cross-validation lasso to pick the optimal penalization parameter
  • run lasso with optimal penalization parameter and pick out the covariates
  • run cox model with the selected covariates

Results

Smaller dataset

Full Model with main effects

varnames <- names(data)[startsWith(names(data), "adv_")][-7]
full_formula <- paste("Surv(data$age, data$dead) ~", paste0(varnames, collapse = "+ "))

full <- coxph(
  as.formula(full_formula),
  data = data
)

round(summary(full)$coef, 3)[, -3]
##                    coef exp(coef)      z Pr(>|z|)
## adv_densityTRUE  -0.608     0.545 -1.617    0.106
## adv_rainTRUE     -0.110     0.896 -0.394    0.693
## adv_momTRUE       0.734     2.084  3.531    0.000
## adv_sibTRUE       0.135     1.145  0.576    0.565
## adv_mom_rankTRUE  0.088     1.092  0.404    0.686
## adv_mom_dsiTRUE  -0.103     0.902 -0.477    0.633

The candidate interaction terms used in lasso are adv_mom:<others>.

penalized_covariates <- get_penalized_covariates(data = data, below_one = TRUE)

opt_lambda <- get_opt_lambda_of_lasso(
  data = data,
  penalized_covariates = penalized_covariates
)

final <- penalized(
  Surv(data$age, data$dead),
  penalized = penalized_covariates,
  standardize = T,
  lambda1 = opt_lambda,
  trace = F
)

nonzero_varnames <- names(final@penalized)[final@penalized != 0]

cox <- run_cox(
  data = data,
  nonzero_varnames = nonzero_varnames,
  penalized_covariates = penalized_covariates
)

round(summary(cox)$coef, 3)[, -3]
##                                    coef exp(coef)      z Pr(>|z|)
## adv_momTRUE                       0.802     2.230  3.197    0.001
## adv_densityTRUE:adv_mom_dsiTRUE -16.629     0.000 -0.006    0.995
## adv_densityTRUE:adv_cumulative   -0.187     0.830 -0.831    0.406
## adv_momTRUE:adv_rainTRUE         -0.839     0.432 -1.531    0.126
## adv_momTRUE:adv_mom_dsiTRUE       0.320     1.377  0.852    0.394

Larger dataset (complete)

full_formula <- paste("Surv(data_larger$age, data_larger$dead) ~", paste0(varnames, collapse = "+ "))

full_larger <- coxph(
  as.formula(full_formula),
  data = data_larger
)

round(summary(full_larger)$coef, 3)[, -3]
##                    coef exp(coef)      z Pr(>|z|)
## adv_densityTRUE  -0.012     0.988 -0.085    0.932
## adv_rainTRUE     -0.198     0.821 -1.071    0.284
## adv_momTRUE       0.629     1.876  5.181    0.000
## adv_sibTRUE       0.889     2.434  7.388    0.000
## adv_mom_rankTRUE  0.189     1.207  1.438    0.150
## adv_mom_dsiTRUE  -0.235     0.791 -1.699    0.089

The candidate interaction terms used in lasso are adv_mom:<others> and adv_sib:<others>.

penalized_covariates_larger <- get_penalized_covariates(data = data_larger, below_one = FALSE)

opt_lambda_larger <- get_opt_lambda_of_lasso(
  data = data_larger,
  penalized_covariates = penalized_covariates_larger
)

final_larger <- penalized(
  Surv(data_larger$age, data_larger$dead),
  penalized = penalized_covariates_larger,
  standardize = T,
  lambda1 = opt_lambda_larger,
  trace = F
)

nonzero_varnames_larger <- names(final_larger@penalized)[final_larger@penalized != 0]

cox_larger <- run_cox(
  data = data_larger,
  nonzero_varnames = nonzero_varnames_larger,
  penalized_covariates = penalized_covariates_larger
)

round(summary(cox_larger)$coef, 3)[, -3]
##                                 coef exp(coef)      z Pr(>|z|)
## adv_momTRUE                    0.476     1.610  3.199    0.001
## adv_sibTRUE                    0.504     1.656  1.793    0.073
## adv_mom_dsiTRUE               -0.265     0.767 -1.845    0.065
## adv_rainTRUE:adv_mom_rankTRUE -0.836     0.434 -1.803    0.071
## adv_momTRUE:adv_mom_rankTRUE   0.417     1.518  1.716    0.086
## adv_sibTRUE:adv_mom_rankTRUE   0.150     1.161  0.607    0.544
## adv_sibTRUE:adv_cumulative     0.186     1.205  1.365    0.172

Larger dataset (age > 1)

full_formula <- paste("Surv(data_larger_sub$age, data_larger_sub$dead) ~", paste0(varnames, collapse = "+ "))

full_larger_sub <- coxph(
  as.formula(full_formula),
  data = data_larger_sub
)

round(summary(full_larger_sub)$coef, 3)[, -3]
##                    coef exp(coef)      z Pr(>|z|)
## adv_densityTRUE   0.079     1.082  0.438    0.662
## adv_rainTRUE     -0.345     0.708 -1.519    0.129
## adv_momTRUE       0.764     2.147  4.909    0.000
## adv_sibTRUE       0.088     1.092  0.481    0.630
## adv_mom_rankTRUE  0.230     1.258  1.352    0.176
## adv_mom_dsiTRUE  -0.444     0.641 -2.534    0.011

The candidate interaction terms used in lasso are adv_mom:<others> and adv_mom_dsi:<others>.

penalized_covariates_larger_sub <- get_penalized_covariates(data = data_larger_sub, below_one = FALSE)

opt_lambda_larger_sub <- get_opt_lambda_of_lasso(
  data = data_larger_sub,
  penalized_covariates = penalized_covariates_larger_sub
)

final_larger_sub <- penalized(
  Surv(data_larger_sub$age, data_larger_sub$dead),
  penalized = penalized_covariates_larger_sub,
  standardize = T,
  lambda1 = opt_lambda_larger_sub,
  trace = F
)

nonzero_varnames_larger_sub <- names(final_larger_sub@penalized)[final_larger_sub@penalized != 0]

cox_larger_sub <- run_cox(
  data = data_larger_sub,
  nonzero_varnames = nonzero_varnames_larger_sub,
  penalized_covariates = penalized_covariates_larger_sub
)

round(summary(cox_larger_sub)$coef, 3)[, -3]
##                                coef exp(coef)      z Pr(>|z|)
## adv_momTRUE                   0.801     2.229  4.527    0.000
## adv_mom_dsiTRUE              -0.601     0.548 -3.206    0.001
## adv_momTRUE:adv_rainTRUE     -1.322     0.267 -3.047    0.002
## adv_rainTRUE:adv_sibTRUE      1.076     2.932  2.289    0.022
## adv_mom_dsiTRUE:adv_rainTRUE  1.162     3.197  2.462    0.014
## adv_momTRUE:adv_mom_rankTRUE  0.650     1.915  2.184    0.029