Lasso and Cox Regression - Larger Dataset
Library and Data
library(survival)
library(tidyverse)
library(penalized)
source("../R/build_feature.R")
source("../R/lasso_helper.R")
source("../R/cox_helper.R")
library(readxl)
<- read_excel("../data/Aim3-early-adversity-subjects-F-survival-2020-10-09.xlsx")
raw_data <- read_excel("../data/Aim3-early-adversity-subjects-F-survival-2021-01-07.xlsx") raw_data_larger
EDA
Preprocess
<- preprocess(raw_data)
data <- preprocess(raw_data_larger)
data_larger <- data_larger[!data_larger$age_less_than_1, ] data_larger_sub
The process is
- run full model with all the main effects of six adversities
- run cross-validation lasso to pick the optimal penalization parameter
- run lasso with optimal penalization parameter and pick out the covariates
- run cox model with the selected covariates
Results
Smaller dataset
Full Model with main effects
<- names(data)[startsWith(names(data), "adv_")][-7]
varnames <- paste("Surv(data$age, data$dead) ~", paste0(varnames, collapse = "+ "))
full_formula
<- coxph(
full as.formula(full_formula),
data = data
)
round(summary(full)$coef, 3)[, -3]
## coef exp(coef) z Pr(>|z|)
## adv_densityTRUE -0.608 0.545 -1.617 0.106
## adv_rainTRUE -0.110 0.896 -0.394 0.693
## adv_momTRUE 0.734 2.084 3.531 0.000
## adv_sibTRUE 0.135 1.145 0.576 0.565
## adv_mom_rankTRUE 0.088 1.092 0.404 0.686
## adv_mom_dsiTRUE -0.103 0.902 -0.477 0.633
The candidate interaction terms used in lasso are adv_mom:<others>
.
<- get_penalized_covariates(data = data, below_one = TRUE)
penalized_covariates
<- get_opt_lambda_of_lasso(
opt_lambda data = data,
penalized_covariates = penalized_covariates
)
<- penalized(
final Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
lambda1 = opt_lambda,
trace = F
)
<- names(final@penalized)[final@penalized != 0]
nonzero_varnames
<- run_cox(
cox data = data,
nonzero_varnames = nonzero_varnames,
penalized_covariates = penalized_covariates
)
round(summary(cox)$coef, 3)[, -3]
## coef exp(coef) z Pr(>|z|)
## adv_momTRUE 0.802 2.230 3.197 0.001
## adv_densityTRUE:adv_mom_dsiTRUE -16.629 0.000 -0.006 0.995
## adv_densityTRUE:adv_cumulative -0.187 0.830 -0.831 0.406
## adv_momTRUE:adv_rainTRUE -0.839 0.432 -1.531 0.126
## adv_momTRUE:adv_mom_dsiTRUE 0.320 1.377 0.852 0.394
Larger dataset (complete)
<- paste("Surv(data_larger$age, data_larger$dead) ~", paste0(varnames, collapse = "+ "))
full_formula
<- coxph(
full_larger as.formula(full_formula),
data = data_larger
)
round(summary(full_larger)$coef, 3)[, -3]
## coef exp(coef) z Pr(>|z|)
## adv_densityTRUE -0.012 0.988 -0.085 0.932
## adv_rainTRUE -0.198 0.821 -1.071 0.284
## adv_momTRUE 0.629 1.876 5.181 0.000
## adv_sibTRUE 0.889 2.434 7.388 0.000
## adv_mom_rankTRUE 0.189 1.207 1.438 0.150
## adv_mom_dsiTRUE -0.235 0.791 -1.699 0.089
The candidate interaction terms used in lasso are adv_mom:<others>
and adv_sib:<others>
.
<- get_penalized_covariates(data = data_larger, below_one = FALSE)
penalized_covariates_larger
<- get_opt_lambda_of_lasso(
opt_lambda_larger data = data_larger,
penalized_covariates = penalized_covariates_larger
)
<- penalized(
final_larger Surv(data_larger$age, data_larger$dead),
penalized = penalized_covariates_larger,
standardize = T,
lambda1 = opt_lambda_larger,
trace = F
)
<- names(final_larger@penalized)[final_larger@penalized != 0]
nonzero_varnames_larger
<- run_cox(
cox_larger data = data_larger,
nonzero_varnames = nonzero_varnames_larger,
penalized_covariates = penalized_covariates_larger
)
round(summary(cox_larger)$coef, 3)[, -3]
## coef exp(coef) z Pr(>|z|)
## adv_momTRUE 0.476 1.610 3.199 0.001
## adv_sibTRUE 0.504 1.656 1.793 0.073
## adv_mom_dsiTRUE -0.265 0.767 -1.845 0.065
## adv_rainTRUE:adv_mom_rankTRUE -0.836 0.434 -1.803 0.071
## adv_momTRUE:adv_mom_rankTRUE 0.417 1.518 1.716 0.086
## adv_sibTRUE:adv_mom_rankTRUE 0.150 1.161 0.607 0.544
## adv_sibTRUE:adv_cumulative 0.186 1.205 1.365 0.172
Larger dataset (age > 1)
<- paste("Surv(data_larger_sub$age, data_larger_sub$dead) ~", paste0(varnames, collapse = "+ "))
full_formula
<- coxph(
full_larger_sub as.formula(full_formula),
data = data_larger_sub
)
round(summary(full_larger_sub)$coef, 3)[, -3]
## coef exp(coef) z Pr(>|z|)
## adv_densityTRUE 0.079 1.082 0.438 0.662
## adv_rainTRUE -0.345 0.708 -1.519 0.129
## adv_momTRUE 0.764 2.147 4.909 0.000
## adv_sibTRUE 0.088 1.092 0.481 0.630
## adv_mom_rankTRUE 0.230 1.258 1.352 0.176
## adv_mom_dsiTRUE -0.444 0.641 -2.534 0.011
The candidate interaction terms used in lasso are adv_mom:<others>
and adv_mom_dsi:<others>
.
<- get_penalized_covariates(data = data_larger_sub, below_one = FALSE)
penalized_covariates_larger_sub
<- get_opt_lambda_of_lasso(
opt_lambda_larger_sub data = data_larger_sub,
penalized_covariates = penalized_covariates_larger_sub
)
<- penalized(
final_larger_sub Surv(data_larger_sub$age, data_larger_sub$dead),
penalized = penalized_covariates_larger_sub,
standardize = T,
lambda1 = opt_lambda_larger_sub,
trace = F
)
<- names(final_larger_sub@penalized)[final_larger_sub@penalized != 0]
nonzero_varnames_larger_sub
<- run_cox(
cox_larger_sub data = data_larger_sub,
nonzero_varnames = nonzero_varnames_larger_sub,
penalized_covariates = penalized_covariates_larger_sub
)
round(summary(cox_larger_sub)$coef, 3)[, -3]
## coef exp(coef) z Pr(>|z|)
## adv_momTRUE 0.801 2.229 4.527 0.000
## adv_mom_dsiTRUE -0.601 0.548 -3.206 0.001
## adv_momTRUE:adv_rainTRUE -1.322 0.267 -3.047 0.002
## adv_rainTRUE:adv_sibTRUE 1.076 2.932 2.289 0.022
## adv_mom_dsiTRUE:adv_rainTRUE 1.162 3.197 2.462 0.014
## adv_momTRUE:adv_mom_rankTRUE 0.650 1.915 2.184 0.029