Lasso and Cox Regression
Library and Data
library(survival)
library(tidyverse)
library(penalized)
library(latex2exp)
library(readxl)
raw_data <- read_excel("../data/Aim3-early-adversity-subjects-F-survival-2020-10-09.xlsx")EDA
# sum(is.na(raw_data))
data <- raw_data %>%
mutate(sex = factor(sex),
matgrp = factor(matgrp),
status = factor(status))
summary(data)## sname birth sex matgrp
## Length:300 Min. :1982-12-24 00:00:00 F:300 1.22 :46
## Class :character 1st Qu.:1997-01-22 12:00:00 1.1 :45
## Mode :character Median :2003-12-22 12:00:00 2.2 :41
## Mean :2002-05-11 19:50:24 2 :37
## 3rd Qu.:2010-05-10 00:00:00 2.1 :37
## Max. :2015-12-30 00:00:00 1 :29
## (Other):65
## statdate status age dead
## Min. :1987-12-21 00:00:00 0:122 Min. : 4.003 Mode :logical
## 1st Qu.:2009-08-29 18:00:00 1:123 1st Qu.: 7.192 FALSE:177
## Median :2018-03-10 12:00:00 3: 55 Median :10.198 TRUE :123
## Mean :2014-04-09 02:52:48 Mean :11.737
## 3rd Qu.:2020-06-27 00:00:00 3rd Qu.:16.005
## Max. :2020-06-29 00:00:00 Max. :26.741
##
## adv_density adv_rain adv_mom adv_sib
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:241 FALSE:255 FALSE:236 FALSE:245
## TRUE :59 TRUE :45 TRUE :64 TRUE :55
##
##
##
##
## adv_mom_rank adv_mom_dsi adv_cumulative
## Mode :logical Mode :logical Min. :0.000
## FALSE:227 FALSE:219 1st Qu.:1.000
## TRUE :73 TRUE :81 Median :1.000
## Mean :1.257
## 3rd Qu.:2.000
## Max. :5.000
##
- No
NAvalues - Response (survival time):
age - Categorical variables (3):
sex: only contains Femalematgrp: social group of mothersstatus: 0, alive; 1, dead; 3, dropped out due to observer events
- Logical variables (6):
dead,adv_density,adv_rain,adv_mom,adv_sib,adv_mom_rank,adv_mom_dsi - Date (2):
birth,statdate - Numerical variables (1):
adv_cumulative - Other (1):
sname
sex is the same for all samples so there is no need to include it. Here I would treat the status = 3 as non-informative censoring so that dead is included as censoring variable. The date does not seem useful at this stage since I am not going to consider temporal information. Also, sname is not informative as raw string type.
Hence, the censoring variable is dead and the predictors I use are
- six adversities,
adv_cumulative - interactions:
adv_mom:other adversity,adv_rain:other adversity
data_sub <- data %>% select(starts_with("adv_"))
# interactions selected are between adv_mom:other, and adv_rain:other
formula <- "~ . + adv_mom:(.-adv_cumulative) + adv_rain:(.-adv_cumulative)"
penalized_covariates <- model.matrix(as.formula(formula) , data = data.frame(data_sub))
# remove intercept ready for penalized covariates
penalized_covariates <- penalized_covariates[, colnames(penalized_covariates) != "(Intercept)"]Lasso
set.seed(42)
cv <- profL1(
Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
fold = 5,
minlambda1 = 5,
maxlambda1 = 25,
trace = F
)
set.seed(42)
opt <- optL1(
Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
fold = 5,
trace = F
)Stepwise Lasso Fit
stepwise <- penalized(
Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
steps = 20,
lambda1 = 1,
trace = F
)
plotpath(
stepwise,
labelsize = 0.8,
standardize = T
)
abline(
v = opt$lambda,
col = "gray",
lwd = 2
)Nonzero covariates
final <- penalized(
Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
lambda1 = opt$lambda,
trace = F
)
nonzero_varnames <- names(final@penalized)[final@penalized != 0]
nonzero_varnames## [1] "adv_densityTRUE" "adv_momTRUE"
## [3] "adv_rainTRUE:adv_momTRUE" "adv_momTRUE:adv_sibTRUE"
## [5] "adv_momTRUE:adv_mom_dsiTRUE"
Variable selection by lasso picks out 5 covariates: adv_density, adv_mom, adv_rain:adv_mom, adv_mom:adv_sib, and adv_mom:adv_mom_ds.
Cox Regression
formula = paste("Surv(data$age, data$dead) ~", paste0(nonzero_varnames, collapse = "+ "))
cox <- coxph(
as.formula(formula),
data = data.frame(penalized_covariates)
)
summary(cox)## Call:
## coxph(formula = as.formula(formula), data = data.frame(penalized_covariates))
##
## n= 300, number of events= 123
##
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.5734 0.5636 0.3748 -1.530 0.1260
## adv_momTRUE 0.6928 1.9993 0.2861 2.422 0.0154 *
## adv_momTRUE:adv_rainTRUE -0.7772 0.4597 0.5511 -1.410 0.1585
## adv_momTRUE:adv_sibTRUE 0.4074 1.5029 0.4233 0.963 0.3358
## adv_momTRUE:adv_mom_dsiTRUE 0.3029 1.3538 0.3849 0.787 0.4313
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## adv_densityTRUE 0.5636 1.7743 0.2703 1.175
## adv_momTRUE 1.9993 0.5002 1.1412 3.503
## adv_momTRUE:adv_rainTRUE 0.4597 2.1754 0.1561 1.354
## adv_momTRUE:adv_sibTRUE 1.5029 0.6654 0.6556 3.445
## adv_momTRUE:adv_mom_dsiTRUE 1.3538 0.7387 0.6366 2.879
##
## Concordance= 0.606 (se = 0.026 )
## Likelihood ratio test= 19.04 on 5 df, p=0.002
## Wald test = 21.55 on 5 df, p=6e-04
## Score (logrank) test = 23.38 on 5 df, p=3e-04
Rerun with composite adversity
"“LASSO pick up five terms (two main effects and three interactions (all with maternal loss), but none of the interaction term is significant.I was wondering if you can create a composite variable of whether there is one adversity besides mom death (sum of all other adversities>0), and rerun the analysis with the two main effects and the interaction between mom death and this composite adversity.”"
data_composite <- raw_data %>%
mutate(adv_composite = adv_rain+adv_sib+adv_mom_dsi) %>%
select(adv_density, adv_mom, adv_composite)
cox_composite <- coxph(
Surv(data$age, data$dead) ~ adv_density + adv_mom + adv_mom*adv_composite,
data = data_composite
)
summary(cox_composite)## Call:
## coxph(formula = Surv(data$age, data$dead) ~ adv_density + adv_mom +
## adv_mom * adv_composite, data = data_composite)
##
## n= 300, number of events= 123
##
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.64269 0.52588 0.37330 -1.722 0.0851 .
## adv_momTRUE 0.66552 1.94550 0.30018 2.217 0.0266 *
## adv_composite -0.05055 0.95071 0.18456 -0.274 0.7842
## adv_momTRUE:adv_composite 0.08668 1.09055 0.34477 0.251 0.8015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## adv_densityTRUE 0.5259 1.902 0.2530 1.093
## adv_momTRUE 1.9455 0.514 1.0802 3.504
## adv_composite 0.9507 1.052 0.6621 1.365
## adv_momTRUE:adv_composite 1.0906 0.917 0.5549 2.143
##
## Concordance= 0.604 (se = 0.03 )
## Likelihood ratio test= 14.64 on 4 df, p=0.006
## Wald test = 15.36 on 4 df, p=0.004
## Score (logrank) test = 16.02 on 4 df, p=0.003