Lasso and Cox Regression
Library and Data
library(survival)
library(tidyverse)
library(penalized)
library(latex2exp)
library(readxl)
<- read_excel("../data/Aim3-early-adversity-subjects-F-survival-2020-10-09.xlsx") raw_data
EDA
# sum(is.na(raw_data))
<- raw_data %>%
data mutate(sex = factor(sex),
matgrp = factor(matgrp),
status = factor(status))
summary(data)
## sname birth sex matgrp
## Length:300 Min. :1982-12-24 00:00:00 F:300 1.22 :46
## Class :character 1st Qu.:1997-01-22 12:00:00 1.1 :45
## Mode :character Median :2003-12-22 12:00:00 2.2 :41
## Mean :2002-05-11 19:50:24 2 :37
## 3rd Qu.:2010-05-10 00:00:00 2.1 :37
## Max. :2015-12-30 00:00:00 1 :29
## (Other):65
## statdate status age dead
## Min. :1987-12-21 00:00:00 0:122 Min. : 4.003 Mode :logical
## 1st Qu.:2009-08-29 18:00:00 1:123 1st Qu.: 7.192 FALSE:177
## Median :2018-03-10 12:00:00 3: 55 Median :10.198 TRUE :123
## Mean :2014-04-09 02:52:48 Mean :11.737
## 3rd Qu.:2020-06-27 00:00:00 3rd Qu.:16.005
## Max. :2020-06-29 00:00:00 Max. :26.741
##
## adv_density adv_rain adv_mom adv_sib
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:241 FALSE:255 FALSE:236 FALSE:245
## TRUE :59 TRUE :45 TRUE :64 TRUE :55
##
##
##
##
## adv_mom_rank adv_mom_dsi adv_cumulative
## Mode :logical Mode :logical Min. :0.000
## FALSE:227 FALSE:219 1st Qu.:1.000
## TRUE :73 TRUE :81 Median :1.000
## Mean :1.257
## 3rd Qu.:2.000
## Max. :5.000
##
- No
NA
values - Response (survival time):
age
- Categorical variables (3):
sex
: only contains Femalematgrp
: social group of mothersstatus
: 0, alive; 1, dead; 3, dropped out due to observer events
- Logical variables (6):
dead
,adv_density
,adv_rain
,adv_mom
,adv_sib
,adv_mom_rank
,adv_mom_dsi
- Date (2):
birth
,statdate
- Numerical variables (1):
adv_cumulative
- Other (1):
sname
sex
is the same for all samples so there is no need to include it. Here I would treat the status
= 3 as non-informative censoring so that dead
is included as censoring variable. The date does not seem useful at this stage since I am not going to consider temporal information. Also, sname
is not informative as raw string type.
Hence, the censoring variable is dead
and the predictors I use are
- six adversities,
adv_cumulative
- interactions:
adv_mom
:other adversity
,adv_rain
:other adversity
<- data %>% select(starts_with("adv_"))
data_sub
# interactions selected are between adv_mom:other, and adv_rain:other
<- "~ . + adv_mom:(.-adv_cumulative) + adv_rain:(.-adv_cumulative)"
formula <- model.matrix(as.formula(formula) , data = data.frame(data_sub))
penalized_covariates
# remove intercept ready for penalized covariates
<- penalized_covariates[, colnames(penalized_covariates) != "(Intercept)"] penalized_covariates
Lasso
set.seed(42)
<- profL1(
cv Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
fold = 5,
minlambda1 = 5,
maxlambda1 = 25,
trace = F
)
set.seed(42)
<- optL1(
opt Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
fold = 5,
trace = F
)
Stepwise Lasso Fit
<- penalized(
stepwise Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
steps = 20,
lambda1 = 1,
trace = F
)
plotpath(
stepwise,labelsize = 0.8,
standardize = T
)
abline(
v = opt$lambda,
col = "gray",
lwd = 2
)
Nonzero covariates
<- penalized(
final Surv(data$age, data$dead),
penalized = penalized_covariates,
standardize = T,
lambda1 = opt$lambda,
trace = F
)<- names(final@penalized)[final@penalized != 0]
nonzero_varnames nonzero_varnames
## [1] "adv_densityTRUE" "adv_momTRUE"
## [3] "adv_rainTRUE:adv_momTRUE" "adv_momTRUE:adv_sibTRUE"
## [5] "adv_momTRUE:adv_mom_dsiTRUE"
Variable selection by lasso picks out 5 covariates: adv_density
, adv_mom
, adv_rain:adv_mom
, adv_mom:adv_sib
, and adv_mom:adv_mom_ds
.
Cox Regression
= paste("Surv(data$age, data$dead) ~", paste0(nonzero_varnames, collapse = "+ "))
formula
<- coxph(
cox as.formula(formula),
data = data.frame(penalized_covariates)
)
summary(cox)
## Call:
## coxph(formula = as.formula(formula), data = data.frame(penalized_covariates))
##
## n= 300, number of events= 123
##
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.5734 0.5636 0.3748 -1.530 0.1260
## adv_momTRUE 0.6928 1.9993 0.2861 2.422 0.0154 *
## adv_momTRUE:adv_rainTRUE -0.7772 0.4597 0.5511 -1.410 0.1585
## adv_momTRUE:adv_sibTRUE 0.4074 1.5029 0.4233 0.963 0.3358
## adv_momTRUE:adv_mom_dsiTRUE 0.3029 1.3538 0.3849 0.787 0.4313
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## adv_densityTRUE 0.5636 1.7743 0.2703 1.175
## adv_momTRUE 1.9993 0.5002 1.1412 3.503
## adv_momTRUE:adv_rainTRUE 0.4597 2.1754 0.1561 1.354
## adv_momTRUE:adv_sibTRUE 1.5029 0.6654 0.6556 3.445
## adv_momTRUE:adv_mom_dsiTRUE 1.3538 0.7387 0.6366 2.879
##
## Concordance= 0.606 (se = 0.026 )
## Likelihood ratio test= 19.04 on 5 df, p=0.002
## Wald test = 21.55 on 5 df, p=6e-04
## Score (logrank) test = 23.38 on 5 df, p=3e-04
Rerun with composite adversity
"“LASSO pick up five terms (two main effects and three interactions (all with maternal loss), but none of the interaction term is significant.I was wondering if you can create a composite variable of whether there is one adversity besides mom death (sum of all other adversities>0), and rerun the analysis with the two main effects and the interaction between mom death and this composite adversity.”"
<- raw_data %>%
data_composite mutate(adv_composite = adv_rain+adv_sib+adv_mom_dsi) %>%
select(adv_density, adv_mom, adv_composite)
<- coxph(
cox_composite Surv(data$age, data$dead) ~ adv_density + adv_mom + adv_mom*adv_composite,
data = data_composite
)
summary(cox_composite)
## Call:
## coxph(formula = Surv(data$age, data$dead) ~ adv_density + adv_mom +
## adv_mom * adv_composite, data = data_composite)
##
## n= 300, number of events= 123
##
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.64269 0.52588 0.37330 -1.722 0.0851 .
## adv_momTRUE 0.66552 1.94550 0.30018 2.217 0.0266 *
## adv_composite -0.05055 0.95071 0.18456 -0.274 0.7842
## adv_momTRUE:adv_composite 0.08668 1.09055 0.34477 0.251 0.8015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## adv_densityTRUE 0.5259 1.902 0.2530 1.093
## adv_momTRUE 1.9455 0.514 1.0802 3.504
## adv_composite 0.9507 1.052 0.6621 1.365
## adv_momTRUE:adv_composite 1.0906 0.917 0.5549 2.143
##
## Concordance= 0.604 (se = 0.03 )
## Likelihood ratio test= 14.64 on 4 df, p=0.006
## Wald test = 15.36 on 4 df, p=0.004
## Score (logrank) test = 16.02 on 4 df, p=0.003