Lasso and Cox Regression

Library and Data

library(survival)
library(tidyverse)
library(penalized)
library(latex2exp)

library(readxl)
raw_data <- read_excel("../data/Aim3-early-adversity-subjects-F-survival-2020-10-09.xlsx")

EDA

# sum(is.na(raw_data))

data <- raw_data %>%
  mutate(sex = factor(sex),
         matgrp = factor(matgrp),
         status = factor(status))

summary(data)

##     sname               birth                     sex         matgrp  
##  Length:300         Min.   :1982-12-24 00:00:00   F:300   1.22   :46  
##  Class :character   1st Qu.:1997-01-22 12:00:00           1.1    :45  
##  Mode  :character   Median :2003-12-22 12:00:00           2.2    :41  
##                     Mean   :2002-05-11 19:50:24           2      :37  
##                     3rd Qu.:2010-05-10 00:00:00           2.1    :37  
##                     Max.   :2015-12-30 00:00:00           1      :29  
##                                                           (Other):65  
##     statdate                   status       age            dead        
##  Min.   :1987-12-21 00:00:00   0:122   Min.   : 4.003   Mode :logical  
##  1st Qu.:2009-08-29 18:00:00   1:123   1st Qu.: 7.192   FALSE:177      
##  Median :2018-03-10 12:00:00   3: 55   Median :10.198   TRUE :123      
##  Mean   :2014-04-09 02:52:48           Mean   :11.737                  
##  3rd Qu.:2020-06-27 00:00:00           3rd Qu.:16.005                  
##  Max.   :2020-06-29 00:00:00           Max.   :26.741                  
##                                                                        
##  adv_density      adv_rain        adv_mom         adv_sib       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:241       FALSE:255       FALSE:236       FALSE:245      
##  TRUE :59        TRUE :45        TRUE :64        TRUE :55       
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  adv_mom_rank    adv_mom_dsi     adv_cumulative 
##  Mode :logical   Mode :logical   Min.   :0.000  
##  FALSE:227       FALSE:219       1st Qu.:1.000  
##  TRUE :73        TRUE :81        Median :1.000  
##                                  Mean   :1.257  
##                                  3rd Qu.:2.000  
##                                  Max.   :5.000  
##

No NA values
Response (survival time): age
Categorical variables (3):
1. sex: only contains Female
2. matgrp: social group of mothers
3. status: 0, alive; 1, dead; 3, dropped out due to observer events
Logical variables (6): dead, adv_density, adv_rain, adv_mom, adv_sib, adv_mom_rank, adv_mom_dsi
Date (2): birth, statdate
Numerical variables (1): adv_cumulative
Other (1): sname

sex is the same for all samples so there is no need to include it. Here I would treat the status = 3 as non-informative censoring so that dead is included as censoring variable. The date does not seem useful at this stage since I am not going to consider temporal information. Also, sname is not informative as raw string type.

Hence, the censoring variable is dead and the predictors I use are

six adversities, adv_cumulative
interactions: adv_mom:other adversity, adv_rain:other adversity

data_sub <- data %>% select(starts_with("adv_"))

# interactions selected are between adv_mom:other, and adv_rain:other
formula <- "~ . + adv_mom:(.-adv_cumulative) + adv_rain:(.-adv_cumulative)"
penalized_covariates <- model.matrix(as.formula(formula) , data = data.frame(data_sub))

# remove intercept ready for penalized covariates
penalized_covariates <- penalized_covariates[, colnames(penalized_covariates) != "(Intercept)"]

Lasso

set.seed(42)

cv <- profL1(
  Surv(data$age, data$dead),
  penalized = penalized_covariates,
  standardize = T,
  fold = 5,
  minlambda1 = 5,
  maxlambda1 = 25,
  trace = F
)

set.seed(42)
opt <- optL1(
  Surv(data$age, data$dead),
  penalized = penalized_covariates,
  standardize = T,
  fold = 5,
  trace = F
)

Stepwise Lasso Fit

stepwise <- penalized(
  Surv(data$age, data$dead),
  penalized = penalized_covariates,
  standardize = T,
  steps = 20,
  lambda1 = 1,
  trace = F
)

plotpath(
  stepwise,
  labelsize = 0.8,
  standardize = T
)

abline(
  v = opt$lambda,
  col = "gray",
  lwd = 2
)

Nonzero covariates

final <- penalized(
  Surv(data$age, data$dead),
  penalized = penalized_covariates,
  standardize = T,
  lambda1 = opt$lambda,
  trace = F
)
nonzero_varnames <- names(final@penalized)[final@penalized != 0]
nonzero_varnames

## [1] "adv_densityTRUE"             "adv_momTRUE"                
## [3] "adv_rainTRUE:adv_momTRUE"    "adv_momTRUE:adv_sibTRUE"    
## [5] "adv_momTRUE:adv_mom_dsiTRUE"

Variable selection by lasso picks out 5 covariates: adv_density, adv_mom, adv_rain:adv_mom, adv_mom:adv_sib, and adv_mom:adv_mom_ds.

Cox Regression

formula = paste("Surv(data$age, data$dead) ~", paste0(nonzero_varnames, collapse = "+ "))

cox <- coxph(
  as.formula(formula),
  data = data.frame(penalized_covariates)
)

summary(cox)

## Call:
## coxph(formula = as.formula(formula), data = data.frame(penalized_covariates))
## 
##   n= 300, number of events= 123 
## 
##                                coef exp(coef) se(coef)      z Pr(>|z|)  
## adv_densityTRUE             -0.5734    0.5636   0.3748 -1.530   0.1260  
## adv_momTRUE                  0.6928    1.9993   0.2861  2.422   0.0154 *
## adv_momTRUE:adv_rainTRUE    -0.7772    0.4597   0.5511 -1.410   0.1585  
## adv_momTRUE:adv_sibTRUE      0.4074    1.5029   0.4233  0.963   0.3358  
## adv_momTRUE:adv_mom_dsiTRUE  0.3029    1.3538   0.3849  0.787   0.4313  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                             exp(coef) exp(-coef) lower .95 upper .95
## adv_densityTRUE                0.5636     1.7743    0.2703     1.175
## adv_momTRUE                    1.9993     0.5002    1.1412     3.503
## adv_momTRUE:adv_rainTRUE       0.4597     2.1754    0.1561     1.354
## adv_momTRUE:adv_sibTRUE        1.5029     0.6654    0.6556     3.445
## adv_momTRUE:adv_mom_dsiTRUE    1.3538     0.7387    0.6366     2.879
## 
## Concordance= 0.606  (se = 0.026 )
## Likelihood ratio test= 19.04  on 5 df,   p=0.002
## Wald test            = 21.55  on 5 df,   p=6e-04
## Score (logrank) test = 23.38  on 5 df,   p=3e-04

Rerun with composite adversity

"“LASSO pick up five terms (two main effects and three interactions (all with maternal loss), but none of the interaction term is significant.I was wondering if you can create a composite variable of whether there is one adversity besides mom death (sum of all other adversities>0), and rerun the analysis with the two main effects and the interaction between mom death and this composite adversity.”"

data_composite <- raw_data %>%
  mutate(adv_composite = adv_rain+adv_sib+adv_mom_dsi) %>%
  select(adv_density, adv_mom, adv_composite)

cox_composite <- coxph(
  Surv(data$age, data$dead) ~ adv_density + adv_mom + adv_mom*adv_composite,
  data = data_composite
)

summary(cox_composite)

## Call:
## coxph(formula = Surv(data$age, data$dead) ~ adv_density + adv_mom + 
##     adv_mom * adv_composite, data = data_composite)
## 
##   n= 300, number of events= 123 
## 
##                               coef exp(coef) se(coef)      z Pr(>|z|)  
## adv_densityTRUE           -0.64269   0.52588  0.37330 -1.722   0.0851 .
## adv_momTRUE                0.66552   1.94550  0.30018  2.217   0.0266 *
## adv_composite             -0.05055   0.95071  0.18456 -0.274   0.7842  
## adv_momTRUE:adv_composite  0.08668   1.09055  0.34477  0.251   0.8015  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                           exp(coef) exp(-coef) lower .95 upper .95
## adv_densityTRUE              0.5259      1.902    0.2530     1.093
## adv_momTRUE                  1.9455      0.514    1.0802     3.504
## adv_composite                0.9507      1.052    0.6621     1.365
## adv_momTRUE:adv_composite    1.0906      0.917    0.5549     2.143
## 
## Concordance= 0.604  (se = 0.03 )
## Likelihood ratio test= 14.64  on 4 df,   p=0.006
## Wald test            = 15.36  on 4 df,   p=0.004
## Score (logrank) test = 16.02  on 4 df,   p=0.003