Library and Data

library(survival)
library(tidyverse)
library(penalized)
library(kableExtra)
library(ggrepel)

source("../R/utils.R")
source("../R/full_helper.R")
source("../R/lasso_helper.R")
source("../R/stepBIC_helper.R")
source("../R/cumulative_helper.R")

raw_data <- read_csv("../data/Aim3-early-adversity-subjects-F-survival-2021-01-29.csv")
raw_data <- dplyr::select(raw_data, -c("adv_mom_dsi", "adv_cumulative"))

Raw data here refers to the larger dataset.

EDA

Below is the Kaplan-Meier estimation of the survival function on the raw data.

km <- survfit(Surv(raw_data$age, raw_data$dead) ~ 1, conf.type = "log-log")
ggplot() +
  geom_line(aes(x = km$time, y = km$surv)) +
  geom_vline(aes(xintercept = 1), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = 5), color = "red", linetype = "dashed") +
  labs(x = expression("Time (year)"), y = expression("Survival"),
       title = "Kaplan-Meier estimate") +
  scale_x_continuous(breaks = seq(0, 25, 1)) +
  theme_classic()

Analysis Process

We conduct four models on three datasets derived from the raw data.

Four models:

  • Cox model of survival data versus all the main effects (six adversities)
  • Cox model of survival data versus main effects and interaction terms selected by lasso
  • Cox model of survival data versus main effects and interaction terms selected by stepwise BIC
  • Cox model of survival data versus adv_mom, sum of other adversities adv_other_cumulative and their interaction term

Three datasets:

  • age_below_1: Include all the samples. For those whose age > 1, use age = 1, and dead = FALSE
  • age_above_1_below_5: Include samples of age > 1. For those whose age > 5, use age = 5, and dead = FALSE
  • age_above_5: Include samples of age > 5.

Note that for the age_below_1 dataset, we exclude adv_sib from all models.

Modification of the Covariates

First, we replace low maternal DSI with low maternal SCI.

And brief information of covariates we are using in this analysis is in the table below:

covariate_names <- c("adv_density", "adv_rain", "adv_mom",
                     "adv_sib", "adv_mom_rank", "adv_mom_sci",
                     "adv_cumulative_sci")
covariate_types <- c(rep("Logical", 6), "Numeric")
covariate_meanings <- c("Group size is in the highest quartile",
                       "Rainfall <= 200mm",
                       "Mom dies before age 4",
                       "A sibling is born",
                       "Mom's rank is in the lowest quartile",
                       "Social connection is in the lowest quartile",
                       "Sum of the above six adversities")

data.frame(
  list(types = covariate_types,
       meanings = covariate_meanings)
) %>%
  `row.names<-`(covariate_names) %>%
  kable(col.names = c("Type", "Meaning if TRUE")) %>%
  kable_styling(full_width = FALSE, position = "center", font_size = 18)
Type Meaning if TRUE
adv_density Logical Group size is in the highest quartile
adv_rain Logical Rainfall <= 200mm
adv_mom Logical Mom dies before age 4
adv_sib Logical A sibling is born
adv_mom_rank Logical Mom’s rank is in the lowest quartile
adv_mom_sci Logical Social connection is in the lowest quartile
adv_cumulative_sci Numeric Sum of the above six adversities
age_below_1 <- raw_data %>%
  select(starts_with("adv_"), age, dead, -adv_cumulative_sci) %>%
  mutate(dead = if_else(age > 1, FALSE, dead)) %>%
  mutate(age = if_else(age > 1, 1, age))

age_above_1_below_5 <- raw_data %>%
  filter(age >= 1) %>%
  select(starts_with("adv_"), age, dead, -adv_cumulative_sci) %>%
  mutate(dead = if_else(age > 5, FALSE, dead)) %>%
  mutate(age = if_else(age > 5, 5, age))

age_above_5 <- raw_data %>%
  filter(age >= 5) %>%
  select(starts_with("adv_"), age, dead, -adv_cumulative_sci)

The cumulative adversities category in three age groups is in the table below. Note that when counting the cumulative adversities in Age 1- group we exclude adv_sib.

cumulative_summary_1 <- age_below_1 %>%
  mutate(adv_other_cumulative = adv_rain+adv_mom_sci+adv_mom_rank+adv_density) %>%
  group_by(adv_other_cumulative) %>%
  summarise(count = n()) %>%
  select(count)

cumulative_summary_2 <- age_above_1_below_5 %>%
  mutate(adv_other_cumulative = adv_rain+adv_sib+adv_mom_sci+adv_mom_rank+adv_density) %>%
  group_by(adv_other_cumulative) %>%
  summarise(count = n()) %>%
  select(count)

cumulative_summary_3 <- age_above_5 %>%
  mutate(adv_other_cumulative = adv_rain+adv_sib+adv_mom_sci+adv_mom_rank+adv_density) %>%
  group_by(adv_other_cumulative) %>%
  summarise(count = n()) %>%
  select(count)

data.frame(cbind(cumulative_summary_1, cumulative_summary_2, cumulative_summary_3)) %>%
  t() %>%
  `rownames<-`(c("Age 1-", "Age 1-5", "Age 5+")) %>%
  kable(col.names = 0:4) %>%
  kable_styling(full_width = FALSE, position = "center", font_size = 18)
0 1 2 3 4
Age 1- 222 226 92 20 2
Age 1-5 128 188 97 24 3
Age 5+ 70 122 66 17 2


Age below 1

Main effects

full_age_below_1 <- get_full(age_below_1, below_one = TRUE)
round(summary(full_age_below_1)$coef, 3)
##                    coef exp(coef) se(coef)      z Pr(>|z|)
## adv_densityTRUE  -0.503     0.605    0.251 -2.002    0.045
## adv_rainTRUE     -0.254     0.775    0.318 -0.800    0.424
## adv_momTRUE       0.442     1.555    0.195  2.264    0.024
## adv_mom_rankTRUE  0.213     1.237    0.209  1.015    0.310
## adv_mom_sciTRUE  -0.637     0.529    0.242 -2.631    0.009
plot_terms(full_age_below_1)
mtext("Age < 1: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)

Lasso

penalized_below_1 <- get_penalized_covariates(age_below_1, below_one = TRUE)
opt_lambda_below_1 <- get_opt_lambda_of_lasso(
  data = age_below_1,
  penalized_covariates = penalized_below_1
)
nonzero_vars_below_1 <- get_nonzero_varnames_of_opt_lasso(
  data = age_below_1,
  penalized_covariates = penalized_below_1,
  opt_lambda = opt_lambda_below_1
)
vars_below_1 <- get_varnames_in_final_model(nonzero_vars_below_1)
lasso_age_below_1 <- run_cox(
  data = age_below_1,
  varnames = vars_below_1,
  penalized_covariates = penalized_below_1
)
round(summary(lasso_age_below_1)$coef, 3)
##                                 coef exp(coef) se(coef)      z Pr(>|z|)
## adv_densityTRUE               -0.772     0.462    0.339 -2.276    0.023
## adv_momTRUE                    0.410     1.507    0.232  1.770    0.077
## adv_mom_rankTRUE               0.265     1.303    0.216  1.226    0.220
## adv_mom_sciTRUE               -0.441     0.643    0.282 -1.562    0.118
## adv_rainTRUE                  -0.079     0.924    0.337 -0.234    0.815
## adv_densityTRUE:adv_momTRUE    0.825     2.281    0.510  1.618    0.106
## adv_mom_rankTRUE:adv_rainTRUE -1.254     0.285    1.078 -1.163    0.245
## adv_momTRUE:adv_mom_sciTRUE   -0.676     0.509    0.557 -1.214    0.225
plot_terms(lasso_age_below_1)
mtext("Age < 1: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)

Stepwise-BIC

stepBIC_age_below_1 <- get_stepBIC(age_below_1, below_one=TRUE)
round(summary(stepBIC_age_below_1)$coef, 3)
##                   coef exp(coef) se(coef)      z Pr(>|z|)
## adv_mom_sciTRUE -0.588     0.555     0.24 -2.447    0.014
plot_terms(stepBIC_age_below_1)
mtext("Age < 1: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)

Cumulative

cumulative_age_below_1 <- get_cumulative(age_below_1, below_one=TRUE)
round(summary(cumulative_age_below_1)$coef, 3)
##                                       coef exp(coef) se(coef)      z Pr(>|z|)
## adv_momTRUE                          0.497     1.644    0.290  1.717    0.086
## adv_other_cumulative1               -0.293     0.746    0.244 -1.203    0.229
## adv_other_cumulative2               -0.584     0.558    0.369 -1.583    0.113
## adv_other_cumulative2+              -0.511     0.600    0.725 -0.706    0.480
## adv_momTRUE:adv_other_cumulative1   -0.060     0.942    0.425 -0.141    0.888
## adv_momTRUE:adv_other_cumulative2    0.346     1.414    0.581  0.596    0.551
## adv_momTRUE:adv_other_cumulative2+ -16.132     0.000 2222.416 -0.007    0.994


Age above 1 and below 5

Main effects

full_age_above_1_below_5 <- get_full(age_above_1_below_5, below_one = FALSE)
round(summary(full_age_above_1_below_5)$coef, 3)
##                    coef exp(coef) se(coef)      z Pr(>|z|)
## adv_densityTRUE  -0.006     0.994    0.243 -0.024    0.981
## adv_rainTRUE     -0.330     0.719    0.339 -0.976    0.329
## adv_momTRUE       0.789     2.201    0.217  3.634    0.000
## adv_sibTRUE      -0.054     0.948    0.279 -0.192    0.847
## adv_mom_rankTRUE  0.197     1.217    0.245  0.802    0.423
## adv_mom_sciTRUE  -0.278     0.757    0.243 -1.143    0.253
plot_terms(full_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)

Lasso

penalized_above_1_below_5 <- get_penalized_covariates(age_above_1_below_5, below_one = FALSE)
opt_lambda_above_1_below_5 <- get_opt_lambda_of_lasso(
  data = age_above_1_below_5,
  penalized_covariates = penalized_above_1_below_5
)
nonzero_vars_above_1_below_5 <- get_nonzero_varnames_of_opt_lasso(
  data = age_above_1_below_5,
  penalized_covariates = penalized_above_1_below_5,
  opt_lambda = opt_lambda_above_1_below_5
)
vars_above_1_below_5 <- get_varnames_in_final_model(nonzero_vars_above_1_below_5)
lasso_age_above_1_below_5 <- run_cox(
  data = age_above_1_below_5,
  varnames = vars_above_1_below_5,
  penalized_covariates = penalized_above_1_below_5
)
round(summary(lasso_age_above_1_below_5)$coef, 3)
##                                coef exp(coef) se(coef)      z Pr(>|z|)
## adv_momTRUE                   0.551     1.734    0.260  2.118    0.034
## adv_densityTRUE              -0.228     0.796    0.270 -0.843    0.399
## adv_sibTRUE                  -0.249     0.779    0.330 -0.755    0.450
## adv_mom_rankTRUE             -0.135     0.873    0.335 -0.404    0.686
## adv_densityTRUE:adv_sibTRUE   1.259     3.520    0.606  2.076    0.038
## adv_momTRUE:adv_mom_rankTRUE  0.692     1.999    0.495  1.399    0.162
plot_terms(lasso_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)

Stepwise-BIC

stepBIC_age_above_1_below_5 <- get_stepBIC(age_above_1_below_5, below_one=FALSE)
round(summary(stepBIC_age_above_1_below_5)$coef, 3)
##              coef exp(coef) se(coef)     z Pr(>|z|)
## adv_momTRUE 0.765      2.15    0.216 3.543        0
plot_terms(stepBIC_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)

Cumulative

cumulative_age_above_1_below_5 <- get_cumulative(age_above_1_below_5)
round(summary(cumulative_age_above_1_below_5)$coef, 3)
##                                      coef exp(coef) se(coef)      z Pr(>|z|)
## adv_momTRUE                         0.336     1.399    0.434  0.773    0.440
## adv_other_cumulative1              -0.496     0.609    0.309 -1.606    0.108
## adv_other_cumulative2              -0.381     0.683    0.369 -1.031    0.302
## adv_other_cumulative2+              0.165     1.179    0.495  0.333    0.739
## adv_momTRUE:adv_other_cumulative1   0.891     2.439    0.545  1.635    0.102
## adv_momTRUE:adv_other_cumulative2   0.536     1.710    0.625  0.858    0.391
## adv_momTRUE:adv_other_cumulative2+ -1.088     0.337    1.178 -0.924    0.356


Age above 5

Main effects

full_age_above_5 <- get_full(age_above_5, below_one = FALSE)
round(summary(full_age_above_5)$coef, 3)
##                    coef exp(coef) se(coef)      z Pr(>|z|)
## adv_densityTRUE   0.145     1.156    0.268  0.542    0.588
## adv_rainTRUE     -0.214     0.807    0.307 -0.696    0.487
## adv_momTRUE       0.732     2.079    0.225  3.250    0.001
## adv_sibTRUE       0.302     1.352    0.245  1.232    0.218
## adv_mom_rankTRUE  0.251     1.285    0.244  1.028    0.304
## adv_mom_sciTRUE   0.027     1.028    0.210  0.131    0.896
plot_terms(full_age_above_5)
mtext("Age > 5: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)

Lasso

penalized_above_5 <- get_penalized_covariates(age_above_5, below_one = FALSE)
opt_lambda_above_5 <- get_opt_lambda_of_lasso(
  data = age_above_5,
  penalized_covariates = penalized_above_5
)
nonzero_vars_above_5 <- get_nonzero_varnames_of_opt_lasso(
  data = age_above_5,
  penalized_covariates = penalized_above_5,
  opt_lambda = opt_lambda_above_5
)
vars_above_5 <- get_varnames_in_final_model(nonzero_vars_above_5)
lasso_age_above_5 <- run_cox(
  data = age_above_5,
  varnames = vars_above_5,
  penalized_covariates = penalized_above_5
)
round(summary(lasso_age_above_5)$coef, 3)
##                          coef exp(coef) se(coef)     z Pr(>|z|)
## adv_momTRUE             0.561     1.752    0.254 2.208    0.027
## adv_sibTRUE             0.146     1.157    0.289 0.506    0.613
## adv_momTRUE:adv_sibTRUE 0.587     1.799    0.507 1.158    0.247
plot_terms(lasso_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)

Stepwise-BIC

stepBIC_age_above_5 <- get_stepBIC(age_above_5, below_one=FALSE)
round(summary(stepBIC_age_above_5)$coef, 3)
##                            coef exp(coef) se(coef)      z Pr(>|z|)
## adv_rainTRUE              0.320     1.377    0.329  0.973    0.331
## adv_momTRUE               0.975     2.651    0.234  4.162    0.000
## adv_rainTRUE:adv_momTRUE -1.911     0.148    0.814 -2.346    0.019
plot_terms(stepBIC_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)

Cumulative

cumulative_age_above_5 <- get_cumulative(age_above_5)
round(summary(cumulative_age_above_5)$coef, 3)
##                                      coef exp(coef) se(coef)      z Pr(>|z|)
## adv_momTRUE                         1.095     2.990    0.450  2.433    0.015
## adv_other_cumulative1               0.563     1.756    0.282  1.994    0.046
## adv_other_cumulative2               0.321     1.378    0.361  0.888    0.374
## adv_other_cumulative2+              0.753     2.123    0.558  1.349    0.177
## adv_momTRUE:adv_other_cumulative1  -0.679     0.507    0.546 -1.243    0.214
## adv_momTRUE:adv_other_cumulative2   0.124     1.132    0.647  0.192    0.848
## adv_momTRUE:adv_other_cumulative2+ -1.684     0.186    1.207 -1.395    0.163
plot_terms(cumulative_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (Cumulative)", line=0, side=3, outer=TRUE, cex=1.2)


Discussion

Where do we see interaction terms picked by models and their effects?

groups <- c("Age < 1(lasso)", "Age 1-5(lasso)", "Age > 5(lasso)", "Age > 5(stepBIC)")
interactions <- c("adv_mom_rank:adv_rain, adv_mom:adv_mom_sci",
                  "adv_density:adv_sib, adv_mom:adv_mom_rank",
                  "adv_mom:adv_sib", "adv_rain:adv_mom")
if_signifcant <- c("No", "adv_density:adv_sib Yes",
                   "No", "Yes")

data.frame(
  list(interactions = interactions,
       if_signifcant = if_signifcant)
) %>%
  `row.names<-`(groups) %>%
  kable(col.names = c("Interaction terms picked", "If significant")) %>%
  kable_styling(full_width = FALSE, position = "center", font_size = 18)
Interaction terms picked If significant
Age < 1(lasso) adv_mom_rank:adv_rain, adv_mom:adv_mom_sci No
Age 1-5(lasso) adv_density:adv_sib, adv_mom:adv_mom_rank adv_density:adv_sib Yes
Age > 5(lasso) adv_mom:adv_sib No
Age > 5(stepBIC) adv_rain:adv_mom Yes

Age 1-5 Lasso

test_df <- data.frame(
  case1 = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE),   # baseline
  case2 = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE),    # adv_density
  case3 = c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE),    # adv_sib
  case4 = c(FALSE, TRUE, TRUE, FALSE, FALSE, FALSE)        # adv_density and adv_sib
) %>%
  `rownames<-`(names(coef(lasso_age_above_1_below_5))) %>%
  t()

preds <- predict(lasso_age_above_1_below_5, newdata = as.data.frame(test_df))

plot_df <- data.frame(
  x = c(0, 1, 0, 1),
  y = preds,
  group = c("first", "first", "second", "second"),
  comment = c("baseline", "adv_densityTRUE", "adv_sibTRUE", "adv_densityTRUE+adv_sibTRUE")
)

ggplot(plot_df) +
  geom_point(aes(x = x, y = y)) +
  geom_line(aes(x = x, y = y, group = group)) +
  geom_text_repel(aes(label = comment, x = x, y = y)) +
  theme_bw() +
  xlim(-0.5, 1.5) +
  scale_x_discrete(breaks = c(0, 1)) +
  labs(x = "", y = "Log-hazards") +
  theme(axis.title = element_text(size = 15))

Age > 5 StepBIC

test_df <- data.frame(
  case1 = c(FALSE, FALSE, FALSE),   # baseline
  case2 = c(TRUE, FALSE, FALSE),    # mom_rain
  case3 = c(FALSE, TRUE, FALSE),    # mom
  case4 = c(TRUE, TRUE, TRUE)       # mom and mom_rain
) %>%
  `rownames<-`(c("adv_rain", "adv_mom", "adv_rain:adv_mom")) %>%
  t()

preds <- predict(stepBIC_age_above_5, newdata = as.data.frame(test_df))

plot_df <- data.frame(
  x = c(0, 0, 1, 1),
  y = preds,
  group = c("first", "second", "first", "second"),
  comment = c("baseline", "adv_rainTRUE", "adv_momTRUE", "adv_rainTRUE+adv_momTRUE")
)

ggplot(plot_df) +
  geom_point(aes(x = x, y = y)) +
  geom_line(aes(x = x, y = y, group = group)) +
  geom_text_repel(aes(label = comment, x = x, y = y)) +
  theme_bw() +
  xlim(-0.5, 1.5) +
  scale_x_discrete(breaks = c(0, 1)) +
  labs(x = "", y = "Log-hazards") +
  theme(axis.title = element_text(size = 15))