Three Age-Group Analysis - With SCI
Library and Data
library(survival)
library(tidyverse)
library(penalized)
library(kableExtra)
library(ggrepel)
source("../R/utils.R")
source("../R/full_helper.R")
source("../R/lasso_helper.R")
source("../R/stepBIC_helper.R")
source("../R/cumulative_helper.R")
raw_data <- read_csv("../data/Aim3-early-adversity-subjects-F-survival-2021-01-29.csv")
raw_data <- dplyr::select(raw_data, -c("adv_mom_dsi", "adv_cumulative"))Raw data here refers to the larger dataset.
EDA
Below is the Kaplan-Meier estimation of the survival function on the raw data.
km <- survfit(Surv(raw_data$age, raw_data$dead) ~ 1, conf.type = "log-log")
ggplot() +
geom_line(aes(x = km$time, y = km$surv)) +
geom_vline(aes(xintercept = 1), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = 5), color = "red", linetype = "dashed") +
labs(x = expression("Time (year)"), y = expression("Survival"),
title = "Kaplan-Meier estimate") +
scale_x_continuous(breaks = seq(0, 25, 1)) +
theme_classic()Analysis Process
We conduct four models on three datasets derived from the raw data.
Four models:
- Cox model of survival data versus all the main effects (six adversities)
- Cox model of survival data versus main effects and interaction terms selected by lasso
- Cox model of survival data versus main effects and interaction terms selected by stepwise BIC
- Cox model of survival data versus
adv_mom, sum of other adversitiesadv_other_cumulativeand their interaction term
Three datasets:
age_below_1: Include all the samples. For those whoseage> 1, useage= 1, anddead= FALSEage_above_1_below_5: Include samples ofage> 1. For those whoseage> 5, useage= 5, anddead= FALSEage_above_5: Include samples ofage> 5.
Note that for the age_below_1 dataset, we exclude adv_sib from all models.
Modification of the Covariates
First, we replace low maternal DSI with low maternal SCI.
And brief information of covariates we are using in this analysis is in the table below:
covariate_names <- c("adv_density", "adv_rain", "adv_mom",
"adv_sib", "adv_mom_rank", "adv_mom_sci",
"adv_cumulative_sci")
covariate_types <- c(rep("Logical", 6), "Numeric")
covariate_meanings <- c("Group size is in the highest quartile",
"Rainfall <= 200mm",
"Mom dies before age 4",
"A sibling is born",
"Mom's rank is in the lowest quartile",
"Social connection is in the lowest quartile",
"Sum of the above six adversities")
data.frame(
list(types = covariate_types,
meanings = covariate_meanings)
) %>%
`row.names<-`(covariate_names) %>%
kable(col.names = c("Type", "Meaning if TRUE")) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 18)| Type | Meaning if TRUE | |
|---|---|---|
| adv_density | Logical | Group size is in the highest quartile |
| adv_rain | Logical | Rainfall <= 200mm |
| adv_mom | Logical | Mom dies before age 4 |
| adv_sib | Logical | A sibling is born |
| adv_mom_rank | Logical | Mom’s rank is in the lowest quartile |
| adv_mom_sci | Logical | Social connection is in the lowest quartile |
| adv_cumulative_sci | Numeric | Sum of the above six adversities |
age_below_1 <- raw_data %>%
select(starts_with("adv_"), age, dead, -adv_cumulative_sci) %>%
mutate(dead = if_else(age > 1, FALSE, dead)) %>%
mutate(age = if_else(age > 1, 1, age))
age_above_1_below_5 <- raw_data %>%
filter(age >= 1) %>%
select(starts_with("adv_"), age, dead, -adv_cumulative_sci) %>%
mutate(dead = if_else(age > 5, FALSE, dead)) %>%
mutate(age = if_else(age > 5, 5, age))
age_above_5 <- raw_data %>%
filter(age >= 5) %>%
select(starts_with("adv_"), age, dead, -adv_cumulative_sci)The cumulative adversities category in three age groups is in the table below. Note that when counting the cumulative adversities in Age 1- group we exclude adv_sib.
cumulative_summary_1 <- age_below_1 %>%
mutate(adv_other_cumulative = adv_rain+adv_mom_sci+adv_mom_rank+adv_density) %>%
group_by(adv_other_cumulative) %>%
summarise(count = n()) %>%
select(count)
cumulative_summary_2 <- age_above_1_below_5 %>%
mutate(adv_other_cumulative = adv_rain+adv_sib+adv_mom_sci+adv_mom_rank+adv_density) %>%
group_by(adv_other_cumulative) %>%
summarise(count = n()) %>%
select(count)
cumulative_summary_3 <- age_above_5 %>%
mutate(adv_other_cumulative = adv_rain+adv_sib+adv_mom_sci+adv_mom_rank+adv_density) %>%
group_by(adv_other_cumulative) %>%
summarise(count = n()) %>%
select(count)
data.frame(cbind(cumulative_summary_1, cumulative_summary_2, cumulative_summary_3)) %>%
t() %>%
`rownames<-`(c("Age 1-", "Age 1-5", "Age 5+")) %>%
kable(col.names = 0:4) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 18)| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| Age 1- | 222 | 226 | 92 | 20 | 2 |
| Age 1-5 | 128 | 188 | 97 | 24 | 3 |
| Age 5+ | 70 | 122 | 66 | 17 | 2 |
Age below 1
Main effects
full_age_below_1 <- get_full(age_below_1, below_one = TRUE)
round(summary(full_age_below_1)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.503 0.605 0.251 -2.002 0.045
## adv_rainTRUE -0.254 0.775 0.318 -0.800 0.424
## adv_momTRUE 0.442 1.555 0.195 2.264 0.024
## adv_mom_rankTRUE 0.213 1.237 0.209 1.015 0.310
## adv_mom_sciTRUE -0.637 0.529 0.242 -2.631 0.009
plot_terms(full_age_below_1)
mtext("Age < 1: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)Lasso
penalized_below_1 <- get_penalized_covariates(age_below_1, below_one = TRUE)
opt_lambda_below_1 <- get_opt_lambda_of_lasso(
data = age_below_1,
penalized_covariates = penalized_below_1
)
nonzero_vars_below_1 <- get_nonzero_varnames_of_opt_lasso(
data = age_below_1,
penalized_covariates = penalized_below_1,
opt_lambda = opt_lambda_below_1
)
vars_below_1 <- get_varnames_in_final_model(nonzero_vars_below_1)
lasso_age_below_1 <- run_cox(
data = age_below_1,
varnames = vars_below_1,
penalized_covariates = penalized_below_1
)
round(summary(lasso_age_below_1)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.772 0.462 0.339 -2.276 0.023
## adv_momTRUE 0.410 1.507 0.232 1.770 0.077
## adv_mom_rankTRUE 0.265 1.303 0.216 1.226 0.220
## adv_mom_sciTRUE -0.441 0.643 0.282 -1.562 0.118
## adv_rainTRUE -0.079 0.924 0.337 -0.234 0.815
## adv_densityTRUE:adv_momTRUE 0.825 2.281 0.510 1.618 0.106
## adv_mom_rankTRUE:adv_rainTRUE -1.254 0.285 1.078 -1.163 0.245
## adv_momTRUE:adv_mom_sciTRUE -0.676 0.509 0.557 -1.214 0.225
plot_terms(lasso_age_below_1)
mtext("Age < 1: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)Stepwise-BIC
stepBIC_age_below_1 <- get_stepBIC(age_below_1, below_one=TRUE)
round(summary(stepBIC_age_below_1)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_mom_sciTRUE -0.588 0.555 0.24 -2.447 0.014
plot_terms(stepBIC_age_below_1)
mtext("Age < 1: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)Cumulative
cumulative_age_below_1 <- get_cumulative(age_below_1, below_one=TRUE)
round(summary(cumulative_age_below_1)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.497 1.644 0.290 1.717 0.086
## adv_other_cumulative1 -0.293 0.746 0.244 -1.203 0.229
## adv_other_cumulative2 -0.584 0.558 0.369 -1.583 0.113
## adv_other_cumulative2+ -0.511 0.600 0.725 -0.706 0.480
## adv_momTRUE:adv_other_cumulative1 -0.060 0.942 0.425 -0.141 0.888
## adv_momTRUE:adv_other_cumulative2 0.346 1.414 0.581 0.596 0.551
## adv_momTRUE:adv_other_cumulative2+ -16.132 0.000 2222.416 -0.007 0.994
Age above 1 and below 5
Main effects
full_age_above_1_below_5 <- get_full(age_above_1_below_5, below_one = FALSE)
round(summary(full_age_above_1_below_5)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.006 0.994 0.243 -0.024 0.981
## adv_rainTRUE -0.330 0.719 0.339 -0.976 0.329
## adv_momTRUE 0.789 2.201 0.217 3.634 0.000
## adv_sibTRUE -0.054 0.948 0.279 -0.192 0.847
## adv_mom_rankTRUE 0.197 1.217 0.245 0.802 0.423
## adv_mom_sciTRUE -0.278 0.757 0.243 -1.143 0.253
plot_terms(full_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)Lasso
penalized_above_1_below_5 <- get_penalized_covariates(age_above_1_below_5, below_one = FALSE)
opt_lambda_above_1_below_5 <- get_opt_lambda_of_lasso(
data = age_above_1_below_5,
penalized_covariates = penalized_above_1_below_5
)
nonzero_vars_above_1_below_5 <- get_nonzero_varnames_of_opt_lasso(
data = age_above_1_below_5,
penalized_covariates = penalized_above_1_below_5,
opt_lambda = opt_lambda_above_1_below_5
)
vars_above_1_below_5 <- get_varnames_in_final_model(nonzero_vars_above_1_below_5)
lasso_age_above_1_below_5 <- run_cox(
data = age_above_1_below_5,
varnames = vars_above_1_below_5,
penalized_covariates = penalized_above_1_below_5
)
round(summary(lasso_age_above_1_below_5)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.551 1.734 0.260 2.118 0.034
## adv_densityTRUE -0.228 0.796 0.270 -0.843 0.399
## adv_sibTRUE -0.249 0.779 0.330 -0.755 0.450
## adv_mom_rankTRUE -0.135 0.873 0.335 -0.404 0.686
## adv_densityTRUE:adv_sibTRUE 1.259 3.520 0.606 2.076 0.038
## adv_momTRUE:adv_mom_rankTRUE 0.692 1.999 0.495 1.399 0.162
plot_terms(lasso_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)Stepwise-BIC
stepBIC_age_above_1_below_5 <- get_stepBIC(age_above_1_below_5, below_one=FALSE)
round(summary(stepBIC_age_above_1_below_5)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.765 2.15 0.216 3.543 0
plot_terms(stepBIC_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)Cumulative
cumulative_age_above_1_below_5 <- get_cumulative(age_above_1_below_5)
round(summary(cumulative_age_above_1_below_5)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.336 1.399 0.434 0.773 0.440
## adv_other_cumulative1 -0.496 0.609 0.309 -1.606 0.108
## adv_other_cumulative2 -0.381 0.683 0.369 -1.031 0.302
## adv_other_cumulative2+ 0.165 1.179 0.495 0.333 0.739
## adv_momTRUE:adv_other_cumulative1 0.891 2.439 0.545 1.635 0.102
## adv_momTRUE:adv_other_cumulative2 0.536 1.710 0.625 0.858 0.391
## adv_momTRUE:adv_other_cumulative2+ -1.088 0.337 1.178 -0.924 0.356
Age above 5
Main effects
full_age_above_5 <- get_full(age_above_5, below_one = FALSE)
round(summary(full_age_above_5)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE 0.145 1.156 0.268 0.542 0.588
## adv_rainTRUE -0.214 0.807 0.307 -0.696 0.487
## adv_momTRUE 0.732 2.079 0.225 3.250 0.001
## adv_sibTRUE 0.302 1.352 0.245 1.232 0.218
## adv_mom_rankTRUE 0.251 1.285 0.244 1.028 0.304
## adv_mom_sciTRUE 0.027 1.028 0.210 0.131 0.896
plot_terms(full_age_above_5)
mtext("Age > 5: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)Lasso
penalized_above_5 <- get_penalized_covariates(age_above_5, below_one = FALSE)
opt_lambda_above_5 <- get_opt_lambda_of_lasso(
data = age_above_5,
penalized_covariates = penalized_above_5
)
nonzero_vars_above_5 <- get_nonzero_varnames_of_opt_lasso(
data = age_above_5,
penalized_covariates = penalized_above_5,
opt_lambda = opt_lambda_above_5
)
vars_above_5 <- get_varnames_in_final_model(nonzero_vars_above_5)
lasso_age_above_5 <- run_cox(
data = age_above_5,
varnames = vars_above_5,
penalized_covariates = penalized_above_5
)
round(summary(lasso_age_above_5)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.561 1.752 0.254 2.208 0.027
## adv_sibTRUE 0.146 1.157 0.289 0.506 0.613
## adv_momTRUE:adv_sibTRUE 0.587 1.799 0.507 1.158 0.247
plot_terms(lasso_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)Stepwise-BIC
stepBIC_age_above_5 <- get_stepBIC(age_above_5, below_one=FALSE)
round(summary(stepBIC_age_above_5)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_rainTRUE 0.320 1.377 0.329 0.973 0.331
## adv_momTRUE 0.975 2.651 0.234 4.162 0.000
## adv_rainTRUE:adv_momTRUE -1.911 0.148 0.814 -2.346 0.019
plot_terms(stepBIC_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)Cumulative
cumulative_age_above_5 <- get_cumulative(age_above_5)
round(summary(cumulative_age_above_5)$coef, 3)## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 1.095 2.990 0.450 2.433 0.015
## adv_other_cumulative1 0.563 1.756 0.282 1.994 0.046
## adv_other_cumulative2 0.321 1.378 0.361 0.888 0.374
## adv_other_cumulative2+ 0.753 2.123 0.558 1.349 0.177
## adv_momTRUE:adv_other_cumulative1 -0.679 0.507 0.546 -1.243 0.214
## adv_momTRUE:adv_other_cumulative2 0.124 1.132 0.647 0.192 0.848
## adv_momTRUE:adv_other_cumulative2+ -1.684 0.186 1.207 -1.395 0.163
plot_terms(cumulative_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (Cumulative)", line=0, side=3, outer=TRUE, cex=1.2)Discussion
Where do we see interaction terms picked by models and their effects?
groups <- c("Age < 1(lasso)", "Age 1-5(lasso)", "Age > 5(lasso)", "Age > 5(stepBIC)")
interactions <- c("adv_mom_rank:adv_rain, adv_mom:adv_mom_sci",
"adv_density:adv_sib, adv_mom:adv_mom_rank",
"adv_mom:adv_sib", "adv_rain:adv_mom")
if_signifcant <- c("No", "adv_density:adv_sib Yes",
"No", "Yes")
data.frame(
list(interactions = interactions,
if_signifcant = if_signifcant)
) %>%
`row.names<-`(groups) %>%
kable(col.names = c("Interaction terms picked", "If significant")) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 18)| Interaction terms picked | If significant | |
|---|---|---|
| Age < 1(lasso) | adv_mom_rank:adv_rain, adv_mom:adv_mom_sci | No |
| Age 1-5(lasso) | adv_density:adv_sib, adv_mom:adv_mom_rank | adv_density:adv_sib Yes |
| Age > 5(lasso) | adv_mom:adv_sib | No |
| Age > 5(stepBIC) | adv_rain:adv_mom | Yes |
Age 1-5 Lasso
test_df <- data.frame(
case1 = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), # baseline
case2 = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE), # adv_density
case3 = c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE), # adv_sib
case4 = c(FALSE, TRUE, TRUE, FALSE, FALSE, FALSE) # adv_density and adv_sib
) %>%
`rownames<-`(names(coef(lasso_age_above_1_below_5))) %>%
t()
preds <- predict(lasso_age_above_1_below_5, newdata = as.data.frame(test_df))
plot_df <- data.frame(
x = c(0, 1, 0, 1),
y = preds,
group = c("first", "first", "second", "second"),
comment = c("baseline", "adv_densityTRUE", "adv_sibTRUE", "adv_densityTRUE+adv_sibTRUE")
)
ggplot(plot_df) +
geom_point(aes(x = x, y = y)) +
geom_line(aes(x = x, y = y, group = group)) +
geom_text_repel(aes(label = comment, x = x, y = y)) +
theme_bw() +
xlim(-0.5, 1.5) +
scale_x_discrete(breaks = c(0, 1)) +
labs(x = "", y = "Log-hazards") +
theme(axis.title = element_text(size = 15))Age > 5 StepBIC
test_df <- data.frame(
case1 = c(FALSE, FALSE, FALSE), # baseline
case2 = c(TRUE, FALSE, FALSE), # mom_rain
case3 = c(FALSE, TRUE, FALSE), # mom
case4 = c(TRUE, TRUE, TRUE) # mom and mom_rain
) %>%
`rownames<-`(c("adv_rain", "adv_mom", "adv_rain:adv_mom")) %>%
t()
preds <- predict(stepBIC_age_above_5, newdata = as.data.frame(test_df))
plot_df <- data.frame(
x = c(0, 0, 1, 1),
y = preds,
group = c("first", "second", "first", "second"),
comment = c("baseline", "adv_rainTRUE", "adv_momTRUE", "adv_rainTRUE+adv_momTRUE")
)
ggplot(plot_df) +
geom_point(aes(x = x, y = y)) +
geom_line(aes(x = x, y = y, group = group)) +
geom_text_repel(aes(label = comment, x = x, y = y)) +
theme_bw() +
xlim(-0.5, 1.5) +
scale_x_discrete(breaks = c(0, 1)) +
labs(x = "", y = "Log-hazards") +
theme(axis.title = element_text(size = 15))