Three Age-Group Analysis - With SCI
Library and Data
library(survival)
library(tidyverse)
library(penalized)
library(kableExtra)
library(ggrepel)
source("../R/utils.R")
source("../R/full_helper.R")
source("../R/lasso_helper.R")
source("../R/stepBIC_helper.R")
source("../R/cumulative_helper.R")
<- read_csv("../data/Aim3-early-adversity-subjects-F-survival-2021-01-29.csv")
raw_data <- dplyr::select(raw_data, -c("adv_mom_dsi", "adv_cumulative")) raw_data
Raw data here refers to the larger dataset.
EDA
Below is the Kaplan-Meier estimation of the survival function on the raw data.
<- survfit(Surv(raw_data$age, raw_data$dead) ~ 1, conf.type = "log-log")
km ggplot() +
geom_line(aes(x = km$time, y = km$surv)) +
geom_vline(aes(xintercept = 1), color = "red", linetype = "dashed") +
geom_vline(aes(xintercept = 5), color = "red", linetype = "dashed") +
labs(x = expression("Time (year)"), y = expression("Survival"),
title = "Kaplan-Meier estimate") +
scale_x_continuous(breaks = seq(0, 25, 1)) +
theme_classic()
Analysis Process
We conduct four models on three datasets derived from the raw data.
Four models:
- Cox model of survival data versus all the main effects (six adversities)
- Cox model of survival data versus main effects and interaction terms selected by lasso
- Cox model of survival data versus main effects and interaction terms selected by stepwise BIC
- Cox model of survival data versus
adv_mom
, sum of other adversitiesadv_other_cumulative
and their interaction term
Three datasets:
age_below_1
: Include all the samples. For those whoseage
> 1, useage
= 1, anddead
= FALSEage_above_1_below_5
: Include samples ofage
> 1. For those whoseage
> 5, useage
= 5, anddead
= FALSEage_above_5
: Include samples ofage
> 5.
Note that for the age_below_1
dataset, we exclude adv_sib
from all models.
Modification of the Covariates
First, we replace low maternal DSI with low maternal SCI.
And brief information of covariates we are using in this analysis is in the table below:
<- c("adv_density", "adv_rain", "adv_mom",
covariate_names "adv_sib", "adv_mom_rank", "adv_mom_sci",
"adv_cumulative_sci")
<- c(rep("Logical", 6), "Numeric")
covariate_types <- c("Group size is in the highest quartile",
covariate_meanings "Rainfall <= 200mm",
"Mom dies before age 4",
"A sibling is born",
"Mom's rank is in the lowest quartile",
"Social connection is in the lowest quartile",
"Sum of the above six adversities")
data.frame(
list(types = covariate_types,
meanings = covariate_meanings)
%>%
) `row.names<-`(covariate_names) %>%
kable(col.names = c("Type", "Meaning if TRUE")) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 18)
Type | Meaning if TRUE | |
---|---|---|
adv_density | Logical | Group size is in the highest quartile |
adv_rain | Logical | Rainfall <= 200mm |
adv_mom | Logical | Mom dies before age 4 |
adv_sib | Logical | A sibling is born |
adv_mom_rank | Logical | Mom’s rank is in the lowest quartile |
adv_mom_sci | Logical | Social connection is in the lowest quartile |
adv_cumulative_sci | Numeric | Sum of the above six adversities |
<- raw_data %>%
age_below_1 select(starts_with("adv_"), age, dead, -adv_cumulative_sci) %>%
mutate(dead = if_else(age > 1, FALSE, dead)) %>%
mutate(age = if_else(age > 1, 1, age))
<- raw_data %>%
age_above_1_below_5 filter(age >= 1) %>%
select(starts_with("adv_"), age, dead, -adv_cumulative_sci) %>%
mutate(dead = if_else(age > 5, FALSE, dead)) %>%
mutate(age = if_else(age > 5, 5, age))
<- raw_data %>%
age_above_5 filter(age >= 5) %>%
select(starts_with("adv_"), age, dead, -adv_cumulative_sci)
The cumulative adversities category in three age groups is in the table below. Note that when counting the cumulative adversities in Age 1-
group we exclude adv_sib
.
<- age_below_1 %>%
cumulative_summary_1 mutate(adv_other_cumulative = adv_rain+adv_mom_sci+adv_mom_rank+adv_density) %>%
group_by(adv_other_cumulative) %>%
summarise(count = n()) %>%
select(count)
<- age_above_1_below_5 %>%
cumulative_summary_2 mutate(adv_other_cumulative = adv_rain+adv_sib+adv_mom_sci+adv_mom_rank+adv_density) %>%
group_by(adv_other_cumulative) %>%
summarise(count = n()) %>%
select(count)
<- age_above_5 %>%
cumulative_summary_3 mutate(adv_other_cumulative = adv_rain+adv_sib+adv_mom_sci+adv_mom_rank+adv_density) %>%
group_by(adv_other_cumulative) %>%
summarise(count = n()) %>%
select(count)
data.frame(cbind(cumulative_summary_1, cumulative_summary_2, cumulative_summary_3)) %>%
t() %>%
`rownames<-`(c("Age 1-", "Age 1-5", "Age 5+")) %>%
kable(col.names = 0:4) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 18)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
Age 1- | 222 | 226 | 92 | 20 | 2 |
Age 1-5 | 128 | 188 | 97 | 24 | 3 |
Age 5+ | 70 | 122 | 66 | 17 | 2 |
Age below 1
Main effects
<- get_full(age_below_1, below_one = TRUE)
full_age_below_1 round(summary(full_age_below_1)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.503 0.605 0.251 -2.002 0.045
## adv_rainTRUE -0.254 0.775 0.318 -0.800 0.424
## adv_momTRUE 0.442 1.555 0.195 2.264 0.024
## adv_mom_rankTRUE 0.213 1.237 0.209 1.015 0.310
## adv_mom_sciTRUE -0.637 0.529 0.242 -2.631 0.009
plot_terms(full_age_below_1)
mtext("Age < 1: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)
Lasso
<- get_penalized_covariates(age_below_1, below_one = TRUE)
penalized_below_1 <- get_opt_lambda_of_lasso(
opt_lambda_below_1 data = age_below_1,
penalized_covariates = penalized_below_1
)<- get_nonzero_varnames_of_opt_lasso(
nonzero_vars_below_1 data = age_below_1,
penalized_covariates = penalized_below_1,
opt_lambda = opt_lambda_below_1
)<- get_varnames_in_final_model(nonzero_vars_below_1)
vars_below_1 <- run_cox(
lasso_age_below_1 data = age_below_1,
varnames = vars_below_1,
penalized_covariates = penalized_below_1
)round(summary(lasso_age_below_1)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.772 0.462 0.339 -2.276 0.023
## adv_momTRUE 0.410 1.507 0.232 1.770 0.077
## adv_mom_rankTRUE 0.265 1.303 0.216 1.226 0.220
## adv_mom_sciTRUE -0.441 0.643 0.282 -1.562 0.118
## adv_rainTRUE -0.079 0.924 0.337 -0.234 0.815
## adv_densityTRUE:adv_momTRUE 0.825 2.281 0.510 1.618 0.106
## adv_mom_rankTRUE:adv_rainTRUE -1.254 0.285 1.078 -1.163 0.245
## adv_momTRUE:adv_mom_sciTRUE -0.676 0.509 0.557 -1.214 0.225
plot_terms(lasso_age_below_1)
mtext("Age < 1: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)
Stepwise-BIC
<- get_stepBIC(age_below_1, below_one=TRUE)
stepBIC_age_below_1 round(summary(stepBIC_age_below_1)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_mom_sciTRUE -0.588 0.555 0.24 -2.447 0.014
plot_terms(stepBIC_age_below_1)
mtext("Age < 1: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)
Cumulative
<- get_cumulative(age_below_1, below_one=TRUE)
cumulative_age_below_1 round(summary(cumulative_age_below_1)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.497 1.644 0.290 1.717 0.086
## adv_other_cumulative1 -0.293 0.746 0.244 -1.203 0.229
## adv_other_cumulative2 -0.584 0.558 0.369 -1.583 0.113
## adv_other_cumulative2+ -0.511 0.600 0.725 -0.706 0.480
## adv_momTRUE:adv_other_cumulative1 -0.060 0.942 0.425 -0.141 0.888
## adv_momTRUE:adv_other_cumulative2 0.346 1.414 0.581 0.596 0.551
## adv_momTRUE:adv_other_cumulative2+ -16.132 0.000 2222.416 -0.007 0.994
Age above 1 and below 5
Main effects
<- get_full(age_above_1_below_5, below_one = FALSE)
full_age_above_1_below_5 round(summary(full_age_above_1_below_5)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE -0.006 0.994 0.243 -0.024 0.981
## adv_rainTRUE -0.330 0.719 0.339 -0.976 0.329
## adv_momTRUE 0.789 2.201 0.217 3.634 0.000
## adv_sibTRUE -0.054 0.948 0.279 -0.192 0.847
## adv_mom_rankTRUE 0.197 1.217 0.245 0.802 0.423
## adv_mom_sciTRUE -0.278 0.757 0.243 -1.143 0.253
plot_terms(full_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)
Lasso
<- get_penalized_covariates(age_above_1_below_5, below_one = FALSE)
penalized_above_1_below_5 <- get_opt_lambda_of_lasso(
opt_lambda_above_1_below_5 data = age_above_1_below_5,
penalized_covariates = penalized_above_1_below_5
)<- get_nonzero_varnames_of_opt_lasso(
nonzero_vars_above_1_below_5 data = age_above_1_below_5,
penalized_covariates = penalized_above_1_below_5,
opt_lambda = opt_lambda_above_1_below_5
)<- get_varnames_in_final_model(nonzero_vars_above_1_below_5)
vars_above_1_below_5 <- run_cox(
lasso_age_above_1_below_5 data = age_above_1_below_5,
varnames = vars_above_1_below_5,
penalized_covariates = penalized_above_1_below_5
)round(summary(lasso_age_above_1_below_5)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.551 1.734 0.260 2.118 0.034
## adv_densityTRUE -0.228 0.796 0.270 -0.843 0.399
## adv_sibTRUE -0.249 0.779 0.330 -0.755 0.450
## adv_mom_rankTRUE -0.135 0.873 0.335 -0.404 0.686
## adv_densityTRUE:adv_sibTRUE 1.259 3.520 0.606 2.076 0.038
## adv_momTRUE:adv_mom_rankTRUE 0.692 1.999 0.495 1.399 0.162
plot_terms(lasso_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)
Stepwise-BIC
<- get_stepBIC(age_above_1_below_5, below_one=FALSE)
stepBIC_age_above_1_below_5 round(summary(stepBIC_age_above_1_below_5)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.765 2.15 0.216 3.543 0
plot_terms(stepBIC_age_above_1_below_5)
mtext("Age 1 - 5: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)
Cumulative
<- get_cumulative(age_above_1_below_5)
cumulative_age_above_1_below_5 round(summary(cumulative_age_above_1_below_5)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.336 1.399 0.434 0.773 0.440
## adv_other_cumulative1 -0.496 0.609 0.309 -1.606 0.108
## adv_other_cumulative2 -0.381 0.683 0.369 -1.031 0.302
## adv_other_cumulative2+ 0.165 1.179 0.495 0.333 0.739
## adv_momTRUE:adv_other_cumulative1 0.891 2.439 0.545 1.635 0.102
## adv_momTRUE:adv_other_cumulative2 0.536 1.710 0.625 0.858 0.391
## adv_momTRUE:adv_other_cumulative2+ -1.088 0.337 1.178 -0.924 0.356
Age above 5
Main effects
<- get_full(age_above_5, below_one = FALSE)
full_age_above_5 round(summary(full_age_above_5)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_densityTRUE 0.145 1.156 0.268 0.542 0.588
## adv_rainTRUE -0.214 0.807 0.307 -0.696 0.487
## adv_momTRUE 0.732 2.079 0.225 3.250 0.001
## adv_sibTRUE 0.302 1.352 0.245 1.232 0.218
## adv_mom_rankTRUE 0.251 1.285 0.244 1.028 0.304
## adv_mom_sciTRUE 0.027 1.028 0.210 0.131 0.896
plot_terms(full_age_above_5)
mtext("Age > 5: Log-hazard versus important variables", line=0, side=3, outer=TRUE, cex=1.2)
Lasso
<- get_penalized_covariates(age_above_5, below_one = FALSE)
penalized_above_5 <- get_opt_lambda_of_lasso(
opt_lambda_above_5 data = age_above_5,
penalized_covariates = penalized_above_5
)<- get_nonzero_varnames_of_opt_lasso(
nonzero_vars_above_5 data = age_above_5,
penalized_covariates = penalized_above_5,
opt_lambda = opt_lambda_above_5
)<- get_varnames_in_final_model(nonzero_vars_above_5)
vars_above_5 <- run_cox(
lasso_age_above_5 data = age_above_5,
varnames = vars_above_5,
penalized_covariates = penalized_above_5
)round(summary(lasso_age_above_5)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 0.561 1.752 0.254 2.208 0.027
## adv_sibTRUE 0.146 1.157 0.289 0.506 0.613
## adv_momTRUE:adv_sibTRUE 0.587 1.799 0.507 1.158 0.247
plot_terms(lasso_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (Lasso)", line=0, side=3, outer=TRUE, cex=1.2)
Stepwise-BIC
<- get_stepBIC(age_above_5, below_one=FALSE)
stepBIC_age_above_5 round(summary(stepBIC_age_above_5)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_rainTRUE 0.320 1.377 0.329 0.973 0.331
## adv_momTRUE 0.975 2.651 0.234 4.162 0.000
## adv_rainTRUE:adv_momTRUE -1.911 0.148 0.814 -2.346 0.019
plot_terms(stepBIC_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (BIC)", line=0, side=3, outer=TRUE, cex=1.2)
Cumulative
<- get_cumulative(age_above_5)
cumulative_age_above_5 round(summary(cumulative_age_above_5)$coef, 3)
## coef exp(coef) se(coef) z Pr(>|z|)
## adv_momTRUE 1.095 2.990 0.450 2.433 0.015
## adv_other_cumulative1 0.563 1.756 0.282 1.994 0.046
## adv_other_cumulative2 0.321 1.378 0.361 0.888 0.374
## adv_other_cumulative2+ 0.753 2.123 0.558 1.349 0.177
## adv_momTRUE:adv_other_cumulative1 -0.679 0.507 0.546 -1.243 0.214
## adv_momTRUE:adv_other_cumulative2 0.124 1.132 0.647 0.192 0.848
## adv_momTRUE:adv_other_cumulative2+ -1.684 0.186 1.207 -1.395 0.163
plot_terms(cumulative_age_above_5)
mtext("Age > 5: Log-hazard versus important variables (Cumulative)", line=0, side=3, outer=TRUE, cex=1.2)
Discussion
Where do we see interaction terms picked by models and their effects?
<- c("Age < 1(lasso)", "Age 1-5(lasso)", "Age > 5(lasso)", "Age > 5(stepBIC)")
groups <- c("adv_mom_rank:adv_rain, adv_mom:adv_mom_sci",
interactions "adv_density:adv_sib, adv_mom:adv_mom_rank",
"adv_mom:adv_sib", "adv_rain:adv_mom")
<- c("No", "adv_density:adv_sib Yes",
if_signifcant "No", "Yes")
data.frame(
list(interactions = interactions,
if_signifcant = if_signifcant)
%>%
) `row.names<-`(groups) %>%
kable(col.names = c("Interaction terms picked", "If significant")) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 18)
Interaction terms picked | If significant | |
---|---|---|
Age < 1(lasso) | adv_mom_rank:adv_rain, adv_mom:adv_mom_sci | No |
Age 1-5(lasso) | adv_density:adv_sib, adv_mom:adv_mom_rank | adv_density:adv_sib Yes |
Age > 5(lasso) | adv_mom:adv_sib | No |
Age > 5(stepBIC) | adv_rain:adv_mom | Yes |
Age 1-5 Lasso
<- data.frame(
test_df case1 = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), # baseline
case2 = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE), # adv_density
case3 = c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE), # adv_sib
case4 = c(FALSE, TRUE, TRUE, FALSE, FALSE, FALSE) # adv_density and adv_sib
%>%
) `rownames<-`(names(coef(lasso_age_above_1_below_5))) %>%
t()
<- predict(lasso_age_above_1_below_5, newdata = as.data.frame(test_df))
preds
<- data.frame(
plot_df x = c(0, 1, 0, 1),
y = preds,
group = c("first", "first", "second", "second"),
comment = c("baseline", "adv_densityTRUE", "adv_sibTRUE", "adv_densityTRUE+adv_sibTRUE")
)
ggplot(plot_df) +
geom_point(aes(x = x, y = y)) +
geom_line(aes(x = x, y = y, group = group)) +
geom_text_repel(aes(label = comment, x = x, y = y)) +
theme_bw() +
xlim(-0.5, 1.5) +
scale_x_discrete(breaks = c(0, 1)) +
labs(x = "", y = "Log-hazards") +
theme(axis.title = element_text(size = 15))
Age > 5 StepBIC
<- data.frame(
test_df case1 = c(FALSE, FALSE, FALSE), # baseline
case2 = c(TRUE, FALSE, FALSE), # mom_rain
case3 = c(FALSE, TRUE, FALSE), # mom
case4 = c(TRUE, TRUE, TRUE) # mom and mom_rain
%>%
) `rownames<-`(c("adv_rain", "adv_mom", "adv_rain:adv_mom")) %>%
t()
<- predict(stepBIC_age_above_5, newdata = as.data.frame(test_df))
preds
<- data.frame(
plot_df x = c(0, 0, 1, 1),
y = preds,
group = c("first", "second", "first", "second"),
comment = c("baseline", "adv_rainTRUE", "adv_momTRUE", "adv_rainTRUE+adv_momTRUE")
)
ggplot(plot_df) +
geom_point(aes(x = x, y = y)) +
geom_line(aes(x = x, y = y, group = group)) +
geom_text_repel(aes(label = comment, x = x, y = y)) +
theme_bw() +
xlim(-0.5, 1.5) +
scale_x_discrete(breaks = c(0, 1)) +
labs(x = "", y = "Log-hazards") +
theme(axis.title = element_text(size = 15))