# -------------------------------------------
# Survival Analysis: Module 4 Code
# -------------------------------------------
# Load necessary packages
library(tidyverse) # Load tidyverse packages
library(survival) # Load survival package
library(broom) # Load broom package
library(gtsummary) # Load gtsummary package
library(ggsurvfit) # Load ggsurvfit package
library(survminer) # Load survminer package
library(tidycmprsk) # Load tidycmprsk package
# ---------------------------
# 1. Load and Prepare Data
# ---------------------------
# Load GBC dataset
gbc <- read.table("data/gbc.txt", header = TRUE) # Load GBC dataset
# Reformat the data
df <- gbc |> # calculate time to first event (relapse or death)
group_by(id) |> # group by id
arrange(time) |> # sort rows by time
slice(1) |> # get the first row within each id
ungroup() |> # remove grouping
mutate(
age40 = ifelse(age >= 40, 1, 0), # create binary variable for age >= 40
grade = factor(grade), # convert grade to factor
prog = prog / 100, # rescale progesterone receptor
estrg = estrg / 100 # rescale estrogen receptor
)
# ---------------------------
# 2. Fit Cox PH Model
# ---------------------------
# Model fitting: survival::coxph()
cox_fit <- coxph(Surv(time, status) ~ hormone + meno + age40 + grade + size + prog + estrg,
data = df)
summary(cox_fit) # Print model summary
# ---------------------------
# 3. Tidy Output with broom
# ---------------------------
tidy_cox <- tidy(cox_fit) # Tidy the coxph output
tidy_cox # Display the tidy output
# ---------------------------
# 4. Tabulate Results with gtsummary
# ---------------------------
cox_tbl <- cox_fit |> tbl_regression( # Create a regression table
exponentiate = TRUE, # Exponentiate coefficients to get hazard ratios
label = list(hormone ~ "Hormone Therapy", # Custom labels
meno ~ "Menopausal",
age40 ~ "Older than 40",
grade ~ "Tumor Grade",
size ~ "Tumor Size (mm)",
prog ~ "Progesterone Receptor (100 fmol/ml)",
estrg ~ "Estrogen Receptor (100 fmol/ml)")
) |>
add_global_p() # Add global p-value for categorical variables
cox_tbl # Display the regression table
# ---------------------------
# 5. Customize the Regression Table
# ---------------------------
cox_tbl |> # Start with the regression table
modify_caption("Cox regression analysis of the German breast cancer study") |> # Add caption
bold_p() |> # Bold significant p-values
italicize_levels() # Italicize variable levels
# ---------------------------
# 6. Fit Accelerated Failure Time (AFT) Model
# ---------------------------
# Fit a Weibull AFT model
aft_fit <- survreg(Surv(time, status) ~ hormone + meno + age + grade + size + prog + estrg,
data = df, dist = "weibull") # specify the Weibull model
# ---------------------------
# 7. Forest Plot of Hazard Ratios
# ---------------------------
# Tidy with exponentiated coeffs (HR) and CI
tidy_cox <- tidy(cox_fit, exponentiate = TRUE, conf.int = TRUE)
tidy_cox$term <- recode(tidy_cox$term, # Relabel the variables
hormone = "Hormone Therapy",
meno = "Menopausal",
age40 = "Older than 40",
grade2 = "Tumor Grade II vs I",
grade3 = "Tumor Grade III vs I",
size = "Tumor Size (mm)",
prog = "Progesterone (100 fmol/ml)",
estrg = "Estrogen (100 fmol/ml)")
tidy_cox |> # plot of hazard ratios and 95% CIs
ggplot(aes(y=term, x=estimate, xmin=conf.low, xmax=conf.high)) +
geom_pointrange(shape = 15) + # plots center point (x) in square and range (xmin, xmax)
geom_vline(xintercept=1, linetype = 2) + # vertical line at HR=1
scale_x_log10("Hazard ratio (95% CI)", # log scale for x-axis
breaks = c(0.5, 1, 2, 4)) + # log scale for x-axis
ggtitle("Cox Regression Results for GBC Data") + # Add title
theme_classic() + # classic theme for clean look
theme(
axis.line.y = element_blank(), # remove y-axis line
axis.ticks.y = element_blank(), # remove y-axis ticks
axis.text.y = element_text(size = 11), # set variable label size
axis.title.y = element_blank() # remove y-axis title
)
# ---------------------------
# 8. Prediction and Visualization
# ---------------------------
# Create new data for prediction
new_data <- data.frame(hormone = 2, meno = 2, age40 = 1, grade = factor(2),
size = 20, prog = 1, estrg = 1)
# Predict survival probabilities for `newdata`
pred_surv <- survfit(cox_fit, newdata = new_data[1, ])
tidy_pred_surv <- tidy(pred_surv) # Tidy the survival prediction output
head(tidy_pred_surv) # Display the first few rows of the tidy output
# Visualize predicted survival
pred_fig <- pred_surv |> # Pass the survfit object
ggsurvfit() + # Main function
add_confidence_interval() + # Add confidence interval
scale_x_continuous("Time (months)", breaks = seq(0, 84, by = 12)) + # x-axis format
scale_y_continuous("Relapse-free survival probability", limits = c(0, 1)) + # y-axis format
ggtitle("Predicted Relapse-Free Survival for a GBC Patient") + # Add title
theme_classic() # Classic theme for clean look
# Add horizontal grid lines
pred_fig + theme(panel.grid.major.y = element_line()) # Add horizontal grid lines
# ---------------------------
# 9. Cox Model Diagnostics
# ---------------------------
# PH assumption: Schoenfeld residuals
ph_test <- cox.zph(cox_fit) # Test proportional hazards assumption
ggcoxzph(ph_test) # Visualize Schoenfeld residuals
# Functional form: Martingale residuals vs linear predictor
ggcoxdiagnostics(cox_fit, type = "martingale", # martingale on y-axi
ox.scale = "linear.predictions") # linear predictor on x-axis
# Influential points: Deviance residuals
ggcoxdiagnostics(cox_fit, type = "deviance", # deviance on y-axis
ox.scale = "observation.id", # observation ID on x-axis
sline = FALSE) # no smoothed line
# ---------------------------
# 10. Fine-Gray Model (Competing Risks)
# ---------------------------
# Load trial dataset
data("trial", package = "tidycmprsk") # Load trial data from tidycmprsk package
head(trial) # Display the first few rows of the data
# Fit Fine-Gray model
fg_fit <- crr(Surv(ttdeath, death_cr) ~ trt + age + marker + stage, # fit FG model
failcode = "death from cancer", trial) # for death from cancer
fg_fit # print the Fine-Gray model fit summary
# Extract coefficients and variance
coef(fg_fit) # Extract coefficients
vcov(fg_fit) |> head() # Extract variance-covariance matrix
# Tidy FG model output
tidy_fg <- tidy(fg_fit, exponentiate = TRUE, conf.int = TRUE) # Tidy model output
tidy_fg # Display the tidy output
# Forest plot for sub-distribution hazard ratios
tidy_fg |> # plot of sub-distribution hazard ratios and 95% CIs
ggplot(aes(y=term, x=estimate, xmin=conf.low, xmax=conf.high)) +
geom_pointrange() + # plots center point (x) and range (xmin, xmax)
geom_vline(xintercept=1, linetype = 2) + # vertical line at HR=1
scale_x_log10("Sub-distribution hazard ratio (95% CI)") + # log scale for x-axis
theme_classic() + # classic theme for clean look
theme(
axis.line.y = element_blank(), # remove y-axis line
axis.ticks.y = element_blank(), # remove y-axis ticks
axis.text.y = element_text(size = 11), # set variable label size
axis.title.y = element_blank() # remove y-axis title
)
# FG regression table
fg_tbl <- fg_fit |> tbl_regression(exponentiate = TRUE) |> # Create a regression table
add_global_p() # Add global p-value for categorical variables
fg_tbl # display the regression table
# Model-based prediction for CIF
fg_pred <- predict(fg_fit, newdata= trial[1:10, ], times = c(6, 12, 18)) # Predict CIF
fg_pred # Display the predicted CIF