Module 4. Semiparametric Regression Analysis

Slides

Section slides here. (To convert html to pdf, press E \(\to\) Print \(\to\) Destination: Save to pdf)

R code

Show the code
# -------------------------------------------
# Survival Analysis: Module 4 Code
# -------------------------------------------

# Load necessary packages
library(tidyverse) # Load tidyverse packages
library(survival) # Load survival package
library(broom) # Load broom package
library(gtsummary) # Load gtsummary package
library(ggsurvfit) # Load ggsurvfit package
library(survminer) # Load survminer package
library(tidycmprsk) # Load tidycmprsk package

# ---------------------------
# 1. Load and Prepare Data
# ---------------------------

# Load GBC dataset
gbc <- read.table("data/gbc.txt", header = TRUE) # Load GBC dataset

# Reformat the data
df <- gbc |>  # calculate time to first event (relapse or death)
  group_by(id) |> # group by id
  arrange(time) |> # sort rows by time
  slice(1) |>      # get the first row within each id
  ungroup() |>     # remove grouping
  mutate(
    age40 = ifelse(age >= 40, 1, 0), # create binary variable for age >= 40
    grade = factor(grade), # convert grade to factor
    prog = prog / 100, # rescale progesterone receptor
    estrg = estrg / 100 # rescale estrogen receptor
    ) 

# ---------------------------
# 2. Fit Cox PH Model
# ---------------------------

# Model fitting: survival::coxph()
cox_fit <- coxph(Surv(time, status) ~ hormone + meno + age40 + grade + size + prog + estrg, 
                 data = df)
summary(cox_fit) # Print model summary

# ---------------------------
# 3. Tidy Output with broom
# ---------------------------

tidy_cox <- tidy(cox_fit) # Tidy the coxph output
tidy_cox # Display the tidy output

# ---------------------------
# 4. Tabulate Results with gtsummary
# ---------------------------

cox_tbl <- cox_fit |> tbl_regression( # Create a regression table
                exponentiate = TRUE, # Exponentiate coefficients to get hazard ratios
                label = list(hormone ~ "Hormone Therapy", # Custom labels
                             meno ~ "Menopausal", 
                             age40 ~ "Older than 40", 
                             grade ~ "Tumor Grade", 
                             size ~ "Tumor Size (mm)", 
                             prog ~ "Progesterone Receptor (100 fmol/ml)", 
                             estrg ~ "Estrogen Receptor  (100 fmol/ml)")
               ) |> 
          add_global_p() # Add global p-value for categorical variables
cox_tbl # Display the regression table

# ---------------------------
# 5. Customize the Regression Table
# ---------------------------

cox_tbl |> # Start with the regression table
  modify_caption("Cox regression analysis of the German breast cancer study") |> # Add caption
  bold_p() |> # Bold significant p-values
  italicize_levels() # Italicize variable levels

# ---------------------------
# 6. Fit Accelerated Failure Time (AFT) Model
# ---------------------------

# Fit a Weibull AFT model
aft_fit <- survreg(Surv(time, status) ~ hormone + meno + age + grade + size + prog + estrg, 
                    data = df, dist = "weibull") # specify the Weibull model

# ---------------------------
# 7. Forest Plot of Hazard Ratios
# ---------------------------

# Tidy with exponentiated coeffs (HR) and CI
tidy_cox <- tidy(cox_fit, exponentiate = TRUE, conf.int = TRUE) 
tidy_cox$term <- recode(tidy_cox$term,           # Relabel the variables
                hormone = "Hormone Therapy", 
                meno = "Menopausal", 
                age40 = "Older than 40", 
                grade2 = "Tumor Grade II vs I",
                grade3 = "Tumor Grade III vs I",
                size = "Tumor Size (mm)", 
                prog = "Progesterone (100 fmol/ml)", 
                estrg = "Estrogen (100 fmol/ml)")

tidy_cox |>  # plot of hazard ratios and 95% CIs
  ggplot(aes(y=term, x=estimate, xmin=conf.low, xmax=conf.high)) + 
  geom_pointrange(shape = 15) +  # plots center point (x) in square and range (xmin, xmax)
  geom_vline(xintercept=1, linetype = 2) + # vertical line at HR=1
  scale_x_log10("Hazard ratio (95% CI)",    # log scale for x-axis
                breaks = c(0.5, 1, 2, 4)) + # log scale for x-axis
  ggtitle("Cox Regression Results for GBC Data") + # Add title
  theme_classic() + # classic theme for clean look
  theme(
    axis.line.y = element_blank(),          # remove y-axis line
    axis.ticks.y = element_blank(),         # remove y-axis ticks
    axis.text.y = element_text(size = 11),  # set variable label size
    axis.title.y = element_blank()          # remove y-axis title
  )

# ---------------------------
# 8. Prediction and Visualization
# ---------------------------

# Create new data for prediction
new_data <- data.frame(hormone = 2, meno = 2, age40 = 1, grade = factor(2), 
                            size = 20, prog = 1, estrg = 1)

# Predict survival probabilities for `newdata`
pred_surv <- survfit(cox_fit, newdata = new_data[1, ])
tidy_pred_surv <- tidy(pred_surv) # Tidy the survival prediction output
head(tidy_pred_surv) # Display the first few rows of the tidy output

# Visualize predicted survival
pred_fig <- pred_surv |>  # Pass the survfit object
  ggsurvfit() + # Main function
  add_confidence_interval() + # Add confidence interval
  scale_x_continuous("Time (months)", breaks = seq(0, 84, by = 12)) + # x-axis format
  scale_y_continuous("Relapse-free survival probability", limits = c(0, 1)) + # y-axis format
  ggtitle("Predicted Relapse-Free Survival for a GBC Patient") + # Add title
  theme_classic() # Classic theme for clean look

# Add horizontal grid lines
pred_fig + theme(panel.grid.major.y = element_line()) # Add horizontal grid lines

# ---------------------------
# 9. Cox Model Diagnostics
# ---------------------------

# PH assumption: Schoenfeld residuals
ph_test <- cox.zph(cox_fit) # Test proportional hazards assumption
ggcoxzph(ph_test) # Visualize Schoenfeld residuals

# Functional form: Martingale residuals vs linear predictor
ggcoxdiagnostics(cox_fit, type = "martingale",    # martingale on y-axi
                 ox.scale = "linear.predictions") # linear predictor on x-axis

# Influential points: Deviance residuals
ggcoxdiagnostics(cox_fit, type = "deviance",  # deviance on y-axis
                 ox.scale = "observation.id", # observation ID on x-axis
                 sline = FALSE)               # no smoothed line

# ---------------------------
# 10. Fine-Gray Model (Competing Risks)
# ---------------------------

# Load trial dataset
data("trial", package = "tidycmprsk") # Load trial data from tidycmprsk package
head(trial) # Display the first few rows of the data

# Fit Fine-Gray model
fg_fit <- crr(Surv(ttdeath, death_cr) ~ trt + age + marker + stage, # fit FG model
              failcode = "death from cancer", trial) # for death from cancer
fg_fit # print the Fine-Gray model fit summary

# Extract coefficients and variance
coef(fg_fit) # Extract coefficients
vcov(fg_fit) |> head() # Extract variance-covariance matrix

# Tidy FG model output
tidy_fg <- tidy(fg_fit, exponentiate = TRUE, conf.int = TRUE) # Tidy model output
tidy_fg # Display the tidy output

# Forest plot for sub-distribution hazard ratios
tidy_fg |>  # plot of sub-distribution hazard ratios and 95% CIs
  ggplot(aes(y=term, x=estimate, xmin=conf.low, xmax=conf.high)) + 
  geom_pointrange() +  # plots center point (x) and range (xmin, xmax)
  geom_vline(xintercept=1, linetype = 2) + # vertical line at HR=1
  scale_x_log10("Sub-distribution hazard ratio (95% CI)") + # log scale for x-axis
  theme_classic() + # classic theme for clean look
  theme(
    axis.line.y = element_blank(),          # remove y-axis line
    axis.ticks.y = element_blank(),         # remove y-axis ticks
    axis.text.y = element_text(size = 11),  # set variable label size
    axis.title.y = element_blank()          # remove y-axis title
  )

# FG regression table
fg_tbl <- fg_fit |> tbl_regression(exponentiate = TRUE) |>  # Create a regression table 
          add_global_p() # Add global p-value for categorical variables
fg_tbl # display the regression table

# Model-based prediction for CIF
fg_pred <- predict(fg_fit, newdata= trial[1:10, ], times = c(6, 12, 18)) # Predict CIF
fg_pred # Display the predicted CIF