Module 4. Semiparametric Regression Analysis
Department of Biostatistics & Medical Informatics
University of Wisconsin-Madison
Aug 3, 2025
df <- gbc |>  # calculate time to first event (relapse or death)
  group_by(id) |> # group by id
  arrange(time) |> # sort rows by time
  slice(1) |>      # get the first row within each id
  ungroup() |>     # remove grouping
  mutate(
    age40 = ifelse(age >= 40, 1, 0), # create binary variable for age >= 40
    grade = factor(grade), # convert grade to factor
    prog = prog / 100, # rescale progesterone receptor
    estrg = estrg / 100 # rescale estrogen receptor
    ) survival::coxph()library(survival) # Load survival package
cox_fit <- coxph(Surv(time, status) ~ hormone + meno + age40 + grade + size + prog + estrg, 
                 data = df)
summary(cox_fit) # Print model summaryCall:
coxph(formula = Surv(time, status) ~ hormone + meno + age40 + 
    grade + size + prog + estrg, data = df)
  n= 686, number of events= 299 
            coef exp(coef) se(coef)      z Pr(>|z|)    
hormone -0.37432   0.68776  0.12917 -2.898 0.003758 ** 
meno     0.28450   1.32909  0.13973  2.036 0.041748 *  
age40   -0.55127   0.57622  0.20243 -2.723 0.006463 ** 
grade2   0.71547   2.04514  0.24854  2.879 0.003993 ** 
grade3   0.77465   2.16982  0.26970  2.872 0.004075 ** 
size     0.01606   1.01619  0.00368  4.365 1.27e-05 ***
prog    -0.22400   0.79932  0.05776 -3.878 0.000105 ***
estrg    0.01204   1.01212  0.04680  0.257 0.796895    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
        exp(coef) exp(-coef) lower .95 upper .95
hormone    0.6878     1.4540    0.5339    0.8859
meno       1.3291     0.7524    1.0107    1.7478
age40      0.5762     1.7355    0.3875    0.8568
grade2     2.0451     0.4890    1.2565    3.3287
grade3     2.1698     0.4609    1.2790    3.6812
size       1.0162     0.9841    1.0089    1.0236
prog       0.7993     1.2511    0.7138    0.8951
estrg      1.0121     0.9880    0.9234    1.1093
Concordance= 0.661  (se = 0.016 )
Likelihood ratio test= 78.79  on 8 df,   p=9e-14
Wald test            = 68.82  on 8 df,   p=8e-12
Score (logrank) test = 69.51  on 8 df,   p=6e-12
coxph() Outputbroom package: broom::tidy()
library(broom) # Load broom package
tidy_cox <- tidy(cox_fit) # Tidy the coxph output
tidy_cox # Display the tidy output# A tibble: 8 × 5
  term    estimate std.error statistic   p.value
  <chr>      <dbl>     <dbl>     <dbl>     <dbl>
1 hormone  -0.374    0.129      -2.90  0.00376  
2 meno      0.284    0.140       2.04  0.0417   
3 age40    -0.551    0.202      -2.72  0.00646  
4 grade2    0.715    0.249       2.88  0.00399  
5 grade3    0.775    0.270       2.87  0.00408  
6 size      0.0161   0.00368     4.37  0.0000127
7 prog     -0.224    0.0578     -3.88  0.000105 
8 estrg     0.0120   0.0468      0.257 0.797    
gtsummary (I)gtsummary package: tbl_regression()
library(gtsummary) # Load gtsummary package
cox_tbl <- cox_fit |> tbl_regression( # Create a regression table
                exponentiate = TRUE, # Exponentiate coefficients to get hazard ratios
                label = list(hormone ~ "Hormone Therapy", # Custom labels
                             meno ~ "Menopausal", 
                             age40 ~ "Older than 40", 
                             grade ~ "Tumor Grade", 
                             size ~ "Tumor Size (mm)", 
                             prog ~ "Progesterone Receptor (100 fmol/ml)", 
                             estrg ~ "Estrogen Receptor  (100 fmol/ml)")
               ) |> 
          add_global_p() # Add global p-value for categorical variables
cox_tbl # Display the regression tablegtsummary (II)| Characteristic | HR1 | 95% CI1 | p-value | 
|---|---|---|---|
| Hormone Therapy | 0.69 | 0.53, 0.89 | 0.003 | 
| Menopausal | 1.33 | 1.01, 1.75 | 0.039 | 
| Older than 40 | 0.58 | 0.39, 0.86 | 0.009 | 
| Tumor Grade | 0.004 | ||
| 1 | — | — | |
| 2 | 2.05 | 1.26, 3.33 | |
| 3 | 2.17 | 1.28, 3.68 | |
| Tumor Size (mm) | 1.02 | 1.01, 1.02 | <0.001 | 
| Progesterone Receptor (100 fmol/ml) | 0.80 | 0.71, 0.90 | <0.001 | 
| Estrogen Receptor (100 fmol/ml) | 1.01 | 0.92, 1.11 | 0.8 | 
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
modify_header(): update column headersmodify_footnote_header(): update column header footnotemodify_footnote_body(): update table body footnotemodify_caption(): update table caption/titlebold_labels(): bold variable labelsbold_levels(): bold variable levelsitalicize_labels(): italicize variable labelsitalicize_levels(): italicize variable levelsbold_p(): bold significant p-valuestbl_regression()
survival::survreg()Exercise
survreg object aft_fit using broom::tidy()gtsummary::tbl_regression()# Tidy with exponentiated coeffs (HR) and CI
tidy_cox <- tidy(cox_fit, exponentiate = TRUE, conf.int = TRUE) 
tidy_cox$term <- recode(tidy_cox$term,           # Relabel the variables
                hormone = "Hormone Therapy", 
                meno = "Menopausal", 
                age40 = "Older than 40", 
                grade2 = "Tumor Grade II vs I",
                grade3 = "Tumor Grade III vs I",
                size = "Tumor Size (mm)", 
                prog = "Progesterone (100 fmol/ml)", 
                estrg = "Estrogen (100 fmol/ml)")
tidy_cox |>  # plot of hazard ratios and 95% CIs
  ggplot(aes(y=term, x=estimate, xmin=conf.low, xmax=conf.high)) + 
  geom_pointrange() +  # plots center point (x) and range (xmin, xmax)
  geom_vline(xintercept=1, linetype = 2) + # vertical line at HR=1
  scale_x_log10("Hazard ratio (95% CI)") + # log scale for x-axis
  theme_classic() + # classic theme for clean look
  theme(
    axis.line.y = element_blank(),          # remove y-axis line
    axis.ticks.y = element_blank(),         # remove y-axis ticks
    axis.text.y = element_text(size = 11),  # set variable label size
    axis.title.y = element_blank()          # remove y-axis title
  )tidy_cox |> 
  ggplot(aes(y=term, x=estimate, xmin=conf.low, xmax=conf.high)) + 
  geom_pointrange(shape = 15) +  # plots center point (x) in square and range (xmin, xmax)
  geom_vline(xintercept=1, linetype = 2) + # vertical line at HR=1
  scale_x_log10("Hazard ratio (95% CI)",    # log scale for x-axis
                breaks = c(0.5, 1, 2, 4)) + # log scale for x-axis
  ggtitle("Cox Regression Results for GBC Data") + # Add title
  theme_classic() + # classic theme for clean look
  theme(
    axis.line.y = element_blank(),          # remove y-axis line
    axis.ticks.y = element_blank(),         # remove y-axis ticks
    axis.text.y = element_text(size = 11),  # set variable label size
    axis.title.y = element_blank()          # remove y-axis title
  )Predicted survival function \[ \hat S(t \mid z) = \exp\left\{- \exp(\hat\beta^\mathrm{T} z) \hat\Lambda_0(t)\right\} \]
Prepare new data for prediction
Use survival::survfit() to predict survival probabilities
newdata: new data for predictiontimes: time points for predictionbroom::tidy() to tidy the output# Predict survival probabilities for `newdata`
pred_surv <- survfit(cox_fit, newdata = new_data[1, ])
tidy_pred_surv <- tidy(pred_surv) # Tidy the survival prediction output
head(tidy_pred_surv) # Display the first few rows of the tidy output# A tibble: 6 × 8
   time n.risk n.event n.censor estimate std.error conf.high conf.low
  <dbl>  <dbl>   <dbl>    <dbl>    <dbl>     <dbl>     <dbl>    <dbl>
1 0.262    686       0        1        1         0         1        1
2 0.492    685       0        1        1         0         1        1
3 0.525    684       0        1        1         0         1        1
4 0.557    683       0        2        1         0         1        1
5 0.590    681       0        1        1         0         1        1
6 0.951    680       0        1        1         0         1        1
ggsurvfit package: ggsurvfit()
survfit object to ggsurvfit()library(ggsurvfit) # Load ggsurvfit package
pred_fig <- pred_surv |>  # Pass the survfit object
  ggsurvfit() + # Main function
  add_confidence_interval() + # Add confidence interval
  scale_x_continuous("Time (months)", breaks = seq(0, 84, by = 12)) + # x-axis format
  scale_y_continuous("Relapse-free survival probability", limits = c(0, 1)) + # y-axis format
  ggtitle("Predicted Relapse-Free Survival for a GBC Patient") + # Add title
  theme_classic() # Classic theme for clean lookcox.zph() to test PH assumptionsurvminer::ggcoxzph() on cox.zph object to visualize Schoenfeld residualsresiduals(cox_fit, type = "martingale") to get martingale residualssurvminer::ggcoxdiagnostics()Check proportionality
ggcoxdiagnostics()
coxph objecttype: Residual type (“martingale”, “deviance”, “score”, “schoenfeld”, “dfbeta”, “dfbetas”, and “scaledsch”)`ox.scale: Scale for x-axis (“linear.predictions”, “observation.id”, “time”)point.col: Color of pointspoint.size: Size of pointssurvminer
library(tidycmprsk) # Load tidycmprsk package
data("trial", package = "tidycmprsk") # Load trial data from tidycmprsk package
head(trial) # Display the first few rows of the data# A tibble: 6 × 9
  trt      age marker stage grade response death death_cr           ttdeath
  <chr>  <dbl>  <dbl> <fct> <fct>    <int> <int> <fct>                <dbl>
1 Drug A    23  0.16  T1    II           0     0 censor                24  
2 Drug B     9  1.11  T2    I            1     0 censor                24  
3 Drug A    31  0.277 T1    II           0     0 censor                24  
4 Drug A    NA  2.07  T3    III          1     1 death other causes    17.6
5 Drug A    51  2.77  T4    III          1     1 death other causes    16.4
6 Drug B    39  0.613 T4    I            0     1 death from cancer     15.6
cmprsk::crr()
formula: Surv(time, status) ~ covariates
status: a factor with first level indicating censoring and subsequent levels the competing risksfailcode: event code for the cause of interestfg_fit <- crr(Surv(ttdeath, death_cr) ~ trt + age + marker + stage, # fit FG model
              failcode = "death from cancer", trial) # for death from cancer21 cases omitted due to missing values
Variable    Coef     SE      HR     95% CI       p-value    
trtDrug B   0.396    0.283   1.49   0.85, 2.59   0.16       
age         0.009    0.011   1.01   0.99, 1.03   0.42       
marker      -0.002   0.159   1.00   0.73, 1.36   0.99       
stageT2     0.140    0.475   1.15   0.45, 2.92   0.77       
stageT3     0.500    0.460   1.65   0.67, 4.06   0.28       
stageT4     0.959    0.418   2.61   1.15, 5.91   0.022      
   trtDrug B          age       marker      stageT2      stageT3      stageT4 
 0.396486121  0.008956935 -0.002006717  0.140251967  0.500394141  0.958640247 
              [,1]          [,2]          [,3]         [,4]          [,5]
[1,]  0.0800665101  0.0001535045 -0.0051801922  0.011605373  0.0094601803
[2,]  0.0001535045  0.0001239790 -0.0005094795  0.001111245 -0.0009368412
[3,] -0.0051801922 -0.0005094795  0.0251827414 -0.028697647 -0.0037297926
[4,]  0.0116053732  0.0011112447 -0.0286976466  0.225187822  0.1101888313
[5,]  0.0094601803 -0.0009368412 -0.0037297926  0.110188831  0.2111942725
[6,]  0.0219362509  0.0010459739 -0.0176113331  0.124589145  0.1064088264
             [,6]
[1,]  0.021936251
[2,]  0.001045974
[3,] -0.017611333
[4,]  0.124589145
[5,]  0.106408826
[6,]  0.174446817
broom package: broom::tidy()
tidy_fg <- tidy(fg_fit, exponentiate = TRUE, conf.int = TRUE) # Tidy model output
tidy_fg # Display the tidy output# A tibble: 6 × 7
  term      estimate std.error statistic conf.low conf.high p.value
  <chr>        <dbl>     <dbl>     <dbl>    <dbl>     <dbl>   <dbl>
1 trtDrug B    1.49     0.283     1.40      0.854      2.59   0.16 
2 age          1.01     0.0111    0.804     0.987      1.03   0.42 
3 marker       0.998    0.159    -0.0126    0.731      1.36   0.99 
4 stageT2      1.15     0.475     0.296     0.454      2.92   0.77 
5 stageT3      1.65     0.460     1.09      0.670      4.06   0.28 
6 stageT4      2.61     0.418     2.30      1.15       5.91   0.022
# tidy_fg$term <- recode(tidy_fg$term, ...) # Relabel the variables
tidy_fg |>  # plot of sub-distribution hazard ratios and 95% CIs
  ggplot(aes(y=term, x=estimate, xmin=conf.low, xmax=conf.high)) + 
  geom_pointrange() +  # plots center point (x) and range (xmin, xmax)
  geom_vline(xintercept=1, linetype = 2) + # vertical line at HR=1
  scale_x_log10("Sub-distribution hazard ratio (95% CI)") + # log scale for x-axis
  theme_classic() + # classic theme for clean look
  theme(
    axis.line.y = element_blank(),          # remove y-axis line
    axis.ticks.y = element_blank(),         # remove y-axis ticks
    axis.text.y = element_text(size = 11),  # set variable label size
    axis.title.y = element_blank()          # remove y-axis title
  )gtsummary package: tbl_regression()
coxph object| Characteristic | HR1 | 95% CI1 | p-value | 
|---|---|---|---|
| Chemotherapy Treatment | 0.2 | ||
| Drug A | — | — | |
| Drug B | 1.49 | 0.85, 2.59 | |
| Age | 1.01 | 0.99, 1.03 | 0.4 | 
| Marker Level (ng/mL) | 1.00 | 0.73, 1.36 | >0.9 | 
| T Stage | 0.058 | ||
| T1 | — | — | |
| T2 | 1.15 | 0.45, 2.92 | |
| T3 | 1.65 | 0.67, 4.06 | |
| T4 | 2.61 | 1.15, 5.91 | |
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
# Predict cumulative incidence function for first 10 patients
fg_pred <- predict(fg_fit, newdata= trial[1:10, ], times = c(6, 12, 18)) # Predict CIF
fg_pred # Display the predicted CIF$`time 6`
 [1] 0.002279375 0.003430702 0.002447917          NA 0.007579470 0.010150002
 [7] 0.002582498 0.002462688 0.002448570 0.006155050
$`time 12`
 [1] 0.02093164 0.03135502 0.02246374         NA 0.06809922 0.09023665
 [7] 0.02368558 0.02259790 0.02246967 0.05562630
$`time 18`
 [1] 0.07090010 0.10483563 0.07594466         NA 0.21744137 0.28018810
 [7] 0.07995368 0.07638548 0.07596414 0.18042196
broom and gtsummaryggplot2survival::survfit() and ggsurvfit()survminercmprsk::crr()broom and gtsummarytidymodels packages (censored)