Libraries

library(glmnet) # lasso
library(latex2exp) # math symbols in plots
library(mosaic) # bootstrapping and data manipulation

Data Import

url="https://raw.githubusercontent.com/jlacasa/stat705_fall2024/main/classes/data/prostate.csv"
dd_prostate=read.csv(url)

Data Split

# train and test split; the train column is already logical, so no string comparison is needed
train=filter(dd_prostate,train)
test=filter(dd_prostate,!train)
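As a quick sanity check on the split, counting rows per group is cheap (a minimal sketch, assuming the train column read in as logical):

# sanity check: the two split sizes should sum to nrow(dd_prostate)
table(dd_prostate$train)
c(nrow(train), nrow(test))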

Exploratory Analysis

Exploratory analyses are my preferred first step in any statistics/data analysis project. I'm starting by printing the head of my data to get an idea of variable names and data types (i.e., whether each variable is discrete or continuous).

# first 6 rows of each column
head(train)
##       lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
## 1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
## 2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
## 4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
## 6 -1.0498221 3.228826  50 -1.386294   0 -1.386294       6     0  0.7654678
##   train
## 1  TRUE
## 2  TRUE
## 3  TRUE
## 4  TRUE
## 5  TRUE
## 6  TRUE
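head() shows the values but not the storage types themselves; if you want those spelled out, str() is one option (a quick sketch):

# compact overview of each column's type and first few values
str(train)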

Next I plotted a histogram of my intended response variable, lpsa, to get a rough read on how the data is distributed.

# checking distribution assumptions of response
hist(train$lpsa)
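For a slightly more formal look at the shape than the histogram gives, a normal Q-Q plot of the response is one option (a sketch using base graphics):

# rough normality check on the response
qqnorm(train$lpsa, main="Normal Q-Q: lpsa")
qqline(train$lpsa)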

Then I plotted my response against various possible predictor variables. pgg45 and svi were skipped because of their high incidence of \(0\) values, and lbph was honestly just missed in the moment.

# plotting variables against response (graphical variable selection)
# note: most of these variables, including the response, are on the log scale
plot(train$lweight,train$lpsa,xlab="Log Weight",ylab="Log PSA")

plot(train$age,train$lpsa,xlab="Age",ylab="Log PSA")

plot(train$gleason,train$lpsa,xlab="Gleason Score",ylab="Log PSA")

plot(train$lcavol,train$lpsa,xlab="Log Cancer Volume",ylab="Log PSA")

plot(train$lcp,train$lpsa,xlab="Log Capsular Penetration",ylab="Log PSA")
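If you would rather see every pairwise relationship at once (including the lbph I missed), pairs() condenses the same idea into a single figure; a sketch, dropping the split flag first:

# all pairwise scatterplots in one shot, minus the train column
pairs(subset(train, select=-train), cex=0.5)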

LASSO

I chose to pair my graphical selection with LASSO as a form of “fact check” on my gut feeling.

# it's so much easier to do this in R it's not even funny
y=train$lpsa
X=model.matrix(lpsa~.,train)[,-1] # drop the intercept column; the train flag is constant here, so its coefficient stays at zero
fit_lasso=glmnet(X,y,family="gaussian",alpha=1,standardize=FALSE)
par(mfrow=c(1,2))
plot(fit_lasso)
plot(fit_lasso,xvar="lambda",label=TRUE)

coef(fit_lasso)[,100] # coefficients at the 100th (smallest) lambda on the path
##  (Intercept)       lcavol      lweight          age         lbph          svi 
##  0.404484828  0.574913700  0.608939008 -0.018893525  0.144364248  0.720031225 
##          lcp      gleason        pgg45    trainTRUE 
## -0.200094947 -0.022334689  0.009284446  0.000000000

Overall, I would only exclude lcp; everything else I would still consider viable, if for no other reason than to put the same burden of fact-checking onto LASSO.
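Reading coefficients off one column of the path is a bit arbitrary, and cross-validation is the more standard way to pick \(\lambda\). A sketch, assuming the same X and y as above:

# choose lambda by 10-fold cross-validation instead of eyeballing the path
set.seed(1)
cv_lasso=cv.glmnet(X, y, alpha=1)
coef(cv_lasso, s="lambda.1se") # most regularized fit within one SE of the CV minimum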

Model Proposals

I investigated two possible models. For simplicity's sake I capped my models at \(3\) predictors, rather than treating that as a minimum. Both models include lweight and age; the difference between them is the inclusion of gleason versus lcavol.

My goal was to make a simple comparison and end up with a high-performing, well-calibrated model without muddying the interpretation with too many variables.

Model 1

\[y_{ij}=\beta_0+\beta_1w_{ij}+\beta_2a_{ij}+t_j+\epsilon_{ij}\]

\[ \text{where } w_{ij} \text{ is lweight, } a_{ij} \text{ is age, and } t_j \text{ is the effect of Gleason score } j\in\{6,7,8,9\}, \text{ with } t_6=0 \text{ as the reference level} \]

\[\epsilon_{ij} \sim N(0,\sigma^2)\]

Model 2

\[y_{i}=\beta_0+\beta_1w_{i}+\beta_2a_{i}+\beta_3c_{i}+\epsilon_{i}\]

\[ \text{where } c_i \text{ is lcavol} \]

\[\epsilon_{i} \sim N(0,\sigma^2)\]

Model Fitting

m1=lm(lpsa~lweight+age+as.factor(gleason),data=train)
m2=lm(lpsa~lweight+age+lcavol,data=train)
summary(m1)
## 
## Call:
## lm(formula = lpsa ~ lweight + age + as.factor(gleason), data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81889 -0.55429 -0.03658  0.63347  2.40783 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.80149    1.16762  -1.543   0.1280    
## lweight              1.22428    0.25293   4.840 9.20e-06 ***
## age                 -0.01464    0.01739  -0.841   0.4034    
## as.factor(gleason)7  1.24613    0.25233   4.938 6.44e-06 ***
## as.factor(gleason)8  0.55016    0.96231   0.572   0.5696    
## as.factor(gleason)9  1.05302    0.58652   1.795   0.0775 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9232 on 61 degrees of freedom
## Multiple R-squared:   0.46,  Adjusted R-squared:  0.4157 
## F-statistic: 10.39 on 5 and 61 DF,  p-value: 3.061e-07
summary(m2)
## 
## Call:
## lm(formula = lpsa ~ lweight + age + lcavol, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.59665 -0.43183  0.00286  0.49425  1.93751 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.598680   0.984032  -0.608 0.545113    
## lweight      0.775401   0.213917   3.625 0.000579 ***
## age         -0.009274   0.013529  -0.686 0.495543    
## lcavol       0.639376   0.081225   7.872 5.98e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7645 on 63 degrees of freedom
## Multiple R-squared:  0.6176, Adjusted R-squared:  0.5994 
## F-statistic: 33.92 on 3 and 63 DF,  p-value: 3.591e-13
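For a quick numeric companion to the two summaries, information criteria penalize Model 1 for its extra factor levels; a sketch (lower is better for both):

# compare the two fits with information criteria
AIC(m1, m2)
BIC(m1, m2)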

Model Evaluation

Graphical Checks

These graphical evaluations are just to filter out anything egregious, nothing beyond that.

Model 1

# model check plotting
# it actually is this easy
par(mfrow=c(2,2)) # show all four diagnostic panels together
plot(m1)

Model 2

par(mfrow=c(2,2))
plot(m2)

Predictive Checks

I checked mean squared error, root mean squared error, and 95% prediction-interval coverage using the testing data.

# predict from test set
E.y.hat=predict(m1,newdata=test)
y=test$lpsa

mean((y-E.y.hat)^2) # mean squared error
## [1] 0.9167181
mean((y-E.y.hat)^2)^0.5 # root mean squared error
## [1] 0.9574539
# prediction intervals (one predict call; columns are fit, lwr, upr)
pi_m1=predict(m1,newdata=test,interval="prediction",level=0.95)
li=pi_m1[,"lwr"]
ui=pi_m1[,"upr"]

# checking if our data is within our prediction intervals
cover=ifelse(y>li,1,0)*ifelse(y<ui,1,0)
mean(cover)
## [1] 0.9333333
# repeat the same checks for model 2
E.y.hat=predict(m2,newdata=test)

mean((y-E.y.hat)^2) # mean squared error
## [1] 0.4800778
mean((y-E.y.hat)^2)^0.5 # root mean squared error
## [1] 0.6928765
# prediction intervals for model 2
pi_m2=predict(m2,newdata=test,interval="prediction",level=0.95)
li=pi_m2[,"lwr"]
ui=pi_m2[,"upr"]

# checking if our data is within our prediction intervals
cover=ifelse(y>li,1,0)*ifelse(y<ui,1,0)
mean(cover)
## [1] 0.9666667
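Since the same three checks run twice above, wrapping them in a small helper keeps the bookkeeping in one place. A minimal sketch; eval_preds is my own hypothetical name, not part of the original analysis:

# hypothetical helper: MSE, RMSE, and 95% prediction-interval coverage
eval_preds=function(model, newdata, y){
  pred=predict(model, newdata=newdata, interval="prediction", level=0.95)
  mse=mean((y-pred[,"fit"])^2)
  c(mse=mse, rmse=sqrt(mse), coverage=mean(y>pred[,"lwr"] & y<pred[,"upr"]))
}
eval_preds(m1, test, test$lpsa)
eval_preds(m2, test, test$lpsa)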

The difference between the two models is almost a matter of taste, but Model 2 has the best performance overall across variable selection, within-sample checks, and out-of-sample checks.

Uncertainty Estimates

I concluded by bootstrapping uncertainty estimates for \(\beta_0\), \(\beta_1\), and \(\beta_2\) for each model. I show just those three since they are the parameters the two models share, and I included both models since the difference between them is not severe enough to consider one unusable.

set.seed(1)
# so much faster to bootstrap this way
m1_boot=do(1000)*coef(lm(lpsa~lweight+age+as.factor(gleason),data=resample(train)))
m2_boot=do(1000)*coef(lm(lpsa~lweight+age+lcavol,data=resample(train)))
hist(m1_boot$Intercept,xlab=TeX("$\\beta_0$"),main="Model 1 Bootstrap")
abline(v=mean(m1_boot$Intercept),lwd=4,lty=2,col="gold")

hist(m2_boot$Intercept,xlab=TeX("$\\beta_0$"),main="Model 2 Bootstrap")
abline(v=mean(m2_boot$Intercept),lwd=4,lty=2,col="gold")

hist(m1_boot$lweight,xlab=TeX("$\\beta_1$"),main="Model 1 Bootstrap")
abline(v=mean(m1_boot$lweight),lwd=4,lty=2,col="gold")

hist(m2_boot$lweight,xlab=TeX("$\\beta_1$"),main="Model 2 Bootstrap")
abline(v=mean(m2_boot$lweight),lwd=4,lty=2,col="gold")

hist(m1_boot$age,xlab=TeX("$\\beta_2$"),main="Model 1 Bootstrap")
abline(v=mean(m1_boot$age),lwd=4,lty=2,col="gold")

hist(m2_boot$age,xlab=TeX("$\\beta_2$"),main="Model 2 Bootstrap")
abline(v=mean(m2_boot$age),lwd=4,lty=2,col="gold")
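The histograms can also be summarized numerically with percentile intervals from the same bootstrap draws; a sketch for the Model 2 lweight coefficient:

# 95% percentile bootstrap interval for beta_1 in Model 2
quantile(m2_boot$lweight, probs=c(0.025, 0.975))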