library(glmnet) # lasso
library(latex2exp) # math symbols in plots
library(mosaic) # bootstrapping and data manipulation

url="https://raw.githubusercontent.com/jlacasa/stat705_fall2024/main/classes/data/prostate.csv"
dd_prostate=read.csv(url)

# train and test split
train=filter(dd_prostate,train=="TRUE")
test=filter(dd_prostate,train=="FALSE")

Exploratory analyses are my preferred first step in any statistics/data analysis project. I'm starting by printing the head of my data to get an idea of variable names and data types (i.e., whether each variable is discrete or continuous).
# first 6 rows of each column
head(train)

##       lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
## 1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
## 2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
## 4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
## 6 -1.0498221 3.228826  50 -1.386294   0 -1.386294       6     0  0.7654678
##   train
## 1  TRUE
## 2  TRUE
## 3  TRUE
## 4  TRUE
## 5  TRUE
## 6  TRUE
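For a direct read on the data types mentioned above, a compact structure listing also works (this check is my addition, not part of the original code):

# variable names, storage types, and a few example values
str(train)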
Next I plotted a histogram of my intended response variable, lpsa, to get a rough read on how it is distributed.
# checking distribution assumptions of response
hist(train$lpsa)

Then I'm just plotting my response against various possible predictor variables. I avoided pgg45 and svi because of their high incidence of \(0\) values, and lbph was honestly just missed in the moment.

# plotting variables against response (graphical variable selection)
plot(train$lweight,train$lpsa,xlab="Weight",ylab="PSA")
plot(train$age,train$lpsa,xlab="Age",ylab="PSA")
plot(train$gleason,train$lpsa,xlab="Gleason",ylab="PSA")
plot(train$lcavol,train$lpsa,xlab="Cavol",ylab="PSA")
plot(train$lcp,train$lpsa,xlab="Capsular Penetration",ylab="PSA")

I chose to pair my graphical selection with LASSO as a form of "fact check" for my gut feeling.
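As a compact alternative to the individual plots above (my addition, not part of the original workflow), a scatterplot matrix would show all of these relationships at once:

# scatterplot matrix of the response and the candidate predictors
pairs(train[,c("lpsa","lweight","age","gleason","lcavol","lcp")])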
# it's so much easier to do this in R it's not even funny
y=train$lpsa 
X=model.matrix(lpsa~.,train)[,-1]
fit_lasso=glmnet(X,y,family=c("gaussian"),alpha=1,standardize=FALSE)
par(mfrow=c(1,2))
plot(fit_lasso)
plot(fit_lasso,xvar="lambda",label=TRUE)

coef(fit_lasso)[,100]

##  (Intercept)       lcavol      lweight          age         lbph          svi 
##  0.404484828  0.574913700  0.608939008 -0.018893525  0.144364248  0.720031225 
##          lcp      gleason        pgg45    trainTRUE 
## -0.200094947 -0.022334689  0.009284446  0.000000000

Overall I would only exclude lcp; everything else I would still consider viable, if for no other reason than to put the same fact-checking burden onto LASSO.
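Note that coef(fit_lasso)[,100] reads off the coefficients at the 100th (typically the smallest, least-penalized) \(\lambda\) on the path. As an optional cross-check that is my addition rather than part of the original analysis, cross-validation could pick \(\lambda\) instead:

# sketch: cross-validated choice of lambda for the same lasso problem
set.seed(1)
cv_lasso=cv.glmnet(X,y,family="gaussian",alpha=1,standardize=FALSE)
plot(cv_lasso)
coef(cv_lasso,s="lambda.1se") # coefficients at the more conservative lambda choice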
I investigated two possible models. For simplicity's sake I capped my models at \(3\) predictors, rather than considering that a minimum. Both models include lweight and age, while the difference between them is the inclusion of gleason versus lcavol.
My goal was to make a simple comparison and get a high-performing model with good calibration, without muddying the interpretation with too many variables.
Model 1
\[y_{ij}=\beta_0+\beta_1w_{ij}+\beta_2a_{ij}+\tau_j+\epsilon_{ij}\]
\[ \text{Where} : \tau_j=\begin{cases} 0 & \text{Gleason}=6\\ \beta_3 & \text{Gleason}=7\\ \beta_4 & \text{Gleason}=8\\ \beta_5 & \text{Gleason}=9\\ \end{cases} \]
\[\epsilon_{ij} \sim N(0,\sigma^2)\]
Model 2
\[y_{i}=\beta_0+\beta_1w_{i}+\beta_2a_{i}+\beta_3c_{i}+\epsilon_{i}\]
\[\epsilon_{i} \sim N(0,\sigma^2)\]
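To see exactly how as.factor(gleason) in Model 1 becomes indicator columns for Gleason scores 7, 8, and 9, the design matrix can be inspected directly (this check is my addition, not part of the original code):

# sketch: first rows of the Model 1 design matrix, showing the gleason dummy columns
head(model.matrix(lpsa~lweight+age+as.factor(gleason),data=train))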
m1=lm(lpsa~lweight+age+as.factor(gleason),data=train)
m2=lm(lpsa~lweight+age+lcavol,data=train)

summary(m1)

## 
## Call:
## lm(formula = lpsa ~ lweight + age + as.factor(gleason), data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81889 -0.55429 -0.03658  0.63347  2.40783 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.80149    1.16762  -1.543   0.1280    
## lweight              1.22428    0.25293   4.840 9.20e-06 ***
## age                 -0.01464    0.01739  -0.841   0.4034    
## as.factor(gleason)7  1.24613    0.25233   4.938 6.44e-06 ***
## as.factor(gleason)8  0.55016    0.96231   0.572   0.5696    
## as.factor(gleason)9  1.05302    0.58652   1.795   0.0775 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9232 on 61 degrees of freedom
## Multiple R-squared:   0.46,  Adjusted R-squared:  0.4157 
## F-statistic: 10.39 on 5 and 61 DF,  p-value: 3.061e-07

summary(m2)

## 
## Call:
## lm(formula = lpsa ~ lweight + age + lcavol, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.59665 -0.43183  0.00286  0.49425  1.93751 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.598680   0.984032  -0.608 0.545113    
## lweight      0.775401   0.213917   3.625 0.000579 ***
## age         -0.009274   0.013529  -0.686 0.495543    
## lcavol       0.639376   0.081225   7.872 5.98e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7645 on 63 degrees of freedom
## Multiple R-squared:  0.6176, Adjusted R-squared:  0.5994 
## F-statistic: 33.92 on 3 and 63 DF,  p-value: 3.591e-13

These graphical evaluations are just to filter out anything egregious, nothing beyond that.
# model check plotting
# it actually is this easy
plot(m1)
plot(m2)

I checked mean squared error (MSE), root mean squared error (RMSE), and prediction-interval coverage using the testing data.
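The same three checks are computed for each model below; as a compact alternative (the helper name is my own, not from the original code), they could be wrapped in one function:

# sketch: MSE, RMSE, and 95% prediction-interval coverage on held-out data
oos_metrics=function(model,newdata,level=0.95){
  y=newdata$lpsa
  pred=predict(model,newdata=newdata,interval="prediction",level=level)
  mse=mean((y-pred[,"fit"])^2)
  c(mse=mse,rmse=sqrt(mse),coverage=mean(y>pred[,"lwr"] & y<pred[,"upr"]))
}
# usage: oos_metrics(m1,test); oos_metrics(m2,test)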
# predict from test set
E.y.hat=predict(m1,newdata=test)
y=test$lpsa
mean((y-E.y.hat)^2) # mean squared error

## [1] 0.9167181

mean((y-E.y.hat)^2)^0.5 # root mean squared error

## [1] 0.9574539

# prediction intervals
li=predict(m1,newdata=test,interval="prediction",level=0.95)[,2]
ui=predict(m1,newdata=test,interval="prediction",level=0.95)[,3]
# checking if our data is within our prediction intervals
cover=ifelse(y>li,1,0)*ifelse(y<ui,1,0)
mean(cover)

## [1] 0.9333333

E.y.hat=predict(m2,newdata=test)
mean((y-E.y.hat)^2) # mean squared error

## [1] 0.4800778

mean((y-E.y.hat)^2)^0.5 # root mean squared error

## [1] 0.6928765

# prediction intervals
li=predict(m2,newdata=test,interval="prediction",level=0.95)[,2]
ui=predict(m2,newdata=test,interval="prediction",level=0.95)[,3]
# checking if our data is within our prediction intervals
cover=ifelse(y>li,1,0)*ifelse(y<ui,1,0)
mean(cover)

## [1] 0.9666667

The difference between the two models is almost subjective, but Model 2 does have the best overall performance across variable selection, within-sample checks, and out-of-sample checks.
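Before moving on to uncertainty estimates, information criteria give one more within-sample comparison (this is my addition, not part of the original write-up):

# sketch: compare the two fitted models with information criteria
AIC(m1,m2)
BIC(m1,m2)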
I concluded by bootstrapping uncertainty estimates for \(\beta_0\), \(\beta_1\), and \(\beta_2\) for each model. I show just those three since they are the coefficients shared between the two models, and I included both models since the difference between them is not severe enough to consider one unusable.
set.seed(1)
# so much faster to bootstrap this way
m1_boot=do(1000)*coef(lm(lpsa~lweight+age+as.factor(gleason),data=resample(train)))
m2_boot=do(1000)*coef(lm(lpsa~lweight+age+lcavol,data=resample(train)))

hist(m1_boot$Intercept,xlab=TeX("$\\beta_0$"),main="Model 1 Bootstrap")
abline(v=mean(m1_boot$Intercept),lwd=4,lty=2,col="gold")

hist(m2_boot$Intercept,xlab=TeX("$\\beta_0$"),main="Model 2 Bootstrap")
abline(v=mean(m2_boot$Intercept),lwd=4,lty=2,col="gold")

hist(m1_boot$lweight,xlab=TeX("$\\beta_1$"),main="Model 1 Bootstrap")
abline(v=mean(m1_boot$lweight),lwd=4,lty=2,col="gold")

hist(m2_boot$lweight,xlab=TeX("$\\beta_1$"),main="Model 2 Bootstrap")
abline(v=mean(m2_boot$lweight),lwd=4,lty=2,col="gold")

hist(m1_boot$age,xlab=TeX("$\\beta_2$"),main="Model 1 Bootstrap")
abline(v=mean(m1_boot$age),lwd=4,lty=2,col="gold")

hist(m2_boot$age,xlab=TeX("$\\beta_2$"),main="Model 2 Bootstrap")
abline(v=mean(m2_boot$age),lwd=4,lty=2,col="gold")
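The histograms show the shape of each bootstrap distribution; percentile intervals (my addition) summarize the same draws numerically, for example for \(\beta_1\) and \(\beta_2\):

# sketch: 95% percentile bootstrap intervals for the shared slope coefficients
quantile(m1_boot$lweight,probs=c(0.025,0.975))
quantile(m2_boot$lweight,probs=c(0.025,0.975))
quantile(m1_boot$age,probs=c(0.025,0.975))
quantile(m2_boot$age,probs=c(0.025,0.975))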