Libraries
import pandas as pa # data manipulation
import numpy as ny # math functions
import scipy.stats as scy # stats functions
import matplotlib.pyplot as pyp # plotting
from sklearn.linear_model import LinearRegression as LM # linear model
from sklearn.linear_model import Lasso # variable selection
from sklearn.preprocessing import StandardScaler # standardization
from sklearn.pipeline import make_pipeline # necessary lasso setup
from sklearn.metrics import mean_squared_error as mse # mean squared error
from sklearn.utils import resample # bootstrap functionality
Data Import
url="https://raw.githubusercontent.com/jlacasa/stat705_fall2024/main/classes/data/prostate.csv"
dd=pa.read_csv(url)
Data Split
# training and testing split
train=dd[dd['train']==True]
test=dd[dd['train']==False]
Exploratory Analysis
Exploratory analyses are my preferred first step in any statistics/data analysis project. I started by printing the head of the data to get an idea of variable names and data types (i.e., whether each variable is discrete or continuous).
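A quick shape/dtypes check answers the same variable-name and data-type question without eyeballing rows; this is a supplemental sketch using the dd DataFrame loaded above, not part of the original run.
# structural summary: dimensions and per-column dtypes
print(dd.shape) # (rows, columns)
print(dd.dtypes) # float vs. int vs. bool columns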
# first 10 rows of each column
print(dd.head(10))
     lcavol   lweight  age      lbph  svi       lcp  gleason  pgg45      lpsa  train
0 -0.579818  2.769459   50 -1.386294    0 -1.386294        6      0 -0.430783   True
1 -0.994252  3.319626   58 -1.386294    0 -1.386294        6      0 -0.162519   True
2 -0.510826  2.691243   74 -1.386294    0 -1.386294        7     20 -0.162519   True
3 -1.203973  3.282789   58 -1.386294    0 -1.386294        6      0 -0.162519   True
4  0.751416  3.432373   62 -1.386294    0 -1.386294        6      0  0.371564   True
5 -1.049822  3.228826   50 -1.386294    0 -1.386294        6      0  0.765468   True
6  0.737164  3.473518   64  0.615186    0 -1.386294        6      0  0.765468  False
7  0.693147  3.539509   58  1.536867    0 -1.386294        6      0  0.854415   True
8 -0.776529  3.539509   47 -1.386294    0 -1.386294        6      0  1.047319  False
9  0.223144  3.244544   63 -1.386294    0 -1.386294        6      0  1.047319  False
Next I plotted a histogram of my intended response variable, lpsa, to get a rough read on how the data is distributed.
# observing the response distribution
pyp.hist(train['lpsa'],color='grey')
pyp.title("PSA Histogram")
pyp.show()
pyp.close('all') # i make use of close('all') frequently
# i know there's better functions, i'm just a boomer
Then I plotted my response against several candidate predictor variables. I avoided pgg45 and svi because of their high incidence of \(0\) values, and lbph was honestly just missed in the moment.
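A more systematic alternative, and one that would also have caught lbph, is a single scatter matrix over every variable at once. This is a supplemental sketch using pandas' scatter_matrix; the individual plots I actually made follow below.
# all pairwise scatterplots for the training set in one figure
pa.plotting.scatter_matrix(train.drop(columns=['train']),figsize=(10,10),color='black')
pyp.suptitle("Pairwise Scatter Matrix")
pyp.show()
pyp.close('all')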
# weight by response
pyp.scatter(train['lweight'],train['lpsa'],color='black')
pyp.title("Weight / PSA")
pyp.show()
pyp.close('all')
# age by response
pyp.scatter(train['age'],train['lpsa'],color='black')
pyp.title("Age / PSA")
pyp.show()
pyp.close('all')
# gleason by response
pyp.scatter(train['gleason'],train['lpsa'],color='black')
pyp.title("Gleason / PSA")
pyp.show()
pyp.close('all')
# cavol by response
pyp.scatter(train['lcavol'],train['lpsa'],color='black')
pyp.title("Cavol / PSA")
pyp.show()
pyp.close('all')
# penetration by response
pyp.scatter(train['lcp'],train['lpsa'],color='black')
pyp.title("Penetration / PSA")
pyp.show()
pyp.close('all')
LASSO
I chose to pair my graphical selection with LASSO as a form of “fact check” for my gut feeling.
# everything about this was terrible
# just learn R
LY=train['lpsa'] # separated Y
# (unnecessary but helped with troubleshooting)
LX=train.drop(columns=['lpsa','train']) # full X matrix (drop the response and the split indicator)
lasso=Lasso(alpha=1.0) # regularization strength
m=make_pipeline(StandardScaler(),lasso) # pipeline with standardization
m.fit(LX,LY) # fit full model
alphas=ny.logspace(-6,1,100) # logscale vector
coefs=[] # empty coefficient list
l1_norms=[] # empty l1 norms list
scaler=StandardScaler() # standardization (removing mean)
LX_scaled=scaler.fit_transform(LX) # fit and transform to standardized vector
# lasso loop
for alpha in alphas:
    lasso=Lasso(alpha=alpha,max_iter=10000)
    lasso.fit(LX_scaled,LY)
    coefs.append(lasso.coef_)
    l1_norms.append(ny.sum(ny.abs(lasso.coef_)))
coefs=ny.array(coefs) # convert the coefficient list to an array
# final figure plotting
pyp.figure(figsize=(8,6)) # this was also terrible, just learn R
pyp.subplot(1,2,1)
for i in range(coefs.shape[1]):
    pyp.plot(l1_norms,coefs[:,i])
pyp.xlabel('L1 Norm')
pyp.ylabel('Coefficients')
pyp.axhline(0,color='black',linestyle='--',linewidth=0.5)
pyp.subplot(1,2,2)
for i in range(coefs.shape[1]):
    pyp.plot(ny.log10(alphas),coefs[:,i])
pyp.xlabel('Log Lambda')
pyp.ylabel('Coefficients')
pyp.axhline(0,color='black',linestyle='--',linewidth=0.5)
pyp.tight_layout()
pyp.show()
pyp.close('all')
Since I ended up fitting this essentially by hand (StackExchange did a ton of the heavy lifting), I skipped printing the coefficient values the way I did in R; it was cumbersome and went beyond my full understanding of the process.
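For reference, here is a hedged sketch of what printing those coefficients could look like, using LassoCV to choose the penalty by cross-validation instead of the fixed grid above. LassoCV is the only new piece; this is not part of the fit I actually used.
from sklearn.linear_model import LassoCV # cross-validated penalty choice
# refit on the standardized training predictors and print named coefficients
lasso_cv=LassoCV(cv=5,max_iter=10000).fit(LX_scaled,LY)
print("chosen alpha:",lasso_cv.alpha_)
for name,coef in zip(LX.columns,lasso_cv.coef_):
    print(f"{name}: {coef:.4f}")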
Model Proposals
I investigated two possible models; for simplicity's sake I capped each model at \(3\) predictors rather than treating that as a minimum. Both models include lweight and age, while the difference between them is the inclusion of gleason versus lcavol.
My goal was to make a simple comparison and get a high-performing model with good calibration, without muddying up the interpretation with too many variables.
Model 1
\[y_{i}=\beta_0+\beta_1w+\beta_2a+\beta_3t_{7}+\beta_4t_{8}+\beta_5t_{9}+\epsilon_{i}\]
\[ \text{Where} : t_k=\begin{cases} 1 & \text{Gleason}=k\\ 0 & \text{otherwise}\\ \end{cases} \quad \text{for } k\in\{7,8,9\}, \text{ with Gleason}=6 \text{ as the reference level} \]
\[\epsilon_{i} \sim N(0,\sigma^2)\]
Model 2
\[y_{i}=\beta_0+\beta_1w+\beta_2a+\beta_3c+\epsilon_{i}\]
\[\epsilon_{i} \sim N(0,\sigma^2)\]
Model Fitting
X=train[['lweight','age']] # base X matrix
# dummies serve as a viable analog to factorizing a value
gl_dum=pa.get_dummies(train['gleason'],
prefix='gleason',drop_first=True)
# bind the dummies to base X matrix for model 1
m1_X=pa.concat([X,gl_dum],axis=1)
# model 2 X matrix
m2_X=train[['lweight','age','lcavol']]
Y=train[['lpsa']] # response matrix
# i don't do enough statistics in python to have
# found a preference on library for lm() functions
# i used this one because it's simple and the documentation was easy
m1=LM().fit(m1_X,Y)
m2=LM().fit(m2_X,Y)
Model Evaluation
Outputs
# basic coefficient summaries
print(m1.intercept_,m1.coef_)
print(m2.intercept_,m2.coef_)
[-1.80148661] [[ 1.22428095 -0.01463659  1.24612983  0.55016093  1.05301552]]
[-0.59868012] [[ 0.77540056 -0.00927429  0.63937616]]
Graphical Evaluation
These graphical evaluations are just to filter out anything egregious, nothing beyond that.
# predictions to get residuals
m1_pred=m1.predict(m1_X)
m2_pred=m2.predict(m2_X)
# actual minus predicted
m1_res=Y-m1_pred
m2_res=Y-m2_pred
# residual vs fit m1
pyp.scatter(m1_pred,m1_res,color='black')
pyp.axhline(y=0,color='r',linestyle='--')
pyp.xlabel('Fit')
pyp.ylabel('Residuals')
pyp.title('Residuals vs Fitted Model 1')
pyp.show()
pyp.close('all')
# residual vs fit m2
pyp.scatter(m2_pred,m2_res,color='black')
pyp.axhline(y=0,color='r',linestyle='--')
pyp.xlabel('Fit')
pyp.ylabel('Residuals')
pyp.title('Residuals vs Fit Model 2')
pyp.show()
pyp.close('all')
# convert to numpy object
m1_res=m1_res.values
m1_res=m1_res.flatten() # flatten to reduce later errors
m2_res=m2_res.values
m2_res=m2_res.flatten()
# q-q residuals, a lot of my python friends have struggled with this
# i did too, this is a pretty consistent function for it
scy.probplot(m1_res,dist="norm",plot=pyp)
pyp.title("Q-Q Residuals Model 1")
pyp.show()
pyp.close('all')
scy.probplot(m2_res,dist="norm",plot=pyp)
pyp.title("Q-Q Residuals Model 2")
pyp.show()
pyp.close('all')
Calibration
I checked Mean Squared Error, Root Mean Squared Error, and coverage using the testing data.
# rinse/repeat the X and Y matrix prep
# but with the test set
X=test[['lweight','age']]
gl_dum_train=pa.get_dummies(train['gleason'],
prefix='gleason',drop_first=True)
gl_dum_test=pa.get_dummies(test['gleason'],
prefix='gleason',drop_first=True)
gl_dum_test=gl_dum_test.reindex(columns=gl_dum_train.columns,
fill_value=0)
m1_X_test=pa.concat([X,gl_dum_test],axis=1)
m2_X_test=test[['lweight','age','lcavol']]
Y_test=test[['lpsa']]
# predictions from the test set
m1_pred_t=m1.predict(m1_X_test)
m2_pred_t=m2.predict(m2_X_test)
MSE_m1=mse(Y_test,m1_pred_t) # mean squared error
rMSE_m1=ny.sqrt(MSE_m1) # root mean squared error
MSE_m2=mse(Y_test,m2_pred_t)
rMSE_m2=ny.sqrt(MSE_m2)
print(MSE_m1)
0.9167180524122771
print(rMSE_m1)
0.9574539427107066
print(MSE_m2)
0.4800778387223387
print(rMSE_m2)
0.6928764960094539
# dimensions of X matrix
n=m1_X.shape[0]
p=m1_X.shape[1]
# summary statistics for prediction intervals
rss=ny.sum(m1_res**2)
s2=rss/(n-p-1)
sigma=ny.sqrt(s2)
sigma=float(sigma) # this float haunted me
# if it isn't here everything breaks
# prediction from the test but flattened
m1_pred=m1.predict(m1_X_test).flatten()
# i just converted everything to a numpy object
# it took so long to make this work i lost my mind
m1_X_test_b0=ny.column_stack((ny.ones(m1_X_test.shape[0]),
m1_X_test.to_numpy().astype(float)))
m1_X_b0=ny.column_stack((ny.ones(m1_X.shape[0]),
m1_X.to_numpy().astype(float)))
# compute hat matrix
X_b0_inv=ny.linalg.inv(ny.dot(m1_X_b0.T,m1_X_b0))
H=ny.dot(m1_X_test_b0,X_b0_inv).dot(m1_X_test_b0.T)
m1_lev=ny.diag(H).flatten()
# t value for prediction interval
t_value=scy.t.ppf(0.975,df=n-p-1)
# upper and lower bounds
m1_lower=m1_pred-t_value*sigma*ny.sqrt(1+m1_lev)
m1_upper=m1_pred+t_value*sigma*ny.sqrt(1+m1_lev)
Y_test=Y_test.to_numpy().flatten()
# coverage of the prediction intervals
m1_cover=ny.mean((Y_test>m1_lower)&(Y_test<m1_upper))
print(m1_cover)
0.9333333333333333
n=m2_X.shape[0]
p=m2_X.shape[1]
rss=ny.sum(m2_res**2)
s2=rss/(n-p-1)
sigma=ny.sqrt(s2)
sigma=float(sigma)
m2_pred=m2.predict(m2_X_test).flatten()
m2_X_test_b0=ny.column_stack((ny.ones(m2_X_test.shape[0]),
m2_X_test.to_numpy().astype(float)))
m2_X_b0=ny.column_stack((ny.ones(m2_X.shape[0]),
m2_X.to_numpy().astype(float)))
X_b0_inv=ny.linalg.inv(ny.dot(m2_X_b0.T,m2_X_b0))
H=ny.dot(m2_X_test_b0,X_b0_inv).dot(m2_X_test_b0.T)
m2_lev=ny.diag(H).flatten()
t_value=scy.t.ppf(0.975,df=n-p-1)
m2_lower=m2_pred-t_value*sigma*ny.sqrt(1+m2_lev)
m2_upper=m2_pred+t_value*sigma*ny.sqrt(1+m2_lev)
m2_cover=ny.mean((Y_test>m2_lower)&(Y_test<m2_upper))
print(m2_cover)
0.9666666666666667
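As a sanity check on the hand-rolled intervals above, the same 95% prediction intervals can be pulled from an ordinary OLS fit. This is a supplemental sketch using statsmodels (not part of the original analysis), reusing the m1_X, m1_X_test, Y, and Y_test objects built earlier.
import statsmodels.api as sm # OLS with built-in prediction intervals
# refit model 1 with an explicit intercept column
ols_m1=sm.OLS(Y,sm.add_constant(m1_X.astype(float))).fit()
# 95% prediction intervals for the test set
pred=ols_m1.get_prediction(sm.add_constant(m1_X_test.astype(float)))
frame=pred.summary_frame(alpha=0.05)
cover_check=ny.mean((Y_test>frame['obs_ci_lower'].to_numpy())&
                    (Y_test<frame['obs_ci_upper'].to_numpy()))
print(cover_check) # should land close to m1_cover from above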
The difference between the two models is almost subjective, but Model 2 has the best overall performance across variable selection, the within-sample checks, and the out-of-sample checks.
Bootstrapping
I concluded by bootstrapping uncertainty estimates for the parameter estimates. I chose to include both models, since the difference between them is not severe enough to consider either one unusable.
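One caveat: resample draws are random, so the summaries below will shift slightly between runs. A minimal sketch of a reproducible version (my suggestion, not something the original run did) is to seed each draw inside the loop:
# reproducible variant of the resampling step used below:
# seeding each draw with its iteration index fixes the bootstrap sample
for i in range(5): # shortened loop purely for illustration
    X_boot,Y_boot=resample(m1_X,Y,replace=True,random_state=i)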
n=1000 # number of boot iterations
save_boot=[] # empty list for saving coefficients
for i in range(n):
    # sample with replacement
    X_boot,Y_boot=resample(m1_X,Y,replace=True)
    # fit sampled data
    m_boot=LM().fit(X_boot,Y_boot)
    # pull coefficients
    boot_out=ny.concatenate((m_boot.intercept_,m_boot.coef_.flatten()))
    # save coefficients
    save_boot.append(boot_out)
# organize and name coefficients
df_boot=pa.DataFrame(save_boot,columns=['b0','b1','b2','b3','b4','b5'])
# calculate summaries
boot_coefs=df_boot.describe(percentiles=[0.025, 0.975])
print(boot_coefs)
                 b0           b1           b2           b3           b4           b5
count   1000.000000  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      -1.845712     1.200705    -0.012513     1.232285     0.323455     0.981349
std        1.135697     0.291694     0.018407     0.231758     0.323871     0.397177
min       -5.312997     0.404303    -0.066541     0.489315    -0.340773     0.000000
2.5%      -3.990084     0.666926    -0.046332     0.806086     0.000000     0.000000
50%       -1.856850     1.197388    -0.013980     1.233752     0.307868     1.018819
97.5%      0.311719     1.819283     0.026523     1.726560     0.990818     1.683957
max        2.082668     2.265220     0.058537     1.989906     1.331466     2.079295
# bootstrap distributions of coefficients
pyp.hist(df_boot.b0,color='grey')
pyp.title('Model 1 Intercept')
mean=ny.mean(df_boot.b0)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')
pyp.hist(df_boot.b1,color='grey')
pyp.title('Model 1 b1')
mean=ny.mean(df_boot.b1)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')
pyp.hist(df_boot.b2,color='grey')
pyp.title('Model 1 b2')
mean=ny.mean(df_boot.b2)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')
pyp.hist(df_boot.b3,color='grey')
pyp.title('Model 1 b3')
mean=ny.mean(df_boot.b3)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')
n=1000
save_boot=[]
for i in range(n):
    X_boot,Y_boot=resample(m2_X,Y,replace=True)
    m_boot=LM().fit(X_boot,Y_boot)
    boot_out=ny.concatenate((m_boot.intercept_,m_boot.coef_.flatten()))
    save_boot.append(boot_out)
df_boot=pa.DataFrame(save_boot,columns=['b0','b1','b2','b3'])
boot_coefs=df_boot.describe(percentiles=[0.025,0.975])
print(boot_coefs)
                 b0           b1           b2           b3
count   1000.000000  1000.000000  1000.000000  1000.000000
mean      -0.707601     0.784005    -0.008224     0.641244
std        0.926261     0.210198     0.012804     0.076553
min       -3.970061     0.109116    -0.041309     0.350498
2.5%      -2.541435     0.398566    -0.029900     0.496291
50%       -0.652980     0.779620    -0.009063     0.640420
97.5%      1.011098     1.215844     0.019604     0.784977
max        2.189422     1.410757     0.057484     0.917710
pyp.hist(df_boot.b0,color='grey')
pyp.title('Model 2 Intercept')
mean=ny.mean(df_boot.b0)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')
pyp.hist(df_boot.b1,color='grey')
pyp.title('Model 2 b1')
mean=ny.mean(df_boot.b1)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')
pyp.hist(df_boot.b2,color='grey')
pyp.title('Model 2 b2')
mean=ny.mean(df_boot.b2)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')
pyp.hist(df_boot.b3,color='grey')
pyp.title('Model 2 b3')
mean=ny.mean(df_boot.b3)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')