705assign4_guide_pyth
Libraries
import pandas as pa # data manipulation
import numpy as ny # math functions
import scipy.stats as scy # stats functions
import matplotlib.pyplot as pyp # plotting
from sklearn.linear_model import LinearRegression as LM # linear model
from sklearn.linear_model import Lasso # variable selection
from sklearn.preprocessing import StandardScaler # standardization
from sklearn.pipeline import make_pipeline # necessary lasso setup
from sklearn.metrics import mean_squared_error as mse # mean squared error
from sklearn.utils import resample # bootstrap functionality
Data Import
="https://raw.githubusercontent.com/jlacasa/stat705_fall2024/main/classes/data/prostate.csv"
url=pa.read_csv(url) dd
Data Split
# training and testing split
train = dd[dd['train']==True]
test = dd[dd['train']==False]
Exploratory Analysis
Exploratory analyses are my preferred first step in any statistics/data analysis project. I start by printing the head of my data to get an idea of variable names and data types (i.e., whether each variable is discrete or continuous).
# first 10 rows of each column
print(dd.head(10))
lcavol lweight age lbph svi lcp gleason pgg45 lpsa \
0 -0.579818 2.769459 50 -1.386294 0 -1.386294 6 0 -0.430783
1 -0.994252 3.319626 58 -1.386294 0 -1.386294 6 0 -0.162519
2 -0.510826 2.691243 74 -1.386294 0 -1.386294 7 20 -0.162519
3 -1.203973 3.282789 58 -1.386294 0 -1.386294 6 0 -0.162519
4 0.751416 3.432373 62 -1.386294 0 -1.386294 6 0 0.371564
5 -1.049822 3.228826 50 -1.386294 0 -1.386294 6 0 0.765468
6 0.737164 3.473518 64 0.615186 0 -1.386294 6 0 0.765468
7 0.693147 3.539509 58 1.536867 0 -1.386294 6 0 0.854415
8 -0.776529 3.539509 47 -1.386294 0 -1.386294 6 0 1.047319
9 0.223144 3.244544 63 -1.386294 0 -1.386294 6 0 1.047319
train
0 True
1 True
2 True
3 True
4 True
5 True
6 False
7 True
8 False
9 False
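As a quick complement to the head print (a minimal sketch, not part of my original workflow), pandas can report the column types and dimensions directly:
# column types: floats are the log-transformed continuous measures,
# integers are scores/counts (age, gleason, pgg45), and train is the split flag
print(dd.dtypes)
print(dd.shape) # (rows, columns)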
Next I plotted a histogram of my intended response variable, lpsa, to get a rough read on how the data is distributed.
# observing the response distribution
pyp.hist(train['lpsa'], color='grey')
pyp.title("PSA Histogram")
pyp.show()
pyp.close('all') # i make use of close('all') frequently
# i know there's better functions, im just a boomer
Then I’m just plotting my response against various possible predictor variables. pgg45 and svi were avoided because of their high incidence of \(0\) values, and lbph was honestly just missed in the moment.
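As a quick check on that zero-heaviness (a minimal sketch, not one of the original plots), counting zeros per candidate predictor in the training set makes the point numerically:
# share of exact-zero values per candidate predictor
for col in ['pgg45', 'svi', 'lweight', 'age', 'lcavol', 'lcp']:
    print(col, (train[col] == 0).mean())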
# weight by response
pyp.scatter(train['lweight'], train['lpsa'], color='black')
pyp.title("Weight / PSA")
pyp.show()
pyp.close('all')

# age by response
pyp.scatter(train['age'], train['lpsa'], color='black')
pyp.title("Age / PSA")
pyp.show()
pyp.close('all')

# gleason by response
pyp.scatter(train['gleason'], train['lpsa'], color='black')
pyp.title("Gleason / PSA")
pyp.show()
pyp.close('all')

# cavol by response
pyp.scatter(train['lcavol'], train['lpsa'], color='black')
pyp.title("Cavol / PSA")
pyp.show()
pyp.close('all')

# penetration by response
pyp.scatter(train['lcp'], train['lpsa'], color='black')
pyp.title("Penetration / PSA")
pyp.show()
pyp.close('all')
LASSO
I chose to pair my graphical selection with LASSO as a form of “fact check” for my gut feeling.
# everything about this was terrible
# just learn R
LY = train['lpsa'] # separated Y
# (unnecessary but helped with troubleshooting)
LX = train.drop(columns=['lpsa','train']) # full X matrix (drop the response and the split flag)

lasso = Lasso(alpha=1.0) # regularization strength
m = make_pipeline(StandardScaler(), lasso) # pipeline with standardization
# fit full model
m.fit(LX, LY)

alphas = ny.logspace(-6, 1, 100) # logscale vector of penalties
coefs = [] # empty coefficient list
l1_norms = [] # empty l1 norms list

scaler = StandardScaler() # standardization (removing mean)
LX_scaled = scaler.fit_transform(LX) # fit and transform to a standardized matrix

# lasso loop over the penalty path
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(LX_scaled, LY)
    coefs.append(lasso.coef_)
    l1_norms.append(ny.sum(ny.abs(lasso.coef_)))

coefs = ny.array(coefs) # stack the coefficient path into an array
# final figure plotting
pyp.figure(figsize=(8,6)) # this was also terrible, just learn R
pyp.subplot(1,2,1)
for i in range(coefs.shape[1]):
    pyp.plot(l1_norms, coefs[:,i])
pyp.xlabel('L1 Norm')
pyp.ylabel('Coefficients')
pyp.axhline(0, color='black', linestyle='--', linewidth=0.5)

pyp.subplot(1,2,2)
for i in range(coefs.shape[1]):
    pyp.plot(ny.log10(alphas), coefs[:,i])
pyp.xlabel('Log Lambda')
pyp.ylabel('Coefficients')
pyp.axhline(0, color='black', linestyle='--', linewidth=0.5)

pyp.tight_layout()
pyp.show()
pyp.close('all')
Since I ended up fitting this essentially by hand (StackExchange did a ton of the heavy lifting), I skipped printing the coefficient values the way I did in R; doing so was cumbersome and a bit beyond my full understanding of the process.
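If I did want named coefficients from the pipeline fit above, a minimal sketch would look like this (assuming the lowercase step name 'lasso' that make_pipeline assigns, and the LX column order used in the fit):
# pull the fitted Lasso step out of the pipeline and pair coefficients with names
fitted_lasso = m.named_steps['lasso']
for name, coef in zip(LX.columns, fitted_lasso.coef_):
    print(name, round(coef, 3))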
Model Proposals
I investigated two possible models; for simplicity's sake I capped my models at \(3\) predictors, rather than treating that as a minimum. Both models include lweight and age, while the difference between them is the inclusion of gleason versus lcavol.
My goal was to make a simple comparison and get a high performing model with good calibration without muddying up the interpretation with too many variables.
Model 1
\[y_{ij}=\beta_0+\beta_1w+\beta_2a+t_j+\epsilon_{ij}\]
\[ \text{Where} : t_j=\begin{cases} 0 & \text{Gleason}=6\\ \beta_3 & \text{Gleason}=7\\ \beta_4 & \text{Gleason}=8\\ \beta_5 & \text{Gleason}=9\\ \end{cases} \]
\[\epsilon_{ij} \sim N(0,\sigma^2)\]
Model 2
\[y_{i}=\beta_0+\beta_1w+\beta_2a+\beta_3c+\epsilon_{i}\]
\[\epsilon_{i} \sim N(0,\sigma^2)\]
Model Fitting
X = train[['lweight','age']] # base X matrix
# dummies serve as a viable analog to factorizing a value
gl_dum = pa.get_dummies(train['gleason'],
                        prefix='gleason', drop_first=True)
# bind the dummies to base X matrix for model 1
m1_X = pa.concat([X, gl_dum], axis=1)
# model 2 X matrix
m2_X = train[['lweight','age','lcavol']]
Y = train[['lpsa']] # response matrix

# i don't do enough statistics in python to have
# found a preference on library for lm() functions
# i used this one because it's simple and the documentation was easy
m1 = LM().fit(m1_X, Y)
m2 = LM().fit(m2_X, Y)
Model Evaluation
Outputs
# basic coefficient summaries
print(m1.intercept_,m1.coef_)
print(m2.intercept_,m2.coef_)
[-1.80148661] [[ 1.22428095 -0.01463659 1.24612983 0.55016093 1.05301552]]
[-0.59868012] [[ 0.77540056 -0.00927429 0.63937616]]
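To make those arrays easier to read (a small sketch added here, not output from the original run), the slopes can be paired with their column names:
# pair each slope with its predictor name
print(dict(zip(m1_X.columns, m1.coef_.flatten())))
print(dict(zip(m2_X.columns, m2.coef_.flatten())))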
Graphical Evaluation
These graphical evaluations are just to filter out anything egregious, nothing beyond that.
# predictions to get residuals
m1_pred = m1.predict(m1_X)
m2_pred = m2.predict(m2_X)
# actual minus predicted
m1_res = Y - m1_pred
m2_res = Y - m2_pred
# residual vs fit m1
pyp.scatter(m1_pred, m1_res, color='black')
pyp.axhline(y=0, color='r', linestyle='--')
pyp.xlabel('Fit')
pyp.ylabel('Residuals')
pyp.title('Residuals vs Fitted Model 1')
pyp.show()
pyp.close('all')

# residual vs fit m2
pyp.scatter(m2_pred, m2_res, color='black')
pyp.axhline(y=0, color='r', linestyle='--')
pyp.xlabel('Fit')
pyp.ylabel('Residuals')
pyp.title('Residuals vs Fit Model 2')
pyp.show()
pyp.close('all')
# convert to numpy object
m1_res = m1_res.values
m1_res = m1_res.flatten() # flatten to reduce later errors
m2_res = m2_res.values
m2_res = m2_res.flatten()
# q-q residuals, a lot of my python friends have struggled with this
# i did too, this is a pretty consistent function for it
scy.probplot(m1_res, dist="norm", plot=pyp)
pyp.title("Q-Q Residuals Model 1")
pyp.show()
pyp.close('all')

scy.probplot(m2_res, dist="norm", plot=pyp)
pyp.title("Q-Q Residuals Model 2")
pyp.show()
pyp.close('all')
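As a numeric complement to the Q-Q plots (a minimal sketch, not one of my original checks), the Shapiro-Wilk test already available through scipy.stats gives a rough normality read on each residual vector:
# shapiro-wilk normality test on the training residuals
for label, res in [('Model 1', m1_res), ('Model 2', m2_res)]:
    stat, pval = scy.shapiro(res)
    print(label, round(stat, 3), round(pval, 3))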
Calibration
I checked Mean Squared Error, Root Mean Squared Error, and coverage using the testing data.
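Coverage here refers to \(95\%\) prediction intervals built from the training fit. For a test point with leverage \(h_i\) taken from the hat-matrix calculation below, the interval computed in the code is
\[\hat{y}_i \pm t_{0.975,\,n-p-1}\,\hat{\sigma}\sqrt{1+h_i}, \qquad \hat{\sigma}^2=\frac{RSS}{n-p-1}\]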
# rinse/repeat the X and Y matrix prep
# but with the test set
X = test[['lweight','age']]
gl_dum_train = pa.get_dummies(train['gleason'],
                              prefix='gleason', drop_first=True)
gl_dum_test = pa.get_dummies(test['gleason'],
                             prefix='gleason', drop_first=True)
gl_dum_test = gl_dum_test.reindex(columns=gl_dum_train.columns,
                                  fill_value=0)
m1_X_test = pa.concat([X, gl_dum_test], axis=1)
m2_X_test = test[['lweight','age','lcavol']]
Y_test = test[['lpsa']]

# predictions from the test set
m1_pred_t = m1.predict(m1_X_test)
m2_pred_t = m2.predict(m2_X_test)

MSE_m1 = mse(Y_test, m1_pred_t) # mean squared error
rMSE_m1 = ny.sqrt(MSE_m1) # root mean squared error
MSE_m2 = mse(Y_test, m2_pred_t)
rMSE_m2 = ny.sqrt(MSE_m2)
print(MSE_m1)
0.9167180524122771
print(rMSE_m1)
0.9574539427107066
print(MSE_m2)
0.4800778387223387
print(rMSE_m2)
0.6928764960094539
# dimensions of X matrix
n = m1_X.shape[0]
p = m1_X.shape[1]

# summary statistics for prediction intervals
rss = ny.sum(m1_res**2)
s2 = rss/(n-p-1)
sigma = ny.sqrt(s2)
sigma = float(sigma) # this float haunted me
# if it isn't here everything breaks
# prediction from the test set but flattened
m1_pred = m1.predict(m1_X_test).flatten()

# i just converted everything to a numpy object
# it took so long to make this work i lost my mind
m1_X_test_b0 = ny.column_stack((ny.ones(m1_X_test.shape[0]),
                                m1_X_test.to_numpy().astype(float)))
m1_X_b0 = ny.column_stack((ny.ones(m1_X.shape[0]),
                           m1_X.to_numpy().astype(float)))

# compute hat matrix
X_b0_inv = ny.linalg.inv(ny.dot(m1_X_b0.T, m1_X_b0))
H = ny.dot(m1_X_test_b0, X_b0_inv).dot(m1_X_test_b0.T)
m1_lev = ny.diag(H).flatten()

# t value for prediction interval
t_value = scy.t.ppf(0.975, df=n-p-1)

# upper and lower bounds
m1_lower = m1_pred - t_value*sigma*ny.sqrt(1 + m1_lev)
m1_upper = m1_pred + t_value*sigma*ny.sqrt(1 + m1_lev)

Y_test = Y_test.to_numpy().flatten()

# coverage over prediction intervals
m1_cover = ny.mean((Y_test > m1_lower) & (Y_test < m1_upper))
print(m1_cover)
0.9333333333333333
n = m2_X.shape[0]
p = m2_X.shape[1]

rss = ny.sum(m2_res**2)
s2 = rss/(n-p-1)
sigma = ny.sqrt(s2)
sigma = float(sigma)

m2_pred = m2.predict(m2_X_test).flatten()

m2_X_test_b0 = ny.column_stack((ny.ones(m2_X_test.shape[0]),
                                m2_X_test.to_numpy().astype(float)))
m2_X_b0 = ny.column_stack((ny.ones(m2_X.shape[0]),
                           m2_X.to_numpy().astype(float)))

X_b0_inv = ny.linalg.inv(ny.dot(m2_X_b0.T, m2_X_b0))
H = ny.dot(m2_X_test_b0, X_b0_inv).dot(m2_X_test_b0.T)
m2_lev = ny.diag(H).flatten()

t_value = scy.t.ppf(0.975, df=n-p-1)

m2_lower = m2_pred - t_value*sigma*ny.sqrt(1 + m2_lev)
m2_upper = m2_pred + t_value*sigma*ny.sqrt(1 + m2_lev)

m2_cover = ny.mean((Y_test > m2_lower) & (Y_test < m2_upper))
print(m2_cover)
0.9666666666666667
The difference between the two models is almost a matter of preference, but Model 2 had the best performance overall across the variable selection, the within-sample checks, and the out-of-sample checks.
Bootstrapping
I concluded by bootstrapping uncertainty estimates for the parameter estimates. I chose to include both models, since the difference between them is not severe enough to consider either one unusable.
n = 1000 # number of boot iterations
save_boot = [] # empty list for saving coefficients

for i in range(n):
    # sample with replacement
    X_boot, Y_boot = resample(m1_X, Y, replace=True)
    # fit sampled data
    m_boot = LM().fit(X_boot, Y_boot)
    # pull coefficients
    boot_out = ny.concatenate((m_boot.intercept_, m_boot.coef_.flatten()))
    # save coefficients
    save_boot.append(boot_out)
# organize and name coefficients
df_boot = pa.DataFrame(save_boot, columns=['b0','b1','b2','b3','b4','b5'])
# calculate summaries
boot_coefs = df_boot.describe(percentiles=[0.025, 0.975])
print(boot_coefs)
b0 b1 b2 b3 b4 \
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean -1.845712 1.200705 -0.012513 1.232285 0.323455
std 1.135697 0.291694 0.018407 0.231758 0.323871
min -5.312997 0.404303 -0.066541 0.489315 -0.340773
2.5% -3.990084 0.666926 -0.046332 0.806086 0.000000
50% -1.856850 1.197388 -0.013980 1.233752 0.307868
97.5% 0.311719 1.819283 0.026523 1.726560 0.990818
max 2.082668 2.265220 0.058537 1.989906 1.331466
b5
count 1000.000000
mean 0.981349
std 0.397177
min 0.000000
2.5% 0.000000
50% 1.018819
97.5% 1.683957
max 2.079295
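The \(2.5\%\) and \(97.5\%\) rows above are what I treat as the bootstrap percentile intervals; as a small aside (a sketch, not output from the original run), the same intervals can be pulled directly from the saved draws:
# percentile bootstrap intervals straight from the saved draws
print(df_boot.quantile([0.025, 0.975]))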
# bootstrap distributions of coefficients
pyp.hist(df_boot.b0, color='grey')
pyp.title('Model 1 Intercept')
mean = ny.mean(df_boot.b0)
pyp.axvline(mean, color='gold', linestyle='dashed', linewidth=2)
pyp.show()
pyp.close('all')

pyp.hist(df_boot.b1, color='grey')
pyp.title('Model 1 b1')
mean = ny.mean(df_boot.b1)
pyp.axvline(mean, color='gold', linestyle='dashed', linewidth=2)
pyp.show()
pyp.close('all')

pyp.hist(df_boot.b2, color='grey')
pyp.title('Model 1 b2')
mean = ny.mean(df_boot.b2)
pyp.axvline(mean, color='gold', linestyle='dashed', linewidth=2)
pyp.show()
pyp.close('all')

pyp.hist(df_boot.b3, color='grey')
pyp.title('Model 1 b3')
mean = ny.mean(df_boot.b3)
pyp.axvline(mean, color='gold', linestyle='dashed', linewidth=2)
pyp.show()
pyp.close('all')
n = 1000
save_boot = []

for i in range(n):
    X_boot, Y_boot = resample(m2_X, Y, replace=True)
    m_boot = LM().fit(X_boot, Y_boot)
    boot_out = ny.concatenate((m_boot.intercept_, m_boot.coef_.flatten()))
    save_boot.append(boot_out)

df_boot = pa.DataFrame(save_boot, columns=['b0','b1','b2','b3'])
boot_coefs = df_boot.describe(percentiles=[0.025, 0.975])
print(boot_coefs)
b0 b1 b2 b3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.707601 0.784005 -0.008224 0.641244
std 0.926261 0.210198 0.012804 0.076553
min -3.970061 0.109116 -0.041309 0.350498
2.5% -2.541435 0.398566 -0.029900 0.496291
50% -0.652980 0.779620 -0.009063 0.640420
97.5% 1.011098 1.215844 0.019604 0.784977
max 2.189422 1.410757 0.057484 0.917710
pyp.hist(df_boot.b0, color='grey')
pyp.title('Model 2 Intercept')
mean = ny.mean(df_boot.b0)
pyp.axvline(mean, color='gold', linestyle='dashed', linewidth=2)
pyp.show()
pyp.close('all')

pyp.hist(df_boot.b1, color='grey')
pyp.title('Model 2 b1')
mean = ny.mean(df_boot.b1)
pyp.axvline(mean, color='gold', linestyle='dashed', linewidth=2)
pyp.show()
pyp.close('all')

pyp.hist(df_boot.b2, color='grey')
pyp.title('Model 2 b2')
mean = ny.mean(df_boot.b2)
pyp.axvline(mean, color='gold', linestyle='dashed', linewidth=2)
pyp.show()
pyp.close('all')

pyp.hist(df_boot.b3, color='grey')
pyp.title('Model 2 b3')
mean = ny.mean(df_boot.b3)
pyp.axvline(mean, color='gold', linestyle='dashed', linewidth=2)
pyp.show()
pyp.close('all')