import pandas as pa # data manipulation
import numpy as ny # math functions
import scipy.stats as scy # stats functions
import matplotlib.pyplot as pyp # plotting
from sklearn.linear_model import LinearRegression as LM # linear model
from sklearn.linear_model import Lasso # variable selection
from sklearn.preprocessing import StandardScaler # standardization
from sklearn.pipeline import make_pipeline # necessary lasso setup
from sklearn.metrics import mean_squared_error as mse # mean squared error
from sklearn.utils import resample # bootstrap functionality705assign4_guide_pyth
Libraries
Data Import
url="https://raw.githubusercontent.com/jlacasa/stat705_fall2024/main/classes/data/prostate.csv"
dd=pa.read_csv(url) Data Split
# training and testing split
train=dd[dd['train']==True] 
test=dd[dd['train']==False]Exploratory Analysis
Exploratory analyses are my preferred first step in any statistics/data analysis project. I’m starting by printing the head of my data to get an idea of variable names and data types, (i.e., whether the data is discrete or continuous).
# first 10 rows of each column
print(dd.head(10))     lcavol   lweight  age      lbph  svi       lcp  gleason  pgg45      lpsa  \
0 -0.579818  2.769459   50 -1.386294    0 -1.386294        6      0 -0.430783   
1 -0.994252  3.319626   58 -1.386294    0 -1.386294        6      0 -0.162519   
2 -0.510826  2.691243   74 -1.386294    0 -1.386294        7     20 -0.162519   
3 -1.203973  3.282789   58 -1.386294    0 -1.386294        6      0 -0.162519   
4  0.751416  3.432373   62 -1.386294    0 -1.386294        6      0  0.371564   
5 -1.049822  3.228826   50 -1.386294    0 -1.386294        6      0  0.765468   
6  0.737164  3.473518   64  0.615186    0 -1.386294        6      0  0.765468   
7  0.693147  3.539509   58  1.536867    0 -1.386294        6      0  0.854415   
8 -0.776529  3.539509   47 -1.386294    0 -1.386294        6      0  1.047319   
9  0.223144  3.244544   63 -1.386294    0 -1.386294        6      0  1.047319   
   train  
0   True  
1   True  
2   True  
3   True  
4   True  
5   True  
6  False  
7   True  
8  False  
9  False  Next I plotted a histogram of my intended response variable, lpsa, to get a rough read on how the data is distributed.
# observing the response distribution
pyp.hist(train['lpsa'],color='grey')
pyp.title("PSA Histogram")
pyp.show()
pyp.close('all') # i make use of close('all') frequently
# i know there's better functions, im just a boomerThen I’m just plotting my response against various possible predictor variables. pgg45 and svi were avoided because of high incidence of \(0\) values, and lbph was honestly just missed in the moment.
# weight by response 
pyp.scatter(train['lweight'],train['lpsa'],color='black')
pyp.title("Weight / PSA")
pyp.show()
pyp.close('all')# age by response
pyp.scatter(train['age'],train['lpsa'],color='black')
pyp.title("Age / PSA")
pyp.show()
pyp.close('all')# gleason by response
pyp.scatter(train['gleason'],train['lpsa'],color='black')
pyp.title("Gleason / PSA")
pyp.show()
pyp.close('all')# cavol by response
pyp.scatter(train['lcavol'],train['lpsa'],color='black')
pyp.title("Cavol / PSA")
pyp.show()
pyp.close('all')# penetration by response
pyp.scatter(train['lcp'],train['lpsa'],color='black')
pyp.title("Penetration / PSA")
pyp.show()
pyp.close('all')LASSO
I chose to pair my graphical selection with LASSO as a form of “fact check” for my gut feeling.
# everything about this was terrible
# just learn R
LY=train['lpsa'] # separated Y 
                # (unnecessary but helped with troubleshooting)
LX=train.drop(columns=['lpsa']) # full X matrix
lasso=Lasso(alpha=1.0) # regularlization strength
m=make_pipeline(StandardScaler(),lasso) # pipeline with standardization 
m.fit(LX,LY) # fit full model
alphas=ny.logspace(-6,1,100) # logscale vector
coefs=[] # empty coefficient list
l1_norms=[] # empty l1 norms list
scaler=StandardScaler() # standardization (removing mean)
LX_scaled=scaler.fit_transform(LX) # fit and transform to standardized vector
# lasso loop
for alpha in alphas:
    lasso=Lasso(alpha=alpha,max_iter=10000)
    lasso.fit(LX_scaled,LY)
    coefs.append(lasso.coef_)
    l1_norms.append(ny.sum(ny.abs(lasso.coef_)))
coefs=ny.array(coefs) # fill coefs list
# final figure plotting
pyp.figure(figsize=(8,6)) # this was also terrible, just learn R
pyp.subplot(1,2,1)
for i in range(coefs.shape[1]):
    pyp.plot(l1_norms,coefs[:,i])
pyp.xlabel('L1 Norm')
pyp.ylabel('Coefficients')
pyp.axhline(0,color='black',linestyle='--',linewidth=0.5)
pyp.subplot(1,2,2)
for i in range(coefs.shape[1]):
    pyp.plot(ny.log10(alphas),coefs[:,i])
pyp.xlabel('Log Lambda')
pyp.ylabel('Coefficients')
pyp.axhline(0,color='black',linestyle='--',linewidth=0.5)
pyp.tight_layout()
pyp.show()
pyp.close('all')Since I ended up fitting this essentially by hand (StackExchange did a ton of the heavy lifting), I avoided printing the same coefficient values as I did in R because it was cumbersome and far beyond my complete understanding of this process.
Model Proposals
I investigated two possible models, for simplicity sake I capped my models at \(3\) predictors, rather than considering it a minimum. Both models will include lweight and age, while the difference between each model will be the inclusion of gleason versus lcavol.
My goal was to make a simple comparison and get a high performing model with good calibration without muddying up the interpretation with too many variables.
Model 1
\[y_{ij}=\beta_0+\beta_1w+\beta_2a+\beta_3t_j+\epsilon_{ij}\]
\[ \text{Where} : t_j=\begin{cases} 0 & \text{Season}=6\\ 1 & \text{Season}=7\\ 2 & \text{Season}=8\\ 3 & \text{Season}=9\\ \end{cases} \]
\[\epsilon_{ij} \sim N(0,\sigma^2)\]
Model 2
\[y_{i}=\beta_0+\beta_1w+\beta_2a+\beta_3c+\epsilon_{i}\]
\[\epsilon_{i} \sim N(0,\sigma^2)\]
Model Fitting
X=train[['lweight','age']] # base X matrix
# dummies serve as a viable analog to factorizing a value
gl_dum=pa.get_dummies(train['gleason'],
prefix='gleason',drop_first=True)
# bind the dummies to base X matrix for model 1
m1_X=pa.concat([X,gl_dum],axis=1)
# model 2 X matrix
m2_X=train[['lweight','age','lcavol']]
Y=train[['lpsa']] # response matrix
# i don't do enough statistics in python to have
# found a preference on library for lm() functions
# i used this one because it's simple and the documentation was easy
m1=LM().fit(m1_X,Y) 
m2=LM().fit(m2_X,Y)Model Evaluation
Outputs
# basic coefficient summaries
print(m1.intercept_,m1.coef_)
print(m2.intercept_,m2.coef_)[-1.80148661] [[ 1.22428095 -0.01463659  1.24612983  0.55016093  1.05301552]]
[-0.59868012] [[ 0.77540056 -0.00927429  0.63937616]]Graphical Evaluation
These graphical evaluations are just to filter out anything egregious, nothing beyond that.
# predictions to get residuals
m1_pred=m1.predict(m1_X)
m2_pred=m2.predict(m2_X)
# actual minus predicted
m1_res=Y-m1_pred
m2_res=Y-m2_pred# residual vs fit m1
pyp.scatter(m1_pred,m1_res,color='black')
pyp.axhline(y=0,color='r',linestyle='--')
pyp.xlabel('Fit')
pyp.ylabel('Residuals')
pyp.title('Residuals vs Fitted Model 1')
pyp.show()
pyp.close('all')# residual vs fit m2
pyp.scatter(m2_pred,m2_res,color='black')
pyp.axhline(y=0,color='r',linestyle='--')
pyp.xlabel('Fit')
pyp.ylabel('Residuals')
pyp.title('Residuals vs Fit Model 2')
pyp.show()
pyp.close('all')# convert to numpy object
m1_res=m1_res.values
m1_res=m1_res.flatten() # flatten to reduce later errors
m2_res=m2_res.values
m2_res=m2_res.flatten()# q-q residuals, a lot of my python friends have struggled with this
# i did too, this is a pretty consistent function for it
scy.probplot(m1_res,dist="norm",plot=pyp)
pyp.title("Q-Q Residuals Model 1")
pyp.show()
pyp.close('all')scy.probplot(m2_res,dist="norm",plot=pyp)
pyp.title("Q-Q Residuals Model 2")
pyp.show()
pyp.close('all')Calibration
I checked Mean Squared Error, Root Mean Squared Error, and coverage using the testing data.
# rinse/repeat the X and Y matrix prep
# but with the test set
X=test[['lweight','age']]
gl_dum_train=pa.get_dummies(train['gleason'],
prefix='gleason',drop_first=True)
gl_dum_test=pa.get_dummies(test['gleason'],
prefix='gleason',drop_first=True)
gl_dum_test=gl_dum_test.reindex(columns=gl_dum_train.columns,
fill_value=0) 
m1_X_test=pa.concat([X,gl_dum_test],axis=1)
m2_X_test=test[['lweight','age','lcavol']]
Y_test=test[['lpsa']]
# predictions from the test set
m1_pred_t=m1.predict(m1_X_test)
m2_pred_t=m2.predict(m2_X_test)
MSE_m1=mse(Y_test,m1_pred_t) # mean squared error
rMSE_m1=ny.sqrt(MSE_m1) # root mean squared error
MSE_m2=mse(Y_test,m2_pred_t)
rMSE_m2=ny.sqrt(MSE_m2)print(MSE_m1)0.9167180524122771print(rMSE_m1)0.9574539427107066print(MSE_m2)0.4800778387223387print(rMSE_m2)0.6928764960094539# dimensions of X matrix
n=m1_X.shape[0]
p=m1_X.shape[1]
# summary statistics for prediction intervals
rss=ny.sum(m1_res**2)
s2=rss/(n-p-1)
sigma=ny.sqrt(s2)
sigma=float(sigma) # this float haunted me
                    # if it isn't here everything breaks
# prediction from the test but flattened
m1_pred=m1.predict(m1_X_test).flatten()
# i just converted everything to a numpy object
# it took so long to make this work i lost my mind
m1_X_test_b0=ny.column_stack((ny.ones(m1_X_test.shape[0]),
m1_X_test.to_numpy().astype(float)))
m1_X_b0=ny.column_stack((ny.ones(m1_X.shape[0]),
m1_X.to_numpy().astype(float)))
# compute hat matrix
X_b0_inv=ny.linalg.inv(ny.dot(m1_X_b0.T,m1_X_b0))
H=ny.dot(m1_X_test_b0,X_b0_inv).dot(m1_X_test_b0.T)
m1_lev=ny.diag(H).flatten()
# t value for prediction interval
t_value=scy.t.ppf(0.975,df=n-p-1)
# upper and lower bounds
m1_lower=m1_pred-t_value*sigma*ny.sqrt(1+m1_lev)
m1_upper=m1_pred+t_value*sigma*ny.sqrt(1+m1_lev)
Y_test=Y_test.to_numpy().flatten()
# cover over prediction intervals
m1_cover=ny.mean((Y_test>m1_lower)&(Y_test<m1_upper))print(m1_cover)0.9333333333333333n=m2_X.shape[0]
p=m2_X.shape[1]
rss=ny.sum(m2_res**2)
s2=rss/(n-p-1)
sigma=ny.sqrt(s2)
sigma=float(sigma)
m2_pred=m2.predict(m2_X_test).flatten()
m2_X_test_b0=ny.column_stack((ny.ones(m2_X_test.shape[0]),
m2_X_test.to_numpy().astype(float)))
m2_X_b0=ny.column_stack((ny.ones(m2_X.shape[0]),
m2_X.to_numpy().astype(float)))
X_b0_inv=ny.linalg.inv(ny.dot(m2_X_b0.T,m2_X_b0))
H=ny.dot(m2_X_test_b0,X_b0_inv).dot(m2_X_test_b0.T)
m2_lev=ny.diag(H).flatten()
t_value=scy.t.ppf(0.975,df=n-p-1)
m2_lower=m2_pred-t_value*sigma*ny.sqrt(1+m2_lev)
m2_upper=m2_pred+t_value*sigma*ny.sqrt(1+m2_lev)
m2_cover=ny.mean((Y_test>m2_lower)&(Y_test<m2_upper))print(m2_cover)0.9666666666666667The difference between the two models is almost subjective, but Model 2 does have the best performance overall between all variable selection, within-sample checks, and out-of-sample checks.
Bootstrapping
I concluded with bootstrapping uncertainty estimates the parameter estimates. I chose to include both models since the difference between them is not severe enough to consider one unusable.
n=1000 # number of boot iterations
save_boot=[] # empty list for saving coefficients
for i in range(n):
    # sample with replacement
    X_boot,Y_boot=resample(m1_X,Y,replace=True)
    # fit sampled data
    m_boot=LM().fit(X_boot,Y_boot)
    # pull coefficients
    boot_out=ny.concatenate((m_boot.intercept_,m_boot.coef_.flatten()))
    # save coefficients
    save_boot.append(boot_out)
# organize and name coefficients
df_boot=pa.DataFrame(save_boot,columns=['b0','b1','b2','b3','b4','b5'])
# calculate summaries
boot_coefs=df_boot.describe(percentiles=[0.025, 0.975])
print(boot_coefs)                b0           b1           b2           b3           b4  \
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000   
mean     -1.845712     1.200705    -0.012513     1.232285     0.323455   
std       1.135697     0.291694     0.018407     0.231758     0.323871   
min      -5.312997     0.404303    -0.066541     0.489315    -0.340773   
2.5%     -3.990084     0.666926    -0.046332     0.806086     0.000000   
50%      -1.856850     1.197388    -0.013980     1.233752     0.307868   
97.5%     0.311719     1.819283     0.026523     1.726560     0.990818   
max       2.082668     2.265220     0.058537     1.989906     1.331466   
                b5  
count  1000.000000  
mean      0.981349  
std       0.397177  
min       0.000000  
2.5%      0.000000  
50%       1.018819  
97.5%     1.683957  
max       2.079295  # bootstrap distributions of coefficients
pyp.hist(df_boot.b0,color='grey')
pyp.title('Model 1 Intercept')
mean=ny.mean(df_boot.b0)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')pyp.hist(df_boot.b1,color='grey')
pyp.title('Model 1 b1')
mean=ny.mean(df_boot.b1)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')pyp.hist(df_boot.b2,color='grey')
pyp.title('Model 1 b2')
mean=ny.mean(df_boot.b2)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')pyp.hist(df_boot.b3,color='grey')
pyp.title('Model 1 b3')
mean=ny.mean(df_boot.b3)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')n=1000
save_boot=[]
for i in range(n):
    X_boot,Y_boot=resample(m2_X,Y,replace=True)
    
    m_boot=LM().fit(X_boot,Y_boot)
    
    boot_out=ny.concatenate((m_boot.intercept_,m_boot.coef_.flatten()))
    
    save_boot.append(boot_out)
df_boot=pa.DataFrame(save_boot,columns=['b0','b1','b2','b3'])
boot_coefs=df_boot.describe(percentiles=[0.025,0.975])
print(boot_coefs)                b0           b1           b2           b3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.707601     0.784005    -0.008224     0.641244
std       0.926261     0.210198     0.012804     0.076553
min      -3.970061     0.109116    -0.041309     0.350498
2.5%     -2.541435     0.398566    -0.029900     0.496291
50%      -0.652980     0.779620    -0.009063     0.640420
97.5%     1.011098     1.215844     0.019604     0.784977
max       2.189422     1.410757     0.057484     0.917710pyp.hist(df_boot.b0,color='grey')
pyp.title('Model 2 Intercept')
mean=ny.mean(df_boot.b1)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')pyp.hist(df_boot.b1,color='grey')
pyp.title('Model 2 b1')
mean=ny.mean(df_boot.b1)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')pyp.hist(df_boot.b2,color='grey')
pyp.title('Model 2 b2')
mean=ny.mean(df_boot.b2)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')pyp.hist(df_boot.b3,color='grey')
pyp.title('Model 2 b3')
mean=ny.mean(df_boot.b3)
pyp.axvline(mean,color='gold',linestyle='dashed',linewidth=2)
pyp.show()
pyp.close('all')