In [2]:
# Regression diagnostics
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import OLSInfluence
from scipy.stats import t, shapiro
from statsmodels.stats.diagnostic import het_breuschpagan
# Load data
os.chdir("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data") # Change the working directory
Auto = pd.read_csv("Auto.csv") # Read the data file in the CSV format
Auto['horsepower'] = pd.to_numeric(Auto['horsepower'], errors='coerce')  # non-numeric entries become NaN
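Coercing horsepower leaves NaN in the rows whose value could not be parsed as a number; a quick optional check, not part of the original cells:
In [ ]:
# Optional check: count the horsepower values that failed to parse.
# smf.ols with a formula drops these rows automatically when fitting.
print(Auto['horsepower'].isna().sum(), "rows with missing horsepower")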
In [3]:
# Fit the regression model
reg = smf.ols('mpg ~ year + acceleration + horsepower + weight', data=Auto).fit()
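As an extra (unexecuted) step, the fitted coefficients can be inspected before running diagnostics; a minimal sketch:
In [ ]:
# Coefficient estimates, t-statistics, and R-squared for the fitted model
print(reg.summary())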
In [6]:
# Diagnostic plots: for each predictor, plot_regress_exog draws its own 2x2 grid
# (fitted vs. x, residuals vs. x, partial regression, and CCPR plots),
# so each predictor gets a separate figure.
for var in ['year', 'acceleration', 'horsepower', 'weight']:
    fig = plt.figure(figsize=(12, 10))
    sm.graphics.plot_regress_exog(reg, var, fig=fig)
    plt.show()
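A plain residuals-vs-fitted plot (in the spirit of R's plot(lm)) can complement these panels; a minimal sketch, not part of the original cells:
In [ ]:
# Residuals vs. fitted values: look for curvature or a funnel shape
plt.figure(figsize=(8, 5))
plt.scatter(reg.fittedvalues, reg.resid, s=15)
plt.axhline(0, color='r', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()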
In [8]:
# Studentized residuals and outliers
influence = OLSInfluence(reg)
studentized_residuals = influence.resid_studentized_internal
plt.figure(figsize=(10, 6))
plt.plot(studentized_residuals, 'o')
plt.axhline(y=3, color='r', linestyle='--')
plt.axhline(y=-3, color='r', linestyle='--')
plt.show()
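For reference, externally studentized residuals are also available; they are what the t-based Bonferroni comparison below strictly applies to. A sketch:
In [ ]:
# Externally studentized residuals: each point's variance estimate excludes that point
studentized_external = influence.resid_studentized_external
print(np.max(np.abs(studentized_external)))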
In [12]:
outliers = np.where(np.abs(studentized_residuals) > 3)
print(outliers)
Out[12]:
(array([242, 320, 323, 324, 327], dtype=int64),)
In [16]:
# Which of these residuals can be considered outliers?
# Compare with the Bonferroni-adjusted quantile of the t-distribution.
bonferroni_threshold = t.ppf(1 - 0.05 / (2 * len(Auto)), Auto.shape[0] - len(reg.params))
print(bonferroni_threshold)
outliers_bonferroni = np.where(np.abs(studentized_residuals) > bonferroni_threshold)
print(outliers_bonferroni)
3.872996975131257
(array([320], dtype=int64),)
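The same comparison is packaged in statsmodels as a Bonferroni-adjusted outlier test based on externally studentized residuals; a sketch for cross-checking:
In [ ]:
# Built-in Bonferroni outlier test; columns: student_resid, unadj_p, bonf(p)
outlier_table = reg.outlier_test()
print(outlier_table[outlier_table['bonf(p)'] < 0.05])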
In [22]:
# Testing normality.
# Also look at a Normal Q-Q plot (sketched in the cell below). The Shapiro-Wilk statistic W measures how close the graph is to a straight line.
shapiro_test = shapiro(studentized_residuals)
print(f'Shapiro-Wilk Test: W={shapiro_test[0]}, p-value={shapiro_test[1]}')
Shapiro-Wilk Test: W=0.9726724233275309, p-value=9.959795380693004e-07
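The Q-Q plot referred to above can be drawn directly from the studentized residuals; a minimal sketch:
In [ ]:
# Normal Q-Q plot: points should fall near the 45-degree reference line
sm.qqplot(studentized_residuals, line='45')
plt.show()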
In [24]:
# Testing HOMOSCEDASTICITY (constant variance) with the Breusch-Pagan test.
# het_breuschpagan returns (LM statistic, LM p-value, F statistic, F p-value).
bp_test = het_breuschpagan(reg.resid, reg.model.exog)
print(f'Breusch-Pagan Test: F={bp_test[2]}, p-value={bp_test[3]}')
Breusch-Pagan Test: F=6.689789969666584, p-value=3.252369382529331e-05
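A related check is White's test, which allows the error variance to depend on the predictors (including squares and cross-products) rather than only linearly; a sketch:
In [ ]:
# White's test for heteroscedasticity; het_white returns (LM, LM p-value, F, F p-value)
from statsmodels.stats.diagnostic import het_white
white_test = het_white(reg.resid, reg.model.exog)
print(f'White Test: LM={white_test[0]}, p-value={white_test[1]}')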
In [28]:
# Influential data: leverage (diagonal of the hat matrix)
leverage = influence.hat_matrix_diag
plt.figure(figsize=(10, 6))
plt.plot(leverage, 'o')
plt.axhline(y=5/len(Auto), color='r', linestyle='--')  # reference line near the average leverage p/n (p = 5 coefficients)
plt.show()
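Leverage and studentized residuals can also be viewed together with statsmodels' influence plot, where bubble size reflects Cook's distance; a sketch:
In [ ]:
# Influence plot: studentized residuals vs. leverage, sized by Cook's distance
fig, ax = plt.subplots(figsize=(10, 6))
sm.graphics.influence_plot(reg, criterion="cooks", ax=ax)
plt.show()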
In [32]:
print(leverage[leverage > 0.03])
[0.03624109 0.03226743 0.04258253 0.12054403 0.04797419 0.04360256 0.04475796 0.06035105 0.03555978 0.03434446 0.05510052 0.0421212 0.04379524]
In [34]:
# This calculates DFBETAS, DFFITS, and other measures of influence
print(influence.summary_frame())
     dfb_Intercept  dfb_year  dfb_acceleration  dfb_horsepower  dfb_weight  \
0         0.079746 -0.065157         -0.059737       -0.048945    0.038775
1         0.012116 -0.013711         -0.001112        0.012296   -0.010837
2         0.048271 -0.041431         -0.029284        0.003736   -0.010585
3         0.003894 -0.003989         -0.000903        0.001802   -0.002058
4         0.042328 -0.031384         -0.037634       -0.019976    0.012137
..             ...       ...               ...             ...         ...
392       0.033510 -0.048618          0.020036        0.015665   -0.015842
393      -0.375910  0.205380          0.547620        0.324480   -0.239950
394       0.000476 -0.002874          0.004532        0.002105   -0.000804
395       0.060850 -0.056323         -0.034956       -0.021791    0.012538
396      -0.030484  0.025238          0.023087        0.014763   -0.008174

      cooks_d  standard_resid  hat_diag  dffits_internal  student_resid  \
0    0.001971        0.808808  0.014840         0.099269       0.808445
1    0.000223        0.280919  0.013941         0.033403       0.280584
2    0.001295        0.684108  0.013647         0.080470       0.683637
3    0.000011        0.067890  0.012043         0.007496       0.067803
4    0.000662        0.436242  0.017095         0.057531       0.435786
..        ...             ...       ...              ...            ...
392  0.000865       -0.635307  0.010607        -0.065781      -0.634817
393  0.078430        2.926085  0.043795         0.626217       2.955175
394  0.000007       -0.037391  0.025702        -0.006073      -0.037343
395  0.001259       -0.728580  0.011716        -0.079329      -0.728138
396  0.000299        0.314862  0.014858         0.038668       0.314495

       dffits
0    0.099225
1    0.033363
2    0.080415
3    0.007486
4    0.057471
..        ...
392 -0.065730
393  0.632443
394 -0.006065
395 -0.079281
396  0.038623

[392 rows x 11 columns]
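A common rule of thumb flags observations with |DFFITS| above 2*sqrt(p/n); a sketch applying it to the frame above:
In [ ]:
# Flag observations by the rule of thumb |DFFITS| > 2*sqrt(p/n)
infl = influence.summary_frame()
p, n = len(reg.params), int(reg.nobs)
dffits_cutoff = 2 * np.sqrt(p / n)
print(dffits_cutoff)
print(infl.index[np.abs(infl['dffits']) > dffits_cutoff].tolist())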
In [36]:
# Cook's distance
# Cook's distance measures the effect of deleting the i-th observation
cooks_d = influence.cooks_distance[0]
plt.figure(figsize=(10, 6))
plt.stem(np.arange(len(cooks_d)), cooks_d, markerfmt=",")
plt.show()
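A frequently used cutoff for Cook's distance is 4/n; a sketch listing the observations exceeding it:
In [ ]:
# Observations with Cook's distance above the 4/n rule of thumb
cooks_cutoff = 4 / len(cooks_d)
print(np.where(cooks_d > cooks_cutoff))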
In [38]:
# Variance inflation factors (VIF)
# Here we measure the amount of multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()
vif_df["VIF Factor"] = [variance_inflation_factor(reg.model.exog, i) for i in range(reg.model.exog.shape[1])]
vif_df["features"] = reg.model.exog_names
print(vif_df)
   VIF Factor      features
0  726.310218     Intercept
1    1.228910          year
2    2.519844  acceleration
3    8.813443    horsepower
4    5.303347        weight
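The intercept's VIF is not interpretable on its own; a usual screen looks only at the predictors, e.g. against the common VIF > 5 rule of thumb. A sketch:
In [ ]:
# Screen only the predictors against the VIF > 5 rule of thumb
predictors = vif_df[vif_df["features"] != "Intercept"]
print(predictors[predictors["VIF Factor"] > 5])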