In [2]:
# Regression diagnostics
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import OLSInfluence
from scipy.stats import t, shapiro
from statsmodels.stats.diagnostic import het_breuschpagan
# Load data
os.chdir("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data") # Change the working directory
Auto = pd.read_csv("Auto.csv") # Read the data file in the CSV format
Auto['horsepower'] = pd.to_numeric(Auto['horsepower'], errors='coerce')  # non-numeric entries become NaN
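Coercing horsepower leaves NaN in the rows whose value could not be parsed as a number; a quick optional check, not part of the original cells:
In [ ]:
# Optional check: count the horsepower values that failed to parse.
# smf.ols with a formula drops these rows automatically when fitting.
print(Auto['horsepower'].isna().sum(), "rows with missing horsepower")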
In [3]:
# Fit the regression model
reg = smf.ols('mpg ~ year + acceleration + horsepower + weight', data=Auto).fit()
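As an extra (unexecuted) step, the fitted coefficients can be inspected before running diagnostics; a minimal sketch:
In [ ]:
# Coefficient estimates, t-statistics, and R-squared for the fitted model
print(reg.summary())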
In [6]:
# Diagnostic plots: for each predictor, plot_regress_exog draws its own 2x2 grid
# (fitted vs. x, residuals vs. x, partial regression, and CCPR plots),
# so each predictor gets a separate figure.
for var in ['year', 'acceleration', 'horsepower', 'weight']:
    fig = plt.figure(figsize=(12, 10))
    sm.graphics.plot_regress_exog(reg, var, fig=fig)
    plt.show()
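A plain residuals-vs-fitted plot (in the spirit of R's plot(lm)) can complement these panels; a minimal sketch, not part of the original cells:
In [ ]:
# Residuals vs. fitted values: look for curvature or a funnel shape
plt.figure(figsize=(8, 5))
plt.scatter(reg.fittedvalues, reg.resid, s=15)
plt.axhline(0, color='r', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()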
In [8]:
# Studentized residuals and outliers
influence = OLSInfluence(reg)
studentized_residuals = influence.resid_studentized_internal
plt.figure(figsize=(10, 6))
plt.plot(studentized_residuals, 'o')
plt.axhline(y=3, color='r', linestyle='--')
plt.axhline(y=-3, color='r', linestyle='--')
plt.show()
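For reference, externally studentized residuals are also available; they are what the t-based Bonferroni comparison below strictly applies to. A sketch:
In [ ]:
# Externally studentized residuals: each point's variance estimate excludes that point
studentized_external = influence.resid_studentized_external
print(np.max(np.abs(studentized_external)))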
In [12]:
outliers = np.where(np.abs(studentized_residuals) > 3)
print(outliers)
Out[12]:
(array([242, 320, 323, 324, 327], dtype=int64),)
In [16]:
# Which of these residuals can be considered outliers?
# Compare with the Bonferroni-adjusted quantile of the t-distribution.
bonferroni_threshold = t.ppf(1 - 0.05 / (2 * len(Auto)), Auto.shape[0] - len(reg.params))
print(bonferroni_threshold)
outliers_bonferroni = np.where(np.abs(studentized_residuals) > bonferroni_threshold)
print(outliers_bonferroni)
3.872996975131257
(array([320], dtype=int64),)
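The same comparison is packaged in statsmodels as a Bonferroni-adjusted outlier test based on externally studentized residuals; a sketch for cross-checking:
In [ ]:
# Built-in Bonferroni outlier test; columns: student_resid, unadj_p, bonf(p)
outlier_table = reg.outlier_test()
print(outlier_table[outlier_table['bonf(p)'] < 0.05])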
In [22]:
# Testing normality.
# Also look at a Normal Q-Q plot (sketched in the cell below). The Shapiro-Wilk statistic W measures how close the graph is to a straight line.
shapiro_test = shapiro(studentized_residuals)
print(f'Shapiro-Wilk Test: W={shapiro_test[0]}, p-value={shapiro_test[1]}')
Shapiro-Wilk Test: W=0.9726724233275309, p-value=9.959795380693004e-07
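The Q-Q plot referred to above can be drawn directly from the studentized residuals; a minimal sketch:
In [ ]:
# Normal Q-Q plot: points should fall near the 45-degree reference line
sm.qqplot(studentized_residuals, line='45')
plt.show()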
In [24]:
# Testing HOMOSCEDASTICITY (constant variance) with the Breusch-Pagan test.
# het_breuschpagan returns (LM statistic, LM p-value, F statistic, F p-value).
bp_test = het_breuschpagan(reg.resid, reg.model.exog)
print(f'Breusch-Pagan Test: F={bp_test[2]}, p-value={bp_test[3]}')
Breusch-Pagan Test: F=6.689789969666584, p-value=3.252369382529331e-05
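A related check is White's test, which allows the error variance to depend on the predictors (including squares and cross-products) rather than only linearly; a sketch:
In [ ]:
# White's test for heteroscedasticity; het_white returns (LM, LM p-value, F, F p-value)
from statsmodels.stats.diagnostic import het_white
white_test = het_white(reg.resid, reg.model.exog)
print(f'White Test: LM={white_test[0]}, p-value={white_test[1]}')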
In [28]:
# Influential data: leverage (diagonal of the hat matrix)
leverage = influence.hat_matrix_diag
plt.figure(figsize=(10, 6))
plt.plot(leverage, 'o')
plt.axhline(y=5/len(Auto), color='r', linestyle='--')  # reference line near the average leverage p/n (p = 5 coefficients)
plt.show()
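Leverage and studentized residuals can also be viewed together with statsmodels' influence plot, where bubble size reflects Cook's distance; a sketch:
In [ ]:
# Influence plot: studentized residuals vs. leverage, sized by Cook's distance
fig, ax = plt.subplots(figsize=(10, 6))
sm.graphics.influence_plot(reg, criterion="cooks", ax=ax)
plt.show()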
In [32]:
print(leverage[leverage > 0.03])
[0.03624109 0.03226743 0.04258253 0.12054403 0.04797419 0.04360256 0.04475796 0.06035105 0.03555978 0.03434446 0.05510052 0.0421212 0.04379524]
In [34]:
# This calculates DFBETAS, DFFITS, and other measures of influence
print(influence.summary_frame())
     dfb_Intercept  dfb_year  dfb_acceleration  dfb_horsepower  dfb_weight  \
0         0.079746 -0.065157         -0.059737       -0.048945    0.038775
1         0.012116 -0.013711         -0.001112        0.012296   -0.010837
2         0.048271 -0.041431         -0.029284        0.003736   -0.010585
3         0.003894 -0.003989         -0.000903        0.001802   -0.002058
4         0.042328 -0.031384         -0.037634       -0.019976    0.012137
..             ...       ...               ...             ...         ...
392       0.033510 -0.048618          0.020036        0.015665   -0.015842
393      -0.375910  0.205380          0.547620        0.324480   -0.239950
394       0.000476 -0.002874          0.004532        0.002105   -0.000804
395       0.060850 -0.056323         -0.034956       -0.021791    0.012538
396      -0.030484  0.025238          0.023087        0.014763   -0.008174

      cooks_d  standard_resid  hat_diag  dffits_internal  student_resid  \
0    0.001971        0.808808  0.014840         0.099269       0.808445
1    0.000223        0.280919  0.013941         0.033403       0.280584
2    0.001295        0.684108  0.013647         0.080470       0.683637
3    0.000011        0.067890  0.012043         0.007496       0.067803
4    0.000662        0.436242  0.017095         0.057531       0.435786
..        ...             ...       ...              ...            ...
392  0.000865       -0.635307  0.010607        -0.065781      -0.634817
393  0.078430        2.926085  0.043795         0.626217       2.955175
394  0.000007       -0.037391  0.025702        -0.006073      -0.037343
395  0.001259       -0.728580  0.011716        -0.079329      -0.728138
396  0.000299        0.314862  0.014858         0.038668       0.314495

       dffits
0    0.099225
1    0.033363
2    0.080415
3    0.007486
4    0.057471
..        ...
392 -0.065730
393  0.632443
394 -0.006065
395 -0.079281
396  0.038623

[392 rows x 11 columns]
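A common rule of thumb flags observations with |DFFITS| above 2*sqrt(p/n); a sketch applying it to the frame above:
In [ ]:
# Flag observations by the rule of thumb |DFFITS| > 2*sqrt(p/n)
infl = influence.summary_frame()
p, n = len(reg.params), int(reg.nobs)
dffits_cutoff = 2 * np.sqrt(p / n)
print(dffits_cutoff)
print(infl.index[np.abs(infl['dffits']) > dffits_cutoff].tolist())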
In [36]:
# Cook's distance
# Cook's distance measures the effect of deleting the i-th observation
cooks_d = influence.cooks_distance[0]
plt.figure(figsize=(10, 6))
plt.stem(np.arange(len(cooks_d)), cooks_d, markerfmt=",")
plt.show()
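A frequently used cutoff for Cook's distance is 4/n; a sketch listing the observations exceeding it:
In [ ]:
# Observations with Cook's distance above the 4/n rule of thumb
cooks_cutoff = 4 / len(cooks_d)
print(np.where(cooks_d > cooks_cutoff))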
In [38]:
# Variance inflation factors (VIF)
# Here we measure the amount of multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()
vif_df["VIF Factor"] = [variance_inflation_factor(reg.model.exog, i) for i in range(reg.model.exog.shape[1])]
vif_df["features"] = reg.model.exog_names
print(vif_df)
   VIF Factor      features
0  726.310218     Intercept
1    1.228910          year
2    2.519844  acceleration
3    8.813443    horsepower
4    5.303347        weight
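The intercept's VIF is not interpretable on its own; a usual screen looks only at the predictors, e.g. against the common VIF > 5 rule of thumb. A sketch:
In [ ]:
# Screen only the predictors against the VIF > 5 rule of thumb
predictors = vif_df[vif_df["features"] != "Intercept"]
print(predictors[predictors["VIF Factor"] > 5])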