In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# JACKKNIFE AND BOOTSTRAP
# Import necessary libraries
!pip install pandas numpy statsmodels scikit-learn matplotlib ISLP
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from ISLP import load_data
In [5]:
# Load the Auto dataset from package ISLP
from ISLP import load_data
Auto = load_data('Auto')
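In [ ]:
# Quick look at the data (an illustrative check, not part of the original analysis):
# Auto has 392 cars; 'mpg' is the variable resampled below.
print(Auto.shape)
print(Auto['mpg'].describe())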
In [11]:
############### JACKKNIFE ########################
# 1. Biased sample variance function:
# Define the biased sample variance function
def variance_fn(X):
    return np.mean((X - np.mean(X)) ** 2)
mpg = Auto['mpg']
# Calculate the biased variance
biased_variance = variance_fn(mpg)
print(biased_variance)
60.76273844231571
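In [ ]:
# A quick check (sketch): the biased estimator above equals (n-1)/n times the usual
# unbiased sample variance, so its bias is -sigma^2/n. The names n and unbiased here
# are illustrative.
n = len(mpg)
unbiased = np.var(mpg, ddof=1)
print("Biased variance:            ", biased_variance)
print("(n-1)/n * unbiased variance:", (n - 1) / n * unbiased)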
In [14]:
# 2. Jackknife function to reduce bias:
def jackknife(X, func):
    n = len(X)
    jack_values = np.zeros(n)
    # Compute jackknife estimates (leave-one-out)
    for i in range(n):
        X_jack = np.delete(X, i)
        jack_values[i] = func(X_jack)
    # Calculate jackknife bias and standard error
    jack_mean = np.mean(jack_values)
    jack_bias = (n - 1) * (jack_mean - func(X))
    jack_se = np.sqrt((n - 1) * np.mean((jack_values - jack_mean) ** 2))
    return jack_bias, jack_se, jack_values
# Apply jackknife to reduce bias in the variance estimation
jack_bias, jack_se, jack_values = jackknife(mpg, variance_fn)
print("Jackknife bias:", jack_bias)
print("Jackknife standard error:", jack_se)
print("Jackknife values:", jack_values)
Jackknife bias: -0.15540342313448718 Jackknife standard error: 3.7419506898082293 Jackknife values: [60.84209614 60.73523656 60.84209614 60.77598459 60.81160445 60.73523656 60.68936035 60.68936035 60.68936035 60.73523656 60.73523656 60.68936035 60.73523656 60.68936035 60.91735467 60.91278118 60.84209614 60.90280218 60.88575363 60.90141548 60.91194916 60.91735467 60.91194916 60.90141548 60.90280218 60.45457382 60.45457382 60.52096271 60.38305676 60.88575363 60.8649636 60.91194916 60.86745966 60.77598459 60.81160445 60.86745966 60.84209614 60.68936035 60.68936035 60.68936035 60.68936035 60.58222343 60.63835598 60.63835598 60.84209614 60.91278118 60.86745966 60.84209614 60.91763201 60.8649636 60.80799903 60.80799903 60.77182449 60.57584461 60.88575363 60.90141548 60.91735467 60.91194916 60.91763201 60.887695 60.90280218 60.63835598 60.68936035 60.73523656 60.68936035 60.81160445 60.52096271 60.63835598 60.58222343 60.63835598 60.86745966 60.73523656 60.63835598 60.63835598 60.68936035 60.84209614 60.91278118 60.90280218 60.90141548 60.91278118 60.8649636 60.91763201 60.8649636 60.88575363 60.63835598 60.68936035 60.63835598 60.68936035 60.73523656 60.58222343 60.63835598 60.63835598 60.68936035 60.63835598 60.58222343 60.63835598 60.84209614 60.77598459 60.84209614 60.84209614 60.91763201 60.90141548 60.52096271 60.58222343 60.63835598 60.58222343 60.84209614 60.887695 60.90280218 60.91278118 60.84209614 60.86745966 60.90280218 60.90141548 60.73523656 60.77598459 60.8390454 60.91735467 60.887695 60.86745966 60.73523656 60.91735467 60.887695 60.52096271 60.887695 60.86745966 60.73523656 60.77182449 60.90141548 60.73052178 60.91194916 60.77598459 60.77598459 60.84209614 60.77598459 60.63835598 60.68936035 60.68936035 60.68936035 60.8390454 60.90141548 60.90141548 60.77182449 60.73052178 60.8649636 60.91735467 60.90141548 60.91735467 60.90141548 60.77182449 60.86745966 60.84209614 60.73523656 60.73523656 60.77598459 60.73523656 60.77598459 60.68936035 60.81160445 60.77598459 60.73523656 60.84209614 60.90280218 60.887695 60.63835598 60.8390454 60.91763201 60.887695 60.91763201 60.91735467 60.91194916 60.91735467 60.84209614 60.8390454 60.86745966 60.91763201 60.91763201 60.91278118 60.91194916 60.68409089 60.8649636 60.91194916 60.91194916 60.90141548 60.88575363 60.82749132 60.77598459 60.75625159 60.71293948 60.91278118 60.91278118 60.91735467 60.91584762 60.8390454 60.91529294 60.8390454 60.68409089 60.887695 60.84209614 60.85541892 60.82749132 60.82416324 60.73052178 60.8649636 60.89422557 60.887695 60.63835598 60.86745966 60.86745966 60.79443554 60.79443554 60.63835598 60.63835598 60.63835598 60.75181416 60.80799903 60.51402921 60.90732334 60.65895239 60.82749132 60.81160445 60.75625159 60.73523656 60.82749132 60.89588961 60.86745966 60.85541892 60.77598459 60.75625159 60.75625159 60.77598459 60.8390454 60.91529294 60.90141548 60.90732334 60.79055278 60.65895239 60.80799903 60.79055278 60.91278118 60.9084327 60.9084327 59.92767931 60.50756562 60.69378732 60.26549813 60.50756562 60.88590224 60.87616918 60.89112669 60.87191698 60.89588961 60.89112669 60.91112656 60.89588961 60.87616918 60.89737469 60.900191 60.85792963 60.84486326 60.87191698 60.83348709 60.84486326 60.82749132 60.80799903 60.87599963 60.88200587 60.77567271 60.90403085 60.9179868 60.9178204 60.91761318 60.89276562 60.81160445 60.90940496 60.78351882 60.75181416 60.82416324 60.9084327 60.88405819 60.91477489 60.89112669 60.89737469 60.81160445 60.83051484 60.79443554 60.8475791 60.80827323 60.75625159 60.87191698 60.85541892 
60.73488282 60.62709388 60.53311229 60.878053 60.90835107 60.91763201 60.88200587 60.91761318 60.62160465 60.60482925 60.73919257 60.42600258 60.8552117 60.84463929 60.88929625 60.65895239 60.08237845 60.36752468 60.72610946 60.43308155 60.8649636 60.89576612 60.91627148 60.86971396 60.61606413 60.81461856 60.75997214 60.44708564 60.72164586 59.54350599 60.86727337 60.14593115 59.80303962 59.89721169 60.48786716 60.80799903 59.77072586 60.6432539 60.81461856 60.69855862 60.91797633 60.57584461 60.71256481 60.88200587 60.89263375 60.90393247 60.91813437 60.80799903 60.28981195 60.29781399 60.56989384 60.71713097 60.44708564 60.39717388 60.62709388 60.59338924 60.61047233 60.81133444 60.68409089 60.64853801 60.71256481 60.68896475 60.74765824 60.86260255 60.78321531 60.90835107 60.91668383 60.9153369 60.89263375 60.89112669 60.83051484 60.8649636 60.88575363 60.63253184 60.77182449 60.8390454 60.88575363 60.91735467 60.51402921 60.44708564 60.77182449 60.3750139 60.51402921 60.51402921 60.51402921 60.63253184 60.3750139 60.73052178 60.3750139 60.91194916 60.3750139 60.90141548 60.91278118 60.73052178 60.51402921 60.88575363 60.88575363 59.83489184 60.73052178 60.8649636 60.77182449]
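In [ ]:
# Sanity check (a sketch, not in the original notes): the sample mean is already unbiased,
# so its jackknife bias should be essentially zero, and its jackknife SE should match the
# classical formula s/sqrt(n).
mean_bias, mean_se, _ = jackknife(mpg, np.mean)
print("Jackknife bias of the mean:", mean_bias)
print("Jackknife SE of the mean:  ", mean_se)
print("Classical SE of the mean:  ", np.std(mpg, ddof=1) / np.sqrt(len(mpg)))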
In [15]:
# 3. Jackknife variance estimator:
n = len(mpg)
# Jackknife variance estimator by subtracting bias from the original estimator
corrected_variance = biased_variance - jack_bias
print("Corrected variance:", corrected_variance)
# Alternatively, use the jackknife formula directly
jackknife_var_estimator = n * variance_fn(mpg) - (n - 1) * np.mean(jack_values)
print("Jackknife variance estimator:", jackknife_var_estimator)
Corrected variance: 60.918141865450195
Jackknife variance estimator: 60.91814186545162
In [16]:
# 4. Verification with unbiased variance:
unbiased_variance = np.var(mpg, ddof=1)
print("Unbiased variance (numpy):", unbiased_variance)
Unbiased variance (numpy): 60.91814186544184
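In [ ]:
# The jackknife-corrected estimator matches the unbiased variance up to floating-point
# error (a small verification sketch using numpy's default tolerances).
print(np.isclose(corrected_variance, unbiased_variance))
print(np.isclose(jackknife_var_estimator, unbiased_variance))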
In [18]:
############### BOOTSTRAP #####################################
# 1. Bootstrap to Estimate Standard Deviation of a Sample Median:
B = 1000 # Number of bootstrap samples
n = len(mpg) # Sample size
median_bootstrap = np.zeros(B) # Initialize bootstrap medians array
# Generate bootstrap samples and compute medians
for k in range(B):
    b = np.random.choice(mpg, size=n, replace=True)
    median_bootstrap[k] = np.median(b)
# Plot histogram of bootstrap medians
plt.hist(median_bootstrap, bins=30, edgecolor='black')
plt.title('Histogram of Bootstrap Medians')
plt.show()
# Calculate the actual sample median and the bootstrap standard deviation
actual_median = np.median(mpg)
bootstrap_sd = np.std(median_bootstrap)
print(f"Sample median: {actual_median}")
print(f"Bootstrap estimator of SD(median): {bootstrap_sd}")
Sample median: 22.75
Bootstrap estimator of SD(median): 0.7906999920956115
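In [ ]:
# Optional (sketch): for a reproducible bootstrap run, fix a seed. The Generator object
# rng and the seed value 427 are illustrative choices, not part of the original notes.
rng = np.random.default_rng(427)
median_boot_seeded = np.array([np.median(rng.choice(np.asarray(mpg), size=n, replace=True)) for _ in range(B)])
print("Bootstrap SD of the median (seeded run):", np.std(median_boot_seeded))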
In [19]:
# 2. Bootstrap confidence interval
alpha = 0.05
lower_quantile = np.percentile(median_bootstrap, 100 * alpha / 2)
upper_quantile = np.percentile(median_bootstrap, 100 * (1 - alpha / 2))
print(f"Bootstrap confidence interval for the population median: [{lower_quantile}, {upper_quantile}]")
Bootstrap confidence interval for the population median: [21.0, 24.0]
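In [ ]:
# Alternative (sketch): the "basic" (pivotal) bootstrap interval reflects the percentile
# interval around the sample median, [2*median - q_{1-alpha/2}, 2*median - q_{alpha/2}].
# This is the same method requested from scipy below with method='basic'.
basic_lower = 2 * actual_median - upper_quantile
basic_upper = 2 * actual_median - lower_quantile
print(f"Basic bootstrap confidence interval for the median: [{basic_lower}, {basic_upper}]")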
In [20]:
# 3. Using the scipy.stats.bootstrap function
from scipy.stats import bootstrap
# Function to compute the median for bootstrap samples
def median_fn(data):
    return np.median(data)
# Perform bootstrap using scipy
bootstrap_result = bootstrap((mpg,), median_fn, n_resamples=10000, method='basic')
# Bootstrap results
print(f"Original sample median: {np.median(mpg)}")
print(f"Bootstrap bias: {np.mean(bootstrap_result.confidence_interval) - np.median(mpg)}")
print(f"Bootstrap standard error: {bootstrap_result.standard_error}")
Original sample median: 22.75
Bootstrap bias: 0.25
Bootstrap standard error: 0.772020431290282
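In [ ]:
# The scipy result also carries the interval itself (default confidence_level=0.95),
# which can be compared with the manual percentile interval computed above.
ci = bootstrap_result.confidence_interval
print(f"scipy 'basic' bootstrap CI for the median: [{ci.low}, {ci.high}]")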
In [23]:
# 4. Bootstrap Estimate of the Standard Deviation of Regression Coefficients:
import statsmodels.api as sm
# Define function to return regression intercept and slope
def slopes_fn(data, subsample):
    subsample_data = data.iloc[subsample]   # Use iloc to index rows
    X = subsample_data['horsepower']
    y = subsample_data['mpg']
    X = sm.add_constant(X)                  # Add intercept
    model = sm.OLS(y, X).fit()
    return model.params                     # Return intercept and slope
# Perform bootstrap
n = Auto.shape[0]
bootstrap_slopes = np.zeros((10000, 2)) # Store intercept and slope
for k in range(10000):
    subsample = np.random.choice(np.arange(n), size=n, replace=True)
    bootstrap_slopes[k, :] = slopes_fn(Auto, subsample)
# Calculate standard deviation of intercept and slope
intercept_sd = np.std(bootstrap_slopes[:, 0])
slope_sd = np.std(bootstrap_slopes[:, 1])
print(f"Bootstrap std. error of intercept: {intercept_sd}")
print(f"Bootstrap std. error of slope: {slope_sd}")
# Compare with standard linear regression result
X = sm.add_constant(Auto['horsepower'])
model = sm.OLS(Auto['mpg'], X).fit()
print(model.summary())
Bootstrap std. error of intercept: 0.8553227828474659
Bootstrap std. error of slope: 0.007440832344784144
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     599.7
Date:                Fri, 27 Sep 2024   Prob (F-statistic):           7.03e-81
Time:                        10:21:36   Log-Likelihood:                -1178.7
No. Observations:                 392   AIC:                             2361.
Df Residuals:                     390   BIC:                             2369.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         39.9359      0.717     55.660      0.000      38.525      41.347
horsepower    -0.1578      0.006    -24.489      0.000      -0.171      -0.145
==============================================================================
Omnibus:                       16.432   Durbin-Watson:                   0.920
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               17.305
Skew:                           0.492   Prob(JB):                     0.000175
Kurtosis:                       3.299   Cond. No.                         322.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
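In [ ]:
# Side-by-side comparison (sketch): the OLS table reports standard errors of about 0.717
# (intercept) and 0.006 (slope), versus roughly 0.855 and 0.0074 from the bootstrap.
print("OLS standard errors:")
print(model.bse)
print("Bootstrap standard errors:", intercept_sd, slope_sd)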
In [ ]:
# A comment about the discrepancy with the standard estimates. The bootstrap estimates are quite close to each
# other, as they should be, because each of them is based on 10,000 bootstrap samples. However, the bootstrap
# standard errors of the slopes are still a little higher than the estimates obtained by the standard regression
# analysis. The book attributes this phenomenon to the assumptions that standard regression requires, in
# particular, nonrandom predictors X and a constant variance of responses Var(Y) = σ². The bootstrap is a
# nonparametric procedure that is not based on any particular model, so it can account for the variance of the
# predictors. Here, "horsepower" is the predictor, and it is probably random.
#
# Since some of the standard regression assumptions are questionable here, the bootstrap method for estimating
# standard errors is more reliable.
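In [ ]:
# One way to relax the constant-variance assumption within the regression model itself
# (a sketch, not part of the original notes) is to request heteroskedasticity-consistent
# standard errors from statsmodels; these typically move toward the bootstrap values.
robust_model = sm.OLS(Auto['mpg'], sm.add_constant(Auto['horsepower'])).fit(cov_type='HC1')
print(robust_model.bse)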