In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# JACKKNIFE AND BOOTSTRAP
# Import necessary libraries
!pip install pandas numpy statsmodels scikit-learn matplotlib ISLP
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from ISLP import load_data
In [5]:
# Load the Auto dataset from package ISLP
from ISLP import load_data
Auto = load_data('Auto')
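In [ ]:
# Quick look at the data (an illustrative check, not part of the original analysis):
# Auto has 392 cars; 'mpg' is the variable resampled below.
print(Auto.shape)
print(Auto['mpg'].describe())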
In [11]:
############### JACKKNIFE ########################
# 1. Biased sample variance function:
# Define the biased sample variance function
def variance_fn(X):
    return np.mean((X - np.mean(X)) ** 2)
mpg = Auto['mpg']
# Calculate the biased variance
biased_variance = variance_fn(mpg)
print(biased_variance)
60.76273844231571
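In [ ]:
# A quick check (sketch): the biased estimator above equals (n-1)/n times the usual
# unbiased sample variance, so its bias is -sigma^2/n. The names n and unbiased here
# are illustrative.
n = len(mpg)
unbiased = np.var(mpg, ddof=1)
print("Biased variance:            ", biased_variance)
print("(n-1)/n * unbiased variance:", (n - 1) / n * unbiased)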
In [14]:
# 2. Jackknife function to reduce bias:
def jackknife(X, func):
    n = len(X)
    jack_values = np.zeros(n)
    # Compute jackknife estimates (leave-one-out)
    for i in range(n):
        X_jack = np.delete(X, i)
        jack_values[i] = func(X_jack)
    # Calculate jackknife bias and standard error
    jack_mean = np.mean(jack_values)
    jack_bias = (n - 1) * (jack_mean - func(X))
    jack_se = np.sqrt((n - 1) * np.mean((jack_values - jack_mean) ** 2))
    return jack_bias, jack_se, jack_values
# Apply jackknife to reduce bias in the variance estimation
jack_bias, jack_se, jack_values = jackknife(mpg, variance_fn)
print("Jackknife bias:", jack_bias)
print("Jackknife standard error:", jack_se)
print("Jackknife values:", jack_values)
Jackknife bias: -0.15540342313448718 Jackknife standard error: 3.7419506898082293 Jackknife values: [60.84209614 60.73523656 60.84209614 60.77598459 60.81160445 60.73523656 60.68936035 60.68936035 60.68936035 60.73523656 60.73523656 60.68936035 60.73523656 60.68936035 60.91735467 60.91278118 60.84209614 60.90280218 60.88575363 60.90141548 60.91194916 60.91735467 60.91194916 60.90141548 60.90280218 60.45457382 60.45457382 60.52096271 60.38305676 60.88575363 60.8649636 60.91194916 60.86745966 60.77598459 60.81160445 60.86745966 60.84209614 60.68936035 60.68936035 60.68936035 60.68936035 60.58222343 60.63835598 60.63835598 60.84209614 60.91278118 60.86745966 60.84209614 60.91763201 60.8649636 60.80799903 60.80799903 60.77182449 60.57584461 60.88575363 60.90141548 60.91735467 60.91194916 60.91763201 60.887695 60.90280218 60.63835598 60.68936035 60.73523656 60.68936035 60.81160445 60.52096271 60.63835598 60.58222343 60.63835598 60.86745966 60.73523656 60.63835598 60.63835598 60.68936035 60.84209614 60.91278118 60.90280218 60.90141548 60.91278118 60.8649636 60.91763201 60.8649636 60.88575363 60.63835598 60.68936035 60.63835598 60.68936035 60.73523656 60.58222343 60.63835598 60.63835598 60.68936035 60.63835598 60.58222343 60.63835598 60.84209614 60.77598459 60.84209614 60.84209614 60.91763201 60.90141548 60.52096271 60.58222343 60.63835598 60.58222343 60.84209614 60.887695 60.90280218 60.91278118 60.84209614 60.86745966 60.90280218 60.90141548 60.73523656 60.77598459 60.8390454 60.91735467 60.887695 60.86745966 60.73523656 60.91735467 60.887695 60.52096271 60.887695 60.86745966 60.73523656 60.77182449 60.90141548 60.73052178 60.91194916 60.77598459 60.77598459 60.84209614 60.77598459 60.63835598 60.68936035 60.68936035 60.68936035 60.8390454 60.90141548 60.90141548 60.77182449 60.73052178 60.8649636 60.91735467 60.90141548 60.91735467 60.90141548 60.77182449 60.86745966 60.84209614 60.73523656 60.73523656 60.77598459 60.73523656 60.77598459 60.68936035 60.81160445 60.77598459 60.73523656 60.84209614 60.90280218 60.887695 60.63835598 60.8390454 60.91763201 60.887695 60.91763201 60.91735467 60.91194916 60.91735467 60.84209614 60.8390454 60.86745966 60.91763201 60.91763201 60.91278118 60.91194916 60.68409089 60.8649636 60.91194916 60.91194916 60.90141548 60.88575363 60.82749132 60.77598459 60.75625159 60.71293948 60.91278118 60.91278118 60.91735467 60.91584762 60.8390454 60.91529294 60.8390454 60.68409089 60.887695 60.84209614 60.85541892 60.82749132 60.82416324 60.73052178 60.8649636 60.89422557 60.887695 60.63835598 60.86745966 60.86745966 60.79443554 60.79443554 60.63835598 60.63835598 60.63835598 60.75181416 60.80799903 60.51402921 60.90732334 60.65895239 60.82749132 60.81160445 60.75625159 60.73523656 60.82749132 60.89588961 60.86745966 60.85541892 60.77598459 60.75625159 60.75625159 60.77598459 60.8390454 60.91529294 60.90141548 60.90732334 60.79055278 60.65895239 60.80799903 60.79055278 60.91278118 60.9084327 60.9084327 59.92767931 60.50756562 60.69378732 60.26549813 60.50756562 60.88590224 60.87616918 60.89112669 60.87191698 60.89588961 60.89112669 60.91112656 60.89588961 60.87616918 60.89737469 60.900191 60.85792963 60.84486326 60.87191698 60.83348709 60.84486326 60.82749132 60.80799903 60.87599963 60.88200587 60.77567271 60.90403085 60.9179868 60.9178204 60.91761318 60.89276562 60.81160445 60.90940496 60.78351882 60.75181416 60.82416324 60.9084327 60.88405819 60.91477489 60.89112669 60.89737469 60.81160445 60.83051484 60.79443554 60.8475791 60.80827323 60.75625159 60.87191698 60.85541892 
60.73488282 60.62709388 60.53311229 60.878053 60.90835107 60.91763201 60.88200587 60.91761318 60.62160465 60.60482925 60.73919257 60.42600258 60.8552117 60.84463929 60.88929625 60.65895239 60.08237845 60.36752468 60.72610946 60.43308155 60.8649636 60.89576612 60.91627148 60.86971396 60.61606413 60.81461856 60.75997214 60.44708564 60.72164586 59.54350599 60.86727337 60.14593115 59.80303962 59.89721169 60.48786716 60.80799903 59.77072586 60.6432539 60.81461856 60.69855862 60.91797633 60.57584461 60.71256481 60.88200587 60.89263375 60.90393247 60.91813437 60.80799903 60.28981195 60.29781399 60.56989384 60.71713097 60.44708564 60.39717388 60.62709388 60.59338924 60.61047233 60.81133444 60.68409089 60.64853801 60.71256481 60.68896475 60.74765824 60.86260255 60.78321531 60.90835107 60.91668383 60.9153369 60.89263375 60.89112669 60.83051484 60.8649636 60.88575363 60.63253184 60.77182449 60.8390454 60.88575363 60.91735467 60.51402921 60.44708564 60.77182449 60.3750139 60.51402921 60.51402921 60.51402921 60.63253184 60.3750139 60.73052178 60.3750139 60.91194916 60.3750139 60.90141548 60.91278118 60.73052178 60.51402921 60.88575363 60.88575363 59.83489184 60.73052178 60.8649636 60.77182449]
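In [ ]:
# Sanity check (a sketch, not in the original notes): the sample mean is already unbiased,
# so its jackknife bias should be essentially zero, and its jackknife SE should match the
# classical formula s/sqrt(n).
mean_bias, mean_se, _ = jackknife(mpg, np.mean)
print("Jackknife bias of the mean:", mean_bias)
print("Jackknife SE of the mean:  ", mean_se)
print("Classical SE of the mean:  ", np.std(mpg, ddof=1) / np.sqrt(len(mpg)))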
In [15]:
# 3. Jackknife variance estimator:
n = len(mpg)
# Jackknife variance estimator by subtracting bias from the original estimator
corrected_variance = biased_variance - jack_bias
print("Corrected variance:", corrected_variance)
# Alternatively, use the jackknife formula directly
jackknife_var_estimator = n * variance_fn(mpg) - (n - 1) * np.mean(jack_values)
print("Jackknife variance estimator:", jackknife_var_estimator)
Corrected variance: 60.918141865450195
Jackknife variance estimator: 60.91814186545162
In [16]:
# 4. Verification with unbiased variance:
unbiased_variance = np.var(mpg, ddof=1)
print("Unbiased variance (numpy):", unbiased_variance)
Unbiased variance (numpy): 60.91814186544184
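In [ ]:
# The jackknife-corrected estimator matches the unbiased variance up to floating-point
# error (a small verification sketch using numpy's default tolerances).
print(np.isclose(corrected_variance, unbiased_variance))
print(np.isclose(jackknife_var_estimator, unbiased_variance))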
In [18]:
############### BOOTSTRAP #####################################
# 1. Bootstrap to Estimate Standard Deviation of a Sample Median:
B = 1000 # Number of bootstrap samples
n = len(mpg) # Sample size
median_bootstrap = np.zeros(B) # Initialize bootstrap medians array
# Generate bootstrap samples and compute medians
for k in range(B):
    b = np.random.choice(mpg, size=n, replace=True)
    median_bootstrap[k] = np.median(b)
# Plot histogram of bootstrap medians
plt.hist(median_bootstrap, bins=30, edgecolor='black')
plt.title('Histogram of Bootstrap Medians')
plt.show()
# Calculate the actual sample median and the bootstrap standard deviation
actual_median = np.median(mpg)
bootstrap_sd = np.std(median_bootstrap)
print(f"Sample median: {actual_median}")
print(f"Bootstrap estimator of SD(median): {bootstrap_sd}")
Sample median: 22.75
Bootstrap estimator of SD(median): 0.7906999920956115
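In [ ]:
# Optional (sketch): for a reproducible bootstrap run, fix a seed. The Generator object
# rng and the seed value 427 are illustrative choices, not part of the original notes.
rng = np.random.default_rng(427)
median_boot_seeded = np.array([np.median(rng.choice(np.asarray(mpg), size=n, replace=True)) for _ in range(B)])
print("Bootstrap SD of the median (seeded run):", np.std(median_boot_seeded))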
In [19]:
# 2. Bootstrap confidence interval
alpha = 0.05
lower_quantile = np.percentile(median_bootstrap, 100 * alpha / 2)
upper_quantile = np.percentile(median_bootstrap, 100 * (1 - alpha / 2))
print(f"Bootstrap confidence interval for the population median: [{lower_quantile}, {upper_quantile}]")
Bootstrap confidence interval for the population median: [21.0, 24.0]
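In [ ]:
# Alternative (sketch): the "basic" (pivotal) bootstrap interval reflects the percentile
# interval around the sample median, [2*median - q_{1-alpha/2}, 2*median - q_{alpha/2}].
# This is the same method requested from scipy below with method='basic'.
basic_lower = 2 * actual_median - upper_quantile
basic_upper = 2 * actual_median - lower_quantile
print(f"Basic bootstrap confidence interval for the median: [{basic_lower}, {basic_upper}]")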
In [20]:
# 3. Using the scipy.stats.bootstrap function
from scipy.stats import bootstrap
# Function to compute the median for bootstrap samples
def median_fn(data):
    return np.median(data)
# Perform bootstrap using scipy
bootstrap_result = bootstrap((mpg,), median_fn, n_resamples=10000, method='basic')
# Bootstrap results
print(f"Original sample median: {np.median(mpg)}")
print(f"Bootstrap bias: {np.mean(bootstrap_result.confidence_interval) - np.median(mpg)}")
print(f"Bootstrap standard error: {bootstrap_result.standard_error}")
Original sample median: 22.75
Bootstrap bias: 0.25
Bootstrap standard error: 0.772020431290282
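In [ ]:
# The scipy result also carries the interval itself (default confidence_level=0.95),
# which can be compared with the manual percentile interval computed above.
ci = bootstrap_result.confidence_interval
print(f"scipy 'basic' bootstrap CI for the median: [{ci.low}, {ci.high}]")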
In [23]:
# 4. Bootstrap Estimate of the Standard Deviation of Regression Coefficients:
import statsmodels.api as sm
# Define function to return regression intercept and slope
def slopes_fn(data, subsample):
    subsample_data = data.iloc[subsample]   # Use iloc to index rows
    X = subsample_data['horsepower']
    y = subsample_data['mpg']
    X = sm.add_constant(X)                  # Add intercept
    model = sm.OLS(y, X).fit()
    return model.params                     # Return intercept and slope
# Perform bootstrap
n = Auto.shape[0]
bootstrap_slopes = np.zeros((10000, 2)) # Store intercept and slope
for k in range(10000):
    subsample = np.random.choice(np.arange(n), size=n, replace=True)
    bootstrap_slopes[k, :] = slopes_fn(Auto, subsample)
# Calculate standard deviation of intercept and slope
intercept_sd = np.std(bootstrap_slopes[:, 0])
slope_sd = np.std(bootstrap_slopes[:, 1])
print(f"Bootstrap std. error of intercept: {intercept_sd}")
print(f"Bootstrap std. error of slope: {slope_sd}")
# Compare with standard linear regression result
X = sm.add_constant(Auto['horsepower'])
model = sm.OLS(Auto['mpg'], X).fit()
print(model.summary())
Bootstrap std. error of intercept: 0.8553227828474659
Bootstrap std. error of slope: 0.007440832344784144
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     599.7
Date:                Fri, 27 Sep 2024   Prob (F-statistic):           7.03e-81
Time:                        10:21:36   Log-Likelihood:                -1178.7
No. Observations:                 392   AIC:                             2361.
Df Residuals:                     390   BIC:                             2369.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         39.9359      0.717     55.660      0.000      38.525      41.347
horsepower    -0.1578      0.006    -24.489      0.000      -0.171      -0.145
==============================================================================
Omnibus:                       16.432   Durbin-Watson:                   0.920
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               17.305
Skew:                           0.492   Prob(JB):                     0.000175
Kurtosis:                       3.299   Cond. No.                         322.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
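In [ ]:
# Side-by-side comparison (sketch): the OLS table reports standard errors of about 0.717
# (intercept) and 0.006 (slope), versus roughly 0.855 and 0.0074 from the bootstrap.
print("OLS standard errors:")
print(model.bse)
print("Bootstrap standard errors:", intercept_sd, slope_sd)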
In [ ]:
# A comment about the discrepancy with the standard estimates. The bootstrap estimates are quite close to each
# other, as they should be, because each of them is based on 10,000 bootstrap samples. However, the bootstrap
# standard errors of the slopes are still a little higher than the estimates obtained by the standard regression
# analysis. The book attributes this phenomenon to the assumptions that standard regression requires, in
# particular, nonrandom predictors X and a constant variance of responses Var(Y) = σ². The bootstrap is a
# nonparametric procedure that is not based on any particular model, so it can account for the variance of the
# predictors. Here, "horsepower" is the predictor, and it is probably random.
#
# Since some of the standard regression assumptions are questionable here, the bootstrap method for estimating
# standard errors is more reliable.
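In [ ]:
# One way to relax the constant-variance assumption within the regression model itself
# (a sketch, not part of the original notes) is to request heteroskedasticity-consistent
# standard errors from statsmodels; these typically move toward the bootstrap values.
robust_model = sm.OLS(Auto['mpg'], sm.add_constant(Auto['horsepower'])).fit(cov_type='HC1')
print(robust_model.bse)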