In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# Bootstrap analysis of efficacy of the Pfizer-BioNTech COVID-19 Vaccine
# Import necessary libraries
! pip install pandas;
! pip install numpy;
! pip install statsmodels;
! pip install scikit-learn;
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.utils import resample
In [3]:
# Data: 0 = placebo, 1 = treatment
V = np.concatenate([np.zeros(22000), np.ones(22000)]) # Treatment assignment: 0 for placebo, 1 for treatment
# Data: 1 = contracted COVID, 0 = did not contract
X = np.concatenate([np.zeros(21838), np.ones(162), np.zeros(21992), np.ones(8)]) # Infection data: 1 = infected
# Combine into a DataFrame
XV = pd.DataFrame({'X': X, 'V': V}) # 'X' is infection status, 'V' is treatment group
In [4]:
# Function to calculate the ratio of probabilities
def pratio(xv, Z):
subsample = xv.iloc[Z] # Subsample based on indices Z
placebo = subsample[subsample['V'] == 0] # Placebo group
vaccine = subsample[subsample['V'] == 1] # Vaccine group
Pplacebo = placebo['X'].mean() # Probability of infection in placebo group
Pvaccine = vaccine['X'].mean() # Probability of infection in vaccine group
return Pplacebo / Pvaccine
In [5]:
# Bootstrap function to apply the pratio function
def bootstrap_pratio(xv, R=10000):
n = len(xv)
ratios = np.zeros(R)
for i in range(R):
Z = np.random.choice(np.arange(n), size=n, replace=True) # Bootstrap sample
ratios[i] = pratio(xv, Z)
return ratios
# Run bootstrap for the ratio of probabilities
b_ratios = bootstrap_pratio(XV)
# 95% confidence interval for the ratio
ci_95 = np.percentile(b_ratios, [2.5, 97.5])
print(f"95% CI for p1/p2: {ci_95}")
# 5% one-sided confidence interval for the ratio
ci_5 = np.percentile(b_ratios, 5)
print(f"One-sided 95% CI lower bound for p1/p2: {ci_5}")
C:\Users\baron\AppData\Local\Temp\ipykernel_14388\3714909854.py:10: RuntimeWarning: divide by zero encountered in scalar divide return Pplacebo / Pvaccine
95% CI for p1/p2: [11.26246023 55.70211461] One-sided 95% CI lower bound for p1/p2: 12.272603600663965
In [6]:
# Function to calculate efficacy
def efficacy(xv, Z):
subsample = xv.iloc[Z] # Subsample based on indices Z
placebo = subsample[subsample['V'] == 0]
vaccine = subsample[subsample['V'] == 1]
Pplacebo = placebo['X'].mean() # Probability of infection in placebo group
Pvaccine = vaccine['X'].mean() # Probability of infection in vaccine group
return (Pplacebo - Pvaccine) / Pplacebo
# Bootstrap function to apply the efficacy function
def bootstrap_efficacy(xv, R=10000):
n = len(xv)
eff_values = np.zeros(R)
for i in range(R):
Z = np.random.choice(np.arange(n), size=n, replace=True)
eff_values[i] = efficacy(xv, Z)
return eff_values
# Run bootstrap for efficacy
b_eff = bootstrap_efficacy(XV)
# 95% confidence interval for efficacy
ci_efficacy_95 = np.percentile(b_eff, [2.5, 97.5])
print(f"95% CI for vaccine efficacy: {ci_efficacy_95}")
# 99% confidence interval for efficacy
ci_efficacy_99 = np.percentile(b_eff, [0.5, 99.5])
print(f"99% CI for vaccine efficacy: {ci_efficacy_99}")
95% CI for vaccine efficacy: [0.91166486 0.98199837] 99% CI for vaccine efficacy: [0.89683983 0.98865499]