# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627

# Bootstrap analysis of efficacy of the Pfizer-BioNTech COVID-19 Vaccine

# Import necessary libraries
! pip install pandas;
! pip install numpy;
! pip install statsmodels;
! pip install scikit-learn;

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.utils import resample

# Data: 0 = placebo, 1 = treatment
V = np.concatenate([np.zeros(22000), np.ones(22000)])  # Treatment assignment: 0 for placebo, 1 for treatment

# Data: 1 = contracted COVID, 0 = did not contract
X = np.concatenate([np.zeros(21838), np.ones(162), np.zeros(21992), np.ones(8)])  # Infection data: 1 = infected

# Combine into a DataFrame
XV = pd.DataFrame({'X': X, 'V': V})  # 'X' is infection status, 'V' is treatment group

# Function to calculate the ratio of probabilities
def pratio(xv, Z):
    subsample = xv.iloc[Z]  # Subsample based on indices Z
    placebo = subsample[subsample['V'] == 0]  # Placebo group
    vaccine = subsample[subsample['V'] == 1]  # Vaccine group
    
    Pplacebo = placebo['X'].mean()  # Probability of infection in placebo group
    Pvaccine = vaccine['X'].mean()  # Probability of infection in vaccine group
    
    return Pplacebo / Pvaccine

# Bootstrap function to apply the pratio function
def bootstrap_pratio(xv, R=10000):
    n = len(xv)
    ratios = np.zeros(R)
    for i in range(R):
        Z = np.random.choice(np.arange(n), size=n, replace=True)  # Bootstrap sample
        ratios[i] = pratio(xv, Z)
    return ratios

# Run bootstrap for the ratio of probabilities
b_ratios = bootstrap_pratio(XV)

# 95% confidence interval for the ratio
ci_95 = np.percentile(b_ratios, [2.5, 97.5])
print(f"95% CI for p1/p2: {ci_95}")

# 5% one-sided confidence interval for the ratio
ci_5 = np.percentile(b_ratios, 5)
print(f"One-sided 95% CI lower bound for p1/p2: {ci_5}")

C:\Users\baron\AppData\Local\Temp\ipykernel_14388\3714909854.py:10: RuntimeWarning: divide by zero encountered in scalar divide
  return Pplacebo / Pvaccine

95% CI for p1/p2: [11.26246023 55.70211461]
One-sided 95% CI lower bound for p1/p2: 12.272603600663965

# Function to calculate efficacy
def efficacy(xv, Z):
    subsample = xv.iloc[Z]  # Subsample based on indices Z
    placebo = subsample[subsample['V'] == 0]
    vaccine = subsample[subsample['V'] == 1]
    
    Pplacebo = placebo['X'].mean()  # Probability of infection in placebo group
    Pvaccine = vaccine['X'].mean()  # Probability of infection in vaccine group
    
    return (Pplacebo - Pvaccine) / Pplacebo

# Bootstrap function to apply the efficacy function
def bootstrap_efficacy(xv, R=10000):
    n = len(xv)
    eff_values = np.zeros(R)
    for i in range(R):
        Z = np.random.choice(np.arange(n), size=n, replace=True)
        eff_values[i] = efficacy(xv, Z)
    return eff_values

# Run bootstrap for efficacy
b_eff = bootstrap_efficacy(XV)

# 95% confidence interval for efficacy
ci_efficacy_95 = np.percentile(b_eff, [2.5, 97.5])
print(f"95% CI for vaccine efficacy: {ci_efficacy_95}")

# 99% confidence interval for efficacy
ci_efficacy_99 = np.percentile(b_eff, [0.5, 99.5])
print(f"99% CI for vaccine efficacy: {ci_efficacy_99}")

95% CI for vaccine efficacy: [0.91166486 0.98199837]
99% CI for vaccine efficacy: [0.89683983 0.98865499]