### Stat 427/627 (Baron). Lab 11a.
### Efficacy of the Pfizer-BioNTech COVID-19 Vaccine ###

# The Phase III clinical trial of Pfizer vaccine included n=44000 people, 
# 50% took trt, 50% placebo.

# Result: 8 vaccinated and 162 placebo participants contracted COVID-19.
# A conclusion was made that vaccination reduces the risk of COVID by 
# the factor of 20.

# Vaccine critics point to a small sample size of 170 for any meaningful
# conclusions. Here, we explore what confidence statements we can deduce
# from these data.

# Construct a Bootstrap CI for p1/p2 = the ratio of probabilities
# p1 = P( infection | placebo group ), p2 = P( infection | vaccine group )

V = c(rep(0,22000),rep(1,22000));        # 0 = placebo, 1 = treatment
X = c(rep(0,21838),rep(1,162),rep(0,21992),rep(1,8)); # 1 = contracted covid
XV = data.frame(X,V);

pratio = function(xv,Z){		 # function of data xv and a subsample Z
  x = xv[,1]; v = xv[,2];
  Pplacebo = sum(x[Z]==1 & v[Z]==0)/sum(v[Z]==0);
  Pvaccine = sum(x[Z]==1 & v[Z]==1)/sum(v[Z]==1);
  return( Pplacebo/Pvaccine )
}

pratio(XV,)

# The ratio is estimated as 20.25. We need to supply it with the margin of error 
# and a confidence level. That is, use Bootstrap to construct a confidence interval.

library(boot)                            
b = boot( XV, pratio, R=10000 ) 	 # Apply bootstrap
quantile(b$t,c(0.025,0.975))	         # This is a 95% confidence interval for p1/p2

#     2.5%    97.5%	# Thus, we can claim with 95% confidence that the ratio of 
# 11.34329 54.51769     # probabilities is between 11.3 and 54.5. But do we need an interval, 
                        # or should we just find a lower bound? 
quantile(b$t,0.05)      # Try a one-sided confidence interval.
#       5%              # We can now claim, also with 95% confidence, that vaccination
# 12.29509              # reduces the chance of COVID-19 infection by at least a factor of 12.

################################################################################
### Official efficacy of the vaccine ###########################################
### The efficacy of a vaccine is defined as the difference of risks between ####
### the vaccine group and the placebo group, relative to the placebo group. ####
################################################################################

### To study the efficacy of Pfizer, we modify our function:

efficacy = function(xv,Z){ # function of data xv and a subsample Z
   x = xv[,1]; v = xv[,2];
   Pplacebo = sum(x[Z]==1 & v[Z]==0)/sum(v[Z]==0);
   Pvaccine = sum(x[Z]==1 & v[Z]==1)/sum(v[Z]==1);
   return( (Pplacebo - Pvaccine)/Pplacebo ) }
 
pratio(XV,)   # This gives the estimated efficacy which you see in all official reports
# 0.9506173   # But we know that it is only based on the sample, so it has a margin of error.
              # So, we proceed by building a confidence interval

b = boot( XV, efficacy, 10000 )		# Bootstrap! 
hist(b$t)     # This is a histogram of bootstrap efficacy values, it's not very symmetric
              # I'm just checking normality, to see if we can settle with a PARAMETRIC
              # bootstrap confidence interval, which is the mean b$t, plus or minus 1.96 
              # standard deviations

shapiro.test(b$t)		# Shapiro-Wilk test is a rigorous test of normality
Error in shapiro.test(b$t) : sample size must be between 3 and 5000
                      # The way it's built in R does not handle samples > 5000                
shapiro.test(b$t[sample(10000,5000)])   # Ok, we select a random sample of n=5000

#        Shapiro-Wilk normality test
#
# data:  b$t[sample(10000, 5000)]
# W = 0.98926, p-value < 2.2e-16        # Shapiro-Wilk test rejects normal distribution.
                                        # Hence, we construct a NONPARAMETRIC
quantile( b$t, c(0.025,0.975) )         # bootstrap confidence interval 
#      2.5%     97.5% 
# 0.9105982 0.9816955                   # This is a 95% confidence interval for efficacy
quantile( b$t, c(0.005,0.995) )
     0.5%     99.5% 
0.8947269 0.9882617                     # This is a 99% confidence interval