683-final.Rmd

---
title: "683 Final project"
author: "Ariane Stark, Minsu Kim, Diezhang Wu, Alona Muzikansky"
date: "11/6/2021"
output:
  pdf_document: default
  html_document: default
---

```{r setup, include=FALSE}
library(tidyverse)
library(SuperLearner)
library(ltmle)
library(MASS)
library(dplyr)
library(kableExtra)
library(randomForest)
knitr::opts_chunk$set(echo = TRUE)

set.seed(123)
```


```{r include=FALSE}
# Read the raw fertility data and list the available columns.
data <- read.csv("fertility_sperm.csv")
colnames(data)

# Recode the raw columns into the analysis variables:
#   W11, W12, W13, W14, W2 : baseline covariates
#   A                      : binary exposure derived from `smoking`
#   Y                      : binary outcome derived from `diagnosis`
data <- data %>%
  mutate(
    W11 = age,
    W12 = sitting_hour,
    W13 = trauma,
    # three-level alcohol category, stored as a character code
    W14 = case_when(
      alcohol < 0.8 ~ "2",
      alcohol == 0.8 ~ "1",
      alcohol == 1 ~ "0"
    ),
    # the four season codes are collapsed into two groups
    W2 = case_when(
      season %in% c(-1, 1) ~ "Fall/Winter",
      season %in% c(-0.33, 0.33) ~ "Spring/Summer"
    ),
    # smoking == -1 is the unexposed level; codes 0 and 1 both count as exposed
    A = case_when(
      smoking == -1 ~ 0,
      smoking %in% c(0, 1) ~ 1
    ),
    # any diagnosis other than "N" is coded 1
    Y = ifelse(diagnosis == "N", 0, 1)
  )

# Keep only the analysis variables.
ObsData <- data %>% dplyr::select(W11, W12, W13, W14, W2, A, Y)

```


```{r warning=FALSE}
set.seed(123)

# ---- Simple substitution estimator (a.k.a. parametric G-computation) ----
# Logistic outcome regression on covariates and exposure.
g.comp.reg <- glm(
  Y ~ W11 + W12 + W13 + W14 + W2 + A,
  family = "binomial",
  data = ObsData
)

# Copies of the observed data with the exposure set to 1 / 0 for everyone.
txt <- transform(ObsData, A = 1)
control <- transform(ObsData, A = 0)

pred.txt <- predict(g.comp.reg, newdata = txt, type = "response")
pred.control <- predict(g.comp.reg, newdata = control, type = "response")

# Average difference between the two sets of predicted outcomes.
psi.hat <- mean(pred.txt - pred.control)
psi.hat

# ---- IPTW estimator ----
# Logistic model for the exposure given W11-W14 (W2 is not in this model).
prob.AW.reg <- glm(
  A ~ W11 + W12 + W13 + W14,
  family = "binomial",
  data = ObsData
)
prob.1W <- predict(prob.AW.reg, type = "response")
prob.0W <- 1 - prob.1W

# Distribution of the predicted exposure probabilities.
hist(prob.1W)
summary(prob.1W)

hist(prob.0W)
summary(prob.0W)

# Inverse-probability weights: non-zero only in the matching exposure group.
is.txt <- as.numeric(ObsData$A == 1)
is.control <- as.numeric(ObsData$A == 0)
wt1 <- is.txt / prob.1W
wt0 <- is.control / prob.0W
summary(wt1)
hist(wt1)
summary(wt0)
hist(wt0)

psi.iptw <- with(ObsData, mean(wt1 * Y) - mean(wt0 * Y))
psi.iptw

# ---- Modified HT: each weighted mean is divided by its mean weight ----
psi.ht <- with(
  ObsData,
  mean(wt1 * Y) / mean(wt1) - mean(wt0 * Y) / mean(wt0)
)
psi.ht

# ---- Unadjusted estimator: weights use the marginal exposure proportions ----
wt1.ua <- is.txt / mean(is.txt)
wt0.ua <- is.control / mean(is.control)
psi.unadj <- with(ObsData, mean(wt1.ua * Y) - mean(wt0.ua * Y))
psi.unadj


# TMLE estimator

```


## SS, IPTW, and TMLE estimators with Super Learner
```{r warning=FALSE}
library("SuperLearner")

# Candidate learners handed to SuperLearner, in the order they are fitted.
SL.library <- c(
  "SL.glm",
  "SL.glm.interaction",
  "SL.step",
  "SL.randomForest",
  "SL.step.forward",
  "SL.stepAIC",
  "SL.mean"
)

```

```{r}
run.tmle <- function(ObsData, SL.library){
  # Compute the simple substitution, IPTW and TMLE point estimates of the
  # mean difference in outcome under A=1 versus A=0, using Super Learner for
  # both the outcome regression and the exposure mechanism.
  #
  # Args:
  #   ObsData:    data frame with columns W11, W12, W13, W14, W2 (baseline
  #               covariates), A (exposure, 0/1) and Y (outcome, 0/1).
  #   SL.library: character vector of SuperLearner wrapper names.
  #
  # Returns a list with
  #   estimates:     one-row data frame (PsiHat.SS, PsiHat.IPTW, PsiHat.TMLE)
  #   predictions:   data frame of targeted predictions with columns
  #                  givenAW, given1W, given0W
  #   H.AW:          clever covariate evaluated at the observed exposure
  #   probA1.givenW: estimated P(A=1|W)
  #   probA0.givenW: estimated P(A=0|W), i.e. 1 - probA1.givenW

  # Keep predicted probabilities strictly inside (0,1) so that qlogis() stays
  # finite; dim attributes of the input are preserved.
  bound.prob <- function(p, eps = 1e-6) pmin(pmax(p, eps), 1 - eps)

  #------------------------------------------
  # Simple substitution estimator
  #------------------------------------------

  # dataframe X with baseline covariates and exposure
  X <- subset(ObsData, select=c(A, W11, W12, W13, W14, W2))

  # set the exposure=1 in X1 and the exposure=0 in X0
  X1 <- X0 <- X
  X1$A <- 1
  X0$A <- 0

  # Estimate E_0(Y|A,W) with Super Learner
  SL.outcome <- SuperLearner(Y=ObsData$Y, X=X, SL.library=SL.library,
                             family="binomial", cvControl=list(V=10))

  # expected outcome, given the observed exposure and covariates; predict on
  # the same design frame the learner was trained on (no outcome column)
  expY.givenAW <- predict(SL.outcome, newdata=X)$pred
  # expected outcome, given A=1 and covariates
  expY.given1W <- predict(SL.outcome, newdata=X1)$pred
  # expected outcome, given A=0 and covariates
  expY.given0W <- predict(SL.outcome, newdata=X0)$pred

  # simple substitution estimator
  PsiHat.SS <- mean(expY.given1W - expY.given0W)

  #------------------------------------------
  # Inverse probability of txt weighting
  #------------------------------------------

  # Super Learner for the exposure mechanism P_0(A=1|W); W2 is left out of
  # the exposure model
  SL.exposure <- SuperLearner(Y=ObsData$A,
                              X=subset(ObsData, select= -c(A, Y, W2)),
                              SL.library=SL.library, family="binomial",
                              cvControl=list(V=10, stratifyCV = TRUE))

  # predicted prob of being exposed, given baseline cov
  probA1.givenW <- SL.exposure$SL.predict
  # predicted prob of not being exposed, given baseline cov
  probA0.givenW <- 1 - probA1.givenW

  # clever covariate at the observed exposure
  H.AW <- as.numeric(ObsData$A==1)/probA1.givenW -
    as.numeric(ObsData$A==0)/probA0.givenW

  # clever covariate evaluated at A=1 and A=0 for all participants
  H.1W <- 1/probA1.givenW
  H.0W <- -1/probA0.givenW

  # IPTW estimate
  PsiHat.IPTW <- mean(H.AW*ObsData$Y, na.rm = TRUE)

  #------------------------------------------
  # Targeting & TMLE
  #------------------------------------------

  # Logit of the initial estimates. All three prediction vectors are bounded
  # the same way, so the offset used to fit epsilon and the predictions being
  # updated live on the same scale and none of them is +/-Inf.
  logitY.givenAW <- qlogis(bound.prob(expY.givenAW))
  logitY.given1W <- qlogis(bound.prob(expY.given1W))
  logitY.given0W <- qlogis(bound.prob(expY.given0W))

  # Update the initial estimator of E_0(Y|A,W): logistic regression of Y on
  # H.AW (no intercept) with the logit of the initial estimates as offset
  logitUpdate <- glm(ObsData$Y ~ -1 + offset(logitY.givenAW) + H.AW,
                     family='binomial')
  epsilon <- unname(coef(logitUpdate))

  # obtain the targeted estimates
  expY.givenAW.star <- plogis(logitY.givenAW + epsilon*H.AW)
  expY.given1W.star <- plogis(logitY.given1W + epsilon*H.1W)
  expY.given0W.star <- plogis(logitY.given0W + epsilon*H.0W)

  # TMLE point estimate
  PsiHat.TMLE <- mean(expY.given1W.star - expY.given0W.star)

  #------------------------------------------
  # Return a list with the point estimates, targeted estimates of E_0(Y|A,W),
  # the vector of clever covariates and both exposure probabilities
  #------------------------------------------

  estimates <- data.frame(PsiHat.SS=PsiHat.SS, PsiHat.IPTW=PsiHat.IPTW,
                          PsiHat.TMLE=PsiHat.TMLE)
  predictions <- data.frame(cbind(expY.givenAW.star, expY.given1W.star,
                                  expY.given0W.star))
  colnames(predictions) <- c('givenAW', 'given1W', 'given0W')
  list(estimates=estimates, predictions=predictions, H.AW=H.AW,
       probA1.givenW=probA1.givenW, probA0.givenW=probA0.givenW)
}
```


```{r}
# Fix the RNG state so the cross-validation splits are reproducible, then
# run all three estimators on the observed data.
set.seed(123)
out <- run.tmle(ObsData, SL.library)

# One-row data frame holding the three point estimates.
est <- out[["estimates"]]
est

```


# CV Superlearner
```{r message=FALSE, warning=FALSE}
# Predictors for the outcome model: every column of ObsData except Y.
X <- ObsData[, setdiff(names(ObsData), "Y")]

# Cross-validated Super Learner for the outcome regression:
# 5 outer folds, with a 20-fold inner cross-validation specification.
CV.SL.out <- CV.SuperLearner(
  Y = ObsData$Y,
  X = X,
  SL.library = SL.library,
  family = "binomial",
  cvControl = list(V = 5),
  innerCvControl = list(list(V = 20))
)

summary(CV.SL.out)

```
# Influence Curve

```{r}
n <- nrow(ObsData)
# clever covariate
H.AW <- out$H.AW
# targeted predictions
expY.AW.star <- out$predictions[, "givenAW"]
expY.1W.star <- out$predictions[, "given1W"]
expY.0W.star <- out$predictions[, "given0W"]
# point estimate
PsiHat.TMLE <- est$PsiHat.TMLE

# plug-in estimate of the TMLE influence curve
IC <- H.AW * (ObsData$Y - expY.AW.star) + expY.1W.star - expY.0W.star -
  PsiHat.TMLE
summary(IC)
hist(IC)

# estimate sigma^2 with the variance of the IC divided by n
varHat.IC <- var(IC) / n
varHat.IC
# standard error estimate (TMLE)
se <- sqrt(varHat.IC)
se


##### TMLE

# obtain 95% two-sided confidence intervals TMLE:
alpha <- 0.05
round(c(PsiHat.TMLE + qnorm(alpha / 2, lower.tail = TRUE) * se,
        PsiHat.TMLE + qnorm(alpha / 2, lower.tail = FALSE) * se), 4)

# calculate the pvalue tmle
2 * pnorm(abs(PsiHat.TMLE / se), lower.tail = FALSE)


####### IPTW

PsiHat.IPTW <- est$PsiHat.IPTW

# The IPTW estimator is the sample mean of H.AW * Y, so its standard error
# comes from its own influence curve H.AW * Y - PsiHat.IPTW rather than from
# the TMLE influence curve above.
IC.IPTW <- H.AW * ObsData$Y - PsiHat.IPTW
se.IPTW <- sqrt(var(IC.IPTW) / n)
se.IPTW

# obtain 95% two-sided confidence intervals:
round(c(PsiHat.IPTW + qnorm(alpha / 2, lower.tail = TRUE) * se.IPTW,
        PsiHat.IPTW + qnorm(alpha / 2, lower.tail = FALSE) * se.IPTW), 4)

# calculate the pvalue
2 * pnorm(abs(PsiHat.IPTW / se.IPTW), lower.tail = FALSE)

####### SS

PsiHat.SS <- est$PsiHat.SS

# NOTE(review): no influence-curve-based standard error is computed for the
# simple substitution estimator. The interval and p-value below reuse the
# TMLE standard error `se` as a rough reference only; prefer the bootstrap
# interval for this estimator when reporting.

# obtain 95% two-sided confidence intervals:
round(c(PsiHat.SS + qnorm(alpha / 2, lower.tail = TRUE) * se,
        PsiHat.SS + qnorm(alpha / 2, lower.tail = FALSE) * se), 4)

# calculate the pvalue
2 * pnorm(abs(PsiHat.SS / se), lower.tail = FALSE)

```


# Non-parametric bootstrap

```{r}
# Restore the saved bootstrap workspace; the lines below use the object
# `estimates` and treat it as having three columns.
load("boot_par.Rdata")

# Label the columns by estimator.
colnames(estimates) <- c("SimpSubs", "IPTW", "TMLE")

summary(estimates)
```


```{r}
# Histogram of one column of bootstrap point estimates; the three plots
# differ only in the column plotted and the title.
boot.histogram <- function(values, title) {
  ggplot(mapping = aes(values)) +
    geom_histogram(fill = "dark green", bins = 25) +
    labs(
      title = title,
      subtitle = "500 Bootstrap Samples",
      x = "Point Estimates",
      y = "Freq"
    ) +
    theme(plot.title = element_text(colour = "red"))
}

boot.histogram(estimates[, 1], "Simple Substitution Estimator")

boot.histogram(estimates[, 2], "IPTW Estimator")

boot.histogram(estimates[, 3], "TMLE Estimator")
```


```{r}
# Histogram of the estimated P(A = 1 | W) returned by run.tmle(), with the
# x axis limited to [0, 1].
ggplot(mapping = aes(out$probA1.givenW)) +
  geom_histogram(fill = "red3", bins = 25) +
  labs(
    title = "Propensity Score A=1",
    x = "Probability",
    y = "Freq"
  ) +
  theme(plot.title = element_text(colour = "dark green")) +
  xlim(0, 1)

# Inverse-probability weights: zero for rows with A == 0.
weights1 <- as.numeric(ObsData$A == 1) / out$probA1.givenW
summary(weights1)
```


```{r}
#---------------------------------
# 95% Confidence intervals assuming a normal dist & via quantiles
#---------------------------------
create.CI <- function(pt, boot, alpha=0.05){
  # Two-sided (1 - alpha) confidence intervals from bootstrap replicates.
  #
  # Args:
  #   pt:    point estimate; the normal-based interval is centred on it.
  #   boot:  numeric vector of bootstrap estimates (NAs are ignored).
  #   alpha: significance level; both intervals honour it (default 0.05).
  #
  # Returns a data frame with rows CI.normal and CI.quant and columns
  # CI.lo and CI.hi.
  Zquant <- qnorm(alpha / 2, lower.tail = FALSE)
  boot.se <- sd(boot, na.rm = TRUE)

  # normal approximation: pt +/- z * bootstrap standard deviation
  CI.normal <- c(pt - Zquant * boot.se, pt + Zquant * boot.se)

  # percentile interval: quantiles follow alpha instead of a fixed 2.5%/97.5%
  CI.quant <- quantile(boot, probs = c(alpha / 2, 1 - alpha / 2),
                       na.rm = TRUE)

  out <- data.frame(rbind(CI.normal, CI.quant))
  colnames(out) <- c("CI.lo", "CI.hi")
  out
}
```


```{r}
# IMPORTANT - POINT OF CONFUSION FOR PAST STUDENTS
# 'pt' is always the point estimate from the original dataset; the bootstrap
# column is passed separately as 'boot'.

# Simple Subs - note the bias because of misspecified regression? Will it converge fast enough?
est[["PsiHat.SS"]]
create.CI(pt = est[["PsiHat.SS"]], boot = estimates[, "SimpSubs"])

# IPTW
est[["PsiHat.IPTW"]]
create.CI(pt = est[["PsiHat.IPTW"]], boot = estimates[, "IPTW"])

# TMLE
est[["PsiHat.TMLE"]]
create.CI(pt = est[["PsiHat.TMLE"]], boot = estimates[, "TMLE"])
```


```{r}
# For comparison: the influence-curve-based interval for TMLE,
# written as point estimate -/+ normal quantile * standard error.
PsiHat.TMLE + c(-1, 1) * qnorm(alpha / 2, lower.tail = FALSE) * se


```