SVA_Entire_Dataset.Rmd

---
title: "SVA on Entire Dataset"
output:
  html_document:
    df_print: paged
---

# Load Necessary Packages
```{r message=FALSE}
# BiocManager::install("sva") # install sva package
# BiocManager::install("limma") # install limma package
library(sva)
library(limma)
library(tidyverse)
library(parallel)
```

# Load Datasets
```{r}
# set working directory for easy access af data
setwd("~/OneDrive/MSU-GGS-CMSE/Year-One/Courses/CSS844_HRT841/HRT841_SVA/") 

# read in expression data
mdata <- read.csv("Orthogroup_RNAseq_8-23-21.csv", header=T) 
rownames(mdata) <- mdata[,1] ; mdata <- mdata[,-1] # reset rownames to Orthogroup columns

# remove accessions with no expression data
#colnames(mdata[,which(colSums(mdata)==0)])
to_remove <- c("SRR299086", "SRR299087", "SRR299088", "SRR299089", "SRR299090", "SRR299091", "SRR299092", 
"SRR299093", "SRR299094", "SRR299095", "SRR299096", "SRR299097")
mdata <- mdata[,-which(to_remove %in% colnames(mdata))]

# read in phenotype data
pheno <- read.csv("metadata_labels.csv", header=T) 
rownames(pheno) <- pheno[,1] # reset rownames to SRA accessions

# remove SRA accessions in pheno data that do not have expression data
pheno2 <- pheno[(rownames(pheno) %in% colnames(mdata)),] # SRAs in pheno that match mdata
pheno2$sample <- 1:nrow(pheno2) # create a sample column
pheno2 <- pheno2 %>% dplyr::rename(stress=NEW_stress, tissue=NEW_tissue, family=NEW_family) # rename cols

# look at the number of samples in each factor
table(pheno2$tissue) ; table(pheno2$stress) ; table(pheno2$family)

# read in Bioproject ID information that corresponds to pheno2
# I need to check if miles way gives the same bioprojects
# If so, it's better bc I had to manually clean the txt file after using edirect
bioproject <- read.delim("bioproject_ids_corrected.txt", header=F, sep="\t")
colnames(bioproject) <- c("sra", "bioproject") # rename columns

# combine bioproject and pheno2
pheno2 <- left_join(pheno2, bioproject, by="sra") # combine by matching sra columns
write.csv(pheno2, "pheno_data.csv")

# visualize expression and phenotype data
head(mdata[,1:5]) ; head(pheno2)
```

# Set Full and Null Models
```{r}
# full model matrix - family, stress, tissue should be sig
mod <- model.matrix(~stress+tissue+family, data=pheno2)
mod_no_fam <- model.matrix(~stress+tissue, data=pheno2) # exclude family
mod_no_tis <- model.matrix(~stress+family, data=pheno2) # exclude tissue
mod_no_fam_tis <- model.matrix(~stress, data=pheno2) # exclude both family and tissue

# null model matrix (no adjustment variables are included)
null_mod <- model.matrix(~1, data=pheno2)

# expression data must be a matrix
mdata <- as.matrix(mdata)
```

# Perform SVASeq on Entire Dataset
```{r}
# Estimate surrogate variables (SVs) using the two-step SVA method
# svaseq applies a log(data+1) transformation to the input data
myfunc <- function(args) {
	svaseq(mdata, args, null_mod, method = "two-step", n.sv=35)
}
clust <- makeCluster(detectCores(), type="FORK") # set up cluster for parallel processing
args <- list(mod, mod_no_fam, mod_no_tis, mod_no_fam_tis) # arguments for sva analysis
out <- parLapply(clust, args, myfunc) # execute sva in parallel
stopCluster(clust) # close the cluster

SVs <- out[[1]]$sv # 23 SVs
SVs_no_fam <- out[[2]]$sv # 30 SVs
SVs_no_tis <- out[[3]]$sv # 24 SVs
SVs_no_fam_tis <- out[[4]]$sv # 33 SVs

# plot of first 2 surrogate variables
plot(SVs, pch=20, col="blue")
plot(SVs_no_fam, pch=20, col="blue")
plot(SVs_no_tis, pch=20, col="blue")
plot(SVs_no_fam_tis, pch=20, col="blue")

# Save SVs to file:
rownames(SVs) <- colnames(mdata)
write.csv(SVs, "SVs_Orthogroup_RNAseq_8-23-21.csv")

rownames(SVs_no_fam) <- colnames(mdata)
write.csv(SVs_no_fam, "SVs_no_fam_Orthogroup_RNAseq_8-23-21.csv")

rownames(SVs_no_tis) <- colnames(mdata)
write.csv(SVs_no_tis, "SVs_no_tissue_Orthogroup_RNAseq_8-23-21.csv")

rownames(SVs_no_fam_tis) <- colnames(mdata)
write.csv(SVs_no_fam_tis, "SVs_no_family_tissue_Orthogroup_RNAseq_8-23-21.csv")
```

# Adjust Data for SVs with Limma Package
```{r}
## Remove effect of surrogate variables with limma package by fitting a linear model with surrogate variables included

# Full model with SVs
modSv <- cbind(mod, SVs)
modSv_no_fam <- cbind(mod_no_fam, SVs_no_fam)
modSv_no_tis <- cbind(mod_no_tis, SVs_no_tis)
modSv_no_fam_tis <- cbind(mod_no_fam_tis, SVs_no_fam_tis)

# Null model with SVs as adjustment variables
mod0Sv <- cbind(null_mod, SVs) 
mod0Sv_no_fam <- cbind(null_mod, SVs_no_fam)
mod0Sv_no_tis <- cbind(null_mod, SVs_no_tis)
mod0Sv_no_fam_tis <- cbind(null_mod, SVs_no_fam_tis)

# linear model
fit <- lmFit(mdata, modSv) ; summary(fit)
fit_no_fam <- lmFit(mdata, modSv_no_fam) ; summary(fit_no_fam)
fit_no_tis <- lmFit(mdata, modSv_no_tis) ; summary(fit_no_tis)
fit_no_fam_tis <- lmFit(mdata, modSv_no_fam_tis) ; summary(fit_no_fam_tis)
#fit_norm <- lmFit(mdata_norm, modSv_norm) ; summary(fit_norm)
```

# PCA
```{r}
## Generate a clean matrix using a function by Andrew Jaffe
# This function removes the effects of SVs from our expression data
#'y' as the gene expresion matrix
#'mod' as the model matrix you sent to sva (the full model)
#'svs' as svobj$sv where svobj is the output from the sva function
cleanY = function(y, mod, svs) {
    X = cbind(mod, svs) # same as modSv
    Hat = solve(t(X) %*% X) %*% t(X)
    beta = (Hat %*% t(y))
    rm(Hat)
    gc()
    P = ncol(mod)
    return(y - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}

clean_data <- cleanY(mdata, mod, SVs)
clean_no_fam <- cleanY(mdata, mod_no_fam, SVs_no_fam)
clean_no_tis <- cleanY(mdata, mod_no_tis, SVs_no_tis)
clean_no_fam_tis <- cleanY(mdata, mod_no_fam_tis, SVs_no_fam_tis)

# Function for plotting pca
plot_pca <- function(Legend, title){
  x %>% as.data.frame %>%
  ggplot(aes(x=PC1,y=PC2, col=Legend)) + geom_point() + 
  ggtitle(title) +
  labs(x=paste("PC1: ",round(var_explained[1]*100,2),"%"),
       y=paste("PC2: ",round(var_explained[2]*100,2),"%"))
}
# Plot PCA of cleaned matrix
pca.res <- prcomp(t(clean_data), center = T, scale = T) # run PCA
pca.res$x[1:5,1:5] # visualize matrix of principle components (PCs)
var_explained <- pca.res$sdev^2/sum(pca.res$sdev^2) # explained variance for each PC
var_explained[1:5] # visualize matrix of explained variance per PC
x <- data.frame(pca.res$x) # create a dataframe of PCs
plot_pca(pheno2$stress, "PCA on Clean Data (Stress)") ; ggsave("PCA_stress.png")
plot_pca(pheno2$tissue, "PCA on Clean Data (Tissue)") ; ggsave("PCA_tissue.png")
plot_pca(pheno2$family, "PCA on Clean Data (Family)") ; ggsave("PCA_family.png")

# PCA on model excluding family
pca.res <- prcomp(t(clean_no_fam), center = T, scale = T) # run PCA
pca.res$x[1:5,1:5] # visualize matrix of principle components (PCs)
var_explained <- pca.res$sdev^2/sum(pca.res$sdev^2) # explained variance for each PC
var_explained[1:5] # visualize matrix of explained variance per PC
x <- data.frame(pca.res$x) # create a dataframe of PCs
plot_pca(pheno2$stress, "PCA on Clean Data (Stress)") ; ggsave("PCA_no_fam_stress.png")
plot_pca(pheno2$tissue, "PCA on Clean Data (Tissue)") ; ggsave("PCA_no_fam_tissue.png")
plot_pca(pheno2$family, "PCA on Clean Data (Family)") ; ggsave("PCA_no_fam_family.png")

# PCA on model exluding tissue
pca.res <- prcomp(t(clean_no_tis), center = T, scale = T) # run PCA
pca.res$x[1:5,1:5] # visualize matrix of principle components (PCs)
var_explained <- pca.res$sdev^2/sum(pca.res$sdev^2) # explained variance for each PC
var_explained[1:5] # visualize matrix of explained variance per PC
x <- data.frame(pca.res$x) # create a dataframe of PCs
plot_pca(pheno2$stress, "PCA on Clean Data (Stress)") ; ggsave("PCA_no_tis_stress.png")
plot_pca(pheno2$tissue, "PCA on Clean Data (Tissue)") ; ggsave("PCA_no_tis_tissue.png")
plot_pca(pheno2$family, "PCA on Clean Data (Family)") ; ggsave("PCA_no_tis_family.png")

# PCA on model exluding family and tissue
pca.res <- prcomp(t(clean_no_fam_tis), center = T, scale = T) # run PCA
pca.res$x[1:5,1:5] # visualize matrix of principle components (PCs)
var_explained <- pca.res$sdev^2/sum(pca.res$sdev^2) # explained variance for each PC
var_explained[1:5] # visualize matrix of explained variance per PC
x <- data.frame(pca.res$x) # create a dataframe of PCs
plot_pca(pheno2$stress, "PCA on Clean Data (Stress)") ; ggsave("PCA_no_fam_tis_stress.png")
plot_pca(pheno2$tissue, "PCA on Clean Data (Tissue)") ; ggsave("PCA_no_fam_tis_tissue.png")
plot_pca(pheno2$family, "PCA on Clean Data (Family)") ; ggsave("PCA_no_fam_tis_family.png")

# write clean data to file
write.csv(clean_data, "clean_RNAseq_metadata_11-30-20.csv")
write.csv(clean_no_fam, "clean_no_family_RNAseq_metadata_11-30-20.csv")
write.csv(clean_no_tis, "clean_no_tissue_RNAseq_metadata_11-30-20.csv")
write.csv(clean_no_fam_tis, "clean_no_family_tissue_RNAseq_metadata_11-30-20.csv")
```

# Adjusting for SVs using `f.pvalue` Method from SVA Package
```{r}
# The f.pvalue function can be used to calculate parametric F-test p-values for each row of a data matrix
# The F-test compares the models mod and null_mod. They must be nested models, so all of the variables in null_mod must appear in mod.

# Calculate the F-test p-values for differential expression without adjusting for surrogate variables
pValues = f.pvalue(mdata,mod,null_mod)
qValues = p.adjust(pValues,method="BH")
head(pValues) ; head(qValues)

# Include the surrogate variables in both the null and full models to adjust for the surrogate variables by treating them as adjustment variables that must be included in both models. 
pValuesSv = f.pvalue(mdata,modSv,mod0Sv)
qValuesSv = p.adjust(pValuesSv,method="BH")
head(pValuesSv) ; head(qValuesSv)

# Identify orthogroups (OGs) that are NOT diferentially expressed with respect to stress + tissue + family
length(pValuesSv[pValuesSv>0.05]) # 38 w/BH corrected p-value > 0.05
pValuesSv[pValuesSv>0.05] # corresponding OGs
```

# Correlation between SVs, Sample, & BioProject
```{r}
adjRsq <- function(lm_mod){ # from Miles' code mdr_try_sva_v16.Rmd
  rVar = sapply(as.data.frame(lm_mod$residuals), var) # residual variance
  tVar = sapply(as.data.frame(lm_mod$residuals + lm_mod$fitted.values), var) # total variance
  r2 = 1-(rVar/tVar) # r squared
  n = nrow(lm_mod$fitted.values) # sample size
  p = nrow(lm_mod$coefficients) - 1 # number of predictors
  ar2 = 1-((1-r2)*(n-1))/(n-p-1) # McNemar's formula, adjusted r squared
  return(ar2)
}

data_list <- list(SVs, SVs_no_fam, SVs_no_tis, SVs_no_fam_tis)
save_name <- list("SV_corrected", "SV_&_fam_corrected", "SV_&_tis_corrected", "SV_&_fam_tis_corrected")
for (i in 1:length(data_list)){
	# Simple Linear Regression to test the association between bioproject, sample, tissue, family, stress and SVs. lm() uses McNEmar's formula to compute adjusted r squared values for each SV
	d <- data.frame(data_list[i])
	bio <- lm(as.matrix(d)~bioproject, data = pheno2)
	sam <- lm(as.matrix(d)~sample, data = pheno2)
	tis <- lm(as.matrix(d)~tissue, data = pheno2)
	fam <- lm(as.matrix(d)~family, data = pheno2) 
	str <- lm(as.matrix(d)~stress, data = pheno2)

	# Adjusted R squared values for each SV
	ar2.df <- as.data.frame(rbind(bioproject=adjRsq(bio), sample=adjRsq(sam), tissue=adjRsq(tis), family=adjRsq(fam), stress=adjRsq(str)))
	ar2.df$Predictors <- rownames(ar2.df) 
	ar2.df <- ar2.df %>% pivot_longer(1:ncol(d), names_to="SVs", values_to="Adj.R.Squared")

	# Heatmap of adjusted R squared values
	ggplot(ar2.df, aes(SVs, Predictors, fill=Adj.R.Squared)) + geom_tile() + scale_fill_gradient(low="white", high="blue", name="Adjusted R-squared") + theme_minimal() + ylab("Predictor Variables") + xlab("Surrogate Variables")

	ggsave(paste("SV_cor_heatmap", save_name[i], ".png", sep=""), width=10, height=7)
	write.csv(ar2.df, paste("SV_cor_heatmap_data_", save_name[i], ".csv", sep=""))
}
```