-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSVA_Entire_Dataset.Rmd
266 lines (226 loc) · 11.5 KB
/
SVA_Entire_Dataset.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
---
title: "SVA on Entire Dataset"
output:
html_document:
df_print: paged
---
# Load Necessary Packages
```{r message=FALSE}
# BiocManager::install("sva") # install sva package
# BiocManager::install("limma") # install limma package
library(sva)
library(limma)
library(tidyverse)
library(parallel)
```
# Load Datasets
```{r}
# set working directory for easy access af data
setwd("~/OneDrive/MSU-GGS-CMSE/Year-One/Courses/CSS844_HRT841/HRT841_SVA/")
# read in expression data
mdata <- read.csv("Orthogroup_RNAseq_8-23-21.csv", header=T)
rownames(mdata) <- mdata[,1] ; mdata <- mdata[,-1] # reset rownames to Orthogroup columns
# remove accessions with no expression data
#colnames(mdata[,which(colSums(mdata)==0)])
to_remove <- c("SRR299086", "SRR299087", "SRR299088", "SRR299089", "SRR299090", "SRR299091", "SRR299092",
"SRR299093", "SRR299094", "SRR299095", "SRR299096", "SRR299097")
mdata <- mdata[,-which(to_remove %in% colnames(mdata))]
# read in phenotype data
pheno <- read.csv("metadata_labels.csv", header=T)
rownames(pheno) <- pheno[,1] # reset rownames to SRA accessions
# remove SRA accessions in pheno data that do not have expression data
pheno2 <- pheno[(rownames(pheno) %in% colnames(mdata)),] # SRAs in pheno that match mdata
pheno2$sample <- 1:nrow(pheno2) # create a sample column
pheno2 <- pheno2 %>% dplyr::rename(stress=NEW_stress, tissue=NEW_tissue, family=NEW_family) # rename cols
# look at the number of samples in each factor
table(pheno2$tissue) ; table(pheno2$stress) ; table(pheno2$family)
# read in Bioproject ID information that corresponds to pheno2
# I need to check if miles way gives the same bioprojects
# If so, it's better bc I had to manually clean the txt file after using edirect
bioproject <- read.delim("bioproject_ids_corrected.txt", header=F, sep="\t")
colnames(bioproject) <- c("sra", "bioproject") # rename columns
# combine bioproject and pheno2
pheno2 <- left_join(pheno2, bioproject, by="sra") # combine by matching sra columns
write.csv(pheno2, "pheno_data.csv")
# visualize expression and phenotype data
head(mdata[,1:5]) ; head(pheno2)
```
# Set Full and Null Models
```{r}
# full model matrix - family, stress, tissue should be sig
mod <- model.matrix(~stress+tissue+family, data=pheno2)
mod_no_fam <- model.matrix(~stress+tissue, data=pheno2) # exclude family
mod_no_tis <- model.matrix(~stress+family, data=pheno2) # exclude tissue
mod_no_fam_tis <- model.matrix(~stress, data=pheno2) # exclude both family and tissue
# null model matrix (no adjustment variables are included)
null_mod <- model.matrix(~1, data=pheno2)
# expression data must be a matrix
mdata <- as.matrix(mdata)
```
# Perform SVASeq on Entire Dataset
```{r}
# Estimate surrogate variables (SVs) using the two-step SVA method
# svaseq applies a log(data+1) transformation to the input data
myfunc <- function(args) {
svaseq(mdata, args, null_mod, method = "two-step", n.sv=35)
}
clust <- makeCluster(detectCores(), type="FORK") # set up cluster for parallel processing
args <- list(mod, mod_no_fam, mod_no_tis, mod_no_fam_tis) # arguments for sva analysis
out <- parLapply(clust, args, myfunc) # execute sva in parallel
stopCluster(clust) # close the cluster
SVs <- out[[1]]$sv # 23 SVs
SVs_no_fam <- out[[2]]$sv # 30 SVs
SVs_no_tis <- out[[3]]$sv # 24 SVs
SVs_no_fam_tis <- out[[4]]$sv # 33 SVs
# plot of first 2 surrogate variables
plot(SVs, pch=20, col="blue")
plot(SVs_no_fam, pch=20, col="blue")
plot(SVs_no_tis, pch=20, col="blue")
plot(SVs_no_fam_tis, pch=20, col="blue")
# Save SVs to file:
rownames(SVs) <- colnames(mdata)
write.csv(SVs, "SVs_Orthogroup_RNAseq_8-23-21.csv")
rownames(SVs_no_fam) <- colnames(mdata)
write.csv(SVs_no_fam, "SVs_no_fam_Orthogroup_RNAseq_8-23-21.csv")
rownames(SVs_no_tis) <- colnames(mdata)
write.csv(SVs_no_tis, "SVs_no_tissue_Orthogroup_RNAseq_8-23-21.csv")
rownames(SVs_no_fam_tis) <- colnames(mdata)
write.csv(SVs_no_fam_tis, "SVs_no_family_tissue_Orthogroup_RNAseq_8-23-21.csv")
```
# Adjust Data for SVs with Limma Package
```{r}
## Remove effect of surrogate variables with limma package by fitting a linear model with surrogate variables included
# Full model with SVs
modSv <- cbind(mod, SVs)
modSv_no_fam <- cbind(mod_no_fam, SVs_no_fam)
modSv_no_tis <- cbind(mod_no_tis, SVs_no_tis)
modSv_no_fam_tis <- cbind(mod_no_fam_tis, SVs_no_fam_tis)
# Null model with SVs as adjustment variables
mod0Sv <- cbind(null_mod, SVs)
mod0Sv_no_fam <- cbind(null_mod, SVs_no_fam)
mod0Sv_no_tis <- cbind(null_mod, SVs_no_tis)
mod0Sv_no_fam_tis <- cbind(null_mod, SVs_no_fam_tis)
# linear model
fit <- lmFit(mdata, modSv) ; summary(fit)
fit_no_fam <- lmFit(mdata, modSv_no_fam) ; summary(fit_no_fam)
fit_no_tis <- lmFit(mdata, modSv_no_tis) ; summary(fit_no_tis)
fit_no_fam_tis <- lmFit(mdata, modSv_no_fam_tis) ; summary(fit_no_fam_tis)
#fit_norm <- lmFit(mdata_norm, modSv_norm) ; summary(fit_norm)
```
# PCA
```{r}
## Generate a clean matrix using a function by Andrew Jaffe
# This function removes the effects of SVs from our expression data
#'y' as the gene expresion matrix
#'mod' as the model matrix you sent to sva (the full model)
#'svs' as svobj$sv where svobj is the output from the sva function
cleanY = function(y, mod, svs) {
X = cbind(mod, svs) # same as modSv
Hat = solve(t(X) %*% X) %*% t(X)
beta = (Hat %*% t(y))
rm(Hat)
gc()
P = ncol(mod)
return(y - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}
clean_data <- cleanY(mdata, mod, SVs)
clean_no_fam <- cleanY(mdata, mod_no_fam, SVs_no_fam)
clean_no_tis <- cleanY(mdata, mod_no_tis, SVs_no_tis)
clean_no_fam_tis <- cleanY(mdata, mod_no_fam_tis, SVs_no_fam_tis)
# Function for plotting pca
plot_pca <- function(Legend, title){
x %>% as.data.frame %>%
ggplot(aes(x=PC1,y=PC2, col=Legend)) + geom_point() +
ggtitle(title) +
labs(x=paste("PC1: ",round(var_explained[1]*100,2),"%"),
y=paste("PC2: ",round(var_explained[2]*100,2),"%"))
}
# Plot PCA of cleaned matrix
pca.res <- prcomp(t(clean_data), center = T, scale = T) # run PCA
pca.res$x[1:5,1:5] # visualize matrix of principle components (PCs)
var_explained <- pca.res$sdev^2/sum(pca.res$sdev^2) # explained variance for each PC
var_explained[1:5] # visualize matrix of explained variance per PC
x <- data.frame(pca.res$x) # create a dataframe of PCs
plot_pca(pheno2$stress, "PCA on Clean Data (Stress)") ; ggsave("PCA_stress.png")
plot_pca(pheno2$tissue, "PCA on Clean Data (Tissue)") ; ggsave("PCA_tissue.png")
plot_pca(pheno2$family, "PCA on Clean Data (Family)") ; ggsave("PCA_family.png")
# PCA on model excluding family
pca.res <- prcomp(t(clean_no_fam), center = T, scale = T) # run PCA
pca.res$x[1:5,1:5] # visualize matrix of principle components (PCs)
var_explained <- pca.res$sdev^2/sum(pca.res$sdev^2) # explained variance for each PC
var_explained[1:5] # visualize matrix of explained variance per PC
x <- data.frame(pca.res$x) # create a dataframe of PCs
plot_pca(pheno2$stress, "PCA on Clean Data (Stress)") ; ggsave("PCA_no_fam_stress.png")
plot_pca(pheno2$tissue, "PCA on Clean Data (Tissue)") ; ggsave("PCA_no_fam_tissue.png")
plot_pca(pheno2$family, "PCA on Clean Data (Family)") ; ggsave("PCA_no_fam_family.png")
# PCA on model exluding tissue
pca.res <- prcomp(t(clean_no_tis), center = T, scale = T) # run PCA
pca.res$x[1:5,1:5] # visualize matrix of principle components (PCs)
var_explained <- pca.res$sdev^2/sum(pca.res$sdev^2) # explained variance for each PC
var_explained[1:5] # visualize matrix of explained variance per PC
x <- data.frame(pca.res$x) # create a dataframe of PCs
plot_pca(pheno2$stress, "PCA on Clean Data (Stress)") ; ggsave("PCA_no_tis_stress.png")
plot_pca(pheno2$tissue, "PCA on Clean Data (Tissue)") ; ggsave("PCA_no_tis_tissue.png")
plot_pca(pheno2$family, "PCA on Clean Data (Family)") ; ggsave("PCA_no_tis_family.png")
# PCA on model exluding family and tissue
pca.res <- prcomp(t(clean_no_fam_tis), center = T, scale = T) # run PCA
pca.res$x[1:5,1:5] # visualize matrix of principle components (PCs)
var_explained <- pca.res$sdev^2/sum(pca.res$sdev^2) # explained variance for each PC
var_explained[1:5] # visualize matrix of explained variance per PC
x <- data.frame(pca.res$x) # create a dataframe of PCs
plot_pca(pheno2$stress, "PCA on Clean Data (Stress)") ; ggsave("PCA_no_fam_tis_stress.png")
plot_pca(pheno2$tissue, "PCA on Clean Data (Tissue)") ; ggsave("PCA_no_fam_tis_tissue.png")
plot_pca(pheno2$family, "PCA on Clean Data (Family)") ; ggsave("PCA_no_fam_tis_family.png")
# write clean data to file
write.csv(clean_data, "clean_RNAseq_metadata_11-30-20.csv")
write.csv(clean_no_fam, "clean_no_family_RNAseq_metadata_11-30-20.csv")
write.csv(clean_no_tis, "clean_no_tissue_RNAseq_metadata_11-30-20.csv")
write.csv(clean_no_fam_tis, "clean_no_family_tissue_RNAseq_metadata_11-30-20.csv")
```
# Adjusting for SVs using `f.pvalue` Method from SVA Package
```{r}
# The f.pvalue function can be used to calculate parametric F-test p-values for each row of a data matrix
# The F-test compares the models mod and null_mod. They must be nested models, so all of the variables in null_mod must appear in mod.
# Calculate the F-test p-values for differential expression without adjusting for surrogate variables
pValues = f.pvalue(mdata,mod,null_mod)
qValues = p.adjust(pValues,method="BH")
head(pValues) ; head(qValues)
# Include the surrogate variables in both the null and full models to adjust for the surrogate variables by treating them as adjustment variables that must be included in both models.
pValuesSv = f.pvalue(mdata,modSv,mod0Sv)
qValuesSv = p.adjust(pValuesSv,method="BH")
head(pValuesSv) ; head(qValuesSv)
# Identify orthogroups (OGs) that are NOT diferentially expressed with respect to stress + tissue + family
length(pValuesSv[pValuesSv>0.05]) # 38 w/BH corrected p-value > 0.05
pValuesSv[pValuesSv>0.05] # corresponding OGs
```
# Correlation between SVs, Sample, & BioProject
```{r}
adjRsq <- function(lm_mod){ # from Miles' code mdr_try_sva_v16.Rmd
rVar = sapply(as.data.frame(lm_mod$residuals), var) # residual variance
tVar = sapply(as.data.frame(lm_mod$residuals + lm_mod$fitted.values), var) # total variance
r2 = 1-(rVar/tVar) # r squared
n = nrow(lm_mod$fitted.values) # sample size
p = nrow(lm_mod$coefficients) - 1 # number of predictors
ar2 = 1-((1-r2)*(n-1))/(n-p-1) # McNemar's formula, adjusted r squared
return(ar2)
}
data_list <- list(SVs, SVs_no_fam, SVs_no_tis, SVs_no_fam_tis)
save_name <- list("SV_corrected", "SV_&_fam_corrected", "SV_&_tis_corrected", "SV_&_fam_tis_corrected")
for (i in 1:length(data_list)){
# Simple Linear Regression to test the association between bioproject, sample, tissue, family, stress and SVs. lm() uses McNEmar's formula to compute adjusted r squared values for each SV
d <- data.frame(data_list[i])
bio <- lm(as.matrix(d)~bioproject, data = pheno2)
sam <- lm(as.matrix(d)~sample, data = pheno2)
tis <- lm(as.matrix(d)~tissue, data = pheno2)
fam <- lm(as.matrix(d)~family, data = pheno2)
str <- lm(as.matrix(d)~stress, data = pheno2)
# Adjusted R squared values for each SV
ar2.df <- as.data.frame(rbind(bioproject=adjRsq(bio), sample=adjRsq(sam), tissue=adjRsq(tis), family=adjRsq(fam), stress=adjRsq(str)))
ar2.df$Predictors <- rownames(ar2.df)
ar2.df <- ar2.df %>% pivot_longer(1:ncol(d), names_to="SVs", values_to="Adj.R.Squared")
# Heatmap of adjusted R squared values
ggplot(ar2.df, aes(SVs, Predictors, fill=Adj.R.Squared)) + geom_tile() + scale_fill_gradient(low="white", high="blue", name="Adjusted R-squared") + theme_minimal() + ylab("Predictor Variables") + xlab("Surrogate Variables")
ggsave(paste("SV_cor_heatmap", save_name[i], ".png", sep=""), width=10, height=7)
write.csv(ar2.df, paste("SV_cor_heatmap_data_", save_name[i], ".csv", sep=""))
}
```