-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrPCA.Rmd
152 lines (116 loc) · 4.56 KB
/
rPCA.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
---
title: "Outlier Removal with rPCA"
author: "Andriana Manousidaki and Kenia Segura Abá"
date: "2/16/2021"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Load Packages & Upload Data
```{r message=FALSE}
#rPCA for outlier removal prior to SVA
library('rrcov')
library("DESeq2")
library('ggplot2')
setwd("~/Desktop/MSU GGS:CMSE/Year One/Courses/CSS844_HRT841/SVA Analysis/")
mdata <- read.csv("RNAseq_metadata_11-30-20.csv", header=T) # read in expression data
rownames(mdata) <- mdata[,1] ; mdata[,1] <- NULL # reset rownames to Orthogroup columns
pheno <- read.csv("factors_v1.csv", header=T) # read in phenotype data
rownames(pheno) <- pheno[,2]
head(mdata[,1:5]) ; head(pheno)
pheno2<-pheno[colnames(mdata),]
```
# Normalization of Data
```{r}
#NORMALIZATION
vstdata<-varianceStabilizingTransformation(round(as.matrix(mdata+1)), blind = TRUE, fitType = "local")
saveRDS(vstdata,'vstdata.rds')
```
# First method: PcaGrid
```{r}
rob_pca<-PcaGrid(t(vstdata),k=5)
saveRDS(rob_pca,'rob_pca.rds')
#Indicator of outliers: FALSE for outlier, TRUE for regular sample
outliers<-rob_pca@flag
outlier_sras <- outliers[which(outliers==FALSE)] #extract outliers from rob_pca
outlier_sras <- data.frame(outlier_sras) #data frame of outlier SRA accessions
head(outlier_sras)
#how many outliers?
length(which(outliers=='FALSE'))
#rPCA plots
plot(rob_pca$scores[,1], rob_pca$scores[,2], col = as.factor(pheno2$tissue), main = "Tissue")
plot(rob_pca$scores[,1], rob_pca$scores[,2], col = as.factor(pheno2$stress), main = "Stress")
plot(rob_pca$scores[,1], rob_pca$scores[,2], col = as.factor(pheno2$family), main = "Family")
##Visualization
pheno3<-data.frame(pheno2,robust_score_distance=rob_pca@sd,orthogonal_distance=rob_pca@od)
#Outlier plot with cut.off values
p <- ggplot(data=pheno3, aes(x=robust_score_distance, y=orthogonal_distance)) + geom_point()
p<-p+xlim(0, 5)+ylim(0,400)
p<-p + geom_hline([email protected], linetype="dashed", color = "red")+geom_vline(xintercept [email protected],color = "blue")
pdf("outlier_plot_pcaGRID_K=5.pdf")
p
dev.off()
#Outlier plot colored by family
p1<-ggplot(data=pheno3, aes(x=robust_score_distance, y=orthogonal_distance,color=family)) +
geom_point()
p1<-p1+xlim(0, 5)+ylim(0,400)
p1<-p1 + geom_hline([email protected])+geom_vline(xintercept [email protected])
pdf("outlier_plot_pcaGRID_K=5_family.pdf")
p1
dev.off()
#Outlier plot colored by stress
p1<-ggplot(data=pheno3, aes(x=robust_score_distance, y=orthogonal_distance,color=stress)) +
geom_point()
p1<-p1+xlim(0, 5)+ylim(0,400)
p1<-p1 + geom_hline([email protected])+geom_vline(xintercept [email protected])
pdf("outlier_plot_pcaGRID_K=5_stress.pdf")
p1
dev.off()
#Outlier plot colored by tissue
p1<-ggplot(data=pheno3, aes(x=robust_score_distance, y=orthogonal_distance,color=tissue)) +
geom_point()
p1<-p1+xlim(0, 5)+ylim(0,400)
p1<-p1 + geom_hline([email protected])+geom_vline(xintercept [email protected])
pdf("outlier_plot_pcaGRID_K=5_tissue.pdf")
p1
dev.off()
#Outlier plot colored by species
p1<-ggplot(data=pheno3, aes(x=robust_score_distance, y=orthogonal_distance,color=species)) +
geom_point()
p1<-p1+xlim(0, 5)+ylim(0,400)
p1<-p1 + geom_hline([email protected])+geom_vline(xintercept [email protected])
p1<-p1+theme_bw()+geom_text(aes(label=ifelse(orthogonal_distance>230,as.character(species),'')),hjust=0, vjust=0)+theme(legend.position = "none")
pdf("outlier_plot_pcaGRID_K=5_species2.pdf")
p1
dev.off()
```
# PcaGrid Method: No normalization
```{r}
saveRDS(t(mdata),'tmdata.rds')
model <- lm(t(mdata) ~ tissue + stress + family, data = pheno2)
#Linearity of data
plot(model$fitted.values, model$residuals, main = "Residuals vs Fitted")
rob_pca<-PcaGrid(t(mdata),k=5)
saveRDS(rob_pca,'rob_pca.rds')
#Indicator of outliers: FALSE for outlier, TRUE for regular sample
outliers<-rob_pca@flag
outlier_sras <- outliers[which(outliers==FALSE)]
outlier_sras <- data.frame(outlier_sras)
head(outlier_sras)
#how many outliers?
length(which(outliers=='FALSE'))
#rPCA plots
plot(rob_pca$scores[,1], rob_pca$scores[,2], col = as.factor(pheno2$tissue), main = "Tissue")
plot(rob_pca$scores[,1], rob_pca$scores[,2], col = as.factor(pheno2$stress), main = "Stress")
plot(rob_pca$scores[,1], rob_pca$scores[,2], col = as.factor(pheno2$family), main = "Family")
```
# Second method: PcaHubert
```{r}
#hu_pca<-PcaHubert(t(vstdata),k=5,kmax=5)
#saveRDS(hu_pca,'hu_pca.rds')
#Indicator of outliers: FALSE for outlier, TRUE for regular sample
#outliers<-hu_pca@flag
#how many outliers?
#length(which(outliers=='FALSE'))
```