-
Notifications
You must be signed in to change notification settings - Fork 0
/
0_preprocessing.Rmd
164 lines (143 loc) · 6.37 KB
/
0_preprocessing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
---
title: "RNA-seq Mouse Placenta"
author: "Ha T. H. Vu"
output: html_document
---
```{r setup, include=FALSE}
options(max.print = "75")
knitr::opts_chunk$set(
echo = TRUE,
collapse = TRUE,
comment = "#>",
fig.path = "Files/",
fig.width = 8,
prompt = FALSE,
tidy = FALSE,
message = FALSE,
warning = TRUE
)
knitr::opts_knit$set(width = 75)
```
Documentation for the analysis of mouse placental RNA-seq data at e7.5, e8.5 and e9.5.
## 1. Step 1: FastQC - Quality control of reads:
Version: 0.11.7
```{bash, eval = FALSE}
cd /work/LAS/geetu-lab/hhvu/project1_2/rna-seq/0_rawData/fastqFiles
module load fastqc
fastqc *.fastq
```
## 2. Step 2: Trimmomatic - Trimming out adapter contents, overrepresented sequences and low quality base pairs: <br>
Version: 0.39
```{bash, eval = FALSE}
module load trimmomatic
cd /work/LAS/geetu-lab/hhvu/project1_2/rna-seq/0_rawData/fastq.gz
#example
java -jar /home/hhvu/Trimmomatic-0.39/trimmomatic-0.39.jar SE -phred33 inFiles ./outFiles.trimmed.fastq.gz ILLUMINACLIP:/home/hhvu/Trimmomatic-0.39/adapters/TruSeq3-SE.fa:2:30:10 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:15 MINLEN:36
```
## 3. Step 3: Kallisto - Pseudoalignment and quantification: <br>
Merge files from different technical runs:
```{bash, eval = FALSE}
cat file1.fatstq.gz file2.fastq.gz > merge.fastq.gz
```
Kallisto <br>
Version: 0.43.1 <br>
Index files were build with cDNA on autosomal and sex chromosomes of the mouse genome (GRCm38.p6) from Ensembl release 98. <br>
```{bash, eval = FALSE}
kallisto quant -i /work/LAS/geetu-lab/hhvu/index_files/mm10/kallisto-ensembl/mm10.idx -o ourdir/S49 --single -l 200 -s 30 -b 100 S49.fastq.gz
```
## 4. Step 4: Quality control
```{r}
#load necessary libraries
library("dplyr", suppressMessages())
library("tidyverse", suppressMessages())
library("biomaRt", suppressMessages())
library("ggplot2", suppressMessages())
library("dendextend", suppressMessages())
library("matrixStats", suppressMessages())
library("stats", suppressMessages())
library("pca3d", suppressMessages())
library("scatterplot3d", suppressMessages())
library("pheatmap", suppressMessages())
```
First, we filter the transcripts based on raw counts.
```{r}
#load necessary files
coding <- read.table("Files/Mus_musculus_grcm38_coding_transcripts.txt", header = F)
est_counts <- read.csv("Files/estCounts_allSamples.tsv", header = TRUE, sep = "\t", stringsAsFactors=FALSE)
#reformat count file
est_counts <- est_counts[order(est_counts$target_id),]
rownames(est_counts) <- est_counts$target_id #make transcript names to be row names
est_counts <- est_counts[, -c(1, 20, 21)] #exclude non-count columns
#make mean count table
mean_cts <- data.frame(trans=rownames(est_counts),
E7.5_mean=NA,
E8.5_mean=NA,
E9.5_mean=NA)
mean_cts$E7.5_mean <- rowMeans(est_counts[, c("E7.5_1", "E7.5_2", "E7.5_3", "E7.5_4", "E7.5_5", "E7.5_6")])
mean_cts$E8.5_mean <- rowMeans(est_counts[, c("E8.5_1", "E8.5_2", "E8.5_3", "E8.5_4", "E8.5_5", "E8.5_6")])
mean_cts$E9.5_mean <- rowMeans(est_counts[, c("E9.5_1", "E9.5_2", "E9.5_3", "E9.5_4", "E9.5_5", "E9.5_6")])
row.names(mean_cts) <- mean_cts$trans
mean_cts <- mean_cts[,2:ncol(mean_cts)]
#filter low count transcripts
basic_filter <- function (row, min_reads = 20, min_prop = 1/3) {
mean(row >= min_reads) >= min_prop
}
keep <- apply(mean_cts, 1, basic_filter) #1 means apply the function to each row
```
The `keep` object contains which transcripts to be kept. The retained transcripts are ones with raw counts $>=$ 20 in $>=$ 6 samples. <br>
Next, we carry out PCA and hierarchical clustering analysis. We will use scaled TPM values for this step. From the retained transcripts, we focused on the top 50\% most variable transcripts.
```{r}
tpm <- read.csv("Files/TPM_allSamples.tsv", header = TRUE, sep = "\t", stringsAsFactors=FALSE)
#reformat count file
tpm <- tpm[order(tpm$target_id),]
row.names(tpm) <- tpm$target_id
tpm <- tpm[,-c(1, 20, 21)]
tpm[] <- lapply(tpm, function(x) as.numeric(as.character(x)))
#filter out the low count transcripts according to the est_counts filtering above
tpm <- tpm[keep, ]
#calculate the variance between 3 time points
#here I take the average of each time point to make sure the variance is only calculated based on time points
#if not doing this, the variance may include biological variances between replicates also
mean_tpm <- data.frame(trans=rownames(tpm),
E7.5_mean=NA,
E8.5_mean=NA,
E9.5_mean=NA)
mean_tpm$E7.5_mean <- rowMeans(tpm[, c("E7.5_1", "E7.5_2", "E7.5_3", "E7.5_4", "E7.5_5", "E7.5_6")])
mean_tpm$E8.5_mean <- rowMeans(tpm[, c("E8.5_1", "E8.5_2", "E8.5_3", "E8.5_4", "E8.5_5", "E8.5_6")])
mean_tpm$E9.5_mean <- rowMeans(tpm[, c("E9.5_1", "E9.5_2", "E9.5_3", "E9.5_4", "E9.5_5", "E9.5_6")])
row.names(mean_tpm) <- mean_tpm$trans
mean_tpm <- mean_tpm[,2:ncol(mean_tpm)]
mean_tpm$var <- matrixStats::rowVars(as.matrix(mean_tpm)) #calculate the variance
mean_tpm_keep <- subset(mean_tpm, mean_tpm$var > quantile(mean_tpm$var, 0.50)) #keep the top 50% most variable transcripts
tpm1 <- subset(tpm, rownames(tpm) %in% rownames(mean_tpm_keep))
tpm1[] <- lapply(tpm1, function(x) as.numeric(as.character(x)))
#center and scale the data, for further analysis
tpm2 <- apply(tpm1, 1, function(x) (x-mean(x))/sd(x))
tpm2 <- t(tpm2)
tpm2 <- as.data.frame(tpm2)
```
PCA analysis:
```{r}
pca <- prcomp(t(tpm2))
#summary(pca) #run if we want to see the summary
#2D PCA
df_out <- as.data.frame(pca$x)
df_out$Group <- as.factor(c(rep("E7.5", 6),
rep("E8.5", 6),
rep("E9.5", 6)))
#the following plot with PC2 and PC3 showed clearly E9.5_6 is an outlier
ggplot(df_out, aes(x=PC2, y=PC3, color=Group, label=ifelse(colnames(tpm1) == c("E7.5_1", "E9.5_6"), as.character(colnames(tpm1)), ''))) +
geom_text(hjust=0.3,size=5, show.legend = F) + geom_point(size=3) +
labs(title="Principle component analysis", x="Principle component 2", y="Principle component 3") +
theme(text = element_text(size=20), axis.text.x = element_text(angle=0, hjust=1))
```
Hierarchical clustering analysis:
```{r}
hc <- hclust(dist(t(tpm2)), "ave")
dend <- as.dendrogram(hc)
dend %>% hang.dendrogram(hang = -1) %>% plot()
```
Based on the PCA and hierarchical clustering analysis on samples, the sample 1 of E7.5 and 6 of E9.5 did not cluster with their time points, hence identified as outliers. <br>
```{r}
sessionInfo()
```