-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRNA_sequencing.R
184 lines (118 loc) · 7.86 KB
/
RNA_sequencing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
## ----message=FALSE, warning=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------
library(tidyverse) # for data manipulation
library(DESeq2) # for Differential gene expression analysis
library(ggplot2) # for plotting
library(plotly) # for interactive plots
library(GenomicRanges) # for creating and manipulating genomic ranges
library(mixOmics) # for PCA
## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
assay<- read.csv("assay.csv", row.names = 1) # Importing gene expression assay data
colData <- read.csv("colData.csv") # Importing clinical information
temp<- read.csv("rawRange.csv") # Importing rawRanges as dataframe
rowRanges <- GRanges( # creating rawRanges from dataframge
seqnames = Rle(temp$chr), # Chromosome names
ranges = IRanges(start = temp$start, end =temp$end), # Start and End positions
strand = Rle(temp$strand), # Strand information (if available)
data.frame(Gene = temp$Gene, Length = temp$length)) # Additional columns
## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
print("The count data as Assay: ")
head(assay)
print("The clinical data data as colData: ")
head(colData)
print("The gene coordinates as rawRanges ")
head(rowRanges)
## ----message=FALSE, warning=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------
nreads <- data.frame(reads = colSums(assay)) %>% # Calculating library size by summing read counts for each sample
rownames_to_column("sample") %>%
mutate(Sample_Group = case_when(grepl("HK", sample) ~ "HK",
grepl("WT", sample) ~ "WT"))
lsize <- ggplot(nreads, aes(x=Sample_Group, y=reads, fill=Sample_Group,label=sample)) +
geom_boxplot() +
geom_jitter(width = 0.25) +
scale_fill_manual(values = c("forest green", "steel blue")) +
theme_minimal() +
ggtitle("Library size")
ggplotly(lsize)
## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
assay.filtered <- assay[rowSums(assay > 0) >= 2 & rowSums(assay) >= 20,]
paste0("Number of genes before filtering: ", nrow(assay))
paste0("Number of genes after filtering: ", nrow(assay.filtered))
## ----message=FALSE , warning=FALSE-------------------------------------------------------------------------------------------------------------------------------------------------------------
# Add pseudocount and calculate log2-transformed counts
log_counts <- log2(assay.filtered + 1)
# Calculate relative log expression
medians <- apply(log_counts, 1, median) # Median for each gene across samples
rle <- sweep(log_counts, 1, medians, FUN = "-") # Subtract medians (log scale)
# Reshape data for ggplot
rle_long <- reshape2::melt(rle)
colnames(rle_long) <- c("Sample", "Deviation")
rle_long$Sample_group <- rep(c("HK", "WT"), c(1005,1005))
# Plot RLE as a boxplot colored by Sample_Group
ggplot(rle_long, aes(x = Sample, y = Deviation, fill = Sample_group)) +
geom_boxplot(outlier.size = 0.5, outlier.colour = "red") +
scale_fill_manual(values = c("forest green", "steel blue")) +
theme_minimal() +
labs(
title = "RLE Plot before Normalization",
x = "Sample",
y = "Relative Log Expression (Deviation)",
fill = "Sample_group"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for clarity
## ----message=FALSE , warning=FALSE-------------------------------------------------------------------------------------------------------------------------------------------------------------
DE.obj <- DESeqDataSetFromMatrix(countData = assay.filtered,
colData = colData,
design = ~ Sample_Group)
## ----message=FALSE , warning=FALSE-------------------------------------------------------------------------------------------------------------------------------------------------------------
DE.obj <- estimateSizeFactors(DE.obj) # Estimating size factor
DE.normalized <- counts(DE.obj, normalized = T) # Normalizing counts
# Add pseudocount and calculate log2-transformed counts
log_counts <- log2(DE.normalized + 1) %>% as.data.frame()
# Calculate relative log expression
medians <- apply(log_counts, 1, median) # Median for each gene across samples
rle <- sweep(log_counts, 1, medians, FUN = "-") # Subtract medians (log scale)
# Reshape data for ggplot
rle_long <- reshape2::melt(rle)
colnames(rle_long) <- c("Sample", "Deviation")
rle_long$Sample_group <- rep(c("HK", "WT"), c(1005,1005))
# Plot RLE as a boxplot colored by Sample_Group
ggplot(rle_long, aes(x = Sample, y = Deviation, fill = Sample_group)) +
geom_boxplot(outlier.size = 0.5, outlier.colour = "red") +
scale_fill_manual(values = c("forest green", "steel blue")) +
theme_minimal() +
labs(
title = "RLE Plot after Normalization",
x = "Sample",
y = "Relative Log Expression (Deviation)",
fill = "Sample_group"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for clarity
## ----message=FALSE, warning=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------
pca1<- pca(t(DE.normalized), center = T, scale = T) # Performing PCA
plotIndiv(pca1, group = colData$Sample_Group, ind.names = F,
title = "Total expression profile",
legend = T, legend.position = "bottom", col.per.group = c("forest green", "steel blue"))
## ----message=FALSE, warning=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------
DE.obj$Sample_Group <- relevel(DE.obj$Sample_Group, ref = "HK") # Setting reference group
DE.obj <- DESeq(DE.obj) #Fitting the model
res.con <- results(DE.obj, # Extracting result table
contrast=c("Sample_Group", "WT", "HK"), # Setting Contrast
pAdjustMethod = "BH", # Using 'BH' for p-value adjustment
alpha = 0.1) # Adjusted p-value threshold
res.con <- res.con[!is.na(res.con$padj),] # Removing genes with NA adj p-values
## ----message=FALSE, warning=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------
volcano <- as.data.frame(res.con) %>%
rownames_to_column("gene") %>%
mutate(Logpadj=-log10(padj)) %>%
mutate(sig=case_when(log2FoldChange >= 1 & padj < 0.05 ~ "Up",
log2FoldChange <= -1 & padj < 0.05 ~ "Down")) %>%
ggplot(aes(x=log2FoldChange, y=Logpadj, color=sig, label=gene)) +
geom_point(size=1) +
theme_bw() +
ylab("-Log10(FDR)") +
theme(legend.position = "none") +
scale_color_manual(values = c("firebrick", "steelblue")) +
geom_vline(xintercept = c(-1, 1), col = "gray", linetype = 'dashed') +
geom_hline(yintercept = -log10(0.05), col = "gray", linetype = 'dashed') +
ggtitle("Volcano DEGs")
ggplotly(volcano)