Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft omics 02 #10

Merged
merged 13 commits into from
Oct 13, 2023
Merged
257 changes: 255 additions & 2 deletions omics/crib/cont-fgf-s30.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ library(tidyverse)
# Import ------------------------------------------------------------------

# 🐸 import the s30 data
s30 <- read_csv("data-raw/xlaevis_counts_S30.csv")
s30 <- read_csv("data-raw/xlaevis_s30_filtered.csv")


# Explore -----------------------------------------------------------------
Expand Down Expand Up @@ -192,10 +192,263 @@ write_csv(s30_filtered,


# OMICS 2 STATISTICAL ANALYSIS --------------------------------------------
library(tidyverse)
library(conflicted)
library(DESeq2)

# import the filtered s30
s30_filtered <- read_csv("data-processed/s30_filtered.csv")

# number of genes and samples. remember we filtered out any genes
# with 4, 5 or 6 zeros and those where the total count was less than 20


# first find genes that are expressed only in one group. that is those
# only zeros in one group but values in all of the others.

# Find the genes that are expressed only in the FGF-treated group:

s30_fgf_only <- s30_filtered |>
filter(S30_C_5 == 0,
S30_C_6 == 0,
S30_C_A == 0,
S30_F_5 > 0,
S30_F_6 > 0,
S30_F_A > 0)
# there are 26 genes expressed in the FGF-treated group but not in the control group

# genes that are expressed only in the control group.

s30_con_only <- s30_filtered |>
filter(S30_C_5 > 0,
S30_C_6 > 0,
S30_C_A > 0,
S30_F_5 == 0,
S30_F_6 == 0,
S30_F_A == 0)
# there are no genes expressed in the control group but not in the FGF-treated group

# Write to file
write_csv(s30_fgf_only, "results/s30_fgf_only.csv")


## Import metadata that maps the sample names to treatments
meta <- read_table("meta/frog_meta_data.txt")
row.names(meta) <- meta$sample_id

# We only need the s30
meta_S30 <- meta |>
dplyr::filter(stage == "stage_30")



# 1. check the names match: rownames of meta and column names of counts
names(s30_filtered)
row.names(meta_S30)

# verfiy with equality
names(s30_filtered[2:7]) == row.names(meta_S30)



# 2. Create a DESeq2 object
# bioconductor has custom data types
# to make a DESeqDataSet object we need
# a) count matrix,
# b) metadata and
# c) a design matrix

# The count matrix must contain only the counts
# the genes are not inside the matrix, but are the row names
s30_count_mat <- s30_filtered |>
select(-xenbase_gene_id) |>
as.matrix()
# add the genes as row names
row.names(s30_count_mat) <- s30_filtered$xenbase_gene_id

View(s30_count_mat)
knitr::kable(head(s30_count_mat))


# creates the DESeqDataSet (dds) object
dds <- DESeqDataSetFromMatrix(s30_count_mat,
colData = meta_S30,
design = ~ treatment + sibling_rep)
# design matrix is a formula that describes the experimental design using the
# column names of the metadata dataframe


# This is fine:
# converting counts to integer mode
# Warning message:

# some variables in design formula are characters, converting to factors



# To see the counts in the DESeqDataSet object, we can use the counts() function
counts(dds) |> View()
# This should be what is in s30_count_mat

# The normalisation is done automatically with the DE
# However, we need the normalised counts for data viz.


# The normalised counts can be generated using:
# estimateSizeFactors() for estimating the factors for normalisation.
dds <- estimateSizeFactors(dds)
# By assigning the results back to the dds object, we are filling in the
# slots of the DESeqDataSet object with the appropriate information.

# We can take a look at the normalization factors of each sample using:
sizeFactors(dds)
# S30_C_5 S30_C_6 S30_C_A S30_F_5 S30_F_6 S30_F_A
# 0.8812200 0.9454600 1.2989886 1.0881870 1.0518961 0.8322894

# Notice there is a relationship between the total number of
# counts and the size factors which we can see with a quick scatter plot
plot(colSums(s30_count_mat), sizeFactors(dds))

# Then to get the normalized counts as a matrix, we can use:
normalised_counts <- counts(dds, normalized = TRUE)


# Save this normalized data matrix to file for later use:
data.frame(normalised_counts,
xenbase_gene_id = row.names(normalised_counts)) |>
write_csv(file = "results/S30_normalised_counts.csv")

# These normalized counts will be useful for downstream visualization
# of results, but cannot be used as input to DESeq2 or any other tools that
# perform differential expression analysis that use the negative binomial model.


# DE WITH DESeq2 ----------------------------------------------------------

# run the actual differential expression analysis,
# we use a single call to the function DESeq().
# Run analysis
dds <- DESeq(dds)

# check dispersion estimates, checks the assumption of the DESeq2 model
plotDispEsts(dds)
# look fine


## Define contrasts for FGF overexpression
contrast_fgf <- c("treatment", "FGF", "control")
results_fgf <- results(dds,
contrast = contrast_fgf,
alpha = 0.01)
results_fgf |>
data.frame() |> head()


# OMICS 3 VISUALISE AND INTERPRET -----------------------------------------
data.frame(results_fgf,
xenbase_gene_id = row.names(results_fgf)) |>
write_csv(file = "results/S30_results.csv")












# OMICS 3 VISUALISE AND INTERPRET and chek the assumptions?? -----------------------------------------



# PCA AND CLUSTERING -----------------------

# we do this on the log2 transformed normalised counts or the regularized the
# log transformed counts
# rlog is a method to bias from low count genes.
# https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/03_DGE_QC_analysis.html gives a good explanation of regularized the log transform (rlog)


# The rlog transformation of the normalized counts is only necessary
# for these visualization methods during this quality assessment.
# They are not used for DE because DESeq2 takes care of that

### Transform counts for data visualization
rld <- rlog(dds, blind = TRUE)
# transformed counts are accessed with
# assay(rld)
rlogged <- assay(rld) |> t()
# These techniques want cells (samples in rows) and genes in columns.


# perform PCA using standard functions
pca <- rlogged |>
prcomp(scale. = TRUE)

summary(pca)
# PC1 proportion 0.3857, PC2 proportion 0.2112

pca_labelled <- data.frame(pca$x,
sample_id = row.names(rlogged))


# merge with metadata
pca_labelled <- pca_labelled |>
left_join(meta_S30,
by = "sample_id")

pca_labelled |> ggplot(aes(x = PC1, y = PC2,
colour = treatment,
shape = sibling_rep)) +
geom_point(size = 3) +
scale_colour_viridis_d(end = 0.95) +
theme_classic()

# they don't cluster that well, by treatment or sib group


# ########## TO COPY TO OTHER COMPARISONS FROM HERE
# # Hierarchical Clustering
# #
# # pheatmap need a matrix or dataframe so use SummarizedExperiment::assay()
# # which is loaded with DESeq2
# rld_mat <- assay(rld)
#
# # Compute pairwise correlation values
# rld_cor <- cor(rld_mat) ## cor() is a base R function
#
# meta_S30 <- as.data.frame(meta_S30)
# row.names(meta_S30) <- meta_S30$sample_id
# # Plot heatmap using the correlation matrix and the metadata object
# pheatmap(rld_cor,
# annotation = meta_S30[3:4])



## Principle Components Analysis (PCA)

# explore data further - at the sample and gene level check reps cluster
# together do on the normalised counts


# venn diagram for frogs, use original dataset

# look up information about the genes

# volcano plots

# MAYBE WEEK 3
# plotMA(results_fgf)
#
# resultsNames(dds)
# # "Intercept", "treatment_FGF_vs_control" "sibling_rep_five_vs_A" "sibling_rep_six_vs_A"
#
#
#
# # results_fgf_shrunken <- lfcShrink(dds,
# # coef = "treatment_FGF_vs_control",
# # type = "apeglm")
# #
# # plotMA(results_fgf_shrunken)
Loading
Loading