Skip to content

Commit

Permalink
Merge pull request #10 from 3mmaRand/draft-omics-02
Browse files Browse the repository at this point in the history
Draft omics 02
  • Loading branch information
3mmaRand authored Oct 13, 2023
2 parents de2becf + bb64ffd commit 59eabb4
Show file tree
Hide file tree
Showing 16 changed files with 21,819 additions and 829 deletions.
257 changes: 255 additions & 2 deletions omics/crib/cont-fgf-s30.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ library(tidyverse)
# Import ------------------------------------------------------------------

# 🐸 import the s30 data
s30 <- read_csv("data-raw/xlaevis_counts_S30.csv")
s30 <- read_csv("data-raw/xlaevis_s30_filtered.csv")


# Explore -----------------------------------------------------------------
Expand Down Expand Up @@ -192,10 +192,263 @@ write_csv(s30_filtered,


# OMICS 2 STATISTICAL ANALYSIS --------------------------------------------
library(tidyverse)
library(conflicted)
library(DESeq2)

# import the filtered s30
s30_filtered <- read_csv("data-processed/s30_filtered.csv")

# number of genes and samples. remember we filtered out any genes
# with 4, 5 or 6 zeros and those where the total count was less than 20


# first find genes that are expressed only in one group. that is those
# only zeros in one group but values in all of the others.

# Find the genes that are expressed only in the FGF-treated group:

s30_fgf_only <- s30_filtered |>
filter(S30_C_5 == 0,
S30_C_6 == 0,
S30_C_A == 0,
S30_F_5 > 0,
S30_F_6 > 0,
S30_F_A > 0)
# there are 26 genes expressed in the FGF-treated group but not in the control group

# genes that are expressed only in the control group.

s30_con_only <- s30_filtered |>
filter(S30_C_5 > 0,
S30_C_6 > 0,
S30_C_A > 0,
S30_F_5 == 0,
S30_F_6 == 0,
S30_F_A == 0)
# there are no genes expressed in the control group but not in the FGF-treated group

# Write to file
write_csv(s30_fgf_only, "results/s30_fgf_only.csv")


## Import metadata that maps the sample names to treatments
meta <- read_table("meta/frog_meta_data.txt")
row.names(meta) <- meta$sample_id

# We only need the s30
meta_S30 <- meta |>
dplyr::filter(stage == "stage_30")



# 1. check the names match: rownames of meta and column names of counts
names(s30_filtered)
row.names(meta_S30)

# verfiy with equality
names(s30_filtered[2:7]) == row.names(meta_S30)



# 2. Create a DESeq2 object
# bioconductor has custom data types
# to make a DESeqDataSet object we need
# a) count matrix,
# b) metadata and
# c) a design matrix

# The count matrix must contain only the counts
# the genes are not inside the matrix, but are the row names
s30_count_mat <- s30_filtered |>
select(-xenbase_gene_id) |>
as.matrix()
# add the genes as row names
row.names(s30_count_mat) <- s30_filtered$xenbase_gene_id

View(s30_count_mat)
knitr::kable(head(s30_count_mat))


# creates the DESeqDataSet (dds) object
dds <- DESeqDataSetFromMatrix(s30_count_mat,
colData = meta_S30,
design = ~ treatment + sibling_rep)
# design matrix is a formula that describes the experimental design using the
# column names of the metadata dataframe


# This is fine:
# converting counts to integer mode
# Warning message:

# some variables in design formula are characters, converting to factors



# To see the counts in the DESeqDataSet object, we can use the counts() function
counts(dds) |> View()
# This should be what is in s30_count_mat

# The normalisation is done automatically with the DE
# However, we need the normalised counts for data viz.


# The normalised counts can be generated using:
# estimateSizeFactors() for estimating the factors for normalisation.
dds <- estimateSizeFactors(dds)
# By assigning the results back to the dds object, we are filling in the
# slots of the DESeqDataSet object with the appropriate information.

# We can take a look at the normalization factors of each sample using:
sizeFactors(dds)
# S30_C_5 S30_C_6 S30_C_A S30_F_5 S30_F_6 S30_F_A
# 0.8812200 0.9454600 1.2989886 1.0881870 1.0518961 0.8322894

# Notice there is a relationship between the total number of
# counts and the size factors which we can see with a quick scatter plot
plot(colSums(s30_count_mat), sizeFactors(dds))

# Then to get the normalized counts as a matrix, we can use:
normalised_counts <- counts(dds, normalized = TRUE)


# Save this normalized data matrix to file for later use:
data.frame(normalised_counts,
xenbase_gene_id = row.names(normalised_counts)) |>
write_csv(file = "results/S30_normalised_counts.csv")

# These normalized counts will be useful for downstream visualization
# of results, but cannot be used as input to DESeq2 or any other tools that
# perform differential expression analysis that use the negative binomial model.


# DE WITH DESeq2 ----------------------------------------------------------

# run the actual differential expression analysis,
# we use a single call to the function DESeq().
# Run analysis
dds <- DESeq(dds)

# check dispersion estimates, checks the assumption of the DESeq2 model
plotDispEsts(dds)
# look fine


## Define contrasts for FGF overexpression
contrast_fgf <- c("treatment", "FGF", "control")
results_fgf <- results(dds,
contrast = contrast_fgf,
alpha = 0.01)
results_fgf |>
data.frame() |> head()


# OMICS 3 VISUALISE AND INTERPRET -----------------------------------------
data.frame(results_fgf,
xenbase_gene_id = row.names(results_fgf)) |>
write_csv(file = "results/S30_results.csv")












# OMICS 3 VISUALISE AND INTERPRET and chek the assumptions?? -----------------------------------------



# PCA AND CLUSTERING -----------------------

# we do this on the log2 transformed normalised counts or the regularized the
# log transformed counts
# rlog is a method to bias from low count genes.
# https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/03_DGE_QC_analysis.html gives a good explanation of regularized the log transform (rlog)


# The rlog transformation of the normalized counts is only necessary
# for these visualization methods during this quality assessment.
# They are not used for DE because DESeq2 takes care of that

### Transform counts for data visualization
rld <- rlog(dds, blind = TRUE)
# transformed counts are accessed with
# assay(rld)
rlogged <- assay(rld) |> t()
# These techniques want cells (samples in rows) and genes in columns.


# perform PCA using standard functions
pca <- rlogged |>
prcomp(scale. = TRUE)

summary(pca)
# PC1 proportion 0.3857, PC2 proportion 0.2112

pca_labelled <- data.frame(pca$x,
sample_id = row.names(rlogged))


# merge with metadata
pca_labelled <- pca_labelled |>
left_join(meta_S30,
by = "sample_id")

pca_labelled |> ggplot(aes(x = PC1, y = PC2,
colour = treatment,
shape = sibling_rep)) +
geom_point(size = 3) +
scale_colour_viridis_d(end = 0.95) +
theme_classic()

# they don't cluster that well, by treatment or sib group


# ########## TO COPY TO OTHER COMPARISONS FROM HERE
# # Hierarchical Clustering
# #
# # pheatmap need a matrix or dataframe so use SummarizedExperiment::assay()
# # which is loaded with DESeq2
# rld_mat <- assay(rld)
#
# # Compute pairwise correlation values
# rld_cor <- cor(rld_mat) ## cor() is a base R function
#
# meta_S30 <- as.data.frame(meta_S30)
# row.names(meta_S30) <- meta_S30$sample_id
# # Plot heatmap using the correlation matrix and the metadata object
# pheatmap(rld_cor,
# annotation = meta_S30[3:4])



## Principle Components Analysis (PCA)

# explore data further - at the sample and gene level check reps cluster
# together do on the normalised counts


# venn diagram for frogs, use original dataset

# look up information about the genes

# volcano plots

# MAYBE WEEK 3
# plotMA(results_fgf)
#
# resultsNames(dds)
# # "Intercept", "treatment_FGF_vs_control" "sibling_rep_five_vs_A" "sibling_rep_six_vs_A"
#
#
#
# # results_fgf_shrunken <- lfcShrink(dds,
# # coef = "treatment_FGF_vs_control",
# # type = "apeglm")
# #
# # plotMA(results_fgf_shrunken)
Loading

0 comments on commit 59eabb4

Please sign in to comment.