diff --git a/Rplot001.jpg b/Rplot001.jpg
new file mode 100644
index 0000000..e69de29
diff --git a/_site/omics/week-5/figures/frog-s30-pca.png b/_site/omics/week-5/figures/frog-s30-pca.png
new file mode 100644
index 0000000..f4f1799
Binary files /dev/null and b/_site/omics/week-5/figures/frog-s30-pca.png differ
diff --git a/_site/omics/week-5/figures/prog-hspc-volcano.png b/_site/omics/week-5/figures/prog-hspc-volcano.png
new file mode 100644
index 0000000..4122bc5
Binary files /dev/null and b/_site/omics/week-5/figures/prog-hspc-volcano.png differ
diff --git a/_site/omics/week-5/figures/prog_hspc-pca.png b/_site/omics/week-5/figures/prog_hspc-pca.png
new file mode 100644
index 0000000..315fc8d
Binary files /dev/null and b/_site/omics/week-5/figures/prog_hspc-pca.png differ
diff --git a/_site/omics/week-5/images/frog-heat.png b/_site/omics/week-5/images/frog-heat.png
new file mode 100644
index 0000000..8cf1b5f
Binary files /dev/null and b/_site/omics/week-5/images/frog-heat.png differ
diff --git a/_site/omics/week-5/images/volcano-why.png b/_site/omics/week-5/images/volcano-why.png
new file mode 100644
index 0000000..26b786b
Binary files /dev/null and b/_site/omics/week-5/images/volcano-why.png differ
diff --git a/_site/omics/week-5/images/why_pca_frog.png b/_site/omics/week-5/images/why_pca_frog.png
new file mode 100644
index 0000000..14bb9f6
Binary files /dev/null and b/_site/omics/week-5/images/why_pca_frog.png differ
diff --git a/_site/omics/week-5/images/why_pca_mouse.png b/_site/omics/week-5/images/why_pca_mouse.png
new file mode 100644
index 0000000..04ef843
Binary files /dev/null and b/_site/omics/week-5/images/why_pca_mouse.png differ
diff --git a/_site/omics/week-5/study_before_workshop.html b/_site/omics/week-5/study_before_workshop.html
index 1c44600..ed3f7a3 100644
--- a/_site/omics/week-5/study_before_workshop.html
+++ b/_site/omics/week-5/study_before_workshop.html
@@ -92,6 +92,25 @@
     code span.va { color: #111111; } /* Variable */
     code span.vs { color: #20794d; } /* VerbatimString */
     code span.wa { color: #5e5e5e; font-style: italic; } /* Warning */
+    div.csl-bib-body { }
+    div.csl-entry {
+      clear: both;
+    }
+    .hanging div.csl-entry {
+      margin-left:2em;
+      text-indent:-2em;
+    }
+    div.csl-left-margin {
+      min-width:2em;
+      float:left;
+    }
+    div.csl-right-inline {
+      margin-left:2em;
+      padding-left:1em;
+    }
+    div.csl-indent {
+      margin-left: 2em;
+    }
   </style>
 <link rel="stylesheet" href="../../site_libs/revealjs/dist/theme/quarto.css" id="theme">
 <link href="../../site_libs/revealjs/plugin/quarto-line-highlight/line-highlight.css" rel="stylesheet">
@@ -476,9 +495,9 @@
 </section><section id="examine-the-results-files-1" class="slide level2"><h2>Examine the results files</h2>
 <p>Remind yourself of the key columns you have in the results files:</p>
 <ul>
-<li>a fold change, logged to base 2</li>
-<li>an unadjusted p-value</li>
-<li>a p value adjusted for multiple testing (<code>FDR</code> or <code>padj</code>)</li>
+<li>a log<sub>2</sub> fold change</li>
+<li>an unadjusted <em>p</em>-value</li>
+<li>a <em>p</em> value adjusted for multiple testing (<code>FDR</code> or <code>padj</code>)</li>
 <li>a gene id</li>
 </ul></section><section id="frogs-1" class="slide level2"><h2>🐸 Frogs</h2>
 <div class="cell">
@@ -494,6 +513,7 @@
 $ xenbase_gene_id &lt;chr&gt; "XB-GENE-1000007", "XB-GENE-1000023", "XB-GENE-1000062…</code></pre>
 </div>
 </div>
+<div class="fragment">
 <ul>
 <li>
 <code>baseMean</code> is the mean of the normalised counts for the gene across all samples</li>
@@ -501,7 +521,11 @@
 <code>lfcSE</code> standard error of the fold change</li>
 <li>
 <code>stat</code> is the test statistic (the Wald statistic)</li>
-</ul></section><section id="mice-1" class="slide level2"><h2>🐭 Mice</h2>
+<li>Generated by <strong><code>DESeq2</code></strong> <span class="citation" data-cites="DESeq2">(<a href="#/references" role="doc-biblioref" onclick="">Love, Huber, and Anders 2014</a>)</span>
+</li>
+</ul>
+</div>
+</section><section id="mice-1" class="slide level2"><h2>🐭 Mice</h2>
 <div class="cell">
 <div class="cell-output cell-output-stdout">
 <pre><code>Rows: 280
@@ -514,24 +538,32 @@
 $ ensembl_gene_id &lt;chr&gt; "ENSMUSG00000028639", "ENSMUSG00000024053", "ENSMUSG00…</code></pre>
 </div>
 </div>
+<div class="fragment">
 <ul>
-<li>Top is the rank of the gene ordered by the p value (smallest first)</li>
-</ul></section></section><section><section id="adding-gene-information" class="title-slide slide level1 center"><h1>Adding gene information</h1>
+<li>Top is the rank of the gene ordered by the <em>p</em>-value (smallest first)</li>
+<li>
+<code>summary.logFC</code> and <code>logFC.hspc</code> give the same value (in this case since comparing two cell types)</li>
+<li>generated by <strong><code>scran</code></strong> <span class="citation" data-cites="scran">(<a href="#/references" role="doc-biblioref" onclick="">Lun, McCarthy, and Marioni 2016</a>)</span>
+</li>
+</ul>
+</div>
+</section></section><section><section id="adding-gene-information" class="title-slide slide level1 center"><h1>Adding gene information</h1>
 
 </section><section id="adding-gene-information-1" class="slide level2"><h2>Adding gene information</h2>
 <div>
 <ul>
 <li class="fragment"><p>The gene id is difficult to interpret in plots/tables</p></li>
 <li class="fragment"><p>Therefore we need to add information such as the gene name and a description to the results</p></li>
-<li class="fragment"><p>For the 🐸 Frog data information comes from xenbase</p></li>
-<li class="fragment"><p>For the 🐭 Mice data information comes from Ensembl</p></li>
+<li class="fragment"><p>For the 🐸 Frog data information comes from Xenbase <span class="citation" data-cites="fisher2023">(<a href="#/references" role="doc-biblioref" onclick="">Fisher et al. 2023</a>)</span></p></li>
+<li class="fragment"><p>For the 🐭 Mice data information comes from Ensembl <span class="citation" data-cites="birney2004">(<a href="#/references" role="doc-biblioref" onclick="">Birney et al. 2004</a>)</span></p></li>
 </ul>
 </div>
 </section><section id="xenbase" class="slide level2"><h2>🐸 Xenbase</h2>
 
-<img data-src="images/Xenbase-Logo-Medium.png" width="800" class="r-stretch quarto-figure-center"><p class="caption">xenbase logo</p><p><a href="http://www.xenbase.org/">Xenbase</a></p>
-<p>Xenbase is a model organism database that provides genomic, molecular, and developmental biology information about <em>Xenopus laevis</em> and <em>Xenopus tropicalis</em>.</p>
+<img data-src="images/Xenbase-Logo-Medium.png" width="800" class="r-stretch quarto-figure-center"><p class="caption">xenbase logo</p><p><a href="http://www.xenbase.org/">Xenbase</a> is a model organism database that provides genomic, molecular, and developmental biology information about <em>Xenopus laevis</em> and <em>Xenopus tropicalis</em>.</p>
+<div class="fragment">
 <p>It took me some time to find the information you need.</p>
+</div>
 </section><section id="xenbase-1" class="slide level2"><h2>🐸 Xenbase</h2>
 <div>
 <ul>
@@ -539,35 +571,148 @@
 <li class="fragment"><p>This is listed: Xenbase Gene Product Information [readme] <a href="https://download.xenbase.org/xenbase/GenePageReports/xenbase.gpi.gz">gzipped gpi (tab separated)</a></p></li>
 <li class="fragment"><p>Click on the readme link to see the file format and columns</p></li>
 <li class="fragment"><p>I downloaded <a href="https://download.xenbase.org/xenbase/GenePageReports/xenbase.gpi.gz">xenbase.gpi.gz</a>, unzipped it, removed header lines and the <em>Xenopus tropicalis</em> (taxon:8364) entries and saved it as <a href="meta/xenbase_info.xlsx">xenbase_info.xlsx</a></p></li>
-<li class="fragment"><p>In the workshop you will merge this information with the results file</p></li>
+<li class="fragment"><p>In the workshop you will import this file and merge the information with the results file</p></li>
 </ul>
 </div>
 </section><section id="ensembl" class="slide level2"><h2>🐭 Ensembl</h2>
-<p>from the ncbi</p>
-<p>biomart is a package that allows you to get information from the ncbi database such as gene names and descriptions</p>
+<div>
+<ul>
+<li class="fragment"><p><a href="https://www.ensembl.org/index.html">Ensembl</a> creates, integrates and distributes reference datasets and analysis tools that enable genomics</p></li>
+<li class="fragment"><p><a href="https://grch37.ensembl.org/info/data/biomart/index.html">BioMart</a> provides a access to these large datasets</p></li>
+<li class="fragment"><p><strong><code>biomaRt</code></strong> <span class="citation" data-cites="biomaRt">(<a href="#/references" role="doc-biblioref" onclick="">Durinck et al. 2009</a>)</span> is a Bioconductor package gives you programmatic access to BioMart.</p></li>
+<li class="fragment"><p>In the workshop you use this package to get information you can merge with the results file</p></li>
+</ul>
+</div>
 </section></section><section><section id="plots" class="title-slide slide level1 center"><h1>Plots</h1>
 
-</section><section id="plots-purpose" class="slide level2"><h2>plots purpose</h2>
-<p>dimsenion reduction</p>
-</section><section id="pca" class="slide level2"><h2>pca</h2>
-<p>lots of variables</p>
-</section><section id="tsne" class="slide level2"><h2>tsne</h2>
-<p>lots of variables and lots of observations</p>
-</section></section><section><section id="normalsing-before-plotting" class="title-slide slide level1 center"><h1>normalsing before plotting</h1>
-
-</section><section id="normalising-before-plotting" class="slide level2"><h2>normalising before plotting</h2>
-<p>log</p>
-<p>normalisation regularised log is a method to bias from low count genes. https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/03_DGE_QC_analysis.html</p>
+</section><section id="what-is-the-purpose-of-an-omics-plot" class="slide level2"><h2>What is the purpose of an Omics plot?</h2>
+<div>
+<ul>
+<li class="fragment"><p>In general, we plot data to help us summarise and understand it</p></li>
+<li class="fragment"><p>This is especially import for omics data where we have a very large number of variables and often a large number of observations</p></li>
+<li class="fragment"><p>We will look at three plots very commonly used in omics analysis: Principal Component Analysis (PCA) plot, Heatmaps and Volcano Plots</p></li>
+</ul>
+</div>
+</section></section><section><section id="principal-component-analysis-pca" class="title-slide slide level1 center"><h1>Principal Component Analysis (PCA)</h1>
+
+</section><section id="pca" class="slide level2"><h2>PCA</h2>
+<div>
+<ul>
+<li class="fragment"><p>Principal Component Analysis is an unsupervised machine learning technique</p></li>
+<li class="fragment"><p>Unsupervised methods<sup>1</sup> are unsupervised in that they do not use/optimise to a particular output. The goal is to uncover structure. They do not test hypotheses</p></li>
+<li class="fragment"><p>It is often used to visualise high dimensional data because it is a dimension reduction technique</p></li>
+</ul>
+</div>
+<aside><ol class="aside-footnotes"><li id="fn1"><p>You may wish to read a previous introduction to unsupervised methods I have written <a href="https://3mmarand.github.io/BIO00058M-Data-science-2020/slides/05_intro_to_ML_unsupervised.html#1">An introduction to Machine Learning: Unsupervised methods</a> <span class="citation" data-cites="rand2021">(<a href="#/references" role="doc-biblioref" onclick="">Rand 2021</a>)</span></p></li></ol></aside></section><section id="pca-1" class="slide level2"><h2>PCA</h2>
+<div>
+<ul>
+<li class="fragment"><p>It takes a large number of continuous variables (like gene expression) and reduces them to a smaller number of variables (called principal components) that explain most of the variation in the data</p></li>
+<li class="fragment"><p>The principal components can be plotted to see how samples cluster together</p></li>
+</ul>
+</div>
+</section><section id="pca-2" class="slide level2"><h2>PCA</h2>
+<ul>
+<li>To see if samples cluster as we would expect, we might plot the expression of one gene against another</li>
+</ul>
+<div class="quarto-layout-panel">
+<div class="quarto-layout-row quarto-layout-valign-top">
+<div class="quarto-layout-cell" style="flex-basis: 50.0%;justify-content: center;">
+<div class="quarto-figure quarto-figure-center">
+<figure><p><img data-src="images/why_pca_frog.png" width="300"></p>
+<p></p><figcaption>Samples</figcaption><p></p>
+</figure>
+</div>
+</div>
+<div class="quarto-layout-cell" style="flex-basis: 50.0%;justify-content: center;">
+<div class="quarto-figure quarto-figure-center">
+<figure><p><img data-src="images/why_pca_mouse.png" width="300"></p>
+<p></p><figcaption>Cells</figcaption><p></p>
+</figure>
+</div>
+</div>
+</div>
+</div>
+<p>This gives some insight but we have 280 (mice) or 10,000+(frogs) genes to consider. How do we know if the pair we use is typical? How can we consider al the genes at once?</p>
+</section><section id="pca-3" class="slide level2"><h2>PCA</h2>
+<ul>
+<li>PCA is a solution for this - It takes a large number of continuous variables (like gene expression) and reduces them to a smaller number of “principal components” that explain most of the variation in the data.</li>
+</ul>
+<div class="quarto-layout-panel">
+<div class="quarto-layout-row quarto-layout-valign-top">
+<div class="quarto-layout-cell" style="flex-basis: 50.0%;justify-content: center;">
+<div class="quarto-figure quarto-figure-center">
+<figure><p><img data-src="figures/frog-s30-pca.png" width="300"></p>
+<p></p><figcaption>Samples</figcaption><p></p>
+</figure>
+</div>
+</div>
+<div class="quarto-layout-cell" style="flex-basis: 50.0%;justify-content: center;">
+<div class="quarto-figure quarto-figure-center">
+<figure><p><img data-src="figures/prog_hspc-pca.png" width="300"></p>
+<p></p><figcaption>Cells</figcaption><p></p>
+</figure>
+</div>
+</div>
+</div>
+</div>
+<p>We have done PCA in Omics 3, but often PCA might be one of the first exploratory steps because it gives you an idea whether you expect general patterns in gene expression that distinguish groups.</p>
+</section></section><section><section id="heatmaps" class="title-slide slide level1 center"><h1>Heatmaps</h1>
+
+</section><section id="heatmaps-1" class="slide level2"><h2>Heatmaps</h2>
 <div>
 <ul>
-<li class="fragment">T</li>
+<li class="fragment"><p>are a grid of genes on one axis and samples on the other with each grid cell coloured by another variable</p></li>
+<li class="fragment"><p>in this case the other variable is gene expression</p></li>
+<li class="fragment"><p>they allow you to quickly get an overview of the expression patterns across genes and samples</p></li>
+<li class="fragment"><p>we often couple them with clustering to group genes and samples with similar expression patterns together which helps us see which genes are responsible for distinguishing groups</p></li>
 </ul>
 </div>
-<p>rlog is a method to bias from low count genes. https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/03_DGE_QC_analysis.html gives a good explanation of regularized the log transform (rlog)</p>
-<p>The rlog transformation of the normalized counts is only necessary for these visualization methods during this quality assessment. They are not used for DE because DESeq2 takes care of that</p>
-<p>in the workshop we just to log transformed</p>
+</section><section id="section-1" class="slide level2"><h2></h2>
+<div class="quarto-figure quarto-figure-center">
+<figure><p><img data-src="images/frog-heat.png" height="800"></p>
+<p></p><figcaption>Heat map for the frog data</figcaption><p></p>
+</figure>
+</div>
+<p>See next slide for information</p>
+</section><section id="heatmaps-2" class="slide level2"><h2>Heatmaps</h2>
+<div>
 <ul>
-<li>The 🐭 mouse data have been normalised to simplify the analysis for you; the 🐸 frog data have not but the DE method will do this for you.</li>
+<li class="fragment"><p>On the vertical axis are genes which are differentially expressed at the 0.01 level</p></li>
+<li class="fragment"><p>On the horizontal axis are samples</p></li>
+<li class="fragment"><p>We can see that the FGF-treated samples cluster together and the control samples cluster together</p></li>
+<li class="fragment"><p>We can also see two clusters of genes; one of these shows genes upregulated (more yellow) in the FGF-treated samples and the other shows genes downregulated (more blue) in the FGF-treated samples</p></li>
+</ul>
+</div>
+</section></section><section><section id="volcano-plots" class="title-slide slide level1 center"><h1>Volcano plots</h1>
+
+</section><section id="volcano-plots-1" class="slide level2"><h2>Volcano plots</h2>
+<div>
+<ul>
+<li class="fragment"><p>Volcano plots often used to visualise the results of differential expression analysis</p></li>
+<li class="fragment"><p>They are just a scatter of the corrected p value against the fold change….</p></li>
+<li class="fragment"><p>almost - the we actually plot the negative log of the corrected p value against the fold change</p></li>
+</ul>
+</div>
+</section><section id="volcano-plots-2" class="slide level2"><h2>Volcano plots</h2>
+<div style="font-size: 70%;">
+<ul>
+<li><p>This is because just plotting the <em>p</em>-value means the axis is counter intuitive. Small <em>p</em>-values (i.e., significant values) are at the bottom of the axis)</p></li>
+<li><p>And since <em>p</em>-values range from 1 to very tiny the points are all squashed at the bottom of the axis</p></li>
+</ul>
+</div>
+
+<img data-src="images/volcano-why.png" class="r-stretch quarto-figure-center"><p class="caption">Volcano plot FDR against fold change</p></section><section id="volcano-plots-3" class="slide level2"><h2>Volcano plots</h2>
+<div style="font-size: 70%;">
+<ul>
+<li>Plotting the negative log of the corrected <em>p</em>-value means that the values are spread out and the significant values are at the top of the axis</li>
+</ul>
+</div>
+
+<img data-src="figures/prog-hspc-volcano.png" class="r-stretch quarto-figure-center"><p class="caption">Volcano plot -log(FDR) against fold change</p></section><section id="visualisations" class="slide level2"><h2>Visualisations</h2>
+<ul>
+<li><p>Should be done on normalised data so meaningful comparisons can be made</p></li>
+<li><p>The 🐭 mouse data were already log<sub>2</sub>normalised</p></li>
+<li><p>The 🐸 frog data were normalised by the DE method and saved to file. We will log<sub>2</sub> transform before doing visualisations</p></li>
 </ul></section><section id="packages-to-install-before-the-workshop" class="slide level2"><h2>Packages to install before the workshop</h2>
 <p><strong><code>heatmaply</code></strong> <strong><code>ggrepel</code></strong> from CRAN in the the normal way:</p>
 <div class="cell">
@@ -585,11 +730,31 @@
 <li><p>Omics 1: Hello data Getting to know the data. Checking the distributions of values</p></li>
 <li><p>Omics 2: Statistical Analysis Identifying which genes are differentially expressed between treatments.</p></li>
 <li><p>Omics 3: Visualising and Interpreting. PCA, Volcano plots and heatmaps to visualise results. Interpreting the results and finding out more about genes of interest.</p></li>
-</ul></section><section id="references" class="slide level2"><h2>References</h2>
+</ul></section><section id="references" class="slide level2 smaller scrollable"><h2>References</h2>
 
 <div class="footer footer-default">
 <p>🔗 <a href="https://3mmarand.github.io/BIO00088H-data/omics/week-5/overview.html">About Omics 3: Visualising and Interpreting</a></p>
 </div>
+<div id="refs" class="references csl-bib-body hanging-indent" role="doc-bibliography">
+<div id="ref-birney2004" class="csl-entry" role="doc-biblioentry">
+Birney, Ewan, T. Daniel Andrews, Paul Bevan, Mario Caccamo, Yuan Chen, Laura Clarke, Guy Coates, et al. 2004. <span>“An Overview of Ensembl.”</span> <em>Genome Research</em> 14 (5): 925–28. <a href="https://doi.org/10.1101/gr.1860604">https://doi.org/10.1101/gr.1860604</a>.
+</div>
+<div id="ref-biomaRt" class="csl-entry" role="doc-biblioentry">
+Durinck, Steffen, Paul T. Spellman, Ewan Birney, and Wolfgang Huber. 2009. <span>“Mapping Identifiers for the Integration of Genomic Datasets with the r/Bioconductor Package biomaRt”</span> 4.
+</div>
+<div id="ref-fisher2023" class="csl-entry" role="doc-biblioentry">
+Fisher, Malcolm, Christina James-Zorn, Virgilio Ponferrada, Andrew J Bell, Nivitha Sundararaj, Erik Segerdell, Praneet Chaturvedi, et al. 2023. <span>“Xenbase: Key Features and Resources of the Xenopus Model Organism Knowledgebase.”</span> <em>Genetics</em> 224 (1): iyad018. <a href="https://doi.org/10.1093/genetics/iyad018">https://doi.org/10.1093/genetics/iyad018</a>.
+</div>
+<div id="ref-DESeq2" class="csl-entry" role="doc-biblioentry">
+Love, Michael I., Wolfgang Huber, and Simon Anders. 2014. <span>“Moderated Estimation of Fold Change and Dispersion for RNA-Seq Data with DESeq2”</span> 15: 550. <a href="https://doi.org/10.1186/s13059-014-0550-8">https://doi.org/10.1186/s13059-014-0550-8</a>.
+</div>
+<div id="ref-scran" class="csl-entry" role="doc-biblioentry">
+Lun, Aaron T. L., Davis J. McCarthy, and John C. Marioni. 2016. <span>“A Step-by-Step Workflow for Low-Level Analysis of Single-Cell RNA-Seq Data with Bioconductor”</span> 5: 2122. <a href="https://doi.org/10.12688/f1000research.9501.2">https://doi.org/10.12688/f1000research.9501.2</a>.
+</div>
+<div id="ref-rand2021" class="csl-entry" role="doc-biblioentry">
+Rand, Emma. 2021. <em>Data Science Strand of BIO00058M</em>. <a href="https://doi.org/10.5281/zenodo.5527705">https://doi.org/10.5281/zenodo.5527705</a>.
+</div>
+</div>
 </section></section>
 </div>
   </div>
diff --git a/_site/search.json b/_site/search.json
index 1e7152a..41513ad 100644
--- a/_site/search.json
+++ b/_site/search.json
@@ -1472,21 +1472,21 @@
     "href": "omics/week-5/study_before_workshop.html#examine-the-results-files-1",
     "title": "Independent Study to prepare for workshop",
     "section": "Examine the results files",
-    "text": "Examine the results files\nRemind yourself of the key columns you have in the results files:\n\na fold change, logged to base 2\nan unadjusted p-value\na p value adjusted for multiple testing (FDR or padj)\na gene id"
+    "text": "Examine the results files\nRemind yourself of the key columns you have in the results files:\n\na log2 fold change\nan unadjusted p-value\na p value adjusted for multiple testing (FDR or padj)\na gene id"
   },
   {
     "objectID": "omics/week-5/study_before_workshop.html#frogs-1",
     "href": "omics/week-5/study_before_workshop.html#frogs-1",
     "title": "Independent Study to prepare for workshop",
     "section": "🐸 Frogs",
-    "text": "🐸 Frogs\n\n\nRows: 10,136\nColumns: 7\n$ baseMean        <dbl> 237.553928, 531.565700, 86.392830, 49.813502, 419.9983…\n$ log2FoldChange  <dbl> 0.096601855, -0.089588528, -0.192811203, -0.008858703,…\n$ lfcSE           <dbl> 0.2079396, 0.1557384, 0.3253216, 0.4342614, 0.1685420,…\n$ stat            <dbl> 0.46456683, -0.57525007, -0.59267874, -0.02039947, -0.…\n$ pvalue          <dbl> 0.64224169, 0.56512218, 0.55339617, 0.98372471, 0.8699…\n$ padj            <dbl> 0.9998970, 0.9998970, 0.9998970, 0.9998970, 0.9998970,…\n$ xenbase_gene_id <chr> \"XB-GENE-1000007\", \"XB-GENE-1000023\", \"XB-GENE-1000062…\n\n\n\n\nbaseMean is the mean of the normalised counts for the gene across all samples\n\nlfcSE standard error of the fold change\n\nstat is the test statistic (the Wald statistic)"
+    "text": "🐸 Frogs\n\n\nRows: 10,136\nColumns: 7\n$ baseMean        <dbl> 237.553928, 531.565700, 86.392830, 49.813502, 419.9983…\n$ log2FoldChange  <dbl> 0.096601855, -0.089588528, -0.192811203, -0.008858703,…\n$ lfcSE           <dbl> 0.2079396, 0.1557384, 0.3253216, 0.4342614, 0.1685420,…\n$ stat            <dbl> 0.46456683, -0.57525007, -0.59267874, -0.02039947, -0.…\n$ pvalue          <dbl> 0.64224169, 0.56512218, 0.55339617, 0.98372471, 0.8699…\n$ padj            <dbl> 0.9998970, 0.9998970, 0.9998970, 0.9998970, 0.9998970,…\n$ xenbase_gene_id <chr> \"XB-GENE-1000007\", \"XB-GENE-1000023\", \"XB-GENE-1000062…\n\n\n\n\n\nbaseMean is the mean of the normalised counts for the gene across all samples\n\nlfcSE standard error of the fold change\n\nstat is the test statistic (the Wald statistic)\nGenerated by DESeq2 (Love, Huber, and Anders 2014)"
   },
   {
     "objectID": "omics/week-5/study_before_workshop.html#mice-1",
     "href": "omics/week-5/study_before_workshop.html#mice-1",
     "title": "Independent Study to prepare for workshop",
     "section": "🐭 Mice",
-    "text": "🐭 Mice\n\n\nRows: 280\nColumns: 6\n$ Top             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…\n$ p.value         <dbl> 7.038138e-117, 4.736622e-90, 1.832630e-88, 4.211954e-7…\n$ FDR             <dbl> 1.970679e-114, 6.631271e-88, 1.710455e-86, 2.948368e-7…\n$ summary.logFC   <dbl> 1.596910, 3.035165, 3.261056, -2.146491, -3.056730, 3.…\n$ logFC.hspc      <dbl> 1.596910, 3.035165, 3.261056, -2.146491, -3.056730, 3.…\n$ ensembl_gene_id <chr> \"ENSMUSG00000028639\", \"ENSMUSG00000024053\", \"ENSMUSG00…\n\n\n\nTop is the rank of the gene ordered by the p value (smallest first)"
+    "text": "🐭 Mice\n\n\nRows: 280\nColumns: 6\n$ Top             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…\n$ p.value         <dbl> 7.038138e-117, 4.736622e-90, 1.832630e-88, 4.211954e-7…\n$ FDR             <dbl> 1.970679e-114, 6.631271e-88, 1.710455e-86, 2.948368e-7…\n$ summary.logFC   <dbl> 1.596910, 3.035165, 3.261056, -2.146491, -3.056730, 3.…\n$ logFC.hspc      <dbl> 1.596910, 3.035165, 3.261056, -2.146491, -3.056730, 3.…\n$ ensembl_gene_id <chr> \"ENSMUSG00000028639\", \"ENSMUSG00000024053\", \"ENSMUSG00…\n\n\n\n\nTop is the rank of the gene ordered by the p-value (smallest first)\n\nsummary.logFC and logFC.hspc give the same value (in this case since comparing two cell types)\ngenerated by scran (Lun, McCarthy, and Marioni 2016)"
   },
   {
     "objectID": "omics/week-5/study_before_workshop.html#from-xenbase",
@@ -1513,8 +1513,8 @@
     "objectID": "omics/week-5/study_before_workshop.html#pca",
     "href": "omics/week-5/study_before_workshop.html#pca",
     "title": "Independent Study to prepare for workshop",
-    "section": "pca",
-    "text": "pca\nlots of variables"
+    "section": "PCA",
+    "text": "PCA\n\n\nPrincipal Component Analysis is an unsupervised machine learning technique\nUnsupervised methods1 are unsupervised in that they do not use/optimise to a particular output. The goal is to uncover structure. They do not test hypotheses\nIt is often used to visualise high dimensional data because it is a dimension reduction technique\n\n\nYou may wish to read a previous introduction to unsupervised methods I have written An introduction to Machine Learning: Unsupervised methods (Rand 2021)"
   },
   {
     "objectID": "omics/week-5/study_before_workshop.html#tsne",
@@ -1549,34 +1549,118 @@
     "href": "omics/week-5/study_before_workshop.html#references",
     "title": "Independent Study to prepare for workshop",
     "section": "References",
-    "text": "References\n\n\n🔗 About Omics 3: Visualising and Interpreting"
+    "text": "References\n\n\n🔗 About Omics 3: Visualising and Interpreting\n\n\n\nBirney, Ewan, T. Daniel Andrews, Paul Bevan, Mario Caccamo, Yuan Chen, Laura Clarke, Guy Coates, et al. 2004. “An Overview of Ensembl.” Genome Research 14 (5): 925–28. https://doi.org/10.1101/gr.1860604.\n\n\nDurinck, Steffen, Paul T. Spellman, Ewan Birney, and Wolfgang Huber. 2009. “Mapping Identifiers for the Integration of Genomic Datasets with the r/Bioconductor Package biomaRt” 4.\n\n\nFisher, Malcolm, Christina James-Zorn, Virgilio Ponferrada, Andrew J Bell, Nivitha Sundararaj, Erik Segerdell, Praneet Chaturvedi, et al. 2023. “Xenbase: Key Features and Resources of the Xenopus Model Organism Knowledgebase.” Genetics 224 (1): iyad018. https://doi.org/10.1093/genetics/iyad018.\n\n\nLove, Michael I., Wolfgang Huber, and Simon Anders. 2014. “Moderated Estimation of Fold Change and Dispersion for RNA-Seq Data with DESeq2” 15: 550. https://doi.org/10.1186/s13059-014-0550-8.\n\n\nLun, Aaron T. L., Davis J. McCarthy, and John C. Marioni. 2016. “A Step-by-Step Workflow for Low-Level Analysis of Single-Cell RNA-Seq Data with Bioconductor” 5: 2122. https://doi.org/10.12688/f1000research.9501.2.\n\n\nRand, Emma. 2021. Data Science Strand of BIO00058M. https://doi.org/10.5281/zenodo.5527705."
   },
   {
     "objectID": "omics/week-5/study_before_workshop.html#adding-gene-information-1",
     "href": "omics/week-5/study_before_workshop.html#adding-gene-information-1",
     "title": "Independent Study to prepare for workshop",
     "section": "Adding gene information",
-    "text": "Adding gene information\n\n\nThe gene id is difficult to interpret in plots/tables\nTherefore we need to add information such as the gene name and a description to the results\nFor the 🐸 Frog data information comes from xenbase\nFor the 🐭 Mice data information comes from Ensembl"
+    "text": "Adding gene information\n\n\nThe gene id is difficult to interpret in plots/tables\nTherefore we need to add information such as the gene name and a description to the results\nFor the 🐸 Frog data information comes from Xenbase (Fisher et al. 2023)\nFor the 🐭 Mice data information comes from Ensembl (Birney et al. 2004)"
   },
   {
     "objectID": "omics/week-5/study_before_workshop.html#xenbase",
     "href": "omics/week-5/study_before_workshop.html#xenbase",
     "title": "Independent Study to prepare for workshop",
     "section": "🐸 Xenbase",
-    "text": "🐸 Xenbase\n\nxenbase logoXenbase\nXenbase is a model organism database that provides genomic, molecular, and developmental biology information about Xenopus laevis and Xenopus tropicalis.\nIt took me some time to find the information you need."
+    "text": "🐸 Xenbase\n\nxenbase logoXenbase is a model organism database that provides genomic, molecular, and developmental biology information about Xenopus laevis and Xenopus tropicalis.\n\nIt took me some time to find the information you need."
   },
   {
     "objectID": "omics/week-5/study_before_workshop.html#xenbase-1",
     "href": "omics/week-5/study_before_workshop.html#xenbase-1",
     "title": "Independent Study to prepare for workshop",
     "section": "🐸 Xenbase",
-    "text": "🐸 Xenbase\n\n\nI got the information from the Xenbase information pages under Data Reports | Gene Information\nThis is listed: Xenbase Gene Product Information [readme] gzipped gpi (tab separated)\nClick on the readme link to see the file format and columns\nI downloaded xenbase.gpi.gz, unzipped it, removed header lines and the Xenopus tropicalis (taxon:8364) entries and saved it as xenbase_info.xlsx\nIn the workshop you will merge this information with the results file"
+    "text": "🐸 Xenbase\n\n\nI got the information from the Xenbase information pages under Data Reports | Gene Information\nThis is listed: Xenbase Gene Product Information [readme] gzipped gpi (tab separated)\nClick on the readme link to see the file format and columns\nI downloaded xenbase.gpi.gz, unzipped it, removed header lines and the Xenopus tropicalis (taxon:8364) entries and saved it as xenbase_info.xlsx\nIn the workshop you will import this file and merge the information with the results file"
   },
   {
     "objectID": "omics/week-5/study_before_workshop.html#ensembl",
     "href": "omics/week-5/study_before_workshop.html#ensembl",
     "title": "Independent Study to prepare for workshop",
     "section": "🐭 Ensembl",
-    "text": "🐭 Ensembl\nfrom the ncbi\nbiomart is a package that allows you to get information from the ncbi database such as gene names and descriptions"
+    "text": "🐭 Ensembl\n\n\nEnsembl creates, integrates and distributes reference datasets and analysis tools that enable genomics\nBioMart provides a access to these large datasets\nbiomaRt (Durinck et al. 2009) is a Bioconductor package gives you programmatic access to BioMart.\nIn the workshop you use this package to get information you can merge with the results file"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#what-is-the-purpose-of-an-omics-plot",
+    "href": "omics/week-5/study_before_workshop.html#what-is-the-purpose-of-an-omics-plot",
+    "title": "Independent Study to prepare for workshop",
+    "section": "What is the purpose of an Omics plot?",
+    "text": "What is the purpose of an Omics plot?\n\n\nIn general, we plot data to help us summarise and understand it\nThis is especially import for omics data where we have a very large number of variables and often a large number of observations\nWe will look at three plots very commonly used in omics analysis: Principal Component Analysis (PCA) plot, Heatmaps and Volcano Plots"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#pca-1",
+    "href": "omics/week-5/study_before_workshop.html#pca-1",
+    "title": "Independent Study to prepare for workshop",
+    "section": "PCA",
+    "text": "PCA\n\n\nIt takes a large number of continuous variables (like gene expression) and reduces them to a smaller number of variables (called principal components) that explain most of the variation in the data\nThe principal components can be plotted to see how samples cluster together"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#pca-2",
+    "href": "omics/week-5/study_before_workshop.html#pca-2",
+    "title": "Independent Study to prepare for workshop",
+    "section": "PCA",
+    "text": "PCA\n\nTo see if samples cluster as we would expect, we might plot the expression of one gene against another\n\n\n\n\n\n\nSamples\n\n\n\n\n\n\nCells\n\n\n\n\n\nThis gives some insight but we have 280 (mice) or 10,000+(frogs) genes to consider. How do we know if the pair we use is typical? How can we consider al the genes at once?"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#heatmaps-1",
+    "href": "omics/week-5/study_before_workshop.html#heatmaps-1",
+    "title": "Independent Study to prepare for workshop",
+    "section": "Heatmaps",
+    "text": "Heatmaps\n\n\nare a grid of genes on one axis and samples on the other with each grid cell coloured by another variable\nin this case the other variable is gene expression\nthey allow you to quickly get an overview of the expression patterns across genes and samples\nwe often couple them with clustering to group genes and samples with similar expression patterns together which helps us see which genes are responsible for distinguishing groups"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#volcano-plots-1",
+    "href": "omics/week-5/study_before_workshop.html#volcano-plots-1",
+    "title": "Independent Study to prepare for workshop",
+    "section": "Volcano plots",
+    "text": "Volcano plots\n\n\nVolcano plots often used to visualise the results of differential expression analysis\nThey are just a scatter of the corrected p value against the fold change….\nalmost - the we actually plot the negative log of the corrected p value against the fold change"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#volcano-plots-2",
+    "href": "omics/week-5/study_before_workshop.html#volcano-plots-2",
+    "title": "Independent Study to prepare for workshop",
+    "section": "Volcano plots",
+    "text": "Volcano plots\n\n\nThis is because just plotting the p-value means the axis is counter intuitive. Small p-values (i.e., significant values) are at the bottom of the axis)\nAnd since p-values range from 1 to very tiny the points are all squashed at the bottom of the axis\n\n\n\nVolcano plot FDR against fold change"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#volcano-plots-3",
+    "href": "omics/week-5/study_before_workshop.html#volcano-plots-3",
+    "title": "Independent Study to prepare for workshop",
+    "section": "Volcano plots",
+    "text": "Volcano plots\n\n\nPlotting the negative log of the corrected p-value means that the values are spread out and the significant values are at the top of the axis\n\n\n\nVolcano plot -log(FDR) against fold change"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#visualisations",
+    "href": "omics/week-5/study_before_workshop.html#visualisations",
+    "title": "Independent Study to prepare for workshop",
+    "section": "Visualisations",
+    "text": "Visualisations\n\nShould be done on normalised data so meaningful comparisons can be made\nThe 🐭 mouse data were already log2normalised\nThe 🐸 frog data were normalised by the DE method and saved to file. We will log2 transform before doing visualisations"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#heatmaps-2",
+    "href": "omics/week-5/study_before_workshop.html#heatmaps-2",
+    "title": "Independent Study to prepare for workshop",
+    "section": "Heatmaps",
+    "text": "Heatmaps\n\n\nOn the vertical axis are genes which are differentially expressed at the 0.01 level\nOn the horizontal axis are samples\nWe can see that the FGF-treated samples cluster together and the control samples cluster together\nWe can also see two clusters of genes; one of these shows genes upregulated (more yellow) in the FGF-treated samples and the other shows genes downregulated (more blue) in the FGF-treated samples"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#heatmaps-3",
+    "href": "omics/week-5/study_before_workshop.html#heatmaps-3",
+    "title": "Independent Study to prepare for workshop",
+    "section": "Heatmaps",
+    "text": "Heatmaps\n\n\nOn the vertical axis are genes which are differentially expressed at the 0.01 level\nOn the horizontal axis are samples\nWe can see that the FGF-treated samples cluster together and the control samples cluster together\nWe can also see two clusters of genes; one of these shows genes upregulated (more yellow) in the FGF-treated samples and the other shows genes downregulated (more blue) in the FGF-treated samples"
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#pca-3",
+    "href": "omics/week-5/study_before_workshop.html#pca-3",
+    "title": "Independent Study to prepare for workshop",
+    "section": "PCA",
+    "text": "PCA\n\nPCA is a solution for this - It takes a large number of continuous variables (like gene expression) and reduces them to a smaller number of “principal components” that explain most of the variation in the data.\n\n\n\n\n\n\nSamples\n\n\n\n\n\n\nCells\n\n\n\n\n\nWe have done PCA in Omics 3, but often PCA might be one of the first exploratory steps because it gives you an idea whether you expect general patterns in gene expression that distinguish groups."
+  },
+  {
+    "objectID": "omics/week-5/study_before_workshop.html#section-1",
+    "href": "omics/week-5/study_before_workshop.html#section-1",
+    "title": "Independent Study to prepare for workshop",
+    "section": "",
+    "text": "Heat map for the frog data\n\n\nSee next slide for information"
   }
 ]
\ No newline at end of file
diff --git a/omics/crib/cont-fgf-s30.R b/omics/crib/cont-fgf-s30.R
index c648943..18e4bb4 100644
--- a/omics/crib/cont-fgf-s30.R
+++ b/omics/crib/cont-fgf-s30.R
@@ -458,50 +458,20 @@ s30_log2_trans <- s30_results |>
 colnames(s30_log2_trans) <- s30_results$xenbase_gene_id
 
 # just for indep study before
-# s30_log2_trans$sample <- row.names(s30_log2_trans)
-# a <- s30_log2_trans |> ggplot(aes(x = `XB-GENE-1000007`, 
+# sample_id <- row.names(s30_log2_trans) |> str_remove("log2_")
+# fig <- s30_log2_trans |> ggplot(aes(x = `XB-GENE-1000007`,
 #                              y = `XB-GENE-1000023`)) +
 #   geom_point() +
-#   geom_text(aes(label = sample), 
+#   geom_text(aes(label = sample_id),
 #             vjust = -1, size = 3) +
 #   scale_x_continuous(expand = c(0.05,0.05)) +
 #   scale_y_continuous(expand = c(0.05,0.05)) +
 #   theme_classic()
 # 
 # 
-# b <- s30_log2_trans |> ggplot(aes(x = `XB-GENE-1000062`, 
-#                                   y = `XB-GENE-1000072`)) +
-#   geom_point() +
-#   geom_text(aes(label = sample), 
-#             vjust = -1, size = 3) +
-#   scale_x_continuous(expand = c(0.05,0.05)) +
-#   scale_y_continuous(expand = c(0.05,0.05)) +
-#   theme_classic()
-# 
-# c <- s30_log2_trans |> ggplot(aes(x = `XB-GENE-1000113`, 
-#                                   y = `XB-GENE-1000132`)) +
-#   geom_point() +
-#   geom_text(aes(label = sample), 
-#             vjust = -1, size = 3) +
-#   scale_x_continuous(expand = c(0.05,0.05)) +
-#   scale_y_continuous(expand = c(0.05,0.05)) +
-#   theme_classic()
-# 
-# d <- s30_log2_trans |> ggplot(aes(x = `XB-GENE-1000149`, 
-#                                   y = `XB-GENE-1000251`)) +
-#   geom_point() +
-#   geom_text(aes(label = sample), 
-#             vjust = -1, size = 3) +
-#   scale_x_continuous(expand = c(0.05,0.05)) +
-#   scale_y_continuous(expand = c(0.05,0.05)) +
-#   theme_classic()
-# 
-# library(patchwork)
-# fig <- (a + b) / (c + d)
-# 
-# ggsave("omics/week-5/images/why_pca.png", 
+# ggsave("omics/week-5/images/why_pca_frog.png",
 #        plot = fig,
-#        width = 6, height = 6)
+#        width = 4, height = 4)
 
 # perform PCA using standard functions
 pca <- s30_log2_trans |>
diff --git a/omics/crib/hspc-prog.R b/omics/crib/hspc-prog.R
index 637a7c4..bd2cf4c 100644
--- a/omics/crib/hspc-prog.R
+++ b/omics/crib/hspc-prog.R
@@ -509,6 +509,30 @@ prog_hspc_trans <- prog_hspc_results |>
 
 colnames(prog_hspc_trans) <- prog_hspc_results$ensembl_gene_id
 
+# just for indep study before
+# prog_hspc_trans$cell_id <- row.names(prog_hspc_trans)
+# prog_hspc_trans <- prog_hspc_trans |> 
+#   extract(cell_id, 
+#           remove = FALSE,
+#           c("cell_type", "cell_number"),
+#           "([a-zA-Z]{4})_([0-9]{3})")
+# 
+# fig <- prog_hspc_trans |> ggplot(aes(x = ENSMUSG00000028639,
+#                              y = ENSMUSG00000024053, colour = cell_type)) +
+#   geom_point() +
+#   # geom_text(aes(label = cell_id),
+#   #           vjust = -1, size = 3) +
+#   scale_x_continuous(expand = c(0.05,0.05)) +
+#   scale_y_continuous(expand = c(0.05,0.05)) +
+#   theme_classic() +
+# theme(legend.position = "none")
+# 
+# 
+# ggsave("omics/week-5/images/why_pca_mouse.png",
+#        plot = fig,
+#        width = 4, height = 4)
+
+
 # perform PCA using standard functions
 pca <- prog_hspc_trans |>
   prcomp(scale. = TRUE) 
@@ -625,3 +649,16 @@ ggsave("omics/week-5/figures/prog-hspc-volcano.png",
        units = "in",
        device = "png")
 
+
+# # just for the independent study slides
+# vol <- prog_hspc_results |> 
+#   ggplot(aes(x = summary.logFC, 
+#              y = FDR)) +
+#   geom_point() +
+#   theme_classic() 
+# ggsave("omics/week-5/images/volcano-why.png",
+#        plot = vol,
+#        height = 4.5, 
+#        width = 4.5,
+#        units = "in",
+#        device = "png")
\ No newline at end of file
diff --git a/omics/week-5/images/frog-heat.png b/omics/week-5/images/frog-heat.png
new file mode 100644
index 0000000..8cf1b5f
Binary files /dev/null and b/omics/week-5/images/frog-heat.png differ
diff --git a/omics/week-5/images/volcano-why.png b/omics/week-5/images/volcano-why.png
new file mode 100644
index 0000000..26b786b
Binary files /dev/null and b/omics/week-5/images/volcano-why.png differ
diff --git a/omics/week-5/images/why_pca.png b/omics/week-5/images/why_pca.png
deleted file mode 100644
index dd45e6c..0000000
Binary files a/omics/week-5/images/why_pca.png and /dev/null differ
diff --git a/omics/week-5/images/why_pca_frog.png b/omics/week-5/images/why_pca_frog.png
new file mode 100644
index 0000000..14bb9f6
Binary files /dev/null and b/omics/week-5/images/why_pca_frog.png differ
diff --git a/omics/week-5/images/why_pca_mouse.png b/omics/week-5/images/why_pca_mouse.png
new file mode 100644
index 0000000..04ef843
Binary files /dev/null and b/omics/week-5/images/why_pca_mouse.png differ
diff --git a/omics/week-5/overview.qmd b/omics/week-5/overview.qmd
index 872786e..80982a4 100644
--- a/omics/week-5/overview.qmd
+++ b/omics/week-5/overview.qmd
@@ -14,12 +14,11 @@ We suggest you sit together with your group in the workshop.
 The successful student will be able to:
 
 -   verify they have the required RStudio Project set up and the data and code files from the previous Workshop and Consolidation study
--   explain 
--   
--   
--   
--   
--   ,
+-   explain where gene information came from and add it to their results
+-   perform a PCA and understand how to interpret them
+-   create a heatmap and understand how to interpret them
+-   create a volcano plot and understand how to interpret them
+  
 
 ### Instructions
 
@@ -29,11 +28,13 @@ The successful student will be able to:
 
 2.  [Workshop](workshop.qmd)
 
-    i.  💻 ....
+    i.  💻 Add gene information to the results of DE
 
-    ii. 💻 ...
+    ii. 💻 Perform and plot a PCA
 
-    iii. 💻 ....
+    iii. 💻 Visualise results with a heatmap
+    
+    iv. 💻 Visualise all the results with a volcano plot
 
     iv. Look after future you!
 
diff --git a/omics/week-5/study_before_workshop.qmd b/omics/week-5/study_before_workshop.qmd
index 3795d28..68198fd 100644
--- a/omics/week-5/study_before_workshop.qmd
+++ b/omics/week-5/study_before_workshop.qmd
@@ -21,7 +21,6 @@ library(tidyverse)
 
 ```
 
-
 ## Overview
 
 In these slides we will:
@@ -43,14 +42,11 @@ In these slides we will:
 ## What we did in Omics 2: Statistical Analysis
 
 ::: incremental
-
-
 -   carried out differential expression analysis
 
 -   found genes not expressed at all, or expressed in one group only
 
 -   Saved results files
-
 :::
 
 ## Where should you be?
@@ -67,16 +63,15 @@ Workshop](../week-4/workshop.html) including:
 ## 🐸 Frogs
 
 ::: {style="font-size: 70%;"}
-
 -   An RStudio Project called `frogs-88H` which contains:
     -   Raw data (S14, S20 and S30)
     -   Processed data (`s30_filtered.csv`, `s30_summary_gene.csv`,
         `s30_summary_gene_filtered.csv`, `s30_summary_samp.csv` and
         equivalents for S14 *OR* S20)
-    -   Results files (`s30_fgf_only.csv`, `S30_normalised_counts.csv`, `S30_results.csv` and
-        equivalents for S14 *OR* S20)    
-    -   Two scripts called `cont-fgf-s30.R` and either `cont-fgf-s20.R` *OR*
-        `cont-fgf-s14.R`
+    -   Results files (`s30_fgf_only.csv`, `S30_normalised_counts.csv`,
+        `S30_results.csv` and equivalents for S14 *OR* S20)\
+    -   Two scripts called `cont-fgf-s30.R` and either `cont-fgf-s20.R`
+        *OR* `cont-fgf-s14.R`
 :::
 
 Files should be organised into folders. Code should well commented and
@@ -85,14 +80,14 @@ easy to read.
 ## 🐭 Mice
 
 ::: {style="font-size: 70%;"}
-
 -   An RStudio Project called `mice-88H` which contains
     -   Raw data (hspc, prog, lthsc)
     -   Processed data (`hspc_summary_gene.csv`,
         `hspc_summary_samp.csv`, `prog_summary_gene.csv`,
         `prog_summary_samp.csv`, `lthsc_summary_gene.csv`,
         `lthsc_summary_samp.csv`)
--   Results files (`prog_hspc_results.csv` and an equivalent for lthsc vs prog or hspc vs lthsc)
+-   Results files (`prog_hspc_results.csv` and an equivalent for lthsc
+    vs prog or hspc vs lthsc)
 -   Two scripts called `hspc-prog.R` and either `hspc-lthsc.R` *OR*
     `prog-lthsc.R`
 :::
@@ -122,12 +117,11 @@ Go through:
 
 Remind yourself of the key columns you have in the results files:
 
--   a fold change, logged to base 2
--   an unadjusted p-value
--   a p value adjusted for multiple testing (`FDR` or `padj`)
+-   a log~2~ fold change
+-   an unadjusted *p*-value
+-   a *p* value adjusted for multiple testing (`FDR` or `padj`)
 -   a gene id
 
-
 ## 🐸 Frogs
 
 ```{r}
@@ -135,15 +129,17 @@ Remind yourself of the key columns you have in the results files:
 read_csv("results/s30_results.csv") |> glimpse()
 
 ```
+
+. . .
+
 -   `baseMean` is the mean of the normalised counts for the gene across
     all samples
 -   `lfcSE` standard error of the fold change
 -   `stat` is the test statistic (the Wald statistic)
-
-
-
+-   Generated by **`DESeq2`** [@DESeq2]
 
 ## 🐭 Mice
+
 ```{r}
 #| echo: false
 read_csv("results/prog_hspc_results.csv") |> glimpse()
@@ -151,123 +147,262 @@ read_csv("results/prog_hspc_results.csv") |> glimpse()
 
 ```
 
--   Top is the rank of the gene ordered by the p value (smallest first)
-
-
+. . .
 
+-   Top is the rank of the gene ordered by the *p*-value (smallest
+    first)
+-   `summary.logFC` and `logFC.hspc` give the same value (in this case
+    since comparing two cell types)
+-   generated by **`scran`** [@scran]
 
 # Adding gene information
 
 ## Adding gene information
 
 ::: incremental
-
 -   The gene id is difficult to interpret in plots/tables
 
--   Therefore we need to add information such as the gene name and a description to the results
-
--   For the 🐸 Frog data information comes from xenbase
+-   Therefore we need to add information such as the gene name and a
+    description to the results
 
--   For the 🐭 Mice data information comes from Ensembl 
+-   For the 🐸 Frog data information comes from Xenbase [@fisher2023]
 
+-   For the 🐭 Mice data information comes from Ensembl [@birney2004]
 :::
 
 ## 🐸 Xenbase
 
-
 ![xenbase logo](images/Xenbase-Logo-Medium.png){width="800"}
 
+[Xenbase](http://www.xenbase.org/) is a model organism database that
+provides genomic, molecular, and developmental biology information about
+*Xenopus laevis* and *Xenopus tropicalis*.
 
-[Xenbase](http://www.xenbase.org/) is a model organism database that provides genomic, molecular, and developmental biology information about *Xenopus laevis* and *Xenopus tropicalis*. 
-
-. . . 
+. . .
 
 It took me some time to find the information you need.
 
-
 ## 🐸 Xenbase
 
 ::: incremental
+-   I got the information from the [Xenbase information
+    pages](https://www.xenbase.org/xenbase/static-xenbase/ftpDatafiles.jsp)
+    under Data Reports \| Gene Information
 
--   I got the information from the [Xenbase information pages](https://www.xenbase.org/xenbase/static-xenbase/ftpDatafiles.jsp) under Data Reports | Gene Information
-
--   This is listed:  Xenbase Gene Product Information [readme] [gzipped gpi (tab separated)](https://download.xenbase.org/xenbase/GenePageReports/xenbase.gpi.gz) 
+-   This is listed: Xenbase Gene Product Information \[readme\] [gzipped
+    gpi (tab
+    separated)](https://download.xenbase.org/xenbase/GenePageReports/xenbase.gpi.gz)
 
--   Click on the readme link to see the file format and columns  
+-   Click on the readme link to see the file format and columns
 
--   I downloaded [xenbase.gpi.gz](https://download.xenbase.org/xenbase/GenePageReports/xenbase.gpi.gz), unzipped it, removed header lines and the *Xenopus tropicalis* (taxon:8364) entries and saved it as [xenbase_info.xlsx](meta/xenbase_info.xlsx)
+-   I downloaded
+    [xenbase.gpi.gz](https://download.xenbase.org/xenbase/GenePageReports/xenbase.gpi.gz),
+    unzipped it, removed header lines and the *Xenopus tropicalis*
+    (taxon:8364) entries and saved it as
+    [xenbase_info.xlsx](meta/xenbase_info.xlsx)
 
--   In the workshop you will merge this information with the results file
+-   In the workshop you will import this file and merge the information
+    with the results file
 :::
 
 ## 🐭 Ensembl
 
 ::: incremental
+-   [Ensembl](https://www.ensembl.org/index.html) creates, integrates
+    and distributes reference datasets and analysis tools that enable
+    genomics
+
+-   [BioMart](https://grch37.ensembl.org/info/data/biomart/index.html)
+    provides a access to these large datasets
+
+-   **`biomaRt`** [@biomaRt] is a Bioconductor package gives you
+    programmatic access to BioMart.
+
+-   In the workshop you use this package to get information you can
+    merge with the results file
+:::
+
+# Plots
+
+## What is the purpose of an Omics plot?
+
+::: incremental
+-   In general, we plot data to help us summarise and understand it
+
+-   This is especially import for omics data where we have a very large
+    number of variables and often a large number of observations
+
+-   We will look at three plots very commonly used in omics analysis:
+    Principal Component Analysis (PCA) plot, Heatmaps and Volcano Plots
+:::
+
+# Principal Component Analysis (PCA)
+
+## PCA
+
+::: incremental
+-   Principal Component Analysis is an unsupervised machine learning
+    technique
+
+-   Unsupervised methods[^1] are unsupervised in that they do not
+    use/optimise to a particular output. The goal is to uncover
+    structure. They do not test hypotheses
+
+-   It is often used to visualise high dimensional data because it is a
+    dimension reduction technique
+:::
+
+[^1]: You may wish to read a previous introduction to unsupervised
+    methods I have written [An introduction to Machine Learning:
+    Unsupervised
+    methods](https://3mmarand.github.io/BIO00058M-Data-science-2020/slides/05_intro_to_ML_unsupervised.html#1)
+    [@rand2021]
+
+## PCA
+
+::: incremental
+-   It takes a large number of continuous variables (like gene
+    expression) and reduces them to a smaller number of variables
+    (called principal components) that explain most of the variation in
+    the data
 
--   [Ensembl](https://www.ensembl.org/index.html) creates, integrates and distributes reference datasets and analysis tools that enable genomics 
+-   The principal components can be plotted to see how samples cluster
+    together
+:::
 
--   [BioMart](https://grch37.ensembl.org/info/data/biomart/index.html)  provides a access to these large datasets
+## PCA
+::: {style="font-size: 70%;"}
+-   To see if samples cluster as we would expect, we might plot the
+    expression of one gene against another
 
--   **`biomaRt`** is a Bioconductor package gives you programmatic access to BioMart.
+::: {layout-ncol="2"}
+![Samples](images/why_pca_frog.png){width="300"}
 
--   In the workshop you use this package to get information you can merge with the results file
+![Cells](images/why_pca_mouse.png){width="300"}
 :::
 
+This gives some insight but we have 280 (mice) or 10,000+(frogs) genes
+to consider. How do we know if the pair we use is typical? How can we
+consider al the genes at once?
 
+:::
 
-# Plots 
+## PCA
 
-## plots purpose
+::: {style="font-size: 70%;"}
+-   PCA is a solution for this - It takes a large number of continuous
+    variables (like gene expression) and reduces them to a smaller
+    number of "principal components" that explain most of the variation
+    in the data.
 
-dimsenion reduction
+::: {layout-ncol="2"}
 
-## pca
+![Samples](figures/frog-s30-pca.png){width="300"}
 
-lots of variables 
+![Cells](figures/prog_hspc-pca.png){width="300"}
+:::
 
-## tsne
 
-lots of variables and lots of observations
+:::
+## PCA
 
+We have done PCA in Omics 3, but often PCA might be one of the first
+exploratory steps because it gives you an idea whether you expect
+general patterns in gene expression that distinguish groups.
 
+# Heatmaps
 
-# normalsing before plotting
+## Heatmaps
 
-## normalising before plotting
+::: incremental
+-   are a grid of genes on one axis and samples on the other with each
+    grid cell coloured by another variable
 
-log 
+-   in this case the other variable is gene expression
 
-normalisation
-regularised log is a method to bias from low count genes.
-https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/03_DGE_QC_analysis.html
+-   they allow you to quickly get an overview of the expression patterns
+    across genes and samples
+
+-   we often couple them with clustering to group genes and samples with
+    similar expression patterns together which helps us see which genes
+    are responsible for distinguishing groups
+:::
 
+## 
 
+![Heat map for the frog data](images/frog-heat.png){height="800"}
+
+See next slide for information
+
+## Heatmaps
 
 ::: incremental
--   T
+-   On the vertical axis are genes which are differentially expressed at
+    the 0.01 level
+
+-   On the horizontal axis are samples
+
+-   We can see that the FGF-treated samples cluster together and the
+    control samples cluster together
+
+-   We can also see two clusters of genes; one of these shows genes
+    upregulated (more yellow) in the FGF-treated samples and the other
+    shows genes downregulated (more blue) in the FGF-treated samples
+:::
+
+# Volcano plots
+
+## Volcano plots
+
+::: incremental
+-   Volcano plots often used to visualise the results of differential
+    expression analysis
+
+-   They are just a scatter of the corrected p value against the fold
+    change....
+
+-   almost - the we actually plot the negative log of the corrected p
+    value against the fold change
+:::
+
+## Volcano plots
+
+::: {style="font-size: 70%;"}
+-   This is because just plotting the *p*-value means the axis is
+    counter intuitive. Small *p*-values (i.e., significant values) are
+    at the bottom of the axis)
+
+-   And since *p*-values range from 1 to very tiny the points are all
+    squashed at the bottom of the axis
 :::
 
-rlog is a method to bias from low count genes.
-https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/03_DGE_QC_analysis.html
-gives a good explanation of regularized the log transform (rlog)
+![Volcano plot FDR against fold change](images/volcano-why.png)
 
-The rlog transformation of the normalized counts is only necessary for
-these visualization methods during this quality assessment. They are not
-used for DE because DESeq2 takes care of that
+## Volcano plots
 
-in the workshop we just to log transformed
+::: {style="font-size: 70%;"}
+-   Plotting the negative log of the corrected *p*-value means that the
+    values are spread out and the significant values are at the top of
+    the axis
+:::
+
+![Volcano plot -log(FDR) against fold
+change](figures/prog-hspc-volcano.png)
 
--   The 🐭 mouse data have been normalised to simplify the analysis for
-    you; the 🐸 frog data have not but the DE method will do this for
-    you.
-   
+## Visualisations
 
+-   Should be done on normalised data so meaningful comparisons can be
+    made
 
+-   The 🐭 mouse data were already log~2~normalised
 
+-   The 🐸 frog data were normalised by the DE method and saved to file.
+    We will log~2~ transform before doing visualisations
 
 ## Packages to install before the workshop
 
-**`heatmaply`** **`ggrepel`** from CRAN in the the normal way:
+**`heatmaply`** [@heatmapply] and **`ggrepel`** [@ggrepel] from CRAN in
+the the normal way:
 
 ```{r}
 #| eval: false
@@ -277,15 +412,14 @@ install.packages("ggrepel")
 
 ```
 
+**`biomaRt`** [@biomaRt] from Bioconductor using **`BiocManager`**
+[@BiocManager]
 
-**`biomaRt`** from Bioconductor using **`BiocManager`**:
 ```{r}
 #| eval: false
 BiocManager::install("biomaRt")
 ```
 
-
-
 # Workshops
 
 ## Workshops
diff --git a/omics/week-5/workshop.qmd b/omics/week-5/workshop.qmd
index f1bc095..69ee793 100644
--- a/omics/week-5/workshop.qmd
+++ b/omics/week-5/workshop.qmd
@@ -136,7 +136,7 @@ The [xenbase.gpi.gz](https://download.xenbase.org/xenbase/GenePageReports/xenbas
 If you click on the readme link you can see information telling you that the file is in the Gene Product information 2.1 format and is provided with gzip compression. gene product information for both 
 *Xenopus tropicalis* (taxon:8364) and *Xenopus laevis* (taxon:8355)
 
-🎬 ......
+🎬 If you want to 
 
 ```bash
 gunzip xenbase.gpi.gz
@@ -346,6 +346,7 @@ n_gene_clusters <- 2
 ```
 
 ```{r}
+#| fig-height: 8
 heatmaply(mat, 
           scale = "row",
           hide_colorbar = TRUE,
@@ -361,7 +362,7 @@ heatmaply(mat,
 
 
 
-## Visual all the results with a volcano plot
+## Visualise all the results with a volcano plot
 
 
 colour the points if padj < 0.05
@@ -706,7 +707,7 @@ separation is not as strong as for the frog data
 run a few times to see different subset
 
 
-## Visual all the results with a volcano plot
+## Visualise all the results with a volcano plot
 
 colour the points if FDR < 0.05
 and prog_hspc_results > 1
diff --git a/references.bib b/references.bib
index 52f2ca1..fa3ab82 100644
--- a/references.bib
+++ b/references.bib
@@ -476,3 +476,67 @@ @article{benjamini1995
 	url = {http://www.jstor.org/stable/2346101},
 	note = {Publisher: [Royal Statistical Society, Wiley]}
 }
+
+@article{fisher2023,
+	title = {Xenbase: key features and resources of the Xenopus model organism knowledgebase},
+	author = {Fisher, Malcolm and James-Zorn, Christina and Ponferrada, Virgilio and Bell, Andrew J and Sundararaj, Nivitha and Segerdell, Erik and Chaturvedi, Praneet and Bayyari, Nadia and Chu, Stanley and Pells, Troy and Lotay, Vaneet and Agalakov, Sergei and Wang, Dong Zhuo and Arshinoff, Bradley I and Foley, Saoirse and Karimi, Kamran and Vize, Peter D and Zorn, Aaron M},
+	year = {2023},
+	month = {05},
+	date = {2023-05-02},
+	journal = {Genetics},
+	pages = {iyad018},
+	volume = {224},
+	number = {1},
+	doi = {10.1093/genetics/iyad018},
+	url = {https://doi.org/10.1093/genetics/iyad018}
+}
+
+@article{birney2004,
+	title = {An Overview of Ensembl},
+	author = {Birney, Ewan and Andrews, T. Daniel and Bevan, Paul and Caccamo, Mario and Chen, Yuan and Clarke, Laura and Coates, Guy and Cuff, James and Curwen, Val and Cutts, Tim and Down, Thomas and Eyras, Eduardo and Fernandez-Suarez, Xose M. and Gane, Paul and Gibbins, Brian and Gilbert, James and Hammond, Martin and Hotz, Hans-Rudolf and Iyer, Vivek and Jekosch, Kerstin and Kahari, Andreas and Kasprzyk, Arek and Keefe, Damian and Keenan, Stephen and Lehvaslaiho, Heikki and McVicker, Graham and Melsopp, Craig and Meidl, Patrick and Mongin, Emmanuel and Pettett, Roger and Potter, Simon and Proctor, Glenn and Rae, Mark and Searle, Steve and Slater, Guy and Smedley, Damian and Smith, James and Spooner, Will and Stabenau, Arne and Stalker, James and Storey, Roy and Ureta-Vidal, Abel and Woodwark, K. Cara and Cameron, Graham and Durbin, Richard and Cox, Anthony and Hubbard, Tim and Clamp, Michele},
+	year = {2004},
+	month = {05},
+	date = {2004-05},
+	journal = {Genome Research},
+	pages = {925--928},
+	volume = {14},
+	number = {5},
+	doi = {10.1101/gr.1860604},
+	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC479121/},
+	note = {PMID: 15078858
+PMCID: PMC479121}
+}
+
+@article{biomaRt,
+	title = {Mapping identifiers for the integration of genomic datasets with the R/Bioconductor package biomaRt},
+	author = {Durinck, Steffen and Spellman, Paul T. and Birney, Ewan and Huber, Wolfgang},
+	year = {2009},
+	date = {2009},
+	volume = {4}
+}
+@book{rand2021,
+	title = {Data Science strand of BIO00058M},
+	author = {Rand, Emma},
+	year = {2021},
+	month = {09},
+	date = {2021-09},
+	doi = {10.5281/zenodo.5527705},
+	url = {https://github.com/3mmaRand/BIO00058M-Data-science-2020}
+}
+
+  @Manual{BiocManager,
+    title = {BiocManager: Access the Bioconductor Project Package Repository},
+    author = {Martin Morgan and Marcel Ramos},
+    year = {2023},
+    note = {R package version 1.30.22},
+    url = {https://bioconductor.github.io/BiocManager/},
+  }
+  
+  @Manual{ggrepel,
+    title = {ggrepel: Automatically Position Non-Overlapping Text Labels with
+'ggplot2'},
+    author = {Kamil Slowikowski},
+    year = {2023},
+    note = {R package version 0.9.4},
+    url = {https://github.com/slowkow/ggrepel},
+  }
\ No newline at end of file