From 77fedca96fcfad82d7434e120c1b0613dc8366e3 Mon Sep 17 00:00:00 2001
From: daikitag <48062118+daikitag@users.noreply.github.com>
Date: Thu, 8 Feb 2024 12:26:17 +0000
Subject: [PATCH] INFRA: Verification

Statistical tests against external simulators
---
 data/simulate_AlphaSimR.R        |  75 ++++++++++
 data/simulate_simplePHENOTYPES.R |  69 ++++++++++
 data/simulation_output.md        |  15 ++
 verification.py                  | 230 +++++++++++++++++++++++++++++++
 4 files changed, 389 insertions(+)
 create mode 100644 data/simulate_AlphaSimR.R
 create mode 100644 data/simulate_simplePHENOTYPES.R
 create mode 100644 data/simulation_output.md

diff --git a/data/simulate_AlphaSimR.R b/data/simulate_AlphaSimR.R
new file mode 100644
index 0000000..351f6bb
--- /dev/null
+++ b/data/simulate_AlphaSimR.R
@@ -0,0 +1,75 @@
+if (!require("AlphaSimR")) install.packages("AlphaSimR", repos='http://cran.us.r-project.org')
+library(AlphaSimR)
+if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org')
+library(reticulate)
+if (!require("tidyr")) install.packages("tidyr", repos='http://cran.us.r-project.org')
+library(tidyr)
+
+# This code is used from verification.py to simulate quantitative traits
+# by using AlphaSimR.
+#
+# The basic simulation step is the following:
+# 1. Simulate tree sequence by using msprime
+# 2. Use the tskit Python package through the R package tskit and load it
+#   as a founder population in AlphaSimR
+#   The codes of this step are largely adapted from
+#   https://github.com/gaynorr/AlphaSimR_Examples/blob/master/misc/msprime.R
+# 3. Simulate quantitative traits of the founder population in AlphaSimR
+#  This can be achieved by setting additive trait through addTraitA
+#  and extract its genetic value information through gv and phenotype
+#  information through pheno
+# Note: Please refer to the AlphaSimR documentation for the details
+# of the simulation code written for AlphaSimR
+
+# The commandline input has 7 elements
+# [mean, var, num_rep, num_causal, random_seed, tree_filename, output_filename]
+myArgs <- commandArgs(trailingOnly = TRUE)
+# Convert to numerics
+mean <- as.numeric(myArgs[1])
+var <- as.numeric(myArgs[2])
+num_rep <- as.numeric(myArgs[3])
+num_causal <- as.numeric(myArgs[4])
+random_seed <- as.numeric(myArgs[5])
+tree_filename <- myArgs[6]
+output_filename <- myArgs[7]
+
+tskit = import("tskit")
+filename = paste0("data/treeseq_simulation/",
+                  tree_filename,".tree")
+
+ts = tskit$load(filename)
+sites = ts$tables$sites$asdict()
+pos = sites$position * 1e-8 # Convert to Morgans
+pos = pos - pos[1] # Set first position to zero
+
+# Extract haplotypes
+haplo = t(ts$genotype_matrix())
+
+# Create an AlphaSimR founder population
+founderPop = newMapPop(genMap=list(pos), haplotypes=list(haplo))
+
+num_ind <- nrow(haplo) / 2
+phenotype_matrix <- matrix(0, nrow=num_rep, ncol=num_ind)
+
+for (i in 1:num_rep) {
+  # Setting the parameters of the simulation
+  # Quantitative trait simulation is performed here when we set the
+  # parameters
+  SP = SimParam$
+    new(founderPop)$
+    addTraitA(
+      nQtlPerChr=num_causal,
+      mean=mean,
+      var=var
+    )$
+    setVarE(H2=1)
+  
+  individuals = newPop(founderPop)
+  phenotype_matrix[i,] = individuals@pheno[, 1]
+}
+
+df <- as.data.frame(phenotype_matrix)
+names(df) <- 0:(num_ind-1)
+df <- gather(df, key="individual_id", value="phenotype")
+filename = paste0("data/AlphaSimR/",output_filename,".csv")
+write.csv(df, filename, row.names=FALSE)
diff --git a/data/simulate_simplePHENOTYPES.R b/data/simulate_simplePHENOTYPES.R
new file mode 100644
index 0000000..81489ee
--- /dev/null
+++ b/data/simulate_simplePHENOTYPES.R
@@ -0,0 +1,69 @@
+if (!require("simplePHENOTYPES")) install.packages("simplePHENOTYPES", repos='http://cran.us.r-project.org')
+library(simplePHENOTYPES)
+if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org')
+library(reticulate)
+
+# This code is used from verification.py to simulate quantitative traits
+# by using simplePHENOTYPES.
+
+# This code loads the vcf file that is generated in `verification.py`
+# and uses the effect size from a normal distribution to simulate
+# additive traits.
+
+# The commandline input has 7 elements
+# [mean, var, num_rep, num_causal, random_seed, vcf_filename, output_filename]
+myArgs <- commandArgs(trailingOnly = TRUE)
+mean <- as.numeric(myArgs[1])
+var <- as.numeric(myArgs[2])
+num_rep <- as.numeric(myArgs[3])
+num_causal <- as.numeric(myArgs[4])
+random_seed <- as.numeric(myArgs[5])
+vcf_filename <- myArgs[6]
+output_filename <- myArgs[7]
+
+set.seed(random_seed)
+
+# Function to simulate phenotypes from simplePHENOTYPES
+# The effect sizes are simulated from a normal distribution,
+# as the geometric series is the only effect size distribution
+# supported in simplePHENOTYPES.
+simulate_simplePHENOTYPE <- function(
+    mean, var, num_causal, vcf_filename, random_seed
+    ) {
+  effect_size <- list(rnorm(n=num_causal, mean=mean, sd=sqrt(var)))
+  phenotypes <- suppressMessages(create_phenotypes(
+    geno_file = paste0("data/treeseq_simulation/", vcf_filename, ".vcf"),
+    add_QTN_num = num_causal,
+    add_effect = effect_size,
+    rep = 1,
+    h2 = 1,
+    model = "A",
+    seed = random_seed,
+    vary_QTN = FALSE,
+    to_r = TRUE,
+    sim_method = "custom",
+    quiet = TRUE,
+    home_dir = getwd(),
+    output_dir="data/simplePHENOTYPES/simplePhenotypes_output",
+    verbose = FALSE,
+    mean = 0
+  ))
+  # The mean is centered at 0 from simplePHENOTYPES simulation
+  # so we will divide it by the standard deviation to normalise
+  # the data
+  phenotypes$Heritability_1_Rep_1 <- phenotypes$Heritability_1_Rep_1 / sd(phenotypes$Heritability_1_Rep_1)
+  return(phenotypes)
+}
+
+phenotype_result <- c()
+
+for (i in 1:num_rep) {
+  simulated_result <- simulate_simplePHENOTYPE(
+    mean=mean, var=var, num_causal=num_causal,
+    vcf_filename=vcf_filename, random_seed=random_seed+i
+  )
+  phenotype_result <- rbind(phenotype_result, simulated_result)
+}
+
+filename = paste0("data/simplePHENOTYPES/",output_filename,".csv")
+write.csv(phenotype_result, filename, row.names=FALSE)
diff --git a/data/simulation_output.md b/data/simulation_output.md
new file mode 100644
index 0000000..967aa0e
--- /dev/null
+++ b/data/simulation_output.md
@@ -0,0 +1,15 @@
+# Simulation Output
+
+This 
+This directory is used to store the R simulation codes and save the simulation output.
+
+R simulation codes:
+
+- simulate_AlphaSimR.R: This R code uses AlphaSimR to simulate quantitative traits based on a simulated tree sequence data.
+- simulate_simplePHENOTYPES.R: This R code uses simplePHENOTYPES to simulate quantitative traits based on a VCF file.
+
+Directories:
+
+- AlphaSimR: This directory is used to save the simulated output of AlphaSimR.
+- simplePHENOTYPES: This directory is used to save the simulated output of simplePHENOYTPES.
+- treeseq_simulation: This directory is used to save the simulated output of msprime.
\ No newline at end of file
diff --git a/verification.py b/verification.py
index 79a2a6e..289975f 100644
--- a/verification.py
+++ b/verification.py
@@ -10,6 +10,7 @@
 import inspect
 import logging
 import pathlib
+import subprocess
 import sys
 from collections import namedtuple
 
@@ -18,6 +19,7 @@
 import matplotlib
 import msprime
 import numpy as np
+import pandas as pd
 import scipy
 import seaborn as sns
 import tqdm
@@ -71,6 +73,234 @@ def _plot_qq_compare(self, data1, data1_name, data2, data2_name):
         pyplot.savefig(f, dpi=72)
         pyplot.close("all")
 
+        sns.kdeplot(data1, color="b", fill=True, legend=False)
+        pyplot.xlabel(data1_name)
+        f = self._build_filename(data1_name, "histogram")
+        pyplot.savefig(f, dpi=72)
+        pyplot.close("all")
+
+        sns.kdeplot(data2, color="b", fill=True, legend=False)
+        pyplot.xlabel(data2_name)
+        f = self._build_filename(data2_name, "histogram")
+        pyplot.savefig(f, dpi=72)
+        pyplot.close("all")
+
+class ComparisonTest(Test):
+    """
+    Compare tstrait against simplePHENOTYPES, AlphaSimR and the simulation framework
+    in ARG-Needle paper. We assume an infinite sites model and we also assume that
+    all sites are causal.
+    """
+
+    def _simulate_tstrait(self, ts, num_rep, alpha=0, random_seed=None):
+        """
+        Simulates trait and phenotype information based on a simulated tree
+        sequence and return the simulated phenotypes that are normalised.
+        """
+
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        phenotype_df_list = []
+        for i in range(num_rep):
+            sim_result = tstrait.sim_phenotype(
+                ts=ts,
+                num_causal=ts.num_sites,
+                model=model,
+                h2=1,
+                alpha=alpha,
+                random_seed=random_seed + i,
+            )
+            phenotype_df = sim_result.phenotype
+            phenotype_df = tstrait.normalise_phenotypes(phenotype_df)
+            phenotype_df_list.append(phenotype_df)
+
+        phenotype_result = pd.concat(phenotype_df_list).reset_index(drop=True)
+
+        return phenotype_result
+
+    def _simulate_simplePHENOTYPE(self, ts, num_rep, random_seed):
+        """
+        Saves a tree sequence file as VCF and simulates quantitative traits.
+        The output is a phenotype dataframe with 2 columns, individual ID and
+        phenotype.
+        The values in the individual ID column is the same as the simulated
+        output of tstrait.
+        """
+
+        vcf_filename = "vcf_comparison_simplePHENOTYPES"
+        output_filename = "phenotype_comparison_simplePHENOTYPES"
+        with open(f"data/treeseq_simulation/{vcf_filename}.vcf", "w") as vcf_file:
+            ts.write_vcf(vcf_file)
+        num_causal = ts.num_sites
+        cmd = ["Rscript", "data/simulate_simplePHENOTYPES.R"]
+        mean = 0
+        var = 1
+        args = [
+            str(mean),
+            str(var),
+            str(num_rep),
+            str(num_causal),
+            str(random_seed),
+            vcf_filename,
+            output_filename,
+        ]
+        input_cmd = cmd + args
+        subprocess.check_output(input_cmd)
+
+        phenotype_df = pd.read_csv(f"data/simplePHENOTYPES/{output_filename}.csv")
+        # Change the column names to be consistent with the tstrait output
+        phenotype_df = phenotype_df.rename(
+            columns={"<Trait>": "individual_id", "Heritability_1_Rep_1": "phenotype"}
+        )
+
+        num_ind = ts.num_individuals
+        # Change the individual ID in simplePHENOTYPES output to be consistent with the
+        # tstrait output
+        for i in range(num_ind):
+            phenotype_df = phenotype_df.replace(f"tsk_{i}", i)
+
+        return phenotype_df
+
+    def _simulate_arg_needle(self, ts, num_rep, alpha, random_seed):
+        """
+        Simulate phenotypes by using the quantitative trait simulation framework that is
+        adapted from the ARG-Needle paper.
+        https://zenodo.org/records/7745746
+        """
+
+        rng = np.random.default_rng(random_seed)
+        num_ind = ts.num_individuals
+
+        phenotype_df = pd.DataFrame({"individual_id": [], "phenotype": []})
+
+        for _ in range(num_rep):
+            phenotypes = np.zeros(num_ind)
+
+            for variant in ts.variants():
+                row = variant.genotypes.astype("float64")
+                row = row.reshape((num_ind, 2)).sum(axis=-1)
+                std = np.std(row, ddof=1)
+                beta = rng.normal()
+                phenotypes += row * (beta * std**alpha)
+
+            # normalize pheno to have mean 0 and var = h2
+            phenotypes -= np.mean(phenotypes)
+            phenotypes /= np.std(phenotypes, ddof=1)
+
+            simulation_result = pd.DataFrame(
+                {"individual_id": np.arange(num_ind), "phenotype": phenotypes}
+            )
+            phenotype_df = pd.concat(
+                [phenotype_df, simulation_result], ignore_index=True
+            )
+
+        return phenotype_df
+
+    def _simulate_AlphaSimR(self, ts, num_rep, random_seed):
+        """
+        Simulates quantitative traits by using AlphaSimR.
+        The output is a phenotype dataframe with 2 columns, individual ID and
+        phenotype.
+        The values in the individual ID column is the same as the simulated
+        output of tstrait.
+        """
+        output_filename = "phenotype_comparison_AlphaSimR"
+        tree_filename = "tree_comparison_AlphaSimR"
+        ts.dump(f"data/treeseq_simulation/{tree_filename}.tree")
+        num_causal = ts.num_sites
+        cmd = ["Rscript", "data/simulate_AlphaSimR.R"]
+        mean = 0
+        var = 1
+        args = [
+            str(mean),
+            str(var),
+            str(num_rep),
+            str(num_causal),
+            str(random_seed),
+            tree_filename,
+            output_filename,
+        ]
+        input_cmd = cmd + args
+        subprocess.check_output(input_cmd)
+
+        phenotype_df = pd.read_csv(f"data/AlphaSimR/{output_filename}.csv")
+
+        return phenotype_df
+
+    def _compare_phenotype(self, data1, data1_name, data2, data2_name):
+        """
+        The input of this function should be a pandas dataframe with individual_id and
+        phenotype columns. The values inside the individual_id column must be a number
+        and match with the 2 dataframes that are provided inside this function.
+        """
+
+        num_ind = data1["individual_id"].nunique()
+
+        for i in range(num_ind):
+            data1_phenotype = data1.loc[data1["individual_id"] == i]["phenotype"].values
+            data2_phenotype = data2.loc[data2["individual_id"] == i]["phenotype"].values
+            self._plot_qq_compare(
+                data1=data1_phenotype,
+                data1_name=f"{data1_name}_ind_{i}",
+                data2=data2_phenotype,
+                data2_name=f"{data2_name}_ind_{i}",
+            )
+
+    def _run(self, model, num_rep=200, random_seed=123, **kwargs):
+        ts = msprime.sim_ancestry(
+            samples=10,
+            recombination_rate=1e-8,
+            sequence_length=100_000,
+            population_size=10_000,
+            random_seed=random_seed,
+        )
+        ts = msprime.sim_mutations(
+            ts, rate=1e-7, random_seed=random_seed, discrete_genome=False
+        )
+        tstrait_phenotype_result = self._simulate_tstrait(
+            ts=ts, num_rep=num_rep, random_seed=random_seed
+        )
+        if model == "simplePHENOTYPES":
+            simplePHENOTYPE_output = self._simulate_simplePHENOTYPE(
+                ts=ts, num_rep=num_rep, random_seed=random_seed
+            )
+            self._compare_phenotype(
+                data1=tstrait_phenotype_result,
+                data1_name="tstrait_phenotype",
+                data2=simplePHENOTYPE_output,
+                data2_name="simplePHENOTYPE_phenotype",
+            )
+
+        elif model == "ARG-Needle":
+            argneedle_output = self._simulate_arg_needle(
+                ts=ts, num_rep=num_rep, alpha=kwargs["alpha"], random_seed=random_seed
+            )
+            self._compare_phenotype(
+                data1=tstrait_phenotype_result,
+                data1_name="tstrait_phenotype",
+                data2=argneedle_output,
+                data2_name="ARG_Needle_phenotype",
+            )
+
+        elif model == "AlphaSimR":
+            alphasimr_output = self._simulate_AlphaSimR(
+                ts=ts, num_rep=num_rep, random_seed=random_seed
+            )
+            self._compare_phenotype(
+                data1=tstrait_phenotype_result,
+                data1_name="tstrait_phenotype",
+                data2=alphasimr_output,
+                data2_name="AlphaSimR_phenotype",
+            )
+
+    def test_compare_simplePHENOTYPES(self):
+        self._run(model="simplePHENOTYPES", num_rep=500, random_seed=123456)
+
+    def test_compare_arg_needle(self):
+        self._run(model="ARG-Needle", num_rep=500, random_seed=12345, alpha=0)
+
+    def test_compare_AlphaSimR(self):
+        self._run(model="AlphaSimR", num_rep=500, random_seed=222)
+
 
 def model_list(loc, scale):
     df = 10