From 77fedca96fcfad82d7434e120c1b0613dc8366e3 Mon Sep 17 00:00:00 2001 From: daikitag <48062118+daikitag@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:26:17 +0000 Subject: [PATCH] INFRA: Verification Statistical tests against external simulators --- data/simulate_AlphaSimR.R | 75 ++++++++++ data/simulate_simplePHENOTYPES.R | 69 ++++++++++ data/simulation_output.md | 15 ++ verification.py | 230 +++++++++++++++++++++++++++++++ 4 files changed, 389 insertions(+) create mode 100644 data/simulate_AlphaSimR.R create mode 100644 data/simulate_simplePHENOTYPES.R create mode 100644 data/simulation_output.md diff --git a/data/simulate_AlphaSimR.R b/data/simulate_AlphaSimR.R new file mode 100644 index 0000000..351f6bb --- /dev/null +++ b/data/simulate_AlphaSimR.R @@ -0,0 +1,75 @@ +if (!require("AlphaSimR")) install.packages("AlphaSimR", repos='http://cran.us.r-project.org') +library(AlphaSimR) +if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org') +library(reticulate) +if (!require("tidyr")) install.packages("tidyr", repos='http://cran.us.r-project.org') +library(tidyr) + +# This code is used from verification.py to simulate quantitative traits +# by using AlphaSimR. +# +# The basic simulation step is the following: +# 1. Simulate tree sequence by using msprime +# 2. Use the tskit Python package through the R package tskit and load it +# as a founder population in AlphaSimR +# The codes of this step are largely adapted from +# https://github.com/gaynorr/AlphaSimR_Examples/blob/master/misc/msprime.R +# 3. Simulate quantitative traits of the founder population in AlphaSimR +# This can be achieved by setting additive trait through addTraitA +# and extract its genetic value information through gv and phenotype +# information through pheno +# Note: Please refer to the AlphaSimR documentation for the details +# of the simulation code written for AlphaSimR + +# The commandline input has 7 elements +# [mean, var, num_rep, num_causal, random_seed, tree_filename, output_filename] +myArgs <- commandArgs(trailingOnly = TRUE) +# Convert to numerics +mean <- as.numeric(myArgs[1]) +var <- as.numeric(myArgs[2]) +num_rep <- as.numeric(myArgs[3]) +num_causal <- as.numeric(myArgs[4]) +random_seed <- as.numeric(myArgs[5]) +tree_filename <- myArgs[6] +output_filename <- myArgs[7] + +tskit = import("tskit") +filename = paste0("data/treeseq_simulation/", + tree_filename,".tree") + +ts = tskit$load(filename) +sites = ts$tables$sites$asdict() +pos = sites$position * 1e-8 # Convert to Morgans +pos = pos - pos[1] # Set first position to zero + +# Extract haplotypes +haplo = t(ts$genotype_matrix()) + +# Create an AlphaSimR founder population +founderPop = newMapPop(genMap=list(pos), haplotypes=list(haplo)) + +num_ind <- nrow(haplo) / 2 +phenotype_matrix <- matrix(0, nrow=num_rep, ncol=num_ind) + +for (i in 1:num_rep) { + # Setting the parameters of the simulation + # Quantitative trait simulation is performed here when we set the + # parameters + SP = SimParam$ + new(founderPop)$ + addTraitA( + nQtlPerChr=num_causal, + mean=mean, + var=var + )$ + setVarE(H2=1) + + individuals = newPop(founderPop) + phenotype_matrix[i,] = individuals@pheno[, 1] +} + +df <- as.data.frame(phenotype_matrix) +names(df) <- 0:(num_ind-1) +df <- gather(df, key="individual_id", value="phenotype") +filename = paste0("data/AlphaSimR/",output_filename,".csv") +write.csv(df, filename, row.names=FALSE) diff --git a/data/simulate_simplePHENOTYPES.R b/data/simulate_simplePHENOTYPES.R new file mode 100644 index 0000000..81489ee --- /dev/null +++ b/data/simulate_simplePHENOTYPES.R @@ -0,0 +1,69 @@ +if (!require("simplePHENOTYPES")) install.packages("simplePHENOTYPES", repos='http://cran.us.r-project.org') +library(simplePHENOTYPES) +if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org') +library(reticulate) + +# This code is used from verification.py to simulate quantitative traits +# by using simplePHENOTYPES. + +# This code loads the vcf file that is generated in `verification.py` +# and uses the effect size from a normal distribution to simulate +# additive traits. + +# The commandline input has 7 elements +# [mean, var, num_rep, num_causal, random_seed, vcf_filename, output_filename] +myArgs <- commandArgs(trailingOnly = TRUE) +mean <- as.numeric(myArgs[1]) +var <- as.numeric(myArgs[2]) +num_rep <- as.numeric(myArgs[3]) +num_causal <- as.numeric(myArgs[4]) +random_seed <- as.numeric(myArgs[5]) +vcf_filename <- myArgs[6] +output_filename <- myArgs[7] + +set.seed(random_seed) + +# Function to simulate phenotypes from simplePHENOTYPES +# The effect sizes are simulated from a normal distribution, +# as the geometric series is the only effect size distribution +# supported in simplePHENOTYPES. +simulate_simplePHENOTYPE <- function( + mean, var, num_causal, vcf_filename, random_seed + ) { + effect_size <- list(rnorm(n=num_causal, mean=mean, sd=sqrt(var))) + phenotypes <- suppressMessages(create_phenotypes( + geno_file = paste0("data/treeseq_simulation/", vcf_filename, ".vcf"), + add_QTN_num = num_causal, + add_effect = effect_size, + rep = 1, + h2 = 1, + model = "A", + seed = random_seed, + vary_QTN = FALSE, + to_r = TRUE, + sim_method = "custom", + quiet = TRUE, + home_dir = getwd(), + output_dir="data/simplePHENOTYPES/simplePhenotypes_output", + verbose = FALSE, + mean = 0 + )) + # The mean is centered at 0 from simplePHENOTYPES simulation + # so we will divide it by the standard deviation to normalise + # the data + phenotypes$Heritability_1_Rep_1 <- phenotypes$Heritability_1_Rep_1 / sd(phenotypes$Heritability_1_Rep_1) + return(phenotypes) +} + +phenotype_result <- c() + +for (i in 1:num_rep) { + simulated_result <- simulate_simplePHENOTYPE( + mean=mean, var=var, num_causal=num_causal, + vcf_filename=vcf_filename, random_seed=random_seed+i + ) + phenotype_result <- rbind(phenotype_result, simulated_result) +} + +filename = paste0("data/simplePHENOTYPES/",output_filename,".csv") +write.csv(phenotype_result, filename, row.names=FALSE) diff --git a/data/simulation_output.md b/data/simulation_output.md new file mode 100644 index 0000000..967aa0e --- /dev/null +++ b/data/simulation_output.md @@ -0,0 +1,15 @@ +# Simulation Output + +This +This directory is used to store the R simulation codes and save the simulation output. + +R simulation codes: + +- simulate_AlphaSimR.R: This R code uses AlphaSimR to simulate quantitative traits based on a simulated tree sequence data. +- simulate_simplePHENOTYPES.R: This R code uses simplePHENOTYPES to simulate quantitative traits based on a VCF file. + +Directories: + +- AlphaSimR: This directory is used to save the simulated output of AlphaSimR. +- simplePHENOTYPES: This directory is used to save the simulated output of simplePHENOYTPES. +- treeseq_simulation: This directory is used to save the simulated output of msprime. \ No newline at end of file diff --git a/verification.py b/verification.py index 79a2a6e..289975f 100644 --- a/verification.py +++ b/verification.py @@ -10,6 +10,7 @@ import inspect import logging import pathlib +import subprocess import sys from collections import namedtuple @@ -18,6 +19,7 @@ import matplotlib import msprime import numpy as np +import pandas as pd import scipy import seaborn as sns import tqdm @@ -71,6 +73,234 @@ def _plot_qq_compare(self, data1, data1_name, data2, data2_name): pyplot.savefig(f, dpi=72) pyplot.close("all") + sns.kdeplot(data1, color="b", fill=True, legend=False) + pyplot.xlabel(data1_name) + f = self._build_filename(data1_name, "histogram") + pyplot.savefig(f, dpi=72) + pyplot.close("all") + + sns.kdeplot(data2, color="b", fill=True, legend=False) + pyplot.xlabel(data2_name) + f = self._build_filename(data2_name, "histogram") + pyplot.savefig(f, dpi=72) + pyplot.close("all") + +class ComparisonTest(Test): + """ + Compare tstrait against simplePHENOTYPES, AlphaSimR and the simulation framework + in ARG-Needle paper. We assume an infinite sites model and we also assume that + all sites are causal. + """ + + def _simulate_tstrait(self, ts, num_rep, alpha=0, random_seed=None): + """ + Simulates trait and phenotype information based on a simulated tree + sequence and return the simulated phenotypes that are normalised. + """ + + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + phenotype_df_list = [] + for i in range(num_rep): + sim_result = tstrait.sim_phenotype( + ts=ts, + num_causal=ts.num_sites, + model=model, + h2=1, + alpha=alpha, + random_seed=random_seed + i, + ) + phenotype_df = sim_result.phenotype + phenotype_df = tstrait.normalise_phenotypes(phenotype_df) + phenotype_df_list.append(phenotype_df) + + phenotype_result = pd.concat(phenotype_df_list).reset_index(drop=True) + + return phenotype_result + + def _simulate_simplePHENOTYPE(self, ts, num_rep, random_seed): + """ + Saves a tree sequence file as VCF and simulates quantitative traits. + The output is a phenotype dataframe with 2 columns, individual ID and + phenotype. + The values in the individual ID column is the same as the simulated + output of tstrait. + """ + + vcf_filename = "vcf_comparison_simplePHENOTYPES" + output_filename = "phenotype_comparison_simplePHENOTYPES" + with open(f"data/treeseq_simulation/{vcf_filename}.vcf", "w") as vcf_file: + ts.write_vcf(vcf_file) + num_causal = ts.num_sites + cmd = ["Rscript", "data/simulate_simplePHENOTYPES.R"] + mean = 0 + var = 1 + args = [ + str(mean), + str(var), + str(num_rep), + str(num_causal), + str(random_seed), + vcf_filename, + output_filename, + ] + input_cmd = cmd + args + subprocess.check_output(input_cmd) + + phenotype_df = pd.read_csv(f"data/simplePHENOTYPES/{output_filename}.csv") + # Change the column names to be consistent with the tstrait output + phenotype_df = phenotype_df.rename( + columns={"": "individual_id", "Heritability_1_Rep_1": "phenotype"} + ) + + num_ind = ts.num_individuals + # Change the individual ID in simplePHENOTYPES output to be consistent with the + # tstrait output + for i in range(num_ind): + phenotype_df = phenotype_df.replace(f"tsk_{i}", i) + + return phenotype_df + + def _simulate_arg_needle(self, ts, num_rep, alpha, random_seed): + """ + Simulate phenotypes by using the quantitative trait simulation framework that is + adapted from the ARG-Needle paper. + https://zenodo.org/records/7745746 + """ + + rng = np.random.default_rng(random_seed) + num_ind = ts.num_individuals + + phenotype_df = pd.DataFrame({"individual_id": [], "phenotype": []}) + + for _ in range(num_rep): + phenotypes = np.zeros(num_ind) + + for variant in ts.variants(): + row = variant.genotypes.astype("float64") + row = row.reshape((num_ind, 2)).sum(axis=-1) + std = np.std(row, ddof=1) + beta = rng.normal() + phenotypes += row * (beta * std**alpha) + + # normalize pheno to have mean 0 and var = h2 + phenotypes -= np.mean(phenotypes) + phenotypes /= np.std(phenotypes, ddof=1) + + simulation_result = pd.DataFrame( + {"individual_id": np.arange(num_ind), "phenotype": phenotypes} + ) + phenotype_df = pd.concat( + [phenotype_df, simulation_result], ignore_index=True + ) + + return phenotype_df + + def _simulate_AlphaSimR(self, ts, num_rep, random_seed): + """ + Simulates quantitative traits by using AlphaSimR. + The output is a phenotype dataframe with 2 columns, individual ID and + phenotype. + The values in the individual ID column is the same as the simulated + output of tstrait. + """ + output_filename = "phenotype_comparison_AlphaSimR" + tree_filename = "tree_comparison_AlphaSimR" + ts.dump(f"data/treeseq_simulation/{tree_filename}.tree") + num_causal = ts.num_sites + cmd = ["Rscript", "data/simulate_AlphaSimR.R"] + mean = 0 + var = 1 + args = [ + str(mean), + str(var), + str(num_rep), + str(num_causal), + str(random_seed), + tree_filename, + output_filename, + ] + input_cmd = cmd + args + subprocess.check_output(input_cmd) + + phenotype_df = pd.read_csv(f"data/AlphaSimR/{output_filename}.csv") + + return phenotype_df + + def _compare_phenotype(self, data1, data1_name, data2, data2_name): + """ + The input of this function should be a pandas dataframe with individual_id and + phenotype columns. The values inside the individual_id column must be a number + and match with the 2 dataframes that are provided inside this function. + """ + + num_ind = data1["individual_id"].nunique() + + for i in range(num_ind): + data1_phenotype = data1.loc[data1["individual_id"] == i]["phenotype"].values + data2_phenotype = data2.loc[data2["individual_id"] == i]["phenotype"].values + self._plot_qq_compare( + data1=data1_phenotype, + data1_name=f"{data1_name}_ind_{i}", + data2=data2_phenotype, + data2_name=f"{data2_name}_ind_{i}", + ) + + def _run(self, model, num_rep=200, random_seed=123, **kwargs): + ts = msprime.sim_ancestry( + samples=10, + recombination_rate=1e-8, + sequence_length=100_000, + population_size=10_000, + random_seed=random_seed, + ) + ts = msprime.sim_mutations( + ts, rate=1e-7, random_seed=random_seed, discrete_genome=False + ) + tstrait_phenotype_result = self._simulate_tstrait( + ts=ts, num_rep=num_rep, random_seed=random_seed + ) + if model == "simplePHENOTYPES": + simplePHENOTYPE_output = self._simulate_simplePHENOTYPE( + ts=ts, num_rep=num_rep, random_seed=random_seed + ) + self._compare_phenotype( + data1=tstrait_phenotype_result, + data1_name="tstrait_phenotype", + data2=simplePHENOTYPE_output, + data2_name="simplePHENOTYPE_phenotype", + ) + + elif model == "ARG-Needle": + argneedle_output = self._simulate_arg_needle( + ts=ts, num_rep=num_rep, alpha=kwargs["alpha"], random_seed=random_seed + ) + self._compare_phenotype( + data1=tstrait_phenotype_result, + data1_name="tstrait_phenotype", + data2=argneedle_output, + data2_name="ARG_Needle_phenotype", + ) + + elif model == "AlphaSimR": + alphasimr_output = self._simulate_AlphaSimR( + ts=ts, num_rep=num_rep, random_seed=random_seed + ) + self._compare_phenotype( + data1=tstrait_phenotype_result, + data1_name="tstrait_phenotype", + data2=alphasimr_output, + data2_name="AlphaSimR_phenotype", + ) + + def test_compare_simplePHENOTYPES(self): + self._run(model="simplePHENOTYPES", num_rep=500, random_seed=123456) + + def test_compare_arg_needle(self): + self._run(model="ARG-Needle", num_rep=500, random_seed=12345, alpha=0) + + def test_compare_AlphaSimR(self): + self._run(model="AlphaSimR", num_rep=500, random_seed=222) + def model_list(loc, scale): df = 10