diff --git a/data/simulate_AlphaSimR.R b/data/simulate_AlphaSimR.R deleted file mode 100644 index 74f3a21..0000000 --- a/data/simulate_AlphaSimR.R +++ /dev/null @@ -1,109 +0,0 @@ -if (!require("AlphaSimR")) install.packages("AlphaSimR", repos='http://cran.us.r-project.org') -library(AlphaSimR) -if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org') -library(reticulate) - -# This code is used from verification.py to simulate quantitative traits -# by using AlphaSimR. -# -# The basic simulation step is the following: -# 1. Use the tskit Python package through the R package tskit and load the tree -# sequence data as a founder population in AlphaSimR. The codes of this step are -# largely adapted from -# https://github.com/ ynorr/AlphaSimR_Examples/blob/master/misc/msprime.R -# 2. Simulate quantitative traits of the founder population in AlphaSimR - -# The commandline input has 11 elements -# [num_causal, temporary_directory_name, tree_filename, phenotype_filename, -# trait_filename, corA, num_trait, h2, h2_2, num_rep, random_seed] - -myArgs <- commandArgs(trailingOnly = TRUE) -# Convert to numerics -num_causal <- as.numeric(myArgs[1]) -directory_name <- myArgs[2] -tree_filename <- myArgs[3] -phenotype_filename <- myArgs[4] -trait_filename <- myArgs[5] -corA <- as.numeric(myArgs[6]) -num_trait <- as.numeric(myArgs[7]) -h2 <- as.numeric(myArgs[8]) -h2_2 <- as.numeric(myArgs[9]) -num_rep <- as.numeric(myArgs[10]) -random_seed <- as.numeric(myArgs[11]) - -set.seed(random_seed) - -tskit <- import("tskit") - -tree_filename <- paste0(directory_name,"/", tree_filename,".tree") -ts <- tskit$load(tree_filename) - -sites <- ts$tables$sites$asdict() -pos <- sites$position * 1e-8 # Convert to Morgans -pos <- pos - pos[1] # Set first position to zero - -# Extract haplotypes -haplo <- t(ts$genotype_matrix()) - -# Create an AlphaSimR founder population -founderPop <- newMapPop(genMap=list(pos), haplotypes=list(haplo)) - -num_ind <- nrow(haplo) / 2 - -if (num_trait == 1){ - mean <- 0 - var <- 1 - corA <- NULL - H2 <- h2 -} else if (num_trait == 2){ - mean <- c(0,0) - var <- c(1,1) - corA <- matrix(c(1,corA,corA,1),nrow=2,ncol=2) - H2 <- c(h2,h2_2) -} - -phenotype_result <- c() -trait_result <- c() - -for (i in 1:num_rep) { - SP <- SimParam$ - new(founderPop)$ - addTraitA( - nQtlPerChr=num_causal, - mean=mean, - var=var, - corA=corA - )$ - setVarE(H2=H2) - - individuals <- newPop(founderPop) - - trait_df <- c() - phenotype_df <- c() - - for (trait_id in 1:num_trait){ - qtl_site <- SP$traits[[trait_id]]@lociLoc - 1 - effect_size <- SP$traits[[trait_id]]@addEff - trait_id_df <- data.frame( - effect_size = effect_size, - site_id = qtl_site, - trait_id = rep(trait_id-1, length(effect_size)) - ) - trait_df <- rbind(trait_df, trait_id_df) - phenotype <- individuals@pheno[,trait_id] - phenotype_id_df <- data.frame( - phenotype=phenotype, - individual_id = 0:(num_ind-1), - trait_id = rep(trait_id-1, num_ind) - ) - phenotype_df <- rbind(phenotype_df, phenotype_id_df) - } - phenotype_result <- rbind(phenotype_result, phenotype_df) - trait_result <- rbind(trait_result, trait_df) -} - -phenotype_filename <- paste0(directory_name,"/",phenotype_filename,".csv") -write.csv(phenotype_result, phenotype_filename, row.names=FALSE) - -trait_filename <- paste0(directory_name,"/",trait_filename,".csv") -write.csv(trait_result, trait_filename, row.names=FALSE) diff --git a/data/simulate_simplePHENOTYPES.R b/data/simulate_simplePHENOTYPES.R deleted file mode 100644 index fd94c78..0000000 --- a/data/simulate_simplePHENOTYPES.R +++ /dev/null @@ -1,50 +0,0 @@ -if (!require("simplePHENOTYPES")) install.packages("simplePHENOTYPES", repos='http://cran.us.r-project.org') -library(simplePHENOTYPES) -if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org') -library(reticulate) - -# This code is used from verification.py to simulate quantitative traits -# by using simplePHENOTYPES. - -# This code loads the vcf file and simulates quantitative traits - -# The commandline input has 7 elements -# [num_causal, num_trait, add_effect, add_effect_2, directory_name, -# vcf_filename, random_seed] -myArgs <- commandArgs(trailingOnly = TRUE) - -num_causal <- as.numeric(myArgs[1]) -num_trait <- as.numeric(myArgs[2]) -add_effect <- as.numeric(myArgs[3]) -add_effect_2 <- as.numeric(myArgs[4]) -directory_name <- myArgs[5] -vcf_filename <- myArgs[6] -random_seed <- as.numeric(myArgs[7]) - -if (num_trait == 1){ - effect <- add_effect - mean <- 0 - h2 <- 1 -} else if (num_trait == 2){ - effect <- c(add_effect, add_effect_2) - mean <- c(0,0) - h2 <- c(1,1) -} - -suppressMessages(create_phenotypes( - geno_file = paste0(directory_name, "/", vcf_filename, ".vcf"), - add_QTN_num = num_causal, - add_effect = effect, - rep = 1, - h2 = h2, - model = "A", - seed = random_seed, - vary_QTN = FALSE, - to_r = FALSE, - sim_method = "geometric", - quiet = TRUE, - home_dir = directory_name, - verbose = FALSE, - mean = mean, - ntraits = num_trait -)) \ No newline at end of file diff --git a/data/simulate_simplePHENOTYPES_multiple.R b/data/simulate_simplePHENOTYPES_multiple.R deleted file mode 100644 index 287628f..0000000 --- a/data/simulate_simplePHENOTYPES_multiple.R +++ /dev/null @@ -1,75 +0,0 @@ -if (!require("simplePHENOTYPES")) install.packages("simplePHENOTYPES", repos='http://cran.us.r-project.org') -library(simplePHENOTYPES) -if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org') -library(reticulate) - -# This code is used from verification.py to simulate quantitative traits -# by using simplePHENOTYPES. - -# This code loads the vcf file that is generated in `verification.py` -# and uses the effect size from a normal distribution to simulate -# additive traits. - -# The commandline input has 9 elements -# [num_causal, h2, directory_name, -# vcf_filename, num_rep, output_filename, -# mean, var, random_seed] -myArgs <- commandArgs(trailingOnly = TRUE) - -num_causal <- as.numeric(myArgs[1]) -h2 <- as.numeric(myArgs[2]) -directory_name <- myArgs[3] -vcf_filename <- myArgs[4] -num_rep <- myArgs[5] -output_filename <- myArgs[6] -mean <- as.numeric(myArgs[7]) -var <- as.numeric(myArgs[8]) -random_seed <- as.numeric(myArgs[9]) - -set.seed(random_seed) - -sd <- sqrt(var) - -# Function to simulate phenotypes from simplePHENOTYPES -# The effect sizes are simulated from a normal distribution, -# as the geometric series is the only effect size distribution -# supported in simplePHENOTYPES. -simulate_simplePHENOTYPE <- function( - num_causal, vcf_filename, random_seed - ) { - effect_size <- list(rnorm(n=num_causal, mean=mean, sd=sd)) - phenotypes <- suppressMessages(create_phenotypes( - geno_file = paste0(directory_name, "/", vcf_filename, ".vcf"), - add_QTN_num = num_causal, - add_effect = effect_size, - rep = 1, - h2 = h2, - model = "A", - seed = random_seed, - vary_QTN = FALSE, - to_r = TRUE, - sim_method = "custom", - quiet = TRUE, - home_dir = directory_name, - verbose = FALSE, - mean = 0 - )) - # The mean is centered at 0 from simplePHENOTYPES simulation - # so we will divide it by the standard deviation to normalise - # the data - phenotypes[,2] <- phenotypes[,2] / sd(phenotypes[,2]) - names(phenotypes)[1:2] <- c("individual_id", "phenotype") - return(phenotypes) -} - -phenotype_result <- c() - -for (i in 1:num_rep) { - simulated_result <- simulate_simplePHENOTYPE( - num_causal=num_causal, vcf_filename=vcf_filename, random_seed=random_seed+i - ) - phenotype_result <- rbind(phenotype_result, simulated_result) -} - -filename = paste0(directory_name, "/", output_filename,".csv") -write.csv(phenotype_result, filename, row.names=FALSE) diff --git a/data/simulation_codes.md b/data/simulation_codes.md deleted file mode 100644 index 51f5bc0..0000000 --- a/data/simulation_codes.md +++ /dev/null @@ -1,9 +0,0 @@ -# Simulation Codes - -This -This directory is used to store the R simulation codes. - -R simulation codes: - -- simulate_AlphaSimR.R: This R code uses AlphaSimR to simulate quantitative traits based on a simulated tree sequence data. -- simulate_simplePHENOTYPES.R: This R code uses simplePHENOTYPES to simulate quantitative traits based on a VCF file. \ No newline at end of file diff --git a/verification.py b/verification.py index 8756e34..d191970 100644 --- a/verification.py +++ b/verification.py @@ -140,45 +140,72 @@ class SimulationResult: trait: pd.DataFrame -class PhenotypeSimulator: +def _simulate_simplePHENOTYPES_qqplot( + ts, num_causal, h2, random_seed, num_rep, mean=0, var=1 +): """ - Class to simulate traits based on 3 models, ALphaSimR, simplePHENOTYPES, - and the simulation framework described in the ARG-Needle paper. + The input of this function is a tree sequence and the output is a phenotype + dataframe. This function is used for comparing the simulation output in + a QQ-plot. + The output of this function will be a phenotype dataframe. """ + with tempfile.TemporaryDirectory() as tmpdirname: + with open(f"{tmpdirname}/tree_seq.vcf", "w") as vcf_file: + ts.write_vcf(vcf_file) + cmd = ["Rscript", "scripts/simulate_simplePHENOTYPES_qqplot.R"] + args = [ + str(num_causal), + str(h2), + tmpdirname, + str(num_rep), + str(mean), + str(var), + str(random_seed), + ] + input_cmd = cmd + args + subprocess.check_output(input_cmd) - def __init__(self, model): - self.model = model + phenotype_df = pd.read_csv(f"{tmpdirname}/simplePHENOTYPES.csv") - def _simulate_simplePHENOTYPES_single( - self, ts, num_causal, random_seed, add_effect=1, num_trait=1, add_effect_2=1 - ): - """ - The function to simulate quantitative traits by using simplePHENOTYPES. - We will specify the number of causal sites and the parameter for the - geometric series where the effect sizes are determined. - """ + num_ind = ts.num_individuals + # Change the individual ID in simplePHENOTYPES output to be consistent with the + # tstrait output + for i in range(num_ind): + phenotype_df = phenotype_df.replace(f"tsk_{i}", i) + + return phenotype_df - directory = tempfile.TemporaryDirectory() - vcf_filename = "vcf_comparison_simplePHENOTYPES" - with open(f"{directory.name}/{vcf_filename}.vcf", "w") as vcf_file: +def _simulate_simplePHENOTYPES_exact( + ts, num_causal, random_seed, add_effect=1, num_trait=1, add_effect_2=1 +): + """ + The function to simulate quantitative traits by using simplePHENOTYPES. + We will specify the number of causal sites and the parameter for the + geometric series where the effect sizes are determined. + The output of this function will be a SimulationResult object. + """ + + with tempfile.TemporaryDirectory() as tmpdirname: + with open(f"{tmpdirname}/tree_seq.vcf", "w") as vcf_file: ts.write_vcf(vcf_file) - cmd = ["Rscript", "data/simulate_simplePHENOTYPES.R"] + cmd = ["Rscript", "data/simulate_simplePHENOTYPES_exact.R"] args = [ str(num_causal), str(num_trait), str(add_effect), str(add_effect_2), - directory.name, - vcf_filename, + tmpdirname, str(random_seed), ] input_cmd = cmd + args subprocess.check_output(input_cmd) + qtn_df = pd.read_csv(f"{tmpdirname}/Additive_Selected_QTNs.txt", sep="\t") + if num_trait == 1: phenotype_df = pd.read_csv( - f"{directory.name}/Simulated_Data_1_Reps_Herit_1.txt", sep="\t" + f"{tmpdirname}/Simulated_Data_1_Reps_Herit_1.txt", sep="\t" ) del phenotype_df["reps"] phenotype_df = phenotype_df.rename( @@ -186,7 +213,7 @@ def _simulate_simplePHENOTYPES_single( ) else: phenotype_df = pd.read_csv( - f"{directory.name}/Simulated_Data_1_Reps_Herit_1_1.txt", sep="\t" + f"{tmpdirname}/Simulated_Data_1_Reps_Herit_1_1.txt", sep="\t" ) del phenotype_df["Rep"] phenotype_df = pd.melt( @@ -203,154 +230,56 @@ def _simulate_simplePHENOTYPES_single( ) phenotype_df = phenotype_df.replace({"Trait_1_H2_1": 0, "Trait_2_H2_1": 1}) - num_ind = ts.num_individuals - # Change the individual ID in simplePHENOTYPES output to be consistent with the - # tstrait output - for i in range(num_ind): - phenotype_df = phenotype_df.replace(f"tsk_{i}", i) - - qtn_df = pd.read_csv(f"{directory.name}/Additive_Selected_QTNs.txt", sep="\t") - - # Obtain the list of causal allele - causal_allele = [] - effect_size = [] - effect_size_2 = [] - for i, site_id in enumerate(qtn_df["snp"].values, start=1): - # simplePHENOTYPES uses ancestral state as a causal allele - allele = ts.site(site_id).ancestral_state - causal_allele.append(allele) - effect_size.append(add_effect**i) - effect_size_2.append(add_effect_2**i) - - if num_trait == 2: - effect_size = np.append(effect_size, effect_size_2) - - trait_df = pd.DataFrame( - { - "site_id": np.tile(qtn_df["snp"].values, num_trait), - "causal_allele": np.tile(causal_allele, num_trait), - "effect_size": effect_size, - "trait_id": np.repeat(np.arange(num_trait), len(causal_allele)), - } - ) - - simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) - - directory.cleanup() - - return simulation_result - - def _simulate_simplePHENOTYPES_multiple( - self, ts, num_causal, h2, random_seed, num_rep, mean=0, var=1 - ): - directory = tempfile.TemporaryDirectory() - - output_filename = "simplePHENOTYPES_multiple" - - vcf_filename = "vcf_comparison_simplePHENOTYPES" - with open(f"{directory.name}/{vcf_filename}.vcf", "w") as vcf_file: - ts.write_vcf(vcf_file) - cmd = ["Rscript", "data/simulate_simplePHENOTYPES_multiple.R"] - args = [ - str(num_causal), - str(h2), - directory.name, - vcf_filename, - str(num_rep), - output_filename, - str(mean), - str(var), - str(random_seed), - ] - input_cmd = cmd + args - subprocess.check_output(input_cmd) - - phenotype_df = pd.read_csv(f"{directory.name}/{output_filename}.csv") - - num_ind = ts.num_individuals - # Change the individual ID in simplePHENOTYPES output to be consistent with the - # tstrait output - for i in range(num_ind): - phenotype_df = phenotype_df.replace(f"tsk_{i}", i) - - directory.cleanup() - - return phenotype_df - - def _simulate_simplePHENOTYPES( - self, ts, num_causal, random_seed, h2=1, num_rep=1, normalize=False, **kwargs - ): - """ - For single simulation (num_rep=1), we will be simulating effect sizes from - a geometric series, which is the only distribution supported in simplePHENOTYPES. - For multiple simulations (num_rep>=1), we will be simulating effect sizes from - a standard normal distribution. - """ - - phenotype_result = pd.DataFrame( - {"individual_id": [], "trait_id": [], "phenotype": []} - ) - trait_result = pd.DataFrame( - {"site_id": [], "causal_allele": [], "effect_size": [], "trait_id": []} - ) - - if num_rep == 1: - simulation_output = self._simulate_simplePHENOTYPES_single( - ts=ts, num_causal=num_causal, random_seed=random_seed, **kwargs - ) - if normalize: - simulation_output.phenotype = tstrait.normalise_phenotypes( - simulation_output.phenotype - ) - phenotype_result = pd.concat( - [phenotype_result, simulation_output.phenotype] - ) - trait_result = pd.concat([trait_result, simulation_output.trait]) - trait_result = trait_result.astype({"site_id": int}) - - else: - # We are not examining the trait information in multiple simulation, and - # accurately extracting trait information in a few lines of Python codes - # is challenging, so I wrote a new R code to do the simulation multiple - # times. - # Phenotypes are normalized in multiple simulation - phenotype_result = self._simulate_simplePHENOTYPES_multiple( - ts=ts, - num_causal=num_causal, - random_seed=random_seed, - num_rep=num_rep, - h2=h2, - mean=kwargs["mean"], - var=kwargs["var"], - ) - simulation_result = SimulationResult( - phenotype=phenotype_result, trait=trait_result - ) + num_ind = ts.num_individuals + # Change the individual ID in simplePHENOTYPES output to be consistent with the + # tstrait output + for i in range(num_ind): + phenotype_df = phenotype_df.replace(f"tsk_{i}", i) + + # Obtain the list of causal allele + causal_allele = [] + effect_size = [] + effect_size_2 = [] + for i, site_id in enumerate(qtn_df["snp"].values, start=1): + # simplePHENOTYPES uses ancestral state as a causal allele + allele = ts.site(site_id).ancestral_state + causal_allele.append(allele) + effect_size.append(add_effect**i) + effect_size_2.append(add_effect_2**i) + + if num_trait == 2: + effect_size = np.append(effect_size, effect_size_2) + + trait_df = pd.DataFrame( + { + "site_id": np.tile(qtn_df["snp"].values, num_trait), + "causal_allele": np.tile(causal_allele, num_trait), + "effect_size": effect_size, + "trait_id": np.repeat(np.arange(num_trait), len(causal_allele)), + } + ) - return simulation_result + simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) - def _simulate_AlphaSimR( - self, ts, num_causal, h2, random_seed, num_rep=1, h2_2=1, corA=1, num_trait=1 - ): - """ - The function to simulate quantitative traits by using AlphaSimR. We will - specify the number of causal sites, such that the AlphaSimR simulation - will be conducted randomly. corA is used to specify the correlation - coefficient of pleiotropic traits. - """ + return simulation_result - directory = tempfile.TemporaryDirectory() - tree_filename = "tree_comparison_AlphaSimR" - ts.dump(f"{directory.name}/{tree_filename}.tree") - phenotype_filename = "phenotype_comparison_AlphaSimR" - trait_filename = "trait_comparison_AlphaSimR" - cmd = ["Rscript", "data/simulate_AlphaSimR.R"] +def _simulate_AlphaSimR( + ts, num_causal, h2, random_seed, num_rep=1, h2_2=1, corA=1, num_trait=1 +): + """ + The function to simulate quantitative traits by using AlphaSimR. We will + specify the number of causal sites, such that the AlphaSimR simulation + will be conducted randomly. corA is used to specify the correlation + coefficient of pleiotropic traits. The output of this function is a + PhenotypeResult object. + """ + + with tempfile.TemporaryDirectory() as tmpdirname: + ts.dump(f"{tmpdirname}/tree_seq.tree") + cmd = ["Rscript", "scripts/simulate_AlphaSimR.R"] args = [ str(num_causal), - directory.name, - tree_filename, - phenotype_filename, - trait_filename, + tmpdirname, str(corA), str(num_trait), str(h2), @@ -361,121 +290,103 @@ def _simulate_AlphaSimR( input_cmd = cmd + args subprocess.check_output(input_cmd) - phenotype_df = pd.read_csv(f"{directory.name}/{phenotype_filename}.csv") - trait_df = pd.read_csv(f"{directory.name}/{trait_filename}.csv") + phenotype_df = pd.read_csv(f"{tmpdirname}/phenotype_alphasimr.csv") + trait_df = pd.read_csv(f"{tmpdirname}/trait_alphasimr.csv") - # Obtain the list of causal allele - causal_allele = [] - for site_id in trait_df["site_id"]: - allele = ts.mutation(site_id).derived_state - causal_allele.append(allele) - - trait_df["causal_allele"] = causal_allele + # Obtain the list of causal allele + causal_allele = [] + for site_id in trait_df["site_id"]: + allele = ts.mutation(site_id).derived_state + causal_allele.append(allele) - simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) + trait_df["causal_allele"] = causal_allele - directory.cleanup() + simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) - return simulation_result + return simulation_result - def _simulate_arg_needle(self, ts, alpha, h2, num_rep, random_seed): - """ - Simulate phenotypes by using the quantitative trait simulation framework that is - adapted from the ARG-Needle paper. - https://zenodo.org/records/7745746 - """ - rng = np.random.default_rng(random_seed) - num_ind = ts.num_individuals +def _simulate_arg_needle(ts, alpha, h2, random_seed, num_rep=1): + """ + Simulate phenotypes by using the quantitative trait simulation framework that is + adapted from the ARG-Needle paper. + https://zenodo.org/records/7745746 + """ - phenotype_result = pd.DataFrame({"individual_id": [], "phenotype": []}) - trait_result = pd.DataFrame( - {"site_id": [], "causal_allele": [], "effect_size": [], "trait_id": []} - ) + rng = np.random.default_rng(random_seed) + num_ind = ts.num_individuals - for _ in range(num_rep): - phenotypes = np.zeros(num_ind) - beta_list = [] - causal_allele = [] - - for variant in ts.variants(): - row = variant.genotypes.astype("float64") - row = row.reshape((num_ind, 2)).sum(axis=-1) - std = np.std(row, ddof=1) - beta = rng.normal() - beta_list.append(beta) - causal_allele.append(variant.alleles[1]) - phenotypes += row * (beta * std**alpha) - - # normalize pheno to have mean 0 and var = h2 - phenotypes -= np.mean(phenotypes) - phenotypes /= np.std(phenotypes, ddof=1) - phenotypes *= np.sqrt(h2) - - # sample environmental component with variance 1-h2 and add it to phenotype - phenotypes += rng.normal(size=num_ind) * np.sqrt(1 - h2) - - # normalize it all - phenotypes -= np.mean(phenotypes) - phenotypes /= np.std(phenotypes, ddof=1) - - phenotype_df = pd.DataFrame( - {"individual_id": np.arange(len(phenotypes)), "phenotype": phenotypes} - ) - trait_df = pd.DataFrame( - { - "site_id": np.arange(len(causal_allele)), - "causal_allele": causal_allele, - "effect_size": beta_list, - "trait_id": np.zeros(len(causal_allele)), - } - ) + phenotype_result = pd.DataFrame({"individual_id": [], "phenotype": []}) + trait_result = pd.DataFrame( + {"site_id": [], "causal_allele": [], "effect_size": [], "trait_id": []} + ) - phenotype_result = pd.concat( - [phenotype_result, phenotype_df], ignore_index=True - ) - trait_result = pd.concat([trait_result, trait_df], ignore_index=True) + for _ in range(num_rep): + phenotypes = np.zeros(num_ind) + beta_list = [] + causal_allele = [] - trait_result = trait_result.astype({"site_id": int}) - simulation_result = SimulationResult( - phenotype=phenotype_result, trait=trait_result + for variant in ts.variants(): + row = variant.genotypes.astype("float64") + row = row.reshape((num_ind, 2)).sum(axis=-1) + std = np.std(row, ddof=1) + beta = rng.normal() + beta_list.append(beta) + causal_allele.append(variant.alleles[1]) + phenotypes += row * (beta * std**alpha) + + # normalize pheno to have mean 0 and var = h2 + phenotypes -= np.mean(phenotypes) + phenotypes /= np.std(phenotypes, ddof=1) + phenotypes *= np.sqrt(h2) + + # sample environmental component with variance 1-h2 and add it to phenotype + phenotypes += rng.normal(size=num_ind) * np.sqrt(1 - h2) + + # normalize it all + phenotypes -= np.mean(phenotypes) + phenotypes /= np.std(phenotypes, ddof=1) + + phenotype_df = pd.DataFrame( + {"individual_id": np.arange(len(phenotypes)), "phenotype": phenotypes} + ) + trait_df = pd.DataFrame( + { + "site_id": np.arange(len(causal_allele)), + "causal_allele": causal_allele, + "effect_size": beta_list, + "trait_id": np.zeros(len(causal_allele)), + } ) - return simulation_result - - def _run(self, ts, h2, random_seed, num_rep=1, **kwargs): - if self.model == "simplePHENOTYPES": - simulation_output = self._simulate_simplePHENOTYPES( - ts=ts, h2=h2, random_seed=random_seed, num_rep=num_rep, **kwargs - ) - - elif self.model == "AlphaSimR": - simulation_output = self._simulate_AlphaSimR( - ts=ts, h2=h2, random_seed=random_seed, num_rep=num_rep, **kwargs - ) - - elif self.model == "ARG-Needle": - simulation_output = self._simulate_arg_needle( - ts=ts, h2=h2, random_seed=random_seed, num_rep=num_rep, **kwargs - ) + phenotype_result = pd.concat( + [phenotype_result, phenotype_df], ignore_index=True + ) + trait_result = pd.concat([trait_result, trait_df], ignore_index=True) - return simulation_output + trait_result = trait_result.astype({"site_id": int}) + simulation_result = SimulationResult( + phenotype=phenotype_result, trait=trait_result + ) + return simulation_result class ExactTest(Test): """ - Compare tstrait against simplePHENOTYPES, AlphaSimR and the simulation framework - in ARG-Needle paper. For all benchmarking, we simulate a genetic data of 100 - individuals with 100 kb sequence length and mutations are simulated from an - infinite-sites model. - - We assume that the following in all quantitative trait simulation: - - Narrow-sense heritability is 1 to compare the simulated genetic values. - - Genetic effect sizes are simulated from an external simulator and they are used to - simulate quantitative traits in tstrait. + Test class that conducts exact test. Phenotype and trait information are + simulated by using an external simulator. Afterwards, the trait information is used + to compute the genetic values by using tstrait, and their values are compared + to examine if the tstrait package can compute the accurate genetic values. + + The numericalization input is used to modify the numericalization of genotypes. + The standard numericalization of tstrait is (AA=2, Aa=1, aa=0), where A is the + causal allele, but when numericalization is True, it will be (AA=1, Aa=0, aa=-1). """ - - def _run(self, model, random_seed, **kwargs): + + def _simulate_tree_seq(self, random_seed): + """ + Simulates a tree sequence from an infinite-sites model. + """ ts = msprime.sim_ancestry( samples=100, recombination_rate=1e-8, @@ -486,90 +397,299 @@ def _run(self, model, random_seed, **kwargs): ts = msprime.sim_mutations( ts, rate=1e-8, random_seed=random_seed, discrete_genome=False ) - - simulator = PhenotypeSimulator(model=model) - simulation_output = simulator._run(ts, h2=1, random_seed=random_seed, **kwargs) - - trait_df = simulation_output.trait.sort_values(by=["site_id"]) + + return ts + + def _compute_tstrait(self, ts, trait_df, numericalization=False): + """ + This method computes genetic values from the trait dataframe. + There is an option to change the numericalization of genetic values. + No other post-processing is conducted to the computed genetic values. + """ + trait_df=trait_df.sort_values(by=["site_id"]) genetic_df = tstrait.genetic_value(ts, trait_df) + if numericalization: + trait_id_array = trait_df["trait_id"].unique() + for trait_id in trait_id_array: + effect_size_sum = trait_df[trait_df["trait_id"]==trait_id]["effect_size"].sum() + genetic_df.loc[genetic_df["trait_id"]==trait_id, ["genetic_value"]] += effect_size_sum phenotype_df = tstrait.sim_env(genetic_df, h2=1, random_seed=1) - - if model == "simplePHENOTYPES": - """ - simplePHENOTYPES just makes the mean to be 0 and there is no scaling - to the simulation output based on its variance - """ - grouped = phenotype_df.groupby("trait_id")[["phenotype"]] - phenotype_df = grouped.transform(lambda x: (x - x.mean())) - elif model == "AlphaSimR": - """ - AlphaSimR normalizes the simulation output with ddof=0 - """ - phenotype_df = tstrait.normalise_phenotypes( - phenotype_df, mean=0, var=1, ddof=0 - ) - elif model == "ARG-Needle": - """ - Simulation framework in the ARG-Needle paper normalizes the simulation - output with ddof=0 - """ - phenotype_df = tstrait.normalise_phenotypes(phenotype_df, mean=0, var=1) - - tstrait_phenotype = phenotype_df["phenotype"] - simulated_phenotype = simulation_output.phenotype["phenotype"].values - - np.testing.assert_array_almost_equal(tstrait_phenotype, simulated_phenotype) - - def test_exact_simplePHENOTYPES_single(self): - self._run( - model="simplePHENOTYPES", + + return phenotype_df + +class ExactTestSimplePHENOTYPES(ExactTest): + """ + Exact test between simplePHENOTYPES and tstrait. + """ + + def _compare_simplePHENOTYPES(self, num_causal, random_seed, add_effect=1, num_trait=1, add_effect_2=1, numericalization=False): + ts = self._simulate_tree_seq(random_seed) + simulation_output=_simulate_simplePHENOTYPES_exact( + ts=ts, num_causal=num_causal, random_seed=random_seed, add_effect=add_effect, + add_effect_2=add_effect_2, num_trait=num_trait + ) + phenotype_df = self._compute_tstrait(ts=ts, trait_df=simulation_output.trait, numericalization=numericalization) + grouped = phenotype_df.groupby("trait_id")[["phenotype"]] + phenotype_df = grouped.transform(lambda x: (x - x.mean())) + + np.testing.assert_array_almost_equal(phenotype_df["phenotype"].values, + simulation_output.phenotype["phenotype"].values) + + def test_exact_simplePHENOTYPES_single_causal_30_add_09(self): + self._compare_simplePHENOTYPES( num_causal=30, add_effect=0.9, random_seed=100, num_trait=1, - normalize=False, + numericalization=False, ) - - def test_exact_simplePHENOTYPES_pleiotropic(self): - self._run( - model="simplePHENOTYPES", - num_causal=30, + + def test_exact_simplePHENOTYPES_single_causal_50_add_09(self): + self._compare_simplePHENOTYPES( + num_causal=50, add_effect=0.9, random_seed=101, + num_trait=1, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_single_causal_30_add_11(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=1.1, + random_seed=102, + num_trait=1, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_single_causal_50_add_11(self): + self._compare_simplePHENOTYPES( + num_causal=50, + add_effect=1.1, + random_seed=103, + num_trait=1, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_single_causal_30_add_09_numer(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=0.9, + random_seed=104, + num_trait=1, + numericalization=True, + ) + + def test_exact_simplePHENOTYPES_single_causal_30_add_11_numer(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=1.1, + random_seed=106, + num_trait=1, + numericalization=True, + ) + + def test_exact_simplePHENOTYPES_single_causal_50_add_11_numer(self): + self._compare_simplePHENOTYPES( + num_causal=50, + add_effect=1.1, + random_seed=107, + num_trait=1, + numericalization=True, + ) + + def test_exact_simplePHENOTYPES_pleio_causal_30_add_09_08(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=0.9, + random_seed=108, + num_trait=2, + add_effect_2=0.8, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_pleio_causal_50_add_11_08(self): + self._compare_simplePHENOTYPES( + num_causal=50, + add_effect=1.1, + random_seed=110, + num_trait=2, + add_effect_2=0.8, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_pleio_causal_30_add_08_08(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=0.8, + random_seed=111, + num_trait=2, + add_effect_2=0.8, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_pleio_causal_50_add_09_08_numer(self): + self._compare_simplePHENOTYPES( + num_causal=50, + add_effect=0.9, + random_seed=112, + num_trait=2, + add_effect_2=0.8, + numericalization=True, + ) + + def test_exact_simplePHENOTYPES_pleio_causal_30_add_08_08_numer(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=0.8, + random_seed=113, num_trait=2, add_effect_2=0.8, - normalize=False, + numericalization=True, ) - def test_exact_AlphaSimR_single(self): - self._run(model="AlphaSimR", num_causal=100, random_seed=200) - def test_exact_AlphaSimR_pleiotropic(self): - self._run( - model="AlphaSimR", num_causal=100, random_seed=201, corA=0.8, num_trait=2 +class ExactTestAlphaSimR(ExactTest): + """ + Exact test between AlphaSimR and tstrait. + """ + + def _compare_AlphaSimR(self, num_causal, random_seed, corA=1, numericalization=False): + ts = self._simulate_tree_seq(random_seed) + simulation_output=_simulate_AlphaSimR( + ts=ts, num_causal=num_causal, h2=1, random_seed=random_seed, corA=corA + ) + phenotype_df = self._compute_tstrait(ts=ts, trait_df=simulation_output.trait, numericalization=numericalization) + phenotype_df = tstrait.normalise_phenotypes( + phenotype_df, mean=0, var=1, ddof=0 + ) + + np.testing.assert_array_almost_equal(phenotype_df["phenotype"].values, + simulation_output.phenotype["phenotype"].values) + + def test_exact_AlphaSimR_single_causal_30(self): + self._compare_AlphaSimR(num_causal=30, random_seed=200, numericalization=False) + + def test_exact_AlphaSimR_single_causal_100(self): + self._compare_AlphaSimR(num_causal=100, random_seed=201, numericalization=False) + + def test_exact_AlphaSimR_single_causal_30_numer(self): + self._compare_AlphaSimR(num_causal=30, random_seed=202, numericalization=True) + + def test_exact_AlphaSimR_single_causal_100_numer(self): + self._compare_AlphaSimR(num_causal=100, random_seed=203, numericalization=True) + + def test_exact_AlphaSimR_pleiotropic_causal_30_cor08(self): + self._compare_AlphaSimR( + model="AlphaSimR", num_causal=30, random_seed=204, corA=0.8, num_trait=2, + numericalization=False ) - def test_exact_ARG_Needle(self): - self._run(model="ARG-Needle", alpha=0, random_seed=300) + def test_exact_AlphaSimR_pleiotropic_causal_30_cor01(self): + self._compare_AlphaSimR( + model="AlphaSimR", num_causal=30, random_seed=205, corA=0.1, num_trait=2, + numericalization=False + ) + + def test_exact_AlphaSimR_pleiotropic_causal_100_cor01(self): + self._compare_AlphaSimR( + model="AlphaSimR", num_causal=100, random_seed=206, corA=0.1, num_trait=2, + numericalization=False + ) + + def test_exact_AlphaSimR_pleiotropic_causal_100_cor08(self): + self._compare_AlphaSimR( + model="AlphaSimR", num_causal=100, random_seed=207, corA=0.8, num_trait=2, + numericalization=False + ) + + def test_exact_AlphaSimR_pleiotropic_causal_30_cor08_numer(self): + self._compare_AlphaSimR( + model="AlphaSimR", num_causal=30, random_seed=208, corA=0.8, num_trait=2, + numericalization=True + ) + + def test_exact_AlphaSimR_pleiotropic_causal_30_cor08_numer(self): + self._compare_AlphaSimR( + model="AlphaSimR", num_causal=30, random_seed=209, corA=0.8, num_trait=2, + numericalization=True + ) + + def test_exact_AlphaSimR_pleiotropic_causal_30_cor01_numer(self): + self._compare_AlphaSimR( + model="AlphaSimR", num_causal=30, random_seed=205, corA=0.1, num_trait=2, + numericalization=True + ) + + +class ExactTestARGNeedle(ExactTest): + """ + Exact test between the simulation framework described in the + ARG-Needle paper and tstrait. + """ + + def _compare_arg_needle(self, alpha, random_seed, numericalization=False): + ts = self._simulate_tree_seq(random_seed) + simulation_output=_simulate_arg_needle( + ts=ts, alpha=alpha, h2=1, random_seed=random_seed + ) + phenotype_df = self._compute_tstrait(ts=ts, trait_df=simulation_output.trait, numericalization=numericalization) + phenotype_df = tstrait.normalise_phenotypes(phenotype_df, mean=0, var=1) + + np.testing.assert_array_almost_equal(phenotype_df["phenotype"].values, + simulation_output.phenotype["phenotype"].values) + + def test_exact_ARG_Needle_alpha_0(self): + self._compare_arg_needle(alpha=0, random_seed=300, numericalization=False) + + def test_exact_ARG_Needle_alpha_negative_1(self): + self._compare_arg_needle(alpha=-1, random_seed=301, numericalization=False) + + def test_exact_ARG_Needle_alpha_1(self): + self._compare_arg_needle(alpha=1, random_seed=302, numericalization=False) + + def test_exact_ARG_Needle_alpha_0_numer(self): + self._compare_arg_needle(alpha=0, random_seed=303, numericalization=True) + + def test_exact_ARG_Needle_alpha_negative_1_numer(self): + self._compare_arg_needle(alpha=-1, random_seed=304, numericalization=True) + + def test_exact_ARG_Needle_alpha_1_numer(self): + self._compare_arg_needle(alpha=1, random_seed=305, numericalization=True) class ComparisonTest(Test): """ - Compare tstrait against simplePHENOTYPES, AlphaSimR and the simulation framework - in ARG-Needle paper. We assume an infinite sites model and we also assume that - all sites are causal. + Compare tstrait against currently available simulators and create a QQ-plot. We + assume an infinite sites model and we also assume that all sites are causal. """ - def _simulate_tstrait(self, ts, num_rep, h2, random_seed=None, **kwargs): + def _compare_phenotype(self, data1, data1_name, data2, data2_name, ind_id): + """ + The input of this function should be a pandas dataframe with individual_id and + phenotype columns. The values inside the individual_id column must be a number + and match with the 2 dataframes that are provided inside this function. + `ind_id` is a list of individual IDs that are of interest to compare. + """ + + for i in ind_id: + data1_phenotype = data1.loc[data1["individual_id"] == i]["phenotype"].values + data2_phenotype = data2.loc[data2["individual_id"] == i]["phenotype"].values + self._plot_qq_compare( + data1=data1_phenotype, + data1_name=f"{data1_name}_ind_{i}", + data2=data2_phenotype, + data2_name=f"{data2_name}_ind_{i}", + ) + + + def _simulate_tstrait(self, ts, num_rep, h2, random_seed, mean=0, var=1, alpha=0): """ Simulates trait and phenotype information based on a simulated tree sequence and return the simulated phenotypes that are normalised. We assume that all sites are causal. """ - mean = kwargs.get("mean", 0) - var = kwargs.get("var", 1) - alpha = kwargs.get("alpha", 0) model = tstrait.trait_model(distribution="normal", mean=mean, var=var) phenotype_df_list = [] for i in range(num_rep): @@ -589,6 +709,11 @@ def _simulate_tstrait(self, ts, num_rep, h2, random_seed=None, **kwargs): return phenotype_result def _simulate_out_of_africa(self, random_seed): + """ + Simulate a tree sequence data from the Out of Africa model and select individuals + from different populations. + """ + rng = np.random.default_rng(random_seed) species = stdpopsim.get_species("HomSap") model = species.get_demographic_model("OutOfAfrica_3G09") contig = species.get_contig("chr22", mutation_rate=0, length_multiplier=1e-3) @@ -598,90 +723,141 @@ def _simulate_out_of_africa(self, random_seed): ts = msprime.sim_mutations( ts, rate=model.mutation_rate, random_seed=random_seed, discrete_genome=False ) - return ts - - def _compare_phenotype(self, data1, data1_name, data2, data2_name, random_seed): - """ - The input of this function should be a pandas dataframe with individual_id and - phenotype columns. The values inside the individual_id column must be a number - and match with the 2 dataframes that are provided inside this function. - """ - - rng = np.random.default_rng(random_seed) - - yri = rng.integers(low=0, high=300) - chb = rng.integers(low=301, high=600) - ceu = rng.integers(low=601, high=900) - - for i in [yri, chb, ceu]: - data1_phenotype = data1.loc[data1["individual_id"] == i]["phenotype"].values - data2_phenotype = data2.loc[data2["individual_id"] == i]["phenotype"].values - self._plot_qq_compare( - data1=data1_phenotype, - data1_name=f"{data1_name}_ind_{i}", - data2=data2_phenotype, - data2_name=f"{data2_name}_ind_{i}", + node0 = rng.choice(ts.samples(population=0)) + node1 = rng.choice(ts.samples(population=1)) + node2 = rng.choice(ts.samples(population=2)) + ind_id = np.array([ + ts.node(node0).individual, + ts.node(node1).individual, + ts.node(node2).individual, + ]) + return ts, ind_id + +class ComparisonTestSimplePHENOTYPES(ComparisonTest): + + def _qqplot_simplePHENOTYPES(self, num_rep, h2, random_seed, mean=0, var=1, alpha=0): + ts, ind_id = self._simulate_out_of_africa(random_seed=random_seed) + tstrait_phenotype = self._simulate_tstrait( + ts=ts, num_rep=num_rep, h2=h2, random_seed=random_seed, + mean=mean, var=var, alpha=alpha ) - - def _run(self, model, h2, num_rep=200, random_seed=123, **kwargs): - ts = self._simulate_out_of_africa(random_seed) - tstrait_phenotype_result = self._simulate_tstrait( - ts=ts, num_rep=num_rep, h2=h2, random_seed=random_seed, **kwargs - ) - - if model in ["simplePHENOTYPES", "AlphaSimR"]: - kwargs["num_causal"] = ts.num_sites - - # simplePHENOTYPES uses ancestral state as a causal allele - if model == "simplePHENOTYPES": - kwargs["mean"] *= -1 - - simulator = PhenotypeSimulator(model=model) - simulation_output = simulator._run( - ts=ts, h2=h2, random_seed=random_seed, num_rep=num_rep, **kwargs + # - mean because simplePHENOTYPES uses ancestral state as a causal allele + simplePHENOTYPES_phenotype = _simulate_simplePHENOTYPES_qqplot( + ts=ts, num_causal=ts.num_sites, h2=h2, random_seed=random_seed, + num_rep=num_rep, mean=-mean, var=var ) - self._compare_phenotype( - data1=tstrait_phenotype_result, + data1=tstrait_phenotype, data1_name="tstrait", - data2=simulation_output.phenotype, - data2_name=model, - random_seed=random_seed, + data2=simplePHENOTYPES_phenotype, + data2_name="simplePHENOTYPES", + ind_id=ind_id, ) def test_compare_simplePHENOTYPES_h2_08_mean_0_var_1(self): - self._run( - model="simplePHENOTYPES", + self._qqplot_simplePHENOTYPES( num_rep=500, - distribution="normal", - random_seed=123456, h2=0.8, + random_seed=1000, mean=0, var=1, ) def test_compare_simplePHENOTYPES_h2_09_mean_1_var_4(self): - self._run( - model="simplePHENOTYPES", + self._qqplot_simplePHENOTYPES( num_rep=500, - distribution="normal", - random_seed=123, h2=0.9, + random_seed=1001, mean=1, var=4, ) - + + def test_compare_simplePHENOTYPES_h2_02_mean_negative_1_var_4(self): + self._qqplot_simplePHENOTYPES( + num_rep=500, + h2=0.2, + random_seed=1002, + mean=-1, + var=4, + ) + + def test_compare_simplePHENOTYPES_h2_01_mean_0_var_4(self): + self._qqplot_simplePHENOTYPES( + num_rep=500, + h2=0.1, + random_seed=1001, + mean=0, + var=4, + ) + + +class ComparisonTestAlphaSimR(ComparisonTest): + + def _qqplot_AlphaSimR(self, num_rep, h2, random_seed): + ts, ind_id = self._simulate_out_of_africa(random_seed=random_seed) + tstrait_phenotype = self._simulate_tstrait( + ts=ts, num_rep=num_rep, h2=h2, random_seed=random_seed + ) + alphasimr_phenotype = _simulate_AlphaSimR( + ts=ts, num_causal=ts.num_sites, h2=h2, random_seed=random_seed, + num_rep=num_rep + ) + self._compare_phenotype( + data1=tstrait_phenotype, + data1_name="tstrait", + data2=alphasimr_phenotype.phenotype, + data2_name="AlphaSimR", + ind_id=ind_id, + ) + + def test_compare_AlphaSimR_h2_08(self): + self._qqplot_AlphaSimR(num_rep=500, random_seed=2000, h2=0.8) + + def test_compare_AlphaSimR_h2_02(self): + self._qqplot_AlphaSimR(num_rep=500, random_seed=2001, h2=0.2) + + def test_compare_AlphaSimR_h2_05(self): + self._qqplot_AlphaSimR(num_rep=500, random_seed=2000, h2=0.5) + + +class ComparisonTestArgNeedle(ComparisonTest): + + def _qqplot_arg_needle(self, num_rep, h2, alpha, random_seed): + ts, ind_id = self._simulate_out_of_africa(random_seed=random_seed) + tstrait_phenotype = self._simulate_tstrait( + ts=ts, num_rep=num_rep, h2=h2, random_seed=random_seed + ) + argneedle_phenotype = _simulate_arg_needle( + ts=ts, alpha=alpha, h2=h2, random_seed=random_seed, num_rep=num_rep + ) + self._compare_phenotype( + data1=tstrait_phenotype, + data1_name="tstrait", + data2=argneedle_phenotype.phenotype, + data2_name="AlphaSimR", + ind_id=ind_id, + ) + def test_compare_arg_needle_alpha_0_h2_08(self): - self._run(model="ARG-Needle", num_rep=500, h2=0.8, random_seed=12345, alpha=0) + self._qqplot_arg_needle(num_rep=500, h2=0.8, random_seed=3000, alpha=0) def test_compare_arg_needle_alpha_negative_1_h2_08(self): - self._run(model="ARG-Needle", num_rep=500, h2=0.8, random_seed=3333, alpha=-1) + self._qqplot_arg_needle(num_rep=500, h2=0.8, random_seed=3001, alpha=-1) def test_compare_arg_needle_alpha_positive_1_h2_1(self): - self._run(model="ARG-Needle", num_rep=500, h2=1, random_seed=3333, alpha=1) - - def test_compare_AlphaSimR_h2_08(self): - self._run(model="AlphaSimR", num_rep=500, random_seed=222, h2=0.8) + self._qqplot_arg_needle(num_rep=500, h2=1, random_seed=3002, alpha=1) + + def test_compare_arg_needle_alpha_positive_2_h2_01(self): + self._qqplot_arg_needle(num_rep=500, h2=0.1, random_seed=3003, alpha=2) + + def test_compare_arg_needle_alpha_negative_2_h2_01(self): + self._qqplot_arg_needle(num_rep=500, h2=0.1, random_seed=3004, alpha=-2) + + def test_compare_arg_needle_alpha_0_h2_02(self): + self._qqplot_arg_needle(num_rep=500, h2=0.2, random_seed=3005, alpha=0) + + def test_compare_arg_needle_alpha_negative_1_h2_05(self): + self._qqplot_arg_needle(num_rep=500, h2=0.5, random_seed=3006, alpha=-1) def model_list(loc, scale):