From 440a5ef00cfa6d907d6a795c344101ab4e226600 Mon Sep 17 00:00:00 2001
From: _ <4256466+leehart@users.noreply.github.com>
Date: Thu, 3 Aug 2023 20:31:37 +0100
Subject: [PATCH] Simulate CNV variants (start, end pos data) more
 realistically, independent of SNP data.

---
 tests/anoph/conftest.py | 211 +++++++++++++++++++++++++---------------
 1 file changed, 130 insertions(+), 81 deletions(-)

diff --git a/tests/anoph/conftest.py b/tests/anoph/conftest.py
index 0fb8da214..cb99a6dc0 100644
--- a/tests/anoph/conftest.py
+++ b/tests/anoph/conftest.py
@@ -584,23 +584,22 @@ def simulate_aim_variants(path, contigs, snp_sites, n_sites_low, n_sites_high):
     return ds
 
 
-def simulate_cnv_hmm(zarr_path, metadata_path, contigs, snp_sites):
+def simulate_cnv_hmm(zarr_path, metadata_path, contigs, contig_sizes):
     # zarr_path is the output path to the zarr store
     # metadata_path is the input path for the sample metadata
     # contigs is the list of contigs, e.g. Ag has ('2L', '2R', '3R', '3L', 'X')
-    # snp_sites are the data from snp_genotypes/all/sites, inc. {contig}/variants/POS
-    # n_sites will be derived from snp_sites {contig}/variants/POS
+    # contig_sizes is a dictionary of the sizes of the contigs in base pairs
 
     # {release}/cnv/{sample_set}/hmm/zarr
     # - {contig}
     #   - calldata
-    #     - CN [2D array] [int] [-1 to 12 for n_sites for n_samples]
-    #     - NormCov [2D array] [float] [0 to 356+ for n_sites for for n_samples]
-    #     - RawCov [2D array] [int] [-1 to 18465+ for n_sites for for n_samples]
+    #     - CN [2D array] [int] [-1 to 12 for n_variants for n_samples]
+    #     - NormCov [2D array] [float] [0 to 356+ for n_variants for for n_samples]
+    #     - RawCov [2D array] [int] [-1 to 18465+ for n_variants for for n_samples]
     #   - samples [1D array] [str for n_samples]
     #   - variants
-    #      - END [1D array] [int for n_snp_sites]
-    #      - POS [1D array] [int for n_snp_sites]
+    #      - END [1D array] [int for n_variants]
+    #      - POS [1D array] [int for n_variants]
     # - sample_coverage_variance [1D array] [float] [0 to 0.5 for n_samples]
     # - sample_is_high_variance [1D array] [bool] [True or False for n_samples]
     # - samples [1D array] [str]
@@ -636,16 +635,28 @@ def simulate_cnv_hmm(zarr_path, metadata_path, contigs, snp_sites):
         # Create the calldata group for this contig.
         calldata_grp = contig_grp.require_group("calldata")
 
-        # Get the SNP positions for this contig.
-        snp_pos = snp_sites[f"{contig}/variants/POS"][:]
+        # Get the length of this contig.
+        contig_length_bp = contig_sizes[contig]
+
+        # Set the window size.
+        window_size_bp = 300
+
+        # Get the number of non-overlapping windows ("variants") using contig_length.
+        # Note: this uses the floor division operator `//`, which returns an integer.
+        n_windows = contig_length_bp // window_size_bp
 
-        # Get the number of sites (SNP positions) for this contig.
-        n_sites = snp_pos.shape[0]
+        # Produce the set of window start positions as a tuple (immutable list).
+        window_start_pos = tuple(1 + i * window_size_bp for i in range(n_windows))
+
+        # Produce the set of window end positions as a tuple (immutable list).
+        window_end_pos = tuple(i * window_size_bp for i in range(1, n_windows)) + (
+            contig_length_bp,
+        )
 
         # Simulate CN, NormCov, RawCov under calldata.
-        cn = np.random.randint(low=-1, high=12, size=(n_sites, n_samples))
-        normCov = np.random.randint(low=0, high=356, size=(n_sites, n_samples))
-        rawCov = np.random.randint(low=-1, high=18465, size=(n_sites, n_samples))
+        cn = np.random.randint(low=-1, high=12, size=(n_windows, n_samples))
+        normCov = np.random.randint(low=0, high=356, size=(n_windows, n_samples))
+        rawCov = np.random.randint(low=-1, high=18465, size=(n_windows, n_samples))
         calldata_grp.create_dataset(name="CN", data=cn)
         calldata_grp.create_dataset(name="NormCov", data=normCov)
         calldata_grp.create_dataset(name="RawCov", data=rawCov)
@@ -657,38 +668,34 @@ def simulate_cnv_hmm(zarr_path, metadata_path, contigs, snp_sites):
         variants_grp = contig_grp.require_group("variants")
 
         # Simulate POS under variants.
-        # Note: choosing a random subset would cause dimension size conflicts.
-        variants_grp.create_dataset(name="POS", data=snp_pos)
+        variants_grp.create_dataset(name="POS", data=window_start_pos)
 
         # Simulate END under variants.
-        random_multipliers = np.random.randint(2, 12, size=len(snp_pos))
-        end = random_multipliers * snp_pos
-        variants_grp.create_dataset(name="END", data=end)
+        variants_grp.create_dataset(name="END", data=window_end_pos)
 
     zarr.consolidate_metadata(zarr_path)
 
 
-def simulate_cnv_coverage_calls(zarr_path, metadata_path, contigs, snp_sites):
+def simulate_cnv_coverage_calls(zarr_path, metadata_path, contigs, contig_sizes):
     # zarr_path is the output path to the zarr store
     # metadata_path is the input path for the sample metadata
     # contigs is the list of contigs, e.g. Ag has ('2L', '2R', '3R', '3L', 'X')
-    # snp_sites are the data from snp_genotypes/all/sites, inc. {contig}/variants/POS
-    # n_sites will be derived from snp_sites {contig}/variants/POS
+    # contig_sizes is a dictionary of the sizes of the contigs in base pairs
 
     # {release}/cnv/{sample_set}/coverage_calls/{analysis}/zarr
     #   - samples [1D array] [str for n_samples]
     #   - {contig}
     #      - calldata
-    #         - GT [2D array] [int] [0 or 1 for n_sites for n_samples]
+    #         - GT [2D array] [int] [0 or 1 for n_variants for n_samples]
     #      - samples [1D array] [str for n_samples]
     #      - variants
-    #         - CIEND [1D array] [int] [0 to 13200+ for n_sites]
-    #         - CIPOS [1D array] [int] [0 to 37200+ for n_sites]
-    #         - END [1D array] [int for n_sites]
-    #         - FILTER_PASS [1D array] [bool] [True or False for n_sites]
-    #         - FILTER_qMerge [1D array] [bool] [True or False for n_sites]
-    #         - ID [1D array] [unique str for n_sites]
-    #         - POS [1D array] [int for n_sites]
+    #         - CIEND [1D array] [int] [0 to 13200+ for n_variants]
+    #         - CIPOS [1D array] [int] [0 to 37200+ for n_variants]
+    #         - END [1D array] [int for n_variants]
+    #         - FILTER_PASS [1D array] [bool] [True or False for n_variants]
+    #         - FILTER_qMerge [1D array] [bool] [True or False for n_variants]
+    #         - ID [1D array] [unique str for n_variants]
+    #         - POS [1D array] [int for n_variants]
 
     # Get a random probability for choosing allele 1, between 0 and 1.
     p_allele = np.random.random()
@@ -715,17 +722,35 @@ def simulate_cnv_coverage_calls(zarr_path, metadata_path, contigs, snp_sites):
         # Create the calldata group for this contig.
         calldata = contig_grp.require_group("calldata")
 
-        # Get the SNP positions for this contig.
-        snp_pos = snp_sites[f"{contig}/variants/POS"][:]
+        # Get the length of this contig
+        contig_length_bp = contig_sizes[contig]
+
+        # Get a random number of CNV alleles ("variants") to simulate.
+        n_cnv_alleles = np.random.randint(1, 5_000)
+
+        # Produce a set of random start positions for each allele as a sorted list.
+        allele_start_pos = sorted(
+            np.random.randint(1, contig_length_bp, size=n_cnv_alleles)
+        )
+
+        # Produce a set of random allele lengths for each allele, according to a range.
+        allele_length_bp_min = 100
+        allele_length_bp_max = 100_000
+        allele_lengths_bp = np.random.randint(
+            allele_length_bp_min, allele_length_bp_max, size=n_cnv_alleles
+        )
 
-        # Get the number of sites (SNP positions) for this contig.
-        n_sites = snp_pos.shape[0]
+        # Produce the set of end postions for each allele, according to start position and length.
+        allele_end_pos = [
+            start_pos + length
+            for start_pos, length in zip(allele_start_pos, allele_lengths_bp)
+        ]
 
         # Simulate the genotype calls.
         # Note: this is only 2D, unlike SNP, HAP, AIM GT which are 3D
         gt = np.random.choice(
             np.array([0, 1], dtype="i1"),
-            size=(n_sites, n_samples),
+            size=(n_cnv_alleles, n_samples),
             replace=True,
             p=[1 - p_allele, p_allele],
         )
@@ -740,61 +765,57 @@ def simulate_cnv_coverage_calls(zarr_path, metadata_path, contigs, snp_sites):
         variants_grp = contig_grp.require_group("variants")
 
         # Simulate the CIEND and CIPOS arrays under variants.
-        ciend = np.random.randint(low=0, high=13200, size=n_sites)
-        cipos = np.random.randint(low=0, high=37200, size=n_sites)
+        ciend = np.random.randint(low=0, high=13200, size=n_cnv_alleles)
+        cipos = np.random.randint(low=0, high=37200, size=n_cnv_alleles)
         variants_grp.create_dataset(name="CIEND", data=ciend)
         variants_grp.create_dataset(name="CIPOS", data=cipos)
 
         # Simulate the unique ID strings under variants.
         # Note: this is quicker than generating unique random strings.
-        len_str_n_sites = len(str(n_sites))
+        len_str_n_sites = len(str(n_cnv_alleles))
         variant_IDs = [
             f"CNV_{contig}{str(i).zfill(len_str_n_sites)}"
-            for i in range(1, n_sites + 1)
+            for i in range(1, n_cnv_alleles + 1)
         ]
         variants_grp.create_dataset(name="ID", data=variant_IDs)
 
         # Simulate the filters under variants.
         filter_pass = np.random.choice(
-            [False, True], size=n_sites, p=[1 - p_filter_pass, p_filter_pass]
+            [False, True], size=n_cnv_alleles, p=[1 - p_filter_pass, p_filter_pass]
         )
         filter_qMerge = np.random.choice(
-            [False, True], size=n_sites, p=[1 - p_filter_qMerge, p_filter_qMerge]
+            [False, True], size=n_cnv_alleles, p=[1 - p_filter_qMerge, p_filter_qMerge]
         )
         variants_grp.create_dataset(name="FILTER_PASS", data=filter_pass)
         variants_grp.create_dataset(name="FILTER_qMerge", data=filter_qMerge)
 
         # Simulate POS under variants.
-        # Note: choosing a random subset would cause dimension size conflicts.
-        variants_grp.create_dataset(name="POS", data=snp_pos)
+        variants_grp.create_dataset(name="POS", data=allele_start_pos)
 
         # Simulate END under variants.
-        random_multipliers = np.random.randint(2, 12, size=len(snp_pos))
-        end = random_multipliers * snp_pos
-        variants_grp.create_dataset(name="END", data=end)
+        variants_grp.create_dataset(name="END", data=allele_end_pos)
 
     zarr.consolidate_metadata(zarr_path)
 
 
-def simulate_cnv_discordant_read_calls(zarr_path, metadata_path, contigs, snp_sites):
+def simulate_cnv_discordant_read_calls(zarr_path, metadata_path, contigs, contig_sizes):
     # zarr_path is the output path to the zarr store
     # metadata_path is the input path for the sample metadata
     # contigs is the list of contigs, e.g. Ag has ('2R', '3R', 'X')
-    # snp_sites are the data from snp_genotypes/all/sites, inc. {contig}/variants/POS
-    # n_sites will be derived from snp_sites {contig}/variants/POS
+    # contig_sizes is a dictionary of the sizes of the contigs in base pairs
 
     # {release}/cnv/{sample_set}/discordant_read_calls/zarr
     # - {contig}
     #   - calldata
-    #     - GT [2D array] [int] [0 or 1 for n_sites for n_samples]
+    #     - GT [2D array] [int] [0 or 1 for n_variants for n_samples]
     #   - samples [1D array] [str for n_samples]
     #   - variants
-    #      - END [1D array] [int for n_sites]
-    #      - EndBreakpointMethod [1D array] [-1 to 2 for n_sites]
-    #      - ID [1D array] [unique str for n_sites]
-    #      - POS [1D array] [int for n_sites]
-    #      - Region [1D array] [unique str for n_sites]
-    #      - StartBreakpointMethod [1D array] [int] [-1 to 1 for n_sites]
+    #      - END [1D array] [int for n_variants]
+    #      - EndBreakpointMethod [1D array] [-1 to 2 for n_variants]
+    #      - ID [1D array] [unique str for n_variants]
+    #      - POS [1D array] [int for n_variants]
+    #      - Region [1D array] [unique str for n_variants]
+    #      - StartBreakpointMethod [1D array] [int] [-1 to 1 for n_variants]
     # - sample_coverage_variance [1D array] [float] [0 to 0.5 for n_samples]
     # - sample_is_high_variance [1D array] [bool] [True or False for n_samples]
     # - samples [1D array] [str for n_samples]
@@ -826,24 +847,53 @@ def simulate_cnv_discordant_read_calls(zarr_path, metadata_path, contigs, snp_si
     )
     root.create_dataset(name="sample_is_high_variance", data=sample_is_high_variance)
 
-    for contig in contigs:
+    # Note: The cnv_discordant_read_calls() method of AnophelesCnvData concatenates each xarray.Dataset
+    # returned by _cnv_discordant_read_calls_dataset() for each sample set for each contig, so it expects
+    # the same number of variants for all sample sets with respect to each contig. So we need to maintain a
+    # consistent number of variants for each contig for all sample set, otherwise the shapes will not align,
+    # which will raise an error and cause test failures.
+    fixed_seed = 42
+
+    for i, contig in enumerate(contigs):
+        # Use the same random seed per contig, otherwise n_cnv_variants (and shapes) will not align.
+        unique_seed = fixed_seed + i
+        np.random.seed(unique_seed)
+
         # Create the contig group.
         contig_grp = root.require_group(contig)
 
         # Create the calldata group for this contig.
         calldata_grp = contig_grp.require_group("calldata")
 
-        # Get the SNP positions for this contig.
-        snp_pos = snp_sites[f"{contig}/variants/POS"][:]
+        # Get the length of this contig
+        contig_length_bp = contig_sizes[contig]
+
+        # Get a random number of CNV variants to simulate.
+        n_cnv_variants = np.random.randint(1, 100)
 
-        # Get the number of sites (SNP positions) for this contig.
-        n_sites = snp_pos.shape[0]
+        # Produce a set of random start positions for each variant as a sorted list.
+        variant_start_pos = sorted(
+            np.random.randint(1, contig_length_bp, size=n_cnv_variants)
+        )
+
+        # Produce a set of random lengths for each variant, according to a range.
+        variant_length_bp_min = 100
+        variant_length_bp_max = 100_000
+        variant_lengths_bp = np.random.randint(
+            variant_length_bp_min, variant_length_bp_max, size=n_cnv_variants
+        )
+
+        # Produce the set of end postions for each variant, according to start position and length.
+        variant_end_pos = [
+            start_pos + length
+            for start_pos, length in zip(variant_start_pos, variant_lengths_bp)
+        ]
 
         # Simulate the genotype calls.
         # Note: this is only 2D, unlike SNP, HAP, AIM GT which are 3D
         gt = np.random.choice(
             np.array([0, 1], dtype="i1"),
-            size=(n_sites, n_samples),
+            size=(n_cnv_variants, n_samples),
             replace=True,
             p=[1 - p_allele, p_allele],
         )
@@ -858,8 +908,8 @@ def simulate_cnv_discordant_read_calls(zarr_path, metadata_path, contigs, snp_si
         variants_grp = contig_grp.require_group("variants")
 
         # Simulate the StartBreakpointMethod and EndBreakpointMethod arrays.
-        startBreakpointMethod = np.random.randint(low=-1, high=1, size=n_sites)
-        endBreakpointMethod = np.random.randint(low=-1, high=2, size=n_sites)
+        startBreakpointMethod = np.random.randint(low=-1, high=1, size=n_cnv_variants)
+        endBreakpointMethod = np.random.randint(low=-1, high=2, size=n_cnv_variants)
         variants_grp.create_dataset(
             name="StartBreakpointMethod", data=startBreakpointMethod
         )
@@ -867,31 +917,30 @@ def simulate_cnv_discordant_read_calls(zarr_path, metadata_path, contigs, snp_si
             name="EndBreakpointMethod", data=endBreakpointMethod
         )
 
-        # Get the number of digits in n_sites.
-        len_str_n_sites = len(str(n_sites))
+        # Get the number of digits in n_cnv_variants.
+        len_str_n_cnv_variants = len(str(n_cnv_variants))
 
         # Simulate the Region under variants.
         # Note: this is quicker than generating unique random strings.
         regions = [
-            f"Region_{str(i).zfill(len_str_n_sites)}" for i in range(1, n_sites + 1)
+            f"Region_{str(i).zfill(len_str_n_cnv_variants)}"
+            for i in range(1, n_cnv_variants + 1)
         ]
         variants_grp.create_dataset(name="Region", data=regions)
 
         # Simulate the unique ID strings under variants.
         # Note: this is quicker than generating unique random strings.
         variant_IDs = [
-            f"ID_{str(i).zfill(len_str_n_sites)}" for i in range(1, n_sites + 1)
+            f"ID_{str(i).zfill(len_str_n_cnv_variants)}"
+            for i in range(1, n_cnv_variants + 1)
         ]
         variants_grp.create_dataset(name="ID", data=variant_IDs)
 
         # Simulate POS under variants.
-        # Note: choosing a random subset would cause dimension size conflicts.
-        variants_grp.create_dataset(name="POS", data=snp_pos)
+        variants_grp.create_dataset(name="POS", data=variant_start_pos)
 
         # Simulate END under variants.
-        random_multipliers = np.random.randint(2, 12, size=len(snp_pos))
-        end = random_multipliers * snp_pos
-        variants_grp.create_dataset(name="END", data=end)
+        variants_grp.create_dataset(name="END", data=variant_end_pos)
 
     zarr.consolidate_metadata(zarr_path)
 
@@ -1567,7 +1616,7 @@ def init_cnv_hmm(self):
                     zarr_path=zarr_path,
                     metadata_path=metadata_path,
                     contigs=self.contigs,
-                    snp_sites=self.snp_sites,
+                    contig_sizes=self.contig_sizes,
                 )
 
     def init_cnv_coverage_calls(self):
@@ -1606,7 +1655,7 @@ def init_cnv_coverage_calls(self):
                     zarr_path=zarr_path,
                     metadata_path=metadata_path,
                     contigs=self.contigs,
-                    snp_sites=self.snp_sites,
+                    contig_sizes=self.contig_sizes,
                 )
 
                 # Simulate CNV coverage calls data for the arab analysis.
@@ -1624,7 +1673,7 @@ def init_cnv_coverage_calls(self):
                     zarr_path=zarr_path,
                     metadata_path=metadata_path,
                     contigs=self.contigs,
-                    snp_sites=self.snp_sites,
+                    contig_sizes=self.contig_sizes,
                 )
 
     def init_cnv_discordant_read_calls(self):
@@ -1664,7 +1713,7 @@ def init_cnv_discordant_read_calls(self):
                     metadata_path=metadata_path,
                     # Note: the real data does not include every contig.
                     contigs=self.contigs,
-                    snp_sites=self.snp_sites,
+                    contig_sizes=self.contig_sizes,
                 )
 
 
@@ -2013,7 +2062,7 @@ def init_cnv_hmm(self):
                     zarr_path=zarr_path,
                     metadata_path=metadata_path,
                     contigs=self.contigs,
-                    snp_sites=self.snp_sites,
+                    contig_sizes=self.contig_sizes,
                 )
 
     def init_cnv_coverage_calls(self):
@@ -2049,7 +2098,7 @@ def init_cnv_coverage_calls(self):
                     zarr_path=zarr_path,
                     metadata_path=metadata_path,
                     contigs=self.contigs,
-                    snp_sites=self.snp_sites,
+                    contig_sizes=self.contig_sizes,
                 )
 
     def init_cnv_discordant_read_calls(self):
@@ -2086,7 +2135,7 @@ def init_cnv_discordant_read_calls(self):
                     metadata_path=metadata_path,
                     # Note: the real data does not include every contig.
                     contigs=self.contigs,
-                    snp_sites=self.snp_sites,
+                    contig_sizes=self.contig_sizes,
                 )