diff --git a/src/compare_vcfs.py b/src/compare_vcfs.py index a64e19c..56b4068 100644 --- a/src/compare_vcfs.py +++ b/src/compare_vcfs.py @@ -110,6 +110,13 @@ def make_compatible_genotypes(ds1, ds2, acgt_alleles=False): 3. `ds1` and `ds2` have the same number of variable sites. 4. `ds1` and `ds2` have the same allele list at each site. + Assumptions: + 1. Only ACGT alleles. + 2. No mixed ploidy. + + Additionally, if `actg_alleles` is set to True, + all allele lists in `ds1` and `ds2` are set to ACGT. + :param xarray.Dataset ds1: sgkit-style dataset. :param xarray.Dataset ds2: sgkit-style dataset. :param bool acgt_alleles: All allele lists are set to ACGT (default = False). @@ -123,6 +130,11 @@ def make_compatible_genotypes(ds1, ds2, acgt_alleles=False): ds2_idx = np.array(ds2_idx.flatten()) assert len(ds1_idx) == len(ds2_idx) == len(common_site_idx) + if acgt_alleles: + remapped_ds1_alleles, remapped_ds1_genotypes = remap_genotypes(ds2, ds1, acgt_alleles=acgt_alleles) + assert remapped_ds1_alleles.shape == (len(common_site_idx), 4) + assert remapped_ds1_genotypes.shape == (len(common_site_idx), ds1.dims["samples"], ds1.dims["ploidy"]) + remapped_ds2_alleles, remapped_ds2_genotypes = remap_genotypes(ds1, ds2, acgt_alleles=acgt_alleles) assert remapped_ds2_alleles.shape == (len(common_site_idx), 4) assert remapped_ds2_genotypes.shape == (len(common_site_idx), ds2.dims["samples"], ds2.dims["ploidy"]) @@ -131,8 +143,12 @@ def make_compatible_genotypes(ds1, ds2, acgt_alleles=False): ds1_subset = ds1.copy(deep=True) # TODO: Copying is expensive, another way? ds1_subset["variant_contig"] = ds1_subset["variant_contig"].isel(variants=ds1_idx) ds1_subset["variant_position"] = ds1_subset["variant_position"].isel(variants=ds1_idx) - ds1_subset["variant_allele"] = remapped_ds2_alleles - ds1_subset["call_genotype"] = ds1_subset["call_genotype"].isel(variants=ds1_idx) + if acgt_alleles: + ds1_subset["variant_allele"] = remapped_ds1_alleles + ds1_subset["call_genotype"] = remapped_ds1_genotypes + else: + ds1_subset["variant_allele"] = remapped_ds2_alleles + ds1_subset["call_genotype"] = ds1_subset["call_genotype"].isel(variants=ds1_idx) ds1_subset["call_genotype_mask"] = ds1_subset["call_genotype_mask"].isel(variants=ds1_idx) # Subset `ds2` to common sites