Skip to content

Commit

Permalink
Implement acgt_alleles set to True
Browse files Browse the repository at this point in the history
  • Loading branch information
szhan committed Jun 19, 2023
1 parent b766ddf commit bd4d670
Showing 1 changed file with 18 additions and 2 deletions.
20 changes: 18 additions & 2 deletions src/compare_vcfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,13 @@ def make_compatible_genotypes(ds1, ds2, acgt_alleles=False):
3. `ds1` and `ds2` have the same number of variable sites.
4. `ds1` and `ds2` have the same allele list at each site.
Assumptions:
1. Only ACGT alleles.
2. No mixed ploidy.
Additionally, if `actg_alleles` is set to True,
all allele lists in `ds1` and `ds2` are set to ACGT.
:param xarray.Dataset ds1: sgkit-style dataset.
:param xarray.Dataset ds2: sgkit-style dataset.
:param bool acgt_alleles: All allele lists are set to ACGT (default = False).
Expand All @@ -123,6 +130,11 @@ def make_compatible_genotypes(ds1, ds2, acgt_alleles=False):
ds2_idx = np.array(ds2_idx.flatten())
assert len(ds1_idx) == len(ds2_idx) == len(common_site_idx)

if acgt_alleles:
remapped_ds1_alleles, remapped_ds1_genotypes = remap_genotypes(ds2, ds1, acgt_alleles=acgt_alleles)
assert remapped_ds1_alleles.shape == (len(common_site_idx), 4)
assert remapped_ds1_genotypes.shape == (len(common_site_idx), ds1.dims["samples"], ds1.dims["ploidy"])

remapped_ds2_alleles, remapped_ds2_genotypes = remap_genotypes(ds1, ds2, acgt_alleles=acgt_alleles)
assert remapped_ds2_alleles.shape == (len(common_site_idx), 4)
assert remapped_ds2_genotypes.shape == (len(common_site_idx), ds2.dims["samples"], ds2.dims["ploidy"])
Expand All @@ -131,8 +143,12 @@ def make_compatible_genotypes(ds1, ds2, acgt_alleles=False):
ds1_subset = ds1.copy(deep=True) # TODO: Copying is expensive, another way?
ds1_subset["variant_contig"] = ds1_subset["variant_contig"].isel(variants=ds1_idx)
ds1_subset["variant_position"] = ds1_subset["variant_position"].isel(variants=ds1_idx)
ds1_subset["variant_allele"] = remapped_ds2_alleles
ds1_subset["call_genotype"] = ds1_subset["call_genotype"].isel(variants=ds1_idx)
if acgt_alleles:
ds1_subset["variant_allele"] = remapped_ds1_alleles
ds1_subset["call_genotype"] = remapped_ds1_genotypes
else:
ds1_subset["variant_allele"] = remapped_ds2_alleles
ds1_subset["call_genotype"] = ds1_subset["call_genotype"].isel(variants=ds1_idx)
ds1_subset["call_genotype_mask"] = ds1_subset["call_genotype_mask"].isel(variants=ds1_idx)

# Subset `ds2` to common sites
Expand Down

0 comments on commit bd4d670

Please sign in to comment.