Skip to content
Merged
1 change: 1 addition & 0 deletions .github/.dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ workflows:
filters:
branches:
- main
- kj_cpx_cutoff_modification
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kjaisingh please undo this in another PR; no need to have a PR only for this, it can go as part of an existing PR.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Vahid, thanks for pointing this out - yes, I made #866 in light of this, but will include it in another PR if I merge one soon. Apologies for the inconvenience.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I would remove it as part of one of your relevant PRs.

tags:
- /.*/

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
MIN_SIZE = 1000
MIN_DIFF = 0.4
MIN_SIZE_IDEL = 150
MIN_DDUP_THRESH = 1000000


def interval_string(chrom, start, end):
Expand Down Expand Up @@ -411,7 +410,7 @@ def read_vcf(vcf_path, cleaned_genotype_counts_vids):
return {r.id: VcfRecord(r, end_dict) for r in f if r.id in cleaned_genotype_counts_vids}


def final_assessment(cleaned_genotype_counts, variants_to_reclassify):
def final_assessment(cleaned_genotype_counts, variants_to_reclassify, min_ddup_thresh):

def _subtract_interval(start1, end1, start2, end2):
# Spanning case results in null (shouldn't ever do this in this context)
Expand Down Expand Up @@ -577,7 +576,7 @@ def _is_ctx_ins(t):

# Solve inverted dispersed duplications vs dupINV / dupINVdel or INVdup / delINVdup
# DUP5/INS3 or dupINV / dupINVdel
def _evaluate_dup5_ins3(r, records_list, default_sv_type, default_cpx_type):
def _evaluate_dup5_ins3(r, records_list, default_sv_type, default_cpx_type, min_ddup_thresh):
# Get duplication/insertion interval
dup_chrom = r.source_chrom
dup_start = r.source_start
Expand All @@ -601,7 +600,7 @@ def _evaluate_dup5_ins3(r, records_list, default_sv_type, default_cpx_type):
inv_end = r.sink_end
inv_size = inv_end - inv_start
# If inversion length < min_ddup_thresh, classify as dupINV or dupINVdel
if inv_size < MIN_DDUP_THRESH:
if inv_size < min_ddup_thresh:
# If sink is deleted, classify as dupINVdel
if sink_is_del:
# Revise inv interval (subtracting del interval)
Expand Down Expand Up @@ -685,7 +684,7 @@ def _evaluate_dup5_ins3(r, records_list, default_sv_type, default_cpx_type):
)

# DUP3/INS5 or INVdup / delINVdup
def _evaluate_dup3_ins5(r, records_list, default_sv_type, default_cpx_type):
def _evaluate_dup3_ins5(r, records_list, default_sv_type, default_cpx_type, min_ddup_thresh):
# Get duplication/insertion interval
dup_chrom = r.source_chrom
dup_start = r.source_start
Expand All @@ -709,7 +708,7 @@ def _evaluate_dup3_ins5(r, records_list, default_sv_type, default_cpx_type):
inv_end = dup_end
inv_size = inv_end - inv_start
# If inversion length < min_ddup_thresh, classify as INVdup or delINVdup
if inv_size < MIN_DDUP_THRESH:
if inv_size < min_ddup_thresh:
# If sink is deleted, classify as delINVdup
if sink_is_del:
# Revise inv interval (subtracting del interval)
Expand Down Expand Up @@ -959,9 +958,9 @@ def _evaluate_irrelevant(r, default_sv_type, default_cpx_type):
if _is_ctx_ins(cpx_type):
result = _evaluate_ctx_ins(vcf_record, records, sv_type, cpx_type)
elif cpx_type == "DUP5/INS3":
result = _evaluate_dup5_ins3(vcf_record, records, sv_type, cpx_type)
result = _evaluate_dup5_ins3(vcf_record, records, sv_type, cpx_type, min_ddup_thresh)
elif cpx_type == "DUP3/INS5":
result = _evaluate_dup3_ins5(vcf_record, records, sv_type, cpx_type)
result = _evaluate_dup3_ins5(vcf_record, records, sv_type, cpx_type, min_ddup_thresh)
else:
result = _evaluate_ins_idel(vcf_record, records, sv_type, cpx_type)
elif sv_type == "BND":
Expand Down Expand Up @@ -1124,6 +1123,7 @@ def _parse_arguments(argv: List[Text]) -> argparse.Namespace:
parser.add_argument('--ped', type=str, help='PED family file')
parser.add_argument('--out', type=str, help='Output file')
parser.add_argument('--reclassification-table', type=str, help='Output reclassification table path', required=False)
parser.add_argument('--min-ddup-thresh', type=int, help="Min DUP threshold", default=5000)
parser.add_argument('--chrx', type=str, help='Chromosome X contig name', default='chrX')
parser.add_argument('--chry', type=str, help='Chromosome Y contig name', default='chrY')
parser.add_argument("-l", "--log-level", required=False, default="INFO",
Expand Down Expand Up @@ -1162,7 +1162,7 @@ def main(argv: Optional[List[Text]] = None):
cleaned_genotype_counts = clean_up_intervals(genotype_counts, intervals_list_dict, genotype_counts_tree_dict)
del genotype_counts, intervals_list_dict, genotype_counts_tree_dict
variants_to_reclassify = read_vcf(args.vcf, cleaned_genotype_counts_vids=cleaned_genotype_counts.keys())
assessment = list(final_assessment(cleaned_genotype_counts, variants_to_reclassify))
assessment = list(final_assessment(cleaned_genotype_counts, variants_to_reclassify, args.min_ddup_thresh))
del cleaned_genotype_counts, variants_to_reclassify
if args.reclassification_table:
write_reclassification_table(args.reclassification_table, assessment)
Expand Down
2 changes: 2 additions & 0 deletions wdl/GenotypeComplexVariants.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ workflow GenotypeComplexVariants {

Boolean merge_vcfs = false
Int? records_per_shard
Int? min_ddup_thresh

Array[File] complex_resolve_vcfs
Array[File] complex_resolve_vcf_indexes
Expand Down Expand Up @@ -85,6 +86,7 @@ workflow GenotypeComplexVariants {
n_per_split_small=2500,
n_per_split_large=250,
n_rd_test_bins=100000,
min_ddup_thresh=min_ddup_thresh,
prefix="~{cohort_name}.~{contig}",
contig=contig,
ped_files=SubsetPedFile.ped_subset_file,
Expand Down
4 changes: 4 additions & 0 deletions wdl/GenotypeCpxCnvs.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ workflow GenotypeCpxCnvs {
Int n_per_split_small
Int n_per_split_large
Int n_rd_test_bins
Int? min_ddup_thresh
String prefix
File ped_file
String contig
Expand Down Expand Up @@ -94,6 +95,7 @@ workflow GenotypeCpxCnvs {
vcf=vcf,
intervals=GetCpxCnvIntervals.cpx_cnv_bed,
genotypes=MergeMeltedGts.outfile,
min_ddup_thresh=min_ddup_thresh,
prefix=contig_prefix,
ped_file=ped_file,
contig=contig,
Expand Down Expand Up @@ -166,6 +168,7 @@ task ParseGenotypes {
File vcf
File intervals
File genotypes
Int? min_ddup_thresh
File ped_file
String prefix
String contig
Expand Down Expand Up @@ -201,6 +204,7 @@ task ParseGenotypes {
--vcf ~{vcf} \
--intervals ~{intervals} \
--genotypes ~{genotypes} \
~{if defined(min_ddup_thresh) then "--min-ddup-thresh " + min_ddup_thresh else ""} \
--ped ~{ped_file} \
--out out.vcf.gz \
--reclassification-table ~{prefix}.CPXregenotyping_reclassification_table.~{contig}.txt
Expand Down
2 changes: 2 additions & 0 deletions wdl/ScatterCpxGenotyping.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ workflow ScatterCpxGenotyping {
Int n_per_split_small
Int n_per_split_large
Int n_rd_test_bins
Int? min_ddup_thresh
String prefix
File ped_file
String contig
Expand Down Expand Up @@ -73,6 +74,7 @@ workflow ScatterCpxGenotyping {
n_per_split_large=n_per_split_large,
n_per_split_small=n_per_split_small,
n_rd_test_bins=n_rd_test_bins,
min_ddup_thresh=min_ddup_thresh,
prefix=prefix,
ped_file=ped_file,
contig=contig,
Expand Down
3 changes: 3 additions & 0 deletions website/docs/modules/genotype_complex.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ downstream workflows.
#### <HighlightOptionalArg>Optional</HighlightOptionalArg> `localize_shard_size`
Default: `50000`. Shard size for parallel computations. Decreasing this parameter may help reduce run time.

#### <HighlightOptionalArg>Optional</HighlightOptionalArg> `min_ddup_thresh`
Default: `5000`. Minimum size threshold used to classify a dispersed duplication.

#### `complex_resolve_vcfs`
Array of contig-sharded VCFs containing putative complex variants, generated in [ResolveComplexVariants](./rcv#complex_resolve_vcfs).

Expand Down