broadinstitute · bw2 · Aug 10, 2023
diff --git a/figures_and_tables/plot_tool_accuracy_by_allele_size.py b/figures_and_tables/plot_tool_accuracy_by_allele_size.py
@@ -139,8 +139,10 @@ def plot_accuracy_by_allele_size(
 
         ax.set_xlabel("True Allele Size Minus Number of Repeats in Reference Genome", fontsize=14)
         if i == 0:
-            ax.spines.right.set_visible(False)
-            ax.spines.top.set_visible(False)
+            if hasattr(ax.spines, 'right'):
+                ax.spines.right.set_visible(False)
+            if hasattr(ax.spines, 'top'):
+                ax.spines.top.set_visible(False)
             ax.set_ylabel("Number of Alleles", fontsize=14)
         else:
             ax.set_ylabel("Fraction of Alleles", fontsize=14)
@@ -359,6 +361,9 @@ def generate_all_plots(df, args):
 
                                     output_image_filename += f".{tool}"
 
+                                    if args.compare_loci_with_adjacent_repeats:
+                                        output_image_filename += f".loci_with_{df_plot.iloc[0].AdjacentLociLabel}"
+
                                     coverage_label = f"exome data" if coverage == "exome" else f"{coverage} genome data"
 
                                     # skip_condition1: HipSTR doesn't support motifs larger than 9bp
@@ -478,13 +483,17 @@ def main():
     g2.add_argument("--show-no-call-loci", action="store_true", help="Show loci with no call")
     g2.add_argument("--hide-no-call-loci", action="store_true", help="Hide loci with no call")
 
+    p.add_argument("--compare-loci-with-adjacent-repeats", action="store_true",
+                   help="Generate plots for loci with vs. without adjacent repeats")
+
     p.add_argument("combined_tool_results_tsv", nargs="?", default="../tool_comparison/combined.results.alleles.tsv.gz")
     args = p.parse_args()
 
     # print command line args from argparse
-    print("Command line args:")
-    for arg in vars(args):
-        print(f"{arg}: {getattr(args, arg)}")
+    if not args.compare_loci_with_adjacent_repeats:
+        print("Command line args:")
+        for arg in vars(args):
+            print(f"{arg}: {getattr(args, arg)}")
 
     if args.only_print_total_number_of_plots:
         df = pd.DataFrame()
@@ -501,14 +510,45 @@ def main():
 
     print("Computing additional columns...")
     df.loc[:, "DiffFromRefRepeats: Allele: Truth (bin)"] = df.apply(bin_num_repeats_wrapper(bin_size=2), axis=1)
-    for tool in ("ExpansionHunter", "GangSTR", "HipSTR",):
-        define_hue_column(df, tool)
-
     df = df.sort_values("DiffFromRefRepeats: Allele: Truth")
 
     print(f"Generating {str(args.n) + ' ' if args.n else ''}plots",
           f"starting with plot #{args.start_with_plot_i}" if args.start_with_plot_i else "")
-    generate_all_plots(df, args)
+
+    if not args.compare_loci_with_adjacent_repeats:
+        for tool in ("ExpansionHunter", "GangSTR", "HipSTR",):
+            define_hue_column(df, tool)
+
+
+        generate_all_plots(df, args)
+
+    else:
+        define_hue_column(df, "ExpansionHunter")
+
+        locus_ids_with_adjacent_repeats = set(df[df["AdjacentRepeatCount: ExpansionHunter"] >= 1].LocusId)
+
+        for i in 1, 2:
+            args.tool = "ExpansionHunter"
+            args.coverage = "40x"
+            args.q_threshold = 0
+            args.genotype = "all"
+            args.min_motif_size = 2
+            args.max_motif_size = 6
+            args.show_no_call_loci_option = True
+
+            args.output_dir = f"loci_with_adjacent_repeats"
+            os.system(f"mkdir -p {args.output_dir}")
+
+            if i == 1:
+                current_df = df[df["AdjacentLociLabel"] == "no_adjacent_loci"]
+                print(f"Generating plot for {len(set(current_df.LocusId)):,d} loci with no adjacent loci")
+            elif i == 2:
+                current_df = df[df["AdjacentLociLabel"] == "max_dist_10bp"]
+                print(f"Generating plot for {len(set(current_df.LocusId)):,d} loci with adjacent loci within 10bp")
+
+            current_df = current_df[current_df.LocusId.isin(locus_ids_with_adjacent_repeats)]
+
+            generate_all_plots(current_df, args)
 
     print(f"Done")
 

diff --git a/figures_and_tables/plot_tool_accuracy_percent_exactly_right.py b/figures_and_tables/plot_tool_accuracy_percent_exactly_right.py
@@ -70,10 +70,11 @@ def compute_tables_for_fraction_exactly_right_plots(df, coverage_values=("40x",
     tables_by_coverage = []
     for coverage in coverage_values:
         for tool in ("ExpansionHunter", "GangSTR", "HipSTR"):
-
             df_current = df.copy()
             df_current = df_current[df_current["Coverage"] == coverage]
-
+            if len(df_current) == 0:
+                print("0 records found for", tool, "at", coverage, "coverage. Skipping...")
+                continue
             df_tool, overall_fraction_exactly_right = compute_fraction_exactly_right(df_current, tool)
             df_tool.loc[:, "tool"] = f"{tool}: {coverage} coverage" if len(coverage_values) > 1 else tool
             tables_by_coverage.append(df_tool)
@@ -85,14 +86,41 @@ def compute_tables_for_fraction_exactly_right_plots(df, coverage_values=("40x",
     return pd.concat(tables_by_coverage, axis=0)
 
 
-def generate_fraction_exactly_right_plot(df, args, filter_description, image_name):
+def compute_tables_for_fraction_exactly_right_plots_by_adjacent_loci(df, max_dist_values=(
+        "no_adjacent_loci", "max_dist_10bp", "max_dist_24bp", "max_dist_50bp")):
+    tool = "ExpansionHunter"
+    tables_by_adjacent_loci = []
+    for max_dist_value in max_dist_values:
+        df_current = df.copy()
+        df_current = df_current[df_current["AdjacentLociLabel"] == max_dist_value]
+        if len(df_current) == 0:
+            print("0 records found for", tool, "at max_dist =", max_dist_value, "bp. Skipping...")
+            continue
+
+        df_tool, overall_fraction_exactly_right = compute_fraction_exactly_right(df_current, tool)
+        df_tool.loc[:, "tool"] = f"{tool}: {max_dist_value}"
+        df_tool.loc[:, "AdjacentLociLabel"] = max_dist_value.replace("_", " ")
+
+        tables_by_adjacent_loci.append(df_tool)
+
+        print(f"Processed {tool:20s} --  {100*overall_fraction_exactly_right:0.1f}% of calls by {tool} "
+              f" were exactly right for {len(df_current):,d} alleles at {len(set(df_current.LocusId)):,d} loci")
 
-    df_fraction = compute_tables_for_fraction_exactly_right_plots(df, coverage_values=("40x", "30x", "20x", "10x"))
+    return pd.concat(tables_by_adjacent_loci, axis=0)
+
+
+def generate_fraction_exactly_right_plot(df, args, filter_description, image_name):
+    if args.by_adjacent_loci:
+        df_fraction = compute_tables_for_fraction_exactly_right_plots_by_adjacent_loci(df)
+    else:
+        df_fraction = compute_tables_for_fraction_exactly_right_plots(df, coverage_values=("40x", "30x", "20x", "10x"))
     df_fraction = df_fraction.reset_index()
     df_fraction = df_fraction.sort_values("DiffFromRefRepeats: Allele: Truth (bin)", key=lambda c: c.str.split(" ").str[0].astype(int))
 
     coverage_levels_to_plot = []
-    if args.by_coverage:
+    if args.by_adjacent_loci:
+        coverage_levels_to_plot.append("40x")
+    elif args.by_coverage:
         coverage_levels_to_plot.append("all")
     elif args.coverage:
         coverage_levels_to_plot.append(args.coverage)
@@ -101,31 +129,40 @@ def generate_fraction_exactly_right_plot(df, args, filter_description, image_nam
 
     for coverage in coverage_levels_to_plot:
         df_current = df_fraction.copy()
-        if coverage == "all":
-            # exclude HipSTR to make plot clearer
-            df_current = df_current[~df_current.tool.str.contains("HipSTR")]
-            df_current = df_current[
-                df_current.tool.str.contains("40x") | df_current.tool.str.contains("20x") | df_current.tool.str.contains("10x")
-            ]
-        else:
-            df_current = df_current[df_current.tool.str.contains(coverage)]
-            df_current.loc[:, "tool"] = df_current["tool"].apply(lambda s: s.split(":")[0])
+        if not args.by_adjacent_loci:
+            if coverage == "all":
+                # exclude HipSTR to make plot clearer
+                df_current = df_current[~df_current.tool.str.contains("HipSTR")]
+                df_current = df_current[
+                    df_current.tool.str.contains("40x") | df_current.tool.str.contains("20x") | df_current.tool.str.contains("10x")
+                ]
+            else:
+                df_current = df_current[df_current.tool.str.contains(coverage)]
+                df_current.loc[:, "tool"] = df_current["tool"].apply(lambda s: s.split(":")[0])
+
+            if coverage == "all":
+                image_name += ".compare_different_coverages"
+            else:
+                image_name += f".{coverage}_coverage"
 
-        if coverage == "all":
-            image_name += ".compare_different_coverages"
-        else:
-            image_name += f".{coverage}_coverage"
 
         figure_title = f"Accuracy of " + ", ".join(sorted(set([t.split(":")[0] for t in set(df_current.tool)])))
         figure_title += "\n\n"
         figure_title += f"at {len(set(df.LocusId)):,d} STR loci (" + ", ".join(filter_description) + ")"
 
         print("Plotting", figure_title.replace("\n", " "))
 
-        hue_values = set(df_current["tool"])
-        num_gangstr_colors = len([h for h in hue_values if h.lower().startswith("gangstr")])
-        num_expansion_hunter_colors = len([h for h in hue_values if h.lower().startswith("expansionhunter")])
-        num_hipstr_colors = len([h for h in hue_values if h.lower().startswith("hipstr")])
+        if args.by_adjacent_loci:
+            hue_values = set(df_current["AdjacentLociLabel"])
+            print("Hue values:", hue_values)
+            num_expansion_hunter_colors = len(hue_values)
+            num_gangstr_colors = 0
+            num_hipstr_colors = 0
+        else:
+            hue_values = set(df_current["tool"])
+            num_expansion_hunter_colors = len([h for h in hue_values if h.lower().startswith("expansionhunter")])
+            num_gangstr_colors = len([h for h in hue_values if h.lower().startswith("gangstr")])
+            num_hipstr_colors = len([h for h in hue_values if h.lower().startswith("hipstr")])
 
         # plot figure
         fig, ax = plt.subplots(1, 1, figsize=(args.width, args.height))
@@ -138,20 +175,27 @@ def hue_order(h):
                 if args.verbose:
                     print(f"Unable to parse hue value: {h}: {e}")
                 return h
+        if args.by_adjacent_loci:
+            palette = None
+        else:
+            palette = []
+            if num_expansion_hunter_colors > 0:
+                palette += list(sns.color_palette("Purples_r", n_colors=num_expansion_hunter_colors) if num_expansion_hunter_colors > 1 else ["#6A51A3"])
+            if num_gangstr_colors > 0:
+                palette += list(sns.color_palette("blend:#FF3355,#FFCCCC", n_colors=num_gangstr_colors))
+            if num_hipstr_colors > 0:
+                list(sns.color_palette("Oranges_r", n_colors=num_hipstr_colors))
 
+        print("len(df_current)", len(df_current))
+        print("len(df_current)", df_current.groupby("AdjacentLociLabel").count())
         sns.pointplot(
             data=df_current,
             x="DiffFromRefRepeats: Allele: Truth (bin)",
             y="FractionExactlyRight",
-            hue="tool",
+            hue="tool" if not args.by_adjacent_loci else "AdjacentLociLabel",
             scale=0.8,
-            hue_order=sorted(hue_values, key=hue_order),
-            palette=(
-                list(sns.color_palette("Purples_r", n_colors=num_expansion_hunter_colors)
-                     if num_expansion_hunter_colors > 1 else ["#6A51A3"]) +
-                list(sns.color_palette("blend:#FF3355,#FFCCCC", n_colors=num_gangstr_colors)) +
-                list(sns.color_palette("Oranges_r", n_colors=num_hipstr_colors))
-            ),
+            hue_order=sorted(hue_values, key=hue_order) if not args.by_adjacent_loci else sorted(hue_values),
+            palette=palette,
             ax=ax,
         )
 
@@ -200,6 +244,7 @@ def main():
     g = p.add_mutually_exclusive_group()
     g.add_argument("--by-coverage", action="store_true", help="Compare different coverages")
     g.add_argument("--coverage", help="Plot by coverage", choices=["40x", "30x", "20x", "10x"])
+    g.add_argument("--by-adjacent-loci", action="store_true", help="Compare different adjacent loci settings")
 
     g = p.add_argument_group("Filters")
     g.add_argument("--min-motif-size", type=int, help="Min motif size")
@@ -241,15 +286,32 @@ def main():
 
     if args.exclude_no_call_loci:
         count_before = len(set(df[df["Coverage"] == "40x"].LocusId))
-        df = df[~df[f"DiffRepeats: Allele: HipSTR - Truth"].isna()]
-        df = df[~df[f"DiffRepeats: Allele: GangSTR - Truth"].isna()]
+        if f"DiffRepeats: Allele: HipSTR - Truth" in df.columns:
+            df = df[~df[f"DiffRepeats: Allele: HipSTR - Truth"].isna()]
+        if f"DiffRepeats: Allele: GangSTR - Truth" in df.columns:
+            df = df[~df[f"DiffRepeats: Allele: GangSTR - Truth"].isna()]
         df = df[~df[f"DiffRepeats: Allele: ExpansionHunter - Truth"].isna()]
         count_discarded = count_before - len(set(df[df["Coverage"] == "40x"].LocusId))
         print(f"Discarded {count_discarded:,d} out of {count_before:,d} ({100.0*count_discarded/count_before:0.1f}%) "
               f"loci where at least one tool had no call")
         filter_description.append(f"excluding no-call loci")
         image_name += f".excluding_no_call_loci"
 
+    if args.by_adjacent_loci:
+        df = df[df["AdjacentLociLabel"].isin({"no_adjacent_loci", "max_dist_50bp"})]
+        adjacent_repeat_count = 1
+        max_dist = 50
+        filter_criteria = df["AdjacentRepeatCount: ExpansionHunter"] >= adjacent_repeat_count
+        filter_criteria &= df["AdjacentRepeatMaxDistance (bp): ExpansionHunter"] <= max_dist
+        #filter_criteria &= df["MotifSize"] == 3
+
+        filter_description.append(f"with at least {adjacent_repeat_count} adjacent locus ≤ {max_dist}bp away")
+
+        locus_ids_with_adjacent_repeats = set(df[filter_criteria].LocusId)
+
+        image_name += ".by_adjacent_loci"
+        df = df[df["LocusId"].isin(locus_ids_with_adjacent_repeats)]
+
     if args.verbose:
         print("Num loci:")
         print(df.groupby(["PositiveOrNegative", "Coverage", "IsPureRepeat"]).count().LocusId/2)
@@ -264,4 +326,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/run_step_B__generate_variant_catalogs_and_upload_to_bucket.sh b/run_step_B__generate_variant_catalogs_and_upload_to_bucket.sh
@@ -18,6 +18,20 @@ suffix=with_overlap_columns
 python3 -u scripts/compute_overlap_with_other_catalogs.py --all-repeats --all-motifs ${negative_loci_bed_path}  ${output_prefix}.${suffix}.tsv.gz
 mv ${output_prefix}.${suffix}.tsv.gz  ${output_prefix}.tsv.gz
 
+# compute catalogs with adjacent loci
+for max_dist in 10 24 50; do 
+    mkdir -p tool_comparison/variant_catalogs/expansion_hunter_with_adjacent_loci__max_dist_${max_dist}bp
+    for positive_or_negative in "positive" "negative"; do
+	python3 -u -m str_analysis.add_adjacent_loci_to_expansion_hunter_catalog tool_comparison/variant_catalogs/expansion_hunter/${positive_or_negative}*.json \
+		            --ref-fasta ./ref/hg38.fa \
+		            --add-extra-fields -d ${max_dist} \
+		            --source-of-adjacent-loci ./ref/other/repeat_specs_GRCh38_without_mismatches.including_homopolymers.sorted.at_least_9bp.bed.gz \
+		            --output-dir ./tool_comparison/variant_catalogs/expansion_hunter_with_adjacent_loci__max_dist_${max_dist}bp/ &
+    done
+
+    wait
+done
+
 suffix=with_gencode_v42_columns
 python3 -u scripts/compute_overlap_with_gene_models.py ./ref/other/gencode.v42.annotation.sorted.gtf.gz  ${output_prefix}.tsv.gz  ${output_prefix}.${suffix}.tsv.gz
 mv ${output_prefix}.${suffix}.tsv.gz  ${output_prefix}.tsv.gz
@@ -29,9 +43,10 @@ mv ${output_prefix}.${suffix}.tsv.gz  ${output_prefix}.tsv.gz
 # upload catalogs to bucket
 gsutil -q -m cp tool_comparison/variant_catalogs/positive_loci.bed.gz* tool_comparison/variant_catalogs/negative_loci.bed.gz* gs://str-truth-set/hg38/
 
-gsutil -q -m cp -r tool_comparison/variant_catalogs/expansion_hunter  gs://str-truth-set/hg38/variant_catalogs/
-gsutil -q -m cp -r tool_comparison/variant_catalogs/gangstr           gs://str-truth-set/hg38/variant_catalogs/
-gsutil -q -m cp -r tool_comparison/variant_catalogs/hipstr            gs://str-truth-set/hg38/variant_catalogs/
+gsutil -q -m cp -r tool_comparison/variant_catalogs/expansion_hunter   gs://str-truth-set/hg38/variant_catalogs/
+gsutil -q -m cp -r tool_comparison/variant_catalogs/gangstr            gs://str-truth-set/hg38/variant_catalogs/
+gsutil -q -m cp -r tool_comparison/variant_catalogs/hipstr             gs://str-truth-set/hg38/variant_catalogs/
+gsutil -q -m cp -r tool_comparison/variant_catalogs/expansion_hunter_with_adjacent_loci*   gs://str-truth-set/hg38/variant_catalogs/
 
 
 set +x

diff --git a/run_step_C__run_tool_pipelines_with_adjacent_loci.sh b/run_step_C__run_tool_pipelines_with_adjacent_loci.sh
@@ -0,0 +1,46 @@
+# Run tool pipelines using Hail Batch
+
+
+force=""
+#force="--force"
+
+debug=""
+#debug="echo"
+if [ -z "$debug" ]; then
+  set -ex
+fi
+
+input_bam="gs://broad-public-datasets/CHM1_CHM13_WGS2/CHM1_CHM13_WGS2.cram"
+input_bai="gs://broad-public-datasets/CHM1_CHM13_WGS2/CHM1_CHM13_WGS2.cram.crai"
+
+for max_dist in 50 24 10; do
+    output_dir="gs://str-truth-set/hg38/tool_results/expansion_hunter_with_adjacent_loci__max_dist_${max_dist}bp"
+    variant_catalog_paths="gs://str-truth-set/hg38/variant_catalogs/expansion_hunter_with_adjacent_loci__max_dist_${max_dist}bp/positive*.with_adjacent_loci.json"
+    run_reviewer=""
+    if [ "${max_dist}" == "50" ]; then
+	echo Running reviewer for max_dist = ${max_dist}
+	run_reviewer="--run-reviewer"
+    fi
+
+    $debug python3 ./tool_comparison/hail_batch_pipelines/expansion_hunter_pipeline.py \
+       --input-bam ${input_bam} \
+       --input-bai ${input_bai} \
+       --variant-catalog-paths ${variant_catalog_paths} \
+       --output-dir ${output_dir} \
+       ${run_reviewer} \
+       --verbose \
+       ${min_locus_coverage_arg} \
+       ${force} 
+
+    local_dir=./tool_comparison/results_with_adjacent_loci__max_dist_${max_dist}bp
+
+    $debug mkdir -p ${local_dir}
+    $debug gsutil -m cp ${output_dir}/positive_loci/combined.positive_loci.*_json_files.*.tsv.gz  ${local_dir}/
+done
+
+# Run REViewer
+python3 ./tool_comparison/hail_batch_pipelines/expansion_hunter_pipeline.py  --positive-loci --run-reviewer \
+	--input-bam gs://broad-public-datasets/CHM1_CHM13_WGS2/CHM1_CHM13_WGS2.cram --input-bai gs://broad-public-datasets/CHM1_CHM13_WGS2/CHM1_CHM13_WGS2.cram.crai \
+        --output-dir gs://str-truth-set/hg38/tool_results/expansion_hunter &
+
+echo Done