Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 49 additions & 9 deletions figures_and_tables/plot_tool_accuracy_by_allele_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,10 @@ def plot_accuracy_by_allele_size(

ax.set_xlabel("True Allele Size Minus Number of Repeats in Reference Genome", fontsize=14)
if i == 0:
ax.spines.right.set_visible(False)
ax.spines.top.set_visible(False)
if hasattr(ax.spines, 'right'):
ax.spines.right.set_visible(False)
if hasattr(ax.spines, 'top'):
ax.spines.top.set_visible(False)
ax.set_ylabel("Number of Alleles", fontsize=14)
else:
ax.set_ylabel("Fraction of Alleles", fontsize=14)
Expand Down Expand Up @@ -359,6 +361,9 @@ def generate_all_plots(df, args):

output_image_filename += f".{tool}"

if args.compare_loci_with_adjacent_repeats:
output_image_filename += f".loci_with_{df_plot.iloc[0].AdjacentLociLabel}"

coverage_label = f"exome data" if coverage == "exome" else f"{coverage} genome data"

# skip_condition1: HipSTR doesn't support motifs larger than 9bp
Expand Down Expand Up @@ -478,13 +483,17 @@ def main():
g2.add_argument("--show-no-call-loci", action="store_true", help="Show loci with no call")
g2.add_argument("--hide-no-call-loci", action="store_true", help="Hide loci with no call")

p.add_argument("--compare-loci-with-adjacent-repeats", action="store_true",
help="Generate plots for loci with vs. without adjacent repeats")

p.add_argument("combined_tool_results_tsv", nargs="?", default="../tool_comparison/combined.results.alleles.tsv.gz")
args = p.parse_args()

# print command line args from argparse
print("Command line args:")
for arg in vars(args):
print(f"{arg}: {getattr(args, arg)}")
if not args.compare_loci_with_adjacent_repeats:
print("Command line args:")
for arg in vars(args):
print(f"{arg}: {getattr(args, arg)}")

if args.only_print_total_number_of_plots:
df = pd.DataFrame()
Expand All @@ -501,14 +510,45 @@ def main():

print("Computing additional columns...")
df.loc[:, "DiffFromRefRepeats: Allele: Truth (bin)"] = df.apply(bin_num_repeats_wrapper(bin_size=2), axis=1)
for tool in ("ExpansionHunter", "GangSTR", "HipSTR",):
define_hue_column(df, tool)

df = df.sort_values("DiffFromRefRepeats: Allele: Truth")

print(f"Generating {str(args.n) + ' ' if args.n else ''}plots",
f"starting with plot #{args.start_with_plot_i}" if args.start_with_plot_i else "")
generate_all_plots(df, args)

if not args.compare_loci_with_adjacent_repeats:
for tool in ("ExpansionHunter", "GangSTR", "HipSTR",):
define_hue_column(df, tool)


generate_all_plots(df, args)

else:
define_hue_column(df, "ExpansionHunter")

locus_ids_with_adjacent_repeats = set(df[df["AdjacentRepeatCount: ExpansionHunter"] >= 1].LocusId)

for i in 1, 2:
args.tool = "ExpansionHunter"
args.coverage = "40x"
args.q_threshold = 0
args.genotype = "all"
args.min_motif_size = 2
args.max_motif_size = 6
args.show_no_call_loci_option = True

args.output_dir = f"loci_with_adjacent_repeats"
os.system(f"mkdir -p {args.output_dir}")

if i == 1:
current_df = df[df["AdjacentLociLabel"] == "no_adjacent_loci"]
print(f"Generating plot for {len(set(current_df.LocusId)):,d} loci with no adjacent loci")
elif i == 2:
current_df = df[df["AdjacentLociLabel"] == "max_dist_10bp"]
print(f"Generating plot for {len(set(current_df.LocusId)):,d} loci with adjacent loci within 10bp")

current_df = current_df[current_df.LocusId.isin(locus_ids_with_adjacent_repeats)]

generate_all_plots(current_df, args)

print(f"Done")

Expand Down
128 changes: 95 additions & 33 deletions figures_and_tables/plot_tool_accuracy_percent_exactly_right.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,11 @@ def compute_tables_for_fraction_exactly_right_plots(df, coverage_values=("40x",
tables_by_coverage = []
for coverage in coverage_values:
for tool in ("ExpansionHunter", "GangSTR", "HipSTR"):

df_current = df.copy()
df_current = df_current[df_current["Coverage"] == coverage]

if len(df_current) == 0:
print("0 records found for", tool, "at", coverage, "coverage. Skipping...")
continue
df_tool, overall_fraction_exactly_right = compute_fraction_exactly_right(df_current, tool)
df_tool.loc[:, "tool"] = f"{tool}: {coverage} coverage" if len(coverage_values) > 1 else tool
tables_by_coverage.append(df_tool)
Expand All @@ -85,14 +86,41 @@ def compute_tables_for_fraction_exactly_right_plots(df, coverage_values=("40x",
return pd.concat(tables_by_coverage, axis=0)


def generate_fraction_exactly_right_plot(df, args, filter_description, image_name):
def compute_tables_for_fraction_exactly_right_plots_by_adjacent_loci(df, max_dist_values=(
"no_adjacent_loci", "max_dist_10bp", "max_dist_24bp", "max_dist_50bp")):
tool = "ExpansionHunter"
tables_by_adjacent_loci = []
for max_dist_value in max_dist_values:
df_current = df.copy()
df_current = df_current[df_current["AdjacentLociLabel"] == max_dist_value]
if len(df_current) == 0:
print("0 records found for", tool, "at max_dist =", max_dist_value, "bp. Skipping...")
continue

df_tool, overall_fraction_exactly_right = compute_fraction_exactly_right(df_current, tool)
df_tool.loc[:, "tool"] = f"{tool}: {max_dist_value}"
df_tool.loc[:, "AdjacentLociLabel"] = max_dist_value.replace("_", " ")

tables_by_adjacent_loci.append(df_tool)

print(f"Processed {tool:20s} -- {100*overall_fraction_exactly_right:0.1f}% of calls by {tool} "
f" were exactly right for {len(df_current):,d} alleles at {len(set(df_current.LocusId)):,d} loci")

df_fraction = compute_tables_for_fraction_exactly_right_plots(df, coverage_values=("40x", "30x", "20x", "10x"))
return pd.concat(tables_by_adjacent_loci, axis=0)


def generate_fraction_exactly_right_plot(df, args, filter_description, image_name):
if args.by_adjacent_loci:
df_fraction = compute_tables_for_fraction_exactly_right_plots_by_adjacent_loci(df)
else:
df_fraction = compute_tables_for_fraction_exactly_right_plots(df, coverage_values=("40x", "30x", "20x", "10x"))
df_fraction = df_fraction.reset_index()
df_fraction = df_fraction.sort_values("DiffFromRefRepeats: Allele: Truth (bin)", key=lambda c: c.str.split(" ").str[0].astype(int))

coverage_levels_to_plot = []
if args.by_coverage:
if args.by_adjacent_loci:
coverage_levels_to_plot.append("40x")
elif args.by_coverage:
coverage_levels_to_plot.append("all")
elif args.coverage:
coverage_levels_to_plot.append(args.coverage)
Expand All @@ -101,31 +129,40 @@ def generate_fraction_exactly_right_plot(df, args, filter_description, image_nam

for coverage in coverage_levels_to_plot:
df_current = df_fraction.copy()
if coverage == "all":
# exclude HipSTR to make plot clearer
df_current = df_current[~df_current.tool.str.contains("HipSTR")]
df_current = df_current[
df_current.tool.str.contains("40x") | df_current.tool.str.contains("20x") | df_current.tool.str.contains("10x")
]
else:
df_current = df_current[df_current.tool.str.contains(coverage)]
df_current.loc[:, "tool"] = df_current["tool"].apply(lambda s: s.split(":")[0])
if not args.by_adjacent_loci:
if coverage == "all":
# exclude HipSTR to make plot clearer
df_current = df_current[~df_current.tool.str.contains("HipSTR")]
df_current = df_current[
df_current.tool.str.contains("40x") | df_current.tool.str.contains("20x") | df_current.tool.str.contains("10x")
]
else:
df_current = df_current[df_current.tool.str.contains(coverage)]
df_current.loc[:, "tool"] = df_current["tool"].apply(lambda s: s.split(":")[0])

if coverage == "all":
image_name += ".compare_different_coverages"
else:
image_name += f".{coverage}_coverage"

if coverage == "all":
image_name += ".compare_different_coverages"
else:
image_name += f".{coverage}_coverage"

figure_title = f"Accuracy of " + ", ".join(sorted(set([t.split(":")[0] for t in set(df_current.tool)])))
figure_title += "\n\n"
figure_title += f"at {len(set(df.LocusId)):,d} STR loci (" + ", ".join(filter_description) + ")"

print("Plotting", figure_title.replace("\n", " "))

hue_values = set(df_current["tool"])
num_gangstr_colors = len([h for h in hue_values if h.lower().startswith("gangstr")])
num_expansion_hunter_colors = len([h for h in hue_values if h.lower().startswith("expansionhunter")])
num_hipstr_colors = len([h for h in hue_values if h.lower().startswith("hipstr")])
if args.by_adjacent_loci:
hue_values = set(df_current["AdjacentLociLabel"])
print("Hue values:", hue_values)
num_expansion_hunter_colors = len(hue_values)
num_gangstr_colors = 0
num_hipstr_colors = 0
else:
hue_values = set(df_current["tool"])
num_expansion_hunter_colors = len([h for h in hue_values if h.lower().startswith("expansionhunter")])
num_gangstr_colors = len([h for h in hue_values if h.lower().startswith("gangstr")])
num_hipstr_colors = len([h for h in hue_values if h.lower().startswith("hipstr")])

# plot figure
fig, ax = plt.subplots(1, 1, figsize=(args.width, args.height))
Expand All @@ -138,20 +175,27 @@ def hue_order(h):
if args.verbose:
print(f"Unable to parse hue value: {h}: {e}")
return h
if args.by_adjacent_loci:
palette = None
else:
palette = []
if num_expansion_hunter_colors > 0:
palette += list(sns.color_palette("Purples_r", n_colors=num_expansion_hunter_colors) if num_expansion_hunter_colors > 1 else ["#6A51A3"])
if num_gangstr_colors > 0:
palette += list(sns.color_palette("blend:#FF3355,#FFCCCC", n_colors=num_gangstr_colors))
if num_hipstr_colors > 0:
list(sns.color_palette("Oranges_r", n_colors=num_hipstr_colors))

print("len(df_current)", len(df_current))
print("len(df_current)", df_current.groupby("AdjacentLociLabel").count())
sns.pointplot(
data=df_current,
x="DiffFromRefRepeats: Allele: Truth (bin)",
y="FractionExactlyRight",
hue="tool",
hue="tool" if not args.by_adjacent_loci else "AdjacentLociLabel",
scale=0.8,
hue_order=sorted(hue_values, key=hue_order),
palette=(
list(sns.color_palette("Purples_r", n_colors=num_expansion_hunter_colors)
if num_expansion_hunter_colors > 1 else ["#6A51A3"]) +
list(sns.color_palette("blend:#FF3355,#FFCCCC", n_colors=num_gangstr_colors)) +
list(sns.color_palette("Oranges_r", n_colors=num_hipstr_colors))
),
hue_order=sorted(hue_values, key=hue_order) if not args.by_adjacent_loci else sorted(hue_values),
palette=palette,
ax=ax,
)

Expand Down Expand Up @@ -200,6 +244,7 @@ def main():
g = p.add_mutually_exclusive_group()
g.add_argument("--by-coverage", action="store_true", help="Compare different coverages")
g.add_argument("--coverage", help="Plot by coverage", choices=["40x", "30x", "20x", "10x"])
g.add_argument("--by-adjacent-loci", action="store_true", help="Compare different adjacent loci settings")

g = p.add_argument_group("Filters")
g.add_argument("--min-motif-size", type=int, help="Min motif size")
Expand Down Expand Up @@ -241,15 +286,32 @@ def main():

if args.exclude_no_call_loci:
count_before = len(set(df[df["Coverage"] == "40x"].LocusId))
df = df[~df[f"DiffRepeats: Allele: HipSTR - Truth"].isna()]
df = df[~df[f"DiffRepeats: Allele: GangSTR - Truth"].isna()]
if f"DiffRepeats: Allele: HipSTR - Truth" in df.columns:
df = df[~df[f"DiffRepeats: Allele: HipSTR - Truth"].isna()]
if f"DiffRepeats: Allele: GangSTR - Truth" in df.columns:
df = df[~df[f"DiffRepeats: Allele: GangSTR - Truth"].isna()]
df = df[~df[f"DiffRepeats: Allele: ExpansionHunter - Truth"].isna()]
count_discarded = count_before - len(set(df[df["Coverage"] == "40x"].LocusId))
print(f"Discarded {count_discarded:,d} out of {count_before:,d} ({100.0*count_discarded/count_before:0.1f}%) "
f"loci where at least one tool had no call")
filter_description.append(f"excluding no-call loci")
image_name += f".excluding_no_call_loci"

if args.by_adjacent_loci:
df = df[df["AdjacentLociLabel"].isin({"no_adjacent_loci", "max_dist_50bp"})]
adjacent_repeat_count = 1
max_dist = 50
filter_criteria = df["AdjacentRepeatCount: ExpansionHunter"] >= adjacent_repeat_count
filter_criteria &= df["AdjacentRepeatMaxDistance (bp): ExpansionHunter"] <= max_dist
#filter_criteria &= df["MotifSize"] == 3

filter_description.append(f"with at least {adjacent_repeat_count} adjacent locus ≤ {max_dist}bp away")

locus_ids_with_adjacent_repeats = set(df[filter_criteria].LocusId)

image_name += ".by_adjacent_loci"
df = df[df["LocusId"].isin(locus_ids_with_adjacent_repeats)]

if args.verbose:
print("Num loci:")
print(df.groupby(["PositiveOrNegative", "Coverage", "IsPureRepeat"]).count().LocusId/2)
Expand All @@ -264,4 +326,4 @@ def main():


if __name__ == "__main__":
main()
main()
21 changes: 18 additions & 3 deletions run_step_B__generate_variant_catalogs_and_upload_to_bucket.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ suffix=with_overlap_columns
python3 -u scripts/compute_overlap_with_other_catalogs.py --all-repeats --all-motifs ${negative_loci_bed_path} ${output_prefix}.${suffix}.tsv.gz
mv ${output_prefix}.${suffix}.tsv.gz ${output_prefix}.tsv.gz

# compute catalogs with adjacent loci
for max_dist in 10 24 50; do
mkdir -p tool_comparison/variant_catalogs/expansion_hunter_with_adjacent_loci__max_dist_${max_dist}bp
for positive_or_negative in "positive" "negative"; do
python3 -u -m str_analysis.add_adjacent_loci_to_expansion_hunter_catalog tool_comparison/variant_catalogs/expansion_hunter/${positive_or_negative}*.json \
--ref-fasta ./ref/hg38.fa \
--add-extra-fields -d ${max_dist} \
--source-of-adjacent-loci ./ref/other/repeat_specs_GRCh38_without_mismatches.including_homopolymers.sorted.at_least_9bp.bed.gz \
--output-dir ./tool_comparison/variant_catalogs/expansion_hunter_with_adjacent_loci__max_dist_${max_dist}bp/ &
done

wait
done

suffix=with_gencode_v42_columns
python3 -u scripts/compute_overlap_with_gene_models.py ./ref/other/gencode.v42.annotation.sorted.gtf.gz ${output_prefix}.tsv.gz ${output_prefix}.${suffix}.tsv.gz
mv ${output_prefix}.${suffix}.tsv.gz ${output_prefix}.tsv.gz
Expand All @@ -29,9 +43,10 @@ mv ${output_prefix}.${suffix}.tsv.gz ${output_prefix}.tsv.gz
# upload catalogs to bucket
gsutil -q -m cp tool_comparison/variant_catalogs/positive_loci.bed.gz* tool_comparison/variant_catalogs/negative_loci.bed.gz* gs://str-truth-set/hg38/

gsutil -q -m cp -r tool_comparison/variant_catalogs/expansion_hunter gs://str-truth-set/hg38/variant_catalogs/
gsutil -q -m cp -r tool_comparison/variant_catalogs/gangstr gs://str-truth-set/hg38/variant_catalogs/
gsutil -q -m cp -r tool_comparison/variant_catalogs/hipstr gs://str-truth-set/hg38/variant_catalogs/
gsutil -q -m cp -r tool_comparison/variant_catalogs/expansion_hunter gs://str-truth-set/hg38/variant_catalogs/
gsutil -q -m cp -r tool_comparison/variant_catalogs/gangstr gs://str-truth-set/hg38/variant_catalogs/
gsutil -q -m cp -r tool_comparison/variant_catalogs/hipstr gs://str-truth-set/hg38/variant_catalogs/
gsutil -q -m cp -r tool_comparison/variant_catalogs/expansion_hunter_with_adjacent_loci* gs://str-truth-set/hg38/variant_catalogs/


set +x
Expand Down
46 changes: 46 additions & 0 deletions run_step_C__run_tool_pipelines_with_adjacent_loci.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Run tool pipelines using Hail Batch


force=""
#force="--force"

debug=""
#debug="echo"
if [ -z "$debug" ]; then
set -ex
fi

input_bam="gs://broad-public-datasets/CHM1_CHM13_WGS2/CHM1_CHM13_WGS2.cram"
input_bai="gs://broad-public-datasets/CHM1_CHM13_WGS2/CHM1_CHM13_WGS2.cram.crai"

for max_dist in 50 24 10; do
output_dir="gs://str-truth-set/hg38/tool_results/expansion_hunter_with_adjacent_loci__max_dist_${max_dist}bp"
variant_catalog_paths="gs://str-truth-set/hg38/variant_catalogs/expansion_hunter_with_adjacent_loci__max_dist_${max_dist}bp/positive*.with_adjacent_loci.json"
run_reviewer=""
if [ "${max_dist}" == "50" ]; then
echo Running reviewer for max_dist = ${max_dist}
run_reviewer="--run-reviewer"
fi

$debug python3 ./tool_comparison/hail_batch_pipelines/expansion_hunter_pipeline.py \
--input-bam ${input_bam} \
--input-bai ${input_bai} \
--variant-catalog-paths ${variant_catalog_paths} \
--output-dir ${output_dir} \
${run_reviewer} \
--verbose \
${min_locus_coverage_arg} \
${force}

local_dir=./tool_comparison/results_with_adjacent_loci__max_dist_${max_dist}bp

$debug mkdir -p ${local_dir}
$debug gsutil -m cp ${output_dir}/positive_loci/combined.positive_loci.*_json_files.*.tsv.gz ${local_dir}/
done

# Run REViewer
python3 ./tool_comparison/hail_batch_pipelines/expansion_hunter_pipeline.py --positive-loci --run-reviewer \
--input-bam gs://broad-public-datasets/CHM1_CHM13_WGS2/CHM1_CHM13_WGS2.cram --input-bai gs://broad-public-datasets/CHM1_CHM13_WGS2/CHM1_CHM13_WGS2.cram.crai \
--output-dir gs://str-truth-set/hg38/tool_results/expansion_hunter &

echo Done
Loading