Skip to content

Commit 54f7f90

Browse files
authored
Merge pull request #63 from neherlab/fix/issue-62
Fix/issue 62
2 parents b11ef76 + 3ca68f3 commit 54f7f90

File tree

6 files changed

+108
-20
lines changed

6 files changed

+108
-20
lines changed

Diff for: CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# PanGraph Changelog
22

3+
## v0.7.3
4+
5+
- bugfix in graph building: a particular edge case caused minor inconsistencies in the block alignment when merging graphs; see issue [#62](https://github.com/neherlab/pangraph/issues/62) and PR [#63](https://github.com/neherlab/pangraph/pull/63).
6+
37
## v0.7.2
48

59
- minor fix in tree midpoint rooting during panX export, see [#59](https://github.com/neherlab/pangraph/issues/59).

Diff for: Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "PanGraph"
22
uuid = "0f9f61ca-f32c-45e1-b3bc-00138f4f8814"
33
authors = ["Nicholas Noll <[email protected]>"]
4-
version = "0.7.2"
4+
version = "0.7.3"
55

66
[deps]
77
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

Diff for: docs/src/cli/build.md

+2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ Build a multiple sequence alignment pangraph.
1717
| alignment kernel | String | k | alignment-kernel | only accepts "minimap2" or "mmseqs" |
1818
| kmer length (mmseqs) | Integer | K | kmer-length | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default. |
1919
| consistency check | Boolean | t | test | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
20+
| verbose mode | Boolean | v | verbose | toggle to activate verbose mode |
21+
| debug mode | Boolean | D | debug | toggle to activate debug mode: during merging, intermediate graphs are saved in the `debug` folder |
2022
| random seed | Int | r | random-seed | random seed for pangraph construction. |
2123

2224
## Arguments

Diff for: src/align.jl

+17-9
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,7 @@ function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Funct
463463
merges = preprocess(hits, skip, energy, block)
464464
length(merges) > 0 || break
465465

466+
# dictionary block-id => block
466467
blocks = align_kernel(merges, minblock, replace, verbose)
467468
merge!(blocks, G₀.block)
468469

@@ -593,6 +594,7 @@ function align_pair(G₁::Graph, G₂::Graph, energy::Function, minblock::Int, a
593594

594595
merges = preprocess(hits, skip, energy, block)
595596

597+
# dictionary block-id => block
596598
blocks = align_kernel(merges, minblock, replace, verbose)
597599
sequence = merge(G₁.sequence, G₂.sequence)
598600

@@ -618,7 +620,7 @@ end
618620
# TODO: the associative array is a bit hacky...
619621
# can we push it directly into the channel?
620622
"""
621-
align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100)
623+
align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0, debugdir=nothing)
622624
623625
Aligns a collection of graphs `Gs` using the specified `aligner` function to recover hits.
624626
Graphs are aligned following an internal guide tree, generated using kmer distance.
@@ -631,7 +633,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co
631633
632634
`compare` is the function to be used to generate pairwise distances that generate the internal guide tree.
633635
"""
634-
function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0)
636+
function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0, debugdir=nothing)
635637
function verify(graph; msg="")
636638
if reference !== nothing
637639
for (name,path) graph.sequence
@@ -718,20 +720,26 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
718720
# the lock ensures that at most N=Threads.nthreads() processes are
719721
# spawning run(`cmd`) instances at the same time
720722
G₀ = lock_semaphore(s) do
721-
verbose && log("--> align-pair for clade n. $n_clade")
723+
verbose && log("--> align-pair for clade n. $n_clade from $(Pₗ) and $(Pᵣ)")
722724
G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, verbose)
723725
verbose && log("--> align-self for clade n. $n_clade")
724726
G₀ = align_self(G₀, energy, minblock, aligner, verify, verbose, maxiter=maxiter)
725727
verbose && log("--> graph merging for clade n. $n_clade completed")
726728
G₀
727729
end
728730

729-
# DEBUG : save graph at each iteration in a file
730-
# open("issue/comp/graph_iteration_$(n_clade).json", "w") do io
731-
# finalize!(G₀)
732-
# marshal(io, G₀; fmt=:json)
733-
# end
734-
731+
# if debug: save graph at each iteration in a file
732+
if debugdir !== nothing
733+
open("$(debugdir)/graph_iteration_$(n_clade).json", "w") do io
734+
finalize!(G₀)
735+
marshal(io, G₀; fmt=:json)
736+
end
737+
# if the file exists, remove input intermediate files that were successfully merged
738+
fl = "$(debugdir)/graph_iteration_$(Pₗ).json"
739+
fr = "$(debugdir)/graph_iteration_$(Pᵣ).json"
740+
isfile(fl) && rm(fl)
741+
isfile(fr) && rm(fr)
742+
end
735743

736744
# advance progress bar in a thread-safe way
737745
lock(meter_lock) do

Diff for: src/block.jl

+47-9
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,22 @@ function reconsensus!(b::Block)
957957
return true
958958
end
959959

960+
# """
961+
# debug function that saves all of the block information on a text file.
962+
# """
963+
# function all_block_info(io::IO, b::Block)
964+
# println(io, "Block ID: ", b.uuid)
965+
# println(io, "Sequence: ", b.sequence .|> Char |> join)
966+
# println(io, "Gaps: ", b.gaps)
967+
# for (name, dict) in zip(["Mutations", "Insertions", "Deletions"], [b.mutate, b.insert, b.delete])
968+
# println(io, "$name ---")
969+
# for (node, value) in dict
970+
# println(io, "\tNode: ", hash(node))
971+
# println(io, "\t\t", value)
972+
# end
973+
# end
974+
# end
975+
960976
# TODO: align consensus sequences within overlapping gaps of qry and ref.
961977
# right now we parsimoniously stuff all sequences at the beginning of gaps
962978
# problems:
@@ -998,7 +1014,6 @@ function rereference(qry::Block, ref::Block, segments)
9981014
# TODO: allow for (-) hamming alignments
9991015
gap = gapconsensus(qry, x.qry-1)
10001016
pos = hamming_align(gap, ref.sequence[Δ])-1
1001-
10021017
newgap =.stop, 0)
10031018

10041019
for node keys(qry)
@@ -1015,7 +1030,7 @@ function rereference(qry::Block, ref::Block, segments)
10151030
start = Δ.start + pos + δ
10161031
stop = start + length(ins) - 1
10171032

1018-
if 1 start Δ.stop
1033+
if 1 start Δ.stop
10191034
for i start:min.stop,stop)
10201035
if ins[i-start+1] != ref.sequence[i]
10211036
if node keys(combined.mutate)
@@ -1080,17 +1095,40 @@ function rereference(qry::Block, ref::Block, segments)
10801095
end
10811096

10821097
newgap = nothing
1083-
else
1098+
else # no insertions in qry, just add as new deletion
10841099
newdeletes = DelDict(node => Dict(x.ref=>Δ.stop-Δ.start+1) for node keys(qry))
10851100
merge!(combined.delete, newdeletes)
10861101
end
10871102

10881103
x = (qry=x.qry, ref=Δ.stop+1)
10891104
end
10901105
(Δ, nothing) => let # sequence in qry consensus not found in ref consensus
1091-
mutate = translate(lociwithin(qry.mutate,Δ),1-Δ.start)
1092-
insert = translate(lociwithin(qry.insert,Δ),1-Δ.start)
1093-
delete = translate(lociwithin(qry.delete,Δ),1-Δ.start)
1106+
mutate = lociwithin(qry.mutate,Δ)
1107+
insert = lociwithin(qry.insert,Δ)
1108+
delete = lociwithin(qry.delete,Δ)
1109+
1110+
# delete all query insertions that will be accounted for in this segment
1111+
del_gaps = Int[]
1112+
for (node, subdict) in insert
1113+
for (locus, nuc) in subdict
1114+
# @assert locus[1] in keys(qry.gaps)
1115+
# append locus to del_gaps
1116+
push!(del_gaps, locus[1])
1117+
delete!(qry.insert[node], locus)
1118+
end
1119+
end
1120+
1121+
# remove corresponding query gaps
1122+
for dg in unique(del_gaps)
1123+
# @assert dg ∉ [locus[1] for (node, subdict) ∈ qry.insert for (locus, nuc) ∈ keys(subdict)]
1124+
delete!(qry.gaps, dg)
1125+
end
1126+
1127+
# shift variations on the segment start
1128+
mutate = translate(mutate,1-Δ.start)
1129+
insert = translate(insert,1-Δ.start)
1130+
delete = translate(delete,1-Δ.start)
1131+
10941132

10951133
if (x.ref-1) keys(newgaps) # TODO: more sophisticated alignment? have to worry about overriding alignment
10961134
δ = newgaps[x.ref-1] #hamming_align(qry.sequence[Δ], gapconsensus(combined.insert, newgaps[x.ref-1], x.ref-1)) - 1
@@ -1109,13 +1147,13 @@ function rereference(qry::Block, ref::Block, segments)
11091147
if+length(seq)) > last(newgap)
11101148
newgap = (first(newgap),length(seq)+δ)
11111149
end
1112-
node => InsMap((x.ref-1,δ) => seq)
1150+
node => InsMap((x.ref-1,δ) => seq)
11131151
else
11141152
node => InsMap()
11151153
end
11161154
end for node keys(qry)
11171155
)
1118-
1156+
11191157
if last(newgap) > 0
11201158
if newgap[1] keys(newgaps) || newgap[2] > newgaps[newgap[1]]
11211159
newgaps[newgap[1]] = newgap[2]
@@ -1143,7 +1181,7 @@ function rereference(qry::Block, ref::Block, segments)
11431181
) for node keys(qry)
11441182
)
11451183

1146-
# XXX: hacky way to ensure deletions are not inclued in newmuts
1184+
# XXX: hacky way to ensure deletions are not included in newmuts
11471185
newdels = map(qry.delete,Δq,Δr)
11481186
for (node, subdict) in newdels
11491187
for (pos, len) in subdict

Diff for: src/build.jl

+37-1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,20 @@ Build = Command(
8484
"toggle to activate consistency check at each step of the graph merging process",
8585
false,
8686
),
87+
Arg(
88+
Bool,
89+
"debug mode",
90+
(short = "-D", long = "--debug"),
91+
"toggle to activate debug mode: saves intermediate graphs in `debug` folder",
92+
false,
93+
),
94+
Arg(
95+
Bool,
96+
"verbose mode",
97+
(short = "-v", long = "--verbose"),
98+
"toggle to activate verbose mode",
99+
false,
100+
),
87101
Arg(
88102
Int,
89103
"random seed",
@@ -93,16 +107,37 @@ Build = Command(
93107
),
94108
],
95109
(args) -> let
110+
111+
# parse input files
96112
files = parse(Build, args)
97113
files = if files === nothing || length(files) == 0
98114
["/dev/stdin"]
99115
else
100116
files
101117
end
102118

119+
# if verbose mode: print all arguments
120+
verbose = arg(Build, "-v")
121+
122+
if verbose
123+
println(stderr, "pangraph build command arguments:")
124+
for arg in Build.arg
125+
println(stderr, "\t", arg.flag.long, " = ", arg.value)
126+
end
127+
end
128+
129+
debugdir = nothing
130+
if arg(Build, "-D")
131+
# create debug directory
132+
debugdir = joinpath(pwd(), "debug")
133+
isdir(debugdir) || mkdir(debugdir)
134+
end
135+
136+
# parse command arguments
103137
minblock = arg(Build, "-l")
104138
circular = arg(Build, "-c")
105139
uppercase = arg(Build, "-u")
140+
verbose = arg(Build, "-v")
106141

107142
α = arg(Build, "-a")
108143
β = arg(Build, "-b")
@@ -186,8 +221,9 @@ Build = Command(
186221
minblock = minblock,
187222
maxiter = maxiter,
188223
reference = reference,
189-
verbose = false,
224+
verbose = verbose,
190225
rand_seed = r_seed,
226+
debugdir = debugdir,
191227
)
192228
finalize!(graph)
193229

0 commit comments

Comments
 (0)