Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable listing of zip files in fromfiles for index, gather, search #354

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
90b0314
Add a failing test for indexing multiple manifest zips provided as a …
olgabot Jun 13, 2024
97fa413
After loading matching Signature, Add re-looking through the pathlist in
olgabot Jun 13, 2024
3295f0e
Created collection_from_zipfile_or_signature_or_manifest to call with…
olgabot Jun 21, 2024
d92c873
Merge branch 'main' into olgabot/index-multiple-manifests
olgabot Jun 21, 2024
9c3ecf9
Iterate over the records in the collection to get their paths to then…
olgabot Jun 22, 2024
6615436
Clean up some comments
olgabot Jun 23, 2024
4ec39a3
Change to use map instead of filter_map to get record internal locations
olgabot Jun 23, 2024
9084926
Started writing collection_to_pathlist but can't get the types working
olgabot Jun 28, 2024
c9e738e
Get the signatures from a collection
olgabot Jun 30, 2024
8fd921c
remove assert False
olgabot Jun 30, 2024
ef8ee17
Cargo format
olgabot Jun 30, 2024
12cde5b
Merge branch 'main' into olgabot/index-multiple-manifests
olgabot Jul 1, 2024
6f68a3b
Change command name to testing multiple zip files
olgabot Jul 1, 2024
9760992
no compiler errors!
olgabot Jul 5, 2024
8a0f6a4
remove print statement
olgabot Jul 5, 2024
57a83b0
No compiler errors, tried to add error if path doesn't exist
olgabot Jul 5, 2024
7aa1280
Code now checks for path existence
olgabot Jul 5, 2024
83046d8
Added error for if path exists but can't load signature
olgabot Jul 5, 2024
cf2b8ea
Check for non-existence of files in fromfiles/pathlist
olgabot Jul 5, 2024
dff7c5f
Fix bug for loading multiple sketches in fastgather!
olgabot Jul 5, 2024
d1331f0
Don't double-print warning couldn't load sketch
olgabot Jul 5, 2024
3246c3d
Specify path doesn't exist in the pathlist/fromfiles
olgabot Jul 5, 2024
c38f7dd
cargo format
olgabot Jul 5, 2024
d930116
debugging, lots of print statements
olgabot Jul 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/mastiff_manygather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@ pub fn mastiff_manygather(
allow_failed_sigpaths,
)?;

eprintln!(
"queries_file: {}\nquery_collection.len(): {}",
// "\nquery_collection.manifest().internal_locations(): {}",
queries_file,
query_collection.len(),
// query_collection.manifest().internal_locations().cloned().collect::<Vec<String>>().join(", ")
);

query_collection.manifest().internal_locations().for_each(|location|
eprintln!("query collection, internal location: {}", location)
);

// set up a multi-producer, single-consumer channel.
let (send, recv) =
std::sync::mpsc::sync_channel::<BranchwaterGatherResult>(rayon::current_num_threads());
Expand Down
6 changes: 3 additions & 3 deletions src/python/tests/test_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def test_bad_against(runtmp, capfd):
captured = capfd.readouterr()
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err
assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err


Expand All @@ -247,7 +247,7 @@ def test_bad_against_2(runtmp, capfd):
captured = capfd.readouterr()
print(captured.err)

assert "Sketch loading error: File is too short, less than five bytes" in captured.err
# assert "Sketch loading error: File is too short, less than five bytes" in captured.err
assert "WARNING: could not load sketches from path" in captured.err

assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err
Expand Down Expand Up @@ -307,7 +307,7 @@ def test_against_multisigfile(runtmp, zip_against):
print(df)
else:
print(df)
assert len(df) == 1
assert len(df) == 3
# @CTB this used to return only 1 row, which was a bug; multiple sketches are now loaded properly.


Expand Down
28 changes: 27 additions & 1 deletion src/python/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,32 @@ def test_index_manifest(runtmp, capfd):
assert 'index is done' in runtmp.last_result.err


def test_index_manifest_zip_files(runtmp, capfd):
    """Run `index` on a text file that lists several zip files.

    Each test signature is written to its own `.mf.zip` file with
    `sourmash sig cat`; the paths of those zip files are then written to a
    file list, which is passed to `sourmash scripts index`. The test passes
    when the command reports 'index is done' on stderr.
    """
    # test index with text file of multiple zip files
    sig2 = get_test_data('2.fa.sig.gz')
    sig47 = get_test_data('47.fa.sig.gz')
    sig63 = get_test_data('63.fa.sig.gz')
    sigs = [sig2, sig47, sig63]

    # write one zip file per input signature
    manifests = []
    for sig in sigs:
        sig_mf = runtmp.output(os.path.basename(sig) + ".mf.zip")
        runtmp.sourmash("sig", "cat", sig, "-o", sig_mf)
        manifests.append(sig_mf)

    # the file list handed to `index` contains only the zip paths
    manifests_zips_list = runtmp.output('manifest_zips.txt')
    make_file_list(manifests_zips_list, manifests)

    output = runtmp.output('out.db')
    runtmp.sourmash('scripts', 'index', manifests_zips_list,
                    '-o', output)

    # echo captured stderr so it shows up in the pytest report on failure
    captured = capfd.readouterr()
    print(captured.err)
    print(runtmp.last_result.err)
    assert 'index is done' in runtmp.last_result.err


def test_index_bad_siglist_2(runtmp, capfd):
# test with a bad siglist (containing a missing file)
against_list = runtmp.output('against.txt')
Expand All @@ -140,7 +166,7 @@ def test_index_bad_siglist_2(runtmp, capfd):

captured = capfd.readouterr()
print(captured.err)
assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err


def test_index_empty_siglist(runtmp, capfd):
Expand Down
8 changes: 4 additions & 4 deletions src/python/tests/test_multigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def test_missing_query(runtmp, capfd, indexed):

captured = capfd.readouterr()
print(captured.err)
assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err
assert "WARNING: 1 query paths failed to load. See error messages above."


Expand Down Expand Up @@ -492,7 +492,7 @@ def test_bad_against(runtmp, capfd):

against_list = runtmp.output('against.txt')
sig2 = get_test_data('2.fa.sig.gz')
make_file_list(against_list, [sig2, "no exist"])
make_file_list(against_list, [sig2, "no-exist"])

# should succeed, but with error output.
runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list,
Expand All @@ -501,7 +501,7 @@ def test_bad_against(runtmp, capfd):
captured = capfd.readouterr()
print(captured.err)

assert "WARNING: could not load sketches from path 'no exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err
assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err


Expand Down Expand Up @@ -548,7 +548,7 @@ def test_empty_against(runtmp, capfd):
captured = capfd.readouterr()
print(captured.err)

assert "Sketch loading error: No such file or directory" in captured.err
assert "WARNING: path '' does not exist in 'against.txt'" in captured.err
assert "No search signatures loaded, exiting." in captured.err


Expand Down
4 changes: 2 additions & 2 deletions src/python/tests/test_multisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def test_bad_query(runtmp, capfd):
captured = capfd.readouterr()
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err
assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err


Expand Down Expand Up @@ -416,7 +416,7 @@ def test_bad_against(runtmp, capfd):
captured = capfd.readouterr()
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err
assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err


Expand Down
2 changes: 1 addition & 1 deletion src/python/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def test_bad_query(runtmp, capfd):
captured = capfd.readouterr()
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err
assert "WARNING: 1 analysis paths failed to load. See error messages above." in captured.err


Expand Down
4 changes: 2 additions & 2 deletions src/python/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ def test_bad_query_2(runtmp, capfd, indexed):
captured = capfd.readouterr()
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err
assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err


Expand Down Expand Up @@ -449,7 +449,7 @@ def test_bad_against(runtmp, capfd):
captured = capfd.readouterr()
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: path 'no-exist' does not exist" in captured.err
assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err


Expand Down
Loading
Loading