# Patch summary (commentary only; placed before the first "diff --git" header,
# so it is not part of any hunk).
#
# README.md
#   Step 1 of "Running locally" now pipes ./bin/fetch-from-gisaid through
#   bunzip2 before writing data/gisaid.ndjson.
#
# Snakefile
#   - When config "fetch_from_database" is set and send_notifications is true,
#     data/{database}/notify-on-database-change.done is appended to all_targets.
#   - download_main_ndjson is now defined twice, selected at parse time by
#     config.get("fetch_from_database", False):
#       * fetch branch: runs ./bin/fetch-from-{database} through
#         run_shell_command_n_times; the output is data/{database}.ndjson, with
#         a ".bz2" suffix appended when database == "gisaid".
#       * else branch: runs ./bin/download-from-s3 for {database}.ndjson.xz,
#         trying config['s3_dst'] first and falling back to config['s3_src'].
#   - New rule bunzip2: produces "{stem}" from "{stem}.bz2" via
#     `bunzip2 {input:q}`.
#     NOTE(review): {stem} has no wildcard constraint in the lines shown --
#     confirm it cannot match unintended targets.
#   - New rule notify_on_database_change: runs ./bin/notify-on-record-change on
#     data/{database}.ndjson against {config['s3_src']}/{database}.ndjson.xz and
#     touches data/{database}/notify-on-database-change.done. The removed code
#     passed a hard-coded gisaid.ndjson.xz as the remote file for every database.
#
# bin/fetch-from-gisaid
#   The trailing `| bunzip2` is removed, so the script writes curl's response
#   unmodified. Presumably that response is bzip2-compressed, since the README
#   and Snakefile now decompress it downstream -- TODO confirm.
#
diff --git a/README.md b/README.md index fc5c0f61..09da74e7 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Relies on data from https://simplemaps.com/data/us-cities. ## Running locally If you're using Pipenv (see below), then run commands from `./bin/…` inside a `pipenv shell` or wrapped with `pipenv run ./bin/…`. -1. Run `./bin/fetch-from-gisaid > data/gisaid.ndjson` +1. Run `./bin/fetch-from-gisaid | bunzip2 > data/gisaid.ndjson` 2. Run `./bin/transform-gisaid data/gisaid.ndjson` 3. Look at `data/gisaid/sequences.fasta` and `data/gisaid/metadata.tsv` diff --git a/Snakefile b/Snakefile index 3b8a7d13..e4362ebf 100644 --- a/Snakefile +++ b/Snakefile @@ -23,6 +23,8 @@ if send_notifications: all_targets.append(f"data/{database}/notify.done") if config.get("fetch_from_database", False): all_targets.append(f"data/{database}/raw.upload.done") + if send_notifications: + all_targets.append(f"data/{database}/notify-on-database-change.done") rule all: input: all_targets @@ -47,32 +49,51 @@ def run_shell_command_n_times(cmd, msg, cleanup_failed_cmd, retry_num=5): print(msg + f" has FAILED {retry_num} times. 
Exiting.") raise Exception("function run_shell_command_n_times has failed") -rule download_main_ndjson: - message: - """Fetching data using the database API""" +if config.get("fetch_from_database", False): + rule download_main_ndjson: + message: + """Fetching data using the database API""" + output: + ndjson = temp(f"data/{database}.ndjson" + (".bz2" if database == "gisaid" else "")) + run: + run_shell_command_n_times( + msg = f"Fetching from {database}", + cmd = f"./bin/fetch-from-{database} > {output.ndjson}", + cleanup_failed_cmd = f"rm {output.ndjson}", + ) +else: + rule download_main_ndjson: + message: + """Fetching data from our S3 bucket""" + params: + file_on_s3_dst= f"{config['s3_dst']}/{database}.ndjson.xz", + file_on_s3_src= f"{config['s3_src']}/{database}.ndjson.xz" + output: + ndjson = temp(f"data/{database}.ndjson") + shell: """ + ./bin/download-from-s3 {params.file_on_s3_dst} {output.ndjson} || \ + ./bin/download-from-s3 {params.file_on_s3_src} {output.ndjson} + """ + + +rule bunzip2: + message: "Decompressing {input}" + input: "{stem}.bz2" + output: "{stem}" + shell: "bunzip2 {input:q}" + + +rule notify_on_database_change: + message: "Notifying on database NDJSON change" + input: + local_file = f"data/{database}.ndjson" params: - s3_src_bucket = config["s3_src"], - file_on_s3_dst= f"{config['s3_dst']}/{database}.ndjson.xz", - file_on_s3_src= f"{config['s3_src']}/{database}.ndjson.xz" + remote_file = f"{config['s3_src']}/{database}.ndjson.xz" output: - ndjson = temp(f"data/{database}.ndjson") - run: - if config.get("fetch_from_database", False): - if database=="gisaid": - msg = "Fetching from GISAID API" - cmd = f"./bin/fetch-from-gisaid > {output.ndjson}" - else: - msg = "Fetching from GenBank API" - cmd = f"./bin/fetch-from-genbank > {output.ndjson}" - cleanup_failed_cmd = f"rm {output.ndjson}" - run_shell_command_n_times(cmd, msg, cleanup_failed_cmd) - if send_notifications: - shell("./bin/notify-on-record-change {output.ndjson} 
{params.s3_src_bucket}/gisaid.ndjson.xz {database}") - else: - shell(""" - ./bin/download-from-s3 {params.file_on_s3_dst} {output.ndjson} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.ndjson} - """) + touch(f"data/{database}/notify-on-database-change.done") + shell: """ + ./bin/notify-on-record-change {input.local_file:q} {params.remote_file:q} {database} + """ rule download_biosample: diff --git a/bin/fetch-from-gisaid b/bin/fetch-from-gisaid index d649fb6e..243f6288 100755 --- a/bin/fetch-from-gisaid +++ b/bin/fetch-from-gisaid @@ -6,5 +6,4 @@ set -euo pipefail curl "$GISAID_API_ENDPOINT" \ --user "$GISAID_USERNAME_AND_PASSWORD" \ - --fail --silent --show-error --location-trusted --http1.1 \ - | bunzip2 + --fail --silent --show-error --location-trusted --http1.1