diff --git a/benchmark_qed/data/cli.py b/benchmark_qed/data/cli.py index 0f2f9db..64e2083 100644 --- a/benchmark_qed/data/cli.py +++ b/benchmark_qed/data/cli.py @@ -7,7 +7,6 @@ import requests import typer -from rich.progress import Progress app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) @@ -20,19 +19,6 @@ class Dataset(StrEnum): EXAMPLE_ANSWERS = "example_answers" -def _download_folder(contents: list[dict], output_dir: Path) -> None: - with Progress() as progress: - task = progress.add_task("Downloading files...", total=len(contents)) - for item in contents: - item_name = item["name"] - download_url = item["download_url"] - if item["type"] == "file": - file_response = requests.get(download_url, timeout=60) - (output_dir / item_name).write_bytes(file_response.content) - typer.echo(f"Downloaded {item_name}") - progress.update(task, advance=1) - - @app.command() def download( dataset: Annotated[ @@ -53,14 +39,15 @@ def download( abort=True, ) - if dataset == Dataset.EXAMPLE_ANSWERS: - api_url = f"https://api.github.com/repos/microsoft/benchmark-qed/contents/docs/notebooks/{dataset}" - for subdir in ["graphrag_global", "lazygraphrag", "vector_rag"]: - response = requests.get(f"{api_url}/{subdir}", timeout=60) - contents = response.json() - _download_folder(contents, output_dir / subdir) - else: - api_url = f"https://api.github.com/repos/microsoft/benchmark-qed/contents/datasets/{dataset}" - response = requests.get(api_url, timeout=60) - contents = response.json() - _download_folder(contents, output_dir) + match dataset: + case Dataset.EXAMPLE_ANSWERS: + api_url = f"https://raw.githubusercontent.com/microsoft/benchmark-qed/refs/heads/main/docs/notebooks/{dataset}/raw_data.zip" + response = requests.get(api_url, timeout=60) + output_file = output_dir / f"{dataset}.zip" + output_file.write_bytes(response.content) + + case Dataset.AP_NEWS | Dataset.PODCAST: + api_url = f"https://raw.githubusercontent.com/microsoft/benchmark-qed/refs/heads/main/datasets/{dataset}/raw_data.zip" + response = requests.get(api_url, timeout=60) + output_file = output_dir / f"{dataset}.zip" + output_file.write_bytes(response.content) diff --git a/datasets/AP_news/raw_data.zip b/datasets/AP_news/raw_data.zip new file mode 100644 index 0000000..630fc25 Binary files /dev/null and b/datasets/AP_news/raw_data.zip differ diff --git a/datasets/podcast/raw_data.zip b/datasets/podcast/raw_data.zip new file mode 100644 index 0000000..b1ece8d Binary files /dev/null and b/datasets/podcast/raw_data.zip differ diff --git a/docs/notebooks/example_answers/raw_data.zip b/docs/notebooks/example_answers/raw_data.zip new file mode 100644 index 0000000..603ba36 Binary files /dev/null and b/docs/notebooks/example_answers/raw_data.zip differ