diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
index 678ef00..216995e 100644
--- a/.github/workflows/python-ci.yml
+++ b/.github/workflows/python-ci.yml
@@ -54,4 +54,8 @@ jobs:
 
       - name: benchmark-qed - Check
         run: |
-          uv run poe check
\ No newline at end of file
+          uv run poe check
+
+      - name: benchmark-qed - Test
+        run: |
+          uv run poe test
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 16e6e9a..fbe5c4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,7 +171,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file. For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 # Ruff stuff:
 .ruff_cache/
diff --git a/.semversioner/next-release/minor-20251217232258224287.json b/.semversioner/next-release/minor-20251217232258224287.json
new file mode 100644
index 0000000..940c298
--- /dev/null
+++ b/.semversioner/next-release/minor-20251217232258224287.json
@@ -0,0 +1,4 @@
+{
+    "type": "minor",
+    "description": "Support parquet inputs"
+}
diff --git a/benchmark_qed/autod/io/document.py b/benchmark_qed/autod/io/document.py
index 666d24e..440db1c 100644
--- a/benchmark_qed/autod/io/document.py
+++ b/benchmark_qed/autod/io/document.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2025 Microsoft Corporation.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 """Load input files into Document objects."""
 
 import datetime
@@ -106,17 +107,16 @@ def load_text_dir(
     return documents
 
 
-def load_csv_doc(
-    file_path: str,
-    encoding: str = defs.FILE_ENCODING,
+def _load_docs_from_dataframe(
+    data_df: pd.DataFrame,
+    input_type: InputDataType,
+    title: str,
     text_tag: str = defs.TEXT_COLUMN,
     metadata_tags: list[str] | None = None,
     max_text_length: int | None = None,
 ) -> list[Document]:
-    """Load a CSV file and return a Document object."""
-    data_df = pd.read_csv(file_path, encoding=encoding)
-
     documents: list[Document] = []
+
     for index, row in enumerate(data_df.itertuples()):
         text = getattr(row, text_tag, "")
         if max_text_length is not None:
@@ -127,6 +127,7 @@
             for tag in metadata_tags:
                 if tag in data_df.columns:
                     metadata[tag] = getattr(row, tag)
+
         if "date_created" not in metadata:
             metadata["date_created"] = datetime.datetime.now(
                 tz=datetime.UTC
@@ -136,8 +137,8 @@
             Document(
                 id=str(uuid4()),
                 short_id=str(index),
-                title=str(file_path.replace(".csv", "")),
-                type="csv",
+                title=title,
+                type=str(input_type),
                 text=text,
                 attributes=metadata,
             )
@@ -145,6 +146,24 @@
     return documents
 
 
+def load_csv_doc(
+    file_path: str,
+    encoding: str = defs.FILE_ENCODING,
+    text_tag: str = defs.TEXT_COLUMN,
+    metadata_tags: list[str] | None = None,
+    max_text_length: int | None = None,
+) -> list[Document]:
+    """Load a CSV file and return a list of Document objects."""
+    return _load_docs_from_dataframe(
+        data_df=pd.read_csv(file_path, encoding=encoding),
+        input_type=InputDataType.CSV,
+        title=str(file_path.replace(".csv", "")),
+        text_tag=text_tag,
+        metadata_tags=metadata_tags,
+        max_text_length=max_text_length,
+    )
+
+
 def load_csv_dir(
     dir_path: str,
     encoding: str = defs.FILE_ENCODING,
@@ -171,6 +190,47 @@
     return documents
 
 
+def load_parquet_doc(
+    file_path: str,
+    text_tag: str = defs.TEXT_COLUMN,
+    metadata_tags: list[str] | None = None,
+    max_text_length: int | None = None,
+) -> list[Document]:
+    """Load a parquet file and return a list of Document objects."""
+    return _load_docs_from_dataframe(
+        data_df=pd.read_parquet(file_path),
+        input_type=InputDataType.PARQUET,
+        title=str(file_path.replace(".parquet", "")),
+        text_tag=text_tag,
+        metadata_tags=metadata_tags,
+        max_text_length=max_text_length,
+    )
+
+
+def load_parquet_dir(
+    dir_path: str,
+    text_tag: str = defs.TEXT_COLUMN,
+    metadata_tags: list[str] | None = None,
+    max_text_length: int | None = None,
+) -> list[Document]:
+    """Load a directory of parquet files and return a list of Document objects."""
+    documents: list[Document] = []
+    for file_path in Path(dir_path).rglob("*.parquet"):
+        documents.extend(
+            load_parquet_doc(
+                file_path=str(file_path),
+                text_tag=text_tag,
+                metadata_tags=metadata_tags,
+                max_text_length=max_text_length,
+            )
+        )
+
+    for index, document in enumerate(documents):
+        document.short_id = str(index)
+
+    return documents
+
+
 def create_documents(
     input_path: str,
     input_type: InputDataType | str = InputDataType.JSON,
@@ -205,6 +265,13 @@
                 metadata_tags=metadata_tags,
                 max_text_length=max_text_length,
             )
+        case InputDataType.PARQUET:
+            documents = load_parquet_dir(
+                dir_path=str(input_path),
+                text_tag=text_tag,
+                metadata_tags=metadata_tags,
+                max_text_length=max_text_length,
+            )
         case _:
             msg = f"Unsupported input type: {input_type}"
             raise ValueError(msg)
@@ -236,6 +303,13 @@
                 metadata_tags=metadata_tags,
                 max_text_length=max_text_length,
             )
+        case InputDataType.PARQUET:
+            documents = load_parquet_doc(
+                file_path=str(input_path),
+                text_tag=text_tag,
+                metadata_tags=metadata_tags,
+                max_text_length=max_text_length,
+            )
         case _:
             msg = f"Unsupported input type: {input_type}"
             raise ValueError(msg)
@@ -254,6 +328,11 @@ def load_documents(
     """Read documents from a dataframe using pre-converted records."""
     records = df.to_dict("records")
 
+    def _get_attributes(row: dict) -> dict[str, Any]:
+        attributes = row.get("attributes", {})
+        selected_attributes = attributes_cols or []
+        return {attr: attributes.get(attr, None) for attr in selected_attributes}
+
     return [
         Document(
             id=row.get(id_col, str(uuid4())),
@@ -261,11 +340,7 @@
             title=row.get(title_col, ""),
             type=row.get(type_col, ""),
             text=row.get(text_col, ""),
-            attributes=(
-                {col: row.get(col) for col in attributes_cols}
-                if attributes_cols
-                else {}
-            ),
+            attributes=_get_attributes(row),
         )
         for index, row in enumerate(records)
     ]
diff --git a/benchmark_qed/autod/io/enums.py b/benchmark_qed/autod/io/enums.py
index 9fdb254..43edfc4 100644
--- a/benchmark_qed/autod/io/enums.py
+++ b/benchmark_qed/autod/io/enums.py
@@ -10,3 +10,4 @@ class InputDataType(StrEnum):
     JSON = "json"
     CSV = "csv"
     TEXT = "text"
+    PARQUET = "parquet"
diff --git a/pyproject.toml b/pyproject.toml
index 10cd972..da7f5fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,7 +84,7 @@ test = "pytest tests"
 serve_docs = "mkdocs serve"
 build_docs = "mkdocs build"
 
-_test_with_coverage = 'coverage run --source=benchmark_qed -m pytest tests/unit'
+_test_with_coverage = 'coverage run --source=benchmark_qed -m pytest tests'
 _coverage_report = 'coverage report --fail-under=100 --show-missing --omit="benchmark_qed/doc_gen/__main__.py"'
 _generate_coverage_xml = 'coverage xml --omit="benchmark_qed/doc_gen/__main__.py"'
 _generate_coverage_html = 'coverage html --omit="benchmark_qed/doc_gen/__main__.py"'
@@ -120,3 +120,6 @@ sequence = [
 [tool.pyright]
 include = ["benchmark_qed", "tests"]
 exclude = ["**/__pycache__"]
+
+[tool.pytest.ini_options]
+tmp_path_retention_policy = "failed"
= "failed" diff --git a/ruff.toml b/ruff.toml index 3f8bb96..2a258b0 100644 --- a/ruff.toml +++ b/ruff.toml @@ -95,3 +95,6 @@ builtins-ignorelist = ["input", "id", "bytes"] [lint.pydocstyle] convention = "numpy" + +[lint.flake8-copyright] +notice-rgx = "(?i)Copyright \\(C\\) (\\d{4} )?Microsoft Corporation" diff --git a/tests/autod/__init__.py b/tests/autod/__init__.py new file mode 100644 index 0000000..59e481e --- /dev/null +++ b/tests/autod/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. diff --git a/tests/autod/io/__init__.py b/tests/autod/io/__init__.py new file mode 100644 index 0000000..59e481e --- /dev/null +++ b/tests/autod/io/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. diff --git a/tests/autod/io/document_test.py b/tests/autod/io/document_test.py new file mode 100644 index 0000000..7c96fac --- /dev/null +++ b/tests/autod/io/document_test.py @@ -0,0 +1,326 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +import json +from pathlib import Path +from typing import Any + +import pandas as pd +import pytest + +import benchmark_qed.config.defaults as defs +from benchmark_qed.autod.data_model.document import Document +from benchmark_qed.autod.io.document import ( + create_documents, + load_documents, + save_documents, +) +from benchmark_qed.autod.io.enums import InputDataType + + +def _save_input_docs( + doc_prefix_path: Path, docs: list[dict[str, Any]], input_data_type: InputDataType +): + df = pd.DataFrame.from_records(data=docs) + if input_data_type == InputDataType.PARQUET: + input_path = doc_prefix_path.with_suffix(".parquet") + df.to_parquet(input_path) + elif input_data_type == InputDataType.CSV: + input_path = doc_prefix_path.with_suffix(".csv") + df.to_csv(input_path, header=True) + else: + msg = f"input_data_type must be {InputDataType.CSV} or {InputDataType.PARQUET}" + raise ValueError(msg) + return input_path + + +def _doc_has_attribute(doc: Document, attr: str) -> bool: + return doc.attributes is not None and attr in doc.attributes + + +def _doc_get_attribute(doc: Document, attr: str, default_value: str) -> Any: + if doc.attributes is not None: + return doc.attributes.get(attr, default_value) + return default_value + + +def test_create_documents_text_file(tmp_path: Path): + file = tmp_path / "text_doc.txt" + text = "here is a text document" + file.write_text(text, encoding="utf-8") + + docs = create_documents(input_path=str(file), input_type=InputDataType.TEXT) + assert len(docs) == 1 + assert docs[0].text == text + assert docs[0].title.endswith("text_doc") + + +def test_create_documents_text_dir(tmp_path: Path): + file_1 = tmp_path / "text_doc_1.txt" + text_1 = "1" + file_1.write_text(text_1, encoding="utf-8") + + file_2 = tmp_path / "text_doc_2.txt" + text_2 = "2" + file_2.write_text(text_2, encoding="utf-8") + + docs = create_documents(input_path=str(tmp_path), input_type=InputDataType.TEXT) + assert len(docs) == 2 + + # verify doc title and contents + docs_sorted_by_title = sorted(docs, key=lambda d: d.title) + assert docs_sorted_by_title[0].title.endswith("text_doc_1") + assert docs_sorted_by_title[0].text == text_1 + assert docs_sorted_by_title[1].title.endswith("text_doc_2") + assert docs_sorted_by_title[1].text == text_2 + + +@pytest.mark.parametrize("input_data_type", [InputDataType.CSV, InputDataType.PARQUET]) +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_from_dataframe_simple( + 
tmp_path: Path, input_data_type: InputDataType, file_or_dir: str +): + simple_docs = [{"text": "text 1"}, {"text": "text 2"}] + + if file_or_dir == "file": + input_path = _save_input_docs(tmp_path / "doc", simple_docs, input_data_type) + else: + for idx, doc in enumerate(simple_docs): + _save_input_docs(tmp_path / f"doc_{idx}", [doc], input_data_type) + input_path = tmp_path + + docs = create_documents(str(input_path), input_type=input_data_type) + assert len(docs) == 2 + + # verify doc title and contents + docs_sorted_by_title = sorted(docs, key=lambda d: (d.title, d.text)) + assert len(docs_sorted_by_title[0].id) > 0 + assert docs_sorted_by_title[0].title.endswith( + "doc" if file_or_dir == "file" else "doc_0" + ) + assert docs_sorted_by_title[0].text == "text 1" + assert docs_sorted_by_title[0].type == str(input_data_type) + assert _doc_has_attribute(docs_sorted_by_title[0], "date_created") + assert len(docs_sorted_by_title[1].id) > 0 + assert docs_sorted_by_title[1].title.endswith( + "doc" if file_or_dir == "file" else "doc_1" + ) + assert docs_sorted_by_title[1].text == "text 2" + assert docs_sorted_by_title[1].type == str(input_data_type) + assert _doc_has_attribute(docs_sorted_by_title[1], "date_created") + + +@pytest.mark.parametrize("input_data_type", [InputDataType.CSV, InputDataType.PARQUET]) +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_from_dataframe_complex( + tmp_path: Path, input_data_type: InputDataType, file_or_dir: str +): + simple_docs = [ + { + "content": "text 1", + "attr1": 1, + "attr2": "foo", + "date_created": "20251217T000000Z", + }, + { + "content": "text 2truncateme", + "attr1": 2, + "attr2": "bar", + "date_created": "20240101T000000Z", + }, + ] + + if file_or_dir == "file": + input_path = _save_input_docs(tmp_path / "doc", simple_docs, input_data_type) + else: + for idx, doc in enumerate(simple_docs): + _save_input_docs(tmp_path / f"doc_{idx}", [doc], input_data_type) + input_path = tmp_path + + docs = create_documents( + str(input_path), + input_type=input_data_type, + text_tag="content", + metadata_tags=["attr1", "date_created"], + max_text_length=6, + ) + assert len(docs) == 2 + + # verify doc title and contents + docs_sorted_by_title = sorted(docs, key=lambda d: (d.title, d.text)) + assert len(docs_sorted_by_title[0].id) > 0 + assert docs_sorted_by_title[0].title.endswith( + "doc" if file_or_dir == "file" else "doc_0" + ) + assert docs_sorted_by_title[0].text == "text 1" + assert docs_sorted_by_title[0].type == str(input_data_type) + assert ( + _doc_get_attribute(docs_sorted_by_title[0], "date_created", "") + == "20251217T000000Z" + ) + assert _doc_get_attribute(docs_sorted_by_title[0], "attr1", "") == 1 + assert not _doc_has_attribute(docs_sorted_by_title[0], "attr2") + assert len(docs_sorted_by_title[1].id) > 0 + assert docs_sorted_by_title[1].title.endswith( + "doc" if file_or_dir == "file" else "doc_1" + ) + assert docs_sorted_by_title[1].text == "text 2" + assert docs_sorted_by_title[1].type == str(input_data_type) + assert ( + _doc_get_attribute(docs_sorted_by_title[1], "date_created", "") + == "20240101T000000Z" + ) + assert _doc_get_attribute(docs_sorted_by_title[1], "attr1", "") == 2 + assert not _doc_has_attribute(docs_sorted_by_title[1], "attr2") + + +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_json_simple(tmp_path: Path, file_or_dir: str): + simple_docs = [{"text": "text 1"}, {"text": "text 2"}] + + if file_or_dir == "file": + input_path = tmp_path / "doc.json" + 
input_path.write_text(json.dumps(simple_docs[0]), encoding="utf-8") + expected_count = 1 + else: + for idx, doc in enumerate(simple_docs): + file_path = tmp_path / f"doc_{idx}.json" + file_path.write_text(json.dumps(doc), encoding="utf-8") + input_path = tmp_path + expected_count = 2 + + docs = create_documents(str(input_path), input_type=InputDataType.JSON) + assert len(docs) == expected_count + + docs_sorted = sorted(docs, key=lambda d: d.text) + assert docs_sorted[0].text == "text 1" + assert docs_sorted[0].type == "json" + assert _doc_has_attribute(docs_sorted[0], "date_created") + + +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_json_complex(tmp_path: Path, file_or_dir: str): + if file_or_dir == "file": + input_path = tmp_path / "doc.json" + input_path.write_text( + '{"content": "text 1 truncateme", "attr1": 1, "attr2": "foo", "date_created": "20251217T000000Z"}', + encoding="utf-8", + ) + expected_count = 1 + else: + docs_data = [ + { + "content": "text 1 truncateme", + "attr1": 1, + "attr2": "foo", + "date_created": "20251217T000000Z", + }, + { + "content": "text 2 truncateme", + "attr1": 2, + "attr2": "bar", + "date_created": "20240101T000000Z", + }, + ] + for idx, doc in enumerate(docs_data): + file_path = tmp_path / f"doc_{idx}.json" + file_path.write_text(json.dumps(doc), encoding="utf-8") + input_path = tmp_path + expected_count = 2 + + docs = create_documents( + str(input_path), + input_type=InputDataType.JSON, + text_tag="content", + metadata_tags=["attr1", "date_created"], + max_text_length=6, + ) + assert len(docs) == expected_count + + docs_sorted = sorted(docs, key=lambda d: d.text) + assert docs_sorted[0].text == "text 1" + assert _doc_get_attribute(docs_sorted[0], "attr1", "") == 1 + assert _doc_get_attribute(docs_sorted[0], "date_created", "") == "20251217T000000Z" + assert not _doc_has_attribute(docs_sorted[0], "attr2") + + if expected_count > 1: + assert docs_sorted[1].text == "text 2" + assert _doc_get_attribute(docs_sorted[1], "attr1", "") == 2 + assert ( + _doc_get_attribute(docs_sorted[1], "date_created", "") == "20240101T000000Z" + ) + assert not _doc_has_attribute(docs_sorted[1], "attr2") + assert {d.short_id for d in docs} == {"0", "1"} + + +def test_create_documents_text_max_length(tmp_path: Path): + file = tmp_path / "text_doc.txt" + file.write_text("hello world truncate this", encoding="utf-8") + + docs = create_documents( + input_path=str(file), input_type=InputDataType.TEXT, max_text_length=11 + ) + assert len(docs) == 1 + assert docs[0].text == "hello world" + assert docs[0].title.endswith("text_doc") + + +def test_create_documents_text_dir_nested(tmp_path: Path): + subdir = tmp_path / "subdir" + subdir.mkdir() + + (tmp_path / "doc1.txt").write_text("root doc", encoding="utf-8") + (subdir / "doc2.txt").write_text("nested doc", encoding="utf-8") + + docs = create_documents(input_path=str(tmp_path), input_type=InputDataType.TEXT) + assert len(docs) == 2 + + texts = {d.text for d in docs} + assert texts == {"root doc", "nested doc"} + assert {d.title.split("/")[-1] for d in docs} == {"doc1", "doc2"} + assert {d.short_id for d in docs} == {"0", "1"} + + +@pytest.mark.parametrize("output_dir_exists", [True, False]) +def test_create_save_and_load_documents(tmp_path: Path, output_dir_exists: bool): + (tmp_path / "text_doc_1.txt").write_text("doc 1", encoding="utf-8") + (tmp_path / "text_doc_2.txt").write_text("doc 2", encoding="utf-8") + + docs = create_documents(input_path=str(tmp_path), input_type=InputDataType.TEXT) + 
assert len(docs) == 2 + + if output_dir_exists: + expected_path = tmp_path / f"{defs.DOCUMENT_OUTPUT}.parquet" + assert expected_path.parent.exists() + else: + expected_path = tmp_path / "nested" / f"{defs.DOCUMENT_OUTPUT}.parquet" + assert not expected_path.parent.exists() + + docs_df = save_documents(docs, output_path=str(expected_path.parent)) + + assert len(docs_df) == 2 + assert expected_path.exists() + + loaded_docs = load_documents(docs_df, attributes_cols=["date_created"]) + assert len(loaded_docs) == 2 + for original, loaded in zip( + sorted(docs, key=lambda d: d.id), + sorted(loaded_docs, key=lambda d: d.id), + strict=True, + ): + assert original.id == loaded.id + assert original.short_id == loaded.short_id + assert original.title == loaded.title + assert original.text == loaded.text + assert original.type == loaded.type + assert original.attributes == loaded.attributes + + +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_unsupported_input_type(tmp_path: Path, file_or_dir: str): + input_file = tmp_path / "text_doc_1.txt" + input_file.write_text("doc 1", encoding="utf-8") + with pytest.raises(ValueError): # noqa: PT011, PT012 + if file_or_dir == "file": + create_documents(str(input_file), input_type="goblin") + else: + create_documents(str(tmp_path), input_type="goblin")
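
A minimal usage sketch of the new parquet input path, not part of the patch: the file name "corpus.parquet" is illustrative, and the "text" column relies on the defs.TEXT_COLUMN default. It exercises the single-file branch of create_documents added above; a directory path would instead recurse over *.parquet files via load_parquet_dir.

    # Usage sketch (assumptions: illustrative file name, default text column).
    import pandas as pd

    from benchmark_qed.autod.io.document import create_documents
    from benchmark_qed.autod.io.enums import InputDataType

    # Write a tiny two-row parquet corpus.
    pd.DataFrame({"text": ["doc one", "doc two"]}).to_parquet("corpus.parquet")

    # A single-file path is routed to load_parquet_doc.
    docs = create_documents("corpus.parquet", input_type=InputDataType.PARQUET)

    assert [d.text for d in docs] == ["doc one", "doc two"]
    assert all(d.type == "parquet" for d in docs)  # str(InputDataType.PARQUET)
    assert all("date_created" in (d.attributes or {}) for d in docs)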