diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
index 678ef00..216995e 100644
--- a/.github/workflows/python-ci.yml
+++ b/.github/workflows/python-ci.yml
@@ -54,4 +54,8 @@ jobs:
 
       - name: benchmark-qed - Check
         run: |
-          uv run poe check
\ No newline at end of file
+          uv run poe check
+
+      - name: benchmark-qed - Test
+        run: |
+          uv run poe test
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 16e6e9a..fbe5c4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,7 +171,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file. For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 # Ruff stuff:
 .ruff_cache/
diff --git a/.semversioner/next-release/minor-20251217232258224287.json b/.semversioner/next-release/minor-20251217232258224287.json
new file mode 100644
index 0000000..940c298
--- /dev/null
+++ b/.semversioner/next-release/minor-20251217232258224287.json
@@ -0,0 +1,4 @@
+{
+    "type": "minor",
+    "description": "Support parquet inputs"
+}
diff --git a/benchmark_qed/autod/io/document.py b/benchmark_qed/autod/io/document.py
index 666d24e..440db1c 100644
--- a/benchmark_qed/autod/io/document.py
+++ b/benchmark_qed/autod/io/document.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2025 Microsoft Corporation.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 """Load input files into Document objects."""
 
 import datetime
@@ -106,17 +107,16 @@ def load_text_dir(
     return documents
 
 
-def load_csv_doc(
-    file_path: str,
-    encoding: str = defs.FILE_ENCODING,
+def _load_docs_from_dataframe(
+    data_df: pd.DataFrame,
+    input_type: InputDataType,
+    title: str,
     text_tag: str = defs.TEXT_COLUMN,
     metadata_tags: list[str] | None = None,
     max_text_length: int | None = None,
 ) -> list[Document]:
-    """Load a CSV file and return a Document object."""
-    data_df = pd.read_csv(file_path, encoding=encoding)
-
     documents: list[Document] = []
+
     for index, row in enumerate(data_df.itertuples()):
         text = getattr(row, text_tag, "")
         if max_text_length is not None:
@@ -127,6 +127,7 @@
             for tag in metadata_tags:
                 if tag in data_df.columns:
                     metadata[tag] = getattr(row, tag)
+
         if "date_created" not in metadata:
             metadata["date_created"] = datetime.datetime.now(
                 tz=datetime.UTC
@@ -136,8 +137,8 @@
             Document(
                 id=str(uuid4()),
                 short_id=str(index),
-                title=str(file_path.replace(".csv", "")),
-                type="csv",
+                title=title,
+                type=str(input_type),
                 text=text,
                 attributes=metadata,
             )
@@ -145,6 +146,24 @@
     return documents
 
 
+def load_csv_doc(
+    file_path: str,
+    encoding: str = defs.FILE_ENCODING,
+    text_tag: str = defs.TEXT_COLUMN,
+    metadata_tags: list[str] | None = None,
+    max_text_length: int | None = None,
+) -> list[Document]:
+    """Load a CSV file and return a list of Document objects."""
+    return _load_docs_from_dataframe(
+        data_df=pd.read_csv(file_path, encoding=encoding),
+        input_type=InputDataType.CSV,
+        title=str(file_path.replace(".csv", "")),
+        text_tag=text_tag,
+        metadata_tags=metadata_tags,
+        max_text_length=max_text_length,
+    )
+
+
 def load_csv_dir(
     dir_path: str,
     encoding: str = defs.FILE_ENCODING,
@@ -171,6 +190,47 @@
     return documents
 
 
+def load_parquet_doc(
+    file_path: str,
+    text_tag: str = defs.TEXT_COLUMN,
+    metadata_tags: list[str] | None = None,
+    max_text_length: int | None = None,
+) -> list[Document]:
+    """Load a parquet file and return a list of Document objects."""
+    return _load_docs_from_dataframe(
+        data_df=pd.read_parquet(file_path),
+        input_type=InputDataType.PARQUET,
+        title=str(file_path.replace(".parquet", "")),
+        text_tag=text_tag,
+        metadata_tags=metadata_tags,
+        max_text_length=max_text_length,
+    )
+
+
+def load_parquet_dir(
+    dir_path: str,
+    text_tag: str = defs.TEXT_COLUMN,
+    metadata_tags: list[str] | None = None,
+    max_text_length: int | None = None,
+) -> list[Document]:
+    """Load a directory of parquet files and return a list of Document objects."""
+    documents: list[Document] = []
+    for file_path in Path(dir_path).rglob("*.parquet"):
+        documents.extend(
+            load_parquet_doc(
+                file_path=str(file_path),
+                text_tag=text_tag,
+                metadata_tags=metadata_tags,
+                max_text_length=max_text_length,
+            )
+        )
+
+    for index, document in enumerate(documents):
+        document.short_id = str(index)
+
+    return documents
+
+
 def create_documents(
     input_path: str,
     input_type: InputDataType | str = InputDataType.JSON,
@@ -205,6 +265,13 @@
                 metadata_tags=metadata_tags,
                 max_text_length=max_text_length,
             )
+        case InputDataType.PARQUET:
+            documents = load_parquet_dir(
+                dir_path=str(input_path),
+                text_tag=text_tag,
+                metadata_tags=metadata_tags,
+                max_text_length=max_text_length,
+            )
         case _:
             msg = f"Unsupported input type: {input_type}"
             raise ValueError(msg)
@@ -236,6 +303,13 @@
                 metadata_tags=metadata_tags,
                 max_text_length=max_text_length,
             )
+        case InputDataType.PARQUET:
+            documents = load_parquet_doc(
+                file_path=str(input_path),
+                text_tag=text_tag,
+                metadata_tags=metadata_tags,
+                max_text_length=max_text_length,
+            )
         case _:
             msg = f"Unsupported input type: {input_type}"
             raise ValueError(msg)
@@ -254,6 +328,11 @@ def load_documents(
     """Read documents from a dataframe using pre-converted records."""
     records = df.to_dict("records")
 
+    def _get_attributes(row: dict) -> dict[str, Any]:
+        attributes = row.get("attributes", {})
+        selected_attributes = attributes_cols or []
+        return {attr: attributes.get(attr, None) for attr in selected_attributes}
+
     return [
         Document(
             id=row.get(id_col, str(uuid4())),
@@ -261,11 +340,7 @@
             title=row.get(title_col, ""),
             type=row.get(type_col, ""),
             text=row.get(text_col, ""),
-            attributes=(
-                {col: row.get(col) for col in attributes_cols}
-                if attributes_cols
-                else {}
-            ),
+            attributes=_get_attributes(row),
         )
         for index, row in enumerate(records)
     ]
diff --git a/benchmark_qed/autod/io/enums.py b/benchmark_qed/autod/io/enums.py
index 9fdb254..43edfc4 100644
--- a/benchmark_qed/autod/io/enums.py
+++ b/benchmark_qed/autod/io/enums.py
@@ -10,3 +10,4 @@ class InputDataType(StrEnum):
     JSON = "json"
     CSV = "csv"
     TEXT = "text"
+    PARQUET = "parquet"
diff --git a/pyproject.toml b/pyproject.toml
index 10cd972..da7f5fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,7 +84,7 @@ test = "pytest tests"
 serve_docs = "mkdocs serve"
 build_docs = "mkdocs build"
 
-_test_with_coverage = 'coverage run --source=benchmark_qed -m pytest tests/unit'
+_test_with_coverage = 'coverage run --source=benchmark_qed -m pytest tests'
 _coverage_report = 'coverage report --fail-under=100 --show-missing --omit="benchmark_qed/doc_gen/__main__.py"'
 _generate_coverage_xml = 'coverage xml --omit="benchmark_qed/doc_gen/__main__.py"'
 _generate_coverage_html = 'coverage html --omit="benchmark_qed/doc_gen/__main__.py"'
@@ -120,3 +120,6 @@ sequence = [
 [tool.pyright]
 include = ["benchmark_qed", "tests"]
 exclude = ["**/__pycache__"]
+
+[tool.pytest.ini_options]
+tmp_path_retention_policy = "failed"
= "failed" diff --git a/ruff.toml b/ruff.toml index 3f8bb96..2a258b0 100644 --- a/ruff.toml +++ b/ruff.toml @@ -95,3 +95,6 @@ builtins-ignorelist = ["input", "id", "bytes"] [lint.pydocstyle] convention = "numpy" + +[lint.flake8-copyright] +notice-rgx = "(?i)Copyright \\(C\\) (\\d{4} )?Microsoft Corporation" diff --git a/tests/autod/__init__.py b/tests/autod/__init__.py new file mode 100644 index 0000000..59e481e --- /dev/null +++ b/tests/autod/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. diff --git a/tests/autod/io/__init__.py b/tests/autod/io/__init__.py new file mode 100644 index 0000000..59e481e --- /dev/null +++ b/tests/autod/io/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. diff --git a/tests/autod/io/document_test.py b/tests/autod/io/document_test.py new file mode 100644 index 0000000..7c96fac --- /dev/null +++ b/tests/autod/io/document_test.py @@ -0,0 +1,326 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +import json +from pathlib import Path +from typing import Any + +import pandas as pd +import pytest + +import benchmark_qed.config.defaults as defs +from benchmark_qed.autod.data_model.document import Document +from benchmark_qed.autod.io.document import ( + create_documents, + load_documents, + save_documents, +) +from benchmark_qed.autod.io.enums import InputDataType + + +def _save_input_docs( + doc_prefix_path: Path, docs: list[dict[str, Any]], input_data_type: InputDataType +): + df = pd.DataFrame.from_records(data=docs) + if input_data_type == InputDataType.PARQUET: + input_path = doc_prefix_path.with_suffix(".parquet") + df.to_parquet(input_path) + elif input_data_type == InputDataType.CSV: + input_path = doc_prefix_path.with_suffix(".csv") + df.to_csv(input_path, header=True) + else: + msg = f"input_data_type must be {InputDataType.CSV} or {InputDataType.PARQUET}" + raise ValueError(msg) + return input_path + + +def _doc_has_attribute(doc: Document, attr: str) -> bool: + return doc.attributes is not None and attr in doc.attributes + + +def _doc_get_attribute(doc: Document, attr: str, default_value: str) -> Any: + if doc.attributes is not None: + return doc.attributes.get(attr, default_value) + return default_value + + +def test_create_documents_text_file(tmp_path: Path): + file = tmp_path / "text_doc.txt" + text = "here is a text document" + file.write_text(text, encoding="utf-8") + + docs = create_documents(input_path=str(file), input_type=InputDataType.TEXT) + assert len(docs) == 1 + assert docs[0].text == text + assert docs[0].title.endswith("text_doc") + + +def test_create_documents_text_dir(tmp_path: Path): + file_1 = tmp_path / "text_doc_1.txt" + text_1 = "1" + file_1.write_text(text_1, encoding="utf-8") + + file_2 = tmp_path / "text_doc_2.txt" + text_2 = "2" + file_2.write_text(text_2, encoding="utf-8") + + docs = create_documents(input_path=str(tmp_path), input_type=InputDataType.TEXT) + assert len(docs) == 2 + + # verify doc title and contents + docs_sorted_by_title = sorted(docs, key=lambda d: d.title) + assert docs_sorted_by_title[0].title.endswith("text_doc_1") + assert docs_sorted_by_title[0].text == text_1 + assert docs_sorted_by_title[1].title.endswith("text_doc_2") + assert docs_sorted_by_title[1].text == text_2 + + +@pytest.mark.parametrize("input_data_type", [InputDataType.CSV, InputDataType.PARQUET]) +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_from_dataframe_simple( + 
tmp_path: Path, input_data_type: InputDataType, file_or_dir: str +): + simple_docs = [{"text": "text 1"}, {"text": "text 2"}] + + if file_or_dir == "file": + input_path = _save_input_docs(tmp_path / "doc", simple_docs, input_data_type) + else: + for idx, doc in enumerate(simple_docs): + _save_input_docs(tmp_path / f"doc_{idx}", [doc], input_data_type) + input_path = tmp_path + + docs = create_documents(str(input_path), input_type=input_data_type) + assert len(docs) == 2 + + # verify doc title and contents + docs_sorted_by_title = sorted(docs, key=lambda d: (d.title, d.text)) + assert len(docs_sorted_by_title[0].id) > 0 + assert docs_sorted_by_title[0].title.endswith( + "doc" if file_or_dir == "file" else "doc_0" + ) + assert docs_sorted_by_title[0].text == "text 1" + assert docs_sorted_by_title[0].type == str(input_data_type) + assert _doc_has_attribute(docs_sorted_by_title[0], "date_created") + assert len(docs_sorted_by_title[1].id) > 0 + assert docs_sorted_by_title[1].title.endswith( + "doc" if file_or_dir == "file" else "doc_1" + ) + assert docs_sorted_by_title[1].text == "text 2" + assert docs_sorted_by_title[1].type == str(input_data_type) + assert _doc_has_attribute(docs_sorted_by_title[1], "date_created") + + +@pytest.mark.parametrize("input_data_type", [InputDataType.CSV, InputDataType.PARQUET]) +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_from_dataframe_complex( + tmp_path: Path, input_data_type: InputDataType, file_or_dir: str +): + simple_docs = [ + { + "content": "text 1", + "attr1": 1, + "attr2": "foo", + "date_created": "20251217T000000Z", + }, + { + "content": "text 2truncateme", + "attr1": 2, + "attr2": "bar", + "date_created": "20240101T000000Z", + }, + ] + + if file_or_dir == "file": + input_path = _save_input_docs(tmp_path / "doc", simple_docs, input_data_type) + else: + for idx, doc in enumerate(simple_docs): + _save_input_docs(tmp_path / f"doc_{idx}", [doc], input_data_type) + input_path = tmp_path + + docs = create_documents( + str(input_path), + input_type=input_data_type, + text_tag="content", + metadata_tags=["attr1", "date_created"], + max_text_length=6, + ) + assert len(docs) == 2 + + # verify doc title and contents + docs_sorted_by_title = sorted(docs, key=lambda d: (d.title, d.text)) + assert len(docs_sorted_by_title[0].id) > 0 + assert docs_sorted_by_title[0].title.endswith( + "doc" if file_or_dir == "file" else "doc_0" + ) + assert docs_sorted_by_title[0].text == "text 1" + assert docs_sorted_by_title[0].type == str(input_data_type) + assert ( + _doc_get_attribute(docs_sorted_by_title[0], "date_created", "") + == "20251217T000000Z" + ) + assert _doc_get_attribute(docs_sorted_by_title[0], "attr1", "") == 1 + assert not _doc_has_attribute(docs_sorted_by_title[0], "attr2") + assert len(docs_sorted_by_title[1].id) > 0 + assert docs_sorted_by_title[1].title.endswith( + "doc" if file_or_dir == "file" else "doc_1" + ) + assert docs_sorted_by_title[1].text == "text 2" + assert docs_sorted_by_title[1].type == str(input_data_type) + assert ( + _doc_get_attribute(docs_sorted_by_title[1], "date_created", "") + == "20240101T000000Z" + ) + assert _doc_get_attribute(docs_sorted_by_title[1], "attr1", "") == 2 + assert not _doc_has_attribute(docs_sorted_by_title[1], "attr2") + + +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_json_simple(tmp_path: Path, file_or_dir: str): + simple_docs = [{"text": "text 1"}, {"text": "text 2"}] + + if file_or_dir == "file": + input_path = tmp_path / "doc.json" + 
input_path.write_text(json.dumps(simple_docs[0]), encoding="utf-8") + expected_count = 1 + else: + for idx, doc in enumerate(simple_docs): + file_path = tmp_path / f"doc_{idx}.json" + file_path.write_text(json.dumps(doc), encoding="utf-8") + input_path = tmp_path + expected_count = 2 + + docs = create_documents(str(input_path), input_type=InputDataType.JSON) + assert len(docs) == expected_count + + docs_sorted = sorted(docs, key=lambda d: d.text) + assert docs_sorted[0].text == "text 1" + assert docs_sorted[0].type == "json" + assert _doc_has_attribute(docs_sorted[0], "date_created") + + +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_json_complex(tmp_path: Path, file_or_dir: str): + if file_or_dir == "file": + input_path = tmp_path / "doc.json" + input_path.write_text( + '{"content": "text 1 truncateme", "attr1": 1, "attr2": "foo", "date_created": "20251217T000000Z"}', + encoding="utf-8", + ) + expected_count = 1 + else: + docs_data = [ + { + "content": "text 1 truncateme", + "attr1": 1, + "attr2": "foo", + "date_created": "20251217T000000Z", + }, + { + "content": "text 2 truncateme", + "attr1": 2, + "attr2": "bar", + "date_created": "20240101T000000Z", + }, + ] + for idx, doc in enumerate(docs_data): + file_path = tmp_path / f"doc_{idx}.json" + file_path.write_text(json.dumps(doc), encoding="utf-8") + input_path = tmp_path + expected_count = 2 + + docs = create_documents( + str(input_path), + input_type=InputDataType.JSON, + text_tag="content", + metadata_tags=["attr1", "date_created"], + max_text_length=6, + ) + assert len(docs) == expected_count + + docs_sorted = sorted(docs, key=lambda d: d.text) + assert docs_sorted[0].text == "text 1" + assert _doc_get_attribute(docs_sorted[0], "attr1", "") == 1 + assert _doc_get_attribute(docs_sorted[0], "date_created", "") == "20251217T000000Z" + assert not _doc_has_attribute(docs_sorted[0], "attr2") + + if expected_count > 1: + assert docs_sorted[1].text == "text 2" + assert _doc_get_attribute(docs_sorted[1], "attr1", "") == 2 + assert ( + _doc_get_attribute(docs_sorted[1], "date_created", "") == "20240101T000000Z" + ) + assert not _doc_has_attribute(docs_sorted[1], "attr2") + assert {d.short_id for d in docs} == {"0", "1"} + + +def test_create_documents_text_max_length(tmp_path: Path): + file = tmp_path / "text_doc.txt" + file.write_text("hello world truncate this", encoding="utf-8") + + docs = create_documents( + input_path=str(file), input_type=InputDataType.TEXT, max_text_length=11 + ) + assert len(docs) == 1 + assert docs[0].text == "hello world" + assert docs[0].title.endswith("text_doc") + + +def test_create_documents_text_dir_nested(tmp_path: Path): + subdir = tmp_path / "subdir" + subdir.mkdir() + + (tmp_path / "doc1.txt").write_text("root doc", encoding="utf-8") + (subdir / "doc2.txt").write_text("nested doc", encoding="utf-8") + + docs = create_documents(input_path=str(tmp_path), input_type=InputDataType.TEXT) + assert len(docs) == 2 + + texts = {d.text for d in docs} + assert texts == {"root doc", "nested doc"} + assert {d.title.split("/")[-1] for d in docs} == {"doc1", "doc2"} + assert {d.short_id for d in docs} == {"0", "1"} + + +@pytest.mark.parametrize("output_dir_exists", [True, False]) +def test_create_save_and_load_documents(tmp_path: Path, output_dir_exists: bool): + (tmp_path / "text_doc_1.txt").write_text("doc 1", encoding="utf-8") + (tmp_path / "text_doc_2.txt").write_text("doc 2", encoding="utf-8") + + docs = create_documents(input_path=str(tmp_path), input_type=InputDataType.TEXT) + 
assert len(docs) == 2 + + if output_dir_exists: + expected_path = tmp_path / f"{defs.DOCUMENT_OUTPUT}.parquet" + assert expected_path.parent.exists() + else: + expected_path = tmp_path / "nested" / f"{defs.DOCUMENT_OUTPUT}.parquet" + assert not expected_path.parent.exists() + + docs_df = save_documents(docs, output_path=str(expected_path.parent)) + + assert len(docs_df) == 2 + assert expected_path.exists() + + loaded_docs = load_documents(docs_df, attributes_cols=["date_created"]) + assert len(loaded_docs) == 2 + for original, loaded in zip( + sorted(docs, key=lambda d: d.id), + sorted(loaded_docs, key=lambda d: d.id), + strict=True, + ): + assert original.id == loaded.id + assert original.short_id == loaded.short_id + assert original.title == loaded.title + assert original.text == loaded.text + assert original.type == loaded.type + assert original.attributes == loaded.attributes + + +@pytest.mark.parametrize("file_or_dir", ["file", "dir"]) +def test_create_documents_unsupported_input_type(tmp_path: Path, file_or_dir: str): + input_file = tmp_path / "text_doc_1.txt" + input_file.write_text("doc 1", encoding="utf-8") + with pytest.raises(ValueError): # noqa: PT011, PT012 + if file_or_dir == "file": + create_documents(str(input_file), input_type="goblin") + else: + create_documents(str(tmp_path), input_type="goblin")
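
A minimal usage sketch of the new parquet input path, not part of the patch: the file name "corpus.parquet" is illustrative, and the "text" column relies on the defs.TEXT_COLUMN default. It exercises the single-file branch of create_documents added above; a directory path would instead recurse over *.parquet files via load_parquet_dir.

    # Usage sketch (assumptions: illustrative file name, default text column).
    import pandas as pd

    from benchmark_qed.autod.io.document import create_documents
    from benchmark_qed.autod.io.enums import InputDataType

    # Write a tiny two-row parquet corpus.
    pd.DataFrame({"text": ["doc one", "doc two"]}).to_parquet("corpus.parquet")

    # A single-file path is routed to load_parquet_doc.
    docs = create_documents("corpus.parquet", input_type=InputDataType.PARQUET)

    assert [d.text for d in docs] == ["doc one", "doc two"]
    assert all(d.type == "parquet" for d in docs)  # str(InputDataType.PARQUET)
    assert all("date_created" in (d.attributes or {}) for d in docs)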