feat: Added Document.from_gcs_multi() method to get multiple wrapped documents from a GCS directory. #223

Closed · wants to merge 9 commits
@@ -257,7 +257,7 @@ def print_gcs_document_tree(
     )

     for directory, files in path_list.items():
-        print(directory)
+        print(create_gcs_uri(gcs_bucket_name, directory))
         dir_size = len(files)
         for idx, file_name in enumerate(files):
             if idx == dir_size - 1:
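
The one-line change above makes the directory-tree printout show full gs:// URIs instead of bare prefixes. A small illustration of the assumed effect (create_gcs_uri() is called exactly as in the diff, but the output format shown here is an assumption, not taken from this PR):

# Hypothetical values for illustration only.
gcs_bucket_name = "test-bucket"
directory = "documentai/output/123456789/1"

# Assumed formatting produced by create_gcs_uri(gcs_bucket_name, directory):
print(f"gs://{gcs_bucket_name}/{directory}")
# Before the change: documentai/output/123456789/1
# After the change:  gs://test-bucket/documentai/output/123456789/1
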
29 changes: 29 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/document.py
@@ -481,6 +481,35 @@ def from_gcs(
            gcs_input_uri=gcs_input_uri,
        )

    @classmethod
    def list_from_gcs(
        cls: Type["Document"],
        gcs_bucket_name: str,
        gcs_prefix: str,
    ) -> List["Document"]:
        r"""Loads a list of Documents from Cloud Storage.

        Args:
            gcs_bucket_name (str):
                Required. The gcs bucket.

                Format: Given `gs://{bucket_name}/{optional_folder}/{operation_id}/` where `gcs_bucket_name={bucket_name}`.
            gcs_prefix (str):
                Required. The prefix to the location of the target folder.

                Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}` where `gcs_prefix={optional_folder}/{target_folder}`.
        Returns:
            List[Document]:
                A list of documents from gcs.
        """
        return [
            Document.from_gcs(gcs_bucket_name=gcs_bucket_name, gcs_prefix=directory)
            for directory, files in gcs_utilities.list_gcs_document_tree(
                gcs_bucket_name, gcs_prefix
            ).items()
            if files != [""]
        ]

Review comment (Collaborator):

This is assuming that every sub-directory in the parent (bucket + prefix) is acceptable to Document.from_gcs, which I think has some validation checks.

What is the desired behavior here if some of those sub-directories cause Document.from_gcs to fail? Should the whole list_from_gcs call fail, or should it return just whatever is successfully loaded (maybe along with some metadata explaining the failed items)?

(Let's include unit tests for the case where one of the sub-directories contains the wrong files.)

Reply (Member, PR author):

list_gcs_document_tree() only outputs directories that contain JSON files. (However, it doesn't directly validate that they are in Document JSON format.)

.from_gcs() will throw a ValueError if a file is invalid, or the documentai.Document.from_json() method will throw an exception.

It would probably make sense for this batch case to return whatever was successfully loaded and just skip files that throw exceptions (and print out or return the failed files).
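
To make the behavior discussed above concrete, here is a minimal sketch of the skip-and-report variant the author describes. It is not the implementation in this PR: the name list_from_gcs_lenient and the (documents, failures) return shape are hypothetical, and it reuses the helpers already shown in this diff.

from typing import Dict, List, Tuple

from google.cloud.documentai_toolbox import document, gcs_utilities


def list_from_gcs_lenient(
    gcs_bucket_name: str, gcs_prefix: str
) -> Tuple[List[document.Document], Dict[str, Exception]]:
    """Loads every loadable sub-directory and collects the ones that fail."""
    documents: List[document.Document] = []
    failures: Dict[str, Exception] = {}
    path_list = gcs_utilities.list_gcs_document_tree(gcs_bucket_name, gcs_prefix)
    for directory, files in path_list.items():
        if files == [""]:
            continue  # skip directory entries with no files, as in the PR code
        try:
            documents.append(
                document.Document.from_gcs(
                    gcs_bucket_name=gcs_bucket_name, gcs_prefix=directory
                )
            )
        except Exception as exc:  # e.g. ValueError for invalid or non-Document JSON
            failures[directory] = exc
    return documents, failures

Callers could then decide whether to raise when failures is non-empty or simply log the skipped directories.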

    @classmethod
    def from_batch_process_metadata(
        cls: Type["Document"], metadata: documentai.BatchProcessMetadata
34 changes: 33 additions & 1 deletion tests/unit/test_document.py
@@ -26,11 +26,12 @@

 import glob

-from google.cloud.vision import AnnotateFileResponse
 import pytest

 from google.cloud import documentai
+from google.cloud.storage import Blob
 from google.cloud.documentai_toolbox import document, gcs_utilities
+from google.cloud.vision import AnnotateFileResponse


 def get_bytes(file_name):
@@ -397,6 +398,37 @@ def test_document_from_gcs_with_unordered_shards(get_bytes_unordered_files_mock)
        assert page.page_number == page_index + 1


@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage")
def test_document_list_from_gcs_with_multiple_input_files(
    mock_storage,
    get_bytes_multiple_directories_mock,
):
    client = mock_storage.Client.return_value

    mock_bucket = mock.Mock()

    client.Bucket.return_value = mock_bucket

    client.list_blobs.return_value = [
        Blob(name="documentai/output/123456789/1/test_shard1.json", bucket=None),
        Blob(name="documentai/output/123456789/1/test_shard2.json", bucket=None),
        Blob(name="documentai/output/123456789/2/test_shard3.json", bucket=None),
    ]
    documents = document.Document.list_from_gcs(
        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/"
    )
    get_bytes_multiple_directories_mock.assert_called()
    assert get_bytes_multiple_directories_mock.call_count == 2

    assert len(documents) == 2

    assert documents[0].gcs_bucket_name == "test-directory"
    assert documents[0].gcs_prefix == "documentai/output/123456789/1"

    assert documents[1].gcs_bucket_name == "test-directory"
    assert documents[1].gcs_prefix == "documentai/output/123456789/2"

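For reference, a sketch of calling the new method outside the mocked test; the bucket name and prefix below are placeholders, not values from this PR.

from google.cloud.documentai_toolbox import document

# Placeholder bucket/prefix pointing at batch-process output; each sub-directory
# that contains Document JSON shards becomes one wrapped Document.
documents = document.Document.list_from_gcs(
    gcs_bucket_name="my-bucket",
    gcs_prefix="documentai/output/123456789/",
)

for doc in documents:
    # The attributes asserted in the test above record where each document was loaded from.
    print(doc.gcs_bucket_name, doc.gcs_prefix)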

def test_document_from_batch_process_metadata_with_multiple_input_files(
    get_bytes_multiple_directories_mock,
):