Commit 793763b

feat: Added Document.list_from_gcs() method to get multiple wrapped documents from a GCS directory.
Fixes #214

- Note: `from_gcs()` takes in a GCS directory, but it only works for a single sharded document from a single input document source (see the usage sketch below).
- In a GA release, it would be better practice to have `from_gcs()` take in any GCS directory and output a list of wrapped documents, but that would be a backwards-incompatible change now.
- Not sure whether it's possible/advisable to give `from_gcs()` two possible return types and have it return a list only when there are multiple wrapped documents.
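A minimal usage sketch of the new method, contrasting it with `from_gcs()`. The bucket name and output prefix below are placeholders, not values from this commit; it assumes a standard documentai-toolbox install.

```python
from google.cloud.documentai_toolbox import document

# from_gcs(): wraps the sharded output of a single input document,
# i.e. one output subdirectory such as .../123456789/0 (placeholder path).
single_doc = document.Document.from_gcs(
    gcs_bucket_name="my-bucket",
    gcs_prefix="documentai/output/123456789/0",
)

# list_from_gcs(): walks every output subdirectory under the prefix and
# returns one wrapped Document per input document in the batch.
wrapped_docs = document.Document.list_from_gcs(
    gcs_bucket_name="my-bucket",
    gcs_prefix="documentai/output/123456789/",
)
for doc in wrapped_docs:
    print(doc.gcs_prefix, len(doc.pages))
```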
1 parent 769e011 commit 793763b

File tree

2 files changed (+61 -0 lines)


google/cloud/documentai_toolbox/wrappers/document.py (+29)
```diff
@@ -481,6 +481,35 @@ def from_gcs(
             gcs_input_uri=gcs_input_uri,
         )
 
+    @classmethod
+    def list_from_gcs(
+        cls: Type["Document"],
+        gcs_bucket_name: str,
+        gcs_prefix: str,
+    ) -> List["Document"]:
+        r"""Loads a list of Documents from Cloud Storage.
+
+        Args:
+            gcs_bucket_name (str):
+                Required. The gcs bucket.
+
+                Format: Given `gs://{bucket_name}/{optional_folder}/{operation_id}/` where `gcs_bucket_name={bucket_name}`.
+            gcs_prefix (str):
+                Required. The prefix to the location of the target folder.
+
+                Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}` where `gcs_prefix={optional_folder}/{target_folder}`.
+        Returns:
+            List[Document]:
+                A List of documents from gcs.
+        """
+        return [
+            Document.from_gcs(gcs_bucket_name=gcs_bucket_name, gcs_prefix=directory)
+            for directory, files in gcs_utilities.list_gcs_document_tree(
+                gcs_bucket_name, gcs_prefix
+            ).items()
+            if files != [""]
+        ]
+
     @classmethod
     def from_batch_process_metadata(
         cls: Type["Document"], metadata: documentai.BatchProcessMetadata
```
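For context on the comprehension above: `gcs_utilities.list_gcs_document_tree()` is consumed as a mapping of output subdirectories to the file names they contain, and the `if files != [""]` filter skips empty placeholder entries so only directories with actual shard files become wrapped documents. Below is an illustrative sketch of that shape, built from the blob names used in the unit test further down; the exact keys and values of the library's return value may differ.

```python
# Illustrative only: approximate shape of the mapping returned by
# gcs_utilities.list_gcs_document_tree("my-bucket", "documentai/output/123456789/"),
# inferred from how list_from_gcs() consumes it.
document_tree = {
    "documentai/output/123456789/1": ["test_shard1.json", "test_shard2.json"],
    "documentai/output/123456789/2": ["test_shard3.json"],
    "documentai/output/123456789": [""],  # placeholder entry, dropped by `if files != [""]`
}

# list_from_gcs() then builds one wrapped Document per remaining directory,
# roughly equivalent to:
# [
#     Document.from_gcs(gcs_bucket_name="my-bucket", gcs_prefix="documentai/output/123456789/1"),
#     Document.from_gcs(gcs_bucket_name="my-bucket", gcs_prefix="documentai/output/123456789/2"),
# ]
```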

tests/unit/test_document.py (+32)
```diff
@@ -30,6 +30,7 @@
 import pytest
 
 from google.cloud import documentai
+from google.cloud.storage import Blob, Bucket
 from google.cloud.documentai_toolbox import document, gcs_utilities
 
 
@@ -397,6 +398,37 @@ def test_document_from_gcs_with_unordered_shards(get_bytes_unordered_files_mock)
         assert page.page_number == page_index + 1
 
 
+@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage")
+def test_document_list_from_gcs_with_multiple_input_files(
+    mock_storage,
+    get_bytes_multiple_directories_mock,
+):
+    client = mock_storage.Client.return_value
+
+    mock_bucket = mock.Mock()
+
+    client.Bucket.return_value = mock_bucket
+
+    client.list_blobs.return_value = [
+        Blob(name="documentai/output/123456789/1/test_shard1.json", bucket=None),
+        Blob(name="documentai/output/123456789/1/test_shard2.json", bucket=None),
+        Blob(name="documentai/output/123456789/2/test_shard3.json", bucket=None),
+    ]
+    documents = document.Document.list_from_gcs(
+        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/"
+    )
+    get_bytes_multiple_directories_mock.assert_called()
+    assert get_bytes_multiple_directories_mock.call_count == 2
+
+    assert len(documents) == 2
+
+    assert documents[0].gcs_bucket_name == "test-directory"
+    assert documents[0].gcs_prefix == "documentai/output/123456789/1"
+
+    assert documents[1].gcs_bucket_name == "test-directory"
+    assert documents[1].gcs_prefix == "documentai/output/123456789/2"
+
+
 def test_document_from_batch_process_metadata_with_multiple_input_files(
     get_bytes_multiple_directories_mock,
 ):
```
