Skip to content

Commit

Permalink
Text and PDFs uploads (#819)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexey-cord-tech authored Dec 11, 2024
1 parent f3fb970 commit 3d3b419
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 4 deletions.
2 changes: 2 additions & 0 deletions encord/constants/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class DataType(StringEnum):
DICOM_STUDY = "dicom_study"
NIFTI = "nifti"
AUDIO = "audio"
PLAIN_TEXT = "plain_text"
PDF = "pdf"

# will be displayed if the Encord platform has a new data type that is not present in this SDK version. Please upgrade your SDK version
MISSING_DATA_TYPE = "_MISSING_DATA_TYPE_"
Expand Down
13 changes: 9 additions & 4 deletions encord/objects/ontology_labels_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1707,7 +1707,7 @@ def _to_encord_data_unit(self, frame_level_data: FrameLevelImageGroupData) -> Di
):
data_sequence = frame_level_data.frame_number

elif data_type == DataType.AUDIO:
elif data_type == DataType.AUDIO or data_type == DataType.PLAIN_TEXT or data_type == DataType.PDF:
data_sequence = 0

elif data_type == DataType.DICOM_STUDY:
Expand Down Expand Up @@ -1760,7 +1760,7 @@ def _to_encord_labels(self, frame_level_data: FrameLevelImageGroupData) -> Dict[
for frame in self._frame_to_hashes.keys():
ret[str(frame)] = self._to_encord_label(frame)

elif data_type == DataType.AUDIO:
elif data_type == DataType.AUDIO or data_type == DataType.PDF or data_type == DataType.PLAIN_TEXT:
return {}

elif data_type == DataType.DICOM_STUDY:
Expand Down Expand Up @@ -2009,6 +2009,12 @@ def _parse_label_row_dict(self, label_row_dict: dict) -> LabelRowReadOnlyData:
elif data_type == DataType.DICOM_STUDY:
pass

elif data_type == DataType.PLAIN_TEXT or data_type == DataType.PDF:
data_dict = list(label_row_dict["data_units"].values())[0]
data_link = data_dict["data_link"]
height = None
width = None

elif data_type == DataType.MISSING_DATA_TYPE:
raise NotImplementedError(f"The data type {data_type} is not implemented yet.")

Expand Down Expand Up @@ -2082,8 +2088,7 @@ def _parse_labels_from_dict(self, label_row_dict: dict):
elif data_type == DataType.MISSING_DATA_TYPE:
raise NotImplementedError(f"Got an unexpected data type `{data_type}`")

# In the future, PDF and Text should come here
elif data_type == DataType.AUDIO:
elif data_type == DataType.AUDIO or data_type == DataType.PDF or data_type == DataType.PLAIN_TEXT:
self._add_classification_instances_from_classifications_without_frames(classification_answers)

else:
Expand Down
36 changes: 36 additions & 0 deletions encord/orm/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,36 @@ class DataUploadDicomSeries(BaseDTO):
"""Type of the external file."""


class DataUploadText(BaseDTO):
    """
    Data about a text item to be registered with Encord service.
    """

    object_url: str
    """URL of the text (TXT, HTML, etc) file to be registered with Encord service."""
    title: Optional[str] = None
    """Title of the file (derived from the URL if omitted)."""
    client_metadata: dict = Field(default_factory=dict)
    """Custom metadata to be associated with the file."""

    # The Literal pins this field to the single value "PLAIN_TEXT".
    external_file_type: Literal["PLAIN_TEXT"] = "PLAIN_TEXT"
    """Type of the external file."""

    placeholder_item_uuid: Optional[UUID] = None
    """For system use only."""


class DataUploadPDF(BaseDTO):
    """
    Data about a PDF item to be registered with Encord service.
    """

    object_url: str
    """URL of the PDF file to be registered with Encord service."""
    title: Optional[str] = None
    """Title of the file (derived from the URL if omitted)."""
    client_metadata: dict = Field(default_factory=dict)
    """Custom metadata to be associated with the file."""

    # The Literal pins this field to the single value "PDF".
    external_file_type: Literal["PDF"] = "PDF"
    """Type of the external file."""

    placeholder_item_uuid: Optional[UUID] = None
    """For system use only."""


class DataUploadAudio(BaseDTO):
"""
Data about an audio item to be registered with Encord service.
Expand Down Expand Up @@ -458,6 +488,12 @@ class DataUploadItems(BaseDTO):
nifti: List[DataUploadNifti] = Field(default_factory=list)
"""List of NIFTI items to be registered. See :class:`DataUploadNifti` for more details."""

text: List[DataUploadText] = Field(default_factory=list)
"""List of text items to be registered. See :class:`DataUploadText` for more details."""

pdf: List[DataUploadPDF] = Field(default_factory=list)
"""List of PDF items to be registered. See :class:`DataUploadPDF` for more details."""

skip_duplicate_urls: bool = False
"""If set to `True`, Encord service will skip items with URLs that already exist in the same folder.
Otherwise, duplicate items will be created."""
Expand Down
144 changes: 144 additions & 0 deletions encord/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,136 @@ def upload_audio(
else:
return upload_result.items_with_names[0].item_uuid

def upload_text(
    self,
    file_path: Union[Path, str],
    title: Optional[str] = None,
    client_metadata: Optional[Dict[str, Any]] = None,
    cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
) -> UUID:  # TODO this should return an item?
    """
    Upload a text file to a Folder in Encord storage.

    The file is first pushed to a signed upload URL and then registered as a
    text item via the placeholder item that was reserved for that URL.

    Args:
        file_path: File path of the text file. For example: '/home/user/data/report.txt'
        title:
            The item title. If unspecified, the file name is used as the title.
        client_metadata:
            Optional custom metadata to be associated with the text file. Should be a dictionary
            that is JSON-serializable.
        cloud_upload_settings:
            Settings for uploading data into the cloud. Change this object to overwrite the default values.

    Returns:
        UUID of the newly created text item.

    Raises:
        AuthorizationError: If the user is not authorized to access the folder.
        EncordException: If no single upload location could be obtained, or if the text file
            could not be registered. For example, due to being in an unsupported format.
    """
    signed_urls = self._get_upload_signed_urls(
        item_type=StorageItemType.PLAIN_TEXT, count=1, frames_subfolder_name=None
    )
    # Exactly one upload slot was requested; anything else means the location is unusable.
    if len(signed_urls) != 1:
        raise EncordException("Can't access upload location")
    upload_slot = signed_urls[0]

    title = self._guess_title(title, file_path)

    self._upload_local_file(
        file_path,
        title,
        StorageItemType.PLAIN_TEXT,
        upload_slot.signed_url,
        cloud_upload_settings,
    )

    text_item = orm_storage.DataUploadText(
        # object_url is actually ignored when placeholder_item_uuid is set
        object_url=upload_slot.object_key,
        placeholder_item_uuid=upload_slot.item_uuid,
        title=title,
        client_metadata=client_metadata or {},
    )
    registration = self._add_data(
        integration_id=None,
        private_files=DataUploadItems(text=[text_item]),
        ignore_errors=False,
    )

    if registration.status == LongPollingStatus.ERROR:
        raise EncordException(f"Could not register text file, errors occurred {registration.errors}")
    return registration.items_with_names[0].item_uuid

def upload_pdf(
    self,
    file_path: Union[Path, str],
    title: Optional[str] = None,
    client_metadata: Optional[Dict[str, Any]] = None,
    cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
) -> UUID:  # TODO this should return an item?
    """
    Upload a PDF file to a Folder in Encord storage.

    The file is first pushed to a signed upload URL and then registered as a
    PDF item via the placeholder item that was reserved for that URL.

    Args:
        file_path: File path of the PDF file. For example: '/home/user/data/report.pdf'
        title:
            The item title. If unspecified, the file name is used as the title.
        client_metadata:
            Optional custom metadata to be associated with the PDF. Should be a dictionary
            that is JSON-serializable.
        cloud_upload_settings:
            Settings for uploading data into the cloud. Change this object to overwrite the default values.

    Returns:
        UUID of the newly created PDF item.

    Raises:
        AuthorizationError: If the user is not authorized to access the folder.
        EncordException: If no single upload location could be obtained, or if the document
            could not be registered. For example, due to being in an unsupported format.
    """
    upload_url_info = self._get_upload_signed_urls(
        item_type=StorageItemType.PDF, count=1, frames_subfolder_name=None
    )
    # Exactly one upload slot was requested; anything else means the location is unusable.
    if len(upload_url_info) != 1:
        raise EncordException("Can't access upload location")
    upload_slot = upload_url_info[0]

    title = self._guess_title(title, file_path)

    self._upload_local_file(
        file_path,
        title,
        StorageItemType.PDF,
        upload_slot.signed_url,
        cloud_upload_settings,
    )

    upload_result = self._add_data(
        integration_id=None,
        private_files=DataUploadItems(
            pdf=[
                orm_storage.DataUploadPDF(
                    # object_url is actually ignored when placeholder_item_uuid is set
                    object_url=upload_slot.object_key,
                    placeholder_item_uuid=upload_slot.item_uuid,
                    title=title,
                    client_metadata=client_metadata or {},
                )
            ],
        ),
        ignore_errors=False,
    )

    if upload_result.status == LongPollingStatus.ERROR:
        # The message names the PDF; it previously said "text file", copied from upload_text.
        raise EncordException(f"Could not register PDF file, errors occurred {upload_result.errors}")
    return upload_result.items_with_names[0].item_uuid

def add_private_data_to_folder_start(
self,
integration_id: str,
Expand Down Expand Up @@ -1108,6 +1238,20 @@ def _get_content_type(self, file_path: Union[Path, str], item_type: StorageItemT
return "application/nifti"
elif item_type == StorageItemType.DICOM_FILE:
return "application/dicom"
elif item_type == StorageItemType.PLAIN_TEXT:
text_mime = mimetypes.guess_type(str(file_path))[0]
if text_mime and (
text_mime.startswith("text/")
or text_mime.startswith("application/json")
or text_mime.startswith("application/xml")
):
return text_mime
else:
raise ValueError(
f"Type of {file_path} is detected to be '{text_mime}', which is not supported for text annotations"
)
elif item_type == StorageItemType.PDF:
return "application/pdf"
else:
raise ValueError(f"Unsupported upload item type `{item_type}`")

Expand Down

0 comments on commit 3d3b419

Please sign in to comment.