diff --git a/src/kili/entrypoints/cli/project/export.py b/src/kili/entrypoints/cli/project/export.py index d57508383..440cff252 100644 --- a/src/kili/entrypoints/cli/project/export.py +++ b/src/kili/entrypoints/cli/project/export.py @@ -123,6 +123,7 @@ def export_labels( asset_filter_kwargs=None, normalized_coordinates=normalized_coordinates, label_type_in=None, + include_sent_back_labels=None, ) except NoCompatibleJobError as excp: print(str(excp)) diff --git a/src/kili/llm/services/export/dynamic.py b/src/kili/llm/services/export/dynamic.py index 6f8ac8179..374e11294 100644 --- a/src/kili/llm/services/export/dynamic.py +++ b/src/kili/llm/services/export/dynamic.py @@ -29,17 +29,6 @@ "modelName", ] -ASSET_NEEDED_FIELDS = [ - "assetProjectModels.id", - "assetProjectModels.projectModelId", - "assetProjectModels.configuration", - "content", - "externalId", - "jsonMetadata", - *(f"labels.{field}" for field in LABELS_NEEDED_FIELDS), - "status", -] - class LLMDynamicExporter: """Handle exports of LLM_RLHF projects.""" diff --git a/src/kili/presentation/client/label.py b/src/kili/presentation/client/label.py index c2fe1895d..3af4318f2 100644 --- a/src/kili/presentation/client/label.py +++ b/src/kili/presentation/client/label.py @@ -1138,7 +1138,7 @@ def export_labels( asset_filter_kwargs: Optional[Dict[str, object]] = None, normalized_coordinates: Optional[bool] = None, label_type_in: Optional[List[str]] = None, - include_sent_back_labels: Optional[bool] = True, + include_sent_back_labels: Optional[bool] = None, ) -> Optional[List[Dict[str, Union[List[str], str]]]]: # pylint: disable=line-too-long """Export the project labels with the requested format into the requested output path. diff --git a/src/kili/services/export/__init__.py b/src/kili/services/export/__init__.py index ea3890249..813beb782 100644 --- a/src/kili/services/export/__init__.py +++ b/src/kili/services/export/__init__.py @@ -39,11 +39,17 @@ def export_labels( # pylint: disable=too-many-arguments, too-many-locals asset_filter_kwargs: Optional[Dict[str, object]], normalized_coordinates: Optional[bool], label_type_in: Optional[List[str]], - include_sent_back_labels: Optional[bool] = True, + include_sent_back_labels: Optional[bool], ) -> Optional[List[Dict[str, Union[List[str], str]]]]: """Export the selected assets into the required format, and save it into a file archive.""" kili.kili_api_gateway.get_project(project_id, ["id"]) + include_sent_back_labels = ( + include_sent_back_labels + if include_sent_back_labels is not None + else (label_format != "llm_v1") + ) + export_params = ExportParams( assets_ids=asset_ids, project_id=project_id, @@ -57,7 +63,7 @@ def export_labels( # pylint: disable=too-many-arguments, too-many-locals asset_filter_kwargs=asset_filter_kwargs, normalized_coordinates=normalized_coordinates, label_type_in=label_type_in, - include_sent_back_labels=include_sent_back_labels if label_format != "llm_v1" else False, + include_sent_back_labels=include_sent_back_labels, ) logger = get_logger(log_level) diff --git a/src/kili/services/export/format/base.py b/src/kili/services/export/format/base.py index bb1f48994..c60327c74 100644 --- a/src/kili/services/export/format/base.py +++ b/src/kili/services/export/format/base.py @@ -289,8 +289,9 @@ def preprocess_assets(self, assets: List[Dict]) -> List[Dict]: lambda label: label["isSentBackToQueue"] is False, labels_of_asset ) ) - asset["labels"] = labels_of_asset - assets_in_format.append(asset) + if len(labels_of_asset) > 0: + asset["labels"] = labels_of_asset + assets_in_format.append(asset) if "latestLabel" in asset: label = asset["latestLabel"] if label is not None: diff --git a/src/kili/services/export/format/llm/__init__.py b/src/kili/services/export/format/llm/__init__.py index 9f63bcb55..be067a1f0 100644 --- a/src/kili/services/export/format/llm/__init__.py +++ b/src/kili/services/export/format/llm/__init__.py @@ -53,7 +53,7 @@ def process_and_save( def process(self, assets: List[Dict]) -> List[Dict[str, Union[List[str], str]]]: """LLM specific process.""" warnings.warn( - "Exporting llm labels with `kili.export` is deprecated." + "Exporting llm labels with `kili.export_labels` is deprecated." " Please use `kili.llm.export` instead.", DeprecationWarning, stacklevel=2, @@ -97,6 +97,8 @@ def _process_llm_dynamic_v1(self, assets: List[Dict]) -> List[Dict[str, Union[Li def _process_llm_v1(self, assets: List[Dict]) -> List[Dict[str, Union[List[str], str]]]: result = [] + if len(assets) == 0: + return result for asset in assets: result.append( { @@ -263,7 +265,10 @@ def _format_raw_data( "id": _safe_pop(chat_items_ids), "chat_id": chat_id, "model": models[index_completion] - if (index == len(prompts) - 1 or all_model_keys) + if ( + (index == len(prompts) - 1 or all_model_keys) + and len(models) > index_completion + ) else None, } ) diff --git a/tests/unit/llm/services/export/test_static.py b/tests/unit/llm/services/export/test_static.py index 9380de8d6..8c42ec8cb 100644 --- a/tests/unit/llm/services/export/test_static.py +++ b/tests/unit/llm/services/export/test_static.py @@ -2,148 +2,9 @@ import tempfile from kili.llm.presentation.client.llm import LlmClientMethods - -mock_json_interface = { - "jobs": { - "CLASSIFICATION_JOB": { - "content": { - "categories": { - "A_BETTER_THAN_B": { - "children": [], - "name": "A better than B", - "id": "category1", - }, - "B_BETTER_THAN_A": { - "children": [], - "name": "B better than A", - "id": "category2", - }, - "TIE": {"children": [], "name": "Tie", "id": "category3"}, - }, - "input": "radio", - }, - "instruction": "Compare", - "mlTask": "CLASSIFICATION", - "required": 0, - "isChild": False, - "isNew": False, - }, - "TRANSCRIPTION_JOB": { - "content": {"input": "markdown"}, - "instruction": "", - "mlTask": "TRANSCRIPTION", - "required": 0, - "isChild": False, - "isNew": False, - }, - } -} - -mock_fetch_assets = [ - { - "labels": [ - { - "author": { - "id": "user-1", - "email": "test+admin@kili-technology.com", - "firstname": "Test", - "lastname": "Admin", - }, - "jsonResponse": { - "CLASSIFICATION_JOB": {"categories": [{"name": "A_BETTER_THAN_B"}]} - }, - "createdAt": "2024-08-05T13:03:00.051Z", - "isLatestLabelForUser": True, - "isSentBackToQueue": False, - "labelType": "DEFAULT", - "modelName": None, - } - ], - "content": "https://storage.googleapis.com/label-public-staging/demo-projects/LLM/01.json", - "externalId": "asset#0", - "jsonMetadata": {}, - "status": "LABELED", - }, - { - "labels": [ - { - "author": { - "id": "user-1", - "email": "test+admin@kili-technology.com", - "firstname": "Test", - "lastname": "Admin", - }, - "jsonResponse": { - "CLASSIFICATION_JOB": {"categories": [{"name": "B_BETTER_THAN_A"}]} - }, - "createdAt": "2024-08-05T13:03:03.061Z", - "isLatestLabelForUser": True, - "isSentBackToQueue": False, - "labelType": "DEFAULT", - "modelName": None, - } - ], - "content": "https://storage.googleapis.com/label-public-staging/demo-projects/LLM/02.json", - "externalId": "asset#1", - "jsonMetadata": {}, - "status": "LABELED", - }, - { - "labels": [ - { - "author": { - "id": "user-1", - "email": "test+admin@kili-technology.com", - "firstname": "Test", - "lastname": "Admin", - }, - "jsonResponse": { - "CLASSIFICATION_JOB": {"categories": [{"name": "TIE"}]}, - "TRANSCRIPTION_JOB": {"text": "There is only some formatting changes\n"}, - }, - "createdAt": "2024-08-05T13:03:16.028Z", - "isLatestLabelForUser": True, - "isSentBackToQueue": True, - "labelType": "DEFAULT", - "modelName": None, - } - ], - "content": "https://storage.googleapis.com/label-public-staging/demo-projects/LLM/03.json", - "externalId": "asset#2", - "jsonMetadata": {}, - "status": "LABELED", - }, -] - -mock_raw_asset_content = """{ - "prompts": [ - { - "prompt": "BLABLABLA", - "completions": [ - { - "content": "response A1" - }, - { - "content": "response B1" - } - ] - }, - { - "prompt": "BLIBLIBLI", - "completions": [ - { - "content": "response A2" - }, - { - "content": "response B2" - } - ] - } - ], - "type": "markdown", - "version": "0.1" -} -""" +from tests.unit.services.export.fakes.llm_json_interface import mock_json_interface +from tests.unit.services.export.fakes.llm_project_assets import mock_fetch_assets +from tests.unit.services.export.fakes.llm_raw_asset_content import mock_raw_asset_content expected_export = [ { diff --git a/tests/unit/services/export/fakes/llm_json_interface.py b/tests/unit/services/export/fakes/llm_json_interface.py new file mode 100644 index 000000000..a57d628e9 --- /dev/null +++ b/tests/unit/services/export/fakes/llm_json_interface.py @@ -0,0 +1,35 @@ +mock_json_interface = { + "jobs": { + "CLASSIFICATION_JOB": { + "content": { + "categories": { + "A_BETTER_THAN_B": { + "children": [], + "name": "A better than B", + "id": "category1", + }, + "B_BETTER_THAN_A": { + "children": [], + "name": "B better than A", + "id": "category2", + }, + "TIE": {"children": [], "name": "Tie", "id": "category3"}, + }, + "input": "radio", + }, + "instruction": "Compare", + "mlTask": "CLASSIFICATION", + "required": 0, + "isChild": False, + "isNew": False, + }, + "TRANSCRIPTION_JOB": { + "content": {"input": "markdown"}, + "instruction": "", + "mlTask": "TRANSCRIPTION", + "required": 0, + "isChild": False, + "isNew": False, + }, + } +} diff --git a/tests/unit/services/export/fakes/llm_project_assets.py b/tests/unit/services/export/fakes/llm_project_assets.py new file mode 100644 index 000000000..9d9d5c628 --- /dev/null +++ b/tests/unit/services/export/fakes/llm_project_assets.py @@ -0,0 +1,75 @@ +mock_fetch_assets = [ + { + "labels": [ + { + "author": { + "id": "user-1", + "email": "test+admin@kili-technology.com", + "firstname": "Test", + "lastname": "Admin", + }, + "jsonResponse": { + "CLASSIFICATION_JOB": {"categories": [{"name": "A_BETTER_THAN_B"}]} + }, + "createdAt": "2024-08-05T13:03:00.051Z", + "isLatestLabelForUser": True, + "isSentBackToQueue": False, + "labelType": "DEFAULT", + "modelName": None, + } + ], + "content": "https://storage.googleapis.com/label-public-staging/demo-projects/LLM/01.json", + "externalId": "asset#0", + "jsonMetadata": {}, + "status": "LABELED", + }, + { + "labels": [ + { + "author": { + "id": "user-1", + "email": "test+admin@kili-technology.com", + "firstname": "Test", + "lastname": "Admin", + }, + "jsonResponse": { + "CLASSIFICATION_JOB": {"categories": [{"name": "B_BETTER_THAN_A"}]} + }, + "createdAt": "2024-08-05T13:03:03.061Z", + "isLatestLabelForUser": True, + "isSentBackToQueue": False, + "labelType": "DEFAULT", + "modelName": None, + } + ], + "content": "https://storage.googleapis.com/label-public-staging/demo-projects/LLM/02.json", + "externalId": "asset#1", + "jsonMetadata": {}, + "status": "LABELED", + }, + { + "labels": [ + { + "author": { + "id": "user-1", + "email": "test+admin@kili-technology.com", + "firstname": "Test", + "lastname": "Admin", + }, + "jsonResponse": { + "CLASSIFICATION_JOB": {"categories": [{"name": "TIE"}]}, + "TRANSCRIPTION_JOB": {"text": "There is only some formatting changes\n"}, + }, + "createdAt": "2024-08-05T13:03:16.028Z", + "isLatestLabelForUser": True, + "isSentBackToQueue": True, + "labelType": "DEFAULT", + "modelName": None, + } + ], + "content": "https://storage.googleapis.com/label-public-staging/demo-projects/LLM/03.json", + "externalId": "asset#2", + "jsonMetadata": {}, + "status": "LABELED", + }, +] diff --git a/tests/unit/services/export/fakes/llm_raw_asset_content.py b/tests/unit/services/export/fakes/llm_raw_asset_content.py new file mode 100644 index 000000000..4bc3e9422 --- /dev/null +++ b/tests/unit/services/export/fakes/llm_raw_asset_content.py @@ -0,0 +1,29 @@ +mock_raw_asset_content = """{ + "prompts": [ + { + "prompt": "BLABLABLA", + "completions": [ + { + "content": "response A1" + }, + { + "content": "response B1" + } + ] + }, + { + "prompt": "BLIBLIBLI", + "completions": [ + { + "content": "response A2" + }, + { + "content": "response B2" + } + ] + } + ], + "type": "markdown", + "version": "0.1" +} +""" diff --git a/tests/unit/services/export/test_export.py b/tests/unit/services/export/test_export.py index b979ebb10..dc6cd8706 100644 --- a/tests/unit/services/export/test_export.py +++ b/tests/unit/services/export/test_export.py @@ -1,6 +1,7 @@ # pylint: disable=missing-module-docstring import glob import os +import tempfile from pathlib import Path from tempfile import TemporaryDirectory from unittest.mock import patch @@ -31,6 +32,10 @@ ) from tests.unit.services.export.fakes.fake_ffmpeg import mock_ffmpeg +from .fakes.llm_json_interface import mock_json_interface +from .fakes.llm_project_assets import mock_fetch_assets +from .fakes.llm_raw_asset_content import mock_raw_asset_content + def get_file_tree(folder: str): """Returns the file tree in the shape of a dictionary. @@ -654,6 +659,7 @@ def test_export_service_layout(mocker: pytest_mock.MockerFixture, name, test_cas "asset_filter_kwargs": None, "normalized_coordinates": None, "label_type_in": None, + "include_sent_back_labels": None, } default_kwargs.update(test_case["export_kwargs"]) @@ -781,6 +787,7 @@ def test_export_service_errors(mocker_project, name, test_case, error): "asset_filter_kwargs": None, "normalized_coordinates": None, "label_type_in": None, + "include_sent_back_labels": None, } default_kwargs.update(test_case["export_kwargs"]) @@ -1174,3 +1181,373 @@ def test_when_exporting_asset_with_include_sent_back_labels_parameter_it_filter_ # Then process_and_save_mock.assert_called_once() + + +def test_when_exporting_asset_with_include_sent_back_labels_parameter_it_filter_asset_exported_on_llm_v1( + mocker: pytest_mock.MockerFixture, +): + expected_export = [ + { + "raw_data": [ + { + "role": "user", + "content": "BLABLABLA", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "user", + "content": "BLIBLIBLI", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A2", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B2", + "id": None, + "chat_id": None, + "model": None, + }, + ], + "status": "LABELED", + "external_id": "asset#0", + "metadata": {}, + "labels": [ + { + "author": "test+admin@kili-technology.com", + "created_at": "2024-08-05T13:03:00.051Z", + "label_type": "DEFAULT", + "label": {"CLASSIFICATION_JOB": ["A_BETTER_THAN_B"]}, + } + ], + }, + { + "raw_data": [ + { + "role": "user", + "content": "BLABLABLA", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "user", + "content": "BLIBLIBLI", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A2", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B2", + "id": None, + "chat_id": None, + "model": None, + }, + ], + "status": "LABELED", + "external_id": "asset#1", + "metadata": {}, + "labels": [ + { + "author": "test+admin@kili-technology.com", + "created_at": "2024-08-05T13:03:03.061Z", + "label_type": "DEFAULT", + "label": {"CLASSIFICATION_JOB": ["B_BETTER_THAN_A"]}, + } + ], + }, + { + "raw_data": [ + { + "role": "user", + "content": "BLABLABLA", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "user", + "content": "BLIBLIBLI", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A2", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B2", + "id": None, + "chat_id": None, + "model": None, + }, + ], + "status": "LABELED", + "external_id": "asset#2", + "metadata": {}, + "labels": [ + { + "author": "test+admin@kili-technology.com", + "created_at": "2024-08-05T13:03:16.028Z", + "label_type": "DEFAULT", + "label": { + "CLASSIFICATION_JOB": ["TIE"], + "TRANSCRIPTION_JOB": "There is only some formatting changes\n", + }, + } + ], + }, + ] + + get_project_return_val = { + "jsonInterface": mock_json_interface, + "inputType": "LLM_RLHF", + "title": "", + "id": "project_id", + "dataConnections": None, + } + kili = mock_kili(mocker, with_data_connection=False) + kili.api_endpoint = "https://" # type: ignore + kili.api_key = "" # type: ignore + kili.graphql_client = mocker.MagicMock() # pyright: ignore[reportGeneralTypeIssues] + kili.http_client = mocker.MagicMock() # pyright: ignore[reportGeneralTypeIssues] + kili.kili_api_gateway = mocker.MagicMock() + kili.kili_api_gateway.count_assets.return_value = 1 + kili.kili_api_gateway.get_project.return_value = get_project_return_val + fd, path = tempfile.mkstemp() + + try: + with os.fdopen(fd, "w") as tmp: + tmp.write(mock_raw_asset_content) + for mocked_asset in mock_fetch_assets: + mocked_asset["content"] = path + with patch("kili.services.export.format.base.fetch_assets") as mocked_fetch_assets: + mocked_fetch_assets.return_value = mock_fetch_assets + result = kili.export_labels( + project_id="project_id", + fmt="llm_v1", + filename=None, + include_sent_back_labels=True, + ) + assert result == expected_export + finally: + os.remove(path) + + +def test_when_exporting_asset_with_include_sent_back_labels_parameter_at_false_it_filter_asset_exported_on_llm_v1( + mocker: pytest_mock.MockerFixture, +): + expected_export = [ + { + "raw_data": [ + { + "role": "user", + "content": "BLABLABLA", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "user", + "content": "BLIBLIBLI", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A2", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B2", + "id": None, + "chat_id": None, + "model": None, + }, + ], + "status": "LABELED", + "external_id": "asset#0", + "metadata": {}, + "labels": [ + { + "author": "test+admin@kili-technology.com", + "created_at": "2024-08-05T13:03:00.051Z", + "label_type": "DEFAULT", + "label": {"CLASSIFICATION_JOB": ["A_BETTER_THAN_B"]}, + } + ], + }, + { + "raw_data": [ + { + "role": "user", + "content": "BLABLABLA", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B1", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "user", + "content": "BLIBLIBLI", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response A2", + "id": None, + "chat_id": None, + "model": None, + }, + { + "role": "assistant", + "content": "response B2", + "id": None, + "chat_id": None, + "model": None, + }, + ], + "status": "LABELED", + "external_id": "asset#1", + "metadata": {}, + "labels": [ + { + "author": "test+admin@kili-technology.com", + "created_at": "2024-08-05T13:03:03.061Z", + "label_type": "DEFAULT", + "label": {"CLASSIFICATION_JOB": ["B_BETTER_THAN_A"]}, + } + ], + }, + ] + + get_project_return_val = { + "jsonInterface": mock_json_interface, + "inputType": "LLM_RLHF", + "title": "", + "id": "project_id", + "dataConnections": None, + } + kili = mock_kili(mocker, with_data_connection=False) + kili.api_endpoint = "https://" # type: ignore + kili.api_key = "" # type: ignore + kili.graphql_client = mocker.MagicMock() # pyright: ignore[reportGeneralTypeIssues] + kili.http_client = mocker.MagicMock() # pyright: ignore[reportGeneralTypeIssues] + kili.kili_api_gateway = mocker.MagicMock() + kili.kili_api_gateway.count_assets.return_value = 1 + kili.kili_api_gateway.get_project.return_value = get_project_return_val + fd, path = tempfile.mkstemp() + + try: + with os.fdopen(fd, "w") as tmp: + tmp.write(mock_raw_asset_content) + for mocked_asset in mock_fetch_assets: + mocked_asset["content"] = path + with patch("kili.services.export.format.base.fetch_assets") as mocked_fetch_assets: + mocked_fetch_assets.return_value = mock_fetch_assets + result = kili.export_labels( + project_id="project_id", + fmt="llm_v1", + filename=None, + include_sent_back_labels=False, + ) + assert result == expected_export + finally: + os.remove(path)