From 5b3c6d457ef7f2a7b604032951ac9403036868a0 Mon Sep 17 00:00:00 2001
From: alex-encord
Date: Thu, 14 Dec 2023 22:42:22 +0000
Subject: [PATCH 1/8] Remove duplicate classifications.

---
 encord/objects/classification_instance.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/encord/objects/classification_instance.py b/encord/objects/classification_instance.py
index 0ecd97db1..afdeab66d 100644
--- a/encord/objects/classification_instance.py
+++ b/encord/objects/classification_instance.py
@@ -133,7 +133,8 @@ def set_for_frames(
 
         frames_list = frames_class_to_frames_list(frames)
 
-        self._check_classification_already_present(frames_list)
+        if self._check_classification_already_present(frames_list):
+            return
 
         for frame in frames_list:
             self._check_within_range(frame)
@@ -534,16 +535,13 @@ def _check_within_range(self, frame: int) -> None:
                 f"The supplied frame of `{frame}` is not within the acceptable bounds of `0` to `{self._last_frame}`."
             )
 
-    def _check_classification_already_present(self, frames: Iterable[int]) -> None:
+    def _check_classification_already_present(self, frames: Iterable[int]) -> bool:
         if self._parent is None:
-            return
+            return False
 
         already_present_frame = self._parent._is_classification_already_present(self.ontology_item, frames)
         if already_present_frame is not None:
-            raise LabelRowError(
-                f"The LabelRowV2, that this classification is part of, already has a classification of the same type "
-                f"on frame `{already_present_frame}`. The same type of classification can only be present once per "
-                f"frame per LabelRowV2."
-            )
+            return True
+        return False
 
     def __repr__(self):
         return (

From 8185c56b08f5396eaa86b10f8fc95cfd52c1764c Mon Sep 17 00:00:00 2001
From: alex-encord
Date: Fri, 15 Dec 2023 00:06:19 +0000
Subject: [PATCH 2/8] Clean 3 classes of label row errors: duplicated classifications, out of bounds, missing answer dict.
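
Note: with this change, parsing a corrupted row degrades gracefully instead of
raising `LabelRowError`. A minimal sketch of how this surfaces to SDK users;
the key path and project hash below are placeholders, the API calls are the
same ones used by the cleaning scripts at the end of this series:

    import logging

    from encord import EncordUserClient

    logging.basicConfig(level=logging.WARNING)

    user_client = EncordUserClient.create_with_ssh_private_key(
        ssh_private_key_path="/path/to/private-key"
    )
    project = user_client.get_project("<project-hash>")

    for label_row in project.list_label_rows_v2():
        # Duplicated classifications, out-of-bounds frames and missing answer
        # dicts are now reported via logging.warning instead of raising.
        label_row.initialise_labels()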
---
 encord/objects/classification_instance.py | 36 +++++++++++++----------
 encord/objects/ontology_labels_impl.py    | 10 +++++--
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/encord/objects/classification_instance.py b/encord/objects/classification_instance.py
index afdeab66d..020c5ad79 100644
--- a/encord/objects/classification_instance.py
+++ b/encord/objects/classification_instance.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass, field
@@ -134,21 +135,25 @@ def set_for_frames(
         frames_list = frames_class_to_frames_list(frames)
 
         if self._check_classification_already_present(frames_list):
+            logging.warning(f'Skipping {frames_list}')
             return
 
         for frame in frames_list:
-            self._check_within_range(frame)
-            self._set_frame_and_frame_data(
-                frame,
-                overwrite=overwrite,
-                created_at=created_at,
-                created_by=created_by,
-                confidence=confidence,
-                manual_annotation=manual_annotation,
-                last_edited_at=last_edited_at,
-                last_edited_by=last_edited_by,
-                reviews=reviews,
-            )
+            if self._check_within_range(frame):
+                self._set_frame_and_frame_data(
+                    frame,
+                    overwrite=overwrite,
+                    created_at=created_at,
+                    created_by=created_by,
+                    confidence=confidence,
+                    manual_annotation=manual_annotation,
+                    last_edited_at=last_edited_at,
+                    last_edited_by=last_edited_by,
+                    reviews=reviews,
+                )
+            else:
+                logging.warning(f'Cutting {frame} in {frames_list}')
+                return
 
         if self.is_assigned_to_label_row():
             assert self._parent is not None
@@ -529,11 +534,10 @@ def _is_selectable_child_attribute(self, attribute: Attribute) -> bool:
         top_attribute = ontology_classification.attributes[0]
         return _search_child_attributes(attribute, top_attribute, self._static_answer_map)
 
-    def _check_within_range(self, frame: int) -> None:
+    def _check_within_range(self, frame: int) -> bool:
         if frame < 0 or frame >= self._last_frame:
-            raise LabelRowError(
-                f"The supplied frame of `{frame}` is not within the acceptable bounds of `0` to `{self._last_frame}`."
-            )
+            return False
+        return True
 
     def _check_classification_already_present(self, frames: Iterable[int]) -> bool:
         if self._parent is None:
diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py
index 8c677f48f..d099d6029 100644
--- a/encord/objects/ontology_labels_impl.py
+++ b/encord/objects/ontology_labels_impl.py
@@ -1422,7 +1422,8 @@ def _add_classification_instances_from_classifications(
                 classification_instance = self._create_new_classification_instance(
                     frame_classification_label, frame, classification_answers
                 )
-                self.add_classification_instance(classification_instance)
+                if classification_instance:
+                    self.add_classification_instance(classification_instance)
             else:
                 self._add_frames_to_classification_instance(frame_classification_label, frame)
 
@@ -1444,7 +1445,7 @@ def _parse_image_group_frame_level_data(self, label_row_data_units: dict) -> Dic
 
     def _create_new_classification_instance(
         self, frame_classification_label: dict, frame: int, classification_answers: dict
-    ) -> ClassificationInstance:
+    ) -> ClassificationInstance | None:
         feature_hash = frame_classification_label["featureHash"]
         classification_hash = frame_classification_label["classificationHash"]
 
@@ -1463,7 +1464,10 @@
             reviews=frame_view.reviews,
         )
 
-        answers_dict = classification_answers[classification_hash]["classifications"]
+        answers_dict = classification_answers.get(classification_hash, {}).get("classifications")
+        if not answers_dict:
+            logging.warning(f'Skipping {classification_hash}')
+            return None
 
         self._add_static_answers_from_dict(classification_instance, answers_dict)
         return classification_instance

From b25564b856243240596b108c2791de7f8f276028 Mon Sep 17 00:00:00 2001
From: alex-encord
Date: Fri, 15 Dec 2023 11:51:44 +0000
Subject: [PATCH 3/8] Skip adding a duplicate object on a frame.

---
 encord/objects/ontology_object_instance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/encord/objects/ontology_object_instance.py b/encord/objects/ontology_object_instance.py
index 5895a60cb..b553473b3 100644
--- a/encord/objects/ontology_object_instance.py
+++ b/encord/objects/ontology_object_instance.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass, field
@@ -396,9 +397,8 @@ def set_for_frames(
 
             existing_frame_data = self._frames_to_instance_data.get(frame)
             if overwrite is False and existing_frame_data is not None:
-                raise LabelRowError(
-                    "Cannot overwrite existing data for a frame. Set `overwrite` to `True` to overwrite."
-                )
+                logging.warning(f'Skipping overwrite for {frame} in {frames_list}')
+                return
 
             check_coordinate_type(coordinates, self._ontology_object)
             self.check_within_range(frame)

From c243ac1db49e6ef16661fccb959d49000ecd7406 Mon Sep 17 00:00:00 2001
From: alex-encord
Date: Wed, 10 Jan 2024 15:35:00 +0000
Subject: [PATCH 4/8] Ignore objects that have been deleted from the ontology.
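
Note: a rough sketch of the intended effect, assuming a LabelRowV2 named
`label_row` fetched via `list_label_rows_v2()` (hypothetical name): objects
whose `featureHash` was deleted from the ontology are skipped with a warning
during initialisation, so they simply do not appear among the parsed
instances:

    label_row.initialise_labels()
    # Orphaned objects are dropped rather than raising OntologyError.
    surviving = [obj.object_hash for obj in label_row.get_object_instances()]
    print(f"{len(surviving)} object instances survived the ontology check")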
---
 encord/objects/ontology_labels_impl.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py
index d099d6029..449980001 100644
--- a/encord/objects/ontology_labels_impl.py
+++ b/encord/objects/ontology_labels_impl.py
@@ -10,7 +10,7 @@
 from encord.client import EncordClientProject
 from encord.client import LabelRow as OrmLabelRow
 from encord.constants.enums import DataType
-from encord.exceptions import LabelRowError, WrongProjectTypeError
+from encord.exceptions import LabelRowError, WrongProjectTypeError, OntologyError
 from encord.http.bundle import Bundle, BundleResultHandler, BundleResultMapper, bundled_operation
 from encord.http.limits import (
     LABEL_ROW_BUNDLE_CREATE_LIMIT,
@@ -1325,18 +1325,23 @@ def _add_object_instances_from_objects(
         for frame_object_label in objects_list:
             object_hash = frame_object_label["objectHash"]
             if object_hash not in self._objects_map:
-                object_instance = self._create_new_object_instance(frame_object_label, frame)
-                self.add_object_instance(object_instance)
+                try:
+                    object_instance = self._create_new_object_instance(frame_object_label, frame)
+                    self.add_object_instance(object_instance)
+                except OntologyError:
+                    logging.warning(f'Skipping object {object_hash} since it is not in the ontology.')
             else:
                 self._add_coordinates_to_object_instance(frame_object_label, frame)
 
     def _add_objects_answers(self, label_row_dict: dict):
         for answer in label_row_dict["object_answers"].values():
             object_hash = answer["objectHash"]
-            object_instance = self._objects_map[object_hash]
-
-            answer_list = answer["classifications"]
-            object_instance.set_answer_from_list(answer_list)
+            object_instance = self._objects_map.get(object_hash)
+            if object_instance:
+                answer_list = answer["classifications"]
+                object_instance.set_answer_from_list(answer_list)
+            else:
+                logging.warning(f'Skipping answers for object {object_hash} as it has no corresponding object.')
 
     def _add_action_answers(self, label_row_dict: dict):
         for answer in label_row_dict["object_actions"].values():

From db30930132af12adc891219fa2f704d6fe5400e8 Mon Sep 17 00:00:00 2001
From: alex-encord
Date: Mon, 22 Jan 2024 12:30:46 +0000
Subject: [PATCH 5/8] More frame skipping logic.
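
Note: the skip decisions are only reported through `logging.warning`, so a
sketch of collecting them for later review; `label_row` is assumed to be a
not-yet-initialised LabelRowV2, and the handler is the same idea used by the
cleaning scripts added at the end of this series:

    import logging

    class StringHandler(logging.Handler):
        def __init__(self):
            super().__init__()
            self.log_messages = []

        def emit(self, record):
            self.log_messages.append(record.getMessage())

    handler = StringHandler()
    logging.getLogger().addHandler(handler)

    label_row.initialise_labels()
    if handler.log_messages:
        print("Row needed cleaning:", handler.log_messages)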
---
 encord/objects/classification_instance.py | 2 +-
 encord/objects/ontology_labels_impl.py    | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/encord/objects/classification_instance.py b/encord/objects/classification_instance.py
index 020c5ad79..da6e3d7c1 100644
--- a/encord/objects/classification_instance.py
+++ b/encord/objects/classification_instance.py
@@ -135,7 +135,7 @@ def set_for_frames(
         frames_list = frames_class_to_frames_list(frames)
 
         if self._check_classification_already_present(frames_list):
-            logging.warning(f'Skipping {frames_list}')
+            logging.warning(f'Skipping {frames_list} as already present for {self.ontology_item}')
             return
 
         for frame in frames_list:
diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py
index 449980001..fa476b8a8 100644
--- a/encord/objects/ontology_labels_impl.py
+++ b/encord/objects/ontology_labels_impl.py
@@ -1428,7 +1428,10 @@ def _add_classification_instances_from_classifications(
                     frame_classification_label, frame, classification_answers
                 )
                 if classification_instance:
-                    self.add_classification_instance(classification_instance)
+                    try:
+                        self.add_classification_instance(classification_instance)
+                    except LabelRowError:
+                        logging.warning(f'Skipping {frame}')
             else:
                 self._add_frames_to_classification_instance(frame_classification_label, frame)
 

From 30ee6b57ec76ba9c67efe1d201a000b612d02253 Mon Sep 17 00:00:00 2001
From: alex-encord
Date: Wed, 24 Jan 2024 10:57:51 +0000
Subject: [PATCH 6/8] Skip orphaned classification item where the ontology hash is no longer present.

---
 encord/objects/ontology_labels_impl.py | 5 ++++-
 encord/objects/ontology_structure.py   | 6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py
index fa476b8a8..b7ad63af3 100644
--- a/encord/objects/ontology_labels_impl.py
+++ b/encord/objects/ontology_labels_impl.py
@@ -1458,6 +1458,9 @@ def _create_new_classification_instance(
         classification_hash = frame_classification_label["classificationHash"]
 
         label_class = self._ontology.structure.get_child_by_hash(feature_hash, type_=Classification)
+        if not label_class:
+            logging.warning(f'Skipping classification hash:{classification_hash} as no ontology object was found.')
+            return None
         classification_instance = ClassificationInstance(label_class, classification_hash=classification_hash)
 
         frame_view = ClassificationInstance.FrameData.from_dict(frame_classification_label)
@@ -1474,7 +1477,7 @@ def _create_new_classification_instance(
         answers_dict = classification_answers.get(classification_hash, {}).get("classifications")
         if not answers_dict:
-            logging.warning(f'Skipping {classification_hash}')
+            logging.warning(f'Skipping classification hash:{classification_hash} as no corresponding answer was found.')
             return None
 
         self._add_static_answers_from_dict(classification_instance, answers_dict)
         return classification_instance
diff --git a/encord/objects/ontology_structure.py b/encord/objects/ontology_structure.py
index f55f7829d..f03549aa2 100644
--- a/encord/objects/ontology_structure.py
+++ b/encord/objects/ontology_structure.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Type, cast
 from uuid import uuid4
@@ -28,7 +29,7 @@ def get_child_by_hash(
         self,
         feature_node_hash: str,
         type_: Optional[Type[OntologyElementT]] = None,
-    ) -> OntologyElementT:
+    ) -> OntologyElementT | None:
         """
         Returns the first child node of this ontology tree node with the matching feature node hash. If there is
         more than one child with the same feature node hash in the ontology tree node, then the ontology would be in
@@ -52,8 +53,7 @@ def get_child_by_hash(
             found_item = _get_element_by_hash(feature_node_hash, classification.attributes)
             if found_item is not None:
                 return checked_cast(found_item, type_)
-
-        raise OntologyError(f"Item not found: can't find an item with a hash {feature_node_hash} in the ontology.")
+        logging.warning(f"Item not found: can't find an item with a hash {feature_node_hash} in the ontology.")
 
     def get_child_by_title(
         self,

From bfb41baffc726ab4728158371b61a504279e0c94 Mon Sep 17 00:00:00 2001
From: alex-encord
Date: Wed, 24 Jan 2024 11:06:19 +0000
Subject: [PATCH 7/8] Clean up handling of orphaned classes for objects as we no longer throw from get_child_by_hash.

---
 encord/objects/ontology_labels_impl.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py
index b7ad63af3..cf60239c4 100644
--- a/encord/objects/ontology_labels_impl.py
+++ b/encord/objects/ontology_labels_impl.py
@@ -1325,10 +1325,10 @@ def _add_object_instances_from_objects(
         for frame_object_label in objects_list:
             object_hash = frame_object_label["objectHash"]
             if object_hash not in self._objects_map:
-                try:
-                    object_instance = self._create_new_object_instance(frame_object_label, frame)
+                object_instance = self._create_new_object_instance(frame_object_label, frame)
+                if object_instance:
                     self.add_object_instance(object_instance)
-                except OntologyError:
+                else:
                     logging.warning(f'Skipping object {object_hash} since it is not in the ontology.')
             else:
                 self._add_coordinates_to_object_instance(frame_object_label, frame)
@@ -1351,12 +1351,14 @@ def _add_action_answers(self, label_row_dict: dict):
             answer_list = answer["actions"]
             object_instance.set_answer_from_list(answer_list)
 
-    def _create_new_object_instance(self, frame_object_label: dict, frame: int) -> ObjectInstance:
+    def _create_new_object_instance(self, frame_object_label: dict, frame: int) -> ObjectInstance | None:
         ontology = self._ontology.structure
         feature_hash = frame_object_label["featureHash"]
         object_hash = frame_object_label["objectHash"]
 
         label_class = ontology.get_child_by_hash(feature_hash, type_=Object)
+        if not label_class:
+            return None
         object_instance = ObjectInstance(label_class, object_hash=object_hash)
 
         coordinates = self._get_coordinates(frame_object_label)

From 3547ba3722bf8fb7e4772c5c5cf92cd5e6c646b5 Mon Sep 17 00:00:00 2001
From: Oscar E
Date: Tue, 27 Feb 2024 13:47:17 +0000
Subject: [PATCH 8/8] OE saving scripts for duplicated and orphaned label cleaning

---
 .../UGhent-orphaned-label-cleaner.py          |  61 +++++++
 encord/oe_label_cleaning/__init__.py          |   0
 encord/oe_label_cleaning/investigate-jsons.py | 154 ++++++++++++++++++
 .../iteratable-duplicated-label-cleaner.py    |  86 ++++++++++
 4 files changed, 301 insertions(+)
 create mode 100644 encord/oe_label_cleaning/UGhent-orphaned-label-cleaner.py
 create mode 100644 encord/oe_label_cleaning/__init__.py
 create mode 100644 encord/oe_label_cleaning/investigate-jsons.py
 create mode 100644 encord/oe_label_cleaning/iteratable-duplicated-label-cleaner.py

diff --git a/encord/oe_label_cleaning/UGhent-orphaned-label-cleaner.py b/encord/oe_label_cleaning/UGhent-orphaned-label-cleaner.py
new file mode 100644
index 000000000..2d13fbd1d
--- /dev/null
+++ b/encord/oe_label_cleaning/UGhent-orphaned-label-cleaner.py
@@ -0,0 +1,61 @@
+import json
+from pprint import pprint
+
+from encord import EncordUserClient
+
+# set the ranges for the classifications you want to keep
+TRUE_LABEL_RANGES = [[19280, 19580], [22248, 22548]]
+
+# Connect to encord
+keyfile = "/Users/encord/oe-public-key-private-key.txt"
+user_client = EncordUserClient.create_with_ssh_private_key(ssh_private_key_path=keyfile)
+
+# get the project
+proj_hash = "6508ede1-cfd4-4eb7-bdc2-83508e805879"
+project = user_client.get_project(proj_hash)
+# get the label row for the specific data unit
+data_hash = "561e8ed5-b65b-4dfb-9556-8fd73d968b43"
+label_rows = project.list_label_rows_v2(data_hashes=[data_hash])
+if len(label_rows) == 1:
+    lr = label_rows.pop()
+else:
+    raise NotImplementedError("Program not built for multiple label rows")
+
+# initialise the labels and get the label row as a dictionary
+lr.initialise_labels()
+lr_dict = lr.to_encord_dict()
+# save a backup of the label row
+with open(f"{lr.label_hash}_bkp.json", "w") as f:
+    json.dump(lr_dict, f)
+
+# get the labels-by-frame dictionary
+lab_row_data_unit = list(lr_dict["data_units"].keys())[0]
+labels_by_frame = lr_dict["data_units"][lab_row_data_unit]["labels"]
+
+# iterate through frame numbers
+for frame_num in labels_by_frame.keys():
+    # is the frame number NOT within one of our desired frame ranges
+    in_true_label_range = True
+    for tlr in TRUE_LABEL_RANGES:
+        in_true_label_range = in_true_label_range and (not (tlr[0] <= int(frame_num) <= tlr[1]))
+    # look for non-desired frames that contain a classification
+    if in_true_label_range and labels_by_frame[frame_num]["classifications"] != []:
+        print("REMOVING CLASSIFICATION FROM FRAME:", frame_num, labels_by_frame[frame_num]["classifications"])
+        # get ALL classification instances for that frame
+        bad_class_instance_list = lr.get_classification_instances(filter_frames=int(frame_num))
+        # when there is one classification per frame, extract from list.
+        if len(bad_class_instance_list) == 1:
+            bad_class_instance = bad_class_instance_list.pop()
+            bad_class_instance.remove_from_frames(int(frame_num))
+        else:
+            # TODO: if you have multiple classifications in a frame then you will need to filter on classification hash.
+            raise NotImplementedError("Only one classification per frame is supported")
+
+# save a copy of the edited label row before saving
+with open(f"{lr.label_hash}_edited.json", "w") as f:
+    json.dump(lr.to_encord_dict(), f)
+
+print(f"FINISHED LABEL FILE: {lr.label_hash}_edited.json")
+
+# CHECK JSONs BEFORE SAVING ! ! ! ! ! ! !
+# lr.save()
diff --git a/encord/oe_label_cleaning/__init__.py b/encord/oe_label_cleaning/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/encord/oe_label_cleaning/investigate-jsons.py b/encord/oe_label_cleaning/investigate-jsons.py
new file mode 100644
index 000000000..89c958a61
--- /dev/null
+++ b/encord/oe_label_cleaning/investigate-jsons.py
@@ -0,0 +1,154 @@
+import json
+import os
+from pprint import pprint
+
+import pandas as pd
+
+
+def get_file_pairs(dir_path):
+    # Check if the directory exists
+    if not os.path.exists(dir_path):
+        print(f"Directory '{dir_path}' does not exist.")
+        exit()
+
+    file_pairs = []
+    # Loop through all files in the directory
+    for filename in os.listdir(dir_path):
+        # Check if the file is a JSON file and has the required suffixes
+        if filename.lower().endswith("bkp.json"):
+            file_edit = filename[: -len("bkp.json")] + "edit.json"
+            file_pairs.append((filename, file_edit))
+    return file_pairs
+
+
+def dict_compare(d1, d2):
+    d1_keys = set(d1.keys())
+    d2_keys = set(d2.keys())
+    shared_keys = d1_keys.intersection(d2_keys)
+    added = d1_keys - d2_keys
+    removed = d2_keys - d1_keys
+    modified = {o: (d1[o], d2[o]) for o in shared_keys if d1[o] != d2[o]}
+    mod_copy = modified.copy()
+    same = set(o for o in shared_keys if d1[o] == d2[o])
+
+    for k, v in mod_copy.items():
+        if (k in ["createdAt", "lastEditedAt"] and v[0][: len(v[0]) - len("GMT")].strip() == v[1].strip()) or (
+            k == "value" and v[0] == "Usable Clip" and v[1] == "usable_clip"
+        ):
+            same.add(k)
+            modified.pop(k)
+    return added, removed, modified, same
+
+
+def remove_uncorrupted_labels(b_class, e_class):
+    b_class_copy = b_class.copy()
+    e_class_copy = e_class.copy()
+    diffs = []
+    for b_cls in b_class:
+        for e_cls in e_class:
+            diff = []
+            added, removed, modified, same = dict_compare(b_cls, e_cls)
+            if (
+                (added == set() or added == dict())
+                and (removed == set() or removed == dict())
+                and (modified == set() or modified == dict())
+            ):
+                print(f"REMOVING {b_cls['classificationHash']}")
+                b_class_copy.remove(b_cls)
+                e_class_copy.remove(e_cls)
+            elif e_cls["classificationHash"] == b_cls["classificationHash"]:
+                # pprint(added)
+                # pprint(removed)
+                diff = modified.copy()
+                pprint(modified)
+                diffs.append(diff)
+                # pprint(same)
+    return b_class_copy, e_class_copy, diffs
+
+
+def compare_frames(b_dict, e_dict, dir_path):
+    lab_hash = b_dict["label_hash"]
+    problem_labels = {
+        "Frame Number": [],
+        "Label Type": [],
+        "Labels Before": [],
+        "Labels After": [],
+        "Deduped Labels Before": [],
+        "Deduped Labels After": [],
+        "Differences": [],
+        "Label Hash": [],
+    }
+    for data_unit, meta in b_dict["data_units"].items():
+        print(lab_hash)
+        for frame_num, labels in meta["labels"].items():
+            b_class = labels["classifications"]
+            b_objects = labels["objects"]
+            e_labels = e_dict["data_units"][data_unit]["labels"]
+            if frame_num in e_labels.keys():
+                e_label = e_labels[frame_num]
+                e_class = e_label["classifications"]
+                # e_class_feat_hashes = [cls["featureHash"] for cls in e_class]
+                e_objects = e_label["objects"]
+                # e_obj_feat_hashes = [obj["featureHash"] for obj in e_objects]
+            else:
+                print("\nNo e labels for this value")
+                e_class = []
+                e_objects = []
+            if len(e_class) != len(b_class):
+                print(frame_num)
+                deduped_b_class, deduped_e_class, diffs = remove_uncorrupted_labels(b_class, e_class)
+                problem_labels["Frame Number"].append(frame_num)
+                problem_labels["Label Type"].append(["Classification"])
+                problem_labels["Labels Before"].append(b_class)
+                problem_labels["Labels After"].append(e_class)
+                problem_labels["Deduped Labels Before"].append(deduped_b_class)
+                problem_labels["Deduped Labels After"].append(deduped_e_class)
+                problem_labels["Differences"].append(diffs)
+                problem_labels["Label Hash"].append(lab_hash)
+            if len(e_objects) != len(b_objects):
+                raise NotImplementedError("Fix for OBJECTS as well as CLASSIFICATIONS")
+                # print(frame_num)
+                # deduped_b_objects, deduped_e_objects, diffs = remove_uncorrupted_labels(b_objects, e_objects)
+                # problem_labels["Frame Number"].append(frame_num)
+                # problem_labels["Label Type"].append(["Objects"])
+                # problem_labels["Labels Before"].append(b_objects)
+                # problem_labels["Labels After"].append(e_objects)
+                # problem_labels["Deduped Labels Before"].append(deduped_b_objects)
+                # problem_labels["Deduped Labels After"].append(deduped_e_objects)
+                # problem_labels["Differences"].append(diffs)
+                # problem_labels["Label Hash"].append(lab_hash)
+    problem_labels = pd.DataFrame.from_dict(problem_labels, orient="columns")
+    problem_labels.to_csv(dir_path + "/" + lab_hash + "_problem_labels.csv", index=False)
+    return problem_labels
+
+
+def main():
+    directory_path = "/encord/984cb43c-b6ea-4f13-bd53-b75e25b02358-NOT-WIP"
+    file_pairs = get_file_pairs(directory_path)
+
+    prob_labels = pd.DataFrame(
+        {
+            "Label Hash": [],
+            "Frame Number": [],
+            "Label Type": [],
+            "Labels Before": [],
+            "Labels After": [],
+            "Deduped Labels Before": [],
+            "Deduped Labels After": [],
+            "Differences": [],
+        }
+    )
+    for pair in file_pairs:
+        bkp_file_path = os.path.join(directory_path, pair[0])
+        edit_file_path = os.path.join(directory_path, pair[1])
+        with open(bkp_file_path, "r") as b:
+            bkp_json = json.load(b)
+        with open(edit_file_path, "r") as e:
+            edit_json = json.load(e)
+        prob_label = compare_frames(bkp_json, edit_json, directory_path)
+        prob_labels = pd.concat([prob_labels, prob_label])
+    prob_labels.to_csv(directory_path + "/PROJECT PROBLEM LABELS.csv", index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/encord/oe_label_cleaning/iteratable-duplicated-label-cleaner.py b/encord/oe_label_cleaning/iteratable-duplicated-label-cleaner.py
new file mode 100644
index 000000000..ebd451ede
--- /dev/null
+++ b/encord/oe_label_cleaning/iteratable-duplicated-label-cleaner.py
@@ -0,0 +1,86 @@
+import json
+import logging
+from pprint import pprint
+
+import pandas as pd
+
+from encord import EncordUserClient
+
+
+# Optional: Add a custom handler to capture warnings in a variable
+class StringHandler(logging.Handler):
+    def __init__(self):
+        super().__init__()
+        self.log_messages = []
+
+    def emit(self, record):
+        self.log_messages.append(record.getMessage())
+
+
+# Connect to encord
+keyfile = "/Users/encord/oe-public-key-private-key.txt"
+user_client = EncordUserClient.create_with_ssh_private_key(ssh_private_key_path=keyfile)
+logging.basicConfig(level=logging.WARNING)
+
+# get the project
+proj_hash = "984cb43c-b6ea-4f13-bd53-b75e25b02358"
+project = user_client.get_project(proj_hash)
+# get the label row for the specific data unit
+# data_hash = "e3e8669b-40fb-4fd8-8c3a-799c97c204d9"
+
+# label_rows = project.list_label_rows_v2(data_hashes=[data_hash])
+# label_rows = project.list_label_rows_v2()
+
+checked_labels = [None]
+# checked_labels = pd.read_csv(proj_hash+"_label_hashes_ROUND2.csv")["Label Hash"].to_list()
+
+num_labs = len(project.list_label_rows_v2())
+print("Total number of labels", num_labs)
+num_labs = len(project.list_label_rows_v2()) - len(checked_labels)
+print("Number of labels to check", num_labs)
+label_hashes: dict = {"Label Hash": [], "Status": []}
+
+skipped = 0
+for lab_num, lr in enumerate(project.list_label_rows_v2()):
+    if lr.label_hash in checked_labels:
+        skipped += 1
+        print(f"Skipped label hash: {lr.label_hash} - skipped: {skipped}/{len(checked_labels)}")
+    else:
+        print(f"Label Hash: {lr.label_hash} \n Label Number: {lab_num-skipped}/{num_labs}")
+        if lr.label_hash is not None:
+            # Create a StringHandler instance and add it to the root logger
+            handler = StringHandler()
+            logging.getLogger().addHandler(handler)
+
+            lr_v1 = project.get_label_row(lr.label_hash)
+
+            # print('V1 Captured warnings:', handler.log_messages)
+
+            # initialise labels
+            lr.initialise_labels()
+            lr_dict = lr.to_encord_dict()
+
+            # print('V2 Captured warnings:', handler.log_messages)
+            if len(handler.log_messages) > 0:
+                # save a backup copy of the original labels
+                with open(f"{proj_hash}/{lr.label_hash}_bkp.json", "w") as f:
+                    json.dump(lr_v1, f)
+                print("SAVING JSON:", f"{lr.label_hash}_bkp.json")
+                # save the re-parsed (edited) label row for comparison
+                with open(f"{proj_hash}/{lr.label_hash}_edit.json", "w") as f:
+                    json.dump(lr_dict, f)
+                print("SAVING JSON:", f"{lr.label_hash}_edit.json")
+                # lr.save()
+                # print("Saved LRV2")
+                label_hashes["Label Hash"].append(lr.label_hash)
+                label_hashes["Status"].append(handler.log_messages)
+            else:
+                label_hashes["Label Hash"].append(lr.label_hash)
+                label_hashes["Status"].append(handler.log_messages)
+        else:
+            print("Skipping")
+            lr.initialise_labels()
+            label_hashes["Label Hash"].append(lr.label_hash)
+            label_hashes["Status"].append(["Skipping"])
+    label_hashes_df = pd.DataFrame(label_hashes)
+    label_hashes_df.to_csv(proj_hash + "_label_hashes.csv", index=False)