# Post-patch form of the two validation helpers of ``ClassificationInstance``
# (encord/objects/classification_instance.py). Both report their finding as a
# boolean; neither raises.


def _check_within_range(self, frame: int) -> bool:
    """Tell whether ``frame`` is an acceptable frame index for this instance.

    Args:
        frame: Frame number to validate.

    Returns:
        ``True`` when ``0 <= frame < self._last_frame``, otherwise ``False``.
    """
    return 0 <= frame < self._last_frame


def _check_classification_already_present(self, frames: Iterable[int]) -> bool:
    """Tell whether the parent already holds this classification type on ``frames``.

    Args:
        frames: Frame numbers about to be assigned to this instance.

    Returns:
        ``False`` when the instance has no parent. Otherwise ``True`` exactly
        when the parent's ``_is_classification_already_present`` lookup for
        ``self.ontology_item`` returns something other than ``None``.
    """
    parent = self._parent
    if parent is None:
        return False

    clashing_frame = parent._is_classification_already_present(self.ontology_item, frames)
    # Compare against None explicitly: frame 0 is a valid (falsy) answer.
    return clashing_frame is not None
label_row_dict["object_answers"].values(): object_hash = answer["objectHash"] - object_instance = self._objects_map[object_hash] - - answer_list = answer["classifications"] - object_instance.set_answer_from_list(answer_list) + object_instance = self._objects_map.get(object_hash) + if object_instance: + answer_list = answer["classifications"] + object_instance.set_answer_from_list(answer_list) + else: + logging.warning(f'Skipping answers for object {object_hash} as it has no corresponding object.') def _add_action_answers(self, label_row_dict: dict): for answer in label_row_dict["object_actions"].values(): @@ -1346,12 +1351,14 @@ def _add_action_answers(self, label_row_dict: dict): answer_list = answer["actions"] object_instance.set_answer_from_list(answer_list) - def _create_new_object_instance(self, frame_object_label: dict, frame: int) -> ObjectInstance: + def _create_new_object_instance(self, frame_object_label: dict, frame: int) -> ObjectInstance | None: ontology = self._ontology.structure feature_hash = frame_object_label["featureHash"] object_hash = frame_object_label["objectHash"] label_class = ontology.get_child_by_hash(feature_hash, type_=Object) + if not label_class: + return None object_instance = ObjectInstance(label_class, object_hash=object_hash) coordinates = self._get_coordinates(frame_object_label) @@ -1422,7 +1429,11 @@ def _add_classification_instances_from_classifications( classification_instance = self._create_new_classification_instance( frame_classification_label, frame, classification_answers ) - self.add_classification_instance(classification_instance) + if classification_instance: + try: + self.add_classification_instance(classification_instance) + except LabelRowError: + logging.warning(f'Skipping {frame}') else: self._add_frames_to_classification_instance(frame_classification_label, frame) @@ -1444,11 +1455,14 @@ def _parse_image_group_frame_level_data(self, label_row_data_units: dict) -> Dic def _create_new_classification_instance( 
self, frame_classification_label: dict, frame: int, classification_answers: dict - ) -> ClassificationInstance: + ) -> ClassificationInstance | None: feature_hash = frame_classification_label["featureHash"] classification_hash = frame_classification_label["classificationHash"] label_class = self._ontology.structure.get_child_by_hash(feature_hash, type_=Classification) + if not label_class: + logging.warning(f'Skipping classification hash:{classification_hash} as no ontology object was found.') + return None classification_instance = ClassificationInstance(label_class, classification_hash=classification_hash) frame_view = ClassificationInstance.FrameData.from_dict(frame_classification_label) @@ -1463,7 +1477,10 @@ def _create_new_classification_instance( reviews=frame_view.reviews, ) - answers_dict = classification_answers[classification_hash]["classifications"] + answers_dict = classification_answers.get(classification_hash, {}).get("classifications") + if not answers_dict: + logging.warning(f'Skipping classification hash:{classification_hash} as no corresponding answer was found.') + return None self._add_static_answers_from_dict(classification_instance, answers_dict) return classification_instance diff --git a/encord/objects/ontology_object_instance.py b/encord/objects/ontology_object_instance.py index 5895a60cb..b553473b3 100644 --- a/encord/objects/ontology_object_instance.py +++ b/encord/objects/ontology_object_instance.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging from collections import defaultdict from copy import deepcopy from dataclasses import dataclass, field @@ -396,9 +397,8 @@ def set_for_frames( existing_frame_data = self._frames_to_instance_data.get(frame) if overwrite is False and existing_frame_data is not None: - raise LabelRowError( - "Cannot overwrite existing data for a frame. Set `overwrite` to `True` to overwrite." 
"""Strip classifications that sit outside the frame ranges we want to keep.

The script loads one label row, writes ``<label_hash>_bkp.json`` as a backup,
removes the classification instance from every frame that lies outside
``TRUE_LABEL_RANGES`` and writes the result to ``<label_hash>_edited.json``.
Nothing is uploaded: ``lr.save()`` is deliberately left for the operator to
run after comparing the two JSON files.
"""
import json
from pprint import pprint

from encord import EncordUserClient

# Inclusive ``[first_frame, last_frame]`` ranges whose classifications are kept.
TRUE_LABEL_RANGES = [[19280, 19580], [22248, 22548]]

KEYFILE = "/Users/encord/oe-public-key-private-key.txt"
PROJ_HASH = "6508ede1-cfd4-4eb7-bdc2-83508e805879"
DATA_HASH = "561e8ed5-b65b-4dfb-9556-8fd73d968b43"


def is_outside_ranges(frame, ranges=TRUE_LABEL_RANGES):
    """Return ``True`` when ``frame`` falls in none of the inclusive ``ranges``.

    Args:
        frame: Frame number to test.
        ranges: Iterable of ``[first, last]`` pairs, both ends inclusive.
    """
    return not any(first <= frame <= last for first, last in ranges)


def main():
    """Back up the label row, drop out-of-range classifications, dump the result.

    Raises:
        NotImplementedError: If the data hash does not resolve to exactly one
            label row, or a frame to clean carries a number of classification
            instances other than one.
    """
    # Connect to encord
    user_client = EncordUserClient.create_with_ssh_private_key(ssh_private_key_path=KEYFILE)
    project = user_client.get_project(PROJ_HASH)

    # Get the label row for the specific data unit
    label_rows = project.list_label_rows_v2(data_hashes=[DATA_HASH])
    if len(label_rows) != 1:
        # The message reports the real count: zero rows is as unsupported as several.
        raise NotImplementedError(
            f"Program expects exactly one label row for data hash {DATA_HASH}, found {len(label_rows)}"
        )
    lr = label_rows[0]

    # Initialise the labels and keep an untouched copy on disk
    lr.initialise_labels()
    lr_dict = lr.to_encord_dict()
    with open(f"{lr.label_hash}_bkp.json", "w", encoding="utf-8") as f:
        json.dump(lr_dict, f)

    # Labels-by-frame dictionary of the first data unit
    data_unit_hash = next(iter(lr_dict["data_units"]))
    labels_by_frame = lr_dict["data_units"][data_unit_hash]["labels"]

    for frame_num, frame_labels in labels_by_frame.items():
        frame = int(frame_num)
        classifications = frame_labels["classifications"]
        # Only frames outside every kept range that still carry a classification need cleaning.
        if not classifications or not is_outside_ranges(frame):
            continue

        print("REMOVING CLASSIFICATION FROM FRAME:", frame_num, classifications)
        # ALL classification instances present on that frame
        bad_class_instance_list = lr.get_classification_instances(filter_frames=frame)
        if len(bad_class_instance_list) != 1:
            # TODO: if you have multiple classifications in a frame then you will need to
            # filter on classification hash.
            raise NotImplementedError("Only one classification per frame is supported")
        bad_class_instance_list[0].remove_from_frames(frame)

    # Dump the edited label row next to the backup for manual comparison
    with open(f"{lr.label_hash}_edited.json", "w", encoding="utf-8") as f:
        json.dump(lr.to_encord_dict(), f)

    print(f"FINISHED LABEL FILE: {lr.label_hash}_edited.json")

    # CHECK JSONs BEFORE SAVING ! ! ! ! ! ! !
    # lr.save()


if __name__ == "__main__":
    main()
"""Compare backup (``*bkp.json``) and edited (``*edit.json``) label-row dumps.

For every pair found in a directory the script lists the frames whose number
of classifications changed, writes one ``<label_hash>_problem_labels.csv`` per
label row and one combined ``PROJECT PROBLEM LABELS.csv``.
"""
import json
import os
import sys
from pprint import pprint

import pandas as pd

DEFAULT_DIRECTORY = "/encord/984cb43c-b6ea-4f13-bd53-b75e25b02358-NOT-WIP"

_BACKUP_SUFFIX = "bkp.json"
_EDIT_SUFFIX = "edit.json"
_TIMESTAMP_KEYS = ("createdAt", "lastEditedAt")


def get_file_pairs(dir_path):
    """Pair every backup dump in ``dir_path`` with the name of its edited twin.

    Args:
        dir_path: Directory holding the ``*bkp.json`` / ``*edit.json`` files.

    Returns:
        A list of ``(backup_name, edit_name)`` tuples, sorted by backup name.
        The edit name is derived from the backup name; its existence is not checked.

    Raises:
        SystemExit: If ``dir_path`` does not exist.
    """
    if not os.path.exists(dir_path):
        sys.exit(f"Directory '{dir_path}' does not exist.")

    return [
        (filename, filename[: -len(_BACKUP_SUFFIX)] + _EDIT_SUFFIX)
        # Sorted so the report order does not depend on the file system.
        for filename in sorted(os.listdir(dir_path))
        if filename.lower().endswith(_BACKUP_SUFFIX)
    ]


def _is_cosmetic_difference(key, first, second):
    """Return True when two differing values of ``key`` are to be treated as equal."""
    if key in _TIMESTAMP_KEYS:
        if not (isinstance(first, str) and isinstance(second, str)):
            return False
        stamp = first.strip()
        # Only a literal trailing "GMT" is ignored, not any three characters.
        if stamp.endswith("GMT"):
            stamp = stamp[: -len("GMT")]
        return stamp.strip() == second.strip()
    return key == "value" and first == "Usable Clip" and second == "usable_clip"


def dict_compare(d1, d2):
    """Diff two flat dictionaries key by key.

    Timestamps under ``createdAt`` / ``lastEditedAt`` that differ only by a
    trailing ``GMT`` on the ``d1`` side, and the value pair
    ``"Usable Clip"`` / ``"usable_clip"``, count as unchanged.

    Args:
        d1: First dictionary.
        d2: Second dictionary.

    Returns:
        ``(added, removed, modified, same)`` where ``added`` is the set of keys
        only in ``d1``, ``removed`` the set of keys only in ``d2``, ``modified``
        maps each differing shared key to ``(d1[key], d2[key])`` and ``same`` is
        the set of shared keys regarded as equal.
    """
    d1_keys = set(d1)
    d2_keys = set(d2)
    added = d1_keys - d2_keys
    removed = d2_keys - d1_keys

    modified = {}
    same = set()
    for key in d1_keys & d2_keys:
        first, second = d1[key], d2[key]
        if first == second or _is_cosmetic_difference(key, first, second):
            same.add(key)
        else:
            modified[key] = (first, second)
    return added, removed, modified, same


def remove_uncorrupted_labels(b_class, e_class, hash_key="classificationHash"):
    """Drop the labels that appear unchanged on both sides.

    Every ``b_class`` entry is compared with every ``e_class`` entry using
    ``dict_compare``. Entries taking part in at least one difference-free
    comparison are left out of the returned lists. Matches are tracked by
    position, so duplicated labels matching the same counterpart are all
    dropped instead of failing on a second ``list.remove``.

    Args:
        b_class: Label dictionaries from the backup dump.
        e_class: Label dictionaries from the edited dump.
        hash_key: Key identifying a label; pairs sharing it but differing
            elsewhere are reported in ``diffs``.

    Returns:
        ``(remaining_b, remaining_e, diffs)``: new lists without the matched
        entries (the inputs are not modified) and, for each differing pair with
        equal ``hash_key``, the ``modified`` mapping from ``dict_compare``.

    Raises:
        KeyError: If a compared label lacks ``hash_key``.
    """
    matched_b = set()
    matched_e = set()
    diffs = []
    for b_idx, b_cls in enumerate(b_class):
        for e_idx, e_cls in enumerate(e_class):
            added, removed, modified, _ = dict_compare(b_cls, e_cls)
            if not (added or removed or modified):
                print(f"REMOVING {b_cls[hash_key]}")
                matched_b.add(b_idx)
                matched_e.add(e_idx)
            elif e_cls[hash_key] == b_cls[hash_key]:
                pprint(modified)
                diffs.append(modified)

    remaining_b = [cls for idx, cls in enumerate(b_class) if idx not in matched_b]
    remaining_e = [cls for idx, cls in enumerate(e_class) if idx not in matched_e]
    return remaining_b, remaining_e, diffs


def compare_frames(b_dict, e_dict, dir_path):
    """Report the frames of one label row whose classification count changed.

    Only frames present in the backup are visited. A frame missing from the
    edited dump is treated as having no labels.

    Args:
        b_dict: Parsed backup dump of the label row.
        e_dict: Parsed edited dump of the same label row.
        dir_path: Directory receiving ``<label_hash>_problem_labels.csv``.

    Returns:
        A ``pandas.DataFrame`` with one row per reported frame; the same table
        is written to the CSV file.

    Raises:
        NotImplementedError: If the number of objects on a frame differs
            between the two dumps; only classifications are supported.
    """
    lab_hash = b_dict["label_hash"]
    report = {
        "Frame Number": [],
        "Label Type": [],
        "Labels Before": [],
        "Labels After": [],
        "Deduped Labels Before": [],
        "Deduped Labels After": [],
        "Differences": [],
        "Label Hash": [],
    }
    for data_unit, meta in b_dict["data_units"].items():
        print(lab_hash)
        e_labels = e_dict["data_units"][data_unit]["labels"]
        for frame_num, labels in meta["labels"].items():
            b_class = labels["classifications"]
            b_objects = labels["objects"]

            e_label = e_labels.get(frame_num)
            if e_label is None:
                print("\nNo e labels for this value")
                e_class = []
                e_objects = []
            else:
                e_class = e_label["classifications"]
                e_objects = e_label["objects"]

            if len(e_class) != len(b_class):
                print(frame_num)
                deduped_b_class, deduped_e_class, diffs = remove_uncorrupted_labels(b_class, e_class)
                report["Frame Number"].append(frame_num)
                report["Label Type"].append(["Classification"])
                report["Labels Before"].append(b_class)
                report["Labels After"].append(e_class)
                report["Deduped Labels Before"].append(deduped_b_class)
                report["Deduped Labels After"].append(deduped_e_class)
                report["Differences"].append(diffs)
                report["Label Hash"].append(lab_hash)
            if len(e_objects) != len(b_objects):
                raise NotImplementedError("Fix for OBJECTS as well as CLASSIFICATIONS")

    problem_labels = pd.DataFrame.from_dict(report, orient="columns")
    problem_labels.to_csv(os.path.join(dir_path, lab_hash + "_problem_labels.csv"), index=False)
    return problem_labels


def main(directory_path=DEFAULT_DIRECTORY):
    """Compare every backup/edit pair in ``directory_path`` and write the reports.

    Args:
        directory_path: Directory with the JSON dumps; it also receives the
            per-label CSV files and ``PROJECT PROBLEM LABELS.csv``.
    """
    # The empty frame fixes the column order and keeps concat valid with no pairs.
    reports = [
        pd.DataFrame(
            {
                "Label Hash": [],
                "Frame Number": [],
                "Label Type": [],
                "Labels Before": [],
                "Labels After": [],
                "Deduped Labels Before": [],
                "Deduped Labels After": [],
                "Differences": [],
            }
        )
    ]
    for bkp_name, edit_name in get_file_pairs(directory_path):
        with open(os.path.join(directory_path, bkp_name), "r", encoding="utf-8") as b:
            bkp_json = json.load(b)
        with open(os.path.join(directory_path, edit_name), "r", encoding="utf-8") as e:
            edit_json = json.load(e)
        reports.append(compare_frames(bkp_json, edit_json, directory_path))

    # One concat at the end instead of re-copying the growing frame per file.
    prob_labels = pd.concat(reports)
    prob_labels.to_csv(os.path.join(directory_path, "PROJECT PROBLEM LABELS.csv"), index=False)


if __name__ == "__main__":
    main()
"""Find the label rows of a project that log warnings while being initialised.

Each label row is fetched through ``project.get_label_row`` and initialised as
a LabelRowV2 while every message reaching the root logger is captured. Rows
that produced messages are dumped to ``<proj_hash>/<label_hash>_bkp.json``
(the ``get_label_row`` result) and ``<proj_hash>/<label_hash>_edit.json`` (the
``to_encord_dict()`` result). ``<proj_hash>_label_hashes.csv`` lists every
visited row with its captured messages. Nothing is saved back to the project.
"""
import contextlib
import json
import logging
import os
from pprint import pprint

import pandas as pd

from encord import EncordUserClient

KEYFILE = "/Users/encord/oe-public-key-private-key.txt"
PROJ_HASH = "984cb43c-b6ea-4f13-bd53-b75e25b02358"


class StringHandler(logging.Handler):
    """Logging handler that keeps the formatted message of every record it sees."""

    def __init__(self):
        super().__init__()
        self.log_messages = []

    def emit(self, record):
        self.log_messages.append(record.getMessage())


@contextlib.contextmanager
def capture_root_log_messages():
    """Collect the messages reaching the root logger inside the ``with`` body.

    Yields:
        The list that receives the messages; it stays readable after the block.
    """
    handler = StringHandler()
    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    try:
        yield handler.log_messages
    finally:
        # Detach again, otherwise one handler per label row piles up on the root logger.
        root_logger.removeHandler(handler)


def main():
    """Scan all label rows of ``PROJ_HASH`` and write the dumps and the status CSV."""
    # Connect to encord
    user_client = EncordUserClient.create_with_ssh_private_key(ssh_private_key_path=KEYFILE)
    logging.basicConfig(level=logging.WARNING)
    project = user_client.get_project(PROJ_HASH)

    # Hashes to leave alone. To continue an earlier run, replace this with the
    # "Label Hash" column of a previously written label-hashes CSV.
    checked_labels = {None}

    # One listing serves the counts and the loop.
    label_rows = project.list_label_rows_v2()
    print("Total number of labels", len(label_rows))
    num_labs = sum(1 for lr in label_rows if lr.label_hash not in checked_labels)
    print("Number of labels to check", num_labs)

    # The JSON dumps go into a folder named after the project.
    os.makedirs(PROJ_HASH, exist_ok=True)

    label_hashes = {"Label Hash": [], "Status": []}
    skipped = 0
    try:
        for lab_num, lr in enumerate(label_rows):
            if lr.label_hash in checked_labels:
                skipped += 1
                print(f"Skipped label hash: {lr.label_hash} - skipped: {skipped}/{len(checked_labels)}")
                continue

            print(f"Label Hash: {lr.label_hash} \n Label Number: {lab_num-skipped}/{num_labs}")
            if lr.label_hash is None:
                print("Skipping")
                lr.initialise_labels()
                label_hashes["Label Hash"].append(lr.label_hash)
                label_hashes["Status"].append(["Skipping"])
                continue

            with capture_root_log_messages() as log_messages:
                lr_v1 = project.get_label_row(lr.label_hash)
                lr.initialise_labels()
                lr_dict = lr.to_encord_dict()

            if log_messages:
                # Keep both representations so they can be compared offline
                with open(f"{PROJ_HASH}/{lr.label_hash}_bkp.json", "w", encoding="utf-8") as f:
                    json.dump(lr_v1, f)
                print("SAVING JSON:", f"{lr.label_hash}_bkp.json")
                with open(f"{PROJ_HASH}/{lr.label_hash}_edit.json", "w", encoding="utf-8") as f:
                    json.dump(lr_dict, f)
                print("SAVING JSON:", f"{lr.label_hash}_edit.json")
                # lr.save() is intentionally not called here.

            label_hashes["Label Hash"].append(lr.label_hash)
            label_hashes["Status"].append(log_messages)
    finally:
        # Written even when the scan aborts, so the rows visited so far are on record.
        pd.DataFrame(label_hashes).to_csv(PROJ_HASH + "_label_hashes.csv", index=False)


if __name__ == "__main__":
    main()