Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added the scripts used to clean duplicated and orphaned labels #535

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
50 changes: 26 additions & 24 deletions encord/objects/classification_instance.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
from collections import defaultdict
from copy import deepcopy
from dataclasses import dataclass, field
Expand Down Expand Up @@ -133,21 +134,26 @@ def set_for_frames(

frames_list = frames_class_to_frames_list(frames)

self._check_classification_already_present(frames_list)
if self._check_classification_already_present(frames_list):
logging.warning(f'Skipping {frames_list} as already present for {self.ontology_item}')
return

for frame in frames_list:
self._check_within_range(frame)
self._set_frame_and_frame_data(
frame,
overwrite=overwrite,
created_at=created_at,
created_by=created_by,
confidence=confidence,
manual_annotation=manual_annotation,
last_edited_at=last_edited_at,
last_edited_by=last_edited_by,
reviews=reviews,
)
if self._check_within_range(frame):
self._set_frame_and_frame_data(
frame,
overwrite=overwrite,
created_at=created_at,
created_by=created_by,
confidence=confidence,
manual_annotation=manual_annotation,
last_edited_at=last_edited_at,
last_edited_by=last_edited_by,
reviews=reviews,
)
else:
logging.warning(f'Cutting {frame} in {frames_list}')
return

if self.is_assigned_to_label_row():
assert self._parent is not None
Expand Down Expand Up @@ -528,22 +534,18 @@ def _is_selectable_child_attribute(self, attribute: Attribute) -> bool:
top_attribute = ontology_classification.attributes[0]
return _search_child_attributes(attribute, top_attribute, self._static_answer_map)

def _check_within_range(self, frame: int) -> None:
def _check_within_range(self, frame: int) -> bool:
if frame < 0 or frame >= self._last_frame:
raise LabelRowError(
f"The supplied frame of `{frame}` is not within the acceptable bounds of `0` to `{self._last_frame}`."
)
return False
return True

def _check_classification_already_present(self, frames: Iterable[int]) -> None:
def _check_classification_already_present(self, frames: Iterable[int]) -> bool:
if self._parent is None:
return
return False
already_present_frame = self._parent._is_classification_already_present(self.ontology_item, frames)
if already_present_frame is not None:
raise LabelRowError(
f"The LabelRowV2, that this classification is part of, already has a classification of the same type "
f"on frame `{already_present_frame}`. The same type of classification can only be present once per "
f"frame per LabelRowV2."
)
return True
return False

def __repr__(self):
return (
Expand Down
37 changes: 27 additions & 10 deletions encord/objects/ontology_labels_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from encord.client import EncordClientProject
from encord.client import LabelRow as OrmLabelRow
from encord.constants.enums import DataType
from encord.exceptions import LabelRowError, WrongProjectTypeError
from encord.exceptions import LabelRowError, WrongProjectTypeError, OntologyError
from encord.http.bundle import Bundle, BundleResultHandler, BundleResultMapper, bundled_operation
from encord.http.limits import (
LABEL_ROW_BUNDLE_CREATE_LIMIT,
Expand Down Expand Up @@ -1326,17 +1326,22 @@ def _add_object_instances_from_objects(
object_hash = frame_object_label["objectHash"]
if object_hash not in self._objects_map:
object_instance = self._create_new_object_instance(frame_object_label, frame)
self.add_object_instance(object_instance)
if object_instance:
self.add_object_instance(object_instance)
else:
logging.warning(f'Skipping object {object_hash} since it is not in the ontology.')
else:
self._add_coordinates_to_object_instance(frame_object_label, frame)

def _add_objects_answers(self, label_row_dict: dict):
for answer in label_row_dict["object_answers"].values():
object_hash = answer["objectHash"]
object_instance = self._objects_map[object_hash]

answer_list = answer["classifications"]
object_instance.set_answer_from_list(answer_list)
object_instance = self._objects_map.get(object_hash)
if object_instance:
answer_list = answer["classifications"]
object_instance.set_answer_from_list(answer_list)
else:
logging.warning(f'Skipping answers for object {object_hash} as it has no corresponding object.')

def _add_action_answers(self, label_row_dict: dict):
for answer in label_row_dict["object_actions"].values():
Expand All @@ -1346,12 +1351,14 @@ def _add_action_answers(self, label_row_dict: dict):
answer_list = answer["actions"]
object_instance.set_answer_from_list(answer_list)

def _create_new_object_instance(self, frame_object_label: dict, frame: int) -> ObjectInstance:
def _create_new_object_instance(self, frame_object_label: dict, frame: int) -> ObjectInstance | None:
ontology = self._ontology.structure
feature_hash = frame_object_label["featureHash"]
object_hash = frame_object_label["objectHash"]

label_class = ontology.get_child_by_hash(feature_hash, type_=Object)
if not label_class:
return None
object_instance = ObjectInstance(label_class, object_hash=object_hash)

coordinates = self._get_coordinates(frame_object_label)
Expand Down Expand Up @@ -1422,7 +1429,11 @@ def _add_classification_instances_from_classifications(
classification_instance = self._create_new_classification_instance(
frame_classification_label, frame, classification_answers
)
self.add_classification_instance(classification_instance)
if classification_instance:
try:
self.add_classification_instance(classification_instance)
except LabelRowError:
logging.warning(f'Skipping {frame}')
else:
self._add_frames_to_classification_instance(frame_classification_label, frame)

Expand All @@ -1444,11 +1455,14 @@ def _parse_image_group_frame_level_data(self, label_row_data_units: dict) -> Dic

def _create_new_classification_instance(
self, frame_classification_label: dict, frame: int, classification_answers: dict
) -> ClassificationInstance:
) -> ClassificationInstance | None:
feature_hash = frame_classification_label["featureHash"]
classification_hash = frame_classification_label["classificationHash"]

label_class = self._ontology.structure.get_child_by_hash(feature_hash, type_=Classification)
if not label_class:
logging.warning(f'Skipping classification hash:{classification_hash} as no ontology object was found.')
return None
classification_instance = ClassificationInstance(label_class, classification_hash=classification_hash)

frame_view = ClassificationInstance.FrameData.from_dict(frame_classification_label)
Expand All @@ -1463,7 +1477,10 @@ def _create_new_classification_instance(
reviews=frame_view.reviews,
)

answers_dict = classification_answers[classification_hash]["classifications"]
answers_dict = classification_answers.get(classification_hash, {}).get("classifications")
if not answers_dict:
logging.warning(f'Skipping classification hash:{classification_hash} as no corresponding answer was found.')
return None
self._add_static_answers_from_dict(classification_instance, answers_dict)

return classification_instance
Expand Down
6 changes: 3 additions & 3 deletions encord/objects/ontology_object_instance.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
from collections import defaultdict
from copy import deepcopy
from dataclasses import dataclass, field
Expand Down Expand Up @@ -396,9 +397,8 @@ def set_for_frames(
existing_frame_data = self._frames_to_instance_data.get(frame)

if overwrite is False and existing_frame_data is not None:
raise LabelRowError(
"Cannot overwrite existing data for a frame. Set `overwrite` to `True` to overwrite."
)
logging.warning(f'Skipping overwrite for {frame} in {frames_list}')
return

check_coordinate_type(coordinates, self._ontology_object)
self.check_within_range(frame)
Expand Down
6 changes: 3 additions & 3 deletions encord/objects/ontology_structure.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Type, cast
from uuid import uuid4
Expand Down Expand Up @@ -28,7 +29,7 @@ def get_child_by_hash(
self,
feature_node_hash: str,
type_: Optional[Type[OntologyElementT]] = None,
) -> OntologyElementT:
) -> OntologyElementT | None:
"""
Returns the first child node of this ontology tree node with the matching feature node hash. If there is
more than one child with the same feature node hash in the ontology tree node, then the ontology would be in
Expand All @@ -52,8 +53,7 @@ def get_child_by_hash(
found_item = _get_element_by_hash(feature_node_hash, classification.attributes)
if found_item is not None:
return checked_cast(found_item, type_)

raise OntologyError(f"Item not found: can't find an item with a hash {feature_node_hash} in the ontology.")
logging.warning(f"Item not found: can't find an item with a hash {feature_node_hash} in the ontology.")

def get_child_by_title(
self,
Expand Down
61 changes: 61 additions & 0 deletions encord/oe_label_cleaning/UGhent-orphaned-label-cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import json
from pprint import pprint

from encord import EncordUserClient

# Frame ranges (inclusive on both ends) whose classifications must be KEPT.
# Classifications on any frame outside all of these ranges are removed.
TRUE_LABEL_RANGES = [[19280, 19580], [22248, 22548]]

# Path to the SSH private key used to authenticate against Encord.
keyfile = "/Users/encord/oe-public-key-private-key.txt"

# Project and data unit whose label row is cleaned.
proj_hash = "6508ede1-cfd4-4eb7-bdc2-83508e805879"
data_hash = "561e8ed5-b65b-4dfb-9556-8fd73d968b43"


def frame_outside_ranges(frame, ranges):
    """Return True when `frame` lies in none of the inclusive `[start, end]` ranges.

    Args:
        frame: Frame number to test.
        ranges: Iterable of two-item `[start, end]` pairs; both ends are inclusive.

    Returns:
        True if no range contains `frame` (also True for an empty `ranges`),
        otherwise False.
    """
    return not any(start <= frame <= end for start, end in ranges)


def main():
    """Remove classifications that lie outside `TRUE_LABEL_RANGES` from one label row.

    Writes `<label_hash>_bkp.json` (the label row before editing) and
    `<label_hash>_edited.json` (the label row after editing) to the current
    working directory. Nothing is saved back to Encord: `lr.save()` is left
    commented out on purpose so the JSON files can be reviewed first.

    Raises:
        NotImplementedError: If the data hash does not resolve to exactly one
            label row, or if a frame to be cleaned does not carry exactly one
            classification instance.
    """
    # Connect to Encord.
    user_client = EncordUserClient.create_with_ssh_private_key(ssh_private_key_path=keyfile)

    # Get the project and the label row for the specific data unit.
    project = user_client.get_project(proj_hash)
    label_rows = project.list_label_rows_v2(data_hashes=[data_hash])
    if not label_rows:
        # Previously reported as "multiple label rows", which was misleading.
        raise NotImplementedError(f"No label row found for data hash {data_hash}")
    if len(label_rows) > 1:
        raise NotImplementedError("Program not built for multiple label rows")
    lr = label_rows[0]

    # Initialise the labels and save a backup copy of the untouched label row.
    lr.initialise_labels()
    lr_dict = lr.to_encord_dict()
    with open(f"{lr.label_hash}_bkp.json", "w") as f:
        json.dump(lr_dict, f)

    # Get the labels-by-frame dictionary of the first data unit.
    lab_row_data_unit = next(iter(lr_dict["data_units"]))
    labels_by_frame = lr_dict["data_units"][lab_row_data_unit]["labels"]

    # `labels_by_frame` is a snapshot taken before any edit, so removing
    # instances from `lr` inside the loop does not disturb the iteration.
    for frame_key, frame_labels in labels_by_frame.items():
        frame = int(frame_key)
        classifications = frame_labels["classifications"]

        # Only frames OUTSIDE every desired range that carry a classification
        # need cleaning.
        if not classifications or not frame_outside_ranges(frame, TRUE_LABEL_RANGES):
            continue

        print("REMOVING CLASSIFICATION FROM FRAME:", frame_key, classifications)

        # Get ALL classification instances for that frame.
        bad_class_instance_list = lr.get_classification_instances(filter_frames=frame)
        if len(bad_class_instance_list) != 1:
            # TODO: if you have multiple classifications in a frame then you will need to filter on classification hash.
            raise NotImplementedError("Only one classification per frame is supported")
        bad_class_instance_list[0].remove_from_frames(frame)

    # Save the edited label row so it can be compared against the backup.
    with open(f"{lr.label_hash}_edited.json", "w") as f:
        json.dump(lr.to_encord_dict(), f)

    print(f"FINISHED LABEL FILE: {lr.label_hash}_edited.json")

    # CHECK JSONs BEFORE SAVING ! ! ! ! ! ! !
    # lr.save()


if __name__ == "__main__":
    main()
Empty file.
Loading