fix(export): Use mappings instead of renaming files (#273)

* fix(export): Use mappings instead of renaming files. Now, when exporting projects and creating subsets, we create file mappings to avoid issues with versioning. This also fixes a versioning bug that discards your changes when you're already on the latest version.
encord-team · Mar 22, 2023 · f95a504 · f95a504
1 parent 7c9e78b
commit f95a504
Show file tree

Hide file tree

Showing 14 changed files with 171 additions and 97 deletions.
diff --git a/src/encord_active/app/actions_page/export_filter.py b/src/encord_active/app/actions_page/export_filter.py
@@ -83,7 +83,8 @@ def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
                     "Choose tags to filter", options=Tags().all(), format_func=lambda x: x.name, key=key
                 )
                 filtered_rows = [True if set(tag_filters) <= set(x) else False for x in filtered["tags"]]
-                filtered = filtered.loc[filtered_rows]
+                filtered_items = filtered.loc[filtered_rows]
+                filtered = filtered[filtered.data_row_id.isin(filtered_items["data_row_id"])]
 
             # Treat columns with < 10 unique values as categorical
             elif is_categorical_dtype(filtered[column]) or filtered[column].nunique() < 10:

diff --git a/src/encord_active/app/common/components/metric_summary.py b/src/encord_active/app/common/components/metric_summary.py
@@ -328,7 +328,7 @@ def render_metric_summary(
 
 
 def render_summary_item(row, metric_name: str, iqr_outliers: IqrOutliers, metric_scope: MetricScope):
-    image = show_image_and_draw_polygons(row, get_state().project_paths.data, get_state().object_drawing_configurations)
+    image = show_image_and_draw_polygons(row, get_state().project_paths, get_state().object_drawing_configurations)
     st.image(image)
 
     multiselect_tag(row, f"{metric_name}_summary")

diff --git a/src/encord_active/app/common/components/prediction_grid.py b/src/encord_active/app/common/components/prediction_grid.py
@@ -1,4 +1,3 @@
-from pathlib import Path
 from typing import List, Optional
 
 import pandas as pd
@@ -27,12 +26,13 @@
     LabelMatchSchema,
     PredictionMatchSchema,
 )
+from encord_active.lib.project import ProjectFileStructure
 
 
 def build_card_for_labels(
     label: pd.Series,
     predictions: DataFrame[PredictionMatchSchema],
-    data_dir: Path,
+    project_file_structure: ProjectFileStructure,
     label_color: Color = Color.RED,
 ):
     class_colors = {
@@ -42,7 +42,7 @@ def build_card_for_labels(
     image = show_image_with_predictions_and_label(
         label,
         predictions,
-        data_dir,
+        project_file_structure,
         label_color=label_color,
         class_colors=class_colors,
         draw_configurations=get_state().object_drawing_configurations,
@@ -57,9 +57,9 @@ def build_card_for_labels(
     build_data_tags(label, get_state().predictions.metric_datas.selected_label)
 
 
-def build_card_for_predictions(row: pd.Series, data_dir: Path, box_color=Color.GREEN):
+def build_card_for_predictions(row: pd.Series, project_file_structure: ProjectFileStructure, box_color=Color.GREEN):
     conf = get_state().object_drawing_configurations
-    image = show_image_and_draw_polygons(row, data_dir, draw_configurations=conf, skip_object_hash=True)
+    image = show_image_and_draw_polygons(row, project_file_structure, draw_configurations=conf, skip_object_hash=True)
     image = draw_object(image, row, draw_configuration=conf, color=box_color, with_box=True)
     st.image(image)
     multiselect_tag(row, "metric_view", is_predictions=True)
@@ -74,17 +74,17 @@ def build_card_for_predictions(row: pd.Series, data_dir: Path, box_color=Color.G
 def build_card(
     row: pd.Series,
     predictions: Optional[DataFrame[PredictionMatchSchema]],
-    data_dir: Path,
+    project_file_structure: ProjectFileStructure,
     box_color: Color = Color.GREEN,
 ):
     if predictions is not None:
-        build_card_for_labels(row, predictions, data_dir, box_color)
+        build_card_for_labels(row, predictions, project_file_structure, box_color)
     else:
-        build_card_for_predictions(row, data_dir, box_color)
+        build_card_for_predictions(row, project_file_structure, box_color)
 
 
-def build_card_classifications(row: pd.Series, data_dir: Path):
-    image = show_image_and_draw_polygons(row, data_dir)
+def build_card_classifications(row: pd.Series, project_file_structure: ProjectFileStructure):
+    image = show_image_and_draw_polygons(row, project_file_structure)
     st.image(image)
     multiselect_tag(row, "metric_view_classification", is_predictions=True)
     build_data_tags(row, get_state().predictions.metric_datas_classification.selected_prediction)
@@ -94,7 +94,7 @@ def build_card_classifications(row: pd.Series, data_dir: Path):
 
 
 def prediction_grid(
-    data_dir: Path,
+    project_file_structure: ProjectFileStructure,
     model_predictions: DataFrame[PredictionMatchSchema],
     labels: Optional[DataFrame[LabelMatchSchema]] = None,
     box_color: Color = Color.GREEN,
@@ -134,11 +134,12 @@ def prediction_grid(
                     divider()
                 cols = list(st.columns(n_cols))
             with cols.pop(0):
-                build_card(row, frame_additionals, data_dir, box_color=box_color)
+                build_card(row, frame_additionals, project_file_structure, box_color=box_color)
 
 
 def prediction_grid_classifications(
-    data_dir: Path, model_predictions: DataFrame[ClassificationPredictionMatchSchemaWithClassNames]
+    project_file_structure: ProjectFileStructure,
+    model_predictions: DataFrame[ClassificationPredictionMatchSchemaWithClassNames],
 ):
     df = model_predictions
     selected_metric = get_state().predictions.metric_datas_classification.selected_prediction or ""
@@ -162,4 +163,4 @@ def prediction_grid_classifications(
                     divider()
                 cols = list(st.columns(n_cols))
             with cols.pop(0):
-                build_card_classifications(row, data_dir)
+                build_card_classifications(row, project_file_structure)
diff --git a/src/encord_active/app/common/components/similarities.py b/src/encord_active/app/common/components/similarities.py
@@ -23,7 +23,7 @@ def show_similarities(identifier: str, expander: DeltaGenerator, embedding_infor
         if embedding_information.has_annotations:
             load_image = show_image_and_draw_polygons
 
-        image = load_image(nearest_image["key"], get_state().project_paths.data)
+        image = load_image(nearest_image["key"], get_state().project_paths)
 
         st_columns[column_id].image(image)
         st_columns[column_id].write(f"Annotated as `{nearest_image['name']}`")

diff --git a/src/encord_active/app/data_quality/sub_pages/explorer.py b/src/encord_active/app/data_quality/sub_pages/explorer.py
@@ -258,8 +258,6 @@ def build_card(
     """
     Builds each sub card (the content displayed for each row in a csv file).
     """
-    data_dir = get_state().project_paths.data
-
     identifier_parts = 4 if embedding_information.has_annotations else 3
     identifier = "_".join(str(row["identifier"]).split("_")[:identifier_parts])
 
@@ -271,7 +269,9 @@ def build_card(
         st.write(f"{embedding_information.type.value} card type is not defined in EmbeddingTypes")
         return
 
-    image = show_image_and_draw_polygons(row, data_dir, draw_configurations=get_state().object_drawing_configurations)
+    image = show_image_and_draw_polygons(
+        row, get_state().project_paths, draw_configurations=get_state().object_drawing_configurations
+    )
     st.image(image)
 
     # === Write scores and link to editor === #

diff --git a/src/encord_active/app/model_quality/sub_pages/false_positives.py b/src/encord_active/app/model_quality/sub_pages/false_positives.py
@@ -102,7 +102,7 @@ def _build_classifications(
         else:
             histogram = get_histogram(fp_df, metric_name)
             st.altair_chart(histogram, use_container_width=True)
-            prediction_grid_classifications(get_state().project_paths.data, model_predictions=fp_df)
+            prediction_grid_classifications(get_state().project_paths, model_predictions=fp_df)
 
     def build(
         self,

diff --git a/src/encord_active/app/model_quality/sub_pages/true_positives.py b/src/encord_active/app/model_quality/sub_pages/true_positives.py
@@ -99,7 +99,7 @@ def _build_classifications(
         else:
             histogram = get_histogram(tp_df, metric_name)
             st.altair_chart(histogram, use_container_width=True)
-            prediction_grid_classifications(get_state().project_paths.data, model_predictions=tp_df)
+            prediction_grid_classifications(get_state().project_paths, model_predictions=tp_df)
 
     def build(
         self,

diff --git a/src/encord_active/lib/common/image_utils.py b/src/encord_active/lib/common/image_utils.py
@@ -1,6 +1,5 @@
 import json
 from dataclasses import dataclass
-from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
 import cv2
@@ -16,6 +15,11 @@
 from encord_active.lib.db.predictions import BoundingBox
 from encord_active.lib.labels.object import ObjectShape
 from encord_active.lib.model_predictions.reader import PredictionMatchSchema
+from encord_active.lib.project import (
+    DataUnitStructure,
+    LabelRowStructure,
+    ProjectFileStructure,
+)
 
 
 @dataclass
@@ -96,7 +100,7 @@ def draw_object(
 def show_image_with_predictions_and_label(
     label: pd.Series,
     predictions: DataFrame[PredictionMatchSchema],
-    data_dir: Path,
+    project_file_structure: ProjectFileStructure,
     draw_configurations: Optional[ObjectDrawingConfigurations] = None,
     label_color: Color = Color.RED,
     class_colors: Optional[Dict[int, str]] = None,
@@ -109,12 +113,12 @@ def show_image_with_predictions_and_label(
 
     :param label: The csv row of the false-negative label to display (from a LabelSchema).
     :param predictions: All the predictions on the same image with the samme predicted class (from a PredictionSchema).
-    :param data_dir: The data directory of the project
+    :param project_file_structure: The directory of the project
     :param label_color: The hex color to use when drawing the prediction.
     :param class_colors: Dict of [class_id, hex_color] pairs.
     """
     class_colors = class_colors or {}
-    image = load_or_fill_image(label, data_dir)
+    image = load_or_fill_image(label, project_file_structure)
 
     for _, pred in predictions.iterrows():
         color = class_colors.get(pred["class_id"], Color.PURPLE)
@@ -125,45 +129,49 @@ def show_image_with_predictions_and_label(
 
 def show_image_and_draw_polygons(
     row: Union[Series, str],
-    data_dir: Path,
+    project_file_structure: ProjectFileStructure,
     draw_configurations: Optional[ObjectDrawingConfigurations] = None,
     skip_object_hash: bool = False,
 ) -> np.ndarray:
-    image = load_or_fill_image(row, data_dir)
+    image = load_or_fill_image(row, project_file_structure)
 
     if draw_configurations is None:
         draw_configurations = ObjectDrawingConfigurations()
 
     if draw_configurations.draw_objects:
         img_h, img_w = image.shape[:2]
-        for color, geometry in get_geometries(row, img_h, img_w, data_dir, skip_object_hash=skip_object_hash):
+        for color, geometry in get_geometries(
+            row, img_h, img_w, project_file_structure, skip_object_hash=skip_object_hash
+        ):
             image = draw_object_with_background_color(image, geometry, color, draw_configurations)
     return image
 
 
-def load_or_fill_image(row: Union[pd.Series, str], data_dir: Path) -> np.ndarray:
+def load_or_fill_image(row: Union[pd.Series, str], project_file_structure: ProjectFileStructure) -> np.ndarray:
     """
-    Tries to read the infered image path. If not possible, generates a white image
-    and indicates what the error seemd to be embedded in the image.
+    Tries to read the inferred image path. If not possible, generates a white image
+    and indicates what the error seemed to be embedded in the image.
     :param row: A csv row from either a metric, a prediction, or a label csv file.
     :return: Numpy / cv2 image.
     """
     key = __get_key(row)
 
-    img_pth: Optional[Path] = key_to_image_path(key, data_dir)
+    img_du: Optional[DataUnitStructure] = key_to_data_unit(key, project_file_structure)
 
-    if img_pth and img_pth.is_file():
+    if img_du and img_du.path.is_file():
         try:
-            image = cv2.imread(img_pth.as_posix())
+            image = cv2.imread(img_du.path.as_posix())
             return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         except Exception:
             pass
 
     # Read not successful, so tell the user why
-    error_text = "Image not found" if not img_pth else "File seems broken"
+    error_text = "Image not found" if not img_du else "File seems broken"
 
     _, du_hash, *_ = key.split("_")
-    lr = json.loads(key_to_lr_path(key, data_dir).read_text(encoding="utf-8"))
+    # lr = json.loads(key_to_lr_path(key, project_file_structure).read_text(encoding="utf-8"))
+    label_row_structure = key_to_label_row_structure(key, project_file_structure)
+    lr = json.loads(label_row_structure.label_row_file.read_text())
 
     h, w = get_du_size(lr["data_units"].get(du_hash, {}), None) or (600, 900)
 
@@ -235,7 +243,11 @@ def __to_absolute_points(bounding_box: BoundingBox, height: int, width: int):
 
 
 def get_geometries(
-    row: Union[pd.Series, str], img_h: int, img_w: int, data_dir: Path, skip_object_hash: bool = False
+    row: Union[pd.Series, str],
+    img_h: int,
+    img_w: int,
+    project_file_structure: ProjectFileStructure,
+    skip_object_hash: bool = False,
 ) -> List[Tuple[str, np.ndarray]]:
     """
     Loads cached label row and computes geometries from the label row.
@@ -247,10 +259,8 @@ def get_geometries(
     key = __get_key(row)
     _, du_hash, frame, *remainder = key.split("_")
 
-    lr_pth = key_to_lr_path(key, data_dir)
-    with lr_pth.open("r") as f:
-        label_row = json.load(f)
-
+    label_row_structure = key_to_label_row_structure(key, project_file_structure)
+    label_row = json.loads(label_row_structure.label_row_file.read_text())
     du = label_row["data_units"][du_hash]
 
     geometries = []
@@ -277,22 +287,22 @@ def get_geometries(
     return valid_geometries
 
 
-def key_to_lr_path(key: str, data_dir: Path) -> Path:
+def key_to_label_row_structure(key: str, project_file_structure: ProjectFileStructure) -> LabelRowStructure:
     label_hash, *_ = key.split("_")
-    return data_dir / label_hash / "label_row.json"
+    return project_file_structure.label_row_structure(label_hash)
 
 
-def key_to_image_path(key: str, data_dir: Path) -> Optional[Path]:
+def key_to_data_unit(key: str, project_file_structure: ProjectFileStructure) -> Optional[DataUnitStructure]:
     """
     Infer image path from the identifier stored in the csv files.
     :param key: the row["identifier"] from a csv row
     :return: The associated image path if it exists or a path to a placeholder otherwise
     """
     label_hash, du_hash, frame, *_ = key.split("_")
-    img_folder = data_dir / label_hash / "images"
+    label_row_structure = project_file_structure.label_row_structure(label_hash)
 
     # check if it is a video frame
-    frame_pth = next(img_folder.glob(f"{du_hash}_{int(frame)}.*"), None)
-    if frame_pth is not None:
-        return frame_pth
-    return next(img_folder.glob(f"{du_hash}.*"), None)  # So this is an img_group image
+    frame_du: Optional[DataUnitStructure] = next(label_row_structure.iter_data_unit(du_hash, int(frame)), None)
+    if frame_du is not None:
+        return frame_du
+    return next(label_row_structure.iter_data_unit(du_hash), None)  # So this is an img_group image
diff --git a/src/encord_active/lib/dataset/summary_utils.py b/src/encord_active/lib/dataset/summary_utils.py
@@ -56,7 +56,7 @@ def get_median_value_of_2d_array(array: np.ndarray) -> np.ndarray:
     return array[item_index[0][0], :]
 
 
-def get_all_annotation_numbers(project_paths: ProjectFileStructure) -> AnnotationStatistics:
+def get_all_annotation_numbers(project_file_structure: ProjectFileStructure) -> AnnotationStatistics:
     """
     Returns label statistics for both objects and classifications. Does not count nested
     labels, only counts the immediate labels.
@@ -66,7 +66,7 @@ def get_all_annotation_numbers(project_paths: ProjectFileStructure) -> Annotatio
     classification_label_counter = 0
     object_label_counter = 0
 
-    project_ontology = json.loads((project_paths.ontology).read_text(encoding="utf-8"))
+    project_ontology = json.loads((project_file_structure.ontology).read_text(encoding="utf-8"))
     ontology = OntologyStructure.from_dict(project_ontology)
 
     for object_item in ontology.objects:
@@ -79,9 +79,9 @@ def get_all_annotation_numbers(project_paths: ProjectFileStructure) -> Annotatio
             for option in classification_item.attributes[0].options:
                 labels.classifications[classification_item.attributes[0].name][option.label] = 0
 
-    for label_row in (project_paths.data).iterdir():
-        if (label_row / "label_row.json").exists():
-            label_row_meta = json.loads((label_row / "label_row.json").read_text(encoding="utf-8"))
+    for label_row_structure in project_file_structure.iter_labels():
+        if label_row_structure.label_row_file.exists():
+            label_row_meta = json.loads(label_row_structure.label_row_file.read_text(encoding="utf-8"))
             if label_row_meta["data_type"] in [DataType.IMAGE.value, DataType.IMG_GROUP.value]:
                 for data_unit in label_row_meta["data_units"].values():
 

diff --git a/src/encord_active/lib/db/tags.py b/src/encord_active/lib/db/tags.py
@@ -58,6 +58,11 @@ def all(self) -> List[Tag]:
                 Tag(name, scope) for name, scope, in conn.execute(f"SELECT name, scope FROM {TABLE_NAME}").fetchall()
             ]
 
+    @ensure_existence
+    def create_many(self, tags: List[Tag]):
+        with DBConnection() as conn:
+            return conn.executemany(f"INSERT INTO {TABLE_NAME} (name, scope) VALUES(?, ?) ", tags)
+
     @ensure_existence
     def create_tag(self, tag: Tag):
         stripped = tag.name.strip()