From 9357af411c599c8cc99cd4ad2e47dc0801982b08 Mon Sep 17 00:00:00 2001
From: BlairLee
Date: Tue, 9 Feb 2021 14:50:51 -0800
Subject: [PATCH] Coco cat training and evaluation (#129)

* Add coco-cat
* add coco-cat training pipeline
* Move download out of COCO dataloader
* change batch size
* Change training epoch
* update coco yaml
* Add train and eval pipeline for coco-cat and synthetic
* Add training and evaluation pipeline for Synthetic COCO-cat
* remove unnecessary pipelines
* revoke previous gcs path
* change num of classes
* Add Model_Performance_Statistics notebook
* add some note
* Add USim data training and evaluation pipeline
* Add some notes
* Modify link
* Fix linting issues
* Clean up notebook
* change session in notebook
* update notebook
* remove single training pipelines and notebook
* Add archive method
* make unarchived function private
* Add UTs for coco dataloader
* fix linting issue

Co-authored-by: Bowen Li
---
 datasetinsights/constants.py     |   1 -
 datasetinsights/datasets/coco.py | 131 +++++++++++++++++++++++++++++--
 tests/datasets/test_coco.py      |  46 +++++++++++
 3 files changed, 169 insertions(+), 9 deletions(-)
 create mode 100644 tests/datasets/test_coco.py

diff --git a/datasetinsights/constants.py b/datasetinsights/constants.py
index 32c23604..000109eb 100644
--- a/datasetinsights/constants.py
+++ b/datasetinsights/constants.py
@@ -37,7 +37,6 @@
 # ...
 DEFAULT_DATA_ROOT = "/data"
 SYNTHETIC_SUBFOLDER = "synthetic"
-DEFAULT_PUBLIC_DATASET = "synthetic"
 
 # Default Unity Project ID where USim jobs was executed
 DEFAULT_PROJECT_ID = "474ba200-4dcc-4976-818e-0efd28efed30"
diff --git a/datasetinsights/datasets/coco.py b/datasetinsights/datasets/coco.py
index 8dde5b41..76eab7e7 100644
--- a/datasetinsights/datasets/coco.py
+++ b/datasetinsights/datasets/coco.py
@@ -1,5 +1,9 @@
+import fcntl
+import glob
+import json
 import logging
 import os
+import shutil
 import zipfile
 from pathlib import Path
 from typing import List, Tuple
@@ -13,6 +17,7 @@
 from datasetinsights.io.gcs import GCSClient
 
 from .base import Dataset
+from .exceptions import DatasetNotFoundError
 
 ANNOTATION_FILE_TEMPLATE = "{}_{}2017.json"
 COCO_GCS_PATH = "data/coco"
@@ -58,9 +63,9 @@ def convert_coco2canonical(coco_annotation):
     convert from a tuple of image and coco style dictionary describing
     the bboxes to a tuple of image, List of BBox2D
     Args:
-        coco_annotation (tuple): image and coco style dictionary
+        coco_annotation (tuple): image and coco style dictionary.
 
-    Returns: a tuple of image, List of BBox2D
+    Returns: a tuple of image, List of BBox2D.
 
     """
     image, targets = coco_annotation
@@ -74,34 +79,59 @@ def convert_coco2canonical(coco_annotation):
 
 
 class CocoDetection(Dataset):
-    """
-    http://cocodataset.org/#detection-2019
+    """COCO dataset for 2D object detection.
+
+    This class assumes the COCO dataset has already been downloaded
+    before instantiation.
+
+    See the COCO dataset
+    `documentation <http://cocodataset.org/#detection-2019>`_
+    for more details.
+
+    Attributes:
+        root (str): root path of the data.
+        transforms: callable transformation that applies to a pair of
+            capture, annotation. Capture is the information captured by the
+            sensor, in this case an image, and annotations, which in this
+            dataset are 2d bounding box coordinates and labels.
+        split (str): indicates the split type of the dataset (train|val).
+        label_mappings (dict): a dict of {label_id: label_name} mapping.
+        coco (torchvision.datasets.CocoDetection): COCO dataset.
""" def __init__( self, *, - data_root=const.DEFAULT_DATA_ROOT, + data_path=const.DEFAULT_DATA_ROOT, split="train", transforms=None, remove_examples_without_boxes=True, **kwargs, ): + """ + Args: + data_path (str): Directory of the dataset. + split (str): indicate split type of the dataset (train|val). + transforms: callable transformation that applies to a pair of + capture, annotation. + remove_examples_without_boxes (bool): whether to remove examples + without boxes. Defaults to True. + """ # todo add test split self.split = split - self.root = os.path.join(data_root, COCO_LOCAL_PATH) - self.download() + self.root = data_path + self._preprocess_dataset(data_path=self.root, split=self.split) self.coco = self._get_coco(root=self.root, image_set=split) if remove_examples_without_boxes: self.coco = _coco_remove_images_without_annotations( dataset=self.coco ) self.transforms = transforms + self.label_mappings = self._get_label_mappings() def __getitem__(self, idx) -> Tuple[Image, List[BBox2D]]: """ Args: - idx: + idx (int): index of the data. Returns: Image with list of bounding boxes found inside the image @@ -143,6 +173,91 @@ def _get_local_annotations_zip(self): def _get_local_images_zip(self): return os.path.join(self.root, f"{self.split}2017.zip") + def _get_label_mappings(self): + """get label mappings. + + Returns: + dict: A dict containing {label_id: label_name} mappings. + """ + ann_file_name = ( + Path(self.root) / "annotations" / f"instances_{self.split}2017.json" + ) + label_mappings = {} + with open(ann_file_name, "r") as ann_file: + anns = json.load(ann_file) + for cat in anns["categories"]: + label_mappings[cat["id"]] = cat["name"] + return label_mappings + + @staticmethod + def _preprocess_dataset(data_path, split): + """ Preprocess dataset inside data_path and un-archive if necessary. + + Args: + data_path (str): Path where dataset is stored. + split (str): indicate split type of the dataset (train|val). + + Return: + Tuple: (unarchived img path, unarchived annotation path) + """ + + archive_img_file = Path(data_path) / f"{split}2017.zip" + archive_ann_file = Path(data_path) / "annotations_trainval2017.zip" + if archive_img_file.exists() and archive_ann_file.exists(): + unarchived_img_path = CocoDetection._unarchive_data( + data_path, archive_img_file + ) + unarchived_ann_path = CocoDetection._unarchive_data( + data_path, archive_ann_file + ) + return (unarchived_img_path, unarchived_ann_path) + elif CocoDetection._is_dataset_files_present(data_path): + # This is for dataset generated by unity simulation. + return data_path + else: + raise DatasetNotFoundError( + f"Expecting a file {archive_img_file} and {archive_ann_file}" + "under {data_path}" + ) + + def _unarchive_data(self, data_path, archive_file): + """unarchive downloaded data. + Args: + data_path (str): Path where dataset is stored. + archive_file (str): archived file name. + + Returns: + str: unarchived path. + """ + file_descriptor = os.open(archive_file, os.O_RDONLY) + try: + fcntl.flock(file_descriptor, fcntl.LOCK_EX) + unarchived_path = Path(data_path) + if not CocoDetection._is_dataset_files_present(unarchived_path): + shutil.unpack_archive( + filename=archive_file, extract_dir=unarchived_path, + ) + logger.info(f"Unpack {archive_file} to {unarchived_path}") + finally: + os.close(file_descriptor) + return unarchived_path + + @staticmethod + def _is_dataset_files_present(data_path): + """check whether dataset files exist. + + Args: + data_path (str): Path where dataset is stored. 
+
+        Returns:
+            bool: whether dataset files exist.
+        """
+        return (
+            os.path.isdir(data_path)
+            and any(glob.glob(f"{data_path}/*.json"))
+            and any(glob.glob(f"{data_path}/*.jpg"))
+        )
+
     def download(self, cloud_path=COCO_GCS_PATH):
         path = Path(self.root)
         path.mkdir(parents=True, exist_ok=True)
diff --git a/tests/datasets/test_coco.py b/tests/datasets/test_coco.py
new file mode 100644
index 00000000..dd6b86c0
--- /dev/null
+++ b/tests/datasets/test_coco.py
@@ -0,0 +1,46 @@
+import os
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+from pytest import raises
+
+from datasetinsights.datasets.coco import CocoDetection
+from datasetinsights.datasets.exceptions import DatasetNotFoundError
+
+
+def test__is_dataset_files_present():
+    with tempfile.TemporaryDirectory() as tmp:
+        with open(os.path.join(tmp, "coco.json"), "x"):
+            with open(os.path.join(tmp, "coco.jpg"), "x"):
+                assert CocoDetection._is_dataset_files_present(tmp)
+
+    with tempfile.TemporaryDirectory() as tmp:
+        assert not CocoDetection._is_dataset_files_present(tmp)
+
+
+@patch("datasetinsights.datasets.CocoDetection._unarchive_data")
+def test__preprocess_dataset(mock_unarchive):
+    tmp_dir = tempfile.TemporaryDirectory()
+    tmp_name = tmp_dir.name
+    split = "train"
+
+    # test no dataset found
+    with raises(DatasetNotFoundError):
+        CocoDetection._preprocess_dataset(tmp_name, split)
+
+    # test dataset already exists
+    with open(os.path.join(tmp_name, "coco.json"), "x"):
+        with open(os.path.join(tmp_name, "coco.jpg"), "x"):
+            return_value = CocoDetection._preprocess_dataset(tmp_name, split)
+            assert return_value == tmp_name
+
+    # test whether it can unarchive data
+    archive_img_file = Path(tmp_name) / f"{split}2017.zip"
+    archive_ann_file = Path(tmp_name) / "annotations_trainval2017.zip"
+    with open(archive_img_file, "x"):
+        with open(archive_ann_file, "x"):
+            CocoDetection._preprocess_dataset(tmp_name, split)
+            assert mock_unarchive.call_count == 2
+
+    tmp_dir.cleanup()
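
A minimal usage sketch of the new loading path (not part of the patch; the
/data location is an assumption). It presumes train2017.zip and
annotations_trainval2017.zip were already downloaded into the data directory,
which _preprocess_dataset then unpacks on first use:

from datasetinsights.datasets.coco import CocoDetection

# Assumes train2017.zip and annotations_trainval2017.zip sit in /data;
# _preprocess_dataset unpacks them, or raises DatasetNotFoundError if neither
# the archives nor already-unpacked dataset files are found.
dataset = CocoDetection(data_path="/data", split="train")

# label_mappings is read from the unpacked instances_train2017.json
# categories, mapping category ids to names, e.g. {1: "person", ...}.
print(dataset.label_mappings)

# Each item is an image paired with the list of canonical BBox2D boxes
# found inside it.
image, bboxes = dataset[0]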
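The unarchive step serializes concurrent callers (for example, several
training processes sharing one data volume) by taking an exclusive flock on
the archive's file descriptor, and skips unpacking when a previous holder
already did the work. A standalone sketch of that pattern, assuming a POSIX
system; the safe_unpack helper is hypothetical, not part of the patch:

import fcntl
import os
import shutil
from pathlib import Path


def safe_unpack(archive_file, extract_dir):
    # Hypothetical helper mirroring _unarchive_data's locking pattern.
    # POSIX-only: fcntl is unavailable on Windows.
    fd = os.open(archive_file, os.O_RDONLY)
    try:
        # Blocks until no other process holds the lock on this archive.
        fcntl.flock(fd, fcntl.LOCK_EX)
        done_marker = Path(extract_dir) / ".unpacked"
        if not done_marker.exists():
            # First holder does the work; later holders see the marker and skip.
            shutil.unpack_archive(filename=archive_file, extract_dir=extract_dir)
            done_marker.touch()
    finally:
        # Closing the descriptor releases the lock.
        os.close(fd)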