Skip to content
This repository has been archived by the owner on Dec 19, 2024. It is now read-only.

Commit

Permalink
Coco cat training and evaluation (#129)
Browse files Browse the repository at this point in the history
* Add coco-cat

* add coco-cat training pipeline

* Move download out of COCO dataloader

* change batch size

* Change training epoch

* update coco yaml

* Add train and eval pipeline for coco-cat and synthetic

* Add training and evaluation pipeline for Synthetic COCO-cat

* remove unnecessary pipelines

* revoke previous gcs path

* change num of classes

* Add Model_Performance_Statistics notebook

* add some note

* Add USim data training and evaluation pipeline

* Add some notes

* Modify link

* Fix linting issues

* Clean up notebook

* change session in notebook

* update notebook

* remove single training pipelines and notebook

* Add archive method

* make unarchived function private

* Add UTs for coco dataloader

* fix linting issue

Co-authored-by: Bowen Li <[email protected]>
  • Loading branch information
BlairLee and bowenlee919 authored Feb 9, 2021
1 parent 60a66fc commit 9357af4
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 9 deletions.
1 change: 0 additions & 1 deletion datasetinsights/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
# ...
DEFAULT_DATA_ROOT = "/data"
SYNTHETIC_SUBFOLDER = "synthetic"
DEFAULT_PUBLIC_DATASET = "synthetic"

# Default Unity Project ID where USim jobs were executed
DEFAULT_PROJECT_ID = "474ba200-4dcc-4976-818e-0efd28efed30"
Expand Down
131 changes: 123 additions & 8 deletions datasetinsights/datasets/coco.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import fcntl
import glob
import json
import logging
import os
import shutil
import zipfile
from pathlib import Path
from typing import List, Tuple
Expand All @@ -13,6 +17,7 @@
from datasetinsights.io.gcs import GCSClient

from .base import Dataset
from .exceptions import DatasetNotFoundError

ANNOTATION_FILE_TEMPLATE = "{}_{}2017.json"
COCO_GCS_PATH = "data/coco"
Expand Down Expand Up @@ -58,9 +63,9 @@ def convert_coco2canonical(coco_annotation):
convert from a tuple of image and coco style dictionary describing the
bboxes to a tuple of image, List of BBox2D
Args:
coco_annotation (tuple): image and coco style dictionary
coco_annotation (tuple): image and coco style dictionary.
Returns: a tuple of image, List of BBox2D
Returns: a tuple of image, List of BBox2D.
"""
image, targets = coco_annotation
Expand All @@ -74,34 +79,59 @@ def convert_coco2canonical(coco_annotation):


class CocoDetection(Dataset):
"""
http://cocodataset.org/#detection-2019
"""COCO dataset for 2D object detection.
Before the class instantiation, it would assume that the COCO dataset is
downloaded.
See COCO dataset `documentation <http://cocodataset.org/#detection-2019>`_
for more details.
Attributes:
root (str): root path of the data.
transforms: callable transformation that applies to a pair of
capture, annotation. Capture is the information captured by the
sensor, in this case an image, and annotations, which in this
dataset are 2d bounding box coordinates and labels.
split (str): indicate split type of the dataset (train|val).
label_mappings (dict): a dict of {label_id: label_name} mapping.
coco (torchvision.datasets.CocoDetection): COCO dataset.
"""

def __init__(
self,
*,
# NOTE(review): both `data_root` and `data_path` appear in this hunk; this
# looks like the removed and the added side of a diff rendered together --
# confirm which parameter the current file actually keeps.
data_root=const.DEFAULT_DATA_ROOT,
data_path=const.DEFAULT_DATA_ROOT,
split="train",
transforms=None,
remove_examples_without_boxes=True,
**kwargs,
):
"""
Args:
data_path (str): Directory of the dataset.
split (str): indicate split type of the dataset (train|val).
transforms: callable transformation that applies to a pair of
capture, annotation.
remove_examples_without_boxes (bool): whether to remove examples
without boxes. Defaults to True.
"""
# todo add test split
self.split = split
# NOTE(review): self.root is assigned twice below (first from data_root,
# then from data_path). Presumably the first assignment and the
# download() call are the pre-change lines -- verify against the file.
self.root = os.path.join(data_root, COCO_LOCAL_PATH)
self.download()
self.root = data_path
# Validate (and, when archives are present, unpack) the files in self.root
# before anything tries to read them.
self._preprocess_dataset(data_path=self.root, split=self.split)
self.coco = self._get_coco(root=self.root, image_set=split)
# Filtering is opt-out: by default the dataset is passed through
# _coco_remove_images_without_annotations.
if remove_examples_without_boxes:
self.coco = _coco_remove_images_without_annotations(
dataset=self.coco
)
self.transforms = transforms
# {label_id: label_name} mapping read from the split's annotation file.
self.label_mappings = self._get_label_mappings()

def __getitem__(self, idx) -> Tuple[Image, List[BBox2D]]:
"""
Args:
idx:
idx (int): index of the data.
Returns: Image with list of bounding boxes found inside the image
Expand Down Expand Up @@ -143,6 +173,91 @@ def _get_local_annotations_zip(self):
def _get_local_images_zip(self):
return os.path.join(self.root, f"{self.split}2017.zip")

def _get_label_mappings(self):
"""get label mappings.
Returns:
dict: A dict containing {label_id: label_name} mappings.
"""
ann_file_name = (
Path(self.root) / "annotations" / f"instances_{self.split}2017.json"
)
label_mappings = {}
with open(ann_file_name, "r") as ann_file:
anns = json.load(ann_file)
for cat in anns["categories"]:
label_mappings[cat["id"]] = cat["name"]
return label_mappings

@staticmethod
def _preprocess_dataset(data_path, split):
    """Preprocess dataset inside data_path and un-archive if necessary.

    Args:
        data_path (str): Path where dataset is stored.
        split (str): indicate split type of the dataset (train|val).

    Returns:
        tuple or str: ``(unarchived img path, unarchived annotation path)``
        when both the image archive and the annotation archive exist under
        ``data_path``; otherwise ``data_path`` itself when the dataset
        files are already present there.

    Raises:
        DatasetNotFoundError: If neither the two archives nor the dataset
            files are found under ``data_path``.
    """
    archive_img_file = Path(data_path) / f"{split}2017.zip"
    archive_ann_file = Path(data_path) / "annotations_trainval2017.zip"

    if archive_img_file.exists() and archive_ann_file.exists():
        unarchived_img_path = CocoDetection._unarchive_data(
            data_path, archive_img_file
        )
        unarchived_ann_path = CocoDetection._unarchive_data(
            data_path, archive_ann_file
        )
        return (unarchived_img_path, unarchived_ann_path)

    if CocoDetection._is_dataset_files_present(data_path):
        # This is for dataset generated by unity simulation.
        return data_path

    # Every fragment needs the f prefix (and a separating space); otherwise
    # "{data_path}" is printed literally and glued to the previous path.
    raise DatasetNotFoundError(
        f"Expecting a file {archive_img_file} and {archive_ann_file} "
        f"under {data_path}"
    )

@staticmethod
def _unarchive_data(data_path, archive_file):
    """Unarchive downloaded data.

    Declared static because it is invoked as
    ``CocoDetection._unarchive_data(data_path, archive_file)`` from the
    static ``_preprocess_dataset``; with a ``self`` parameter that call
    fails for lack of a third positional argument.

    Args:
        data_path (str): Path where dataset is stored.
        archive_file (str or Path): archived file name.

    Returns:
        Path: unarchived path (``data_path`` as a ``Path``).
    """
    unarchived_path = Path(data_path)
    file_descriptor = os.open(archive_file, os.O_RDONLY)
    try:
        # Exclusive lock on the archive so concurrent callers do not unpack
        # into the same directory at once; closing the descriptor in the
        # ``finally`` clause releases the lock.
        fcntl.flock(file_descriptor, fcntl.LOCK_EX)
        if not CocoDetection._is_dataset_files_present(unarchived_path):
            shutil.unpack_archive(
                filename=str(archive_file),
                extract_dir=str(unarchived_path),
            )
            logger.info(f"Unpack {archive_file} to {unarchived_path}")
    finally:
        os.close(file_descriptor)
    return unarchived_path

@staticmethod
def _is_dataset_files_present(data_path):
"""check whether dataset files exist.
Args:
data_path (str): Path where dataset is stored.
Returns:
bool: whether dataset files exist.
"""
return (
os.path.isdir(data_path)
and any(glob.glob(f"{data_path}/*.json"))
and any(glob.glob(f"{data_path}/*.jpg"))
)

def download(self, cloud_path=COCO_GCS_PATH):
path = Path(self.root)
path.mkdir(parents=True, exist_ok=True)
Expand Down
46 changes: 46 additions & 0 deletions tests/datasets/test_coco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import tempfile
from pathlib import Path
from unittest.mock import patch

from pytest import raises

from datasetinsights.datasets.coco import CocoDetection
from datasetinsights.datasets.exceptions import DatasetNotFoundError


def test__is_dataset_files_present():
    """Dataset files are detected only when a .json and a .jpg both exist."""
    # A directory holding one .json and one .jpg file counts as a dataset.
    with tempfile.TemporaryDirectory() as populated_dir:
        json_file = os.path.join(populated_dir, "coco.json")
        jpg_file = os.path.join(populated_dir, "coco.jpg")
        with open(json_file, "x"), open(jpg_file, "x"):
            assert CocoDetection._is_dataset_files_present(populated_dir)

    # A freshly created, empty directory does not.
    with tempfile.TemporaryDirectory() as empty_dir:
        assert not CocoDetection._is_dataset_files_present(empty_dir)


@patch("datasetinsights.datasets.CocoDetection._unarchive_data")
def test__preprocess_dataset(mock_unarchive):
    """Cover the three outcomes of ``CocoDetection._preprocess_dataset``.

    ``_unarchive_data`` is patched out, so only the branching is checked:
    missing dataset, dataset files already present, and archives present.
    """
    split = "train"

    # The context manager removes the directory even when an assertion
    # below fails; a trailing cleanup() call would be skipped in that case.
    with tempfile.TemporaryDirectory() as tmp_name:
        # test no dataset found
        with raises(DatasetNotFoundError):
            CocoDetection._preprocess_dataset(tmp_name, split)

        # test dataset already exists
        with open(os.path.join(tmp_name, "coco.json"), "x"):
            with open(os.path.join(tmp_name, "coco.jpg"), "x"):
                return_value = CocoDetection._preprocess_dataset(
                    tmp_name, split
                )
        assert return_value == tmp_name

        # test whether it can unarchive data
        archive_img_file = Path(tmp_name) / f"{split}2017.zip"
        archive_ann_file = Path(tmp_name) / "annotations_trainval2017.zip"
        with open(archive_img_file, "x"):
            with open(archive_ann_file, "x"):
                CocoDetection._preprocess_dataset(tmp_name, split)
        # One call for the image archive, one for the annotation archive.
        assert mock_unarchive.call_count == 2

0 comments on commit 9357af4

Please sign in to comment.