From e858b60d3082a07d6a6e0233020f79aa0d28886d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E6=98=95=E8=BE=B0?= Date: Wed, 14 Jun 2023 20:39:53 +0800 Subject: [PATCH] Add ade doc (#10504) --- configs/_base_/datasets/ade20k_instance.py | 53 ++++ configs/_base_/datasets/ade20k_panoptic.py | 23 -- configs/_base_/datasets/ade20k_semantic.py | 2 +- configs/_base_/datasets/refcoco+.py | 2 +- configs/_base_/datasets/refcoco.py | 2 +- configs/_base_/datasets/refcocog.py | 2 +- docs/en/user_guides/dataset_prepare.md | 66 +++++ mmdet/datasets/__init__.py | 6 +- mmdet/datasets/ade20k.py | 263 +++++++++--------- mmdet/evaluation/metrics/refseg_metric.py | 10 +- projects/XDecoder/README.md | 20 +- ...iny_zeroshot_open-vocab-instance_ade20k.py | 29 ++ ...iny_zeroshot_open-vocab-panoptic_ade20k.py | 22 ++ setup.cfg | 2 +- tools/dataset_converters/ade20k2coco.py | 197 +++++++++++-- 15 files changed, 507 insertions(+), 192 deletions(-) create mode 100644 configs/_base_/datasets/ade20k_instance.py create mode 100644 projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-instance_ade20k.py create mode 100644 projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-panoptic_ade20k.py diff --git a/configs/_base_/datasets/ade20k_instance.py b/configs/_base_/datasets/ade20k_instance.py new file mode 100644 index 00000000000..57f657aa67f --- /dev/null +++ b/configs/_base_/datasets/ade20k_instance.py @@ -0,0 +1,53 @@ +# dataset settings +dataset_type = 'ADE20KInstanceDataset' +data_root = 'data/ADEChallengeData2016/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/ADEChallengeData2016/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(2560, 640), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='ade20k_instance_val.json', + data_prefix=dict(img='images/validation'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'ade20k_instance_val.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/ade20k_panoptic.py b/configs/_base_/datasets/ade20k_panoptic.py index 7672d5d99fc..48b570fe6ae 100644 --- a/configs/_base_/datasets/ade20k_panoptic.py +++ b/configs/_base_/datasets/ade20k_panoptic.py @@ -4,15 +4,6 @@ backend_args = None -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadPanopticAnnotations', backend_args=backend_args), - # TODO: the performance of `FixScaleResize` need to check. 
-    dict(type='FixScaleResize', scale=(2560, 640), backend_args=backend_args),
-    dict(type='RandomCrop', crop_size=(640, 640), crop_type='absolute'),
-    dict(type='RandomFlip', prob=0.5),
-    dict(type='PackDetInputs')
-]
 test_pipeline = [
     dict(type='LoadImageFromFile', backend_args=backend_args),
     dict(type='Resize', scale=(640, 640), keep_ratio=True),
@@ -23,20 +14,6 @@
             'scale_factor'))
 ]

-train_dataloader = dict(
-    batch_size=4,
-    num_workers=2,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    batch_sampler=dict(type='AspectRatioBatchSampler'),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='ade20k_panoptic_train.json',
-        data_prefix=dict(img='images/training/', seg='ade20k_panoptic_train/'),
-        filter_cfg=dict(filter_empty_gt=True, min_size=32),
-        pipeline=train_pipeline,
-        backend_args=backend_args))
 val_dataloader = dict(
     batch_size=1,
     num_workers=2,
diff --git a/configs/_base_/datasets/ade20k_semantic.py b/configs/_base_/datasets/ade20k_semantic.py
index 10ca17d3a54..68899e4b6b0 100644
--- a/configs/_base_/datasets/ade20k_semantic.py
+++ b/configs/_base_/datasets/ade20k_semantic.py
@@ -5,7 +5,7 @@
 # Method 1: simply set the data root and let the file I/O module
 # automatically infer from prefix (not support LMDB and Memcache yet)

-# data_root = 's3://openmmlab/datasets/detection/coco/'
+# data_root = 's3://openmmlab/datasets/detection/ADEChallengeData2016/'

 # Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
 # backend_args = dict(
diff --git a/configs/_base_/datasets/refcoco+.py b/configs/_base_/datasets/refcoco+.py
index 56db966decf..9d7ce8adee1 100644
--- a/configs/_base_/datasets/refcoco+.py
+++ b/configs/_base_/datasets/refcoco+.py
@@ -51,5 +51,5 @@
         text_mode='original',
         pipeline=test_pipeline))

-val_evaluator = dict(type='RefSegMetric', metrics=['cIoU', 'mIoU'])
+val_evaluator = dict(type='RefSegMetric', iou_metrics=['cIoU', 'mIoU'])
 test_evaluator = val_evaluator
diff --git a/configs/_base_/datasets/refcoco.py b/configs/_base_/datasets/refcoco.py
index 518c652bcbb..fdea1132ae1 100644
--- a/configs/_base_/datasets/refcoco.py
+++ b/configs/_base_/datasets/refcoco.py
@@ -51,5 +51,5 @@
         text_mode='original',
         pipeline=test_pipeline))

-val_evaluator = dict(type='RefSegMetric', metrics=['cIoU', 'mIoU'])
+val_evaluator = dict(type='RefSegMetric', iou_metrics=['cIoU', 'mIoU'])
 test_evaluator = val_evaluator
diff --git a/configs/_base_/datasets/refcocog.py b/configs/_base_/datasets/refcocog.py
index 03b40add316..21d42b3f7c5 100644
--- a/configs/_base_/datasets/refcocog.py
+++ b/configs/_base_/datasets/refcocog.py
@@ -51,5 +51,5 @@
         text_mode='original',
         pipeline=test_pipeline))

-val_evaluator = dict(type='RefSegMetric', metrics=['cIoU', 'mIoU'])
+val_evaluator = dict(type='RefSegMetric', iou_metrics=['cIoU', 'mIoU'])
 test_evaluator = val_evaluator
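The three RefCOCO configs above switch `RefSegMetric` to an explicit `iou_metrics` argument. For context, cIoU (cumulative IoU) sums intersections and unions over the whole dataset before dividing, while mIoU averages the per-sample IoUs, so the two can differ noticeably when mask sizes are skewed. A minimal NumPy sketch of the distinction (illustrative only, not the project's `RefSegMetric` implementation):

```python
import numpy as np


def ciou_miou(pred_masks, gt_masks):
    """Toy cIoU/mIoU over paired lists of binary masks (bool ndarrays)."""
    total_inter, total_union, per_sample = 0, 0, []
    for pred, gt in zip(pred_masks, gt_masks):
        inter = np.logical_and(pred, gt).sum()
        union = np.logical_or(pred, gt).sum()
        total_inter += inter
        total_union += union
        per_sample.append(inter / union if union > 0 else 0.0)
    # cIoU: a single ratio over the accumulated dataset-level counts.
    ciou = total_inter / total_union if total_union > 0 else 0.0
    # mIoU: the mean of the per-sample ratios.
    return float(ciou), float(np.mean(per_sample))
```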
diff --git a/docs/en/user_guides/dataset_prepare.md b/docs/en/user_guides/dataset_prepare.md
index 7d960ba18ec..e1e2fb34116 100644
--- a/docs/en/user_guides/dataset_prepare.md
+++ b/docs/en/user_guides/dataset_prepare.md
@@ -99,3 +99,69 @@ data
│   │   └── refs(umd).p
| |── train2014
```
+
The images and annotations of the [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/) dataset can be downloaded by running `tools/misc/download_dataset.py`:

```shell
python tools/misc/download_dataset.py --dataset-name ade20k_2016 --save-dir data --unzip
```

Then move the annotations into the `data/ADEChallengeData2016` directory and run the preprocessing script to produce the COCO format annotations:

```shell
mv data/annotations_instance data/ADEChallengeData2016/
mv data/categoryMapping.txt data/ADEChallengeData2016/
mv data/objectInfo150.txt data/ADEChallengeData2016/
python tools/dataset_converters/ade20k2coco.py data/ADEChallengeData2016 --task panoptic
python tools/dataset_converters/ade20k2coco.py data/ADEChallengeData2016 --task instance
```

The directory structure should look like this:

```text
data
├── ADEChallengeData2016
│   ├── ade20k_instance_train.json
│   ├── ade20k_instance_val.json
│   ├── ade20k_panoptic_train
│   │   ├── ADE_train_00000001.png
│   │   ├── ADE_train_00000002.png
│   │   ├── ...
│   ├── ade20k_panoptic_train.json
│   ├── ade20k_panoptic_val
│   │   ├── ADE_val_00000001.png
│   │   ├── ADE_val_00000002.png
│   │   ├── ...
│   ├── ade20k_panoptic_val.json
│   ├── annotations
│   │   ├── training
│   │   │   ├── ADE_train_00000001.png
│   │   │   ├── ADE_train_00000002.png
│   │   │   ├── ...
│   │   ├── validation
│   │   │   ├── ADE_val_00000001.png
│   │   │   ├── ADE_val_00000002.png
│   │   │   ├── ...
│   ├── annotations_instance
│   │   ├── training
│   │   │   ├── ADE_train_00000001.png
│   │   │   ├── ADE_train_00000002.png
│   │   │   ├── ...
│   │   ├── validation
│   │   │   ├── ADE_val_00000001.png
│   │   │   ├── ADE_val_00000002.png
│   │   │   ├── ...
│   ├── categoryMapping.txt
│   ├── images
│   │   ├── training
│   │   │   ├── ADE_train_00000001.jpg
│   │   │   ├── ADE_train_00000002.jpg
│   │   │   ├── ...
│   │   ├── validation
│   │   │   ├── ADE_val_00000001.jpg
│   │   │   ├── ADE_val_00000002.jpg
│   │   │   ├── ...
│   ├── imgCatIds.json
│   ├── objectInfo150.txt
│   └── sceneCategories.txt
```
diff --git a/mmdet/datasets/__init__.py b/mmdet/datasets/__init__.py
index 3e14849262b..20d373d6179 100644
--- a/mmdet/datasets/__init__.py
+++ b/mmdet/datasets/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from .ade20k import ADE20KPanopticDataset, ADE20KSegDataset +from .ade20k import (ADE20KInstanceDataset, ADE20KPanopticDataset, + ADE20KSegDataset) from .base_det_dataset import BaseDetDataset from .base_semseg_dataset import BaseSegDataset from .base_video_dataset import BaseVideoDataset @@ -38,5 +39,6 @@ 'BaseVideoDataset', 'MOTChallengeDataset', 'TrackImgSampler', 'ReIDDataset', 'YouTubeVISDataset', 'TrackAspectRatioBatchSampler', 'ADE20KPanopticDataset', 'COCOCaptionDataset', 'RefCOCODataset', - 'BaseSegDataset', 'ADE20KSegDataset', 'CocoSegDataset' + 'BaseSegDataset', 'ADE20KSegDataset', 'CocoSegDataset', + 'ADE20KInstanceDataset' ] diff --git a/mmdet/datasets/ade20k.py b/mmdet/datasets/ade20k.py index c0c81809a00..c0484c2fc44 100644 --- a/mmdet/datasets/ade20k.py +++ b/mmdet/datasets/ade20k.py @@ -6,52 +6,91 @@ from mmdet.registry import DATASETS from .base_semseg_dataset import BaseSegDataset +from .coco import CocoDataset from .coco_panoptic import CocoPanopticDataset +ADE_PALETTE = [(120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50), + (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255), + (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7), + (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82), + (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3), + (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255), + (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220), + (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224), + (255, 184, 6), (10, 255, 71), (255, 41, 10), (7, 255, 255), + (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), + (255, 122, 8), (0, 255, 20), (255, 8, 41), (255, 5, 153), + (6, 51, 255), (235, 12, 255), (160, 150, 20), (0, 163, 255), + (140, 140, 140), (250, 10, 15), (20, 255, 0), (31, 255, 0), + (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255), + (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255), + (11, 200, 200), (255, 82, 0), (0, 255, 245), (0, 61, 255), + (0, 255, 112), (0, 255, 133), (255, 0, 0), (255, 163, 0), + (255, 102, 0), (194, 255, 0), (0, 143, 255), (51, 255, 0), + (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255), + (173, 255, 0), (0, 255, 153), (255, 92, 0), (255, 0, 255), + (255, 0, 245), (255, 0, 102), (255, 173, 0), (255, 0, 20), + (255, 184, 184), (0, 31, 255), (0, 255, 61), (0, 71, 255), + (255, 0, 204), (0, 255, 194), (0, 255, 82), (0, 10, 255), + (0, 112, 255), (51, 0, 255), (0, 194, 255), (0, 122, 255), + (0, 255, 163), (255, 153, 0), (0, 255, 10), (255, 112, 0), + (143, 255, 0), (82, 0, 255), (163, 255, 0), (255, 235, 0), + (8, 184, 170), (133, 0, 255), (0, 255, 92), (184, 0, 255), + (255, 0, 31), (0, 184, 255), (0, 214, 255), (255, 0, 112), + (92, 255, 0), (0, 224, 255), (112, 224, 255), (70, 184, 160), + (163, 0, 255), (153, 0, 255), (71, 255, 0), (255, 0, 163), + (255, 204, 0), (255, 0, 143), (0, 255, 235), (133, 255, 0), + (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, 0), + (10, 190, 212), (214, 255, 0), (0, 204, 255), (20, 0, 255), + (255, 255, 0), (0, 153, 255), (0, 41, 255), (0, 255, 204), + (41, 0, 255), (41, 255, 0), (173, 0, 255), (0, 245, 255), + (71, 0, 255), (122, 0, 255), (0, 255, 184), (0, 92, 255), + (184, 255, 0), (0, 133, 255), (255, 214, 0), (25, 194, 194), + (102, 255, 0), (92, 0, 255)] + @DATASETS.register_module() class ADE20KPanopticDataset(CocoPanopticDataset): METAINFO = { 'classes': - ('wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road, route', - 'bed', 'window ', 'grass', 'cabinet', 'sidewalk, pavement', 'person', 
- 'earth, ground', 'door', 'table', 'mountain, mount', 'plant', - 'curtain', 'chair', 'car', 'water', 'painting, picture', 'sofa', - 'shelf', 'house', 'sea', 'mirror', 'rug', 'field', 'armchair', 'seat', - 'fence', 'desk', 'rock, stone', 'wardrobe, closet, press', 'lamp', - 'tub', 'rail', 'cushion', 'base, pedestal, stand', 'box', - 'column, pillar', 'signboard, sign', - 'chest of drawers, chest, bureau, dresser', 'counter', 'sand', 'sink', - 'skyscraper', 'fireplace', 'refrigerator, icebox', - 'grandstand, covered stand', 'path', 'stairs', 'runway', + ('bed', 'window ', 'cabinet', 'person', 'door', 'table', 'curtain', + 'chair', 'car', 'painting, picture', 'sofa', 'shelf', 'mirror', + 'armchair', 'seat', 'fence', 'desk', 'wardrobe, closet, press', + 'lamp', 'tub', 'rail', 'cushion', 'box', 'column, pillar', + 'signboard, sign', 'chest of drawers, chest, bureau, dresser', + 'counter', 'sink', 'fireplace', 'refrigerator, icebox', 'stairs', 'case, display case, showcase, vitrine', 'pool table, billiard table, snooker table', 'pillow', - 'screen door, screen', 'stairway, staircase', 'river', 'bridge, span', - 'bookcase', 'blind, screen', 'coffee table', + 'screen door, screen', 'bookcase', 'coffee table', 'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower', - 'book', 'hill', 'bench', 'countertop', 'stove', 'palm, palm tree', - 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', - 'arcade machine', 'hovel, hut, hutch, shack, shanty', 'bus', 'towel', - 'light', 'truck', 'tower', 'chandelier', 'awning, sunshade, sunblind', - 'street lamp', 'booth', 'tv', 'plane', 'dirt track', 'clothes', - 'pole', 'land, ground, soil', + 'book', 'bench', 'countertop', 'stove', 'palm, palm tree', + 'kitchen island', 'computer', 'swivel chair', 'boat', + 'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier', + 'awning, sunshade, sunblind', 'street lamp', 'booth', 'tv', 'plane', + 'clothes', 'pole', 'bannister, banister, balustrade, balusters, handrail', + 'ottoman, pouf, pouffe, puff, hassock', 'bottle', 'van', 'ship', + 'fountain', 'washer, automatic washer, washing machine', + 'plaything, toy', 'stool', 'barrel, cask', 'basket, handbasket', + 'bag', 'minibike, motorbike', 'oven', 'ball', 'food, solid food', + 'step, stair', 'trade name', 'microwave', 'pot', 'animal', 'bicycle', + 'dishwasher', 'screen', 'sculpture', 'hood, exhaust hood', 'sconce', + 'vase', 'traffic light', 'tray', 'trash can', 'fan', 'plate', + 'monitor', 'bulletin board', 'radiator', 'glass, drinking glass', + 'clock', 'flag', 'wall', 'building', 'sky', 'floor', 'tree', + 'ceiling', 'road, route', 'grass', 'sidewalk, pavement', + 'earth, ground', 'mountain, mount', 'plant', 'water', 'house', 'sea', + 'rug', 'field', 'rock, stone', 'base, pedestal, stand', 'sand', + 'skyscraper', 'grandstand, covered stand', 'path', 'runway', + 'stairway, staircase', 'river', 'bridge, span', 'blind, screen', + 'hill', 'bar', 'hovel, hut, hutch, shack, shanty', 'tower', + 'dirt track', 'land, ground, soil', 'escalator, moving staircase, moving stairway', - 'ottoman, pouf, pouffe, puff, hassock', 'bottle', 'buffet, counter, sideboard', - 'poster, posting, placard, notice, bill, card', 'stage', 'van', - 'ship', 'fountain', - 'conveyor belt, conveyor belt, conveyor, conveyor, transporter', - 'canopy', 'washer, automatic washer, washing machine', - 'plaything, toy', 'pool', 'stool', 'barrel, cask', - 'basket, handbasket', 'falls', 'tent', 'bag', 'minibike, motorbike', - 'cradle', 'oven', 'ball', 'food, solid food', 'step, stair', 
- 'tank, storage tank', 'trade name', 'microwave', 'pot', 'animal', - 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket, cover', - 'sculpture', 'hood, exhaust hood', 'sconce', 'vase', 'traffic light', - 'tray', 'trash can', 'fan', 'pier', 'crt screen', 'plate', 'monitor', - 'bulletin board', 'shower', 'radiator', 'glass, drinking glass', - 'clock', 'flag'), + 'poster, posting, placard, notice, bill, card', 'stage', + 'conveyer belt, conveyor belt, conveyer, conveyor, transporter', + 'canopy', 'pool', 'falls', 'tent', 'cradle', 'tank, storage tank', + 'lake', 'blanket, cover', 'pier', 'crt screen', 'shower'), 'thing_classes': ('bed', 'window ', 'cabinet', 'person', 'door', 'table', 'curtain', 'chair', 'car', 'painting, picture', 'sofa', 'shelf', 'mirror', @@ -89,74 +128,61 @@ class ADE20KPanopticDataset(CocoPanopticDataset): 'land, ground, soil', 'escalator, moving staircase, moving stairway', 'buffet, counter, sideboard', 'poster, posting, placard, notice, bill, card', 'stage', - 'conveyor belt, conveyor belt, conveyor, conveyor, transporter', + 'conveyer belt, conveyor belt, conveyer, conveyor, transporter', 'canopy', 'pool', 'falls', 'tent', 'cradle', 'tank, storage tank', 'lake', 'blanket, cover', 'pier', 'crt screen', 'shower'), 'palette': - ((120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50), - (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255), - (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7), - (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82), - (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3), - (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255), - (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220), - (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224), - (255, 184, 6), (10, 255, 71), (255, 41, 10), (7, 255, 255), - (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), (255, 122, - 8), - (0, 255, 20), (255, 8, 41), (255, 5, 153), (6, 51, 255), (235, 12, - 255), - (160, 150, 20), (0, 163, 255), (140, 140, 140), (250, 10, - 15), (20, 255, 0), - (31, 255, 0), (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255), - (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255), - (11, 200, - 200), (255, 82, - 0), (0, 255, 245), (0, 61, 255), (0, 255, 112), (0, 255, 133), - (255, 0, 0), (255, 163, 0), (255, 102, 0), (194, 255, 0), (0, 143, - 255), - (51, 255, 0), (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255), - (173, 255, - 0), (0, 255, 153), (255, 92, 0), (255, 0, 255), (255, 0, - 245), (255, 0, 102), - (255, 173, 0), (255, 0, 20), (255, 184, - 184), (0, 31, 255), (0, 255, - 61), (0, 71, 255), - (255, 0, 204), (0, 255, 194), (0, 255, - 82), (0, 10, 255), (0, 112, - 255), (51, 0, 255), - (0, 194, 255), (0, 122, 255), (0, 255, 163), (255, 153, - 0), (0, 255, - 10), (255, 112, 0), - (143, 255, 0), (82, 0, 255), (163, 255, 0), (255, 235, - 0), (8, 184, - 170), (133, 0, 255), - (0, 255, 92), (184, 0, 255), (255, 0, 31), (0, 184, 255), (0, 214, - 255), - (255, 0, 112), (92, 255, - 0), (0, 224, 255), (112, 224, - 255), (70, 184, - 160), (163, 0, - 255), (153, 0, 255), - (71, 255, 0), (255, 0, 163), (255, 204, - 0), (255, 0, 143), (0, 255, - 235), (133, 255, 0), - (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, - 0), (10, 190, - 212), (214, 255, - 0), (0, 204, - 255), - (20, 0, 255), (255, 255, - 0), (0, 153, 255), (0, 41, 255), (0, 255, 204), (41, 0, - 255), - (41, 255, - 0), (173, 0, 255), (0, 245, 255), (71, 0, 255), (122, 0, - 255), (0, 255, 184), - (0, 92, 255), (184, 255, 0), (0, 133, 255), 
(255, 214, - 0), (25, 194, - 194), (102, 255, - 0), (92, 0, - 255)) + ADE_PALETTE + } + + +@DATASETS.register_module() +class ADE20KInstanceDataset(CocoDataset): + METAINFO = { + 'classes': + ('bed', 'windowpane', 'cabinet', 'person', 'door', 'table', 'curtain', + 'chair', 'car', 'painting', 'sofa', 'shelf', 'mirror', 'armchair', + 'seat', 'fence', 'desk', 'wardrobe', 'lamp', 'bathtub', 'railing', + 'cushion', 'box', 'column', 'signboard', 'chest of drawers', + 'counter', 'sink', 'fireplace', 'refrigerator', 'stairs', 'case', + 'pool table', 'pillow', 'screen door', 'bookcase', 'coffee table', + 'toilet', 'flower', 'book', 'bench', 'countertop', 'stove', 'palm', + 'kitchen island', 'computer', 'swivel chair', 'boat', + 'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier', + 'awning', 'streetlight', 'booth', 'television receiver', 'airplane', + 'apparel', 'pole', 'bannister', 'ottoman', 'bottle', 'van', 'ship', + 'fountain', 'washer', 'plaything', 'stool', 'barrel', 'basket', 'bag', + 'minibike', 'oven', 'ball', 'food', 'step', 'trade name', 'microwave', + 'pot', 'animal', 'bicycle', 'dishwasher', 'screen', 'sculpture', + 'hood', 'sconce', 'vase', 'traffic light', 'tray', 'ashcan', 'fan', + 'plate', 'monitor', 'bulletin board', 'radiator', 'glass', 'clock', + 'flag'), + 'palette': [(204, 5, 255), (230, 230, 230), (224, 5, 255), + (150, 5, 61), (8, 255, 51), (255, 6, 82), (255, 51, 7), + (204, 70, 3), (0, 102, 200), (255, 6, 51), (11, 102, 255), + (255, 7, 71), (220, 220, 220), (8, 255, 214), + (7, 255, 224), (255, 184, 6), (10, 255, 71), (7, 255, 255), + (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), + (0, 255, 20), (255, 8, 41), (255, 5, 153), (6, 51, 255), + (235, 12, 255), (0, 163, 255), (250, 10, 15), (20, 255, 0), + (255, 224, 0), (0, 0, 255), (255, 71, 0), (0, 235, 255), + (0, 173, 255), (0, 255, 245), (0, 255, 112), (0, 255, 133), + (255, 0, 0), (255, 163, 0), (194, 255, 0), (0, 143, 255), + (51, 255, 0), (0, 82, 255), (0, 255, 41), (0, 255, 173), + (10, 0, 255), (173, 255, 0), (255, 92, 0), (255, 0, 245), + (255, 0, 102), (255, 173, 0), (255, 0, 20), (0, 31, 255), + (0, 255, 61), (0, 71, 255), (255, 0, 204), (0, 255, 194), + (0, 255, 82), (0, 112, 255), (51, 0, 255), (0, 122, 255), + (255, 153, 0), (0, 255, 10), (163, 255, 0), (255, 235, 0), + (8, 184, 170), (184, 0, 255), (255, 0, 31), (0, 214, 255), + (255, 0, 112), (92, 255, 0), (70, 184, 160), (163, 0, 255), + (71, 255, 0), (255, 0, 163), (255, 204, 0), (255, 0, 143), + (133, 255, 0), (255, 0, 235), (245, 0, 255), (255, 0, 122), + (255, 245, 0), (214, 255, 0), (0, 204, 255), (255, 255, 0), + (0, 153, 255), (0, 41, 255), (0, 255, 204), (41, 0, 255), + (41, 255, 0), (173, 0, 255), (0, 245, 255), (0, 255, 184), + (0, 92, 255), (184, 255, 0), (255, 214, 0), (25, 194, 194), + (102, 255, 0), (92, 0, 255)], } @@ -197,44 +223,7 @@ class ADE20KSegDataset(BaseSegDataset): 'tray', 'ashcan', 'fan', 'pier', 'crt screen', 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', 'clock', 'flag'), - palette=[(120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50), - (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255), - (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7), - (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82), - (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3), - (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255), - (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220), - (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224), - (255, 184, 
6), (10, 255, 71), (255, 41, 10), (7, 255, 255), - (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), - (255, 122, 8), (0, 255, 20), (255, 8, 41), (255, 5, 153), - (6, 51, 255), (235, 12, 255), (160, 150, 20), (0, 163, 255), - (140, 140, 140), (250, 10, 15), (20, 255, 0), (31, 255, 0), - (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255), - (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255), - (11, 200, 200), (255, 82, 0), (0, 255, 245), (0, 61, 255), - (0, 255, 112), (0, 255, 133), (255, 0, 0), (255, 163, 0), - (255, 102, 0), (194, 255, 0), (0, 143, 255), (51, 255, 0), - (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255), - (173, 255, 0), (0, 255, 153), (255, 92, 0), (255, 0, 255), - (255, 0, 245), (255, 0, 102), (255, 173, 0), (255, 0, 20), - (255, 184, 184), (0, 31, 255), (0, 255, 61), (0, 71, 255), - (255, 0, 204), (0, 255, 194), (0, 255, 82), (0, 10, 255), - (0, 112, 255), (51, 0, 255), (0, 194, 255), (0, 122, 255), - (0, 255, 163), (255, 153, 0), (0, 255, 10), (255, 112, 0), - (143, 255, 0), (82, 0, 255), (163, 255, 0), (255, 235, 0), - (8, 184, 170), (133, 0, 255), (0, 255, 92), (184, 0, 255), - (255, 0, 31), (0, 184, 255), (0, 214, 255), (255, 0, 112), - (92, 255, 0), (0, 224, 255), (112, 224, 255), (70, 184, 160), - (163, 0, 255), (153, 0, 255), (71, 255, 0), (255, 0, 163), - (255, 204, 0), (255, 0, 143), (0, 255, 235), (133, 255, 0), - (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, 0), - (10, 190, 212), (214, 255, 0), (0, 204, 255), (20, 0, 255), - (255, 255, 0), (0, 153, 255), (0, 41, 255), (0, 255, 204), - (41, 0, 255), (41, 255, 0), (173, 0, 255), (0, 245, 255), - (71, 0, 255), (122, 0, 255), (0, 255, 184), (0, 92, 255), - (184, 255, 0), (0, 133, 255), (255, 214, 0), (25, 194, 194), - (102, 255, 0), (92, 0, 255)]) + palette=ADE_PALETTE) def __init__(self, img_suffix='.jpg', @@ -249,7 +238,7 @@ def load_data_list(self) -> List[dict]: """Load annotation from directory or annotation file. Returns: - list(dict): All data info of dataset. + List[dict]: All data info of dataset. """ data_list = [] img_dir = self.data_prefix.get('img_path', None) diff --git a/mmdet/evaluation/metrics/refseg_metric.py b/mmdet/evaluation/metrics/refseg_metric.py index a59fd3db163..a66acc32edb 100644 --- a/mmdet/evaluation/metrics/refseg_metric.py +++ b/mmdet/evaluation/metrics/refseg_metric.py @@ -11,14 +11,14 @@ class RefSegMetric(BaseMetric): def __init__(self, - metrics: list = ['cIoU', 'mIoU'], + iou_metrics: list = ['cIoU', 'mIoU'], eval_first_text: bool = False, **kwargs): super().__init__(**kwargs) - assert set(metrics).issubset(['cIoU', 'mIoU']), \ - f'Only support cIoU and mIoU, but got {metrics}' - assert len(metrics) > 0, 'metrics should not be empty' - self.metrics = metrics + assert set(iou_metrics).issubset(['cIoU', 'mIoU']), \ + f'Only support cIoU and mIoU, but got {iou_metrics}' + assert len(iou_metrics) > 0, 'metrics should not be empty' + self.metrics = iou_metrics self.eval_first_text = eval_first_text def compute_iou(self, pred_seg, gt_seg): diff --git a/projects/XDecoder/README.md b/projects/XDecoder/README.md index 9b235b55798..629ffce4288 100644 --- a/projects/XDecoder/README.md +++ b/projects/XDecoder/README.md @@ -116,7 +116,7 @@ The image that best matches the given text is ../../images/coco/000.jpg and prob ### Semantic segmentation on ADE20K -Prepare your dataset according to the [docs](https://mmsegmentation.readthedocs.io/en/latest/user_guides/2_dataset_prepare.html#ade20k). 
+Prepare your dataset according to the [docs](../../docs/en/user_guides/dataset_prepare.md).

**Test Command**

@@ -126,14 +126,22 @@ Since semantic segmentation is a pixel-level task, we don't need to use a thresh
./tools/dist_test.sh projects/XDecoder/configs/xdecoder-tiny_zeroshot_semseg.py xdecoder_focalt_best_openseg.pt 8 --cfg-options model.test_cfg.use_thr_for_mc=False
```

-| Model | mIoU | Config |
-| :---------------------------------- | :---: | :------------------------------------------------: |
-| `xdecoder_focalt_best_openseg.pt`\* | 25.13 | [config](configs/xdecoder-tiny_zeroshot_semseg.py) |
+| Model | mIoU | mIoU (official) | Config |
+| :---------------------------------- | :---: | :-------------: | :------------------------------------------------------------------: |
+| `xdecoder_focalt_best_openseg.pt`\* | 25.13 | 25.13 | [config](configs/xdecoder-tiny_zeroshot_open-vocab-semseg_ade20k.py) |

### Instance segmentation on ADE20K

+| Model | mAP | mAP (official) | Config |
+| :---------------------------------- | :--: | :------------: | :--------------------------------------------------------------------: |
+| `xdecoder_focalt_best_openseg.pt`\* | 10.1 | 10.1 | [config](configs/xdecoder-tiny_zeroshot_open-vocab-instance_ade20k.py) |
+
### Panoptic segmentation on ADE20K

+| Model | PQ | PQ (official) | Config |
+| :---------------------------------- | :---: | :-----------: | :--------------------------------------------------------------------: |
+| `xdecoder_focalt_best_openseg.pt`\* | 15.26 | 18.97 | [config](configs/xdecoder-tiny_zeroshot_open-vocab-panoptic_ade20k.py) |
+
### Semantic segmentation on COCO2017

Prepare your dataset according to the [docs](https://mmdetection.readthedocs.io/en/latest/user_guides/dataset_prepare.html#coco).
@@ -178,6 +186,10 @@ Prepare your dataset according to the [docs](https://mmdetection.readthedocs.io/

### Referring segmentation on RefCOCO

+| Model | cIoU | cIoU (official) | Config |
+| :------------------------------- | :---: | :-------------: | :---------------------------------------------------------------------: |
+| `xdecoder_focalt_last_novg.pt`\* | 62.25 | 57.85 | [config](configs/xdecoder-tiny_zeroshot_open-vocab-ref-seg_refcocog.py) |
+
### Image Caption on COCO2014

Prepare your dataset according to the [docs](https://mmdetection.readthedocs.io/en/latest/user_guides/dataset_prepare.html#coco_caption).
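The two new configs below compose the `_base_` dataset files added in this patch with the XDecoder model bases. A quick way to inspect the merged result (a sketch; assumes mmdet and mmengine are installed and the command is run from the repo root):

```python
from mmengine.config import Config

# Load the new instance-segmentation config; `_base_` files are merged
# recursively, so the dataloader comes from ade20k_instance.py while the
# test pipeline is overridden by the child config.
cfg = Config.fromfile('projects/XDecoder/configs/'
                      'xdecoder-tiny_zeroshot_open-vocab-instance_ade20k.py')
print(cfg.test_dataloader.dataset.type)  # expected: ADE20KInstanceDataset
print(cfg.test_evaluator.metric)         # expected: ['segm']
```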
diff --git a/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-instance_ade20k.py b/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-instance_ade20k.py new file mode 100644 index 00000000000..58923599cba --- /dev/null +++ b/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-instance_ade20k.py @@ -0,0 +1,29 @@ +_base_ = [ + '_base_/xdecoder-tiny_open-vocab-instance.py', + 'mmdet::_base_/datasets/ade20k_instance.py' +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + imdecode_backend='pillow', + backend_args=_base_.backend_args), + dict( + type='ResizeShortestEdge', scale=640, max_size=2560, backend='pillow'), + dict( + type='LoadAnnotations', + with_bbox=False, + with_mask=False, + with_seg=True, + reduce_zero_label=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text')) +] + +val_dataloader = dict( + dataset=dict(return_classes=True, pipeline=test_pipeline)) +test_dataloader = val_dataloader + +test_evaluator = dict(metric=['segm']) diff --git a/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-panoptic_ade20k.py b/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-panoptic_ade20k.py new file mode 100644 index 00000000000..5e8f71f79da --- /dev/null +++ b/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-panoptic_ade20k.py @@ -0,0 +1,22 @@ +_base_ = [ + '_base_/xdecoder-tiny_open-vocab-panoptic.py', + 'mmdet::_base_/datasets/ade20k_panoptic.py' +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + imdecode_backend='pillow', + backend_args=_base_.backend_args), + dict( + type='ResizeShortestEdge', scale=640, max_size=2560, backend='pillow'), + dict(type='LoadPanopticAnnotations', backend_args=_base_.backend_args), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'stuff_text')) +] + +val_dataloader = dict( + dataset=dict(return_classes=True, pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/setup.cfg b/setup.cfg index 70dd621c8f5..a3878cf1071 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,4 +18,4 @@ SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true [codespell] skip = *.ipynb quiet-level = 3 -ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood,ba,warmup,nam,DOTA,dota +ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood,ba,warmup,nam,DOTA,dota,conveyer diff --git a/tools/dataset_converters/ade20k2coco.py b/tools/dataset_converters/ade20k2coco.py index 3ae92325c28..3a23a2e5f65 100644 --- a/tools/dataset_converters/ade20k2coco.py +++ b/tools/dataset_converters/ade20k2coco.py @@ -1,23 +1,161 @@ import argparse +import json import os from pathlib import Path import numpy as np +import pycocotools.mask as mask_util from mmengine.utils import ProgressBar, mkdir_or_exist from panopticapi.utils import IdGenerator, save_json from PIL import Image from mmdet.datasets.ade20k import ADE20KPanopticDataset +ORIGINAL_CATEGORIES = [ + 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road, route', + 'bed', 'window ', 'grass', 'cabinet', 'sidewalk, pavement', 'person', + 'earth, ground', 'door', 'table', 'mountain, mount', 'plant', 'curtain', + 'chair', 'car', 'water', 'painting, picture', 'sofa', 'shelf', 'house', + 'sea', 'mirror', 'rug', 'field', 'armchair', 'seat', 'fence', 'desk', + 'rock, stone', 'wardrobe, closet, press', 'lamp', 'tub', 'rail', 'cushion', + 'base, pedestal, stand', 'box', 
'column, pillar', 'signboard, sign', + 'chest of drawers, chest, bureau, dresser', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator, icebox', + 'grandstand, covered stand', 'path', 'stairs', 'runway', + 'case, display case, showcase, vitrine', + 'pool table, billiard table, snooker table', 'pillow', + 'screen door, screen', 'stairway, staircase', 'river', 'bridge, span', + 'bookcase', 'blind, screen', 'coffee table', + 'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower', + 'book', 'hill', 'bench', 'countertop', 'stove', 'palm, palm tree', + 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', + 'arcade machine', 'hovel, hut, hutch, shack, shanty', 'bus', 'towel', + 'light', 'truck', 'tower', 'chandelier', 'awning, sunshade, sunblind', + 'street lamp', 'booth', 'tv', 'plane', 'dirt track', 'clothes', 'pole', + 'land, ground, soil', + 'bannister, banister, balustrade, balusters, handrail', + 'escalator, moving staircase, moving stairway', + 'ottoman, pouf, pouffe, puff, hassock', 'bottle', + 'buffet, counter, sideboard', + 'poster, posting, placard, notice, bill, card', 'stage', 'van', 'ship', + 'fountain', + 'conveyer belt, conveyor belt, conveyer, conveyor, transporter', 'canopy', + 'washer, automatic washer, washing machine', 'plaything, toy', 'pool', + 'stool', 'barrel, cask', 'basket, handbasket', 'falls', 'tent', 'bag', + 'minibike, motorbike', 'cradle', 'oven', 'ball', 'food, solid food', + 'step, stair', 'tank, storage tank', 'trade name', 'microwave', 'pot', + 'animal', 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket, cover', + 'sculpture', 'hood, exhaust hood', 'sconce', 'vase', 'traffic light', + 'tray', 'trash can', 'fan', 'pier', 'crt screen', 'plate', 'monitor', + 'bulletin board', 'shower', 'radiator', 'glass, drinking glass', 'clock', + 'flag' +] + def parse_args(): parser = argparse.ArgumentParser( description='Convert ADE20K annotations to COCO format') parser.add_argument('src', help='ade20k data path') + parser.add_argument('--task', help='task name', default='panoptic') args = parser.parse_args() return args +def prepare_instance_annotations(dataset_dir: str): + dataset_dir = Path(dataset_dir) + for name, dirname in [('train', 'training'), ('val', 'validation')]: + image_dir = dataset_dir / 'images' / dirname + instance_dir = dataset_dir / 'annotations_instance' / dirname + + ann_id = 0 + + # json + out_file = dataset_dir / f'ade20k_instance_{name}.json' + + # json config + instance_config_file = dataset_dir / 'imgCatIds.json' + with open(instance_config_file, 'r') as f: + category_dict = json.load(f)['categories'] + + # catid mapping + mapping_file = dataset_dir / 'categoryMapping.txt' + with open(mapping_file, 'r') as f: + map_id = {} + for i, line in enumerate(f.readlines()): + if i == 0: + continue + ins_id, sem_id, _ = line.strip().split() + map_id[int(ins_id)] = int(sem_id) - 1 + + for cat in category_dict: + cat['id'] = map_id[cat['id']] + + filenames = sorted(list(image_dir.iterdir())) + + ann_dict = {} + images = [] + annotations = [] + + progressbar = ProgressBar(len(filenames)) + for filename in filenames: + image = {} + image_id = filename.stem + + image['id'] = image_id + image['file_name'] = filename.name + + original_format = np.array(Image.open(filename)) + image['height'] = original_format.shape[0] + image['width'] = original_format.shape[1] + + images.append(image) + + instance_file = instance_dir / f'{image_id}.png' + ins_seg = np.array(Image.open(instance_file)) + assert ins_seg.dtype == np.uint8 + + 
instance_cat_ids = ins_seg[..., 0] + instance_ins_ids = ins_seg[..., 1] + + for thing_id in np.unique(instance_ins_ids): + if thing_id == 0: + continue + mask = instance_ins_ids == thing_id + instance_cat_id = np.unique(instance_cat_ids[mask]) + assert len(instance_cat_id) == 1 + + anno = {} + anno['id'] = ann_id + ann_id += 1 + anno['image_id'] = image['id'] + anno['iscrowd'] = int(0) + anno['category_id'] = int(map_id[instance_cat_id[0]]) + + inds = np.nonzero(mask) + ymin, ymax = inds[0].min(), inds[0].max() + xmin, xmax = inds[1].min(), inds[1].max() + anno['bbox'] = [ + int(xmin), + int(ymin), + int(xmax - xmin + 1), + int(ymax - ymin + 1) + ] + + rle = mask_util.encode( + np.array(mask[:, :, np.newaxis], order='F', + dtype='uint8'))[0] + rle['counts'] = rle['counts'].decode('utf-8') + anno['segmentation'] = rle + anno['area'] = int(mask_util.area(rle)) + annotations.append(anno) + progressbar.update() + + ann_dict['images'] = images + ann_dict['categories'] = category_dict + ann_dict['annotations'] = annotations + save_json(ann_dict, out_file) + + def prepare_panoptic_annotations(dataset_dir: str): dataset_dir = Path(dataset_dir) @@ -44,9 +182,21 @@ def prepare_panoptic_annotations(dataset_dir: str): map_id[int(ins_id) - 1] = int(sem_id) - 1 ADE20K_150_CATEGORIES = [] - ADE20K_SEM_SEG_CATEGORIES = ADE20KPanopticDataset.METAINFO['classes'] - PALETTE = ADE20KPanopticDataset.METAINFO['palette'] - for cat_id, cat_name in enumerate(ADE20K_SEM_SEG_CATEGORIES): + # ADE20K_SEM_SEG_CATEGORIES = ADE20KPanopticDataset.METAINFO['classes'] + all_classes = ORIGINAL_CATEGORIES + thing_classes = ADE20KPanopticDataset.METAINFO['thing_classes'] + stuff_classes = ADE20KPanopticDataset.METAINFO['stuff_classes'] + palette = ADE20KPanopticDataset.METAINFO['palette'] + + mapping = {} + for i, t in enumerate(thing_classes): + j = list(all_classes).index(t) + mapping[j] = i + for i, t in enumerate(stuff_classes): + j = list(all_classes).index(t) + mapping[j] = i + len(thing_classes) + + for cat_id, cat_name in enumerate(all_classes): ADE20K_150_CATEGORIES.append({ 'id': cat_id, @@ -55,7 +205,7 @@ def prepare_panoptic_annotations(dataset_dir: str): 'isthing': int(cat_id in map_id.values()), 'color': - PALETTE[cat_id] + palette[cat_id] }) categories_dict = {cat['id']: cat for cat in ADE20K_150_CATEGORIES} @@ -127,7 +277,7 @@ def prepare_panoptic_annotations(dataset_dir: str): segm_info.append({ 'id': int(segment_id), - 'category_id': int(semantic_cat_id), + 'category_id': mapping[int(semantic_cat_id)], 'area': int(area), 'bbox': bbox, 'iscrowd': 0 @@ -161,7 +311,7 @@ def prepare_panoptic_annotations(dataset_dir: str): segm_info.append({ 'id': int(segment_id), - 'category_id': int(semantic_cat_id), + 'category_id': mapping[int(semantic_cat_id)], 'area': int(area), 'bbox': bbox, 'iscrowd': 0 @@ -190,17 +340,32 @@ def prepare_panoptic_annotations(dataset_dir: str): def main(): args = parse_args() + assert args.task in ['panoptic', 'instance'] src = args.src - annotation_train_path = f'{src}/ade20k_panoptic_train' - annotation_val_path = f'{src}/ade20k_panoptic_val' - print('Preparing ADE20K panoptic annotations ...') - print( - f'Creating panoptic annotations to {annotation_train_path} and {annotation_val_path} ...' 
# noqa - ) - if os.path.exists(annotation_train_path) or os.path.exists( - annotation_val_path): - raise RuntimeError('Panoptic annotations already exist.') - prepare_panoptic_annotations(src) + if args.task == 'panoptic': + annotation_train_path = f'{src}/ade20k_panoptic_train' + annotation_val_path = f'{src}/ade20k_panoptic_val' + print('Preparing ADE20K panoptic annotations ...') + print( + f'Creating panoptic annotations to {annotation_train_path} and {annotation_val_path} ...' # noqa + ) + if os.path.exists(annotation_train_path) or os.path.exists( + annotation_val_path): + raise RuntimeError('Panoptic annotations already exist.') + prepare_panoptic_annotations(src) + print('Done.') + else: + annotation_train_path = f'{src}/ade20k_instance_train' + annotation_val_path = f'{src}/ade20k_instance_val' + print('Preparing ADE20K instance annotations ...') + print( + f'Creating instance annotations to {annotation_train_path} and {annotation_val_path} ...' # noqa + ) + if os.path.exists(annotation_train_path) or os.path.exists( + annotation_val_path): + raise RuntimeError('Instance annotations already exist.') + prepare_instance_annotations(src) + print('Done.') if __name__ == '__main__':
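As a usage note for the converter above: once `ade20k2coco.py` has been run, the generated files can be sanity-checked with pycocotools (a sketch under the default paths; not part of the patch):

```python
import pycocotools.mask as mask_util
from pycocotools.coco import COCO

# Load the generated COCO-format instance annotations and print basic stats.
coco = COCO('data/ADEChallengeData2016/ade20k_instance_val.json')
print(f'{len(coco.getImgIds())} images, {len(coco.getAnnIds())} annotations, '
      f'{len(coco.getCatIds())} categories')

# Decode one mask to confirm the RLE segmentations round-trip. The converter
# stores `counts` as a utf-8 string, so re-encode it to bytes before decoding.
ann = coco.loadAnns(coco.getAnnIds())[0]
rle = dict(ann['segmentation'])
if isinstance(rle['counts'], str):
    rle['counts'] = rle['counts'].encode('utf-8')
mask = mask_util.decode(rle)
print('first mask:', mask.shape, 'area:', int(mask.sum()))
```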