168 changes: 168 additions & 0 deletions docs/source/datasets/clevrer.rst
@@ -0,0 +1,168 @@
CLEVRER
=======

.. raw:: html

<p style="display: flex; gap: 10px;">
<img src="https://img.shields.io/badge/Task-Video%20Reasoning-blue" alt="Task: Video Reasoning">
<img src="https://img.shields.io/badge/Videos-20%2C000-green" alt="Videos: 20,000">
<img src="https://img.shields.io/badge/Resolution-480x320-orange" alt="Resolution: 480x320">
<img src="https://img.shields.io/badge/Format-MP4-lightgrey" alt="Format: MP4">
</p>

Overview
--------

**CLEVRER** (CoLlision Events for Video REpresentation and Reasoning) is a diagnostic video dataset designed for systematic evaluation of computational models on temporal and causal reasoning tasks.

The dataset contains **20,000 synthetic videos** of moving and colliding objects (spheres, cubes, cylinders) with various colors and materials. Each video is **5 seconds long** with **128 frames** at resolution **480×320**.

CLEVRER includes four types of questions:

- **Descriptive**: e.g., "What color is the sphere?"
- **Explanatory**: e.g., "What's responsible for the collision?"
- **Predictive**: e.g., "What will happen next?"
- **Counterfactual**: e.g., "What if the red cube were removed?"

Split sizes:

- **Train**: 10,000 videos (scene indices 0-9999)
- **Validation**: 5,000 videos (scene indices 10000-14999)
- **Test**: 5,000 videos (scene indices 15000-19999)
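
The split boundaries above can be expressed as a small lookup helper. This is an illustrative sketch only (``split_for_scene`` is not part of the ``stable_datasets`` API); it simply encodes the index ranges listed above:

```python
# Map a CLEVRER scene index to its split, based on the ranges above.
# Hypothetical helper for illustration -- not part of stable_datasets.
SPLIT_RANGES = {
    "train": range(0, 10000),
    "validation": range(10000, 15000),
    "test": range(15000, 20000),
}

def split_for_scene(scene_index: int) -> str:
    for split, idx_range in SPLIT_RANGES.items():
        if scene_index in idx_range:  # range membership is O(1)
            return split
    raise ValueError(f"scene_index {scene_index} out of range 0-19999")

print(split_for_scene(12345))  # validation
```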

Data Structure
--------------

When accessing an example using ``ds[i]``, you will receive a dictionary with the following keys:

.. list-table::
:header-rows: 1
:widths: 20 20 60

* - Key
- Type
- Description
* - ``video``
- ``Video``
- MP4 video file (480×320, 128 frames)
* - ``scene_index``
- int
- Unique scene identifier
* - ``video_filename``
- str
- Original video filename (e.g., "video_00001.mp4")
* - ``questions_json``
- str
- JSON string containing list of questions with answers
* - ``annotations_json``
- str
- JSON string containing object properties and collision events

Questions JSON Structure
------------------------

Each question in ``questions_json`` contains:

.. code-block:: python

{
"question_id": 0,
"question": "What color is the sphere?",
"question_type": "descriptive", # or explanatory, predictive, counterfactual
"answer": "blue",
"choices": [...] # for multiple choice questions
}
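
Since ``questions_json`` is a plain JSON string, the standard library is enough to parse and group questions by type. A minimal sketch, using made-up sample data in the shape shown above (not values from the real dataset):

```python
import json
from collections import Counter

# Illustrative stand-in for sample["questions_json"] (hypothetical values).
questions_json = json.dumps([
    {"question_id": 0, "question": "What color is the sphere?",
     "question_type": "descriptive", "answer": "blue"},
    {"question_id": 1, "question": "What will happen next?",
     "question_type": "predictive",
     "choices": ["the cube collides with the sphere", "nothing happens"]},
])

questions = json.loads(questions_json)
# Count how many questions of each type appear in this sample.
counts = Counter(q["question_type"] for q in questions)
print(counts)  # Counter({'descriptive': 1, 'predictive': 1})
```

The same pattern works on a real sample: replace the stand-in string with ``sample["questions_json"]``.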

Annotations JSON Structure
--------------------------

The ``annotations_json`` contains:

.. code-block:: python

{
"object_property": [
{"object_id": 0, "color": "blue", "material": "rubber", "shape": "sphere"},
...
],
"collision": [
{"frame_id": 19, "object_ids": [0, 1]},
...
],
"motion_trajectory": [...]
}
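
Combining ``object_property`` and ``collision`` yields human-readable collision descriptions. A sketch over illustrative data in the shape shown above (the object values are made up, not taken from the dataset):

```python
import json

# Illustrative stand-in for sample["annotations_json"] (hypothetical values).
annotations_json = json.dumps({
    "object_property": [
        {"object_id": 0, "color": "blue", "material": "rubber", "shape": "sphere"},
        {"object_id": 1, "color": "red", "material": "metal", "shape": "cube"},
    ],
    "collision": [{"frame_id": 19, "object_ids": [0, 1]}],
})

annotations = json.loads(annotations_json)
# Index objects by id so collision events can be resolved to properties.
objects = {o["object_id"]: o for o in annotations["object_property"]}

descriptions = []
for event in annotations["collision"]:
    a, b = (objects[i] for i in event["object_ids"])
    descriptions.append(
        f"frame {event['frame_id']}: {a['color']} {a['shape']} hits {b['color']} {b['shape']}"
    )
print(descriptions[0])  # frame 19: blue sphere hits red cube
```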

Usage Example
-------------

**Basic Usage**

.. code-block:: python

import json
from stable_datasets.images.clevrer import CLEVRER

# Load the train split
ds = CLEVRER(split="train")

sample = ds[0]
    print(sample.keys())  # dict_keys(['video', 'scene_index', 'video_filename', 'questions_json', 'annotations_json'])

# Parse questions
questions = json.loads(sample["questions_json"])
print(f"First question: {questions[0]['question']}")
print(f"Answer: {questions[0]['answer']}")

# Parse annotations
annotations = json.loads(sample["annotations_json"])
print(f"Objects in scene: {len(annotations.get('object_property', []))}")

**Working with Videos**

.. code-block:: python

# Access video frames (requires torchcodec)
video = sample["video"]
frame = video.get_frame_at(0) # Get first frame
print(f"Frame shape: {frame.data.shape}")

Requirements
------------

Video decoding requires ``torchcodec``:

.. code-block:: bash

pip install torchcodec

.. note::

**Large Download Size**: The CLEVRER video files are very large (~12GB for train, ~6GB each for validation/test).
This dataset uses ``wget`` with resume support (``-c`` flag) instead of the standard download method to handle
these large files reliably. If a download is interrupted, it will automatically resume from where it left off.

Related Datasets
----------------

- **CLEVR**: Static image version for visual reasoning
- **GQA**: Real-world visual question answering
- **Something-Something**: Video action understanding

References
----------

- Official website: http://clevrer.csail.mit.edu/
- Paper: `CLEVRER: CoLlision Events for Video REpresentation and Reasoning (ICLR 2020) <https://arxiv.org/abs/1910.01442>`_
- License: CC0

Citation
--------

.. code-block:: bibtex

@inproceedings{yi2020clevrer,
title={CLEVRER: CoLlision Events for Video REpresentation and Reasoning},
author={Yi, Kexin and Gan, Chuang and Li, Yunzhu and Kohli, Pushmeet and Wu, Jiajun and Torralba, Antonio and Tenenbaum, Joshua B},
booktitle={International Conference on Learning Representations},
year={2020}
}
110 changes: 110 additions & 0 deletions docs/source/datasets/not_mnist.rst
@@ -0,0 +1,110 @@
not-MNIST
=========

.. raw:: html

<p style="display: flex; gap: 10px;">
<img src="https://img.shields.io/badge/Task-Image%20Classification-blue" alt="Task: Image Classification">
<img src="https://img.shields.io/badge/Classes-10-green" alt="Classes: 10">
<img src="https://img.shields.io/badge/Size-28x28-orange" alt="Image Size: 28x28">
<img src="https://img.shields.io/badge/Format-Grayscale-lightgrey" alt="Format: Grayscale">
</p>

Overview
--------

The **not-MNIST** dataset was created by Yaroslav Bulatov as a more challenging alternative to the classic MNIST dataset. While the original MNIST consists of handwritten digits, not-MNIST is composed of glyphs extracted from various publicly available fonts.

This dataset features the letters **A through J** (10 classes) and serves as a rigorous benchmark for machine learning models. It is significantly more difficult than MNIST because the fonts range from standard typefaces to highly artistic, experimental, or even barely legible designs.

Split sizes:

- **Train**: 60,000 images
- **Test**: 10,000 images

Data Structure
--------------

When accessing an example using ``ds[i]``, you will receive a dictionary with the following keys:

.. list-table::
:header-rows: 1
:widths: 20 20 60

* - Key
- Type
- Description
* - ``image``
- ``PIL.Image.Image``
- 28×28 Grayscale image of a letter (A-J)
* - ``label``
- int
- Class label (0-9) corresponding to letters A through J
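
The label-to-letter correspondence is a fixed offset from ``'A'``, so it can be computed in both directions. A small sketch (these helpers are illustrative, not part of the ``stable_datasets`` API):

```python
# not-MNIST labels 0-9 map onto letters A-J; hypothetical helpers
# for converting in both directions.
def label_to_letter(label: int) -> str:
    if not 0 <= label <= 9:
        raise ValueError("not-MNIST labels are 0-9")
    return chr(ord("A") + label)

def letter_to_label(letter: str) -> int:
    label = ord(letter.upper()) - ord("A")
    if not 0 <= label <= 9:
        raise ValueError("not-MNIST classes are letters A-J")
    return label

print([label_to_letter(i) for i in range(10)])
# ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
```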

Usage Example
-------------

**Basic Usage**

.. code-block:: python

from stable_datasets.images.not_mnist import NotMNIST

    # First run downloads and prepares the cache, then returns the split as a Hugging Face Dataset
ds = NotMNIST(split="train")

# If you omit the split (split=None), you get a DatasetDict with all available splits
ds_all = NotMNIST(split=None)

sample = ds[0]
    print(sample.keys())  # dict_keys(['image', 'label'])

# Access the image and label
image = sample["image"] # PIL.Image.Image
label = sample["label"] # int (0-9)
print(f"Label: {label} -> Letter: {chr(ord('A') + label)}")

# Optional: make it PyTorch-friendly
ds_torch = ds.with_format("torch")

**With Transforms**

.. code-block:: python

from torchvision import transforms

transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,))
])

ds = NotMNIST(split="train")
sample = ds[0]
tensor = transform(sample["image"])
print(f"Tensor shape: {tensor.shape}") # torch.Size([1, 28, 28])

Related Datasets
----------------

- :doc:`mnist`: The original handwritten digit dataset
- **EMNIST**: Extended MNIST including both digits and handwritten letters
- **Fashion-MNIST**: A replacement for MNIST consisting of clothing items

References
----------

- Creator: Yaroslav Bulatov
- Blog: http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html
- Data source: https://github.com/davidflanagan/notMNIST-to-MNIST

Citation
--------

.. code-block:: bibtex

@misc{bulatov2011notmnist,
author={Yaroslav Bulatov},
title={notMNIST dataset},
year={2011},
url={http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html}
}
Binary file added docs/source/datasets/teasers/clevrer_teaser.png
Binary file added docs/source/datasets/teasers/not_mnist_teaser.png
16 changes: 9 additions & 7 deletions stable_datasets/images/__init__.py
@@ -1,25 +1,23 @@
#!/usr/bin/env python

from .arabic_characters import ArabicCharacters
from .cifar10 import CIFAR10
from .cifar100 import CIFAR100
from .cars196 import Cars196
from .dtd import DTD
from .med_mnist import MedMNIST

from .cifar10 import CIFAR10

# from .arabic_digits import ArabicDigits
# from .awa2 import AWA2
# from .beans import Beans
# from .celeb_a import CelebA
# from .cifar10 import CIFAR10
from .cifar10_c import CIFAR10C
from .cifar100 import CIFAR100

# from .cifar100 import CIFAR100
from .cifar100_c import CIFAR100C
from .clevrer import CLEVRER
from .dtd import DTD
from .med_mnist import MedMNIST


# from .country211 import Country211
# from .cub200 import CUB200
# from .dsprites import DSprites
@@ -34,7 +32,9 @@
# from .k_mnist import KMNIST
# from .linnaeus5 import Linnaeus5
# from .mnist import MNIST
# from .not_mnist import NotMNIST
from .not_mnist import NotMNIST


# from .places365_small import Places365Small
# from .rock_paper_scissor import RockPaperScissor
# from .stl10 import STL10
@@ -48,7 +48,9 @@
"CIFAR100",
"CIFAR10C",
"CIFAR100C",
"CLEVRER",
"MedMNIST",
"Cars196",
"DTD",
"NotMNIST",
]