From e36d46633a885360bd3b07739b1c16589e3bde42 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Wed, 13 Jan 2021 05:19:45 -0500
Subject: [PATCH] Add MNIST dataset (#1730)

* Add MNIST dataset

* Update datasets/mnist/README.md

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/mnist/README.md                      | 146 ++++++++++++++++++
 datasets/mnist/dataset_infos.json             |   1 +
 .../mnist/dummy/mnist/1.0.0/dummy_data.zip    | Bin 0 -> 2672 bytes
 datasets/mnist/mnist.py                       | 116 ++++++++++++++
 4 files changed, 263 insertions(+)
 create mode 100644 datasets/mnist/README.md
 create mode 100644 datasets/mnist/dataset_infos.json
 create mode 100644 datasets/mnist/dummy/mnist/1.0.0/dummy_data.zip
 create mode 100644 datasets/mnist/mnist.py

diff --git a/datasets/mnist/README.md b/datasets/mnist/README.md
new file mode 100644
index 00000000000..9a05178b13d
--- /dev/null
+++ b/datasets/mnist/README.md
@@ -0,0 +1,146 @@
+---
+annotations_creators:
+- experts
+language_creators:
+- found
+languages: []
+licenses:
+- MIT
+multilinguality: []
+size_categories:
+- 10K<n<100K
+source_datasets:
+- extended|other-nist
+task_categories:
+- other
+task_ids:
+- other-other-image-classification
+---
+
+# Dataset Card for MNIST
+
+## Table of Contents
+- [Dataset Description](#dataset-description)
+  - [Dataset Summary](#dataset-summary)
+  - [Supported Tasks](#supported-tasks-and-leaderboards)
+  - [Languages](#languages)
+- [Dataset Structure](#dataset-structure)
+  - [Data Instances](#data-instances)
+  - [Data Fields](#data-fields)
+  - [Data Splits](#data-splits)
+- [Dataset Creation](#dataset-creation)
+  - [Curation Rationale](#curation-rationale)
+  - [Source Data](#source-data)
+  - [Annotations](#annotations)
+  - [Personal and Sensitive Information](#personal-and-sensitive-information)
+- [Considerations for Using the Data](#considerations-for-using-the-data)
+  - [Social Impact of Dataset](#social-impact-of-dataset)
+  - [Discussion of Biases](#discussion-of-biases)
+  - [Other Known Limitations](#other-known-limitations)
+- [Additional Information](#additional-information)
+  - [Dataset Curators](#dataset-curators)
+  - [Licensing Information](#licensing-information)
+  - [Citation Information](#citation-information)
+
+## Dataset Description
+
+- **Homepage:** http://yann.lecun.com/exdb/mnist/
+- **Repository:** 
+- **Paper:** MNIST handwritten digit database by Yann LeCun, Corinna Cortes, and CJ Burges
+- **Leaderboard:**
+- **Point of Contact:**
+
+### Dataset Summary
+
+The MNIST dataset consists of 70,000 28x28 black-and-white images of handwritten digits extracted from two NIST databases. There are 60,000 images in the training dataset and 10,000 images in the test dataset, one class per digit so a total of 10 classes, with 7,000 images (6,000 train images and 1,000 test images) per class.
+Half of the images were drawn by Census Bureau employees and the other half by high school students (this split is evenly distributed in the training and testing sets).
+
+### Supported Tasks and Leaderboards
+
+[More Information Needed]
+
+### Languages
+
+English
+
+## Dataset Structure
+
+### Data Instances
+
+A data point comprises an image and its label.
+
+### Data Fields
+
+- image: a 2d array of integers representing the 28x28 image.
+- label: an integer between 0 and 9 representing the digit.
+
+### Data Splits
+
+The data is split into training and test set. All the images in the test set were drawn by different individuals than the images in the training set. The training set contains 60,000 images and the test set 10,000 images. 
+
+## Dataset Creation
+
+### Curation Rationale
+
+The MNIST database was created to provide a testbed for people wanting to try pattern recognition methods or machine learning algorithms while spending minimal efforts on preprocessing and formatting. Images of the original dataset (NIST) were in two groups, one consisting of images drawn by Census Bureau employees and one consisting of images drawn by high school students. In NIST, the training set was built by grouping all the images of the Census Bureau employees, and the test set was built by grouping the images from the high school students.
+The goal in building MNIST was to have a training and test set following the same distributions, so the training set contains 30,000 images drawn by Census Bureau employees and 30,000 images drawn by high school students, and the test set contains 5,000 images of each group. The curators took care to make sure all the images in the test set were drawn by different individuals than the images in the training set. 
+
+### Source Data
+
+#### Initial Data Collection and Normalization
+
+The original images from NIST were size normalized to fit a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels (i.e., pixels don't simply have a value of black and white, but a level of greyness from 0 to 255) as a result of the anti-aliasing technique used by the normalization algorithm. The images were then centered in a 28x28 image by computing the center of mass of the pixels, and translating the image so as to position this point at the center of the 28x28 field.
+
+#### Who are the source image producers?
+
+Half of the source images were drawn by Census Bureau employees, half by high school students. According to the dataset curator, the images from the first group are more easily recognizable.
+
+### Annotations
+
+#### Annotation process
+
+The images were not annotated after their creation: the image creators annotated their images with the corresponding label after drawing them.
+
+#### Who are the annotators?
+
+Same as the source data creators.
+
+### Personal and Sensitive Information
+
+[More Information Needed]
+
+## Considerations for Using the Data
+
+### Social Impact of Dataset
+
+[More Information Needed]
+
+### Discussion of Biases
+
+[More Information Needed]
+
+### Other Known Limitations
+
+[More Information Needed]
+
+## Additional Information
+
+### Dataset Curators
+
+Chris Burges, Corinna Cortes and Yann LeCun
+
+### Licensing Information
+
+MIT Licence
+
+### Citation Information
+
+```
+@article{lecun2010mnist,
+  title={MNIST handwritten digit database},
+  author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
+  journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
+  volume={2},
+  year={2010}
+}
+```
diff --git a/datasets/mnist/dataset_infos.json b/datasets/mnist/dataset_infos.json
new file mode 100644
index 00000000000..589d3e6289b
--- /dev/null
+++ b/datasets/mnist/dataset_infos.json
@@ -0,0 +1 @@
+{"mnist": {"description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.\n", "citation": "@article{lecun2010mnist,\n  title={MNIST handwritten digit database},\n  author={LeCun, Yann and Cortes, Corinna and Burges, CJ},\n  journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},\n  volume={2},\n  year={2010}\n}\n", "homepage": "http://yann.lecun.com/exdb/mnist/", "license": "", "features": {"image": {"shape": [28, 28], "dtype": "uint8", "id": null, "_type": "Array2D"}, "label": {"num_classes": 10, "names": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "image", "output": "label"}, "builder_name": "mnist", "config_name": "mnist", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 54480048, "num_examples": 60000, "dataset_name": "mnist"}, "test": {"name": "test", "num_bytes": 9080008, "num_examples": 10000, "dataset_name": "mnist"}}, "download_checksums": {"https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz": {"num_bytes": 9912422, "checksum": "440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609"}, "https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz": {"num_bytes": 28881, "checksum": "3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c"}, "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz": {"num_bytes": 1648877, "checksum": "8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6"}, "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz": {"num_bytes": 4542, "checksum": "f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6"}}, "download_size": 11594722, 
"post_processing_size": null, "dataset_size": 63560056, "size_in_bytes": 75154778}}
\ No newline at end of file
diff --git a/datasets/mnist/dummy/mnist/1.0.0/dummy_data.zip b/datasets/mnist/dummy/mnist/1.0.0/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..fc61ea9299d56e5255e1bdffcefac65092a7d49c
GIT binary patch
literal 2672
zcmai$2UHX30)-O-gr;B!9YH`uN<ir#E)c3jq#BT(kdQ>0B1Bk<g-ADqrbLWXQ82Km
z5FUb}XF-++nn*{wJ`j{<P?{`k(A~45;+r}1&&-)K_nUwIbMLv12oSdbU~@ZqNyGW`
z#ovP<Ko)?B2o8?&#h}7bTE|`a0iZ^Sb>B+P5fLc}0CG=)0D#YfBS!?vd5sSc!Vwv!
z$=(U(Jn|C;0F<|igomLBAuvKP3Wq%dBVZzRU=ir3aI7Zo*U$1B9O;Jd<!5?N*t$AN
zg69LV9w|Fq%5o4aG_($if#(wbXE6@Z%oei{-UV}1#5*xM$pke1i6y#JikgrzM!{QO
zIO_hQv_E&;xjzk?bz`o+lO{#08>kaLZ1o{gYK|&Da$<IjjFWsS?lwDau)3<Q<5Pje
z<v-F{NNA49-H3WTSYb7ezg)-#&MS(9>Yp!G@YD}-muu2;X{<3vT<+hyTYc=UW)o64
zz$v5MvcW6mlxW1YH;xA606AkX3)ppkZLfhD_!CPd^n{RrW~Gfru9bPW_~TDq=82~b
z?ZhvpcPqXfM?$hUGU`K>^qk+@&s7a~w48>#PmV*?_a|rW?5+*^#L7N!L}N_M8@Y12
z!@{_Ou=7T~abYHHwK3<WN0(;CEfx^4tcGa&Wlzkz_@rbAwc*v_JtjxEO!nMGSDJFO
zbp6UO(u2h%h?ZGOWSc#e;2K>yWJ*KL*Pkk1Kvc~UGXIn=Bsqi}_e-O_H3+beNd>r1
z;gje*5ziBN@}jC|Ri+~tBJ8LuLiw`k=>w|tC8+zYcyheLm0j+$ecFgohTrL<0@BkG
zQ=mw1Td$#0KDW<><p2*Mt|Ej+Un6EKRwR|kBT56}AI9xSZD~r^y>nG;HgvA!`n*|+
z!4$FX%^Jb#-OXML42T;_&%pDV*^DI`>tw$L`Q4~OVNKQZ3MQIoqVxCFL((4lsiDo&
z>S9c^Y_E^%@y0&g1YGN2w5|KoX0Fbq^^>B8ZjYg(ZV!FZr<3j~t2|@Ygpy3*6W)Q0
zeq1dr7>8H&XYO}z%BN!Qz-iV`1jJTL@~jJI^O@$=^b*Ph_=#=G?tQMFe_k>X48M9)
zY>32KTogcqE|n^*$if`%duCu<{!0|0=P5eyTX#%aOl2H0#;YFo*+=4Tb-{xb#Io<j
z!LJ=wO&|r8P<k#(+DcW(JbN=fcTm)dvf7|V4OJRiPrNa|rr#7>wld~yH^UQzf4z=M
zO}A_(C7H7-!amZ0?IL-+1wE-KeUM~n>LfQ@&PM30VUmQC{x$VFoqN%PB64CG<q~Rj
zrf1|q*~!9f!vNqgRoI>n^Dt7eT<)C{=)o^$>FxJSU%$IA7%Oi^?wU@n9+kV+teDh0
zk#<qFd&ae@S-d2DiML7R08W=xMG5eUaMWCQ?GM!g);Ll|{`GPg$IFpXSLX#e&fdW}
ze(2>O6dD`!*~<sFdKoOa?)&h2FZ=reKu9F7xb-o>za%1uRKR9x@BTYA_I{HZ2M+{p
z9~#0O(4+4`<GsGNE>5!Gc^SqZrqHQ<{Nuc(d&e!X!U1YVYBz=Kif`TS6%d3ed7t}W
zuIwn9w!(_H3^?XQf+uNn5vbTg$V5Ihvrbi<&{O(m?YVK@(%aem5zh}4-@e6oW*qM9
z(hV9-oo{_|**|r8*&%bGx{|n|A1s44xlcT3pL`+UIrEQm^Or6!oYjcK(&60=s<H9?
zLC5sls}}ljIEzq2c!tYx5WRvFJCf|k=0Y^TwCmT|4S#a@My&gQeGmD?F18YXAs;x<
zZ2u%ohHS1uzN<k>^Pgr*H`z%cEcat~*214A)D|xS>-9HG+(LAtANF*5U_j>*ifX3(
zkIR^4q(CGJnb3XoXyLa}c~+}M%ZLo{&f6`ztIQ)>c&aH+NL3=f)5*ZFlfposKS_ES
zXW^ZzZ)N{GR6lDPki)lo!3?a8wv-q~^0F>g^UIyzPl{n!FpEXQe~qPgpRC~L*F7ga
zr<;_6?e{<_3x?EI6zmyoA$FthE_=+L(0Jbm{_R!MU9YZVFGO$WFpIJegj&H8QKFtA
z4@?u$7<4|1p+gs~5eUzKW<L5zZ<&6uq+pEEIYfRD&<l7;yxd)gY%6!n^P7^qDKe;b
z!a)SC0HTWocJ>cY^ay%;y99=;o6!6h1YcqzFTHG*Amzc_0`CBuaHO<pC#U+}+HCuD
zwu<siqnKE|6J^b$>seW?t%)*q&$J3oXRftp1*2jTH@xPw&aBj<p^MgQZn{L(;BmgA
zLq4~Ks|DaC&UzCCm;~K8*Q^)Ukk1;Em8@9zh~Qck_M8DXg=z#GH8l!@u-9&dO_f=r
z)`B$pF*a!@iMNn2>f*;XyfQ3NN0%XV$E3cc=@ko^BWB$EDx^5GhiFG+IMnsXmtfI3
z4m-l^a;35vqFOmUlf&xBF&~QObFW5MBs?+3CbYQYVM1%oeqSpCvP+pp@t@id#VNMd
zV{7?V*fK3V9l~Eeg(?2cft}rT*15c)Eje`6D$45^oKRKJUMVYQ{vU_3l#;<BUEqE6
zM+l@Z(Zh&l^Gxpye@)AzZ^uZh{8sCcf=$i;G5L?Ix2?xFt+!bTzNrJBJx<(b%`49F
zIFL&Yv|YiF<fO{xCIXl--#l-r8$4g?#y8qu1h*?2{|EvAB^Ru=3jVBma8R4o<0q)^
zv+!%wF77Sg{sr~pzs5mrCgV>~--qGXsOWD{+eagZw7EY2hqQI=za}k$w@zx4wC$XJ
VNaEq$TvQ&;{s{;G`0{Pu{RKmqCDs4{

literal 0
HcmV?d00001

diff --git a/datasets/mnist/mnist.py b/datasets/mnist/mnist.py
new file mode 100644
index 00000000000..58e48b22b55
--- /dev/null
+++ b/datasets/mnist/mnist.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""MNIST Data Set"""
+
+from __future__ import absolute_import, division, print_function
+
+import struct
+
+import numpy as np
+
+import datasets
+
+
+_CITATION = """\
+@article{lecun2010mnist,
+  title={MNIST handwritten digit database},
+  author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
+  journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
+  volume={2},
+  year={2010}
+}
+"""
+
+_DESCRIPTION = """\
+The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000
+images per class. There are 60,000 training images and 10,000 test images.
+"""
+
+_URL = "https://storage.googleapis.com/cvdf-datasets/mnist/"  # base URL; each _URLS file name is appended to it
+_URLS = {  # key used by _split_generators -> gzip-compressed IDX file name under _URL
+    "train_images": "train-images-idx3-ubyte.gz",
+    "train_labels": "train-labels-idx1-ubyte.gz",
+    "test_images": "t10k-images-idx3-ubyte.gz",
+    "test_labels": "t10k-labels-idx1-ubyte.gz",
+}
+
+
+class MNIST(datasets.GeneratorBasedBuilder):
+    """MNIST Data Set"""
+
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(
+            name="mnist",
+            version=datasets.Version("1.0.0"),
+            description=_DESCRIPTION,
+        )
+    ]
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "image": datasets.Array2D(shape=(28, 28), dtype="uint8"),
+                    "label": datasets.features.ClassLabel(names=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]),
+                }
+            ),
+            supervised_keys=("image", "label"),
+            homepage="http://yann.lecun.com/exdb/mnist/",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        urls_to_download = {key: _URL + fname for key, fname in _URLS.items()}
+        downloaded_files = dl_manager.download_and_extract(urls_to_download)
+        print(downloaded_files)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": [downloaded_files["train_images"], downloaded_files["train_labels"]],
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": [downloaded_files["test_images"], downloaded_files["test_labels"]],
+                    "split": "test",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath, split):
+        """This function returns the examples in the raw form."""
+        # Images
+        with open(filepath[0], "rb") as f:
+            # First 16 bytes contain some metadata
+            _ = f.read(4)
+            size = struct.unpack(">I", f.read(4))[0]
+            _ = f.read(8)
+            images = np.frombuffer(f.read(), dtype=np.uint8).reshape(size, 28, 28)
+
+        # Labels
+        with open(filepath[1], "rb") as f:
+            # First 8 bytes contain some metadata
+            _ = f.read(8)
+            labels = np.frombuffer(f.read(), dtype=np.uint8)
+
+        for idx in range(size):
+            yield idx, {"image": images[idx], "label": str(labels[idx])}