From 32f5e0e5856f6bbe97be4c42a0c2c7e1b1446896 Mon Sep 17 00:00:00 2001 From: Eric Kerfoot Date: Tue, 14 Mar 2023 21:27:17 +0000 Subject: [PATCH 1/6] Adding JSON Generator --- randomdataset/generators/__init__.py | 1 + randomdataset/generators/generator.py | 4 ++- randomdataset/generators/jsongenerator.py | 36 +++++++++++++++++++++++ randomdataset/schemaparser.py | 15 ++++++++-- 4 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 randomdataset/generators/jsongenerator.py diff --git a/randomdataset/generators/__init__.py b/randomdataset/generators/__init__.py index 03c771a..2cd72b9 100644 --- a/randomdataset/generators/__init__.py +++ b/randomdataset/generators/__init__.py @@ -3,3 +3,4 @@ from .generator import * from .csvgenerator import * +from .jsongenerator import * diff --git a/randomdataset/generators/generator.py b/randomdataset/generators/generator.py index c3de0fe..0191d25 100644 --- a/randomdataset/generators/generator.py +++ b/randomdataset/generators/generator.py @@ -11,7 +11,9 @@ class DataGenerator: - def __init__(self, dataset: Dataset, num_lines: int, file_mode: str = "w", file_ext: str = ""): + def __init__( + self, dataset: Dataset, num_lines: int, file_mode: str = "w", file_ext: str = "" + ): self.dataset = dataset self.num_lines = num_lines self.file_mode = file_mode diff --git a/randomdataset/generators/jsongenerator.py b/randomdataset/generators/jsongenerator.py new file mode 100644 index 0000000..ceaa9b1 --- /dev/null +++ b/randomdataset/generators/jsongenerator.py @@ -0,0 +1,36 @@ +# RandomDataset +# Copyright (c) 2021 Eric Kerfoot, KCL, see LICENSE file + +import json +from itertools import starmap +from typing import IO + +from ..dataset import Dataset +from .generator import DataGenerator +from ..fields import FieldTypes + +__all__ = ["JSONGenerator"] + + +class JSONGenerator(DataGenerator): + def __init__(self, dataset: Dataset, num_lines: int, write_header: bool = True): + super().__init__(dataset, num_lines, file_ext=".json") + self.write_header: bool = write_header + self.sep = "," + + def write_stream(self, stream: IO): + field_types = self.dataset.field_types + + stream.write("{\n") + if self.write_header: + line = self.get_header() + stream.write(f' "header": {json.dumps(line)},\n') + + stream.write(' "data": [') + end = "\n" + for line in self.generate_rows(): + line_str = f"{end} {json.dumps(line)}" + stream.write(line_str) + end = ",\n" + + stream.write("\n ]\n}\n") diff --git a/randomdataset/schemaparser.py b/randomdataset/schemaparser.py index a79ee15..3564a39 100644 --- a/randomdataset/schemaparser.py +++ b/randomdataset/schemaparser.py @@ -72,10 +72,16 @@ def __init__(self, name, a, b): sig = signature(typeconstr) - missing_params = [k for k, v in sig.parameters.items() if k not in schema_dict and v.default is Signature.empty] + missing_params = [ + k + for k, v in sig.parameters.items() + if k not in schema_dict and v.default is Signature.empty + ] if missing_params: - raise ValueError(f"Missing values for these parameters of type '{typename}': {', '.join(missing_params)}") + raise ValueError( + f"Missing values for these parameters of type '{typename}': {', '.join(missing_params)}" + ) args = {} @@ -83,7 +89,10 @@ def __init__(self, name, a, b): if isinstance(value, dict): arg = parse_obj_constr(value) elif isinstance(value, (list, tuple)): - arg = tuple(parse_obj_constr(item) if isinstance(item, dict) else item for item in value) + arg = tuple( + parse_obj_constr(item) if isinstance(item, dict) else item + for item in value + ) else: arg = value From 70065d294e92530371b69afa0035c8ef3ddedd8b Mon Sep 17 00:00:00 2001 From: Eric Kerfoot Date: Wed, 15 Mar 2023 12:48:24 +0000 Subject: [PATCH 2/6] Adding test Signed-off-by: Eric Kerfoot --- tests/test_jsongenerator.py | 56 +++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 tests/test_jsongenerator.py diff --git a/tests/test_jsongenerator.py b/tests/test_jsongenerator.py new file mode 100644 index 0000000..6deb0c9 --- /dev/null +++ b/tests/test_jsongenerator.py @@ -0,0 +1,56 @@ +# RandomDataset +# Copyright (c) 2021 Eric Kerfoot, KCL, see LICENSE file + +import unittest +import io +import json +from randomdataset import ( + StrFieldGen, + IntFieldGen, + Dataset, + JSONGenerator, +) + + +class TestJSONGenerator(unittest.TestCase): + def setUp(self) -> None: + self.fields = [ + StrFieldGen("Name", 6, 12), + IntFieldGen("Age", 1, 90), + IntFieldGen("Height", 50, 250), + ] + + self.ds = Dataset("Humans", self.fields) + self.temp=io.StringIO() + + def _write_temp(self, gen): + gen.write_stream(self.temp) + return json.loads(self.temp.getvalue()) + + def test_write(self): + gen = JSONGenerator(self.ds, 10) + data = self._write_temp(gen) + + self.assertGreater(self.temp.tell(), 0) + + self.assertSetEqual({"header", "data"}, set(data.keys())) + + self.assertEqual(len(data["data"]), 10) + self.assertEqual(len(data["data"][0]), 3) + + def test_write_empty(self): + gen = JSONGenerator(self.ds, 0) + data = self._write_temp(gen) + + self.assertSetEqual({"header", "data"}, set(data.keys())) + + self.assertEqual(len(data["data"]), 0) + + def test_write_no_header(self): + gen = JSONGenerator(self.ds, 10, False) + data = self._write_temp(gen) + + self.assertSetEqual({"data"}, set(data.keys())) + + self.assertEqual(len(data["data"]), 10) + self.assertEqual(len(data["data"][0]), 3) From 5d35cdb4391c99583f7d2614a39dc54a08358e07 Mon Sep 17 00:00:00 2001 From: Eric Kerfoot Date: Wed, 15 Mar 2023 13:50:45 +0000 Subject: [PATCH 3/6] Update --- randomdataset/generators/jsongenerator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/randomdataset/generators/jsongenerator.py b/randomdataset/generators/jsongenerator.py index ceaa9b1..0315b2c 100644 --- a/randomdataset/generators/jsongenerator.py +++ b/randomdataset/generators/jsongenerator.py @@ -2,12 +2,10 @@ # Copyright (c) 2021 Eric Kerfoot, KCL, see LICENSE file import json -from itertools import starmap from typing import IO from ..dataset import Dataset from .generator import DataGenerator -from ..fields import FieldTypes __all__ = ["JSONGenerator"] @@ -19,8 +17,6 @@ def __init__(self, dataset: Dataset, num_lines: int, write_header: bool = True): self.sep = "," def write_stream(self, stream: IO): - field_types = self.dataset.field_types - stream.write("{\n") if self.write_header: line = self.get_header() From f22c6263ff9972e7260ce37a35738ed75332b628 Mon Sep 17 00:00:00 2001 From: Eric Kerfoot Date: Wed, 22 Mar 2023 15:50:29 +0000 Subject: [PATCH 4/6] Adding JSON example notebook Signed-off-by: Eric Kerfoot --- .gitignore | 2 + examples/json_dataset.ipynb | 346 ++++++++++++++++++++++++++++++++++++ 2 files changed, 348 insertions(+) create mode 100644 examples/json_dataset.ipynb diff --git a/.gitignore b/.gitignore index 09cf662..a25f9b1 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,5 @@ dmypy.json .idea *~ + .vscode + \ No newline at end of file diff --git a/examples/json_dataset.ipynb b/examples/json_dataset.ipynb new file mode 100644 index 0000000..dedb921 --- /dev/null +++ b/examples/json_dataset.ipynb @@ -0,0 +1,346 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# JSON Example\n", + "\n", + "This example demonstrates creating a very simple and small dataset in JSON format. Refer to the [Simple Dataset](./simple_dataset.ipynb) example for more details on what is being demonstrated here as the contents are broadly the same.\n", + "\n", + "First thing to do is import `randomdataset` from the parent of this directory:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "sys.path.append(os.path.abspath(\"..\"))\n", + "\n", + "import randomdataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The YAML schema is written out which will be used to generate the random data:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting paymentschema_json.yaml\n" + ] + } + ], + "source": [ + "%%writefile paymentschema_json.yaml\n", + "\n", + "- typename: randomdataset.generators.JSONGenerator\n", + " num_lines: 10\n", + " dataset:\n", + " name: customers\n", + " typename: randomdataset.Dataset\n", + " fields:\n", + " - name: id\n", + " typename: randomdataset.UIDFieldGen\n", + " - name: FirstName\n", + " typename: randomdataset.StrFieldGen\n", + " lmin: 6\n", + " lmax: 14\n", + " - name: LastName\n", + " typename: randomdataset.StrFieldGen\n", + " lmin: 6\n", + " lmax: 14\n", + " \n", + "- typename: randomdataset.generators.JSONGenerator\n", + " num_lines: 20\n", + " dataset:\n", + " name: payments\n", + " typename: randomdataset.Dataset\n", + " fields:\n", + " - name: date\n", + " typename: randomdataset.DateTimeFieldGen\n", + " - name: customer_id\n", + " typename: randomdataset.IntFieldGen\n", + " vmin: 0\n", + " vmax: 10\n", + " - name: amount\n", + " typename: randomdataset.FloatFieldGen\n", + " vmin: 0\n", + " vmax: 100" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The generation is done by passing this schema to the `generate_dataset` command line utility in the library:\n", + "\n", + "```bash\n", + "$ generate_dataset paymentschema_json.yaml .\n", + "```\n", + "\n", + "Instead of invoking this utility the command can be called directly through the imported library:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Schema: 'paymentschema_json.yaml'\n", + "Output: '.'\n", + "Generating dataset 'customers'\n", + "Generating dataset 'payments'\n" + ] + } + ], + "source": [ + "randomdataset.application.generate_dataset.callback(\"paymentschema_json.yaml\",\".\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output is two JSON files, we can look at `customers_json.csv` to see the list of randomly generated customer:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"header\": [\"id\", \"FirstName\", \"LastName\"],\n", + " \"data\": [\n", + " [0, \"cFF3vm5XGeh\", \"NpweyS\"],\n", + " [1, \"X4xOvr\", \"Dx5yAslv\"],\n", + " [2, \"OHjeDIqm0J\", \"xDNQMubp3fbpD\"],\n", + " [3, \"q6JJwF\", \"jwKSf6kGvY3S\"],\n", + " [4, \"wrwGkTtIdjg8V\", \"JtlEx5YqoSMs\"],\n", + " [5, \"xMyhP335D\", \"4RuKdiD5\"],\n", + " [6, \"pgusVtOScPu\", \"lEtjY1lg\"],\n", + " [7, \"GwtQnDFBSwG\", \"02vaMHEe5D2\"],\n", + " [8, \"vTVBAJKbi\", \"47HlCy\"],\n", + " [9, \"XA56V36msh\", \"coDvP6hmNQSX\"]\n", + " ]\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "print(open(\"customers.json\").read())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "145efc09", + "metadata": {}, + "source": [ + "This data can be loaded with the `json` library to get a dictionary containing the `header` and `data` components:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "66c9ba98", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'header': ['id', 'FirstName', 'LastName'], 'data': [[0, 'cFF3vm5XGeh', 'NpweyS'], [1, 'X4xOvr', 'Dx5yAslv'], [2, 'OHjeDIqm0J', 'xDNQMubp3fbpD'], [3, 'q6JJwF', 'jwKSf6kGvY3S'], [4, 'wrwGkTtIdjg8V', 'JtlEx5YqoSMs'], [5, 'xMyhP335D', '4RuKdiD5'], [6, 'pgusVtOScPu', 'lEtjY1lg'], [7, 'GwtQnDFBSwG', '02vaMHEe5D2'], [8, 'vTVBAJKbi', '47HlCy'], [9, 'XA56V36msh', 'coDvP6hmNQSX']]}\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "with open(\"customers.json\") as o:\n", + " data=json.load(o)\n", + "\n", + "print(data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c1ab1535", + "metadata": {}, + "source": [ + "Constructing a Pandas dataframe is straight forward from here:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "459bdb7e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idFirstNameLastName
00cFF3vm5XGehNpweyS
11X4xOvrDx5yAslv
22OHjeDIqm0JxDNQMubp3fbpD
33q6JJwFjwKSf6kGvY3S
44wrwGkTtIdjg8VJtlEx5YqoSMs
55xMyhP335D4RuKdiD5
66pgusVtOScPulEtjY1lg
77GwtQnDFBSwG02vaMHEe5D2
88vTVBAJKbi47HlCy
99XA56V36mshcoDvP6hmNQSX
\n", + "
" + ], + "text/plain": [ + " id FirstName LastName\n", + "0 0 cFF3vm5XGeh NpweyS\n", + "1 1 X4xOvr Dx5yAslv\n", + "2 2 OHjeDIqm0J xDNQMubp3fbpD\n", + "3 3 q6JJwF jwKSf6kGvY3S\n", + "4 4 wrwGkTtIdjg8V JtlEx5YqoSMs\n", + "5 5 xMyhP335D 4RuKdiD5\n", + "6 6 pgusVtOScPu lEtjY1lg\n", + "7 7 GwtQnDFBSwG 02vaMHEe5D2\n", + "8 8 vTVBAJKbi 47HlCy\n", + "9 9 XA56V36msh coDvP6hmNQSX" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "pd.DataFrame(data[\"data\"],columns=data[\"header\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "monai", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5f2cff69adfee386d0482e2608af8430c396819a Mon Sep 17 00:00:00 2001 From: Eric Kerfoot Date: Wed, 22 Mar 2023 15:52:59 +0000 Subject: [PATCH 5/6] Fix Signed-off-by: Eric Kerfoot --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a25f9b1..a7fc0c3 100644 --- a/.gitignore +++ b/.gitignore @@ -131,5 +131,5 @@ dmypy.json .idea *~ - .vscode +.vscode \ No newline at end of file From 6d7f526501494f65c1ab1310a9b27d0854ed4f39 Mon Sep 17 00:00:00 2001 From: Eric Kerfoot Date: Wed, 22 Mar 2023 18:13:17 +0000 Subject: [PATCH 6/6] Update Signed-off-by: Eric Kerfoot --- examples/json_dataset.ipynb | 110 ++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 4 deletions(-) diff --git a/examples/json_dataset.ipynb b/examples/json_dataset.ipynb index dedb921..bf1b67e 100644 --- a/examples/json_dataset.ipynb +++ b/examples/json_dataset.ipynb @@ -15,6 +15,7 @@ { "cell_type": "code", "execution_count": 1, + "id": "95e9168d", "metadata": {}, "outputs": [], "source": [ @@ -28,6 +29,7 @@ }, { "cell_type": "markdown", + "id": "700fb00f", "metadata": {}, "source": [ "The YAML schema is written out which will be used to generate the random data:" @@ -36,6 +38,7 @@ { "cell_type": "code", "execution_count": 2, + "id": "293ea7d5", "metadata": {}, "outputs": [ { @@ -87,6 +90,7 @@ { "attachments": {}, "cell_type": "markdown", + "id": "8c59dace", "metadata": {}, "source": [ "The generation is done by passing this schema to the `generate_dataset` command line utility in the library:\n", @@ -101,6 +105,7 @@ { "cell_type": "code", "execution_count": 3, + "id": "eb78199c", "metadata": {}, "outputs": [ { @@ -115,12 +120,13 @@ } ], "source": [ - "randomdataset.application.generate_dataset.callback(\"paymentschema_json.yaml\",\".\")" + "randomdataset.application.generate_dataset.callback(\"paymentschema_json.yaml\", \".\")" ] }, { "attachments": {}, "cell_type": "markdown", + "id": "3391769e", "metadata": {}, "source": [ "The output is two JSON files, we can look at `customers_json.csv` to see the list of randomly generated customer:" @@ -129,6 +135,7 @@ { "cell_type": "code", "execution_count": 9, + "id": "248fecbc", "metadata": {}, "outputs": [ { @@ -169,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "66c9ba98", "metadata": {}, "outputs": [ @@ -185,11 +192,106 @@ "import json\n", "\n", "with open(\"customers.json\") as o:\n", - " data=json.load(o)\n", + " data = json.load(o)\n", "\n", "print(data)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "25019e4e", + "metadata": {}, + "source": [ + "Using a Jupyter widget a tree view can be created:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "51755663", + "metadata": {}, + "outputs": [ + { + "data": { + "application/json": { + "data": [ + [ + 0, + "cFF3vm5XGeh", + "NpweyS" + ], + [ + 1, + "X4xOvr", + "Dx5yAslv" + ], + [ + 2, + "OHjeDIqm0J", + "xDNQMubp3fbpD" + ], + [ + 3, + "q6JJwF", + "jwKSf6kGvY3S" + ], + [ + 4, + "wrwGkTtIdjg8V", + "JtlEx5YqoSMs" + ], + [ + 5, + "xMyhP335D", + "4RuKdiD5" + ], + [ + 6, + "pgusVtOScPu", + "lEtjY1lg" + ], + [ + 7, + "GwtQnDFBSwG", + "02vaMHEe5D2" + ], + [ + 8, + "vTVBAJKbi", + "47HlCy" + ], + [ + 9, + "XA56V36msh", + "coDvP6hmNQSX" + ] + ], + "header": [ + "id", + "FirstName", + "LastName" + ] + }, + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": { + "application/json": { + "expanded": false, + "root": "root" + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import JSON\n", + "JSON(data)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -318,7 +420,7 @@ "source": [ "import pandas as pd\n", "\n", - "pd.DataFrame(data[\"data\"],columns=data[\"header\"])" + "pd.DataFrame(data[\"data\"], columns=data[\"header\"])" ] } ],