diff --git a/README.rst b/README.rst index f7bb6ca..45be3f7 100644 --- a/README.rst +++ b/README.rst @@ -221,12 +221,10 @@ Iterator support In many situations the direct input users want to pass to ijson is an iterator (e.g., a generator) rather than a file-like object. -To bridge this gap users need to adapt the iterator into a file-like object. -Examples of this can be found -`here `__ -and `here `__. -Future versions of ijson might provide built-in adapters for this, -and/or support iterators without the need to adapt them first. +ijson provides built-in adapters to bridge this gap: + +- ``ijson.from_iter(iterable_of_bytes)`` +- ``ijson.from_aiter(async_iterable_of_bytes)`` ``asyncio`` support @@ -635,23 +633,42 @@ FAQ by passing the ``multiple_values=True`` to the ijson function in use. See the options_ section for details. -#. How do I use ijson with the ``requests`` library +#. **Q**: How do I use ijson with ``requests`` or ``httpx`` The ``requests`` library downloads the body of the HTTP response immediately by default. - Users wanting to feed the response into ijson - will need to override this behaviour - by using the ``requests.get(..., stream=True)`` parameter. - Then they have at least two options: + To stream JSON into ijson, pass ``stream=True`` and adapt the byte iterator: + + .. code-block:: python + + import requests + import ijson + + with requests.get('https://jsonplaceholder.typicode.com/posts', stream=True) as resp: + resp.raise_for_status() + f = ijson.from_iter(resp.iter_content(chunk_size=64*1024)) + for post in ijson.items(f, 'item'): + print(f"post id = {post['id']}, \t title: {post['title']}") + + You can also pass ``Response.raw`` directly (it's a file-like object), + but using ``iter_content`` is preferred because ``requests`` will transparently + handle HTTP transfer encodings (e.g., gzip, chunked). + + + For async usage with ``httpx``: + + .. 
code-block:: python

-   * Wrap the ``Response.iter_content()`` iterator into a file-like object,
-     then give that to ijson.
+    import httpx, ijson, asyncio

-   * Pass the ``Response.raw`` object (the underlying ``socket.socket``) to ijson.
+    async def main():
+        async with httpx.AsyncClient() as client:
+            async with client.stream('GET', 'https://jsonplaceholder.typicode.com/posts') as resp:
+                resp.raise_for_status()
+                f = ijson.from_aiter(resp.aiter_bytes())
+                async for item in ijson.items(f, 'item'):
+                    print(f"post id = {item['id']}, \t title: {item['title']}")

-   The first alternative is best, since ``requests`` will automatically decode
-   any HTTP transfer encodings, which doesn't happen with ``Response.raw``.
-   See `Iterator support`_ for how to wrap ``Response.iter_content()``
-   into a file-like object.
+    asyncio.run(main())
 
 
 Acknowledgements
diff --git a/src/ijson/__init__.py b/src/ijson/__init__.py
index 9f7c679..806af45 100644
--- a/src/ijson/__init__.py
+++ b/src/ijson/__init__.py
@@ -13,6 +13,7 @@
 also two other backends using the C library yajl in ``ijson.backends``
 that have the same API and are faster under CPython.
'''
+from ijson.adapters import from_aiter, from_iter
 from ijson.common import JSONError, IncompleteJSONError, ObjectBuilder
 from ijson.utils import coroutine, sendable_list
diff --git a/src/ijson/adapters.py b/src/ijson/adapters.py
new file mode 100644
index 0000000..845ef1e
--- /dev/null
+++ b/src/ijson/adapters.py
@@ -0,0 +1,41 @@
+from typing import AsyncIterable, AsyncIterator, Iterable, Iterator
+
+
+class IterReader:
+    """File-like object backed by a byte iterator."""
+
+    def __init__(self, byte_iter: Iterator[bytes]):
+        self._iter = byte_iter
+
+    def read(self, n: int) -> bytes:
+        if n == 0:
+            return b""
+        return next(self._iter, b"")
+
+
+class AiterReader:
+    """Async file-like object backed by an async byte iterator."""
+
+    def __init__(self, byte_aiter: AsyncIterator[bytes]):
+        self._aiter = byte_aiter
+
+    async def read(self, n: int) -> bytes:
+        if n == 0:
+            return b""
+        # anext(it, default) is a 3.10+ builtin; drive the async-iterator
+        # protocol directly so the adapter also works on older interpreters.
+        try:
+            return await self._aiter.__anext__()
+        except StopAsyncIteration:
+            return b""
+
+
+def from_iter(byte_iter: Iterable[bytes]) -> IterReader:
+    """Convert a synchronous byte iterable to a file-like object."""
+    return IterReader(iter(byte_iter))
+
+
+def from_aiter(byte_aiter: AsyncIterable[bytes]) -> AiterReader:
+    """Convert an asynchronous byte iterable to an async file-like object."""
+    # aiter() builtin is 3.10+; __aiter__() is the portable equivalent.
+    return AiterReader(byte_aiter.__aiter__())
diff --git a/tests/test_adapters.py b/tests/test_adapters.py
new file mode 100644
index 0000000..cf34d61
--- /dev/null
+++ b/tests/test_adapters.py
@@ -0,0 +1,116 @@
+import asyncio
+import ijson
+import pytest
+
+from .test_base import JSON, JSON_EVENTS, JSON_PARSE_EVENTS, JSON_OBJECT
+
+CHUNK_SIZE = 10
+
+
+@pytest.fixture
+def chunks():
+    return [JSON[i : i + CHUNK_SIZE] for i in range(0, len(JSON), CHUNK_SIZE)]
+
+
+@pytest.fixture
+def async_chunks():
+    async def chunks():
+        for i in range(0, len(JSON), CHUNK_SIZE):
+            yield JSON[i : i + CHUNK_SIZE]
+
+    return chunks()
+
+
+def test_from_iter_read0_does_not_consume():
+    chunks = [b'{"key":', b'"value"}']
+    file_obj = ijson.from_iter(iter(chunks))
+
assert file_obj.read(0) == b"" + assert file_obj.read(1) == b'{"key":' + assert file_obj.read(1) == b'"value"}' + assert file_obj.read(1) == b"" + + +def test_from_iter_accepts_iterable(): + chunks = [b'{"key":', b'"value"}'] + file_obj = ijson.from_iter(chunks) # no iter(...) + assert file_obj.read(1) == b'{"key":' + assert file_obj.read(1) == b'"value"}' + assert file_obj.read(1) == b"" + + +def test_from_iter_basic_parse(backend, chunks): + file_obj = ijson.from_iter(iter(chunks)) + assert JSON_EVENTS == list(backend.basic_parse(file_obj)) + + +def test_from_iter_parse(backend, chunks): + file_obj = ijson.from_iter(iter(chunks)) + assert JSON_PARSE_EVENTS == list(backend.parse(file_obj)) + + +def test_from_iter_items(backend, chunks): + file_obj = ijson.from_iter(iter(chunks)) + assert [JSON_OBJECT] == list(backend.items(file_obj, "")) + + +def test_from_iter_kvitems(backend, chunks): + file_obj = ijson.from_iter(iter(chunks)) + kv = list(backend.kvitems(file_obj, "")) + assert len(kv) == 1 + key, value = kv[0] + assert key == "docs" + assert value == JSON_OBJECT["docs"] + + +def test_from_aiter_read0_does_not_consume(): + async def chunks(): + yield b'{"key":' + yield b'"value"}' + + async def main(): + file_obj = ijson.from_aiter(chunks()) + assert await file_obj.read(0) == b"" + assert await file_obj.read(1) == b'{"key":' + assert await file_obj.read(1) == b'"value"}' + assert await file_obj.read(1) == b"" + + asyncio.run(main()) + + +def test_from_aiter_basic_parse(backend, async_chunks): + async def main(): + file_obj = ijson.from_aiter(async_chunks) + events = [e async for e in backend.basic_parse(file_obj)] + assert JSON_EVENTS == events + + asyncio.run(main()) + + +def test_from_aiter_parse(backend, async_chunks): + async def main(): + file_obj = ijson.from_aiter(async_chunks) + events = [e async for e in backend.parse(file_obj)] + assert JSON_PARSE_EVENTS == events + + asyncio.run(main()) + + +def test_from_aiter_items(backend, async_chunks): + async 
def main(): + file_obj = ijson.from_aiter(async_chunks) + items = [obj async for obj in backend.items(file_obj, "")] + assert [JSON_OBJECT] == items + + asyncio.run(main()) + + +def test_from_aiter_kvitems(backend, async_chunks): + async def main(): + file_obj = ijson.from_aiter(async_chunks) + kv = [kv async for kv in backend.kvitems(file_obj, "")] + assert len(kv) == 1 + key, value = kv[0] + assert key == "docs" + assert value == JSON_OBJECT["docs"] + + asyncio.run(main())