Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QueryBuilder: Implementation of contains Filter Operator applied to JSON Fields for SQLite backend #6619

Merged
merged 38 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
a9165f9
add tests to `contains` filter operator on PostgreSQL backend
dependabot[bot] Nov 6, 2024
ab88f00
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 19, 2024
518cc6b
Merge branch 'main' into test-json-contains
rabbull Nov 19, 2024
ec36aae
temp
rabbull Nov 19, 2024
474dd26
add tests for nested arrays
rabbull Nov 19, 2024
a759d43
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 19, 2024
5b7b62b
update
rabbull Nov 19, 2024
e23ec32
custom function
rabbull Nov 19, 2024
e530f03
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 19, 2024
6df03e3
cleanup
rabbull Nov 19, 2024
84c50f6
catchup
rabbull Nov 19, 2024
f787404
fix compilation error on py39
rabbull Nov 20, 2024
36c7102
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 20, 2024
dcd3cf9
add benchmark
rabbull Nov 20, 2024
d293d18
ignore benchmark results
rabbull Nov 20, 2024
a34623f
Merge branch 'test-json-contains' into sqlite-json-contains
rabbull Nov 20, 2024
079cc32
remove requires_psql marks for sqlite tests
rabbull Nov 20, 2024
598f821
temp
rabbull Nov 21, 2024
93ad037
add benchmark
rabbull Nov 23, 2024
93966cd
Merge branch 'sqlite-json-contains' of github.com:rabbull/aiida-core …
rabbull Nov 23, 2024
9293c67
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 23, 2024
180bec0
Merge branch 'main' of github.com:rabbull/aiida-core into sqlite-json…
rabbull Nov 26, 2024
edfe2b2
Merge branch 'main' of github.com:rabbull/aiida-core into sqlite-json…
rabbull Nov 26, 2024
2189a81
migrate sqlite filter tests to orm
rabbull Nov 26, 2024
ffa0b11
add comment on impl for psql of_type
rabbull Nov 26, 2024
1ca1d3c
Merge branch 'main' of github.com:rabbull/aiida-core into sqlite-json…
rabbull Dec 5, 2024
c699484
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 5, 2024
9d56802
Merge branch 'main' of github.com:rabbull/aiida-core into sqlite-json…
rabbull Dec 10, 2024
3eca114
Merge branch 'main' of github.com:rabbull/aiida-core into sqlite-json…
rabbull Dec 10, 2024
af544e7
add tests for custom functions
rabbull Dec 10, 2024
fbb7ee3
enable sqlite database backend testing in github actions
rabbull Dec 10, 2024
57beea7
add sqlite to coverage report workflow
rabbull Dec 10, 2024
7e78b22
Merge branch 'main' of github.com:rabbull/aiida-core into sqlite-json…
rabbull Dec 17, 2024
2d14fa0
Merge branch 'main' of github.com:rabbull/aiida-core into sqlite-json…
rabbull Jan 13, 2025
ed103da
Merge branch 'main' into sqlite-json-contains
unkcpz Jan 16, 2025
d7c70d8
return NULL when attr_key doesn't exist
rabbull Jan 21, 2025
d226c26
revert modificiations in .github
rabbull Jan 21, 2025
7617861
explain why sqlite contains implementation in comment
rabbull Jan 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/test-install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ jobs:
fail-fast: false
matrix:
python-version: ['3.9', '3.10', '3.11', '3.12']
database-backend: [psql, sqlite]
rabbull marked this conversation as resolved.
Show resolved Hide resolved

services:
postgres:
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,6 @@ pplot_out/

# docker
docker-bake.override.json

# benchmark
.benchmarks/
3 changes: 1 addition & 2 deletions src/aiida/storage/sqlite_zip/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,7 @@ def _cast_json_type(comparator: JSON.Comparator, value: Any) -> Tuple[ColumnElem
return case((type_filter, casted_entity.ilike(value, escape='\\')), else_=False)

if operator == 'contains':
# to-do, see: https://github.com/sqlalchemy/sqlalchemy/discussions/7836
raise NotImplementedError('The operator `contains` is not implemented for SQLite-based storage plugins.')
return func.json_contains(database_entity, json.dumps(value))

if operator == 'has_key':
return (
Expand Down
32 changes: 32 additions & 0 deletions src/aiida/storage/sqlite_zip/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,44 @@
cursor.close()


def _contains(lhs: Union[dict, list], rhs: Union[dict, list]):
rabbull marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(lhs, dict) and isinstance(rhs, dict):
for key in rhs:
if key not in lhs or not _contains(lhs[key], rhs[key]):
return False
return True

elif isinstance(lhs, list) and isinstance(rhs, list):
for item in rhs:
if not any(_contains(e, item) for e in lhs):
return False
return True
else:
return lhs == rhs


def _json_contains(lhs: Union[str, bytes, bytearray, dict, list], rhs: Union[str, bytes, bytearray, dict, list]):
try:
if isinstance(lhs, (str, bytes, bytearray)):
lhs = json.loads(lhs)
if isinstance(rhs, (str, bytes, bytearray)):
rhs = json.loads(rhs)
except json.JSONDecodeError:
return 0

Check warning on line 74 in src/aiida/storage/sqlite_zip/utils.py

View check run for this annotation

Codecov / codecov/patch

src/aiida/storage/sqlite_zip/utils.py#L73-L74

Added lines #L73 - L74 were not covered by tests
return int(_contains(lhs, rhs))


def register_json_contains(dbapi_connection, _):
dbapi_connection.create_function('json_contains', 2, _json_contains)


def create_sqla_engine(path: Union[str, Path], *, enforce_foreign_keys: bool = True, **kwargs) -> Engine:
"""Create a new engine instance."""
engine = create_engine(f'sqlite:///{path}', json_serializer=json.dumps, json_deserializer=json.loads, **kwargs)
event.listen(engine, 'connect', sqlite_case_sensitive_like)
if enforce_foreign_keys:
event.listen(engine, 'connect', sqlite_enforce_foreign_keys)
event.listen(engine, 'connect', register_json_contains)
return engine


Expand Down
138 changes: 138 additions & 0 deletions tests/benchmark/test_json_contains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import random
import string

import pytest

from aiida import orm
from aiida.orm.querybuilder import QueryBuilder

GROUP_NAME = 'json-contains'


COMPLEX_JSON_DEPTH_RANGE = [2**i for i in range(4)]
COMPLEX_JSON_BREADTH_RANGE = [2**i for i in range(4)]
LARGE_TABLE_SIZE_RANGE = [2**i for i in range(1, 11)]


def gen_json(depth: int, breadth: int):
def gen_str(n: int, with_digits: bool = True):
population = string.ascii_letters
if with_digits:
population += string.digits
return ''.join(random.choices(population, k=n))

if depth == 0: # random primitive value
# real numbers are not included as their equivalence is tricky
return random.choice(
[
random.randint(-114, 514), # integers
gen_str(6), # strings
random.choice([True, False]), # booleans
None, # nulls
]
)

else:
gen_dict = random.choice([True, False])
data = [gen_json(depth - 1, breadth) for _ in range(breadth)]
if gen_dict:
keys = set()
while len(keys) < breadth:
keys.add(gen_str(6, False))
data = dict(zip(list(keys), data))
return data


def extract_component(data, p: float = -1):
if random.random() < p:
return data

if isinstance(data, dict) and data:
key = random.choice(list(data.keys()))
return {key: extract_component(data[key])}
elif isinstance(data, list) and data:
element = random.choice(data)
return [extract_component(element)]
else:
return data


@pytest.mark.benchmark(group=GROUP_NAME)
@pytest.mark.parametrize('depth', [1, 2, 4, 8])
@pytest.mark.parametrize('breadth', [1, 2, 4])
@pytest.mark.usefixtures('aiida_profile_clean')
def test_deep_json(benchmark, depth, breadth):
lhs = gen_json(depth, breadth)
rhs = extract_component(lhs, p=1.0 / depth)
assert 0 == len(QueryBuilder().append(orm.Dict).all())

orm.Dict(
{
'id': f'{depth}-{breadth}',
'data': lhs,
}
).store()
qb = QueryBuilder().append(
orm.Dict,
filters={
'attributes.data': {'contains': rhs},
},
project=['attributes.id'],
)
qb.all()
result = benchmark(qb.all)
assert len(result) == 1


@pytest.mark.benchmark(group=GROUP_NAME)
@pytest.mark.parametrize('depth', [2])
@pytest.mark.parametrize('breadth', [1, 10, 100])
@pytest.mark.usefixtures('aiida_profile_clean')
def test_wide_json(benchmark, depth, breadth):
lhs = gen_json(depth, breadth)
rhs = extract_component(lhs, p=1.0 / depth)
assert 0 == len(QueryBuilder().append(orm.Dict).all())

orm.Dict(
{
'id': f'{depth}-{breadth}',
'data': lhs,
}
).store()
qb = QueryBuilder().append(
orm.Dict,
filters={
'attributes.data': {'contains': rhs},
},
project=['attributes.id'],
)
qb.all()
result = benchmark(qb.all)
assert len(result) == 1


@pytest.mark.benchmark(group=GROUP_NAME)
@pytest.mark.parametrize('num_entries', LARGE_TABLE_SIZE_RANGE)
@pytest.mark.usefixtures('aiida_profile_clean')
def test_large_table(benchmark, num_entries):
data = gen_json(2, 10)
rhs = extract_component(data)
assert 0 == len(QueryBuilder().append(orm.Dict).all())

for i in range(num_entries):
orm.Dict(
{
'id': f'N={num_entries}, i={i}',
'data': data,
}
).store()
qb = QueryBuilder().append(
orm.Dict,
filters={
'attributes.data': {'contains': rhs},
},
project=['attributes.id'],
)
qb.all()
result = benchmark(qb.all)
assert len(result) == num_entries
Loading
Loading