From ed3143a4eabe41caf1a6c752a4e57e565a571f0d Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Wed, 28 Feb 2024 18:00:10 +0000 Subject: [PATCH 01/10] Adding methods to enable cache RW --- msticpy/common/cache/codec.py | 40 +++++++++++++++++++++++++++++++++++ requirements-all.txt | 1 + requirements.txt | 1 + 3 files changed, 42 insertions(+) create mode 100644 msticpy/common/cache/codec.py diff --git a/msticpy/common/cache/codec.py b/msticpy/common/cache/codec.py new file mode 100644 index 000000000..f0d3164e1 --- /dev/null +++ b/msticpy/common/cache/codec.py @@ -0,0 +1,40 @@ +"""Functions to encode/decode cached objects.""" + +import base64 +import json +import logging +from collections.abc import MutableMapping +from hashlib import sha256 +from io import BytesIO + +import compress_pickle # type: ignore[import-untyped] + +from ...datamodel.result import QueryResult + +from ..._version import VERSION + +__version__ = VERSION +__author__ = "Florian Bracq" + +LOGGER: logging.Logger = logging.getLogger(__name__) + + +def encode_as_base64_pickle(data: QueryResult) -> str: + """Encode data as Base64 pickle to be written to cache.""" + with BytesIO() as bytes_io: + compress_pickle.dump(data, bytes_io, compression="lzma") + return base64.b64encode(bytes_io.getvalue()).decode() + + +def decode_base64_as_pickle(b64_string: str) -> QueryResult: + """Decode Base64 pickle from cache to Results.""" + return compress_pickle.loads(base64.b64decode(b64_string), compression="lzma") + + +def compute_digest(obj: MutableMapping) -> str: + """Compute the digest from the parameters.""" + str_params: str = json.dumps(obj, sort_keys=True, default=str) + LOGGER.debug("Received: %s", str_params) + digest: str = sha256(bytes(str_params, "utf-8")).hexdigest() + LOGGER.debug("Generated digest: %s", digest) + return digest diff --git a/requirements-all.txt b/requirements-all.txt index 727c282e7..a34135f9e 100644 --- a/requirements-all.txt +++ b/requirements-all.txt @@ -16,6 +16,7 @@ azure-monitor-query>=1.0.0, <=2.0.0 azure-storage-blob>=12.5.0 beautifulsoup4>=4.0.0 bokeh>=1.4.0, <4.0.0 +compress-pickle >= 2.1.0 cryptography>=3.1 deprecated>=1.2.4 dnspython>=2.0.0, <3.0.0 diff --git a/requirements.txt b/requirements.txt index eea84c4fb..ae2cb23d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ azure-mgmt-subscription>=3.0.0 azure-monitor-query>=1.0.0, <=2.0.0 beautifulsoup4>=4.0.0 bokeh>=1.4.0, <4.0.0 +compress-pickle >= 2.1.0 cryptography>=3.1 deprecated>=1.2.4 dnspython>=2.0.0, <3.0.0 From b24198fa38b2689f9a32a11042e670ea83f5d1f4 Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Wed, 28 Feb 2024 18:00:50 +0000 Subject: [PATCH 02/10] Create object to store cached results --- msticpy/datamodel/result.py | 60 +++++++++++++++++++++++++++++++++++++ requirements-all.txt | 1 + requirements.txt | 1 + 3 files changed, 62 insertions(+) create mode 100644 msticpy/datamodel/result.py diff --git a/msticpy/datamodel/result.py b/msticpy/datamodel/result.py new file mode 100644 index 000000000..554ac85d3 --- /dev/null +++ b/msticpy/datamodel/result.py @@ -0,0 +1,60 @@ +"""Define standard models for query results.""" +from __future__ import annotations + +import datetime as dt +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +from dataclasses_json import dataclass_json +from typing_extensions import Self + +if TYPE_CHECKING: + from collections.abc import Hashable + + import pandas as pd + + +@dataclass_json +@dataclass +class QueryResult: + """DataFrame model.""" + + name: str 
+ query: str + raw_results: pd.DataFrame + arguments: dict[str, Any] = field(default_factory=dict) + timestamp: dt.datetime = field(default_factory=dt.datetime.utcnow) + + @property + def normalizer(self: Self) -> str: + """Normalizer class name.""" + return str(self.__class__.__name__) + + @property + def total_results(self: Self) -> int: + """Total Results.""" + return len(self.results) + + @property + def results(self: Self) -> list[dict[Hashable, Any]]: + """Return results as a List of dicts.""" + return self.raw_results.to_dict(orient="records") + + def _repr_markdown_(self: Self) -> str: + """Represent as markdown.""" + return self.raw_results.to_html(index=False) + + def _repr_html_(self: Self) -> str: + """Represent as HTML.""" + return self.raw_results.to_html(index=False) + + def __eq__(self: Self, __value: object) -> bool: + """Return True if self and __value are equal.""" + if not isinstance(__value, QueryResult): + return False + return ( + self.name == __value.name + and self.query == __value.query + and len(self.arguments) == len(__value.arguments) + and self.raw_results.equals(__value.raw_results) + ) diff --git a/requirements-all.txt b/requirements-all.txt index a34135f9e..8ae813cd9 100644 --- a/requirements-all.txt +++ b/requirements-all.txt @@ -18,6 +18,7 @@ beautifulsoup4>=4.0.0 bokeh>=1.4.0, <4.0.0 compress-pickle >= 2.1.0 cryptography>=3.1 +dataclasses-json >= 0.6.4 deprecated>=1.2.4 dnspython>=2.0.0, <3.0.0 folium>=0.9.0 diff --git a/requirements.txt b/requirements.txt index ae2cb23d7..6161229fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ beautifulsoup4>=4.0.0 bokeh>=1.4.0, <4.0.0 compress-pickle >= 2.1.0 cryptography>=3.1 +dataclasses-json >= 0.6.4 deprecated>=1.2.4 dnspython>=2.0.0, <3.0.0 folium>=0.9.0 From 40b9c3092ca1854b14cbed706bd10debd12ae92f Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Wed, 28 Feb 2024 18:01:13 +0000 Subject: [PATCH 03/10] Add methods to write cache to NB cells and file --- msticpy/common/cache/__init__.py | 95 +++++++++++++++++++++++++++ msticpy/common/cache/cell.py | 109 +++++++++++++++++++++++++++++++ msticpy/common/cache/file.py | 48 ++++++++++++++ 3 files changed, 252 insertions(+) create mode 100644 msticpy/common/cache/__init__.py create mode 100644 msticpy/common/cache/cell.py create mode 100644 msticpy/common/cache/file.py diff --git a/msticpy/common/cache/__init__.py b/msticpy/common/cache/__init__.py new file mode 100644 index 000000000..8b6806086 --- /dev/null +++ b/msticpy/common/cache/__init__.py @@ -0,0 +1,95 @@ +"""Common methods to handle caching.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +from ...datamodel.result import QueryResult +from ..utility.ipython import is_ipython +from . import cell +from . import file as cache_file +from .codec import compute_digest + +if TYPE_CHECKING: + import pandas as pd + +LOGGER: logging.Logger = logging.getLogger(__name__) + +# Define the maximum size of data that can be cached within a notebook. 
+MAX_INLINE_CACHE_SIZE = 10000
+
+
+def write_cache(  # noqa: PLR0913
+    data: pd.DataFrame,
+    search_params: dict[str, Any],
+    query: str,
+    name: str,
+    cache_path: str | None = None,
+    *,
+    display: bool = False,
+) -> None:
+    """Cache query result in a cell or an archive file."""
+    cache_digest: str = compute_digest(search_params)
+    cache: QueryResult = QueryResult(
+        name=name,
+        query=query,
+        raw_results=data,
+        arguments=search_params,
+    )
+    if is_ipython() and display:
+        if len(data) < MAX_INLINE_CACHE_SIZE:
+            cell.write_cache(
+                cache,
+                name,
+                cache_digest,
+            )
+        else:
+            LOGGER.warning(
+                "Too much data (%d rows) to write to cache inline",
+                len(data),
+            )
+    if cache_path:
+        LOGGER.info("Writing cache to %s", cache_path)
+        cache_file.write_cache(
+            data=cache,
+            file_name=cache_digest,
+            export_folder=cache_path,
+        )
+
+
+def read_cache(
+    search_params: dict[str, Any],
+    cache_path: str | None,
+    name: str | None = None,
+) -> QueryResult:
+    """Retrieve result from cache in a cell or an archive file."""
+    if not cache_path:
+        error_msg: str = "Cache not provided."
+        raise ValueError(error_msg)
+    cache_digest: str = compute_digest(search_params)
+    if is_ipython():
+        try:
+            return cell.read_cache(
+                name or cache_digest,
+                cache_digest,
+                cache_path,
+            )
+        except ValueError:
+            pass
+    try:
+        cache: QueryResult = cache_file.read_cache(
+            cache_digest,
+            cache_path,
+        )
+    except FileNotFoundError as exc:
+        error_msg = "Could not read from cache."
+        raise ValueError(error_msg) from exc
+    if is_ipython():
+        # Write the cache back to the cell since it was not found there.
+        cell.write_cache(
+            cache,
+            name or cache_digest,
+            cache_digest,
+        )
+    return cache
diff --git a/msticpy/common/cache/cell.py b/msticpy/common/cache/cell.py
new file mode 100644
index 000000000..0dca37756
--- /dev/null
+++ b/msticpy/common/cache/cell.py
@@ -0,0 +1,109 @@
+"""Handle caching in Notebook cell."""
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Any
+
+import nbformat
+from IPython.display import display
+
+from ...datamodel.result import QueryResult
+from .codec import decode_base64_as_pickle, encode_as_base64_pickle
+
+LOGGER: logging.Logger = logging.getLogger(__name__)
+MAX_DISPLAY_SIZE: int = 5
+
+
+def write_cache(
+    data: QueryResult,
+    name: str,
+    digest: str,
+) -> None:
+    """Cache content in cell."""
+    cache: str = encode_as_base64_pickle(data)
+    metadata: dict[str, Any] = {
+        "data": cache,
+        "hash": digest,
+    }
+    if isinstance(data, QueryResult):
+        metadata.update(
+            {
+                "name": name,
+                "query": data.query,
+                "arguments": data.arguments,
+                "timestamp": data.timestamp,
+            },
+        )
+    LOGGER.debug("Data %s written to Notebook cache", name)
+    display(
+        data.raw_results.head(MAX_DISPLAY_SIZE),
+        metadata=metadata,
+        exclude=["text/plain"],
+    )
+
+
+def get_cache_item(path: Path, name: str, digest: str) -> dict[str, Any]:
+    """
+    Get named object from cache.
+
+    Parameters
+    ----------
+    path : Path
+        Path to notebook
+    name : str
+        Name of the cached object to search
+    digest : str
+        Hash of the cached object to search
+
+    Returns
+    -------
+    dict[str, Any]
+        Cached object.
+ """ + if not path.exists(): + error_msg: str = "Notebook not found" + raise FileNotFoundError(error_msg) + + notebook: nbformat.NotebookNode = nbformat.reads( + path.read_text(encoding="utf-8"), + as_version=nbformat.current_nbformat, + ) + + try: + cache: dict[str, Any] = next( + iter( + [ + (output.get("metadata", {}) or {}) + for cell in (notebook.cells or []) + for output in (cell.get("outputs", []) or []) + if output.get("metadata", {}).get("hash") == digest + and output.get("metadata", {}).get("name") == name + ], + ), + ) + except StopIteration: + LOGGER.debug("%s not found in %s cache...", digest, path) + cache = {} + + return cache + + +def read_cache(name: str, digest: str, nb_path: str) -> QueryResult: + """Read cache content from file.""" + if not nb_path: + error_msg: str = "Argument nb_path must be defined." + raise ValueError(error_msg) + + notebook_fp: Path = Path(nb_path).absolute() + + if not notebook_fp.exists(): + error_msg = "Notebook not found" + raise FileNotFoundError(error_msg) + + cache: dict[str, Any] = get_cache_item(path=notebook_fp, name=name, digest=digest) + if cache and (data := cache.get("data")): + LOGGER.debug("Digest %s found in cache...", digest) + return decode_base64_as_pickle(data) + error_msg = f"Cache {digest} not found" + raise ValueError(error_msg) diff --git a/msticpy/common/cache/file.py b/msticpy/common/cache/file.py new file mode 100644 index 000000000..b2c0f99e0 --- /dev/null +++ b/msticpy/common/cache/file.py @@ -0,0 +1,48 @@ +"""Handle caching in files.""" +from __future__ import annotations + +import logging +from pathlib import Path +from typing import TYPE_CHECKING + +from .codec import decode_base64_as_pickle, encode_as_base64_pickle + +if TYPE_CHECKING: + from ...datamodel.result import QueryResult + + +LOGGER: logging.Logger = logging.getLogger(__name__) +CACHE_FOLDER_NAME = "artifacts" + + +def write_cache( + data: QueryResult, + file_name: str, + export_folder: str = CACHE_FOLDER_NAME, +) -> None: + """Cache content in file.""" + export_path: Path = Path(export_folder) + if export_path.is_file(): + export_path = export_path.parent / CACHE_FOLDER_NAME + if not export_path.exists(): + export_path.mkdir(exist_ok=True, parents=True) + export_file: Path = export_path / file_name + encoded_text: str = encode_as_base64_pickle(data) + export_file.write_text(encoded_text) + LOGGER.debug("Data written to file %s", export_folder) + + +def read_cache( + file_name: str, + export_folder: str = CACHE_FOLDER_NAME, +) -> QueryResult: + """Read cache content from file.""" + export_path: Path = Path(export_folder) + if export_path.is_file(): + export_path = export_path.parent / CACHE_FOLDER_NAME + export_file: Path = export_path / file_name + if export_file.exists(): + LOGGER.debug("Found data in cache %s", export_file) + encoded_text: str = export_file.read_text() + return decode_base64_as_pickle(encoded_text) + raise FileNotFoundError From 48b4f0deb788160de28febfc96d94d67240a920c Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Wed, 28 Feb 2024 18:03:33 +0000 Subject: [PATCH 04/10] Add first version of data caching invocation --- msticpy/data/core/data_providers.py | 37 ++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/msticpy/data/core/data_providers.py b/msticpy/data/core/data_providers.py index f9ec3455e..7791dc967 100644 --- a/msticpy/data/core/data_providers.py +++ b/msticpy/data/core/data_providers.py @@ -11,6 +11,8 @@ import pandas as pd +from msticpy.datamodel.result import QueryResult + from 
..._version import VERSION from ...common.pkg_config import get_config from ...common.utility import export, valid_pyname @@ -23,6 +25,7 @@ from .query_provider_connections_mixin import QueryProviderConnectionsMixin from .query_provider_utils_mixin import QueryProviderUtilsMixin from .query_store import QueryStore +from ...common.cache import read_cache, write_cache __version__ = VERSION __author__ = "Ian Hellen" @@ -267,6 +270,8 @@ def _execute_query(self, *args, **kwargs) -> Union[pd.DataFrame, Any]: ) query_name = kwargs.pop("query_name") family = kwargs.pop("query_path") + cache_path: Optional[str] = kwargs.pop("cache_path", None) + display: bool = kwargs.pop("display", True) query_source = self.query_store.get_query( query_path=family, query_name=query_name @@ -299,6 +304,7 @@ def _execute_query(self, *args, **kwargs) -> Union[pd.DataFrame, Any]: if split_result is not None: return split_result # if split queries could not be created, fall back to default + query_str = query_source.create_query( formatters=self._query_provider.formatters, **params ) @@ -311,7 +317,36 @@ def _execute_query(self, *args, **kwargs) -> Union[pd.DataFrame, Any]: logger.info( "Running query '%s...' with params: %s", query_str[:40], query_options ) - return self.exec_query(query_str, query_source=query_source, **query_options) + if cache_path: + try: + result: QueryResult = read_cache( + query_options, + cache_path, + query_source.name, + ) + except (ValueError, FileNotFoundError): + logger.info("Data not found in cache.") + return None + logger.info( + "Data found in cache, returning result from past execution %s.", + result.timestamp.isoformat(sep=" ", timespec="seconds"), + ) + if result.raw_results is not None: + return result.raw_results + + query_result: pd.DataFrame = self.exec_query( + query_str, query_source=query_source, **query_options + ) + + write_cache( + data=query_result, + query=query_str, + search_params=query_options, + cache_path=cache_path, + name=query_source.name, + display=display, + ) + return query_result def _check_for_time_params(self, params, missing) -> bool: """Fall back on builtin query time if no time parameters were supplied.""" From 6e256bd1402a9f19e58da91f04cb715af9721f39 Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Thu, 29 Feb 2024 10:13:33 +0000 Subject: [PATCH 05/10] Fix logic issue --- msticpy/data/core/data_providers.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/msticpy/data/core/data_providers.py b/msticpy/data/core/data_providers.py index 7791dc967..3ff7860cd 100644 --- a/msticpy/data/core/data_providers.py +++ b/msticpy/data/core/data_providers.py @@ -326,13 +326,13 @@ def _execute_query(self, *args, **kwargs) -> Union[pd.DataFrame, Any]: ) except (ValueError, FileNotFoundError): logger.info("Data not found in cache.") - return None - logger.info( - "Data found in cache, returning result from past execution %s.", - result.timestamp.isoformat(sep=" ", timespec="seconds"), - ) - if result.raw_results is not None: - return result.raw_results + else: + logger.info( + "Data found in cache, returning result from past execution %s.", + result.timestamp.isoformat(sep=" ", timespec="seconds"), + ) + if result.raw_results is not None: + return result.raw_results query_result: pd.DataFrame = self.exec_query( query_str, query_source=query_source, **query_options From 2e11acd4fa9ad198c301128d861c150431bb2ab9 Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Thu, 29 Feb 2024 12:30:17 +0000 Subject: [PATCH 06/10] Remove size checks --- 
msticpy/common/cache/__init__.py | 19 +++++-------------- msticpy/common/cache/cell.py | 3 +-- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/msticpy/common/cache/__init__.py b/msticpy/common/cache/__init__.py index 8b6806086..edc252cef 100644 --- a/msticpy/common/cache/__init__.py +++ b/msticpy/common/cache/__init__.py @@ -16,9 +16,6 @@ LOGGER: logging.Logger = logging.getLogger(__name__) -# Define the maximum size of data that can be cached within a notebook. -MAX_INLINE_CACHE_SIZE = 10000 - def write_cache( # noqa: PLR0913 data: pd.DataFrame, @@ -38,17 +35,11 @@ def write_cache( # noqa: PLR0913 arguments=search_params, ) if is_ipython() and display: - if len(data) < MAX_INLINE_CACHE_SIZE: - cell.write_cache( - cache, - name, - cache_digest, - ) - else: - LOGGER.warning( - "Too much data (%d rows) to write to cache inline", - len(data), - ) + cell.write_cache( + cache, + name, + cache_digest, + ) if cache_path: LOGGER.info("Writing cache to %s", cache_path) cache_file.write_cache( diff --git a/msticpy/common/cache/cell.py b/msticpy/common/cache/cell.py index 0dca37756..243f17b8d 100644 --- a/msticpy/common/cache/cell.py +++ b/msticpy/common/cache/cell.py @@ -12,7 +12,6 @@ from .codec import decode_base64_as_pickle, encode_as_base64_pickle LOGGER: logging.Logger = logging.getLogger(__name__) -MAX_DISPLAY_SIZE: int = 5 def write_cache( @@ -37,7 +36,7 @@ def write_cache( ) LOGGER.debug("Data %s written to Notebook cache", name) display( - data.raw_results.head(MAX_DISPLAY_SIZE), + data.raw_results, metadata=metadata, exclude=["text/plain"], ) From 6ee679d2c82b2f7f30847161ccff891bb5c66d61 Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Thu, 29 Feb 2024 14:08:11 +0000 Subject: [PATCH 07/10] Fix imports --- conda/conda-reqs-pip.txt | 2 ++ msticpy/data/core/data_providers.py | 8 +++----- requirements-all.txt | 3 ++- requirements.txt | 3 ++- tests/test_pkg_imports.py | 1 + 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/conda/conda-reqs-pip.txt b/conda/conda-reqs-pip.txt index 8058a6cb7..b02046b2f 100644 --- a/conda/conda-reqs-pip.txt +++ b/conda/conda-reqs-pip.txt @@ -1,7 +1,9 @@ azure-mgmt-monitor>=2.0.0 azure-mgmt-resourcegraph>=8.0.0 azure-mgmt-subscription>=3.0.0 +dataclasses-json >= 0.5.7 KqlmagicCustom[jupyter-basic,auth_code_clipboard]>=0.1.114.post22 mo-sql-parsing>=8, <9.0.0 +nbformat>=5.9.2 passivetotal>=2.5.3 splunk-sdk>=1.6.0 diff --git a/msticpy/data/core/data_providers.py b/msticpy/data/core/data_providers.py index 3ff7860cd..2b622302d 100644 --- a/msticpy/data/core/data_providers.py +++ b/msticpy/data/core/data_providers.py @@ -11,11 +11,11 @@ import pandas as pd -from msticpy.datamodel.result import QueryResult - from ..._version import VERSION +from ...common.cache import read_cache, write_cache from ...common.pkg_config import get_config from ...common.utility import export, valid_pyname +from ...datamodel.result import QueryResult from ...nbwidgets.query_time import QueryTime from .. 
import drivers from ..drivers.driver_base import DriverBase, DriverProps @@ -25,7 +25,6 @@ from .query_provider_connections_mixin import QueryProviderConnectionsMixin from .query_provider_utils_mixin import QueryProviderUtilsMixin from .query_store import QueryStore -from ...common.cache import read_cache, write_cache __version__ = VERSION __author__ = "Ian Hellen" @@ -271,7 +270,6 @@ def _execute_query(self, *args, **kwargs) -> Union[pd.DataFrame, Any]: query_name = kwargs.pop("query_name") family = kwargs.pop("query_path") cache_path: Optional[str] = kwargs.pop("cache_path", None) - display: bool = kwargs.pop("display", True) query_source = self.query_store.get_query( query_path=family, query_name=query_name @@ -344,7 +342,7 @@ def _execute_query(self, *args, **kwargs) -> Union[pd.DataFrame, Any]: search_params=query_options, cache_path=cache_path, name=query_source.name, - display=display, + display=kwargs.pop("display", True), ) return query_result diff --git a/requirements-all.txt b/requirements-all.txt index 8ae813cd9..0c8b3ed4e 100644 --- a/requirements-all.txt +++ b/requirements-all.txt @@ -18,7 +18,7 @@ beautifulsoup4>=4.0.0 bokeh>=1.4.0, <4.0.0 compress-pickle >= 2.1.0 cryptography>=3.1 -dataclasses-json >= 0.6.4 +dataclasses-json >= 0.5.7 deprecated>=1.2.4 dnspython>=2.0.0, <3.0.0 folium>=0.9.0 @@ -37,6 +37,7 @@ msal>=1.12.0 msal_extensions>=0.3.0 msrest>=0.6.0 msrestazure>=0.6.0 +nbformat>=5.9.2 nest_asyncio>=1.4.0 networkx>=2.2 numpy>=1.15.4 # pandas diff --git a/requirements.txt b/requirements.txt index 6161229fb..1c7583a7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ beautifulsoup4>=4.0.0 bokeh>=1.4.0, <4.0.0 compress-pickle >= 2.1.0 cryptography>=3.1 -dataclasses-json >= 0.6.4 +dataclasses-json >= 0.5.7 deprecated>=1.2.4 dnspython>=2.0.0, <3.0.0 folium>=0.9.0 @@ -27,6 +27,7 @@ msal>=1.12.0 msal_extensions>=0.3.0 msrest>=0.6.0 msrestazure>=0.6.0 +nbformat>=5.9.2 nest_asyncio>=1.4.0 networkx>=2.2 numpy>=1.15.4 # pandas diff --git a/tests/test_pkg_imports.py b/tests/test_pkg_imports.py index 168b34bfa..2e0a58988 100644 --- a/tests/test_pkg_imports.py +++ b/tests/test_pkg_imports.py @@ -35,6 +35,7 @@ "KqlmagicCustom[jupyter-extended]", "sumologic-sdk", "openpyxl", + "compress-pickle", } From ca2eae165ca33591e3b54aac70cfe7a16d8e5352 Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Thu, 7 Mar 2024 09:57:50 +0000 Subject: [PATCH 08/10] Adding tests --- tests/common/cache/__init__.py | 1 + tests/common/cache/test_cell.py | 228 ++++++++++++++++++++++++++++++++ tests/common/cache/test_file.py | 98 ++++++++++++++ tests/common/cache/test_init.py | 161 ++++++++++++++++++++++ tests/conftest.py | 7 + tests/datamodel/test_result.py | 59 +++++++++ tests/fixtures.py | 58 ++++++++ 7 files changed, 612 insertions(+) create mode 100644 tests/common/cache/__init__.py create mode 100644 tests/common/cache/test_cell.py create mode 100644 tests/common/cache/test_file.py create mode 100644 tests/common/cache/test_init.py create mode 100644 tests/conftest.py create mode 100644 tests/datamodel/test_result.py create mode 100644 tests/fixtures.py diff --git a/tests/common/cache/__init__.py b/tests/common/cache/__init__.py new file mode 100644 index 000000000..7cdd7c8cc --- /dev/null +++ b/tests/common/cache/__init__.py @@ -0,0 +1 @@ +"""Tests for cache functions.""" diff --git a/tests/common/cache/test_cell.py b/tests/common/cache/test_cell.py new file mode 100644 index 000000000..aa43d5d79 --- /dev/null +++ b/tests/common/cache/test_cell.py @@ -0,0 +1,228 @@ +"""Testing cell 
notebook cache."""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+from unittest.mock import patch
+
+import nbformat
+import pandas as pd
+import pytest
+import pytest_check as check
+from typing_extensions import Self
+
+from msticpy.common.cache import cell
+from msticpy.datamodel.result import QueryResult
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+def test_pickle_encode_decode(simple_dataframeresult: QueryResult) -> None:
+    """Test method encode_as_base64_pickle and decode_base64_as_pickle."""
+    encoded: str = cell.encode_as_base64_pickle(simple_dataframeresult)
+    check.is_not_none(encoded)
+    check.is_instance(encoded, str)
+    check.greater(len(encoded), 100)
+
+    decoded: QueryResult = cell.decode_base64_as_pickle(encoded)
+    check.is_not_none(decoded)
+    check.is_instance(decoded, QueryResult)
+    check.is_false(decoded.raw_results.empty)
+    check.is_true(simple_dataframeresult.raw_results.equals(decoded.raw_results))
+
+
+def test_write_cache(simple_dataframeresult: QueryResult) -> None:
+    """Test method write_cache."""
+    digest: str = "digest"
+    name: str = "name"
+
+    with patch.object(cell, "display") as patched_display:
+        cell.write_cache(
+            simple_dataframeresult,
+            name,
+            digest,
+        )
+    check.is_true(patched_display.called)
+    check.equal(patched_display.call_count, 1)
+
+    check.equal(len(patched_display.call_args.args), 1)
+
+    data: pd.DataFrame = patched_display.call_args.args[0]
+    check.is_instance(data, pd.DataFrame)
+
+    kwargs: dict[str, Any] = patched_display.call_args.kwargs
+    check.is_not_none(kwargs)
+    check.is_instance(kwargs, dict)
+    check.equal(len(kwargs), 2)
+    check.is_in("metadata", kwargs)
+    check.is_in("exclude", kwargs)
+
+    metadata: dict[str, str] = kwargs["metadata"]
+    check.is_instance(metadata, dict)
+    check.equal(len(metadata), 6)
+    check.is_in("data", metadata)
+    check.is_in("hash", metadata)
+    check.is_in("name", metadata)
+    check.is_in("query", metadata)
+    check.is_in("arguments", metadata)
+    check.is_in("timestamp", metadata)
+
+    meta_data: str = metadata["data"]
+    check.equal(meta_data, cell.encode_as_base64_pickle(simple_dataframeresult))
+    meta_hash: str = metadata["hash"]
+    check.equal(meta_hash, digest)
+    meta_name: str = metadata["name"]
+    check.equal(meta_name, name)
+    meta_query: str = metadata["query"]
+    check.equal(meta_query, simple_dataframeresult.query)
+    meta_args: str = metadata["arguments"]
+    check.equal(meta_args, simple_dataframeresult.arguments)
+    meta_timestamp: str = metadata["timestamp"]
+    check.equal(meta_timestamp, simple_dataframeresult.timestamp)
+
+
+class MyNotebook:  # pylint:disable=too-few-public-methods
+    """Dummy notebook class."""
+
+    def __init__(self: Self, metadata: dict[str, Any] | None = None) -> None:
+        """Init dummy object."""
+        self.cells: list[dict[str, list[dict[str, dict[str, Any]]]]] = [
+            {"outputs": [{"metadata": metadata or {}}]},
+        ]
+
+
+def test_get_cache_item(tmp_path: Path) -> None:
+    """Test method get_cache_item."""
+    digest: str = "digest"
+    name: str = "name"
+
+    # Create file with digest content
+    (tmp_path / "random.ipynb").write_text(digest, encoding="utf-8")
+
+    with patch.object(
+        nbformat,
+        "reads",
+        return_value=MyNotebook({"hash": digest, "name": name}),
+    ):
+        res: dict[str, Any] = cell.get_cache_item(
+            tmp_path / "random.ipynb",
+            name=name,
+            digest=digest,
+        )
+    check.is_instance(res, dict)
+    check.is_in("hash", res)
+    check.is_in("name", res)
+    check.equal(res["hash"], digest)
+    check.equal(res["name"], name)
+
+
+def 
test_get_cache_item_wrong_path(tmp_path: Path) -> None:
+    """Test method get_cache_item with invalid notebook path."""
+    with pytest.raises(FileNotFoundError, match="Notebook not found"):
+        cell.get_cache_item(tmp_path / "random.ipynb", "name", "digest")
+
+
+def test_get_cache_item_wrong_digest(tmp_path: Path) -> None:
+    """Test method get_cache_item with invalid digest."""
+    # Create an empty notebook file
+    (tmp_path / "random.ipynb").write_text("", encoding="utf-8")
+    name: str = "name"
+    digest: str = "digest"
+
+    with patch.object(
+        nbformat,
+        "reads",
+        return_value=MyNotebook({"hash": digest, "name": name}),
+    ):
+        res: dict[str, Any] = cell.get_cache_item(tmp_path / "random.ipynb", name, name)
+    check.is_instance(res, dict)
+    check.equal(len(res), 0)
+
+
+def test_get_cache_item_wrong_name(tmp_path: Path) -> None:
+    """Test method get_cache_item with invalid name."""
+    # Create an empty notebook file
+    (tmp_path / "random.ipynb").write_text("", encoding="utf-8")
+    name: str = "name"
+    digest: str = "digest"
+
+    with patch.object(
+        nbformat,
+        "reads",
+        return_value=MyNotebook({"hash": digest, "name": name}),
+    ):
+        res: dict[str, Any] = cell.get_cache_item(
+            tmp_path / "random.ipynb",
+            digest,
+            digest,
+        )
+    check.is_instance(res, dict)
+    check.equal(len(res), 0)
+
+
+def test_read_cache(tmp_path: Path, simple_dataframeresult: QueryResult) -> None:
+    """Test method read_cache."""
+    nb_path: Path = tmp_path / "test.ipynb"
+    digest: str = "digest"
+    encoded: str = cell.encode_as_base64_pickle(simple_dataframeresult)
+    name: str = "name"
+    # Create a notebook file with placeholder content
+    nb_path.write_text(digest, encoding="utf-8")
+
+    with patch.object(
+        nbformat,
+        "reads",
+        return_value=MyNotebook({"hash": digest, "name": name, "data": encoded}),
+    ):
+        data: QueryResult = cell.read_cache(
+            name=name,
+            digest=digest,
+            nb_path=str(nb_path),
+        )
+    check.is_instance(data, QueryResult)
+    check.is_true(simple_dataframeresult.raw_results.equals(data.raw_results))
+
+
+def test_read_cache_wrong_digest(tmp_path: Path) -> None:
+    """Test method read_cache with an incorrect digest."""
+    nb_path: Path = tmp_path / "test.ipynb"
+    digest: str = "random"
+    name: str = "name"
+    # Create a notebook file with placeholder content
+    nb_path.write_text(digest, encoding="utf-8")
+
+    with patch.object(
+        nbformat,
+        "reads",
+        return_value=MyNotebook({"hash": digest, "name": name}),
+    ), pytest.raises(ValueError, match=f"Cache {name} not found"):
+        cell.read_cache(name, name, str(nb_path))
+
+
+def test_read_cache_wrong_name(tmp_path: Path) -> None:
+    """Test method read_cache with an incorrect name."""
+    nb_path: Path = tmp_path / "test.ipynb"
+    digest: str = "random"
+    name: str = "name"
+    # Create a notebook file with placeholder content
+    nb_path.write_text(digest, encoding="utf-8")
+
+    with patch.object(
+        nbformat,
+        "reads",
+        return_value=MyNotebook({"hash": "hash", "name": name}),
+    ), pytest.raises(ValueError, match=f"Cache {digest} not found"):
+        cell.read_cache(digest, digest, str(nb_path))
+
+
+def test_read_cache_wrong_nb(tmp_path: Path) -> None:
+    """Test method read_cache with incorrect notebook path."""
+    nb_path: Path = tmp_path / "test.ipynb"
+    with pytest.raises(FileNotFoundError, match="Notebook not found"):
+        cell.read_cache("name", "digest", str(nb_path))
+
+
+def test_read_cache_no_nb() -> None:
+    """Test method read_cache with an empty notebook path."""
+    with pytest.raises(ValueError, match="Argument nb_path must be defined."):
+        cell.read_cache("name", "digest", "")
diff --git a/tests/common/cache/test_file.py 
b/tests/common/cache/test_file.py new file mode 100644 index 000000000..ee098418f --- /dev/null +++ b/tests/common/cache/test_file.py @@ -0,0 +1,98 @@ +"""Testing file notebook cache.""" +from pathlib import Path +from typing import Any + +import pandas as pd +import pytest +import pytest_check as check + +from msticpy.common.cache import file +from msticpy.datamodel.result import QueryResult + + +def test_read_write_cache(tmp_path: Path, simple_dataframeresult: QueryResult) -> None: + """Test method to read and write cache.""" + file_name: str = "digest" + file_path: Path = tmp_path / file_name + + file.write_cache(simple_dataframeresult, file_name, str(tmp_path)) + + check.is_true(file_path.exists()) + check.is_true(file_path.stat().st_size > 0) + check.is_true(file_path.is_file()) + + res: QueryResult = file.read_cache(file_name, export_folder=str(tmp_path)) + check.is_instance(res.raw_results, pd.DataFrame) + check.is_false(res.raw_results.empty) + try: + check.is_true(res.raw_results.equals(simple_dataframeresult.raw_results)) + except ValueError as exc: + df: pd.DataFrame = res.raw_results + for i in range(df.shape[0]): + ref: Any = df.value.iloc[i] + if isinstance(ref, dict): + check.is_instance( + simple_dataframeresult.raw_results.value.iloc[i], + dict, + ) + else: + error_msg = "DataFrame comparison is only working for dict." + raise NotImplementedError(error_msg) from exc + + +def test_read_cache_from_missing_file() -> None: + """Test to read cache when file does not exist.""" + file_name: str = "does_not_exist" + + with pytest.raises(FileNotFoundError): + file.read_cache(file_name) + + +def test_write_cache_without_export_path( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + simple_dataframeresult: QueryResult, +) -> None: + """Test to write cache without providing export path.""" + monkeypatch.chdir(tmp_path) + file_name: str = "digest" + file_path: Path = Path(file.CACHE_FOLDER_NAME) / file_name + + file.write_cache(simple_dataframeresult, file_name) + + check.is_true(file_path.exists()) + check.is_true(file_path.stat().st_size > 0) + check.is_true(file_path.is_file()) + + +def test_read_write_cache_with_file( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + simple_dataframeresult: QueryResult, +) -> None: + """Test to write cache when providing a file as an export path.""" + monkeypatch.chdir(tmp_path) + + export_path_file: Path = Path("random_file") + export_path_file.touch() + check.is_true(export_path_file.exists()) + check.is_true(export_path_file.is_file()) + + file_name: str = "digest" + file_path: Path = export_path_file.parent / file.CACHE_FOLDER_NAME / file_name + + file.write_cache( + simple_dataframeresult, + file_name, + export_folder=str(export_path_file), + ) + + check.is_true(file_path.exists()) + check.is_true(file_path.stat().st_size > 0) + check.is_true(file_path.is_file()) + + res: QueryResult = file.read_cache( + file_name, + export_folder=str(export_path_file), + ) + check.equal(simple_dataframeresult, res) diff --git a/tests/common/cache/test_init.py b/tests/common/cache/test_init.py new file mode 100644 index 000000000..d4fc70f51 --- /dev/null +++ b/tests/common/cache/test_init.py @@ -0,0 +1,161 @@ +"""Testing generic cache methods.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Any +from unittest.mock import patch + +import pytest +import pytest_check as check + +from msticpy.common import cache +from msticpy.common.cache import cell, file, read_cache, write_cache + +if TYPE_CHECKING: + from pathlib import 
Path
+
+    import pandas as pd
+
+    from msticpy.datamodel.result import QueryResult
+
+
+def test_write_cache_cell(tmp_path: Path, simple_dataframe: pd.DataFrame) -> None:
+    """Test method to write cache from a cell."""
+    params: dict[str, Any] = {"key": "digest"}
+
+    with patch.object(
+        cell,
+        "write_cache",
+        return_value=None,
+    ) as mocked_cache, patch.object(
+        cache,
+        "is_ipython",
+        return_value=True,
+    ) as mocked_ipython:
+        # "Real" tests are managed in the cell.write_cache test
+        write_cache(
+            simple_dataframe,
+            params,
+            str(tmp_path),
+            "name",
+            display=True,
+        )
+
+    check.is_true(mocked_ipython.called)
+    check.equal(mocked_ipython.call_count, 1)
+
+    check.is_true(mocked_cache.called)
+    check.equal(mocked_cache.call_count, 1)
+
+
+def test_write_cache_file(tmp_path: Path, simple_dataframe: pd.DataFrame) -> None:
+    """Test method to write cache to a file."""
+    params: dict[str, Any] = {"key": "digest"}
+
+    with patch.object(cell, "write_cache", return_value=None):
+        # "Real" tests are managed in the file.write_cache test
+        write_cache(
+            data=simple_dataframe,
+            search_params=params,
+            query="query",
+            cache_path=str(tmp_path),
+            name="name",
+        )
+
+
+def test_read_cache_no_path() -> None:
+    """Test method to read cache without an export path."""
+    params: dict[str, Any] = {"key": "digest"}
+
+    with pytest.raises(ValueError, match="Cache not provided."):
+        read_cache(params, "", "name")
+
+
+def test_read_cache_cell(
+    tmp_path: Path,
+    simple_dataframeresult: QueryResult,
+) -> None:
+    """Test method to read cache from a cell."""
+    params: dict[str, Any] = {"key": "digest"}
+
+    with patch.object(
+        cell,
+        "read_cache",
+        return_value=simple_dataframeresult,
+    ) as mocked_read_cache, patch.object(
+        cache,
+        "is_ipython",
+        return_value=True,
+    ) as mocked_ipython:
+        # "Real" tests are managed in the cell.read_cache test
+        read_cache(search_params=params, cache_path=str(tmp_path), name="name")
+
+    check.is_true(mocked_ipython.called)
+    check.equal(mocked_ipython.call_count, 1)
+
+    check.is_true(mocked_read_cache.called)
+    check.equal(mocked_read_cache.call_count, 1)
+
+
+def test_read_cache_cell_cache_not_found(
+    tmp_path: Path,
+    simple_dataframeresult: QueryResult,
+) -> None:
+    """Test method to read cache from a cell when the cell cache is missing."""
+    params: dict[str, Any] = {"key": "digest"}
+
+    with patch.object(
+        cell,
+        "read_cache",
+        side_effect=ValueError,
+    ) as mocked_read_cache_cell, patch.object(
+        file,
+        "read_cache",
+        return_value=simple_dataframeresult,
+    ) as mocked_read_cache_file, patch.object(
+        cache,
+        "is_ipython",
+        return_value=True,
+    ) as mocked_ipython, patch.object(
+        cell,
+        "write_cache",
+    ) as mocked_write_cache_cell:
+        # "Real" tests are managed in the cell.read_cache test
+        read_cache(search_params=params, cache_path=str(tmp_path), name="name")
+
+    check.is_true(mocked_ipython.called)
+    check.equal(mocked_ipython.call_count, 2)
+
+    check.is_true(mocked_read_cache_cell.called)
+    check.equal(mocked_read_cache_cell.call_count, 1)
+
+    # When reading from a cell, if the content is not found, a failover to a file is attempted
+    check.is_true(mocked_read_cache_file.called)
+    check.equal(mocked_read_cache_file.call_count, 1)
+
+    # Additionally, the cell cache must be re-written
+    check.is_true(mocked_write_cache_cell.called)
+    check.equal(mocked_write_cache_cell.call_count, 1)
+
+
+def test_read_cache_file(
+    tmp_path: Path,
+    simple_dataframeresult: QueryResult,
+) -> None:
+    """Test method to read cache from a file."""
+    params: dict[str, Any] = {"key": "digest"} 
+
+    with patch.object(file, "read_cache", return_value=simple_dataframeresult):
+        # "Real" tests are managed in the file.read_cache test
+        read_cache(search_params=params, cache_path=str(tmp_path), name="name")
+
+
+def test_read_cache_file_not_exist(tmp_path: Path) -> None:
+    """Test method to read cache from a non-existing file."""
+    params: dict[str, Any] = {"key": "digest"}
+
+    with patch.object(file, "read_cache", side_effect=FileNotFoundError), patch.object(
+        cache,
+        "is_ipython",
+        return_value=False,
+    ), pytest.raises(ValueError, match="Could not read from cache."):
+        read_cache(params, str(tmp_path), "name")
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 000000000..d7385f19f
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,7 @@
+"""Pytest configuration for tests module."""
+
+from .fixtures import (  # noqa: F401 # pylint: disable=W0611
+    generate_sample_data,
+    generate_simple_dataframe,
+    generate_simple_dataframeresult,
+)
diff --git a/tests/datamodel/test_result.py b/tests/datamodel/test_result.py
new file mode 100644
index 000000000..5abbf570a
--- /dev/null
+++ b/tests/datamodel/test_result.py
@@ -0,0 +1,59 @@
+"""Tests for Query Result datamodel."""
+
+import pandas as pd
+import pytest_check as check
+
+from msticpy.datamodel.result import QueryResult
+
+
+def test_normalizer(simple_dataframeresult: QueryResult) -> None:
+    """Test attribute normalizer from QueryResult."""
+    check.is_instance(simple_dataframeresult.normalizer, str)
+    check.equal(simple_dataframeresult.normalizer, "QueryResult")
+
+
+def test_total_results(simple_dataframeresult: QueryResult) -> None:
+    """Test attribute total_results from QueryResult."""
+    check.is_instance(simple_dataframeresult.total_results, int)
+    check.greater_equal(simple_dataframeresult.total_results, 0)
+
+
+def test_results(simple_dataframeresult: QueryResult) -> None:
+    """Test attribute results from QueryResult."""
+    check.is_instance(simple_dataframeresult.results, list)
+    check.greater_equal(len(simple_dataframeresult.results), 0)
+
+
+def test__repr_markdown_(simple_dataframeresult: QueryResult) -> None:
+    """Test method _repr_markdown_ from QueryResult."""
+    res: str = (
+        QueryResult._repr_markdown_(  # noqa: SLF001 #pylint: disable=protected-access
+            simple_dataframeresult
+        )
+    )
+    check.is_instance(res, str)
+    check.greater_equal(len(res), 0)
+
+
+def test__repr_html_(simple_dataframeresult: QueryResult) -> None:
+    """Test method _repr_html_ from QueryResult."""
+    res: str = (
+        QueryResult._repr_html_(  # noqa: SLF001 #pylint: disable=protected-access
+            simple_dataframeresult
+        )
+    )
+    check.is_instance(res, str)
+    check.greater_equal(len(res), 0)
+
+
+def test__eq_(simple_dataframeresult: QueryResult) -> None:
+    """Test method __eq__ from QueryResult."""
+    check.equal(simple_dataframeresult, simple_dataframeresult)
+    other_sample: QueryResult = QueryResult(
+        name="my name",
+        query="my query",
+        raw_results=pd.DataFrame(),
+        arguments={},
+    )
+    check.not_equal(simple_dataframeresult, other_sample)
+    check.not_equal(simple_dataframeresult, 42)
diff --git a/tests/fixtures.py b/tests/fixtures.py
new file mode 100644
index 000000000..f5788001b
--- /dev/null
+++ b/tests/fixtures.py
@@ -0,0 +1,58 @@
+"""Fixtures for testing msticpy."""
+
+import datetime as dt
+from typing import Union
+
+import pandas as pd
+import pytest
+
+from msticpy.datamodel.result import QueryResult
+
+
+@pytest.fixture(
+    params=[
+        "",
+        "TeSt",
+        "test",
+        42,
+        42.42,
+        ["A", 
"B", "c"], + [1, 2, 3], + { + "key_str": "Value", + "key_int": 42, + "key_list": ["A", "B", "c"], + "key_dict": {"A": 33, "B": "C"}, + }, + dt.datetime.now(tz=dt.timezone.utc), + ], + name="sample_data", +) +def generate_sample_data( + request: pytest.FixtureRequest, +) -> Union[str, int, float, list, dict, dt.datetime]: + """Return sample data for pattern matching.""" + return request.param + + +@pytest.fixture(name="simple_dataframe") +def generate_simple_dataframe( + sample_data: Union[str, float, list, dict, dt.datetime], +) -> pd.DataFrame: + """Sample dataframe to test get_raw_data.""" + return pd.DataFrame( + [ + {"key": "A", "value": sample_data}, + ], + ) + + +@pytest.fixture(name="simple_dataframeresult") +def generate_simple_dataframeresult(simple_dataframe: pd.DataFrame) -> QueryResult: + """Sample dataframeresult objects to test get_raw_data.""" + return QueryResult( + name="name", + query="no query", + raw_results=simple_dataframe, + arguments={}, + ) From 562d6ccf796abfcf069abb216f298e3b5b69ab98 Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Thu, 7 Mar 2024 16:02:24 +0000 Subject: [PATCH 09/10] Update test to enforce ipython test to return False --- tests/common/cache/test_init.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/common/cache/test_init.py b/tests/common/cache/test_init.py index d4fc70f51..ee47c0dc2 100644 --- a/tests/common/cache/test_init.py +++ b/tests/common/cache/test_init.py @@ -144,7 +144,13 @@ def test_read_cache_file( """Test method to read cache from a file.""" params: dict[str, Any] = {"key": "digest"} - with patch.object(file, "read_cache", return_value=simple_dataframeresult): + with patch.object( + file, "read_cache", return_value=simple_dataframeresult + ), patch.object( + cache, + "is_ipython", + return_value=False, + ): # "Real" tests are managed in the file.read_cache test" read_cache(search_params=params, cache_path=str(tmp_path), name="name") From 04058bb7b313b8de5291459fef286fee00becdbf Mon Sep 17 00:00:00 2001 From: Florian BRACQ Date: Mon, 25 Mar 2024 15:26:57 +0000 Subject: [PATCH 10/10] Fixing file-based cachingn names to prevent overwrite when the same parameters are used for multiple functions --- msticpy/common/cache/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/msticpy/common/cache/__init__.py b/msticpy/common/cache/__init__.py index edc252cef..57b21bd94 100644 --- a/msticpy/common/cache/__init__.py +++ b/msticpy/common/cache/__init__.py @@ -44,7 +44,7 @@ def write_cache( # noqa: PLR0913 LOGGER.info("Writing cache to %s", cache_path) cache_file.write_cache( data=cache, - file_name=cache_digest, + file_name=f"{name}_{cache_digest}", export_folder=cache_path, ) @@ -70,7 +70,7 @@ def read_cache( pass try: cache: QueryResult = cache_file.read_cache( - cache_digest, + f"{name}_{cache_digest}", cache_path, ) except FileNotFoundError as exc: