Skip to content

Commit

Permalink
Data library (#10)
Browse files Browse the repository at this point in the history
* Initial code for data query library supporting Log Analytics, Security graph and WDATP

* Updates to base64unpack and iocextract. Updating and expanding test cases.

* Adding unit tests and fixing some things

* Adding more unit tests and refactoring the driver/provider structure.

Also simplified the entityschema stuff a little to use internal __dict__ for properties.

* A few fixes + black formatting

* Updating gitignore to ignore vscode settings

* Fixing linting errors

* Flake8 line len and pylint error

* And updating the version to match Pete's PR
  • Loading branch information
ianhelle authored May 29, 2019
1 parent c247063 commit d2b4109
Show file tree
Hide file tree
Showing 35 changed files with 3,469 additions and 525 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,4 @@ venv.bak/
.mypy_cache/
/msticpy.code-workspace
/docs/source/_build/**
**/.vscode*
2 changes: 1 addition & 1 deletion msticpy/_version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Version file."""
VERSION = "0.1.7"
VERSION = "0.1.8"
188 changes: 188 additions & 0 deletions msticpy/data/data_providers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Data provider loader."""
from functools import partial
from os import path
from typing import Union, Any

import pandas as pd

from .drivers import DriverBase, KqlDriver, SecurityGraphDriver
from .query_store import QueryStore
from .param_extractor import extract_query_params
from ..nbtools.query_defns import DataEnvironment
from ..nbtools.utility import export
from .._version import VERSION

__version__ = VERSION
__author__ = "Ian Hellen"

_PROVIDER_DIR = "providers"
_QUERY_DEF_DIR = "queries"

_ENVIRONMENT_DRIVERS = {
DataEnvironment.LogAnalytics: KqlDriver,
DataEnvironment.AzureSecurityCenter: KqlDriver,
DataEnvironment.SecurityGraph: SecurityGraphDriver,
}


class AttribHolder:
    """Empty class used to create hierarchical attributes."""

    def __len__(self):
        """Return number of items in the attribute collection."""
        return len(self.__dict__)

    def __iter__(self):
        """Return iterator over the attribute (name, value) pairs."""
        return iter(self.__dict__.items())


@export
class QueryProvider:
    """
    Container for query store and query execution provider.

    Instances of this class hold the query set and execution
    methods for a specific data environment.
    """

    def __init__(
        self, data_environment: Union[str, DataEnvironment], driver: DriverBase = None
    ):
        """
        Query provider interface to queries.

        Parameters
        ----------
        data_environment : Union[str, DataEnvironment]
            Name or Enum of environment for the QueryProvider
        driver : DriverBase, optional
            Override the builtin driver (query execution class)
            and use your own driver (must inherit from
            `DriverBase`)

        Raises
        ------
        LookupError
            If no driver is registered for `data_environment` in
            _ENVIRONMENT_DRIVERS.

        See Also
        --------
        DataProviderBase : base class for data query providers.

        """
        if isinstance(data_environment, str):
            data_environment = DataEnvironment.parse(data_environment)

        self._environment = data_environment.name

        # If the caller did not supply a driver, instantiate the builtin
        # one registered for this environment.
        if driver is None:
            driver_class = _ENVIRONMENT_DRIVERS[data_environment]
            if issubclass(driver_class, DriverBase):
                driver = driver_class()
            else:
                raise LookupError(
                    "Could not find suitable data provider for",
                    f" {data_environment.name}",
                )

        self._query_provider = driver

        # Find the path of this module and build sub-path
        query_path = path.join(path.dirname(__file__), _QUERY_DEF_DIR)

        # Load data query definitions for environment
        data_environments = QueryStore.import_files(
            source_path=query_path, recursive=True
        )
        self._query_store = data_environments[data_environment.name]

        # Queries are exposed both grouped by family and flat on all_queries
        self.all_queries = AttribHolder()
        self._add_query_functions()

    def connect(self, connection_str: str, **kwargs):
        """
        Connect to data source.

        Parameters
        ----------
        connection_str : str
            Connection string for the data source

        """
        return self._query_provider.connect(connection_str=connection_str, **kwargs)

    def import_query_file(self, query_file: str):
        """
        Import a yaml data source definition.

        Parameters
        ----------
        query_file : str
            Path to the file to import

        """
        self._query_store.import_file(query_file)

    def list_queries(self):
        """
        Return list of family.query in the store.

        Returns
        -------
        Iterable[str]
            List of queries

        """
        return self._query_store.query_names

    def query_help(self, query_name: str):
        """
        Print help for `query_name`.

        Parameters
        ----------
        query_name : str
            Name of the query to show help for.

        """
        self._query_store[query_name].help()

    def _execute_query(self, *args, **kwargs) -> Union[pd.DataFrame, Any]:
        # Internal dispatcher: bound to each query via functools.partial in
        # _add_query_functions, which supplies `query_name` and `data_family`
        # as keyword arguments.
        if not self._query_provider.loaded:
            raise ValueError("Provider is not loaded.")
        if not self._query_provider.connected:
            raise ValueError(
                "No connection to a data source.",
                "Please call connect(connection_str) and retry.",
            )
        query_name = kwargs.pop("query_name")
        family = kwargs.pop("data_family")

        query_source = self._query_store.get_query(
            data_family=family, query_name=query_name
        )
        # Passing "help" or "?" as a positional arg prints usage instead of
        # executing the query.
        if "help" in args or "?" in args:
            query_source.help()
            return None

        params, missing = extract_query_params(query_source, *args, **kwargs)
        if missing:
            # Show usage before failing so the user sees required parameters.
            query_source.help()
            raise ValueError(f"No values found for these parameters: {missing}")

        query_str = query_source.create_query(**params)
        return self._query_provider.query(query_str)

    def _add_query_functions(self):
        """Add queries to the module as callable methods."""
        for qual_query_name in self.list_queries():

            family, query_name = qual_query_name.split(".")
            if not hasattr(self, family):
                setattr(self, family, AttribHolder())
            query_family = getattr(self, family)

            # Create the partial function
            query_func = partial(
                self._execute_query, data_family=family, query_name=query_name
            )
            query_func.__doc__ = self._query_store.get_query(
                family, query_name
            ).create_doc_string()

            setattr(query_family, query_name, query_func)
            setattr(self.all_queries, query_name, query_func)
134 changes: 134 additions & 0 deletions msticpy/data/data_query_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Data query definition reader."""
from typing import Tuple, Dict, Iterable, Any
from pathlib import Path
import yaml

from ..nbtools.query_defns import DataFamily, DataEnvironment
from .._version import VERSION

__version__ = VERSION
__author__ = "Ian Hellen"


def find_yaml_files(source_path: str, recursive: bool = False) -> Iterable[Path]:
    """
    Return iterable of yaml files found in `source_path`.

    Parameters
    ----------
    source_path : str
        The source path to search in.
    recursive : bool, optional
        Whether to recurse through subfolders.
        By default False

    Returns
    -------
    Iterable[Path]
        File paths of yaml files found.

    """
    recurse_pfx = "**/" if recursive else ""
    file_glob = Path(source_path).glob(f"{recurse_pfx}*.yaml")
    # glob can also match directories named *.yaml - yield only real files
    for file_path in file_glob:
        if not file_path.is_file():
            continue
        yield file_path


def read_query_def_file(query_file: str) -> Tuple[Dict, Dict, Dict]:
    """
    Read a yaml data query definition file.

    Parameters
    ----------
    query_file : str
        Path to yaml query definition file

    Returns
    -------
    Tuple[Dict, Dict, Dict]
        Tuple of dictionaries.
        sources - dictionary of query definitions
        defaults - the default parameters from the file
        metadata - the global metadata from the file

    Raises
    ------
    ValueError
        If the file content fails validation (see validate_query_defs).

    """
    # YAML streams are UTF-8 by specification - read explicitly as such.
    with open(query_file, encoding="utf-8") as f_handle:
        # safe_load avoids constructing arbitrary Python objects from
        # untrusted yaml input (unlike yaml.load)
        data_map = yaml.safe_load(f_handle)

    validate_query_defs(query_def_dict=data_map)

    defaults = data_map.get("defaults", {})
    sources = data_map.get("sources", {})
    metadata = data_map.get("metadata", {})

    return sources, defaults, metadata


def validate_query_defs(query_def_dict: Dict[str, Any]) -> bool:
    """
    Validate content of query definition.

    Parameters
    ----------
    query_def_dict : dict
        Dictionary of query definition yaml file contents.

    Returns
    -------
    bool
        True if validation succeeds.

    Raises
    ------
    ValueError
        The validation failure reason is returned in the
        exception message (arg[0])

    """
    # Both top-level sections must be present and non-empty.
    if not query_def_dict.get("sources"):
        raise ValueError("Imported file has no sources defined")
    if not query_def_dict.get("metadata"):
        raise ValueError("Imported file has no metadata defined")

    # data_environments and data_families must each define at least
    # one value.
    _validate_data_categories(query_def_dict)

    return True


def _validate_data_categories(query_def_dict: Dict):
    """
    Validate the data_environments and data_families metadata entries.

    Parameters
    ----------
    query_def_dict : Dict
        Dictionary of query definition yaml file contents
        (must contain a "metadata" key).

    Raises
    ------
    ValueError
        If data_environments or data_families is missing/empty, or
        contains a value not recognized by the corresponding enum.

    """
    if (
        "data_environments" not in query_def_dict["metadata"]
        or not query_def_dict["metadata"]["data_environments"]
    ):
        raise ValueError("Imported file has no data_environments defined")

    for env in query_def_dict["metadata"]["data_environments"]:
        # assumes DataEnvironment.parse returns a falsy value for
        # unknown names - TODO confirm against query_defns
        if not DataEnvironment.parse(env):
            raise ValueError(
                f"Unknown data environment {env} in metadata. ",
                "Valid values are\n",
                ", ".join([e.name for e in DataEnvironment]),
            )

    if (
        "data_families" not in query_def_dict["metadata"]
        or not query_def_dict["metadata"]["data_families"]
    ):
        raise ValueError("Imported file has no data families defined")

    for fam in query_def_dict["metadata"]["data_families"]:
        if not DataFamily.parse(fam):
            raise ValueError(
                f"Unknown data family {fam} in metadata. ",
                "Valid values are\n",
                ", ".join([f.name for f in DataFamily]),
            )
10 changes: 10 additions & 0 deletions msticpy/data/drivers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Data provider sub-package."""
# flake8: noqa: F403
from . driver_base import DriverBase
from . kql_driver import KqlDriver
from . security_graph_driver import SecurityGraphDriver
Loading

0 comments on commit d2b4109

Please sign in to comment.