From e320232d74ff0489c953f33d582ed7e40a8d08a2 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Wed, 1 Feb 2023 11:58:38 -0800 Subject: [PATCH] Automatic cleanup jobs (#1052) --- .../jobs/cleanup/aws_import_emr_cleanup.json | 10 - cartography/graph/cleanupbuilder.py | 180 +++++++++++++ cartography/graph/job.py | 75 +++++- cartography/graph/model.py | 6 +- cartography/graph/querybuilder.py | 23 +- cartography/intel/aws/emr.py | 5 +- .../sample_data/helloworld_relationships.py | 27 +- .../asset_with_non_kwargs_tgm.py | 38 +++ .../cartography/graph/test_cleanupbuilder.py | 253 ++++++++++++++++++ .../cartography/intel/aws/test_emr.py | 73 ++++- tests/unit/cartography/graph/helpers.py | 12 + .../cartography/graph/test_cleanupbuilder.py | 160 +++++++++++ .../test_querybuilder_filter_selected_rels.py | 4 +- 13 files changed, 832 insertions(+), 34 deletions(-) delete mode 100644 cartography/data/jobs/cleanup/aws_import_emr_cleanup.json create mode 100644 cartography/graph/cleanupbuilder.py create mode 100644 tests/data/graph/querybuilder/sample_models/asset_with_non_kwargs_tgm.py create mode 100644 tests/integration/cartography/graph/test_cleanupbuilder.py create mode 100644 tests/unit/cartography/graph/test_cleanupbuilder.py diff --git a/cartography/data/jobs/cleanup/aws_import_emr_cleanup.json b/cartography/data/jobs/cleanup/aws_import_emr_cleanup.json deleted file mode 100644 index 2fd0f23f2..000000000 --- a/cartography/data/jobs/cleanup/aws_import_emr_cleanup.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "statements": [ - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(n:EMRCluster) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DETACH DELETE (n)", - "iterative": true, - "iterationsize": 100 - } - ], - "name": "cleanup EMRCluster" -} diff --git a/cartography/graph/cleanupbuilder.py b/cartography/graph/cleanupbuilder.py new file mode 100644 index 000000000..45a39ed4a --- /dev/null +++ b/cartography/graph/cleanupbuilder.py @@ -0,0 +1,180 @@ +from dataclasses import asdict +from string import Template +from typing import List +from typing import Optional +from typing import Set + +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelSchema +from cartography.graph.model import LinkDirection +from cartography.graph.model import TargetNodeMatcher +from cartography.graph.querybuilder import _build_match_clause +from cartography.graph.querybuilder import filter_selected_relationships +from cartography.graph.querybuilder import rel_present_on_node_schema + + +def build_cleanup_queries( + node_schema: CartographyNodeSchema, + selected_rels: Optional[Set[CartographyRelSchema]] = None, +) -> List[str]: + """ + Generates queries to clean up stale nodes and relationships from the given CartographyNodeSchema. + :param node_schema: The given CartographyNodeSchema to generate cleanup queries for. + :param selected_rels: Optional. If specified, only generate cleanup queries where the `node_schema` is bound to this + given set of selected relationships. Raises an exception if any of the rels in `selected_rels` aren't actually + defined on the `node_schema`. + If `selected_rels` is not specified (default), we generate cleanup queries against all relationships defined on the + `node_schema`. + :return: A list of Neo4j queries to clean up nodes and relationships. Order matters: we always clean up the sub + resource relationship last because we only clean up stale nodes and rels that are associated with a given sub + resource, so if we delete the sub resource first then we will not be able to reach the stale nodes and rels, thus + leaving orphaned objects behind. + Note also that we return the empty list if the node_schema has no relationships. Doing cleanups of nodes without + relationships can be resource expensive for a large graph, and you might risk deleting unintended objects. Please + write a manual cleanup job if you wish to do this. + """ + other_rels = node_schema.other_relationships + sub_resource_rel = node_schema.sub_resource_relationship + + if selected_rels: + # Ensure that the selected rels actually exist on the node_schema + sub_resource_rel, other_rels = filter_selected_relationships(node_schema, selected_rels) + + if not sub_resource_rel: + raise ValueError( + "Auto-creating a cleanup job for a node_schema without a sub resource relationship is not supported. " + f'Please check the class definition of "{node_schema.__class__.__name__}". If the optional `selected_rels` ' + 'param was specified to build_cleanup_queries(), then ensure that the sub resource relationship is ' + 'present.', + ) + + result = [] + if other_rels: + for rel in other_rels.rels: + result.extend(_build_cleanup_node_and_rel_queries(node_schema, rel)) + + # Make sure that the sub resource cleanup job is last in the list; order matters. + result.extend(_build_cleanup_node_and_rel_queries(node_schema, sub_resource_rel)) + # Note that auto-cleanups for a node with no relationships does not happen at all - we don't support it. + return result + + +def _build_cleanup_node_and_rel_queries( + node_schema: CartographyNodeSchema, + selected_relationship: CartographyRelSchema, +) -> List[str]: + """ + Private function that performs the main string template logic for generating cleanup node and relationship queries. + :param node_schema: The given CartographyNodeSchema to generate cleanup queries for. + :param selected_relationship: Determines what relationship on the node_schema to build cleanup queries for. + selected_relationship must be in the set {node_schema.sub_resource_relationship} + node_schema.other_relationships. + :return: A list of 2 cleanup queries. The first one cleans up stale nodes attached to the given + selected_relationships, and the second one cleans up stale selected_relationships. For example outputs, see + tests.unit.cartography.graph.test_cleanupbuilder. + """ + if not node_schema.sub_resource_relationship: + raise ValueError( + f"_build_cleanup_node_query() failed: '{node_schema.label}' does not have a sub_resource_relationship " + "defined, so we cannot generate a query to clean it up. Please verify that the class definition is what " + "you expect.", + ) + if not rel_present_on_node_schema(node_schema, selected_relationship): + raise ValueError( + f"_build_cleanup_node_query(): Attempted to build cleanup query for node '{node_schema.label}' and " + f"relationship {selected_relationship.rel_label} but that relationship is not present on the node. Please " + "verify the node class definition for the relationships that it has.", + ) + + # Draw sub resource rel with correct direction + if node_schema.sub_resource_relationship.direction == LinkDirection.INWARD: + sub_resource_link_template = Template("<-[s:$SubResourceRelLabel]-") + else: + sub_resource_link_template = Template("-[s:$SubResourceRelLabel]->") + sub_resource_link = sub_resource_link_template.safe_substitute( + SubResourceRelLabel=node_schema.sub_resource_relationship.rel_label, + ) + + # The cleanup node query must always be before the cleanup rel query + delete_action_clauses = [ + """ + WHERE n.lastupdated <> $UPDATE_TAG + WITH n LIMIT $LIMIT_SIZE + DETACH DELETE n; + """, + ] + # Now clean up the relationships + if selected_relationship == node_schema.sub_resource_relationship: + _validate_target_node_matcher_for_cleanup_job(node_schema.sub_resource_relationship.target_node_matcher) + delete_action_clauses.append( + """ + WHERE s.lastupdated <> $UPDATE_TAG + WITH s LIMIT $LIMIT_SIZE + DELETE s; + """, + ) + else: + delete_action_clauses.append( + """ + WHERE r.lastupdated <> $UPDATE_TAG + WITH r LIMIT $LIMIT_SIZE + DELETE r; + """, + ) + + # Ensure the node is attached to the sub resource and delete the node + query_template = Template( + """ + MATCH (n:$node_label)$sub_resource_link(:$sub_resource_label{$match_sub_res_clause}) + $selected_rel_clause + $delete_action_clause + """, + ) + return [ + query_template.safe_substitute( + node_label=node_schema.label, + sub_resource_link=sub_resource_link, + sub_resource_label=node_schema.sub_resource_relationship.target_node_label, + match_sub_res_clause=_build_match_clause(node_schema.sub_resource_relationship.target_node_matcher), + selected_rel_clause=( + "" if selected_relationship == node_schema.sub_resource_relationship + else _build_selected_rel_clause(selected_relationship) + ), + delete_action_clause=delete_action_clause, + ) for delete_action_clause in delete_action_clauses + ] + + +def _build_selected_rel_clause(selected_relationship: CartographyRelSchema) -> str: + """ + Draw selected relationship with correct direction. Returns a string that looks like either + MATCH (n)<-[r:$SelectedRelLabel]-(:$other_node_label) or + MATCH (n)-[r:$SelectedRelLabel]->(:$other_node_label) + """ + if selected_relationship.direction == LinkDirection.INWARD: + selected_rel_template = Template("<-[r:$SelectedRelLabel]-") + else: + selected_rel_template = Template("-[r:$SelectedRelLabel]->") + selected_rel = selected_rel_template.safe_substitute(SelectedRelLabel=selected_relationship.rel_label) + selected_rel_clause_template = Template("""MATCH (n)$selected_rel(:$other_node_label)""") + selected_rel_clause = selected_rel_clause_template.safe_substitute( + selected_rel=selected_rel, + other_node_label=selected_relationship.target_node_label, + ) + return selected_rel_clause + + +def _validate_target_node_matcher_for_cleanup_job(tgm: TargetNodeMatcher): + """ + Raises ValueError if a single PropertyRef in the given TargetNodeMatcher does not have set_in_kwargs=True. + Auto cleanups require the sub resource target node matcher to have set_in_kwargs=True because the GraphJob + class injects the sub resource id via a query kwarg parameter. See GraphJob and GraphStatement classes. + This is a private function meant only to be called when we clean up the sub resource relationship. + """ + tgm_asdict = asdict(tgm) + + for key, prop_ref in tgm_asdict.items(): + if not prop_ref.set_in_kwargs: + raise ValueError( + f"TargetNodeMatcher PropertyRefs in the sub_resource_relationship must have set_in_kwargs=True. " + f"{key} has set_in_kwargs=False, please check.", + ) diff --git a/cartography/graph/job.py b/cartography/graph/job.py index 6fec80685..41a91f02a 100644 --- a/cartography/graph/job.py +++ b/cartography/graph/job.py @@ -1,21 +1,59 @@ import json import logging +import string from pathlib import Path +from string import Template from typing import Any from typing import Dict from typing import List from typing import Optional +from typing import Set from typing import Union import neo4j +from cartography.graph.cleanupbuilder import build_cleanup_queries +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelSchema from cartography.graph.statement import get_job_shortname from cartography.graph.statement import GraphStatement - logger = logging.getLogger(__name__) +def _get_identifiers(template: string.Template) -> List[str]: + """ + :param template: A string Template + :return: the variable names that start with a '$' like $this in the given Template. + Stolen from https://github.com/python/cpython/issues/90465#issuecomment-1093941790. + TODO we can get rid of this and use template.get_identifiers() once we are on python 3.11 + """ + return list( + set( + filter( + lambda v: v is not None, + ( + mo.group('named') or mo.group('braced') + for mo in template.pattern.finditer(template.template) + ), + ), + ), + ) + + +def get_parameters(queries: List[str]) -> Set[str]: + """ + :param queries: A list of Neo4j queries with parameters indicated by leading '$' like $this. + :return: The set of all parameters across all given Neo4j queries. + """ + parameter_set = set() + for query in queries: + as_template = Template(query) + params = _get_identifiers(as_template) + parameter_set.update(params) + return parameter_set + + class GraphJobJSONEncoder(json.JSONEncoder): """ Support JSON serialization for GraphJob instances. @@ -86,6 +124,41 @@ def from_json(cls, blob: str, short_name: Optional[str] = None) -> 'GraphJob': name = data["name"] return cls(name, statements, short_name) + @classmethod + def from_node_schema( + cls, + node_schema: CartographyNodeSchema, + parameters: Dict[str, Any], + selected_rels: Optional[Set[CartographyRelSchema]] = None, + ) -> 'GraphJob': + """ + Create a cleanup job from a CartographyNodeSchema object. + For a given node, the fields used in the node_schema.sub_resource_relationship.target_node_node_matcher.keys() + must be provided as keys and values in the params dict. + """ + queries: List[str] = build_cleanup_queries(node_schema, selected_rels) + + # Validate params + expected_param_keys: Set[str] = get_parameters(queries) + actual_param_keys: Set[str] = set(parameters.keys()) + # Hacky, but LIMIT_SIZE is specified by default in cartography.graph.statement, so we exclude it from validation + actual_param_keys.add('LIMIT_SIZE') + if actual_param_keys != expected_param_keys: + raise ValueError( + f'Expected query params "{expected_param_keys}" but got "{actual_param_keys}". Please check the value ' + f'passed to `parameters`.', + ) + + statements: List[GraphStatement] = [ + GraphStatement(query, parameters=parameters, iterative=True, iterationsize=100) for query in queries + ] + + return cls( + f"Cleanup {node_schema.label}", + statements, + node_schema.label, + ) + @classmethod def from_json_file(cls, file_path: Union[str, Path]) -> 'GraphJob': """ diff --git a/cartography/graph/model.py b/cartography/graph/model.py index 43d7b8386..1d6caf6f1 100644 --- a/cartography/graph/model.py +++ b/cartography/graph/model.py @@ -44,7 +44,7 @@ class PropertyRef: (PropertyRef.set_in_kwargs=True). """ - def __init__(self, name: str, set_in_kwargs=False): + def __init__(self, name: str, set_in_kwargs: bool = False): """ :param name: The name of the property :param set_in_kwargs: Optional. If True, the property is not defined on the data dict, and we expect to find the @@ -52,8 +52,8 @@ def __init__(self, name: str, set_in_kwargs=False): If False, looks for the property in the data dict. Defaults to False. """ - self.name = name - self.set_in_kwargs = set_in_kwargs + self.name: str = name + self.set_in_kwargs: bool = set_in_kwargs def _parameterize_name(self) -> str: return f"${self.name}" diff --git a/cartography/graph/querybuilder.py b/cartography/graph/querybuilder.py index 0e8032832..0a21af771 100644 --- a/cartography/graph/querybuilder.py +++ b/cartography/graph/querybuilder.py @@ -267,7 +267,20 @@ def _build_attach_relationships_statement( return query_template.safe_substitute(attach_relationships_statement=attach_relationships_statement) -def _filter_selected_relationships( +def rel_present_on_node_schema( + node_schema: CartographyNodeSchema, + rel_schema: CartographyRelSchema, +) -> bool: + """ + Answers the question: is the given rel_schema is present on the given node_schema? + """ + sub_res_rel, other_rels = filter_selected_relationships(node_schema, {rel_schema}) + if sub_res_rel or other_rels: + return True + return False + + +def filter_selected_relationships( node_schema: CartographyNodeSchema, selected_relationships: Set[CartographyRelSchema], ) -> Tuple[Optional[CartographyRelSchema], Optional[OtherRelationships]]: @@ -277,7 +290,7 @@ def _filter_selected_relationships( :param node_schema: The node schema object to filter relationships against :param selected_relationships: The set of relationships to check if they exist in the node schema. If empty set, this means that no relationships have been selected. None is not an accepted value here. - :return: a tuple of the (sub resource rel [if present in selected_relationships], an OtherRelationships object + :return: a tuple of the shape (sub resource rel [if present in selected_relationships], an OtherRelationships object containing all values of node_schema.other_relationships that are present in selected_relationships) """ # The empty set means no relationships are selected @@ -294,8 +307,8 @@ def _filter_selected_relationships( for selected_rel in selected_relationships: if selected_rel not in all_rels_on_node: raise ValueError( - f"build_ingestion_query() failed: CartographyRelSchema {selected_rel.__class__.__name__} is not " - f"defined on CartographyNodeSchema type {node_schema.__class__.__name__}. Please verify the " + f"filter_selected_relationships() failed: CartographyRelSchema {selected_rel.__class__.__name__} is " + f"not defined on CartographyNodeSchema type {node_schema.__class__.__name__}. Please verify the " f"value of `selected_relationships` passed to `build_ingestion_query()`.", ) @@ -350,7 +363,7 @@ def build_ingestion_query( sub_resource_rel: Optional[CartographyRelSchema] = node_schema.sub_resource_relationship other_rels: Optional[OtherRelationships] = node_schema.other_relationships if selected_relationships or selected_relationships == set(): - sub_resource_rel, other_rels = _filter_selected_relationships(node_schema, selected_relationships) + sub_resource_rel, other_rels = filter_selected_relationships(node_schema, selected_relationships) ingest_query = query_template.safe_substitute( node_label=node_schema.label, diff --git a/cartography/intel/aws/emr.py b/cartography/intel/aws/emr.py index ef16a323b..86287065a 100644 --- a/cartography/intel/aws/emr.py +++ b/cartography/intel/aws/emr.py @@ -9,6 +9,7 @@ import neo4j from cartography.client.core.tx import load_graph_data +from cartography.graph.job import GraphJob from cartography.graph.model import CartographyNodeProperties from cartography.graph.model import CartographyNodeSchema from cartography.graph.model import CartographyRelProperties @@ -20,7 +21,6 @@ from cartography.graph.querybuilder import build_ingestion_query from cartography.intel.aws.ec2.util import get_botocore_config from cartography.util import aws_handle_regions -from cartography.util import run_cleanup_job from cartography.util import timeit logger = logging.getLogger(__name__) @@ -134,7 +134,8 @@ def load_emr_clusters( @timeit def cleanup(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None: logger.debug("Running EMR cleanup job.") - run_cleanup_job('aws_import_emr_cleanup.json', neo4j_session, common_job_parameters) + cleanup_job = GraphJob.from_node_schema(EMRClusterSchema(), common_job_parameters) + cleanup_job.run(neo4j_session) @timeit diff --git a/tests/data/graph/querybuilder/sample_data/helloworld_relationships.py b/tests/data/graph/querybuilder/sample_data/helloworld_relationships.py index 4eaf902e8..e4f67a97c 100644 --- a/tests/data/graph/querybuilder/sample_data/helloworld_relationships.py +++ b/tests/data/graph/querybuilder/sample_data/helloworld_relationships.py @@ -28,6 +28,19 @@ }, ] +# This dataset shows an InterestingNode attached to a SubResource and a HelloAsset, but no WorldAsset. +INTERESTING_NODE_NO_WORLD_ASSET = [ + { + 'Id': 'interesting-node-id', + 'property1': 'b', + 'property2': 'c', + 'AnotherField': 'd', + 'YetAnotherRelField': 'e', + 'hello_asset_id': 'the-helloasset-id-1', + 'sub_resource_id': 'sub-resource-id', + }, +] + # This dataset shows an InterestingNode attached to a HelloAsset and a WorldAsset. INTERESTING_NODE_WITH_ALL_RELS = [ { @@ -37,6 +50,18 @@ 'AnotherField': 'd', 'YetAnotherRelField': 'e', 'world_asset_id': 'the-worldasset-id-1', - 'hello_asset_id': 'the-helloasset_id-1', + 'hello_asset_id': 'the-helloasset-id-1', + 'sub_resource_id': 'sub-resource-id', + }, +] + +INTERESTING_NODE_SUB_RES_ONLY = [ + { + 'Id': 'interesting-node-id', + 'property1': 'b', + 'property2': 'c', + 'AnotherField': 'd', + 'YetAnotherRelField': 'e', + 'sub_resource_id': 'sub-resource-id', }, ] diff --git a/tests/data/graph/querybuilder/sample_models/asset_with_non_kwargs_tgm.py b/tests/data/graph/querybuilder/sample_models/asset_with_non_kwargs_tgm.py new file mode 100644 index 000000000..17b04abda --- /dev/null +++ b/tests/data/graph/querybuilder/sample_models/asset_with_non_kwargs_tgm.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelProperties +from cartography.graph.model import CartographyRelSchema +from cartography.graph.model import LinkDirection +from cartography.graph.model import make_target_node_matcher +from cartography.graph.model import PropertyRef +from cartography.graph.model import TargetNodeMatcher +from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeProperties + + +@dataclass(frozen=True) +class FakeEC2InstanceToAWSAccountRelProps(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class FakeEC2InstanceToAWSAccount(CartographyRelSchema): + """ + The PropertyRef is intentionally set to False: we expect the unit test to raise an exception. + Auto cleanups require the sub resource target node matcher to have set_in_kwargs=True because of how the GraphJob + class does query parameterization. + """ + target_node_label: str = 'AWSAccount' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('AWS_ID', set_in_kwargs=False)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: FakeEC2InstanceToAWSAccountRelProps = FakeEC2InstanceToAWSAccountRelProps() + + +@dataclass(frozen=True) +class FakeEC2InstanceSchema(CartographyNodeSchema): + label: str = 'FakeEC2Instance' + properties: SimpleNodeProperties = SimpleNodeProperties() + sub_resource_relationship: FakeEC2InstanceToAWSAccount = FakeEC2InstanceToAWSAccount() diff --git a/tests/integration/cartography/graph/test_cleanupbuilder.py b/tests/integration/cartography/graph/test_cleanupbuilder.py new file mode 100644 index 000000000..0ba7eb842 --- /dev/null +++ b/tests/integration/cartography/graph/test_cleanupbuilder.py @@ -0,0 +1,253 @@ +from cartography.client.core.tx import load_graph_data +from cartography.graph.job import GraphJob +from cartography.graph.querybuilder import build_ingestion_query +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import INTERESTING_NODE_NO_WORLD_ASSET +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import INTERESTING_NODE_SUB_RES_ONLY +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import INTERESTING_NODE_WITH_ALL_RELS +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import MERGE_HELLO_ASSET_QUERY +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import MERGE_SUB_RESOURCE_QUERY +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import MERGE_WORLD_ASSET_QUERY +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetSchema +from tests.integration.util import check_nodes +from tests.integration.util import check_rels + + +# TODO: Once we have more real modules using cartography data objects, add a test using them instead of these fake objs. +def test_cleanup_interesting_asset_end_to_end_only_sub_res_remains(neo4j_session): + """ + Arrange + Create paths + (i:InterestingAsset{id:'interesting-node-id'})<-[:RELATIONSHIP_LABEL]-(:SubResource{id:sub-resource-id}), + (i)-[:ASSOCIATED_WITH]->(:HelloAsset{id: the-helloasset-id-1}), + (i)<-[:CONNECTED]-(:WorldAsset{id: world-asset-id}) + at timestamp lastupdated 1. + + Act + Suppose only InterestingAsset and its SubResource connection exist now at timestamp lastupdated=2. + Run cleanup. + + Assert + That only the rels to the InterestingAsset and SubResource exist and all other rels have been cleaned up. + """ + # Arrange: add these nodes and rels to the graph, all with lastupdated=1: + neo4j_session.run(MERGE_SUB_RESOURCE_QUERY) + neo4j_session.run(MERGE_HELLO_ASSET_QUERY) + neo4j_session.run(MERGE_WORLD_ASSET_QUERY) + + query = build_ingestion_query(InterestingAssetSchema()) + load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_WITH_ALL_RELS, + lastupdated=1, + sub_resource_id='sub-resource-id', + ) + # Sanity checks to verify that the relationships exist. + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'WorldAsset', + 'id', + 'CONNECTED', + rel_direction_right=False, + ) == {('interesting-node-id', 'the-worldasset-id-1')} + + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'HelloAsset', + 'id', + 'ASSOCIATED_WITH', + rel_direction_right=True, + ) == {('interesting-node-id', 'the-helloasset-id-1')} + + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'SubResource', + 'id', + 'RELATIONSHIP_LABEL', + rel_direction_right=False, + ) == {('interesting-node-id', 'sub-resource-id')} + + # Arrange: Suppose only InterestingAsset and its SubResource connection exist now at timestamp lastupdated=2. + load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_SUB_RES_ONLY, + lastupdated=2, + sub_resource_id='sub-resource-id', + ) + + # Act: Now actually generate and run the cleanup job. + cleanup_job = GraphJob.from_node_schema( + InterestingAssetSchema(), + {'UPDATE_TAG': 2, 'sub_resource_id': 'sub-resource-id'}, + ) + cleanup_job.run(neo4j_session) + + # Assert + # The rel between InterestingAsset and WorldAsset was stale (lastupdated != 2) and was cleaned up, + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'WorldAsset', + 'id', + 'CONNECTED', + rel_direction_right=True, + ) == set() + # but we don't delete the WorldAsset itself, as that cleanup should be handled by the WorldAsset's own intel module. + assert check_nodes(neo4j_session, 'WorldAsset', ['id']) == {('the-worldasset-id-1',)} + + # Same thing here: the rel between InterestingAsset and HelloAsset was stale (lastupdated != 2) and was cleaned up, + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'HelloAsset', + 'id', + 'ASSOCIATED_WITH', + rel_direction_right=False, + ) == set() + # but we don't delete the HelloAsset itself, as that cleanup should be handled by the HelloAsset's own intel module. + assert check_nodes(neo4j_session, 'HelloAsset', ['id']) == {('the-helloasset-id-1',)} + + # This is the only rel we expect to remain intact as it was last updated in our call to `load_graph_data()` above. + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'SubResource', + 'id', + 'RELATIONSHIP_LABEL', + rel_direction_right=False, + ) == {('interesting-node-id', 'sub-resource-id')} + # And we expect to be able to retrieve fields from the InterestingAsset, + assert check_nodes(neo4j_session, 'InterestingAsset', ['id', 'property1', 'property2', 'lastupdated']) == { + ('interesting-node-id', 'b', 'c', 2), + } + # and from the SubResource: + assert check_nodes(neo4j_session, 'SubResource', ['id', 'lastupdated']) == {('sub-resource-id', 1)} + + +def test_cleanup_interesting_asset_end_to_end_no_world_asset(neo4j_session): + """ + Arrange + Create paths + (i:InterestingAsset{id:'interesting-node-id'})<-[:RELATIONSHIP_LABEL]-(:SubResource{id:sub-resource-id}), + (i)-[:ASSOCIATED_WITH]->(:HelloAsset{id: the-helloasset-id-1}), + (i)<-[:CONNECTED]-(:WorldAsset{id: world-asset-id}) + at timestamp lastupdated 1. + + Act + Suppose all nodes except for WorldAsset exist at timestamp lastupdated=2. + Run cleanup. + + Assert + That only the relationship to the WorldAsset has been cleaned up. + """ + # Arrange: add these nodes and rels to the graph, all with lastupdated=1: + neo4j_session.run(MERGE_SUB_RESOURCE_QUERY) + neo4j_session.run(MERGE_HELLO_ASSET_QUERY) + neo4j_session.run(MERGE_WORLD_ASSET_QUERY) + + query = build_ingestion_query(InterestingAssetSchema()) + load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_WITH_ALL_RELS, + lastupdated=1, + sub_resource_id='sub-resource-id', + ) + # Sanity checks to verify that the relationships exist. + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'WorldAsset', + 'id', + 'CONNECTED', + rel_direction_right=False, + ) == {('interesting-node-id', 'the-worldasset-id-1')} + + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'HelloAsset', + 'id', + 'ASSOCIATED_WITH', + rel_direction_right=True, + ) == {('interesting-node-id', 'the-helloasset-id-1')} + + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'SubResource', + 'id', + 'RELATIONSHIP_LABEL', + rel_direction_right=False, + ) == {('interesting-node-id', 'sub-resource-id')} + + # Arrange: Suppose all assets except the WorldAsset exist now at timestamp lastupdated=2. + load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_NO_WORLD_ASSET, + lastupdated=2, + sub_resource_id='sub-resource-id', + ) + + # Act: Now actually generate and run the cleanup job. + cleanup_job = GraphJob.from_node_schema( + InterestingAssetSchema(), + {'UPDATE_TAG': 2, 'sub_resource_id': 'sub-resource-id'}, + ) + cleanup_job.run(neo4j_session) + + # Assert + # The rel between InterestingAsset and WorldAsset was stale (lastupdated != 2) and was cleaned up, + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'WorldAsset', + 'id', + 'CONNECTED', + rel_direction_right=True, + ) == set() + # but we don't delete the WorldAsset itself, as that cleanup should be handled by the WorldAsset's own intel module. + assert check_nodes(neo4j_session, 'WorldAsset', ['id']) == {('the-worldasset-id-1',)} + + # The rel between InterestingAsset and HelloAsset should still exist. + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'HelloAsset', + 'id', + 'ASSOCIATED_WITH', + rel_direction_right=True, + ) == {('interesting-node-id', 'the-helloasset-id-1')} + + # We also expect the sub resource relationship to remain intact. + assert check_rels( + neo4j_session, + 'InterestingAsset', + 'id', + 'SubResource', + 'id', + 'RELATIONSHIP_LABEL', + rel_direction_right=False, + ) == {('interesting-node-id', 'sub-resource-id')} + # And we expect to be able to retrieve fields from the InterestingAsset, + assert check_nodes(neo4j_session, 'InterestingAsset', ['id', 'property1', 'property2', 'lastupdated']) == { + ('interesting-node-id', 'b', 'c', 2), + } + # and from the SubResource: + assert check_nodes(neo4j_session, 'SubResource', ['id', 'lastupdated']) == {('sub-resource-id', 1)} diff --git a/tests/integration/cartography/intel/aws/test_emr.py b/tests/integration/cartography/intel/aws/test_emr.py index 87050934d..a9a5438eb 100644 --- a/tests/integration/cartography/intel/aws/test_emr.py +++ b/tests/integration/cartography/intel/aws/test_emr.py @@ -1,5 +1,6 @@ import cartography.intel.aws.emr import tests.data.aws.emr +from cartography.intel.aws.emr import cleanup from tests.integration.util import check_nodes from tests.integration.util import check_rels @@ -8,6 +9,19 @@ TEST_UPDATE_TAG = 123456789 +def _create_test_accounts(neo4j_session): + # Create Test AWSAccount + neo4j_session.run( + """ + MERGE (aws:AWSAccount{id: $aws_account_id}) + ON CREATE SET aws.firstseen = timestamp() + SET aws.lastupdated = $aws_update_tag + """, + aws_account_id=TEST_ACCOUNT_ID, + aws_update_tag=TEST_UPDATE_TAG, + ) + + def test_load_emr_clusters_nodes(neo4j_session): # Act data = tests.data.aws.emr.DESCRIBE_CLUSTERS @@ -28,16 +42,7 @@ def test_load_emr_clusters_nodes(neo4j_session): def test_load_emr_clusters_relationships(neo4j_session): - # Arrange: Create Test AWSAccount - neo4j_session.run( - """ - MERGE (aws:AWSAccount{id: $aws_account_id}) - ON CREATE SET aws.firstseen = timestamp() - SET aws.lastupdated = $aws_update_tag - """, - aws_account_id=TEST_ACCOUNT_ID, - aws_update_tag=TEST_UPDATE_TAG, - ) + _create_test_accounts(neo4j_session) # Act: Load Test EMR Clusters data = tests.data.aws.emr.DESCRIBE_CLUSTERS @@ -62,3 +67,51 @@ def test_load_emr_clusters_relationships(neo4j_session): 'arn', 'RESOURCE', ) == expected + + +def test_cleanup_emr(neo4j_session): + # Arrange: load EMR cluster data + data = tests.data.aws.emr.DESCRIBE_CLUSTERS + _create_test_accounts(neo4j_session) + cartography.intel.aws.emr.load_emr_clusters( + neo4j_session, + data, + TEST_REGION, + TEST_ACCOUNT_ID, + TEST_UPDATE_TAG, + ) + # Arrange: load in an unrelated EC2 instance. This should not be affected by the EMR module's cleanup job. + neo4j_session.run( + ''' + MERGE (i:EC2Instance{id:1234, lastupdated: $lastupdated})<-[r:RESOURCE]-(:AWSAccount{id: $aws_account_id}) + SET r.lastupdated = $lastupdated + ''', + aws_account_id=TEST_ACCOUNT_ID, + lastupdated=TEST_UPDATE_TAG, + ) + + # [Pre-test] Assert that the EMR clusters exist + assert check_nodes(neo4j_session, 'EMRCluster', ['arn']) == { + ("arn:aws:elasticmapreduce:us-east-1:190000000000:cluster/j-awesome",), + ("arn:aws:elasticmapreduce:us-east-1:190000000000:cluster/j-meh",), + } + # [Pre-test] Assert that the unrelated EC2 instance exists + assert check_rels(neo4j_session, 'AWSAccount', 'id', 'EC2Instance', 'id', 'RESOURCE') == { + (TEST_ACCOUNT_ID, 1234), + } + + # Act: run the cleanup job + cleanup( + neo4j_session, + { + 'UPDATE_TAG': TEST_UPDATE_TAG + 1, # Simulate a new sync run finished so the old update tag is obsolete now + 'AccountId': TEST_ACCOUNT_ID, + }, + ) + + # Assert: Expect no EMR clusters in the graph now + assert check_nodes(neo4j_session, 'EMRCluster', ['arn']) == set() + # Assert: Expect that the unrelated EC2 instance was not touched by the cleanup job + assert check_rels(neo4j_session, 'AWSAccount', 'id', 'EC2Instance', 'id', 'RESOURCE') == { + (TEST_ACCOUNT_ID, 1234), + } diff --git a/tests/unit/cartography/graph/helpers.py b/tests/unit/cartography/graph/helpers.py index 3dbb4c11f..ba5571427 100644 --- a/tests/unit/cartography/graph/helpers.py +++ b/tests/unit/cartography/graph/helpers.py @@ -1,3 +1,6 @@ +from typing import List + + def remove_leading_whitespace_and_empty_lines(text: str) -> str: """ Helper function for tests. @@ -8,3 +11,12 @@ def remove_leading_whitespace_and_empty_lines(text: str) -> str: # We call lstrip() twice on the same line. This is inefficient but ok for small unit tests. # Please change it if you want to. return '\n'.join([line.lstrip() for line in text.split('\n') if line.lstrip() != '']) + + +def clean_query_list(queries: List[str]) -> List[str]: + """ + Helper function to remove leading whitespace and blank lines for all strings in the input list. + :param queries: The list of strings to clean + :return: A list of text strings with no leading whitespace and no blank lines. + """ + return [remove_leading_whitespace_and_empty_lines(query) for query in queries] diff --git a/tests/unit/cartography/graph/test_cleanupbuilder.py b/tests/unit/cartography/graph/test_cleanupbuilder.py new file mode 100644 index 000000000..17688a303 --- /dev/null +++ b/tests/unit/cartography/graph/test_cleanupbuilder.py @@ -0,0 +1,160 @@ +from typing import List + +import pytest + +from cartography.graph.cleanupbuilder import _build_cleanup_node_and_rel_queries +from cartography.graph.cleanupbuilder import build_cleanup_queries +from cartography.graph.job import get_parameters +from cartography.intel.aws.emr import EMRClusterToAWSAccount +from tests.data.graph.querybuilder.sample_models.asset_with_non_kwargs_tgm import FakeEC2InstanceSchema +from tests.data.graph.querybuilder.sample_models.asset_with_non_kwargs_tgm import FakeEC2InstanceToAWSAccount +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetSchema +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetToHelloAssetRel +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetToSubResourceRel +from tests.unit.cartography.graph.helpers import clean_query_list + + +def test_cleanup_sub_rel(): + """ + Test that we correctly generate cleanup queries when a selected rel is not specified. + """ + actual_queries: List[str] = _build_cleanup_node_and_rel_queries( + InterestingAssetSchema(), + InterestingAssetToSubResourceRel(), + ) + expected_queries = [ + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + WHERE n.lastupdated <> $UPDATE_TAG + WITH n LIMIT $LIMIT_SIZE + DETACH DELETE n; + """, + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + WHERE s.lastupdated <> $UPDATE_TAG + WITH s LIMIT $LIMIT_SIZE + DELETE s; + """, + ] + assert clean_query_list(actual_queries) == clean_query_list(expected_queries) + + +def test_cleanup_with_selected_rel(): + """ + Test that we correctly generate cleanup queries when a selected rel is specified. + """ + actual_queries: List[str] = _build_cleanup_node_and_rel_queries( + InterestingAssetSchema(), + InterestingAssetToHelloAssetRel(), + ) + expected_queries = [ + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + MATCH (n)-[r:ASSOCIATED_WITH]->(:HelloAsset) + WHERE n.lastupdated <> $UPDATE_TAG + WITH n LIMIT $LIMIT_SIZE + DETACH DELETE n; + """, + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + MATCH (n)-[r:ASSOCIATED_WITH]->(:HelloAsset) + WHERE r.lastupdated <> $UPDATE_TAG + WITH r LIMIT $LIMIT_SIZE + DELETE r; + """, + ] + assert clean_query_list(actual_queries) == clean_query_list(expected_queries) + + +def test_cleanup_with_invalid_selected_rel_raises_exc(): + """ + Test that we raise a ValueError if we try to cleanup a node and provide a specified rel but the rel doesn't exist on + the node schema. + """ + exc_msg = "EMRClusterToAWSAccount is not defined on CartographyNodeSchema type InterestingAssetSchema" + with pytest.raises(ValueError, match=exc_msg): + _build_cleanup_node_and_rel_queries(InterestingAssetSchema(), EMRClusterToAWSAccount()) + + +def test_build_cleanup_queries(): + """ + Test that the full set of cleanup queries generated for a node schema is what we expect. Order matters! + """ + actual_queries: list[str] = build_cleanup_queries(InterestingAssetSchema()) + expected_queries = [ + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + MATCH (n)-[r:ASSOCIATED_WITH]->(:HelloAsset) + WHERE n.lastupdated <> $UPDATE_TAG + WITH n LIMIT $LIMIT_SIZE + DETACH DELETE n; + """, + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + MATCH (n)-[r:ASSOCIATED_WITH]->(:HelloAsset) + WHERE r.lastupdated <> $UPDATE_TAG + WITH r LIMIT $LIMIT_SIZE + DELETE r; + """, + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + MATCH (n)<-[r:CONNECTED]-(:WorldAsset) + WHERE n.lastupdated <> $UPDATE_TAG + WITH n LIMIT $LIMIT_SIZE + DETACH DELETE n; + """, + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + MATCH (n)<-[r:CONNECTED]-(:WorldAsset) + WHERE r.lastupdated <> $UPDATE_TAG + WITH r LIMIT $LIMIT_SIZE + DELETE r; + """, + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + WHERE n.lastupdated <> $UPDATE_TAG + WITH n LIMIT $LIMIT_SIZE + DETACH DELETE n; + """, + """ + MATCH (n:InterestingAsset)<-[s:RELATIONSHIP_LABEL]-(:SubResource{id: $sub_resource_id}) + WHERE s.lastupdated <> $UPDATE_TAG + WITH s LIMIT $LIMIT_SIZE + DELETE s;""", + ] + assert clean_query_list(actual_queries) == clean_query_list(expected_queries) + + +def test_get_params_from_queries(): + """ + Test that we are able to correctly retrieve parameter names from the generated cleanup queries. + """ + queries: list[str] = build_cleanup_queries(InterestingAssetSchema()) + assert set(get_parameters(queries)) == {'UPDATE_TAG', 'sub_resource_id', 'LIMIT_SIZE'} + + +def test_build_cleanup_queries_selected_rels(): + """ + Test that we are able to correctly make cleanup jobs for a subset of relationships. + """ + queries: list[str] = build_cleanup_queries( + InterestingAssetSchema(), + {InterestingAssetToSubResourceRel(), InterestingAssetToHelloAssetRel()}, + ) + assert len(queries) == 4 # == 2 to delete nodes and rels bound to the sub resource + 2 for the HelloAsset + + +def test_build_cleanup_queries_selected_rels_no_sub_res_raises_exc(): + """ + Test that not specifying the sub resource rel as a selected_relationship in build_cleanup_queries raises exception + """ + with pytest.raises(ValueError, match='node_schema without a sub resource relationship is not supported'): + build_cleanup_queries( + InterestingAssetSchema(), + {InterestingAssetToHelloAssetRel()}, + ) + + +def test_build_cleanup_node_and_rel_queries_sub_res_tgm_not_validated_raises_exc(): + with pytest.raises(ValueError, match='must have set_in_kwargs=True'): + _build_cleanup_node_and_rel_queries(FakeEC2InstanceSchema(), FakeEC2InstanceToAWSAccount()) diff --git a/tests/unit/cartography/graph/test_querybuilder_filter_selected_rels.py b/tests/unit/cartography/graph/test_querybuilder_filter_selected_rels.py index a76d497af..d02807f06 100644 --- a/tests/unit/cartography/graph/test_querybuilder_filter_selected_rels.py +++ b/tests/unit/cartography/graph/test_querybuilder_filter_selected_rels.py @@ -1,6 +1,6 @@ from pytest import raises -from cartography.graph.querybuilder import _filter_selected_relationships +from cartography.graph.querybuilder import filter_selected_relationships from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetToSubResourceRel from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeSchema @@ -11,7 +11,7 @@ def test_filter_selected_rels_raises_value_err(): """ # Act and assert with raises(ValueError): - _, _ = _filter_selected_relationships( + _, _ = filter_selected_relationships( SimpleNodeSchema(), selected_relationships={InterestingAssetToSubResourceRel()}, )