From cc41bc342de712a6f08a205dd5cf6a77c4b7dd3e Mon Sep 17 00:00:00 2001 From: Colin Chartier Date: Mon, 30 Dec 2024 16:13:59 -0500 Subject: [PATCH] feat: implement mapping for HashBucketFunctionTransformer --- .../entities/eap_spans.yaml | 6 ++--- .../entities/eap_spans_rpc.yaml | 11 +++++----- .../logical/hash_bucket_functions.py | 22 ++++++++++--------- .../test_hash_bucket_functions_processor.py | 6 ++++- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/snuba/datasets/configuration/events_analytics_platform/entities/eap_spans.yaml b/snuba/datasets/configuration/events_analytics_platform/entities/eap_spans.yaml index 3a3e3e59752..5d01842a91a 100644 --- a/snuba/datasets/configuration/events_analytics_platform/entities/eap_spans.yaml +++ b/snuba/datasets/configuration/events_analytics_platform/entities/eap_spans.yaml @@ -90,9 +90,9 @@ query_processors: - quantileTDigestWeighted - processor: HashBucketFunctionTransformer args: - hash_bucket_names: - - attr_str - - attr_num + hash_bucket_name_mapping: + attr_str: attr_str + attr_num: attr_num validate_data_model: do_nothing # in order to reference aliased columns, we shouldn't validate columns purely based on the entity schema validators: diff --git a/snuba/datasets/configuration/events_analytics_platform/entities/eap_spans_rpc.yaml b/snuba/datasets/configuration/events_analytics_platform/entities/eap_spans_rpc.yaml index 6e1cc3fd603..32a00c77db7 100644 --- a/snuba/datasets/configuration/events_analytics_platform/entities/eap_spans_rpc.yaml +++ b/snuba/datasets/configuration/events_analytics_platform/entities/eap_spans_rpc.yaml @@ -93,6 +93,12 @@ query_processors: time_parse_columns: - start_timestamp - end_timestamp + - processor: HashBucketFunctionTransformer + args: + hash_bucket_name_mapping: + attr_str: attr_str + attr_f64: attr_num + attr_i64: attr_num - processor: OptionalAttributeAggregationTransformer args: attribute_column_names: @@ -108,11 +114,6 @@ query_processors: curried_aggregation_names: - quantile - quantileTDigestWeighted - - processor: HashBucketFunctionTransformer - args: - hash_bucket_names: - - attr_str - - attr_num validate_data_model: do_nothing # in order to reference aliased columns, we shouldn't validate columns purely based on the entity schema validators: diff --git a/snuba/query/processors/logical/hash_bucket_functions.py b/snuba/query/processors/logical/hash_bucket_functions.py index 2527b66fe5e..5e7ff1717df 100644 --- a/snuba/query/processors/logical/hash_bucket_functions.py +++ b/snuba/query/processors/logical/hash_bucket_functions.py @@ -1,4 +1,4 @@ -from typing import Sequence +from typing import Mapping from snuba.query.expressions import Column, Expression, FunctionCall, Literal from snuba.query.logical import Query @@ -22,11 +22,9 @@ class HashBucketFunctionTransformer(LogicalQueryProcessor): It converts mapExists(attr_str, 'blah') to mapExists(attr_str_{hash('blah')%20}, 'blah') """ - def __init__( - self, - hash_bucket_names: Sequence[str], - ): - self.hash_bucket_names = hash_bucket_names + def __init__(self, hash_bucket_name_mapping: Mapping[str, str]): + super().__init__() + self.hash_bucket_name_mapping = hash_bucket_name_mapping def process_query(self, query: Query, query_settings: QuerySettings) -> None: def transform_map_keys_and_values_expression(exp: Expression) -> Expression: @@ -40,7 +38,7 @@ def transform_map_keys_and_values_expression(exp: Expression) -> Expression: if not isinstance(param, Column): return exp - if param.column_name not in self.hash_bucket_names: + if param.column_name not in self.hash_bucket_name_mapping: return exp if exp.function_name not in ("mapKeys", "mapValues"): @@ -56,7 +54,7 @@ def transform_map_keys_and_values_expression(exp: Expression) -> Expression: parameters=( Column( None, - column_name=f"{param.column_name}_{i}", + column_name=f"{self.hash_bucket_name_mapping[param.column_name]}_{i}", table_name=param.table_name, ), ), @@ -76,7 +74,7 @@ def transform_map_contains_expression(exp: Expression) -> Expression: if not isinstance(column, Column): return exp - if column.column_name not in self.hash_bucket_names: + if column.column_name not in self.hash_bucket_name_mapping: return exp if exp.function_name != "mapContains": @@ -91,7 +89,11 @@ def transform_map_contains_expression(exp: Expression) -> Expression: alias=exp.alias, function_name=exp.function_name, parameters=( - Column(None, None, f"{column.column_name}_{bucket_idx}"), + Column( + None, + None, + f"{self.hash_bucket_name_mapping[column.column_name]}_{bucket_idx}", + ), key, ), ) diff --git a/tests/query/processors/test_hash_bucket_functions_processor.py b/tests/query/processors/test_hash_bucket_functions_processor.py index a259ca1066d..ac3faf68bbd 100644 --- a/tests/query/processors/test_hash_bucket_functions_processor.py +++ b/tests/query/processors/test_hash_bucket_functions_processor.py @@ -196,6 +196,7 @@ condition=binary_condition( "or", f.mapContains(column("attr_str"), literal("blah"), alias="x"), + f.mapContains(column("attr_i64"), literal("blah"), alias="y"), f.mapContains(column("attr_strz"), literal("blah"), alias="z"), ), ), @@ -210,6 +211,7 @@ condition=binary_condition( "or", f.mapContains(column("attr_str_2"), literal("blah"), alias="x"), + f.mapContains(column("attr_num_2"), literal("blah"), alias="y"), f.mapContains(column("attr_strz"), literal("blah"), alias="z"), ), ), @@ -220,7 +222,9 @@ @pytest.mark.parametrize("pre_format, expected_query", test_data) def test_format_expressions(pre_format: Query, expected_query: Query) -> None: copy = deepcopy(pre_format) - HashBucketFunctionTransformer("attr_str").process_query(copy, HTTPQuerySettings()) + HashBucketFunctionTransformer( + {"attr_str": "attr_str", "attr_i64": "attr_num"} + ).process_query(copy, HTTPQuerySettings()) assert copy.get_selected_columns() == expected_query.get_selected_columns() assert copy.get_groupby() == expected_query.get_groupby() assert copy.get_condition() == expected_query.get_condition()