diff --git a/.github/workflows/pr_testing.yml b/.github/workflows/pr_testing.yml index 88b6ec4ba..3641ed72d 100644 --- a/.github/workflows/pr_testing.yml +++ b/.github/workflows/pr_testing.yml @@ -231,6 +231,7 @@ jobs: SF_NONE_USERNAME: ${{ secrets.SF_NONE_USERNAME }} SF_NONE_PASSWORD: ${{ secrets.SF_NONE_PASSWORD }} SF_MASKED_ACCOUNT: ${{ secrets.SF_MASKED_ACCOUNT }} + PYDOUGH_MASK_SERVER_PATH: ${{ secrets.PYDOUGH_MASK_SERVER_PATH }} with: python-versions: ${{ github.event_name == 'workflow_dispatch' && needs.get-py-ver-matrix.outputs.matrix diff --git a/.github/workflows/sf_masked_testing.yml b/.github/workflows/sf_masked_testing.yml index 56b5986c1..aee42d70d 100644 --- a/.github/workflows/sf_masked_testing.yml +++ b/.github/workflows/sf_masked_testing.yml @@ -22,6 +22,8 @@ on: required: true SF_MASKED_ACCOUNT: required: true + PYDOUGH_MASK_SERVER_PATH: + required: true jobs: sf-tests: diff --git a/documentation/metadata.md b/documentation/metadata.md index 7a730da46..e6230bf5b 100644 --- a/documentation/metadata.md +++ b/documentation/metadata.md @@ -157,6 +157,8 @@ Properties of this type use the type string "masked table column" and include al - `protect protocol` (required): a Python format string, in the same format as `unprotect protocol`, used to describe how the data was originally masked. This can be used to generate masked values consistent with the encryption scheme, allowing operations such as comparisons between masked data. - `protected data type` (optional): same as `data type`, except referring to the type of the data when it is protected, whereas `data type` refers to the raw unprotected column. If omitted, it is assumed that the data type is the same between the unprotected vs protected data. - `server masked` (optional): a boolean flag indicating whether the column was masked on a server that is attached to PyDough. If `true`, PyDough can use it to optimize queries by rewriting predicates and expressions to avoid unmasking the data. +- `server dataset id` (optional): a string that must be provided `server masked` is `true`, indicating the `dataset id` value to be used when looking up this column in a remote server to optimize it by rewriting predicates. + Example of the structure of the metadata for a masked table column property where the string data is masked by moving the first character to the end, and unmasked by moving it back to the beginning: diff --git a/pydough/configs/session.py b/pydough/configs/session.py index 425f1f26d..7485752ef 100644 --- a/pydough/configs/session.py +++ b/pydough/configs/session.py @@ -19,6 +19,8 @@ existing state. """ +from typing import TYPE_CHECKING, Union + from pydough.database_connectors import ( DatabaseContext, DatabaseDialect, @@ -30,6 +32,9 @@ from .pydough_configs import PyDoughConfigs +if TYPE_CHECKING: + from pydough.mask_server import MaskServerInfo + class PyDoughSession: """ @@ -50,6 +55,7 @@ def __init__(self) -> None: connection=empty_connection, dialect=DatabaseDialect.ANSI ) self._error_builder: PyDoughErrorBuilder = PyDoughErrorBuilder() + self._mask_server: MaskServerInfo | None = None @property def metadata(self) -> GraphMetadata | None: @@ -131,6 +137,26 @@ def error_builder(self, builder: PyDoughErrorBuilder) -> None: """ self._error_builder = builder + @property + def mask_server(self) -> Union["MaskServerInfo", None]: + """ + Get the active mask server information. + + Returns: + The active mask server information. 
+ """ + return self._mask_server + + @mask_server.setter + def mask_server(self, server_info: Union["MaskServerInfo", None]) -> None: + """ + Set the active mask server information. + + Args: + The mask server information to set. + """ + self._mask_server = server_info + def connect_database(self, database_name: str, **kwargs) -> DatabaseContext: """ Create a new DatabaseContext and register it in the session. This returns diff --git a/pydough/conversion/masking_shuttles.py b/pydough/conversion/masking_shuttles.py index 53ce2097e..9cfcbb559 100644 --- a/pydough/conversion/masking_shuttles.py +++ b/pydough/conversion/masking_shuttles.py @@ -63,7 +63,7 @@ def rewrite_masked_literal_comparison( # literal in a call to MASK by toggling is_unmask to False. masked_literal = CallExpression( pydop.MaskedExpressionFunctionOperator( - call_arg.op.masking_metadata, False + call_arg.op.masking_metadata, call_arg.op.table_path, False ), call_arg.data_type, [literal_arg], @@ -83,7 +83,7 @@ def rewrite_masked_literal_comparison( [ CallExpression( pydop.MaskedExpressionFunctionOperator( - call_arg.op.masking_metadata, False + call_arg.op.masking_metadata, call_arg.op.table_path, False ), call_arg.data_type, [LiteralExpression(v, inner_type)], diff --git a/pydough/conversion/relational_converter.py b/pydough/conversion/relational_converter.py index 35c98e75e..c67d4117d 100644 --- a/pydough/conversion/relational_converter.py +++ b/pydough/conversion/relational_converter.py @@ -12,6 +12,8 @@ import pydough.pydough_operators as pydop from pydough.configs import PyDoughSession +from pydough.mask_server.mask_server_candidate_visitor import MaskServerCandidateVisitor +from pydough.mask_server.mask_server_rewrite_shuttle import MaskServerRewriteShuttle from pydough.metadata import ( CartesianProductMetadata, GeneralJoinMetadata, @@ -45,7 +47,10 @@ LiteralExpression, Project, RelationalExpression, + RelationalExpressionDispatcher, RelationalExpressionShuttle, + RelationalExpressionShuttleDispatcher, + RelationalExpressionVisitor, RelationalNode, RelationalRoot, Scan, @@ -861,7 +866,9 @@ def build_simple_table_scan( ) unmask_columns[name] = CallExpression( pydop.MaskedExpressionFunctionOperator( - hybrid_expr.column.column_property, True + hybrid_expr.column.column_property, + node.collection.collection.table_path, + True, ), hybrid_expr.column.column_property.unprotected_data_type, [ColumnReference(name, hybrid_expr.typ)], @@ -1561,7 +1568,9 @@ def confirm_root(node: RelationalNode) -> RelationalRoot: def optimize_relational_tree( root: RelationalRoot, session: PyDoughSession, - additional_shuttles: list[RelationalExpressionShuttle], + additional_shuttles: list[ + RelationalExpressionShuttle | RelationalExpressionVisitor + ], ) -> RelationalRoot: """ Runs optimize on the relational tree, including pushing down filters and @@ -1570,8 +1579,8 @@ def optimize_relational_tree( Args: `root`: the relational root to optimize. `configs`: PyDough session used during optimization. - `additional_shuttles`: additional relational expression shuttles to use - for expression simplification. + `additional_shuttles`: additional relational expression shuttles or + visitors to use for expression simplification. Returns: The optimized relational root. 
@@ -1633,7 +1642,7 @@ def optimize_relational_tree( # Run the following pipeline twice: # A: projection pullup - # B: expression simplification + # B: expression simplification (followed by additional shuttles) # C: filter pushdown # D: join-aggregate transpose # E: projection pullup again @@ -1647,7 +1656,13 @@ def optimize_relational_tree( # pullup and pushdown and so on. for _ in range(2): root = confirm_root(pullup_projections(root)) - simplify_expressions(root, session, additional_shuttles) + simplify_expressions(root, session) + # Run all of the other shuttles/visitors over the entire tree. + for shuttle_or_visitor in additional_shuttles: + if isinstance(shuttle_or_visitor, RelationalExpressionShuttle): + root.accept(RelationalExpressionShuttleDispatcher(shuttle_or_visitor)) + else: + root.accept(RelationalExpressionDispatcher(shuttle_or_visitor, True)) root = confirm_root(push_filters(root, session)) root = confirm_root(pull_aggregates_above_joins(root)) root = confirm_root(pullup_projections(root)) @@ -1716,10 +1731,19 @@ def convert_ast_to_relational( raw_result: RelationalRoot = postprocess_root(node, columns, hybrid, output) # Invoke the optimization procedures on the result to clean up the tree. - additional_shuttles: list[RelationalExpressionShuttle] = [] + additional_shuttles: list[ + RelationalExpressionShuttle | RelationalExpressionVisitor + ] = [] # Add the mask literal comparison shuttle if the environment variable - # PYDOUGH_ENABLE_MASK_REWRITES is set to 1. + # PYDOUGH_ENABLE_MASK_REWRITES is set to 1. If a masking rewrite server has + # been attached to the session, include the shuttles for that as well. if os.getenv("PYDOUGH_ENABLE_MASK_REWRITES") == "1": + if session.mask_server is not None: + candidate_shuttle: MaskServerCandidateVisitor = MaskServerCandidateVisitor() + additional_shuttles.append(candidate_shuttle) + additional_shuttles.append( + MaskServerRewriteShuttle(session.mask_server, candidate_shuttle) + ) additional_shuttles.append(MaskLiteralComparisonShuttle()) optimized_result: RelationalRoot = optimize_relational_tree( raw_result, session, additional_shuttles diff --git a/pydough/conversion/relational_simplification.py b/pydough/conversion/relational_simplification.py index e118c3bdd..4f35b76a1 100644 --- a/pydough/conversion/relational_simplification.py +++ b/pydough/conversion/relational_simplification.py @@ -1479,22 +1479,13 @@ class SimplificationVisitor(RelationalVisitor): the current node are placed on the stack. """ - def __init__( - self, - session: PyDoughSession, - additional_shuttles: list[RelationalExpressionShuttle], - ): + def __init__(self, session: PyDoughSession): self.stack: list[dict[RelationalExpression, PredicateSet]] = [] self.shuttle: SimplificationShuttle = SimplificationShuttle(session) - self.additional_shuttles: list[RelationalExpressionShuttle] = ( - additional_shuttles - ) def reset(self): self.stack.clear() self.shuttle.reset() - for shuttle in self.additional_shuttles: - shuttle.reset() def get_input_predicates( self, node: RelationalNode @@ -1559,8 +1550,6 @@ def generic_visit( ref_expr = ColumnReference(name, expr.data_type) expr = expr.accept_shuttle(self.shuttle) output_predicates[ref_expr] = self.shuttle.stack.pop() - for shuttle in self.additional_shuttles: - expr = expr.accept_shuttle(shuttle) node.columns[name] = expr return output_predicates @@ -1645,8 +1634,6 @@ def visit_filter(self, node: Filter) -> None: # Transform the filter condition in-place. 
node._condition = node.condition.accept_shuttle(self.shuttle) self.shuttle.stack.pop() - for shuttle in self.additional_shuttles: - node._condition = node.condition.accept_shuttle(shuttle) self.infer_null_predicates_from_condition( output_predicates, node.condition, @@ -1661,8 +1648,6 @@ def visit_join(self, node: Join) -> None: # Transform the join condition in-place. node._condition = node.condition.accept_shuttle(self.shuttle) self.shuttle.stack.pop() - for shuttle in self.additional_shuttles: - node._condition = node.condition.accept_shuttle(shuttle) # If the join is not an inner join, remove any not-null predicates # from the RHS of the join. if node.join_type != JoinType.INNER: @@ -1689,8 +1674,6 @@ def visit_limit(self, node: Limit) -> None: for ordering_expr in node.orderings: ordering_expr.expr = ordering_expr.expr.accept_shuttle(self.shuttle) self.shuttle.stack.pop() - for shuttle in self.additional_shuttles: - ordering_expr.expr = ordering_expr.expr.accept_shuttle(shuttle) self.stack.append(output_predicates) def visit_root(self, node: RelationalRoot) -> None: @@ -1704,8 +1687,6 @@ def visit_root(self, node: RelationalRoot) -> None: for ordering_expr in node.orderings: ordering_expr.expr = ordering_expr.expr.accept_shuttle(self.shuttle) self.shuttle.stack.pop() - for shuttle in self.additional_shuttles: - ordering_expr.expr = ordering_expr.expr.accept_shuttle(shuttle) self.stack.append(output_predicates) def visit_aggregate(self, node: Aggregate) -> None: @@ -1725,7 +1706,6 @@ def visit_aggregate(self, node: Aggregate) -> None: def simplify_expressions( node: RelationalNode, session: PyDoughSession, - additional_shuttles: list[RelationalExpressionShuttle], ) -> None: """ Transforms the current node and all of its descendants in-place to simplify @@ -1734,12 +1714,6 @@ def simplify_expressions( Args: `node`: The relational node to perform simplification on. `session`: The PyDough session used during the simplification. - `additional_shuttles`: A list of additional shuttles to apply to the - expressions of the node and its descendants. These shuttles are applied - after the simplification shuttle, and can be used to perform additional - transformations on the expressions. """ - simplifier: SimplificationVisitor = SimplificationVisitor( - session, additional_shuttles - ) + simplifier: SimplificationVisitor = SimplificationVisitor(session) node.accept(simplifier) diff --git a/pydough/errors/error_utils.py b/pydough/errors/error_utils.py index 6afd449f7..fd9f276a1 100644 --- a/pydough/errors/error_utils.py +++ b/pydough/errors/error_utils.py @@ -203,7 +203,8 @@ def __init__(self): "sql_keyword": "must have a SQL name that is not a reserved word", } - def _split_identifier(self, name: str) -> list[str]: + @staticmethod + def _split_identifier(name: str) -> list[str]: """ Split a potentially qualified SQL identifier into parts. 
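Making `_split_identifier` a staticmethod lets `MaskServerInput.fully_qualified_name` (defined in the mask server module below) split a table path without constructing a `ValidSQLName` instance. A small sketch of the resulting behavior, using hypothetical dataset, table, and column values and the linear-with-arity predicate format used by the server:

```python
from pydough.mask_server import MaskServerInput

# Hypothetical input: column "col" of table "db.schema.table", with the
# predicate UNMASK(col) == 42 serialized as operator name, arity, the
# "__col__" placeholder for the unmasked column, and the literal.
item = MaskServerInput(
    dataset_id="my_dataset",
    table_path="db.schema.table",
    column_name="col",
    expression=["EQUAL", 2, "__col__", 42],
)

# fully_qualified_name splits the table path into its qualifier parts via
# ValidSQLName._split_identifier and joins them with "/", yielding something
# like "db/schema/table/col" for the server's "fqn" column references.
print(item.fully_qualified_name)
```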
diff --git a/pydough/evaluation/evaluate_unqualified.py b/pydough/evaluation/evaluate_unqualified.py index 79408bb8d..f3d7009e5 100644 --- a/pydough/evaluation/evaluate_unqualified.py +++ b/pydough/evaluation/evaluate_unqualified.py @@ -15,6 +15,7 @@ from pydough.errors import ( PyDoughSessionException, ) +from pydough.mask_server import MaskServerInfo from pydough.metadata import GraphMetadata from pydough.qdag import PyDoughCollectionQDAG, PyDoughQDAG from pydough.relational import RelationalRoot @@ -32,8 +33,8 @@ def _load_session_info(**kwargs) -> PyDoughSession: Load the session information from the active session unless it is found in the keyword arguments. The following variants are accepted: - If `session` is found, it is used directly. - - If `metadata`, `config` and/or `database` are found, they are used to - construct a new session. + - If `metadata`, `config`, `mask_server`, and/or `database` are found, they + are used to construct a new session. - If none of these are found, the active session is used. Args: @@ -88,6 +89,11 @@ def _load_session_info(**kwargs) -> PyDoughSession: database = kwargs.pop("database") else: database = pydough.active_session.database + mask_server: MaskServerInfo | None + if "mask_server" in kwargs: + mask_server = kwargs.pop("mask_server") + else: + mask_server = pydough.active_session.mask_server assert not kwargs, f"Unexpected keyword arguments: {kwargs}" # Construct the new session @@ -95,6 +101,7 @@ def _load_session_info(**kwargs) -> PyDoughSession: new_session._metadata = metadata new_session._config = config new_session._database = database + new_session._mask_server = mask_server return new_session diff --git a/pydough/mask_server/__init__.py b/pydough/mask_server/__init__.py index 4ec78b406..5cbdccda7 100644 --- a/pydough/mask_server/__init__.py +++ b/pydough/mask_server/__init__.py @@ -3,10 +3,12 @@ """ __all__ = [ + "MaskServerCandidateVisitor", "MaskServerInfo", "MaskServerInput", "MaskServerOutput", "MaskServerResponse", + "MaskServerRewriteShuttle", "RequestMethod", "ServerConnection", "ServerRequest", @@ -18,6 +20,8 @@ MaskServerOutput, MaskServerResponse, ) +from .mask_server_candidate_visitor import MaskServerCandidateVisitor +from .mask_server_rewrite_shuttle import MaskServerRewriteShuttle from .server_connection import ( RequestMethod, ServerConnection, diff --git a/pydough/mask_server/mask_server.py b/pydough/mask_server/mask_server.py index 38f50bd50..3c06415e6 100644 --- a/pydough/mask_server/mask_server.py +++ b/pydough/mask_server/mask_server.py @@ -10,10 +10,14 @@ "MaskServerResponse", ] +import base64 +import os from dataclasses import dataclass from enum import Enum from typing import Any +from pydough.errors.error_utils import ValidSQLName +from pydough.logger import get_logger from pydough.mask_server.server_connection import ( RequestMethod, ServerConnection, @@ -49,6 +53,11 @@ class MaskServerInput: Input data structure for the MaskServer. """ + dataset_id: str + """ + The dataset ID to use when querying the mask server. + """ + table_path: str """ The fully qualified SQL table path, given from the metadata. @@ -64,6 +73,16 @@ class MaskServerInput: The linear serialization of the predicate expression. """ + @property + def fully_qualified_name(self) -> str: + """ + Returns the fully qualified name of the column in the format + 'table_path/column_name', with `/` as the separator used to modify the + `table_path` appropriately. 
+ """ + table_path_chunks: list[str] = ValidSQLName._split_identifier(self.table_path) + return f"{'/'.join(table_path_chunks)}/{self.column_name}" + @dataclass class MaskServerOutput: @@ -88,11 +107,16 @@ class MaskServerOutput: class MaskServerInfo: """ - The MaskServeraInfo class is responsible for evaluating predicates against a + The MaskServerInfo class is responsible for evaluating predicates against a given table and column. It interacts with an external mask server to perform the evaluation. """ + batch_evaluate_api_path: str = "v1/predicates/batch-evaluate" + """ + The API path for batch evaluating predicates on the mask server. + """ + def __init__(self, base_url: str, token: str | None = None): """ Initialize the MaskServerInfo with the given server URL. @@ -105,25 +129,29 @@ def __init__(self, base_url: str, token: str | None = None): base_url=base_url, token=token ) - def get_server_response_case(self, server_case: str) -> MaskServerResponse: + def get_server_response_case(self, response_metadata: dict) -> MaskServerResponse: """ Mapping from server response strings to MaskServerResponse enum values. Args: - `server_case`: The response string from the server. + `response_metadata`: The metadata field from the server response. Returns: The corresponding MaskServerResponse enum value. """ - match server_case: - case "IN": - return MaskServerResponse.IN_ARRAY - case "NOT_IN": - return MaskServerResponse.NOT_IN_ARRAY - case _: - return MaskServerResponse.UNSUPPORTED + if response_metadata.get("dynamic_operator", None) == "IN": + match response_metadata.get("representation", None): + case "IN" | None: + return MaskServerResponse.IN_ARRAY + case "NOT_IN": + return MaskServerResponse.NOT_IN_ARRAY + case _: + return MaskServerResponse.UNSUPPORTED + return MaskServerResponse.UNSUPPORTED def simplify_simple_expression_batch( - self, batch: list[MaskServerInput] + self, + batch: list[MaskServerInput], + dry_run: bool, ) -> list[MaskServerOutput]: """ Sends a batch of predicate expressions to the mask server for evaluation. @@ -135,105 +163,173 @@ def simplify_simple_expression_batch( Args: `batch`: The list of inputs to be sent to the server. + `dry_run`: Whether to perform a dry run or not. Returns: An output list containing the response case and payload. """ - assert batch != [], "Batch cannot be empty." - - path: str = "v1/predicates/batch-evaluate" - method: RequestMethod = RequestMethod.POST - request: ServerRequest = self.generate_request(batch, path, method) + # Obtain the `hard_limit` (the maximum number of items that can be + # returned for each predicate) from the environment variable. Set the + # default to 1000 if the variable is not set or invalid. + hard_limit: int + try: + hard_limit = int(os.environ.get("PYDOUGH_MASK_SERVER_HARD_LIMIT", "1000")) + except Exception: + hard_limit = 1000 + + # Log the batch request + pyd_logger = get_logger(__name__) + if dry_run: + pyd_logger.info( + f"Batch request (dry run) to Mask Server ({len(batch)} items):" + ) + else: + pyd_logger.info(f"Batch request to Mask Server ({len(batch)} items):") + for idx, item in enumerate(batch): + pyd_logger.info( + f"({idx + 1}) {item.fully_qualified_name}: {item.expression}" + ) + assert batch != [], "Batch cannot be empty." 
+        request: ServerRequest = self.generate_request(batch, dry_run, hard_limit)
         response_json = self.connection.send_server_request(request)
         result: list[MaskServerOutput] = self.generate_result(response_json)
         return result
 
     def generate_request(
-        self, batch: list[MaskServerInput], path: str, method: RequestMethod
+        self,
+        batch: list[MaskServerInput],
+        dry_run: bool,
+        hard_limit: int,
     ) -> ServerRequest:
         """
-        Generate a server request from the given batch of server inputs and path.
+        Generate a server request from the given batch of server inputs.
 
         Args:
             `batch`: A list of MaskServerInput objects.
-            `path`: The API endpoint path.
+            `dry_run`: Whether the request is a dry run or not.
+            `hard_limit`: The maximum number of items that can be returned for
+            each predicate.
 
         Returns:
             A server request including payload to be sent.
 
         Example payload:
+        ```
         {
             "items": [
                 {
-                    "column_reference": "srv.db.tbl.col",
+                    "dataset_id": "snowflake.bodo.blah_blah_blah",
+                    "column_ref": {"kind": "fqn", "value": "db/schema/table/name"},
                     "predicate": ["EQUAL", 2, "__col__", 1],
                     "mode": "dynamic",
-                    "dry_run": false
+                    "predicate_format": "linear_with_arity",
+                    "output_mode": "cell_encrypted",
+                    "dry_run": true,
+                    "limits": {"dedup": true},
                 },
                 ...
             ],
-            "expression_format": {"name": "linear", "version": "0.2.0"}
+            "expression_format": {"name": "linear", "version": "0.2.0"},
+            "hard_limit": 1000,
         }
+        ```
         """
+        # Create the payload for the overall batch request, then populate the
+        # items list with each individual request.
         payload: dict = {
             "items": [],
             "expression_format": {"name": "linear", "version": "0.2.0"},
+            "hard_limit": hard_limit,
         }
+        # Populate each individual request in the batch in the specified format.
         for item in batch:
             evaluate_request: dict = {
-                "column_reference": f"{item.table_path}.{item.column_name}",
+                "dataset_id": item.dataset_id,
+                "column_ref": {
+                    "kind": "fqn",
+                    "value": item.fully_qualified_name,
+                },
                 "predicate": item.expression,
+                "output_mode": "cell_encrypted",
                 "mode": "dynamic",
-                "dry_run": False,
+                "predicate_format": "linear_with_arity",
+                "dry_run": dry_run,
+                "limits": {"dedup": True},
             }
             payload["items"].append(evaluate_request)
 
-        return ServerRequest(path=path, payload=payload, method=method)
+        return ServerRequest(
+            path=self.batch_evaluate_api_path,
+            payload=payload,
+            method=RequestMethod.POST,
+        )
 
-    def generate_result(self, response: dict) -> list[MaskServerOutput]:
+    def generate_result(self, response_dict: dict) -> list[MaskServerOutput]:
         """
-        Generate a list of server outputs from the server response.
+        Generate a list of server outputs from the server response of a batch
+        request, either for a dry run or a normal run. On dry run requests, the
+        `records` field will be absent.
 
         Args:
-            `response`: The response from the mask server.
-
-        Returns:
-            A list of server outputs objects.
+            `response_dict`: The response from the mask server.
 
         Example response:
+        ```
         {
             "result": "SUCCESS",
             "items": [
                 {
                     "index": 0,
                     "result": "SUCCESS",
-                    "decision": {"strategy": "values", "reason": "mock"},
-                    "predicate_hash": "hash0",
-                    "encryption_mode": "clear",
-                    "materialization": {
-                        "type": "literal",
-                        "operator": "IN",
-                        "values": [0],
-                        "count": 1
+                    "response": {
+                        "strategy": ...,
+
+                        "records": [
+                            {
+                                "mode": "cell_encrypted",
+                                "cell_encrypted": "abcE1dsa",
+                            }
+                        ],
+
+                        "count": ...,
+
+                        "stats": ...,
+
+                        "column_stats": ...,
+
+                        "next_cursor": ...,
+
+                        "metadata": {
+                            "dynamic_operator": "IN",
+                            ...
+                        }
                     }
                 },
                 ...
] } + ``` + + Returns: + A list of server outputs objects. """ result: list[MaskServerOutput] = [] - for item in response.get("items", []): - """ - Case on whether operator is ERROR or not - If ERROR, then response_case is unsupported and payload is None - Otherwise, call self.get_server_response(operator) to get the enum, store in a variable, then case on this variable to obtain the payload (use item.get("materialization", {}).get("values", []) if it is IN_ARRAY or NOT_IN_ARRAY, otherwise None) - """ + for item in response_dict.get("items", []): + # Case on whether operator is ERROR or not. + # If ERROR, then response_case is unsupported and payload is None. + # Otherwise, call self.get_server_response(operator) to get the + # enum, store in a variable, then case on this variable to obtain + # the payload. if item.get("result") == "ERROR": result.append( MaskServerOutput( @@ -242,24 +338,49 @@ def generate_result(self, response: dict) -> list[MaskServerOutput]: ) ) else: - materialization: dict = item.get("materialization", {}) - response_case: MaskServerResponse = self.get_server_response_case( - materialization.get("operator", "ERROR") - ) - - payload: Any = None - - if response_case in ( - MaskServerResponse.IN_ARRAY, - MaskServerResponse.NOT_IN_ARRAY, - ): - payload = materialization.get("values", []) + response: dict = item["response"] + if response.get("records", None) is None: + # In this case, it was a dry-run, and use a dummy value to + # indicate that it was successful. + result.append( + MaskServerOutput( + response_case=MaskServerResponse.IN_ARRAY, + payload=None, + ) + ) + else: + # In this case, parse the response normally. + response_case: MaskServerResponse = self.get_server_response_case( + response["metadata"] + ) - result.append( - MaskServerOutput( - response_case=response_case, - payload=payload, + payload: Any = None + + if response_case in ( + MaskServerResponse.IN_ARRAY, + MaskServerResponse.NOT_IN_ARRAY, + ): + # If the response is an IN_ARRAY or NOT_IN_ARRAY, + # extract all the records to get the cell encrypted + # values, and decode them from base64. + payload = [] + for record in response.get("records", []): + record_raw = record["cell_encrypted"] + if isinstance(record_raw, str): + padded = ( + record_raw + "=" * (4 - len(record_raw) % 4) + if len(record_raw) % 4 + else record_raw + ) + payload.append(base64.b64decode(padded).decode("utf-8")) + else: + payload.append(record_raw) + + result.append( + MaskServerOutput( + response_case=response_case, + payload=payload, + ) ) - ) return result diff --git a/pydough/mask_server/mask_server_candidate_visitor.py b/pydough/mask_server/mask_server_candidate_visitor.py new file mode 100644 index 000000000..98be67a72 --- /dev/null +++ b/pydough/mask_server/mask_server_candidate_visitor.py @@ -0,0 +1,784 @@ +""" +Logic for the visitor that is run across all expressions to identify candidates +for Mask Server rewrite conversion. 
+""" + +__all__ = ["MaskServerCandidateVisitor"] + +import datetime +import re + +import pydough.pydough_operators as pydop +from pydough.relational import ( + CallExpression, + ColumnReference, + CorrelatedReference, + LiteralExpression, + RelationalExpression, + RelationalExpressionVisitor, + WindowCallExpression, +) +from pydough.sqlglot.transform_bindings.sqlglot_transform_utils import ( + DateTimeUnit, + current_ts_pattern, + offset_pattern, + trunc_pattern, +) +from pydough.types import UnknownType + + +class MaskServerCandidateVisitor(RelationalExpressionVisitor): + """ + A relational expression visitor that identifies candidate expressions for + Mask Server rewrite conversion, and stores them in a candidate pool for + later processing by a `MaskServerRewriteShuttle`. The candidate pool + contains expressions with the following criteria, including both + atomic instances of the patterns, and larger expressions that contain + these patterns as sub-expressions: + 1. An expression that contains exactly one unique unmasking operator (i.e. a + `MaskedExpressionFunctionOperator` with `is_unmask=True`). The contents + of the unmasking operator can be any valid expression. + 2. Literals are allowed anywhere in the expression. + 3. No other expressions are allowed (outside the contents of the unmasking + operator) except for function calls used to combine other valid + expressions, where the function calls must be one of the operators + supported by the Mask Server (see `OPERATORS_TO_SERVER_NAMES`, as well as + the `ISIN` operator). + """ + + OPERATORS_TO_SERVER_NAMES: dict[pydop.PyDoughExpressionOperator, str] = { + pydop.BAN: "AND", + pydop.BOR: "OR", + pydop.NOT: "NOT", + pydop.EQU: "EQUAL", + pydop.NEQ: "NOT_EQUAL", + pydop.GRT: "GT", + pydop.GEQ: "GTE", + pydop.LET: "LT", + pydop.LEQ: "LTE", + pydop.STARTSWITH: "STARTSWITH", + pydop.ENDSWITH: "ENDSWITH", + pydop.CONTAINS: "CONTAINS", + pydop.LIKE: "LIKE", + pydop.LOWER: "LOWER", + pydop.UPPER: "UPPER", + pydop.YEAR: "YEAR", + pydop.QUARTER: "QUARTER", + pydop.MONTH: "MONTH", + pydop.DAY: "DAY", + pydop.HOUR: "HOUR", + pydop.MINUTE: "MINUTE", + pydop.SECOND: "SECOND", + pydop.ADD: "ADD", + pydop.SUB: "SUB", + pydop.MUL: "MUL", + pydop.DIV: "DIV", + pydop.ABS: "ABS", + pydop.SMALLEST: "LEAST", + pydop.LARGEST: "GREATEST", + pydop.DEFAULT_TO: "COALESCE", + pydop.IFF: "IFF", + } + """ + A mapping of all PyDough operators that can be handled by the Mask Server, + mapping each such operator to the string name used in the linear string + serialization format recognized by the Mask Server. + + Note: the following operators are handled separately: + - `ISIN` + - `SLICE` + - `JOIN_STRINGS` + - `DATETIME` + - `DATEDIFF` + - `MONOTONIC` + """ + + PREDICATE_OPERATORS: set[str] = { + "EQUAL", + "NOT_EQUAL", + "GT", + "GTE", + "LT", + "LTE", + "STARTSWITH", + "ENDSWITH", + "CONTAINS", + "LIKE", + "IN", + "AND", + "OR", + "NOT", + } + """ + The set of strings from `OPERATORS_TO_SERVER_NAMES` that correspond to + predicate operators. Only expressions whose outermost layer is a predicate + operator will be added to the candidate pool. This also includes other + operators from the mask server not used by `OPERATORS_TO_SERVER_NAMES` but + that are used by special handling cases, like how the `ISIN` operator + in PyDough becomes the `IN` operator in the mask server. 
+ """ + + SERVER_OPERATOR_NAMES: set[str] = { + *OPERATORS_TO_SERVER_NAMES.values(), + "NOT_IN", + "SLICE", + "CONCAT", + "DATETIME", + "DATEDIFF", + "DATETRUNC", + "REGEXP", + } + """ + The set of all operator names recognized by the Mask Server in its linear + serialization format, needed because when a string literal is used that + matches one of these reserved names, it must be wrapped in the QUOTE + function to avoid confusion. + """ + + def __init__(self) -> None: + self.candidate_pool: dict[ + RelationalExpression, + tuple[ + pydop.MaskedExpressionFunctionOperator, + RelationalExpression, + list[str | int | float | None | bool], + ], + ] = {} + """ + The internal datastructure used to keep track of all candidate + expressions identified during a traversal of a relational tree. Each + candidate expression maps to a tuple of: + 1. The single unmasking operator contained within the expression. + 2. The input expression that is being unmasked. + 3. The linear serialization of the entire expression as a list, where + invocations of UNMASK(input_expr) are replaced with the token + "__col__". + """ + + self.processed_candidates: set[RelationalExpression] = set() + """ + The set of all relational expressions that have already been added to + the candidate pool at least once. This is used to avoid adding the same + candidate multiple times if it is encountered multiple times during a + traversal of the relational tree, since the candidate pool will be + cleared once all of the candidates in the pool are processed in a batch + request to the mask server. + """ + + self.stack: list[ + tuple[ + tuple[pydop.MaskedExpressionFunctionOperator, RelationalExpression] + | None, + list[str | int | float | None | bool] | None, + ] + ] = [] + """ + The stack is used to keep track of information relating to + sub-expressions of the current expression. When visiting an expression, + the stack will contain one entry for each input to the expression, + where each entry is a tuple of: + 1. Either None, or the single unmasking operator and input expression + contained within the input expression, if any. + 2. Either None, or the linear serialization of the input expression as + a list, where invocations of UNMASK(input_expr) are replaced with + the token "__col__". + """ + + self.heritage_tree: dict[ + RelationalExpression, set[RelationalExpression | None] + ] = {} + """ + A mapping of each expression to its set of parent expressions in the + relational tree. `None` is also included in the set if the expression + ever appears standalone (i.e., as the root of a relational expression in + the tree). This is used later as a core part of the algorithm for + `choose_minimal_covering_set`. Each expression can map to multiple + parents since the same expression instance can appear in multiple places + within the relational tree. + """ + + self.ancestry_stack: list[RelationalExpression | None] = [None] + """ + A stack used to keep track of the ancestry of the current expression + being visited. The top of the stack is always the parent of the current + expression. This is used to build the `heritage_tree` mapping. + """ + + def reset(self): + self.stack.clear() + self.heritage_tree.clear() + self.ancestry_stack = [None] + + def visit_call_expression(self, expr: CallExpression) -> None: + # First, recursively visit all of the inputs to the function call, then + # extract the data from the stack to determine whether this expression + # is a candidate for Mask Server rewrite conversion. 
Reverse the order + # of the stack entries since they were pushed in order of visitation, + # but need to be processed in the original input order. + self.ancestry_stack.append(expr) + for arg in expr.inputs: + arg.accept_shuttle(self) + self.ancestry_stack.pop() + mask_ops: set[ + tuple[pydop.MaskedExpressionFunctionOperator, RelationalExpression] + ] = set() + arg_exprs: list[list[str | int | float | None | bool] | None] = [] + for _ in range(len(expr.inputs)): + stack_term, expression_list = self.stack.pop() + if stack_term is not None: + mask_ops.add(stack_term) + arg_exprs.append(expression_list) + arg_exprs.reverse() + + self.heritage_tree[expr] = self.heritage_tree.get(expr, set()) + self.heritage_tree[expr].add(self.ancestry_stack[-1]) + + input_op: pydop.MaskedExpressionFunctionOperator + input_expr: RelationalExpression + combined_exprs: list[str | int | float | None | bool] | None + + # A call in the form `UNMASK(input_expr)` is the atomic `__col__` + # expression that forms the base case for all candidate expressions, if + # the column is server-masked. + if ( + isinstance(expr.op, pydop.MaskedExpressionFunctionOperator) + and expr.op.is_unmask + and expr.op.masking_metadata.server_masked + and expr.op.masking_metadata.server_dataset_id is not None + ): + self.stack.append(((expr.op, expr.inputs[0]), ["__col__"])) + + # If there are zero unmasking operators in the inputs, or more than + # one, this expression is not a candidate. + elif len(mask_ops) != 1: + self.stack.append((None, None)) + + # Otherwise, verify that the function call operator is one that can be + # handled by the Mask Server, and if so, build the linear serialization + # for the entire expression. If it cannot be handled, return None. + else: + input_op, input_expr = mask_ops.pop() + combined_exprs = self.convert_call_to_server_expression(expr, arg_exprs) + if combined_exprs is not None and expr not in self.processed_candidates: + # Insert the expression and its corresponding data (the unmask + # operator, the input expression, and the linear serialization) + # into the candidate pool, but only if the expression's + # outermost layer is a predicate call. + if ( + len(combined_exprs) > 0 + and combined_exprs[0] in self.PREDICATE_OPERATORS + ): + self.candidate_pool[expr] = (input_op, input_expr, combined_exprs) + self.processed_candidates.add(expr) + self.stack.append(((input_op, input_expr), combined_exprs)) + + def visit_column_reference(self, column_reference: ColumnReference) -> None: + self.stack.append((None, None)) + + def visit_literal_expression(self, literal: LiteralExpression) -> None: + # Literals do not contain the UNMASK operator, but can have a linear + # serialization that can be sent to the Mask Server, so we convert the + # literal to the appropriate list format and push that onto the stack. + self.stack.append((None, self.convert_literal_to_server_expression(literal))) + + def visit_window_expression(self, window_expression: WindowCallExpression) -> None: + # Window functions cannot be sent to the mask server, but their inputs + # potentially can be. 
+ for arg in window_expression.inputs: + arg.accept_shuttle(self) + self.stack.pop() + for arg in window_expression.partition_inputs: + arg.accept_shuttle(self) + self.stack.pop() + for order in window_expression.order_inputs: + order.expr.accept_shuttle(self) + self.stack.pop() + self.stack.append((None, None)) + + def visit_correlated_reference(self, correlated_reference: CorrelatedReference): + # Correlated references cannot be sent to the mask server. + self.stack.append((None, None)) + + def convert_call_to_server_expression( + self, + call: CallExpression, + input_exprs: list[list[str | int | float | None | bool] | None], + ) -> list[str | int | float | None | bool] | None: + """ + Converts a function call to the linear serialization format recognized + by the Mask Server, using the provided list of linear serializations for + each input to the function call. If the function call cannot be + converted, returns None. + + Args: + `call`: The function call to convert. + `input_exprs`: A list of linear serializations for each input to + the function call, where each input serialization is either a + list of strings/ints/floats/bools/None, or None if the input + could not be converted. + + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the function call, or None if the function call + could not be converted. + """ + + # If the function call is an ISIN, handle it separately since it has a + # different format than the other operators, and we don't need the + # second input to be converted since it must be a literal list. + if call.op == pydop.ISIN and len(call.inputs) == 2: + return self.convert_isin_call_to_server_expression(call.inputs, input_exprs) + + # If any of the inputs were not able to be converted, return None since + # then the call cannot be converted. + if None in input_exprs: + return None + + # Dispatch to the specified conversion method for each operator that + # has dedicated logic, besides ISIN which was already handled. + match call.op: + case pydop.MONOTONIC: + return self.convert_monotonic_call_to_server_expression(input_exprs) + case pydop.SLICE: + return self.convert_slice_call_to_server_expression(input_exprs) + case pydop.JOIN_STRINGS: + return self.convert_join_strings_call_to_server_expression(input_exprs) + case pydop.DATETIME: + return self.convert_datetime_call_to_server_expression(input_exprs) + case pydop.DATEDIFF: + return self.convert_datediff_call_to_server_expression(input_exprs) + case op if op in self.OPERATORS_TO_SERVER_NAMES: + # Default handling for all the remaining operators that are + # just translated 1:1 with from `OPERATORS_TO_SERVER_NAMES`. + # First, build up the list with the first two entries: the name + # of the function call operator, and the number of inputs to the + # function call. + result: list[str | int | float | None | bool] = [] + operator_name: str = self.OPERATORS_TO_SERVER_NAMES[call.op] + result.append(operator_name) + result.append(len(call.inputs)) + # For each input to the function call, append its linear + # serialization to the result list. We know they are not None + # from the earlier check. + for inp in input_exprs: + assert inp is not None + result.extend(inp) + return result + case _: + # Any other operator is unsupported. 
+ return None + + def convert_isin_call_to_server_expression( + self, + inputs: list[RelationalExpression], + input_exprs: list[list[str | int | float | None | bool] | None], + ) -> list[str | int | float | None | bool] | None: + """ + Converts a relational expression for an ISIN call into the linear + serialization list format recognized by the Mask Server, using the + provided list of linear serializations for the first input, versus a + manual unfolding of the second input which must be a literal list. + + Args: + `inputs`: The two inputs to the ISIN call. + `input_exprs`: A list of linear serializations for each input to + the ISIN call, where each input serialization is either a + list of strings/ints/floats/bools/None, or None if the input + could not be converted. + """ + if len(inputs) != 2: + raise ValueError("ISIN operator requires exactly two inputs.") + + # Start the output list with the operator name. If the first input + # could not be converted, return None. + if input_exprs[0] is None: + return None + assert isinstance(inputs[1], LiteralExpression) and isinstance( + inputs[1].value, (list, tuple) + ), "ISIN right-hand side must be a list or tuple literal." + + # Unfold the second input, which must be a literal list, into the + # output list. If any element of the list cannot be converted, return + # None. + in_list: list[str | int | float | None | bool] = [] + for v in inputs[1].value: + literal_list: list[str | int | float | None | bool] | None = ( + self.convert_literal_to_server_expression( + LiteralExpression(v, UnknownType()) + ) + ) + if literal_list is None: + return None + in_list.extend(literal_list) + + # The result list is: + # 1. The operator name "IN" + # 2. The total number of arguments, including the element to check + # versus the number of elements in the list. + # 3. The linear serialization of the first input expression. + # 4. The unfolded elements of the literal list from the second input. + result: list[str | int | float | None | bool] = ["IN"] + result.append(len(inputs[1].value) + 1) + result.extend(input_exprs[0]) + result.extend(in_list) + return result + + def convert_monotonic_call_to_server_expression( + self, input_exprs: list[list[str | int | float | None | bool] | None] + ) -> list[str | int | float | None | bool] | None: + """ + Converts a PyDough MONOTONIC operation to the linear serialization + format recognized by the Mask Server. MONOTONIC(a, b, c) is converted to + be equivalent to `(a <= b) AND (b <= c)`. + + Args: + `input_exprs`: A list of linear serializations for each input to + the MONOTONIC call, where each input serialization is either a + list of strings/ints/floats/bools/None, or None if the input + could not be converted. + + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the MONOTONIC operation, or None if the MONOTONIC + operation could not be converted. + """ + assert len(input_exprs) == 3, ( + "MONOTONIC operator requires exactly three inputs." 
+ ) + if input_exprs[0] is None or input_exprs[1] is None or input_exprs[2] is None: + return None + arg0: list[str | int | float | None | bool] = input_exprs[0] + arg1: list[str | int | float | None | bool] = input_exprs[1] + arg2: list[str | int | float | None | bool] = input_exprs[2] + return ["AND", 2, "LTE", 2, *arg0, *arg1, "LTE", 2, *arg1, *arg2] + + def convert_slice_call_to_server_expression( + self, input_exprs: list[list[str | int | float | None | bool] | None] + ) -> list[str | int | float | None | bool] | None: + """ + Attempts to convert a PyDough SLICE operation to the linear + serialization format recognized by the Mask Server. This requires + converting the slice from Python form `input_expr[start:stop:step]` to + the more SQL-like form `SUBSTRING(input_expr, start, length)`, but + still using 0-based indexing for start (just like Python). + + Args: + `input_exprs`: A list of linear serializations for each input to + the SLICE call, where each input serialization is either a + list of strings/ints/floats/bools/None, or None if the input + could not be converted. + + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the SLICE operation, or None if the SLICE + operation could not be converted. + """ + assert len(input_exprs) == 4, "SLICE operator requires exactly four inputs." + # Start by building the output list with the operator name, the number + # of arguments (3), and the linear serialization of the input + # expression. If the input expression could not be converted, return + # None. + result: list[str | int | float | None | bool] = ["SLICE", 3] + if input_exprs[0] is None: + return None + result.extend(input_exprs[0]) + + # Attempt to extract the start, stop, and step values from the remaining + # arguments to the slice operation and convert them to start vs length. + # For now, only supports the form where step is 1, and start/stop are + # both positive integer literals, with stop > start. Alternatively, + # allows taking a prefix since that case is similarly well defined. + start_int: int + length_int: int + start_literal = input_exprs[1] + stop_literal = input_exprs[2] + step_literal = input_exprs[3] + if ( + start_literal is None + or stop_literal is None + or len(start_literal) != 1 + or len(stop_literal) != 1 + or step_literal not in ([1], ["NULL"]) + ): + return None + match (start_literal[0], stop_literal[0]): + case (int(start), int(stop)) if start >= 0 and stop > start: + start_int = start + length_int = stop - start + case ("NULL", int(stop)) if stop > 0: + start_int = 0 + length_int = stop + case _: + return None + + result.append(start_int) + result.append(length_int) + return result + + def convert_join_strings_call_to_server_expression( + self, input_exprs: list[list[str | int | float | None | bool] | None] + ) -> list[str | int | float | None | bool] | None: + """ + Converts the JOIN_STRINGS PyDough operator to an equivalent variadic + CONCAT operation in the linear serialization format recognized by + the Mask Server: + + `JOIN_STRINGS('', a, b, c)` becomes `CONCAT(3, a, b, c)` + `JOIN_STRINGS(s, a, b, c)` becomes `CONCAT(5, a, s, b, s, c)` + + Args: + `input_exprs`: A list of linear serializations for each input to + the JOIN_STRINGS call, where each input serialization is either a + list of strings/ints/floats/bools/None, or None if the input + could not be converted. The first input is the delimiter + expression, and each subsequent input is a string expression to + be joined. 
+ + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the JOIN_STRINGS operation, or None if the + JOIN_STRINGS operation could not be converted. + """ + assert len(input_exprs) >= 3, ( + "JOIN_STRINGS operator requires at least three inputs." + ) + # If the delimiter expression could not be converted, return None. + delimiter_expr: list[str | int | float | None | bool] | None = input_exprs[0] + if delimiter_expr is None: + return None + + # Start building the result list with the operator name. + result: list[str | int | float | None | bool] = ["CONCAT"] + + # If the delimiter is the empty string, then the number of arguments + # is simply the number of input expressions minus one (the delimiter), + # and all of the remaining arguments should just be appended directly. + remaining_args: list[list[str | int | float | None | bool] | None] = ( + input_exprs[1:] + ) + if delimiter_expr == [""]: + result.append(len(remaining_args)) + for expr in remaining_args: + if expr is None: + return None + result.extend(expr) + return result + + # Otherwise, the remaining arguments are interleaved with the delimiter. + result.append(2 * len(remaining_args) - 1) + for i, expr in enumerate(remaining_args): + if expr is None: + return None + result.extend(expr) + if i < len(remaining_args) - 1: + result.extend(delimiter_expr) + return result + + def convert_datetime_call_to_server_expression( + self, input_exprs: list[list[str | int | float | None | bool] | None] + ) -> list[str | int | float | None | bool] | None: + """ + Attempts to convert a PyDough DATETIME operation to the linear + serialization format recognized by the Mask Server. The DATETIME + operation is treated as a series of transformations on an initial + input expression, where each transformation is either a truncation + (DATETRUNC) or an addition (DATEADD). + + Args: + `input_exprs`: A list of linear serializations for each input to + the DATETIME call, where each input serialization is either a + list of strings/ints/floats/bools/None, or None if the input + could not be converted. The first input is the seed expression, + and each subsequent input is a string representing either a + truncation or addition operation. + + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the DATETIME operation, or None if the DATETIME + operation could not be converted. + """ + # Skip cases where DATETIME is called on an argument just to cast it. + if len(input_exprs) < 2: + return None + + # Start with the input argument, then iteratively apply each phase of + # the transformation with DATETIME as either a truncation or addition. + # Reject if the seed is a literal indicating the current timestamp. + result: list[str | int | float | None | bool] + if input_exprs[0] is None or ( + len(input_exprs[0]) == 1 + and isinstance(input_exprs[0][0], str) + and current_ts_pattern.fullmatch(input_exprs[0][0]) + ): + return None + else: + result = input_exprs[0] + for arg in input_exprs[1:]: + if arg is None or len(arg) != 1 or not isinstance(arg[0], str): + return None + # Use regex to determine if this is a truncation or addition, + # and dispatch to the appropriate conversion method. If it is + # neither, or the conversion method failed, return None. + # Otherwise, the result becomes the new input to the next phase. 
+ trunc_match: re.Match | None = trunc_pattern.fullmatch(arg[0]) + offset_match: re.Match | None = offset_pattern.fullmatch(arg[0]) + new_result: list[str | int | float | None | bool] | None = None + if trunc_match is not None: + new_result = self.convert_datetrunc_call_to_server_expression( + result, str(trunc_match.group(1)) + ) + elif offset_match is not None: + new_result = self.convert_dateadd_call_to_server_expression( + result, + str(offset_match.group(1)), + int(offset_match.group(2)), + str(offset_match.group(3)), + ) + if new_result is None: + return None + result = new_result + + return result + + def convert_datetrunc_call_to_server_expression( + self, input_expr: list[str | int | float | None | bool], unit_str: str + ) -> list[str | int | float | None | bool] | None: + """ + Attempt to convert a DATETRUNC call to the linear serialization format + recognized by the Mask Server. + + Args: + `input_expr`: A linear serialization for the input to the + DATETRUNC call, as a list of strings/ints/floats/bools/None. + `unit_str`: The string representing the unit to truncate to. + + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the DATETRUNC operation, or None if the DATETRUNC + operation could not be converted. + """ + unit: DateTimeUnit | None = DateTimeUnit.from_string(unit_str) + # Reject if the unit is not recognized, or is a WEEK (for now). + if unit is None or unit == DateTimeUnit.WEEK: + return None + result: list[str | int | float | None | bool] = ["DATETRUNC", 2] + result.append(unit.value) + result.extend(input_expr) + return result + + def convert_dateadd_call_to_server_expression( + self, + input_expr: list[str | int | float | None | bool], + sign_str: str, + amount: int, + unit_str: str, + ) -> list[str | int | float | None | bool] | None: + """ + Attempt to convert a DATEADD call to the linear serialization format + recognized by the Mask Server. + + Args: + `input_expr`: A linear serialization for the input to the + DATEADD call, as a list of strings/ints/floats/bools/None. + `sign_str`: The string representing the sign of the amount to add ( + either "+", "-", or "", with empty being the same as "+"). + `amount`: The integer amount to add (can be negative). + `unit_str`: The string representing the unit to add. + + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the DATEADD operation, or None if the DATEADD + operation could not be converted. + """ + unit: DateTimeUnit | None = DateTimeUnit.from_string(unit_str) + if unit is None or unit == DateTimeUnit.WEEK: + return None + result: list[str | int | float | None | bool] = ["DATEADD", 3] + if sign_str == "-": + amount = -amount + result.append(amount) + result.append(unit.value + "s") + result.extend(input_expr) + return result + + def convert_datediff_call_to_server_expression( + self, input_exprs: list[list[str | int | float | None | bool] | None] + ) -> list[str | int | float | None | bool] | None: + """ + Attempt to convert a DATEDIFF call to the linear serialization format + recognized by the Mask Server. The datediff is transformed by having + its first argument, the units, normalized into one of the following: + - "years" + - "quarters" + - "months" + - "days" + - "hours" + - "minutes" + - "seconds" + + Weeks are ignored for now. 
+ + Args: + `input_exprs`: A list of linear serializations for each input to + the DATEDIFF call, where each input serialization is either a + list of strings/ints/floats/bools/None, or None if the input + could not be converted. + + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the DATEDIFF operation, or None if the DATEDIFF + operation could not be converted. + """ + result: list[str | int | float | None | bool] = ["DATEDIFF", 3] + assert len(input_exprs) == 3, "DATEDIFF operator requires exactly three inputs." + + # Extract and normalize the unit argument, rejecting weeks for now. + unit_expr = input_exprs[0] + if ( + unit_expr is None + or len(unit_expr) != 1 + or not isinstance(unit_expr[0], str) + ): + return None + unit: DateTimeUnit | None = DateTimeUnit.from_string(unit_expr[0]) + if unit is None or unit == DateTimeUnit.WEEK: + return None + result.append(unit.value + "s") + + # Append the linear serializations for the start and end expressions. + start_expr = input_exprs[1] + end_expr = input_exprs[2] + if start_expr is None or end_expr is None: + return None + result.extend(start_expr) + result.extend(end_expr) + return result + + def convert_literal_to_server_expression( + self, literal: LiteralExpression + ) -> list[str | int | float | None | bool] | None: + """ + Converts a literal expression to the linear serialization format + recognized by the Mask Server. If the literal cannot be converted, + returns None. + + Args: + `literal`: The literal expression to convert. + + Returns: + A list of strings/ints/floats/bools/None representing the linear + serialization of the literal, or None if the literal could not be + converted. + """ + if literal.value is None: + return ["NULL"] + elif isinstance(literal.value, bool): + return ["TRUE" if literal.value else "FALSE"] + elif isinstance(literal.value, (int, float, str)): + return [literal.value] + elif isinstance(literal.value, datetime.datetime): + return [literal.value.strftime("%Y-%m-%d %H:%M:%S")] + elif isinstance(literal.value, datetime.date): + return [literal.value.isoformat()] + else: + return None diff --git a/pydough/mask_server/mask_server_rewrite_shuttle.py b/pydough/mask_server/mask_server_rewrite_shuttle.py new file mode 100644 index 000000000..1744608ec --- /dev/null +++ b/pydough/mask_server/mask_server_rewrite_shuttle.py @@ -0,0 +1,323 @@ +""" +Logic for the shuttle that performs Mask Server rewrite conversion on candidates +identified by the candidate visitor. +""" + +__all__ = ["MaskServerRewriteShuttle"] + +import pydough.pydough_operators as pydop +from pydough.relational import ( + CallExpression, + LiteralExpression, + RelationalExpression, + RelationalExpressionShuttle, +) +from pydough.types import ArrayType, BooleanType, UnknownType + +from .mask_server import ( + MaskServerInfo, + MaskServerInput, + MaskServerOutput, + MaskServerResponse, +) +from .mask_server_candidate_visitor import MaskServerCandidateVisitor +from .min_cover_set import choose_minimal_covering_set + + +class MaskServerRewriteShuttle(RelationalExpressionShuttle): + """ + A shuttle that rewrites candidate expressions for Mask Server conversion + identified by a `MaskServerCandidateVisitor`, by batching requests to the + Mask Server and replacing the candidate expressions with the appropriate + responses from the server. 
+ """ + + def __init__( + self, server_info: MaskServerInfo, candidate_visitor: MaskServerCandidateVisitor + ) -> None: + self.server_info: MaskServerInfo = server_info + self.candidate_visitor: MaskServerCandidateVisitor = candidate_visitor + self.responses: dict[RelationalExpression, RelationalExpression | None] = {} + """ + A mapping of relational expressions from the candidate visitor that have + been processed by the Mask Server. Each expression maps to either None + (if the server could not handle it) or the rewritten expression based on + the outcome of the server request. + """ + + def visit_call_expression(self, expr: CallExpression) -> RelationalExpression: + # If this expression is in the candidate pool, process all of the + # candidates in the pool in a batch sent to the Mask Server. The + # candidate pool will then be cleared, preventing duplicate processing + # of the same expression. The responses will be stored in self.responses + # for later lookup. + if expr in self.candidate_visitor.candidate_pool: + self.process_batch() + + # If a Mask Server response has been stored for this expression, + # utilize it to convert the expression to its simplified form. + response: RelationalExpression | None = self.responses.get(expr, None) + if response is not None: + return response + + # Otherwise, use the regular process to recursively transform the inputs + # to the function call. + return super().visit_call_expression(expr) + + def process_batch(self) -> None: + """ + Invokes the logic to dump the contents of the candidate pool to the + Mask Server in a single batch, and process the responses to store them + in self.responses for later lookup. + """ + batch: list[MaskServerInput] = [] + ancillary_info: list[tuple[RelationalExpression, RelationalExpression]] = [] + + # Loop over every candidate in the pool, building up the batch request + # by adding the MaskServerInput for each candidate, and storing the + # tuple of the original expression and the underlying input that is + # being unmasked for later use when processing the response. The two + # lists, the batch and ancillary info, remain in sync by index so they + # can be zipped together later. + for expr, ( + mask_op, + input_expr, + expression_list, + ) in self.candidate_visitor.candidate_pool.items(): + ancillary_info.append((expr, input_expr)) + assert mask_op.masking_metadata.server_masked + assert mask_op.masking_metadata.server_dataset_id is not None + batch.append( + MaskServerInput( + dataset_id=mask_op.masking_metadata.server_dataset_id, + table_path=mask_op.table_path, + column_name=mask_op.masking_metadata.column_name, + expression=expression_list, + ) + ) + self.candidate_visitor.processed_candidates.add(expr) + + # Wipe the candidate pool to prevent duplicate processing, since every + # candidate already in the pool has now been handled. + self.candidate_visitor.candidate_pool.clear() + + # First, send the dry response batch to the Mask Server to identify + # which predicates can be re-written. + dry_run_results: list[MaskServerOutput] = ( + self.server_info.simplify_simple_expression_batch(batch, True) + ) + + batch, ancillary_info = self.identify_predicates_to_send( + dry_run_results, + batch, + ancillary_info, + heritage_tree=self.candidate_visitor.heritage_tree, + ) + self.candidate_visitor.heritage_tree.clear() + + # Abort if the batch is now empty after filtering. + if len(batch) == 0: + return + + # Send the batch to the Mask Server, and process each response + # alongside the ancillary info. 
Afterwards, self.responses should + # contain an entry for every candidate that was in the pool, mapping it + # to None in the case of failure, or the rewritten expression in the + # case of success. + responses: list[MaskServerOutput] = ( + self.server_info.simplify_simple_expression_batch(batch, False) + ) + assert len(responses) == len(ancillary_info) + for (expr, input_expr), response in zip(ancillary_info, responses): + if response.response_case != MaskServerResponse.UNSUPPORTED: + self.responses[expr] = self.convert_response_to_relational( + input_expr, response + ) + else: + self.responses[expr] = None + + def identify_predicates_to_send( + self, + dry_run_results: list[MaskServerOutput], + batch: list[MaskServerInput], + ancillary_info: list[tuple[RelationalExpression, RelationalExpression]], + heritage_tree: dict[RelationalExpression, set[RelationalExpression | None]], + ) -> tuple[ + list[MaskServerInput], list[tuple[RelationalExpression, RelationalExpression]] + ]: + """ + Takes in the results of a dry run to the Mask Server, and identifies + which predicates should actually be sent to the server for processing in + order to minimize the total number of requests while still ensuring + that all necessary predicates are covered. + + Args: + `dry_run_results`: The results from the dry run to the Mask Server. + `batch`: The original batch of Mask Server inputs sent in the dry + run. + `ancillary_info`: The original ancillary info sent in the dry run. + `heritage_tree`: A mapping of each expression to its set of parent + expressions in the relational tree. `None` is also included in the + set if the expression ever appears standalone without a parent. + + Returns: + A tuple containing the new batch of Mask Server inputs to send, and + the new ancillary info corresponding to that batch. + """ + # Extract the underlying expressions from the ancillary info, and + # identify the indices of the expressions that were successful in the + # dry run by checking the response cases. + expressions: list[RelationalExpression] = [expr for expr, _ in ancillary_info] + successes: list[int] = [ + idx + for idx, result in enumerate(dry_run_results) + if result.response_case != MaskServerResponse.UNSUPPORTED + ] + + # Run the algorithm to identify the indices of which successful dry run + # responses from the list should be kept. + keep_idxs: set[int] = choose_minimal_covering_set( + expressions, successes, heritage_tree + ) + + # Build the new batch and ancillary info lists by filtering to only + # those indices. + new_batch: list[MaskServerInput] = [ + elem for idx, elem in enumerate(batch) if idx in keep_idxs + ] + new_ancillary_info: list[tuple[RelationalExpression, RelationalExpression]] = [ + anc_elem for idx, anc_elem in enumerate(ancillary_info) if idx in keep_idxs + ] + return new_batch, new_ancillary_info + + def convert_response_to_relational( + self, input_expr: RelationalExpression, response: MaskServerOutput + ) -> RelationalExpression | None: + """ + Takes in the original input expression that is being unmasked within + a larger candidate expression for Mask Server rewrite, as well as the + response from the Mask Server, and converts it to a relational + expression that can be used to replace the original candidate + expression. + + Args: + `input_expr`: The original input expression that is being unmasked. + `response`: The response from the Mask Server for the candidate. + + Returns: + A relational expression that can be used to replace the original + candidate expression. 
Alternatively, returns None if the response + could not be converted (e.g. it is a pattern PyDough does not yet + support). + """ + result: RelationalExpression + match response.response_case: + case MaskServerResponse.IN_ARRAY | MaskServerResponse.NOT_IN_ARRAY: + result = self.build_in_array_expression(input_expr, response) + case _: + return None + return result + + def build_in_array_expression( + self, input_expr: RelationalExpression, response: MaskServerOutput + ) -> RelationalExpression: + """ + Implements the logic of `convert_response_to_relational` specifically + for the case where the Mask Server response indicates that the original + expression, containing the input expression, can be replaced with an + IN or NOT IN expression with a list of literals. + + Args: + `input_expr`: The original input expression that is being unmasked. + `response`: The response from the Mask Server for the candidate. + This response is assumed to be of type IN_ARRAY or NOT_IN_ARRAY. + + Returns: + A relational expression that can be used to replace the original + candidate expression. + """ + assert response.response_case in ( + MaskServerResponse.IN_ARRAY, + MaskServerResponse.NOT_IN_ARRAY, + ) + assert isinstance(response.payload, list) + # Extract the list of literals from the response payload. If the list + # contains a NULL, remove it since SQL IN lists cannot contain NULLs, + # then mark it as such so we can add the null check later. + in_list: list = response.payload + contains_null: bool = None in in_list + while None in in_list: + in_list.remove(None) + result: RelationalExpression + if len(in_list) == 0: + # If the payload is empty, we can return a literal true/false + # depending on whether it is IN or NOT IN. If there was a null, then + # instead we just check if the expression is/isn't null. + if contains_null: + result = CallExpression( + pydop.ABSENT + if response.response_case == MaskServerResponse.IN_ARRAY + else pydop.PRESENT, + BooleanType(), + [input_expr], + ) + else: + result = LiteralExpression( + response.response_case == MaskServerResponse.NOT_IN_ARRAY, + BooleanType(), + ) + elif len(in_list) == 1: + # If the payload has one element, we can return a simple equality + # or inequality, depending on whether it is IN or NOT IN. + result = CallExpression( + pydop.EQU + if response.response_case == MaskServerResponse.IN_ARRAY + else pydop.NEQ, + BooleanType(), + [ + input_expr, + LiteralExpression(in_list[0], UnknownType()), + ], + ) + else: + # Otherwise, we need to return an ISIN expression with an array + # literal, and if doing NOT IN then negate the whole thing. + array_literal: LiteralExpression = LiteralExpression( + in_list, ArrayType(UnknownType()) + ) + result = CallExpression( + pydop.ISIN, BooleanType(), [input_expr, array_literal] + ) + if response.response_case == MaskServerResponse.NOT_IN_ARRAY: + result = CallExpression(pydop.NOT, BooleanType(), [result]) + + # If the original payload contained a NULL, we need to add an extra + # check to the result to account for that, since SQL IN lists cannot + # contain NULLs. + # - If the list is empty after removing nulls, then the present/absent + # check has already been added. + # - Otherwise, if doing IN -> `ABSENT(x) OR ISIN(x, ...)`. + # - Otherwise, if doing NOT_IN -> `PRESENT(x) AND NOT(ISIN(x, ...))`. 
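+ # Illustrative example (values are hypothetical, not from the server):
+ # with payload [None, "a", "b"], an IN_ARRAY response becomes
+ # `ABSENT(x) OR ISIN(x, ["a", "b"])`, while a NOT_IN_ARRAY response
+ # becomes `PRESENT(x) AND NOT(ISIN(x, ["a", "b"]))`.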
+ if contains_null and len(in_list) > 0: + null_op: pydop.PyDoughExpressionOperator = ( + pydop.ABSENT + if response.response_case == MaskServerResponse.IN_ARRAY + else pydop.PRESENT + ) + bool_op: pydop.PyDoughExpressionOperator = ( + pydop.BOR + if response.response_case == MaskServerResponse.IN_ARRAY + else pydop.BAN + ) + is_null_check: CallExpression = CallExpression( + null_op, + BooleanType(), + [input_expr], + ) + result = CallExpression( + bool_op, + BooleanType(), + [is_null_check, result], + ) + + return result diff --git a/pydough/mask_server/min_cover_set.py b/pydough/mask_server/min_cover_set.py new file mode 100644 index 000000000..bc544b4e9 --- /dev/null +++ b/pydough/mask_server/min_cover_set.py @@ -0,0 +1,89 @@ +""" +Logic for choosing the minimal set of expressions out of a list such that only +expressions marked as "successful" are included, and every expression from the +list is either included or has an ancestor that is included. +""" + +__all__ = ["choose_minimal_covering_set"] + +from pydough.relational import RelationalExpression + + +def choose_minimal_covering_set( + expressions: list[RelationalExpression], + successful_idxs: list[int], + heritage_tree: dict[RelationalExpression, set[RelationalExpression | None]], +) -> set[int]: + """ + Identifies the minimal set of indices from `successful_idxs` such that every + expression in `expressions` is either included in the set or has an ancestor + that is included. + + Args: + `expressions`: The list of expressions to cover. + `successful_idxs`: The list of indices into `expressions` that are + marked as successful. + `heritage_tree`: A mapping of each expression to its set of parent + expressions in the relational tree. `None` is also included in the set + if the expression ever appears standalone (i.e., as the root of a + relational expression in the tree). Each expression maps to a set since + an expression can appear in multiple places within the relational tree. + + Returns: + The set of indices from `successful_idxs` that form the minimal covering + set. + """ + + # Build the following datastructures: + # 1. Set of expressions that are marked as successful. + # 2. Set of expressions that are not needed (i.e., every ancestor is either + # included in the answer, or is also not needed). + # 3. Set of expressions to include in the final answer. + # 4. Set of expressions already visited during traversal (to ensure dynamic + # programming principles are upheld to avoid redundant work). + supported: set[RelationalExpression] = {expressions[idx] for idx in successful_idxs} + not_needed: set[RelationalExpression] = set() + include: set[RelationalExpression] = set() + visited: set[RelationalExpression] = set() + + # Run a DFS traversal for each expression, walking through the full forest + # from `expressions`. + def traverse(expr: RelationalExpression): + # Abort if already visited, then mark the node as visited. + if expr in visited: + return + visited.add(expr) + + # Extract all parents of the expression from the heritage tree. A + # `None` parent indicates that the current expression appears + # standalone. For each non-None parent, traverse it recursively. The + # expression starts out as unecessary, but loses that distinction if + # any of the parents indicate otherwise. 
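+ # For example (hypothetical): if `YEAR(x) == 1970` only ever appears
+ # inside the parent `(YEAR(x) == 1970) AND (MONTH(x) == 6)` and that
+ # parent is supported, the child predicate is not needed on its own.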
+ parents: set[RelationalExpression | None] = heritage_tree.get(expr, {None}) + unnecessary: bool = True + for parent in parents: + if parent is not None: + traverse(parent) + + # The expression loses its unecessary distinction if it appears + # standalone, or if any of its parents are simultaneously + # unsupported and necessary. + if parent is None or (parent not in supported and parent not in not_needed): + unnecessary = False + # If the current expression loses the unnecessary distinction, + # add it in the inclusion set, but only if it is supported. + if expr in supported: + include.add(expr) + + # If the expression was marked as unnecessary, add it to the + # `not_needed` set. + if unnecessary: + not_needed.add(expr) + + for expr in expressions: + traverse(expr) + + # Return the set of indices from `successful_idxs` that correspond to + # expressions that were placed in `include` during the DFS forest run. + result: set[int] = {idx for idx in successful_idxs if expressions[idx] in include} + return result diff --git a/pydough/metadata/properties/masked_table_column_metadata.py b/pydough/metadata/properties/masked_table_column_metadata.py index 500d9857c..9c7d4e577 100644 --- a/pydough/metadata/properties/masked_table_column_metadata.py +++ b/pydough/metadata/properties/masked_table_column_metadata.py @@ -36,6 +36,7 @@ class MaskedTableColumnMetadata(TableColumnMetadata): "protect protocol", "unprotect protocol", "server masked", + "server dataset id", } def __init__( @@ -48,6 +49,7 @@ def __init__( unprotect_protocol: str, protect_protocol: str, server_masked: bool, + server_dataset_id: str | None, sample_values: list | None, description: str | None, synonyms: list[str] | None, @@ -67,6 +69,7 @@ def __init__( self._unprotect_protocol: str = unprotect_protocol self._protect_protocol: str = protect_protocol self._server_masked: bool = server_masked + self._server_dataset_id: str | None = server_dataset_id @property def unprotected_data_type(self) -> PyDoughType: @@ -101,6 +104,14 @@ def server_masked(self) -> bool: """ return self._server_masked + @property + def server_dataset_id(self) -> str | None: + """ + Returns the dataset ID to use when querying the mask server for this + column, if any. 
+ """ + return self._server_dataset_id + @staticmethod def create_error_name(name: str, collection_error_name: str) -> str: return f"masked table column property {name!r} of {collection_error_name}" @@ -112,6 +123,7 @@ def components(self) -> list: comp.append(self.unprotect_protocol) comp.append(self.protect_protocol) comp.append(self.server_masked) + comp.append(self.server_dataset_id) return comp @staticmethod @@ -164,6 +176,12 @@ def parse_from_json( if "server masked" in property_json: server_masked = extract_bool(property_json, "server masked", error_name) + server_dataset_id: str | None = None + if "server dataset id" in property_json: + server_dataset_id = extract_string( + property_json, "server dataset id", error_name + ) + NoExtraKeys(MaskedTableColumnMetadata.allowed_fields).verify( property_json, error_name ) @@ -178,6 +196,7 @@ def parse_from_json( unprotect_protocol, protect_protocol, server_masked, + server_dataset_id, None, None, None, diff --git a/pydough/pydough_operators/expression_operators/masked_expression_function_operator.py b/pydough/pydough_operators/expression_operators/masked_expression_function_operator.py index 905361348..211a55c90 100644 --- a/pydough/pydough_operators/expression_operators/masked_expression_function_operator.py +++ b/pydough/pydough_operators/expression_operators/masked_expression_function_operator.py @@ -29,6 +29,7 @@ class MaskedExpressionFunctionOperator(ExpressionFunctionOperator): def __init__( self, masking_metadata: MaskedTableColumnMetadata, + table_path: str, is_unmask: bool, ): # Create a dummy verifier that requires exactly one argument, since all @@ -49,6 +50,7 @@ def __init__( "UNMASK" if is_unmask else "MASK", False, verifier, deducer, False ) self._masking_metadata: MaskedTableColumnMetadata = masking_metadata + self._table_path: str = table_path self._is_unmask: bool = is_unmask @property @@ -58,6 +60,13 @@ def masking_metadata(self) -> MaskedTableColumnMetadata: """ return self._masking_metadata + @property + def table_path(self) -> str: + """ + The fully qualified SQL table path for the masked column. 
+ """ + return self._table_path + @property def is_unmask(self) -> bool: """ diff --git a/pydough/relational/__init__.py b/pydough/relational/__init__.py index 98772e97a..161a43c85 100644 --- a/pydough/relational/__init__.py +++ b/pydough/relational/__init__.py @@ -20,6 +20,7 @@ "RelationalExpression", "RelationalExpressionDispatcher", "RelationalExpressionShuttle", + "RelationalExpressionShuttleDispatcher", "RelationalExpressionVisitor", "RelationalNode", "RelationalRoot", @@ -55,6 +56,7 @@ Limit, Project, RelationalExpressionDispatcher, + RelationalExpressionShuttleDispatcher, RelationalNode, RelationalRoot, RelationalShuttle, diff --git a/pydough/relational/relational_nodes/__init__.py b/pydough/relational/relational_nodes/__init__.py index b16c6f9f1..a4e673190 100644 --- a/pydough/relational/relational_nodes/__init__.py +++ b/pydough/relational/relational_nodes/__init__.py @@ -16,6 +16,7 @@ "Limit", "Project", "RelationalExpressionDispatcher", + "RelationalExpressionShuttleDispatcher", "RelationalNode", "RelationalRoot", "RelationalShuttle", @@ -33,6 +34,9 @@ from .limit import Limit from .project import Project from .relational_expression_dispatcher import RelationalExpressionDispatcher +from .relational_expression_shuttle_dispatcher import ( + RelationalExpressionShuttleDispatcher, +) from .relational_root import RelationalRoot from .relational_shuttle import RelationalShuttle from .relational_visitor import RelationalVisitor diff --git a/pydough/relational/relational_nodes/relational_expression_shuttle_dispatcher.py b/pydough/relational/relational_nodes/relational_expression_shuttle_dispatcher.py new file mode 100644 index 000000000..1602c81cc --- /dev/null +++ b/pydough/relational/relational_nodes/relational_expression_shuttle_dispatcher.py @@ -0,0 +1,86 @@ +""" +Implementation of a visitor that works by applying a shuttle to every expression +for each node. +""" + +from pydough.relational.relational_expressions import ( + CallExpression, + RelationalExpressionShuttle, +) + +from .abstract_node import RelationalNode +from .aggregate import Aggregate +from .empty_singleton import EmptySingleton +from .filter import Filter +from .generated_table import GeneratedTable +from .join import Join +from .limit import Limit +from .project import Project +from .relational_root import RelationalRoot +from .relational_visitor import RelationalVisitor +from .scan import Scan + +__all__ = ["RelationalExpressionShuttleDispatcher"] + + +class RelationalExpressionShuttleDispatcher(RelationalVisitor): + """ + Applies some expression shuttle to each expression in the relational tree. + """ + + def __init__(self, shuttle: RelationalExpressionShuttle) -> None: + self.shuttle: RelationalExpressionShuttle = shuttle + + def reset(self) -> None: + self.shuttle.reset() + + def visit_common(self, node: RelationalNode) -> None: + """ + Applies the basic logic to transform all the expressions in a node's + column list, as well as transforming the inputs to the node. 
+ """ + self.visit_inputs(node) + for name, expr in node.columns.items(): + node.columns[name] = expr.accept_shuttle(self.shuttle) + + def visit_scan(self, scan: Scan) -> None: + self.visit_common(scan) + + def visit_join(self, join: Join) -> None: + self.visit_common(join) + join._condition = join.condition.accept_shuttle(self.shuttle) + + def visit_project(self, project: Project) -> None: + self.visit_common(project) + + def visit_filter(self, filter: Filter) -> None: + self.visit_common(filter) + filter._condition = filter.condition.accept_shuttle(self.shuttle) + + def visit_aggregate(self, aggregate: Aggregate) -> None: + self.visit_common(aggregate) + for key in aggregate.keys: + aggregate.keys[key] = aggregate.columns[key] + for agg in aggregate.aggregations: + aggregation = aggregate.aggregations[agg] + assert isinstance(aggregation, CallExpression) + aggregate.aggregations[agg] = aggregation + + def visit_limit(self, limit: Limit) -> None: + self.visit_common(limit) + limit._limit = limit.limit.accept_shuttle(self.shuttle) + for order in limit.orderings: + order.expr = order.expr.accept_shuttle(self.shuttle) + + def visit_empty_singleton(self, singleton: EmptySingleton) -> None: + pass + + def visit_generated_table(self, generated_table: GeneratedTable) -> None: + pass + + def visit_root(self, root: RelationalRoot) -> None: + self.visit_common(root) + if root.limit is not None: + root._limit = root.limit.accept_shuttle(self.shuttle) + for order in root.orderings: + order.expr = order.expr.accept_shuttle(self.shuttle) diff --git a/pydough/sqlglot/override_pushdown_predicates.py b/pydough/sqlglot/override_pushdown_predicates.py index d4fdbc810..b15f67bfe 100644 --- a/pydough/sqlglot/override_pushdown_predicates.py +++ b/pydough/sqlglot/override_pushdown_predicates.py @@ -3,9 +3,33 @@ """ from sqlglot import exp -from sqlglot.optimizer.pushdown_predicates import pushdown +from sqlglot.optimizer.normalize import normalized +from sqlglot.optimizer.pushdown_predicates import nodes_for_predicate, replace_aliases +from sqlglot.optimizer.simplify import simplify +from sqlglot.optimizer.scope import find_all_in_scope from sqlglot.optimizer.scope import build_scope +# ruff: noqa +# mypy: ignore-errors +# ruff & mypy should not try to typecheck or verify any of this + + +def contains_real_aggregate(expression) -> bool: + """ + Check if the expression contains a real aggregate function (e.g. SUM, AVG), + as opposed to MAX(a, b) which is a form of the LEAST/GREATEST function. This + is created by PyDough to account for such an edge case when pushing down + predicates. 
+ """ + for agg_expr in find_all_in_scope(expression, exp.AggFunc, bfs=True): + if ( + isinstance(agg_expr, (exp.Max, exp.Min)) + and len(agg_expr.args["expressions"]) > 0 + ): + continue + return True + return False + def pushdown_predicates(expression, dialect=None): """ @@ -71,3 +95,105 @@ def pushdown_predicates(expression, dialect=None): ) return expression + + +def pushdown(condition, sources, scope_ref_count, dialect, join_index=None): + if not condition: + return + + condition = condition.replace(simplify(condition, dialect=dialect)) + cnf_like = normalized(condition) or not normalized(condition, dnf=True) + + predicates = list( + condition.flatten() + if isinstance(condition, exp.And if cnf_like else exp.Or) + else [condition] + ) + + if cnf_like: + pushdown_cnf(predicates, sources, scope_ref_count, join_index=join_index) + else: + pushdown_dnf(predicates, sources, scope_ref_count) + + +def pushdown_cnf(predicates, sources, scope_ref_count, join_index=None): + """ + If the predicates are in CNF like form, we can simply replace each block in the parent. + """ + join_index = join_index or {} + for predicate in predicates: + for node in nodes_for_predicate(predicate, sources, scope_ref_count).values(): + if isinstance(node, exp.Join): + name = node.alias_or_name + predicate_tables = exp.column_table_names(predicate, name) + + # Don't push the predicate if it references tables that appear in later joins + this_index = join_index[name] + if all( + join_index.get(table, -1) < this_index for table in predicate_tables + ): + predicate.replace(exp.true()) + node.on(predicate, copy=False) + break + if isinstance(node, exp.Select): + predicate.replace(exp.true()) + inner_predicate = replace_aliases(node, predicate) + # PyDough Change: stop using `find_in_scope(inner_predicate, exp.AggFunc)` + # since this will fail if the predicate is MIN/MAX with 2+ args. + if contains_real_aggregate(inner_predicate): + node.having(inner_predicate, copy=False) + else: + node.where(inner_predicate, copy=False) + + +def pushdown_dnf(predicates, sources, scope_ref_count): + """ + If the predicates are in DNF form, we can only push down conditions that are in all blocks. + Additionally, we can't remove predicates from their original form. + """ + # find all the tables that can be pushdown too + # these are tables that are referenced in all blocks of a DNF + # (a.x AND b.x) OR (a.y AND c.y) + # only table a can be push down + pushdown_tables = set() + + for a in predicates: + a_tables = exp.column_table_names(a) + + for b in predicates: + a_tables &= exp.column_table_names(b) + + pushdown_tables.update(a_tables) + + conditions = {} + + # pushdown all predicates to their respective nodes + for table in sorted(pushdown_tables): + for predicate in predicates: + nodes = nodes_for_predicate(predicate, sources, scope_ref_count) + + if table not in nodes: + continue + + conditions[table] = ( + exp.or_(conditions[table], predicate) + if table in conditions + else predicate + ) + + for name, node in nodes.items(): + if name not in conditions: + continue + + predicate = conditions[name] + + if isinstance(node, exp.Join): + node.on(predicate, copy=False) + elif isinstance(node, exp.Select): + inner_predicate = replace_aliases(node, predicate) + # PyDough Change: stop using `find_in_scope(inner_predicate, exp.AggFunc)` + # since this will fail if the predicate is MIN/MAX with 2+ args. 
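+ # e.g. a GREATEST/LEAST predicate lowered as MAX(a, b) > 0 is routed to
+ # WHERE here, while a genuine aggregate such as SUM(a) > 0 still goes to
+ # HAVING (illustrative of the intent, mirroring the change in pushdown_cnf).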
+ if contains_real_aggregate(inner_predicate): + node.having(inner_predicate, copy=False) + else: + node.where(inner_predicate, copy=False) diff --git a/tests/conftest.py b/tests/conftest.py index dcc74b605..84f8d5c67 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,6 +27,7 @@ load_database_context, ) from pydough.errors import PyDoughTestingException +from pydough.mask_server import MaskServerInfo from pydough.metadata.graphs import GraphMetadata from pydough.qdag import AstNodeBuilder from tests.test_pydough_functions.tpch_outputs import ( @@ -2007,3 +2008,22 @@ def mock_server_setup(): # Cleanup after tests proc.terminate() proc.wait() + + +@pytest.fixture(scope="session") +def mock_server_info(mock_server_setup: str) -> MaskServerInfo: + """ + Returns the MaskServerInfo for the mock server. + """ + return MaskServerInfo(base_url=mock_server_setup, token=None) + + +@pytest.fixture(scope="session") +def true_mask_server_info() -> MaskServerInfo: + """ + Returns the MaskServerInfo for the true Mask server. + """ + if not os.getenv("PYDOUGH_MASK_SERVER_PATH"): + raise RuntimeError("PYDOUGH_MASK_SERVER_PATH environment variable is not set") + + return MaskServerInfo(base_url=os.environ["PYDOUGH_MASK_SERVER_PATH"], token=None) diff --git a/tests/mock_server/api_mock_server.py b/tests/mock_server/api_mock_server.py index 2a1b125d9..accef8458 100644 --- a/tests/mock_server/api_mock_server.py +++ b/tests/mock_server/api_mock_server.py @@ -8,6 +8,8 @@ Intended for use in unit and integration tests. """ +import base64 + from fastapi import Depends, FastAPI, HTTPException, Request from pydantic import BaseModel @@ -17,10 +19,12 @@ class EvaluateRequest(BaseModel): - column_reference: str + dataset_id: str + column_ref: dict[str, str] predicate: list[str | int | float | None | bool] - mode: str = "dynamic" - dry_run: bool = False + output_mode: str + mode: str + dry_run: bool class RequestPayload(BaseModel): @@ -47,19 +51,70 @@ def batch_evaluate( request: Request, payload: RequestPayload, authorized: bool = Depends(verify_token) ): responses: list[dict] = [] + successful_responses: int = 0 + # Process each item in the batch for item in payload.items: - key = (item.column_reference, tuple(item.predicate)) - materialization: dict = LOOKUP_TABLE.get(key, {}) - - response: dict = { + assert set(item.column_ref.keys()) == { + "kind", + "value", + }, f"Invalid column_reference format in mock: {item.column_ref!r}." + assert item.column_ref["kind"] == "fqn", "Only FQN kind is supported in mock." + key = (item.dataset_id, item.column_ref["value"], tuple(item.predicate)) + table_result: tuple[str, list] | None = LOOKUP_TABLE.get(key, None) + out_item: dict = { "index": payload.items.index(item) + 1, - "result": "SUCCESS" if materialization != {} else "UNSUPPORTED", - "decision": {"strategy": "values", "reason": "mock"}, - "predicate_hash": "hash1", - "encryption_mode": "clear", - "materialization": materialization, } - # Adding the index - responses.append(response) - - return {"result": "SUCCESS", "items": responses} + if table_result is None: + # If the key is not found in the lookup table, return an error for + # this item of the batch. + out_item["result"] = "ERROR" + else: + # Otherwise, generate a successful response based on the lookup + # table. 
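+ # Each lookup table value is a (response_case, payload) pair,
+ # e.g. ("IN", ["LEE"]); see tests/mock_server/lookup_table.py.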
+ output_case, output_list = table_result + out_item["SUCCESS"] = "SUCCESS" + out_item["response"] = { + "strategy": "early_stop", + "records": [ + { + "mode": "cell_encrypted", + "cell_encrypted": base64.b64encode(str(elem).encode("utf-8")) + if isinstance(elem, str) + else elem, + } + for elem in output_list + ], + "count": len(output_list), + "stats": {"execution_time_ms": 42}, + "column_stats": None, + "next_cursor": None, + "metadata": { + "requested_output_mode": "cell_encrypted", + "actual_output_mode": "cell_encrypted", + "available_output_modes": ["cell_encrypted"], + "encryption_mode": None, + "dynamic_operator": "IN", + }, + } + if output_case == "NOT_IN": + out_item["response"]["metadata"]["representation"] = "NOT_IN" + # Don't include response in dry run case + if item.dry_run: + out_item["response"].pop("records") + successful_responses += 1 + + # Adding the new item to the batch output + responses.append(out_item) + + # Determine overall result: + # - SUCCESS if all items succeeded + # - ERROR if all items failed + # - PARTIAL_FAILURE otherwise + result: str + if successful_responses == len(payload.items): + result = "SUCCESS" + elif successful_responses == 0: + result = "ERROR" + else: + result = "PARTIAL_FAILURE" + return {"result": result, "items": responses} diff --git a/tests/mock_server/lookup_table.py b/tests/mock_server/lookup_table.py index f132e4c74..02e5a635d 100644 --- a/tests/mock_server/lookup_table.py +++ b/tests/mock_server/lookup_table.py @@ -3,73 +3,74 @@ request column reference and predicate. """ -LOOKUP_TABLE: dict = { - # key: (column_reference, tuple(predicate)) - ("srv.db.tbl.col", ("EQUAL", 2, "__col__", 0)): { - "type": "literal", - "operator": "NOT_IN", - "values": [ +LOOKUP_TABLE: dict[tuple[str, str, tuple], tuple[str, list]] = { + # key: (dataset_id, fully_qualified_column_name, tuple(predicate)) + # value: (response_case, payload) + ("dummy_server", "db/tbl/col", ("EQUAL", 2, "__col__", 0)): ( + "NOT_IN", + [ "value1", "value2", "value3", ], - "count": 3, - }, + ), ( - "srv.db.orders.order_date", - ("BETWEEN", 3, "__col__", "2025-01-01", "2025-02-01"), - ): { - "type": "literal", - "operator": "IN", - "values": [ + "dummy_server", + "db/orders/order_date", + ( + "AND", + 2, + "LTE", + 2, + "2025-01-01", + "__col__", + "LTE", + 2, + "__col__", + "2025-02-01", + ), + ): ( + "IN", + [ "2025-01-01", "2025-01-02", "2025-01-03", "2025-01-04", "2025-01-05", ], - "count": 5, - }, - ("srv.db.tbl.col", ("NOT_EQUAL", 2, "__col__", "LOWER", 1, "Smith")): { - "type": "literal", - "operator": "NOT_IN", - "values": ["smith"], - "count": 1, - }, + ), + ("dummy_server", "db/tbl/col", ("NOT_EQUAL", 2, "__col__", "LOWER", 1, "Smith")): ( + "NOT_IN", + ["smith"], + ), # booleans, - ("srv.db.tbl.col", ("NOT_EQUAL", 2, "__col__", True)): { - "type": "literal", - "operator": "IN", - "values": [False], - "count": 1, - }, + ("dummy_server", "db/tbl/col", ("NOT_EQUAL", 2, "__col__", True)): ( + "IN", + [False], + ), # decimals (string format) - ("srv.db.tbl.col", ("LT", 2, "__col__", "123.654445")): { - "type": "literal", - "operator": "IN", - "values": ["123.121123", "123.654444", "123.654445"], - "count": 3, - }, + ("dummy_server", "db/tbl/col", ("LT", 2, "__col__", "123.654445")): ( + "IN", + ["123.121123", "123.654444", "123.654445"], + ), # json embedded - ("srv.db.tbl.col", ("NOT_EQUAL", 2, "__col__", '{"key": "value"}')): { - "type": "literal", - "operator": "NOT_IN", - "values": ['{"key": "value"}'], - "count": 1, - }, + ("dummy_server", "db/tbl/col", 
("NOT_EQUAL", 2, "__col__", '("key": "value")')): ( + "NOT_IN", + ['("key": "value")'], + ), # NULLs and Money ( - "srv.db.tbl.col", + "dummy_server", + "db/tbl/col", ("AND", 2, "NOT_EQUAL", 2, "__col__", None, "GT", 2, "__col__", "$45.00"), - ): { - "type": "literal", - "operator": "NOT_IN", - "values": [None, "$44.50", "$43.20", "$44.99"], - "count": 4, - }, + ): ( + "NOT_IN", + [None, "$44.50", "$43.20", "$44.99"], + ), # Result with Regex, Bytea, Backslash in really nested expression. ( - "srv.db.tbl.col", + "dummy_server", + "db/tbl/col", ( "OR", 2, @@ -88,10 +89,1173 @@ "__col__", '"Hello World"', ), - ): { - "type": "literal", - "operator": "IN", - "values": ['"Hello"', "HelloWorld", "SGVsbG9Xb3JsZA=="], - "count": 3, - }, + ): ( + "IN", + ['"Hello"', "HelloWorld", "SGVsbG9Xb3JsZA=="], + ), + # CRYPTBANK hardcoded responses + ("dummy_server", "CRBNK/CUSTOMERS/c_lname", ("EQUAL", 2, "__col__", "lee")): ( + "IN", + ["LEE"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ( + "AND", + 2, + "LTE", + 2, + 1980, + "YEAR", + 1, + "__col__", + "LTE", + 2, + "YEAR", + 1, + "__col__", + 1985, + ), + ): ( + "IN", + [ + "1980-01-18", + "1981-07-21", + "1981-11-15", + "1982-11-07", + "1983-12-27", + ], + ), + ("dummy_server", "CRBNK/TRANSACTIONS/t_amount", ("GT", 2, "__col__", 9000.0)): ( + "IN", + [ + -8934.44, + -8881.98, + -8736.83, + -8717.7, + -8648.33, + -8639.5, + -8620.48, + -8593.09, + -8553.43, + -8527.34, + -8484.61, + -8480.79, + -8472.7, + -8457.49, + -8366.52, + -8361.27, + -8352.72, + -8308.42, + -8254.69, + -8077.89, + -8067.8, + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ( + "AND", + 2, + "EQUAL", + 2, + "MONTH", + 1, + "__col__", + 6, + "EQUAL", + 2, + "YEAR", + 1, + "__col__", + 2022, + ), + ): ( + "IN", + [ + "2022-06-03 05:08:58", + "2022-06-12 00:24:06", + "2022-06-13 05:50:39", + "2022-06-14 19:08:57", + "2022-06-16 03:15:13", + "2022-06-18 03:37:49", + "2022-06-27 06:08:04", + "2022-06-28 15:35:47", + "2022-06-29 05:40:38", + "2022-06-29 19:53:42", + ], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_type", + ( + "OR", + 2, + "EQUAL", + 2, + "__col__", + "retirement", + "EQUAL", + 2, + "__col__", + "savings", + ), + ): ( + "IN", + ["avingss", "etirementr"], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_phone", ("ENDSWITH", 2, "__col__", "5")): ( + "IN", + ["555-091-2345", "555-901-2345"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("OR", 2, "ENDSWITH", 2, "__col__", "a", "ENDSWITH", 2, "__col__", "e"), + ): ( + "IN", + ["ALICE", "GRACE", "LUKE", "MARIA", "OLIVIA", "QUEENIE", "SOPHIA"], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_fname", ("ENDSWITH", 2, "__col__", "s")): ( + "IN", + ["JAMES", "NICHOLAS", "THOMAS"], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_lname", ("NOT_EQUAL", 2, "__col__", "lopez")): ( + "NOT_IN", + ["LOPEZ"], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_lname", ("NOT_EQUAL", 2, "__col__", "lee")): ( + "NOT_IN", + ["LEE"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_lname", + ("IN", 4, "__col__", "lee", "smith", "rodriguez"), + ): ( + "IN", + ["LEE", "SMITH", "RODRIGUEZ"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_lname", + ("NOT", 1, "IN", 4, "__col__", "lee", "smith", "rodriguez"), + ): ( + "NOT_IN", + ["LEE", "SMITH", "RODRIGUEZ"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_phone", + ("STARTSWITH", 2, "__col__", "555-8"), + ): ( + "IN", + ["555-809-1234", "555-870-9123"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_email", + ("ENDSWITH", 2, "__col__", "gmail.com"), + ): ( + "IN", + [ + 
"livia.a22@gmail.como", + "ob.smith77@gmail.comb", + "ob_moore78@gmail.comr", + "opez.luke99@gmail.coml", + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("EQUAL", 2, "YEAR", 1, "__col__", 1978), + ): ( + "IN", + ["1976-10-27", "1976-12-02"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("EQUAL", 2, "__col__", "1985-04-12"), + ): ( + "IN", + ["1983-12-27"], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_fname", ("ENDSWITH", 2, "__col__", "e")): ( + "IN", + ["ALICE", "GRACE", "LUKE", "QUEENIE"], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_lname", ("ENDSWITH", 2, "__col__", "e")): ( + "IN", + ["LEE", "MOORE"], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_type", + ( + "AND", + 2, + "NOT_EQUAL", + 2, + "__col__", + "checking", + "NOT_EQUAL", + 2, + "__col__", + "savings", + ), + ): ( + "NOT_IN", + ["avingss", "heckingc"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("NOT_EQUAL", 2, "__col__", "1991-11-15"), + ): ( + "NOT_IN", + ["1990-07-31"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("LTE", 2, "__col__", "1991-11-15"), + ): ( + "NOT_IN", + ["1991-03-13", "1992-05-06", "1993-01-01", "1994-06-15"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("GT", 2, "__col__", "1991-11-15"), + ): ( + "IN", + ["1991-03-13", "1992-05-06", "1993-01-01", "1994-06-15"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("LT", 2, "__col__", "1991-11-15"), + ): ( + "NOT_IN", + [ + "1990-07-31", + "1991-03-13", + "1992-05-06", + "1993-01-01", + "1994-06-15", + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("GTE", 2, "__col__", "1991-11-15"), + ): ( + "IN", + [ + "1990-07-31", + "1991-03-13", + "1992-05-06", + "1993-01-01", + "1994-06-15", + ], + ), + ("dummy_server", "CRBNK/TRANSACTIONS/t_amount", ("LT", 2, "__col__", 0)): ( + "IN", + [], + ), + ("dummy_server", "CRBNK/TRANSACTIONS/t_amount", ("GT", 2, "__col__", 0)): ( + "NOT_IN", + [], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("LTE", 2, "__col__", "1925-01-01"), + ): ( + "IN", + [], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_phone", + ("EQUAL", 2, "__col__", "555-123-456"), + ): ( + "IN", + [], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_open_ts", + ("EQUAL", 2, "YEAR", 1, "__col__", 2021), + ): ( + "IN", + [ + "2017-02-11 10:59:51", + "2017-06-15 12:41:51", + "2017-07-07 14:26:51", + "2017-07-09 12:21:51", + "2017-09-15 11:26:51", + "2018-01-02 12:26:51", + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ( + "AND", + 2, + "IN", + 7, + "ADD", + 2, + "MONTH", + 1, + "__col__", + 1, + 2, + 4, + 6, + 8, + 10, + 12, + "IN", + 11, + "SUB", + 2, + "YEAR", + 1, + "__col__", + 2, + 1975, + 1977, + 1979, + 1981, + 1983, + 1985, + 1987, + 1989, + 1991, + 1993, + ), + ): ( + "IN", + ["1980-01-18", "1981-11-15", "1990-07-31", "1994-06-15"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("IN", 5, "__col__", "1991-11-15", "1978-02-11", "2005-03-14", "1985-04-12"), + ): ( + "IN", + ["1990-07-31", "1976-10-27", "1983-12-27"], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_open_ts", + ( + "AND", + 2, + "LTE", + 2, + "2020-03-28 09:20:00", + "__col__", + "LTE", + 2, + "__col__", + "2020-09-20 08:30:00", + ), + ): ( + "IN", + [ + "2016-04-29 11:46:51", + "2016-06-10 12:56:51", + "2016-07-20 15:46:51", + "2016-08-22 10:41:51", + "2016-09-03 12:01:51", + ], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_email", ("CONTAINS", 2, "__col__", "mail")): ( + "NOT_IN", + [ + "homasl@outlook.comt", + "ueenie.t@outlook.netq", + 
".hernandez@icloud.comk", + "martinez94@outlook.orgj", + "sa.rodriguez@zoho.comi", + ".brown88@yahoo.comd", + ".lee@outlook.comc", + "lice_j@example.orga", + ], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_email", ("LIKE", 2, "__col__", "%.%@%mail%")): ( + "IN", + [ + "ophia.jackson@mail.orgs", + "livia.a22@gmail.como", + ".gonzalez@ymail.comm", + "opez.luke99@gmail.coml", + "enry.g@fastmail.comh", + "rank.k@protonmail.comf", + "mily.jones@mail.come", + "ob.smith77@gmail.comb", + ], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_open_ts", + ("IN", 4, "MONTH", 1, "__col__", 1, 2, 3), + ): ( + "IN", + [ + "2013-04-22 11:37:51", + "2017-02-11 10:59:51", + "2011-04-30 15:16:51", + "2016-03-23 12:41:51", + "2013-02-15 12:46:51", + "2018-03-15 10:36:51", + "2014-04-07 14:21:51", + "2015-02-08 17:26:51", + "2016-04-29 11:46:51", + "2012-03-22 12:16:51", + "2015-04-06 13:46:51", + ], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_open_ts", + ("EQUAL", 2, "QUARTER", 1, "__col__", "DAY", 1, "__col__"), + ): ( + "IN", + ["2015-05-04 18:01:51"], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_open_ts", + ( + "AND", + 2, + "LT", + 2, + "HOUR", + 1, + "__col__", + 10, + "LT", + 2, + "MINUTE", + 1, + "__col__", + 20, + ), + ): ( + "IN", + [ + "2013-04-22 11:37:51", + "2017-09-15 11:26:51", + "2018-03-15 10:36:51", + "2014-05-23 11:31:51", + "2016-08-22 10:41:51", + "2014-08-15 11:31:51", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "SECOND", 1, "__col__", 23), + ): ( + "IN", + [ + "2020-11-11 09:03:02", + "2023-09-15 09:00:02", + "2024-07-21 23:24:02", + ], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_balance", + ( + "AND", + 2, + "LTE", + 2, + 200, + "ABS", + 1, + "SUB", + 2, + "__col__", + 7250, + "LTE", + 2, + "ABS", + 1, + "SUB", + 2, + "__col__", + 7250, + 600, + ), + ): ( + "IN", + [ + 46240000.0, + 57760000.0, + ], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_open_ts", + ( + "EQUAL", + 2, + "GREATEST", + 3, + "HOUR", + 1, + "__col__", + "MINUTE", + 1, + "__col__", + "SECOND", + 1, + "__col__", + 10, + ), + ): ( + "IN", + [ + "2018-03-15 10:36:51", + "2018-01-02 12:26:51", + ], + ), + ( + "dummy_server", + "CRBNK/ACCOUNTS/a_open_ts", + ("EQUAL", 2, "LEAST", 2, "HOUR", 1, "__col__", "MINUTE", 1, "__col__", 15), + ): ( + "IN", + [ + "2015-08-10 18:11:51", + "2015-05-04 18:01:51", + "2015-10-19 18:11:51", + "2014-10-03 17:41:51", + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_phone", + ("CONTAINS", 2, "CONCAT", 2, "1-", "__col__", "1-5"), + ): ( + "NOT_IN", + [], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_phone", + ("CONTAINS", 2, "CONCAT", 3, "1", "-", "__col__", "1-5"), + ): ( + "NOT_IN", + [], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_phone", + ("CONTAINS", 2, "CONCAT", 5, "1", "-", "__col__", "-", "1", "5-1"), + ): ( + "IN", + [ + "555-112-3456", + "555-901-2345", + "555-091-2345", + "555-123-4567", + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("IN", 3, "COALESCE", 2, "YEAR", 1, "__col__", 1990, 1990, 1991), + ): ( + "IN", + [ + "1990-07-31", + "1989-04-07", + None, + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("IN", 3, "COALESCE", 2, "YEAR", 1, "__col__", 1990, 1990, 2005), + ): ( + "IN", + [ + "1989-04-07", + None, + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("IN", 3, "COALESCE", 2, "YEAR", 1, "__col__", 2005, 2005, 2006), + ): ( + "IN", + [ + None, + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("NOT", 1, "IN", 3, "COALESCE", 2, "YEAR", 1, "__col__", 1990, 1990, 
1991), + ): ( + "NOT_IN", + [ + "1990-07-31", + "1989-04-07", + None, + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("NOT", 1, "IN", 3, "COALESCE", 2, "YEAR", 1, "__col__", 1990, 1990, 2005), + ): ( + "NOT_IN", + [ + "1989-04-07", + None, + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_birthday", + ("NOT", 1, "IN", 3, "COALESCE", 2, "YEAR", 1, "__col__", 2005, 2005, 2006), + ): ( + "NOT_IN", + [ + None, + ], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("IN", 4, "SLICE", 3, "__col__", 0, 1, "q", "r", "s"), + ): ("IN", ["QUEENIE", "ROBERT", "SOPHIA"]), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_lname", + ( + "CONTAINS", + 2, + "__col__", + "CONCAT", + 2, + "e", + "IFF", + 3, + "IN", + 4, + "SLICE", + 3, + "__col__", + 0, + 1, + "q", + "r", + "s", + "z", + "e", + ), + ): ( + "IN", + ["LEE", "RODRIGUEZ"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("EQUAL", 2, "SLICE", 3, "__col__", 0, 1, "i"), + ): ( + "IN", + ["ISABEL"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("IN", 6, "SLICE", 3, "__col__", 1, 2, "ar", "li", "ra", "to", "am"), + ): ( + "IN", + [ + "ALICE", + "CAROL", + "FRANK", + "GRACE", + "JAMES", + "KAREN", + "MARIA", + "OLIVIA", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATETRUNC", 2, "year", "__col__", "2023-01-01"), + ): ( + "IN", + [ + "2022-12-31 17:42:54", + "2023-01-04 12:05:15", + "2023-01-07 22:11:27", + "2023-01-20 04:38:03", + "2023-01-20 16:40:54", + "2023-01-27 15:13:18", + "2023-01-30 19:58:26", + "2023-02-02 19:12:58", + "2023-02-11 11:13:53", + "2023-02-11 12:32:55", + "2023-02-15 21:54:29", + "2023-02-16 14:18:36", + "2023-02-28 07:11:29", + "2023-03-07 01:26:10", + "2023-03-08 18:58:18", + "2023-03-14 14:23:33", + "2023-03-16 06:17:44", + "2023-03-17 08:48:16", + "2023-03-24 03:33:40", + "2023-03-26 06:52:52", + "2023-04-18 00:35:40", + "2023-04-25 18:54:26", + "2023-04-29 04:58:30", + "2023-05-04 23:30:10", + "2023-05-12 04:42:28", + "2023-05-17 18:54:12", + "2023-05-19 10:10:44", + "2023-05-21 13:52:14", + "2023-05-24 03:51:10", + "2023-06-01 13:50:10", + "2023-06-01 13:50:14", + "2023-06-04 10:35:26", + "2023-06-11 21:53:04", + "2023-06-25 15:06:06", + "2023-06-25 21:58:37", + "2023-06-27 03:21:19", + "2023-06-27 10:34:20", + "2023-06-30 15:27:03", + "2023-07-07 15:17:47", + "2023-07-17 03:23:15", + "2023-07-18 14:41:26", + "2023-08-03 20:24:35", + "2023-08-11 20:25:39", + "2023-08-29 03:07:18", + "2023-09-01 16:50:48", + "2023-09-08 09:30:23", + "2023-09-13 06:42:39", + "2023-09-15 09:00:02", + "2023-09-30 08:57:30", + "2023-10-15 02:47:04", + "2023-10-19 09:40:06", + "2023-10-30 00:20:45", + "2023-11-08 12:52:24", + "2023-11-10 17:20:29", + "2023-11-16 11:30:24", + "2023-11-21 15:17:10", + "2023-11-28 06:34:03", + "2023-12-07 14:11:33", + "2023-12-15 05:57:23", + "2023-12-16 00:51:23", + "2023-12-23 07:54:22", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATETRUNC", 2, "quarter", "__col__", "2023-04-01"), + ): ( + "IN", + [ + "2023-04-18 00:35:40", + "2023-04-25 18:54:26", + "2023-04-29 04:58:30", + "2023-05-04 23:30:10", + "2023-05-12 04:42:28", + "2023-05-17 18:54:12", + "2023-05-19 10:10:44", + "2023-05-21 13:52:14", + "2023-05-24 03:51:10", + "2023-06-01 13:50:10", + "2023-06-01 13:50:14", + "2023-06-04 10:35:26", + "2023-06-11 21:53:04", + "2023-06-25 15:06:06", + "2023-06-25 21:58:37", + "2023-06-27 03:21:19", + "2023-06-27 10:34:20", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATETRUNC", 
2, "month", "__col__", "2023-06-01"), + ): ( + "IN", + [ + "2023-06-01 13:50:10", + "2023-06-01 13:50:14", + "2023-06-04 10:35:26", + "2023-06-11 21:53:04", + "2023-06-25 15:06:06", + "2023-06-25 21:58:37", + "2023-06-27 03:21:19", + "2023-06-27 10:34:20", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATETRUNC", 2, "day", "__col__", "2023-06-02"), + ): ( + "IN", + [ + "2023-06-01 13:50:10", + "2023-06-01 13:50:14", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATETRUNC", 2, "hour", "__col__", "2023-06-02 04:00:00"), + ): ( + "IN", + [ + "2023-06-01 13:50:10", + "2023-06-01 13:50:14", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATETRUNC", 2, "minute", "__col__", "2023-06-02 04:55:00"), + ): ( + "IN", + [ + "2023-06-01 13:50:10", + "2023-06-01 13:50:14", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATETRUNC", 2, "second", "__col__", "2023-06-02 04:55:31"), + ): ( + "IN", + [ + "2023-06-01 13:50:10", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATEADD", 3, 1, "years", "__col__", "2020-11-11 18:00:52"), + ): ( + "IN", + [ + "2019-11-11 02:55:31", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATEADD", 3, 2, "quarters", "__col__", "2020-05-11 18:00:52"), + ): ( + "IN", + [ + "2019-11-11 02:55:31", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATEADD", 3, -5, "months", "__col__", "2019-06-11 18:00:52"), + ): ( + "IN", + [ + "2019-11-11 02:55:31", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATEADD", 3, 10, "days", "__col__", "2019-11-21 18:00:52"), + ): ( + "IN", + [ + "2019-11-11 02:55:31", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATEADD", 3, 1000, "hours", "__col__", "2019-12-23 10:00:52"), + ): ( + "IN", + [ + "2019-11-11 02:55:31", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ("EQUAL", 2, "DATEADD", 3, 10000, "minutes", "__col__", "2019-11-18 16:40:52"), + ): ( + "IN", + [ + "2019-11-11 02:55:31", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ( + "EQUAL", + 2, + "DATEADD", + 3, + -1000000, + "seconds", + "__col__", + "2019-10-31 04:14:12", + ), + ): ( + "IN", + [ + "2019-11-11 02:55:31", + ], + ), + ( + "dummy_server", + "CRBNK/TRANSACTIONS/t_ts", + ( + "EQUAL", + 2, + "DATEADD", + 3, + -1, + "days", + "DATETRUNC", + 2, + "month", + "__col__", + "2019-10-31", + ), + ): ( + "IN", + [ + "2019-11-02 11:58:37", + "2019-11-02 12:54:09", + "2019-11-11 02:55:31", + "2019-11-11 15:44:22", + ], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_fname", ("CONTAINS", 2, "__col__", "a")): ( + "NOT_IN", + ["BOB", "EMILY", "HENRY", "LUKE", "PETER", "QUEENIE", "ROBERT"], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_fname", ("CONTAINS", 2, "__col__", "e")): ( + "NOT_IN", + [ + "BOB", + "CAROL", + "DAVID", + "FRANK", + "MARIA", + "NICHOLAS", + "OLIVIA", + "SOPHIA", + "THOMAS", + ], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_fname", ("CONTAINS", 2, "__col__", "i")): ( + "IN", + [ + "ALICE", + "DAVID", + "EMILY", + "ISABEL", + "MARIA", + "NICHOLAS", + "OLIVIA", + "QUEENIE", + "SOPHIA", + ], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_fname", ("CONTAINS", 2, "__col__", "o")): ( + "IN", + ["BOB", "CAROL", "NICHOLAS", "OLIVIA", "ROBERT", "SOPHIA", "THOMAS"], + ), + ("dummy_server", "CRBNK/CUSTOMERS/c_fname", ("CONTAINS", 2, "__col__", "u")): ( + "IN", + ["LUKE", "QUEENIE"], + ), + ( + 
"dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("AND", 2, "CONTAINS", 2, "__col__", "a", "CONTAINS", 2, "__col__", "e"), + ): ( + "IN", + ["ALICE", "GRACE", "ISABEL", "JAMES", "KAREN"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("AND", 2, "CONTAINS", 2, "__col__", "e", "CONTAINS", 2, "__col__", "i"), + ): ( + "IN", + ["ALICE", "EMILY", "ISABEL", "QUEENIE"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("AND", 2, "CONTAINS", 2, "__col__", "i", "CONTAINS", 2, "__col__", "o"), + ): ( + "IN", + ["NICHOLAS", "OLIVIA", "SOPHIA"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("AND", 2, "CONTAINS", 2, "__col__", "o", "CONTAINS", 2, "__col__", "u"), + ): ( + "IN", + [], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("AND", 2, "CONTAINS", 2, "__col__", "u", "CONTAINS", 2, "__col__", "a"), + ): ( + "IN", + [], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ( + "AND", + 3, + "CONTAINS", + 2, + "__col__", + "a", + "CONTAINS", + 2, + "__col__", + "e", + "CONTAINS", + 2, + "__col__", + "i", + ), + ): ( + "IN", + ["ALICE", "ISABEL"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ( + "AND", + 3, + "CONTAINS", + 2, + "__col__", + "e", + "CONTAINS", + 2, + "__col__", + "i", + "CONTAINS", + 2, + "__col__", + "o", + ), + ): ( + "IN", + [], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ( + "NOT", + 1, + "AND", + 2, + "CONTAINS", + 2, + "__col__", + "i", + "CONTAINS", + 2, + "__col__", + "o", + ), + ): ( + "NOT_IN", + ["NICHOLAS", "OLIVIA", "SOPHIA"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ( + "AND", + 2, + "CONTAINS", + 2, + "__col__", + "i", + "NOT", + 1, + "AND", + 2, + "CONTAINS", + 2, + "__col__", + "a", + "CONTAINS", + 2, + "__col__", + "e", + ), + ): ( + "IN", + [], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ( + "NOT", + 1, + "AND", + 2, + "CONTAINS", + 2, + "__col__", + "e", + "CONTAINS", + 2, + "__col__", + "i", + ), + ): ( + "NOT_IN", + ["ALICE", "EMILY", "ISABEL", "QUEENIE"], + ), + ( + "dummy_server", + "CRBNK/CUSTOMERS/c_fname", + ("CONTAINS", 2, "SLICE", "UPPER", 1, "SLICE", 3, "__col__", 0, 1), + ): ("IN", ["CAROL", "EMILY", "ISABEL", "LUKE", "SOPHIA"]), } diff --git a/tests/test_masked_sf.py b/tests/test_masked_sf.py index d94b1007f..ec54cfe7c 100644 --- a/tests/test_masked_sf.py +++ b/tests/test_masked_sf.py @@ -1,11 +1,22 @@ import datetime +import io from collections.abc import Callable +from contextlib import redirect_stdout import pandas as pd import pytest +from pydough import to_sql from pydough.database_connectors import DatabaseContext, DatabaseDialect -from tests.testing_utilities import graph_fetcher +from pydough.mask_server import MaskServerInfo +from pydough.metadata import GraphMetadata +from pydough.unqualified import UnqualifiedNode +from tests.testing_utilities import ( + extract_batch_requests_from_logs, + graph_fetcher, + temp_env_override, + transform_and_exec_pydough, +) from .testing_sf_masked_utilities import ( PyDoughSnowflakeMaskedTest, @@ -318,6 +329,26 @@ ), id="retail_transactions_filter", ), + pytest.param( + PyDoughSnowflakeMaskedTest( + "t1 = transactions.WHERE((DAY(transaction_date) == 1) & (HOUR(transaction_date) == 7))\n" + "t2 = transactions.WHERE((DAY(transaction_date) == 2) & (HOUR(transaction_date) == 7))\n" + "t3 = transactions.WHERE((DAY(transaction_date) == 1) & (HOUR(transaction_date) == 8))\n" + "t4 = transactions.WHERE((DAY(transaction_date) == 2) & (HOUR(transaction_date) == 8))\n" + "t5 = 
transactions.WHERE(((DAY(transaction_date) < 4) & (HOUR(transaction_date) < 3)) | ((MINUTE(transaction_date) == SECOND(transaction_date)) & (HOUR(transaction_date) < 3)))\n" + "result = RETAIL.CALCULATE(n1=COUNT(t1), n2=COUNT(t2), n3=COUNT(t3), n4=COUNT(t4), n5=COUNT(t5))", + "RETAIL", + "retail_transactions_ts", + answers={ + "NONE": None, + "PARTIAL": None, + "FULL": pd.DataFrame( + {"n1": [2], "n2": [6], "n3": [3], "n4": [6], "n5": [52]} + ), + }, + ), + id="retail_transactions_ts", + ), pytest.param( PyDoughSnowflakeMaskedTest( "acc_typs = accounts.PARTITION(name='account_types', by=account_type)\n" @@ -813,6 +844,66 @@ ), id="fsi_accounts_customers_compound_c", ), + pytest.param( + PyDoughSnowflakeMaskedTest( + "selected_members = loyalty_members.WHERE(CONTAINS('GT', UPPER(first_name[:1])))\n" + "result = RETAIL.CALCULATE(n=COUNT(selected_members))", + "RETAIL", + "retail_names_analysis_a", + order_sensitive=True, + answers={ + "NONE": None, + "PARTIAL": None, + "FULL": pd.DataFrame({"n": [25]}), + }, + ), + id="retail_names_analysis_a", + ), + pytest.param( + PyDoughSnowflakeMaskedTest( + "selected_members = loyalty_members.WHERE(CONTAINS('day', LOWER(first_name[:2])))\n" + "result = RETAIL.CALCULATE(n=COUNT(selected_members))", + "RETAIL", + "retail_names_analysis_b", + order_sensitive=True, + answers={ + "NONE": None, + "PARTIAL": None, + "FULL": pd.DataFrame({"n": [11]}), + }, + ), + id="retail_names_analysis_b", + ), + pytest.param( + PyDoughSnowflakeMaskedTest( + "selected_members = loyalty_members.WHERE(YEAR(date_of_birth) < 2026)\n" + "result = RETAIL.CALCULATE(n=COUNT(selected_members))", + "RETAIL", + "retail_all", + order_sensitive=True, + answers={ + "NONE": None, + "PARTIAL": None, + "FULL": pd.DataFrame({"n": [500]}), + }, + ), + id="retail_all", + ), + pytest.param( + PyDoughSnowflakeMaskedTest( + "selected_members = loyalty_members.WHERE(YEAR(date_of_birth) >= 2026)\n" + "result = RETAIL.CALCULATE(n=COUNT(selected_members))", + "RETAIL", + "retail_none", + order_sensitive=True, + answers={ + "NONE": None, + "PARTIAL": None, + "FULL": pd.DataFrame({"n": [0]}), + }, + ), + id="retail_none", + ), ], ) def sf_masked_test_data( @@ -825,6 +916,7 @@ def sf_masked_test_data( return request.param +@temp_env_override({"PYDOUGH_MASK_SERVER_HARD_LIMIT": "50"}) @pytest.mark.sf_masked def test_pipeline_until_relational_masked_sf( sf_masked_test_data: PyDoughSnowflakeMaskedTest, @@ -832,6 +924,7 @@ def test_pipeline_until_relational_masked_sf( get_plan_test_filename: Callable[[str], str], update_tests: bool, enable_mask_rewrites: str, + true_mask_server_info: MaskServerInfo, ) -> None: """ Tests the conversion of the PyDough queries on the masked dataset @@ -840,11 +933,16 @@ def test_pipeline_until_relational_masked_sf( file_path: str = get_plan_test_filename( f"{sf_masked_test_data.test_name}_{enable_mask_rewrites}" ) - sf_masked_test_data.run_relational_test( - get_sf_masked_graphs, file_path, update_tests - ) + with redirect_stdout(io.StringIO()): + sf_masked_test_data.run_relational_test( + get_sf_masked_graphs, + file_path, + update_tests, + mask_server=true_mask_server_info, + ) +@temp_env_override({"PYDOUGH_MASK_SERVER_HARD_LIMIT": "50"}) @pytest.mark.sf_masked def test_pipeline_until_sql_masked_sf( sf_masked_test_data: PyDoughSnowflakeMaskedTest, @@ -853,6 +951,7 @@ def test_pipeline_until_sql_masked_sf( get_sql_test_filename: Callable[[str, DatabaseDialect], str], update_tests: bool, enable_mask_rewrites: str, + true_mask_server_info: MaskServerInfo, ): """ Tests the 
conversion of the PyDough queries on the custom masked dataset @@ -862,14 +961,17 @@ def test_pipeline_until_sql_masked_sf( file_path: str = get_sql_test_filename( f"{sf_masked_test_data.test_name}_{enable_mask_rewrites}", sf_data.dialect ) - sf_masked_test_data.run_sql_test( - get_sf_masked_graphs, - file_path, - update_tests, - sf_data, - ) + with redirect_stdout(io.StringIO()): + sf_masked_test_data.run_sql_test( + get_sf_masked_graphs, + file_path, + update_tests, + sf_data, + mask_server=true_mask_server_info, + ) +@temp_env_override({"PYDOUGH_MASK_SERVER_HARD_LIMIT": "50"}) @pytest.mark.execute @pytest.mark.sf_masked @pytest.mark.parametrize("account_type", ["NONE", "PARTIAL", "FULL"]) @@ -879,6 +981,7 @@ def test_pipeline_e2e_masked_sf( get_sf_masked_graphs: graph_fetcher, # noqa: F811 sf_masked_context: Callable[[str, str, str], DatabaseContext], # noqa: F811 enable_mask_rewrites: str, # noqa: F811 + true_mask_server_info: MaskServerInfo, ) -> None: """ End-to-end test for Snowflake with masked columns. @@ -886,8 +989,99 @@ def test_pipeline_e2e_masked_sf( sf_masked_test_data.account_type = account_type if sf_masked_test_data.answers.get(account_type) is None: pytest.skip(f"No reference solution for account_type={account_type}") + # with redirect_stdout(io.StringIO()): sf_masked_test_data.run_e2e_test( get_sf_masked_graphs, sf_masked_context("BODO", sf_masked_test_data.graph_name, account_type), coerce_types=True, + mask_server=true_mask_server_info, + ) + + +@pytest.mark.sf_masked +@temp_env_override( + {"PYDOUGH_MASK_SERVER_HARD_LIMIT": "50", "PYDOUGH_ENABLE_MASK_REWRITES": "1"} +) +@pytest.mark.parametrize( + ["graph_name", "pydough_code", "batch_requests"], + [ + pytest.param( + "FSI", + "selected_customers = customers.WHERE(last_name == 'Adams')\n" + "result = FSI.CALCULATE(n=COUNT(selected_customers))", + [ + { + "DRY_RUN", + "bodo/fsi/protected_customers/lastname: ['EQUAL', 2, '__col__', 'Adams']", + }, + { + "bodo/fsi/protected_customers/lastname: ['EQUAL', 2, '__col__', 'Adams']", + }, + ], + id="fsi_customers_a", + ), + pytest.param( + "FSI", + "c1 = customers.WHERE((MONTH(date_of_birth) == 6) & (DAY(date_of_birth) == 15))\n" + "c2 = customers.WHERE((YEAR(date_of_birth) == 1970) & (MONTH(date_of_birth) == 6))\n" + "c3 = customers.WHERE((YEAR(date_of_birth) == 1970) & (DAY(date_of_birth) == 15))\n" + "c4 = customers.WHERE((YEAR(date_of_birth) == 1970) & (MONTH(date_of_birth) == 6) & (DAY(date_of_birth) == 15))\n" + "result = FSI.CALCULATE(n1=COUNT(c1), n2=COUNT(c2), n3=COUNT(c3), n4=COUNT(c4))", + [ + { + "DRY_RUN", + "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'MONTH', 1, '__col__', 6]", + "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + "bodo/fsi/protected_customers/dob: ['AND', 3, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'DAY', 1, '__col__', 15]", + "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'MONTH', 1, '__col__', 6]", + "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + }, + { + "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'MONTH', 1, '__col__', 6]", + "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 
'DAY', 1, '__col__', 15, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + "bodo/fsi/protected_customers/dob: ['AND', 3, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + }, + ], + id="fsi_customers_b", + ), + ], +) +def test_masked_sf_mask_server_logging( + graph_name: str, + pydough_code: str, + batch_requests: list[set[str]], + get_sf_masked_graphs: graph_fetcher, # noqa: F811 + true_mask_server_info: MaskServerInfo, + caplog, +): + """ + Tests whether, during the conversion of the PyDough queries on the masked + Snowflake dataset into SQL text, the correct logging calls are made + regarding batches sent to the mask server. This is to ensure that the calls + are being batched as expected, the right calls are being sent to the server, + and expressions that are non-predicates are not being sent, even if they are + a valid sub-expression of a predicate that can be sent. + """ + # Obtain the graph and the unqualified node + graph: GraphMetadata = get_sf_masked_graphs(graph_name) + root: UnqualifiedNode = transform_and_exec_pydough( + pydough_code, + graph, + {"datetime": datetime, "pd": pd}, + ) + + # Convert the PyDough code to SQL text, while capturing + # stdout to avoid polluting the console with logging calls + with redirect_stdout(io.StringIO()): + to_sql(root, metadata=graph, mask_server=true_mask_server_info) + + # Retrieve the output from the captured logger output + batch_requests_made: list[set[str]] = extract_batch_requests_from_logs(caplog.text) + + # Compare the expected batch requests to those made. + assert batch_requests_made == batch_requests, ( + "The batch requests made do not match the expected batch requests." ) diff --git a/tests/test_masked_sqlite.py b/tests/test_masked_sqlite.py index cf5cd5dce..731aa14c9 100644 --- a/tests/test_masked_sqlite.py +++ b/tests/test_masked_sqlite.py @@ -3,13 +3,25 @@ CRYPTBANK sqlite database. 
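The Snowflake tests above pin the server batch size by decorating each test with @temp_env_override({"PYDOUGH_MASK_SERVER_HARD_LIMIT": "50"}). That helper comes from tests/testing_utilities.py and is not reproduced here; the decorator below is only a minimal sketch, under the assumption that such a utility sets the given environment variables for the duration of the wrapped test and restores the previous values afterwards. The name env_override_sketch is illustrative and not part of PyDough.

import functools
import os


def env_override_sketch(overrides: dict[str, str]):
    """Illustrative stand-in for a temp_env_override-style test decorator."""

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Remember the prior values so the environment can be restored,
            # even if the wrapped test raises.
            saved = {name: os.environ.get(name) for name in overrides}
            os.environ.update(overrides)
            try:
                return func(*args, **kwargs)
            finally:
                for name, old in saved.items():
                    if old is None:
                        os.environ.pop(name, None)
                    else:
                        os.environ[name] = old

        return wrapper

    return decorator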
""" +import datetime +import io from collections.abc import Callable +from contextlib import redirect_stdout import pandas as pd import pytest +from pydough import to_sql from pydough.database_connectors import DatabaseContext, DatabaseDialect -from tests.testing_utilities import PyDoughPandasTest, graph_fetcher +from pydough.mask_server import MaskServerInfo +from pydough.metadata import GraphMetadata +from pydough.unqualified import UnqualifiedNode +from tests.testing_utilities import ( + PyDoughPandasTest, + extract_batch_requests_from_logs, + graph_fetcher, + transform_and_exec_pydough, +) @pytest.fixture( @@ -394,6 +406,321 @@ ), id="cryptbank_filter_count_28", ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(birthday <= '1925-01-01')\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [0]}), + "cryptbank_filter_count_29", + ), + id="cryptbank_filter_count_29", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(" + " ISIN(YEAR(birthday) - 2, (1975, 1977, 1979, 1981, 1983, 1985, 1987, 1989, 1991, 1993))" + " & ISIN(MONTH(birthday) + 1, (2, 4, 6, 8, 10, 12))" + ")\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [4]}), + "cryptbank_filter_count_30", + ), + id="cryptbank_filter_count_30", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(ISIN(birthday, [datetime.date(1991, 11, 15), datetime.date(1978, 2, 11), datetime.date(2005, 3, 14), datetime.date(1985, 4, 12)]))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [3]}), + "cryptbank_filter_count_31", + kwargs={"datetime": datetime, "pd": pd}, + ), + id="cryptbank_filter_count_31", + ), + pytest.param( + PyDoughPandasTest( + "selected_accounts = accounts.WHERE(MONOTONIC(pd.Timestamp('2020-03-28 09:20:00'), creation_timestamp, datetime.datetime(2020, 9, 20, 8, 30, 0)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [5]}), + "cryptbank_filter_count_32", + kwargs={"datetime": datetime, "pd": pd}, + ), + id="cryptbank_filter_count_32", + ), + pytest.param( + PyDoughPandasTest( + "selected_accounts = accounts.WHERE(QUARTER(creation_timestamp) == 1)\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [11]}), + "cryptbank_filter_count_33", + ), + id="cryptbank_filter_count_33", + ), + pytest.param( + PyDoughPandasTest( + "selected_accounts = accounts.WHERE(QUARTER(creation_timestamp) == DAY(creation_timestamp))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [1]}), + "cryptbank_filter_count_34", + ), + id="cryptbank_filter_count_34", + ), + pytest.param( + PyDoughPandasTest( + "selected_accounts = accounts.WHERE((HOUR(creation_timestamp) < 10) & (MINUTE(creation_timestamp) < 20))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [6]}), + "cryptbank_filter_count_35", + ), + id="cryptbank_filter_count_35", + ), + pytest.param( + PyDoughPandasTest( + "selected_transactions = transactions.WHERE(SECOND(time_stamp) == 23)\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_transactions))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [3]}), + "cryptbank_filter_count_36", + ), + id="cryptbank_filter_count_36", + ), + pytest.param( + 
PyDoughPandasTest( + "selected_accounts = accounts.WHERE(MONOTONIC(200, ABS(balance - 7250), 600))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [2]}), + "cryptbank_filter_count_37", + ), + id="cryptbank_filter_count_37", + ), + pytest.param( + PyDoughPandasTest( + "selected_accounts = accounts.WHERE(LARGEST(HOUR(creation_timestamp), MINUTE(creation_timestamp), SECOND(creation_timestamp)) == 10)\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [2]}), + "cryptbank_filter_count_38", + ), + id="cryptbank_filter_count_38", + ), + pytest.param( + PyDoughPandasTest( + "selected_accounts = accounts.WHERE(SMALLEST(HOUR(creation_timestamp), MINUTE(creation_timestamp)) == 15)\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [4]}), + "cryptbank_filter_count_39", + ), + id="cryptbank_filter_count_39", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(CONTAINS(JOIN_STRINGS('', '1-', phone_number), '1-5'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [20]}), + "cryptbank_filter_count_40", + ), + id="cryptbank_filter_count_40", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(CONTAINS(JOIN_STRINGS('-', '1', phone_number), '1-5'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [20]}), + "cryptbank_filter_count_41", + ), + id="cryptbank_filter_count_41", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(CONTAINS(JOIN_STRINGS('-', '1', phone_number, '1'), '5-1'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [4]}), + "cryptbank_filter_count_42", + ), + id="cryptbank_filter_count_42", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(JOIN_STRINGS(' ', first_name, last_name) == 'olivia anderson')\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [1]}), + "cryptbank_filter_count_43", + ), + id="cryptbank_filter_count_43", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(ISIN(DEFAULT_TO(YEAR(birthday), 1990), (1990, 1991)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [4]}), + "cryptbank_filter_count_44", + ), + id="cryptbank_filter_count_44", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(ISIN(DEFAULT_TO(YEAR(birthday), 1990), (1990, 2005)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [3]}), + "cryptbank_filter_count_45", + ), + id="cryptbank_filter_count_45", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(ISIN(DEFAULT_TO(YEAR(birthday), 2005), (2005, 2006)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [2]}), + "cryptbank_filter_count_46", + ), + id="cryptbank_filter_count_46", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(~ISIN(DEFAULT_TO(YEAR(birthday), 1990), (1990, 1991)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [16]}), + "cryptbank_filter_count_47", + ), 
+ id="cryptbank_filter_count_47", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(~ISIN(DEFAULT_TO(YEAR(birthday), 1990), (1990, 2005)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [17]}), + "cryptbank_filter_count_48", + ), + id="cryptbank_filter_count_48", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(~ISIN(DEFAULT_TO(YEAR(birthday), 2005), (2005, 2006)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [18]}), + "cryptbank_filter_count_49", + ), + id="cryptbank_filter_count_49", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(~ISIN(DEFAULT_TO(YEAR(birthday), 2005), (2005, 2006)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [18]}), + "cryptbank_filter_count_50", + ), + id="cryptbank_filter_count_50", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(CONTAINS(IFF(ISIN(first_name[:1], ('q', 'r', 's')), first_name, last_name), 'ee'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [4]}), + "cryptbank_filter_count_51", + ), + id="cryptbank_filter_count_51", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(CONTAINS(last_name, JOIN_STRINGS('', 'e', IFF(ISIN(last_name[:1], ('q', 'r', 's')), 'z', 'e'))))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [5]}), + "cryptbank_filter_count_52", + ), + id="cryptbank_filter_count_52", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(first_name[0:1] == 'i')\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [1]}), + "cryptbank_filter_count_53", + ), + id="cryptbank_filter_count_53", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(ISIN(first_name[-1:], list('aeiou')))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [7]}), + "cryptbank_filter_count_54", + ), + id="cryptbank_filter_count_54", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(ISIN(first_name[1:3], ['ar', 'li', 'ra', 'to', 'am']))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [8]}), + "cryptbank_filter_count_55", + ), + id="cryptbank_filter_count_55", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(ISIN(first_name[-2:-1], ['a', 'c', 'l']))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [5]}), + "cryptbank_filter_count_56", + ), + id="cryptbank_filter_count_56", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(CONTAINS(first_name[:-1], 'e'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [8]}), + "cryptbank_filter_count_57", + ), + id="cryptbank_filter_count_57", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(CONTAINS(first_name[1:-1], 'e'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [7]}), + "cryptbank_filter_count_58", + ), + 
id="cryptbank_filter_count_58", + ), + pytest.param( + PyDoughPandasTest( + "selected_customers = customers.WHERE(CONTAINS('SLICE', UPPER(first_name[:1])))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + "CRYPTBANK", + lambda: pd.DataFrame({"n": [5]}), + "cryptbank_filter_count_59", + ), + id="cryptbank_filter_count_59", + ), pytest.param( PyDoughPandasTest( "selected_transactions = transactions.WHERE((YEAR(time_stamp) == 2022) & (MONTH(time_stamp) == 6))\n" @@ -496,6 +823,120 @@ ), id="cryptbank_agg_05", ), + pytest.param( + PyDoughPandasTest( + "result = CRYPTBANK.CALCULATE(n_neg=SUM(transactions.amount < 0), n_positive=SUM(transactions.amount > 0))", + "CRYPTBANK", + lambda: pd.DataFrame({"n_neg": [0], "n_positive": [300]}), + "cryptbank_agg_06", + ), + id="cryptbank_agg_06", + ), + pytest.param( + PyDoughPandasTest( + "result = CRYPTBANK.CALCULATE(" + " n_yr=SUM(DATETIME(transactions.time_stamp, 'start of year') == '2023-01-01')," + " n_qu=SUM(DATETIME(transactions.time_stamp, 'start of quarter') == '2023-04-01')," + " n_mo=SUM(DATETIME(transactions.time_stamp, 'start of month') == '2023-06-01')," + " n_we=SUM(DATETIME(transactions.time_stamp, 'start of week') == '2023-05-28')," + " n_da=SUM(DATETIME(transactions.time_stamp, 'start of day') == '2023-06-02')," + " n_ho=SUM(DATETIME(transactions.time_stamp, 'start of hour') == '2023-06-02 04:00:00')," + " n_mi=SUM(DATETIME(transactions.time_stamp, 'start of minute') == '2023-06-02 04:55:00')," + " n_se=SUM(DATETIME(transactions.time_stamp, 'start of second') == '2023-06-02 04:55:31')," + " n_cts=SUM(transactions.time_stamp == DATETIME('now', 'start of day'))," + " n_dts=SUM(transactions.time_stamp == DATETIME(JOIN_STRINGS('-', '2025', '12', '31')))," + " n_nst=SUM(DATETIME(transactions.time_stamp, 'start of week', '+3 days') == '2023-05-31')," + " n_ayr=SUM(DATETIME(transactions.time_stamp, '+1 Y') == '2020-11-11 18:00:52')," + " n_aqu=SUM(DATETIME(transactions.time_stamp, '+2 q') == '2020-05-11 18:00:52')," + " n_amo=SUM(DATETIME(transactions.time_stamp, '-5 Mm') == '2019-06-11 18:00:52')," + " n_awe=SUM(DATETIME(transactions.time_stamp, 'start of day', '+1 week') == '2023-06-09')," + " n_ada=SUM(DATETIME(transactions.time_stamp, '+10 DAYS') == '2019-11-21 18:00:52')," + " n_aho=SUM(DATETIME(transactions.time_stamp, '+1000 hour') == '2019-12-23 10:00:52')," + " n_ami=SUM(DATETIME(transactions.time_stamp, '+10000 minute') == '2019-11-18 16:40:52')," + " n_ase=SUM(DATETIME(transactions.time_stamp, '-1000000 s') == '2019-10-31 04:14:12')," + " n_ldm=SUM(DATETIME(transactions.time_stamp, 'start of month', '-1 day') == '2019-10-31')," + ")", + "CRYPTBANK", + lambda: pd.DataFrame( + { + "n_yr": [61], + "n_qu": [17], + "n_mo": [8], + "n_we": [2], + "n_da": [2], + "n_ho": [2], + "n_mi": [2], + "n_se": [1], + "n_cts": [0], + "n_dts": [0], + "n_nst": [2], + "n_ayr": [1], + "n_aqu": [1], + "n_amo": [1], + "n_awe": [2], + "n_ada": [1], + "n_aho": [1], + "n_ami": [1], + "n_ase": [1], + "n_ldm": [4], + } + ), + "cryptbank_agg_07", + ), + id="cryptbank_agg_07", + ), + pytest.param( + PyDoughPandasTest( + "result = (" + " accounts" + " .CALCULATE(partkey=(account_type == 'retirement') | (account_type == 'savings'))" + " .PARTITION(name='actyp', by=partkey)" + " .accounts" + " .BEST(per='actyp', by=balance.DESC())" + " .CALCULATE(account_type, key, balance)" + " .ORDER_BY(account_type.ASC())" + ")", + "CRYPTBANK", + lambda: pd.DataFrame( + { + "account_type": ["mma", "retirement"], + "key": [8, 28], + "balance": [5500.0, 25000.0], + 
} + ), + "cryptbank_window_01", + ), + id="cryptbank_window_01", + ), + pytest.param( + PyDoughPandasTest( + "result = (" + " branches" + " .WHERE(CONTAINS(address, ';CA;'))" + " .CALCULATE(branch_name=name)" + " .accounts_managed" + " .BEST(per='branches', by=((YEAR(creation_timestamp) == 2021).ASC(), key.ASC()))" + " .CALCULATE(branch_name, key, creation_timestamp)" + " .ORDER_BY(branch_name.ASC())" + ")", + "CRYPTBANK", + lambda: pd.DataFrame( + { + "branch_name": [ + "Downtown Los Angeles Branch", + "San Francisco Financial Branch", + ], + "key": [14, 8], + "creation_timestamp": [ + "2016-05-12 14:00:00", + "2018-07-19 14:10:00", + ], + } + ), + "cryptbank_window_02", + ), + id="cryptbank_window_02", + ), pytest.param( PyDoughPandasTest( "first_sent = accounts_held.transactions_sent.WHERE(receiver_account.branch.address[-5:] == '94105').BEST(per='accounts_held', by=time_stamp.ASC())\n" @@ -625,6 +1066,7 @@ def test_pipeline_until_relational_cryptbank( get_plan_test_filename: Callable[[str], str], update_tests: bool, enable_mask_rewrites: str, + mock_server_info: MaskServerInfo, ) -> None: """ Tests the conversion of the PyDough queries on the custom cryptbank dataset @@ -633,9 +1075,11 @@ def test_pipeline_until_relational_cryptbank( file_path: str = get_plan_test_filename( f"{cryptbank_pipeline_test_data.test_name}_{enable_mask_rewrites}" ) - cryptbank_pipeline_test_data.run_relational_test( - masked_graphs, file_path, update_tests - ) + # Capture stdout to avoid polluting the console with logging calls + with redirect_stdout(io.StringIO()): + cryptbank_pipeline_test_data.run_relational_test( + masked_graphs, file_path, update_tests, mask_server=mock_server_info + ) def test_pipeline_until_sql_cryptbank( @@ -645,6 +1089,7 @@ def test_pipeline_until_sql_cryptbank( get_sql_test_filename: Callable[[str, DatabaseDialect], str], update_tests: bool, enable_mask_rewrites: str, + mock_server_info: MaskServerInfo, ): """ Tests the conversion of the PyDough queries on the custom cryptbank dataset @@ -654,12 +1099,15 @@ def test_pipeline_until_sql_cryptbank( f"{cryptbank_pipeline_test_data.test_name}_{enable_mask_rewrites}", sqlite_tpch_db_context.dialect, ) - cryptbank_pipeline_test_data.run_sql_test( - masked_graphs, - file_path, - update_tests, - sqlite_tpch_db_context, - ) + # Capture stdout to avoid polluting the console with logging calls + with redirect_stdout(io.StringIO()): + cryptbank_pipeline_test_data.run_sql_test( + masked_graphs, + file_path, + update_tests, + sqlite_tpch_db_context, + mask_server=mock_server_info, + ) @pytest.mark.execute @@ -668,12 +1116,610 @@ def test_pipeline_e2e_cryptbank( masked_graphs: graph_fetcher, sqlite_cryptbank_connection: DatabaseContext, enable_mask_rewrites: str, + mock_server_info: MaskServerInfo, ): """ Test executing the the custom queries with the custom cryptbank dataset against the refsol DataFrame. 
""" - cryptbank_pipeline_test_data.run_e2e_test( - masked_graphs, - sqlite_cryptbank_connection, + # Capture stdout to avoid polluting the console with logging calls + with redirect_stdout(io.StringIO()): + cryptbank_pipeline_test_data.run_e2e_test( + masked_graphs, + sqlite_cryptbank_connection, + mask_server=mock_server_info, + ) + + +@pytest.mark.parametrize( + ["pydough_code", "batch_requests"], + [ + pytest.param( + "selected_customers = customers.WHERE(last_name == 'lee')\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + {"CRBNK/CUSTOMERS/c_lname: ['EQUAL', 2, '__col__', 'lee']", "DRY_RUN"}, + {"CRBNK/CUSTOMERS/c_lname: ['EQUAL', 2, '__col__', 'lee']"}, + ], + id="cryptbank_filter_count_01", + ), + pytest.param( + "selected_customers = customers.WHERE(ISIN(last_name, ('lee', 'smith', 'rodriguez')))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_lname: ['IN', 4, '__col__', 'lee', 'smith', 'rodriguez']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_lname: ['IN', 4, '__col__', 'lee', 'smith', 'rodriguez']" + }, + ], + id="cryptbank_filter_count_03", + ), + pytest.param( + "selected_customers = customers.WHERE(~ISIN(last_name, ('lee', 'smith', 'rodriguez')))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_lname: ['IN', 4, '__col__', 'lee', 'smith', 'rodriguez']", + "CRBNK/CUSTOMERS/c_lname: ['NOT', 1, 'IN', 4, '__col__', 'lee', 'smith', 'rodriguez']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_lname: ['NOT', 1, 'IN', 4, '__col__', 'lee', 'smith', 'rodriguez']", + }, + ], + id="cryptbank_filter_count_04", + ), + pytest.param( + "selected_customers = customers.WHERE(" + " (" + " PRESENT(address) &" + " PRESENT(birthday) &" + " (last_name != 'lopez') &" + " (ENDSWITH(first_name, 'a') | ENDSWITH(first_name, 'e') | ENDSWITH(first_name, 's'))" + ") | (ABSENT(birthday) & ENDSWITH(phone_number, '5'))" + ")\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_fname: ['ENDSWITH', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['ENDSWITH', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['ENDSWITH', 2, '__col__', 's']", + "CRBNK/CUSTOMERS/c_fname: ['OR', 2, 'ENDSWITH', 2, '__col__', 'a', 'ENDSWITH', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['OR', 2, 'OR', 2, 'ENDSWITH', 2, '__col__', 'a', 'ENDSWITH', 2, '__col__', 'e', 'ENDSWITH', 2, '__col__', 's']", + "CRBNK/CUSTOMERS/c_lname: ['NOT_EQUAL', 2, '__col__', 'lopez']", + "CRBNK/CUSTOMERS/c_phone: ['ENDSWITH', 2, '__col__', '5']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_fname: ['ENDSWITH', 2, '__col__', 's']", + "CRBNK/CUSTOMERS/c_fname: ['OR', 2, 'ENDSWITH', 2, '__col__', 'a', 'ENDSWITH', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_lname: ['NOT_EQUAL', 2, '__col__', 'lopez']", + "CRBNK/CUSTOMERS/c_phone: ['ENDSWITH', 2, '__col__', '5']", + }, + ], + id="cryptbank_filter_count_27", + ), + pytest.param( + "selected_accounts = accounts.WHERE(" + + " & ".join( + [ + "((account_type == 'retirement') | (account_type == 'savings'))", + "(balance >= 5000)", + "(CONTAINS(account_holder.email, 'outlook') | CONTAINS(account_holder.email, 'gmail'))", + "(YEAR(creation_timestamp) < 2020)", + ] + ) + + ")\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + [ + { + "CRBNK/ACCOUNTS/a_balance: ['GTE', 2, '__col__', 5000]", + "CRBNK/ACCOUNTS/a_open_ts: ['LT', 2, 'YEAR', 1, '__col__', 2020]", + "CRBNK/ACCOUNTS/a_type: ['EQUAL', 2, '__col__', 'retirement']", + "CRBNK/ACCOUNTS/a_type: 
['EQUAL', 2, '__col__', 'savings']", + "CRBNK/CUSTOMERS/c_email: ['CONTAINS', 2, '__col__', 'gmail']", + "CRBNK/CUSTOMERS/c_email: ['CONTAINS', 2, '__col__', 'outlook']", + "CRBNK/ACCOUNTS/a_type: ['OR', 2, 'EQUAL', 2, '__col__', 'retirement', 'EQUAL', 2, '__col__', 'savings']", + "CRBNK/CUSTOMERS/c_email: ['OR', 2, 'CONTAINS', 2, '__col__', 'outlook', 'CONTAINS', 2, '__col__', 'gmail']", + "DRY_RUN", + }, + { + "CRBNK/ACCOUNTS/a_type: ['OR', 2, 'EQUAL', 2, '__col__', 'retirement', 'EQUAL', 2, '__col__', 'savings']", + }, + ], + id="cryptbank_filter_count_28", + ), + pytest.param( + "selected_customers = customers.WHERE(birthday <= '1925-01-01')\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_birthday: ['LTE', 2, '__col__', '1925-01-01']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_birthday: ['LTE', 2, '__col__', '1925-01-01']", + }, + ], + id="cryptbank_filter_count_29", + ), + pytest.param( + "selected_customers = customers.WHERE(" + " ISIN(YEAR(birthday) - 2, (1975, 1977, 1979, 1981, 1983, 1985, 1987, 1989, 1991, 1993))" + " & ISIN(MONTH(birthday) + 1, (2, 4, 6, 8, 10, 12))" + ")\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_birthday: ['AND', 2, 'IN', 7, 'ADD', 2, 'MONTH', 1, '__col__', 1, 2, 4, 6, 8, 10, 12, 'IN', 11, 'SUB', 2, 'YEAR', 1, '__col__', 2, 1975, 1977, 1979, 1981, 1983, 1985, 1987, 1989, 1991, 1993]", + "CRBNK/CUSTOMERS/c_birthday: ['IN', 11, 'SUB', 2, 'YEAR', 1, '__col__', 2, 1975, 1977, 1979, 1981, 1983, 1985, 1987, 1989, 1991, 1993]", + "CRBNK/CUSTOMERS/c_birthday: ['IN', 7, 'ADD', 2, 'MONTH', 1, '__col__', 1, 2, 4, 6, 8, 10, 12]", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_birthday: ['AND', 2, 'IN', 7, 'ADD', 2, 'MONTH', 1, '__col__', 1, 2, 4, 6, 8, 10, 12, 'IN', 11, 'SUB', 2, 'YEAR', 1, '__col__', 2, 1975, 1977, 1979, 1981, 1983, 1985, 1987, 1989, 1991, 1993]", + }, + ], + id="cryptbank_filter_count_30", + ), + pytest.param( + "selected_customers = customers.WHERE(ISIN(birthday, [datetime.date(1991, 11, 15), datetime.date(1978, 2, 11), datetime.date(2005, 3, 14), datetime.date(1985, 4, 12)]))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_birthday: ['IN', 5, '__col__', '1991-11-15', '1978-02-11', '2005-03-14', '1985-04-12']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_birthday: ['IN', 5, '__col__', '1991-11-15', '1978-02-11', '2005-03-14', '1985-04-12']", + }, + ], + id="cryptbank_filter_count_31", + ), + pytest.param( + "selected_accounts = accounts.WHERE(MONOTONIC(pd.Timestamp('2020-03-28 09:20:00'), creation_timestamp, datetime.datetime(2020, 9, 20, 8, 30, 0)))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + [ + { + "CRBNK/ACCOUNTS/a_open_ts: ['AND', 2, 'LTE', 2, '2020-03-28 09:20:00', '__col__', 'LTE', 2, '__col__', '2020-09-20 08:30:00']", + "DRY_RUN", + }, + { + "CRBNK/ACCOUNTS/a_open_ts: ['AND', 2, 'LTE', 2, '2020-03-28 09:20:00', '__col__', 'LTE', 2, '__col__', '2020-09-20 08:30:00']", + }, + ], + id="cryptbank_filter_count_32", + ), + pytest.param( + "result = CRYPTBANK.CALCULATE(n_neg=SUM(transactions.amount < 0), n_positive=SUM(transactions.amount > 0))", + [ + { + "CRBNK/TRANSACTIONS/t_amount: ['LT', 2, '__col__', 0]", + "CRBNK/TRANSACTIONS/t_amount: ['GT', 2, '__col__', 0]", + "DRY_RUN", + }, + { + "CRBNK/TRANSACTIONS/t_amount: ['LT', 2, '__col__', 0]", + "CRBNK/TRANSACTIONS/t_amount: ['GT', 2, '__col__', 0]", + }, + ], + id="cryptbank_agg_06", + ), + pytest.param( + "result = 
CRYPTBANK.CALCULATE(" + " n_yr=SUM(DATETIME(transactions.time_stamp, 'start of year') == '2023-01-01')," + " n_qu=SUM(DATETIME(transactions.time_stamp, 'start of quarter') == '2023-04-01')," + " n_mo=SUM(DATETIME(transactions.time_stamp, 'start of month') == '2023-06-01')," + " n_we=SUM(DATETIME(transactions.time_stamp, 'start of week') == '2023-05-28')," + " n_da=SUM(DATETIME(transactions.time_stamp, 'start of day') == '2023-06-02')," + " n_ho=SUM(DATETIME(transactions.time_stamp, 'start of hour') == '2023-06-02 04:00:00')," + " n_mi=SUM(DATETIME(transactions.time_stamp, 'start of minute') == '2023-06-02 04:55:00')," + " n_se=SUM(DATETIME(transactions.time_stamp, 'start of second') == '2023-06-02 04:55:31')," + " n_cts=SUM(transactions.time_stamp == DATETIME('now', 'start of day'))," + " n_dts=SUM(transactions.time_stamp == DATETIME(JOIN_STRINGS('-', '2025', '12', '31')))," + " n_nst=SUM(DATETIME(transactions.time_stamp, 'start of week', '+3 days') == '2023-05-31')," + " n_ayr=SUM(DATETIME(transactions.time_stamp, '+1 Y') == '2020-11-11 18:00:52')," + " n_aqu=SUM(DATETIME(transactions.time_stamp, '+2 q') == '2020-05-11 18:00:52')," + " n_amo=SUM(DATETIME(transactions.time_stamp, '-5 Mm') == '2019-06-11 18:00:52')," + " n_awe=SUM(DATETIME(transactions.time_stamp, 'start of day', '+1 week') == '2023-06-09')," + " n_ada=SUM(DATETIME(transactions.time_stamp, '+10 DAYS') == '2019-11-21 18:00:52')," + " n_aho=SUM(DATETIME(transactions.time_stamp, '+1000 hour') == '2019-12-23 10:00:52')," + " n_ami=SUM(DATETIME(transactions.time_stamp, '+10000 minute') == '2019-11-18 16:40:52')," + " n_ase=SUM(DATETIME(transactions.time_stamp, '-1000000 s') == '2019-10-31 04:14:12')," + " n_ldm=SUM(DATETIME(transactions.time_stamp, 'start of month', '-1 day') == '2019-10-31')," + ")", + [ + { + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'day', '__col__', '2023-06-02']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'hour', '__col__', '2023-06-02 04:00:00']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'minute', '__col__', '2023-06-02 04:55:00']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'month', '__col__', '2023-06-01']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'quarter', '__col__', '2023-04-01']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'second', '__col__', '2023-06-02 04:55:31']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'year', '__col__', '2023-01-01']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 1, 'years', '__col__', '2020-11-11 18:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 2, 'quarters', '__col__', '2020-05-11 18:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, -5, 'months', '__col__', '2019-06-11 18:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 10, 'days', '__col__', '2019-11-21 18:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 1000, 'hours', '__col__', '2019-12-23 10:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 10000, 'minutes', '__col__', '2019-11-18 16:40:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, -1000000, 'seconds', '__col__', '2019-10-31 04:14:12']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, -1, 'days', 'DATETRUNC', 2, 'month', '__col__', '2019-10-31']", + "DRY_RUN", + }, + { + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'day', '__col__', '2023-06-02']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'hour', '__col__', '2023-06-02 
04:00:00']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'minute', '__col__', '2023-06-02 04:55:00']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'month', '__col__', '2023-06-01']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'quarter', '__col__', '2023-04-01']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'second', '__col__', '2023-06-02 04:55:31']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATETRUNC', 2, 'year', '__col__', '2023-01-01']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 1, 'years', '__col__', '2020-11-11 18:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 2, 'quarters', '__col__', '2020-05-11 18:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, -5, 'months', '__col__', '2019-06-11 18:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 10, 'days', '__col__', '2019-11-21 18:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 1000, 'hours', '__col__', '2019-12-23 10:00:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, 10000, 'minutes', '__col__', '2019-11-18 16:40:52']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, -1000000, 'seconds', '__col__', '2019-10-31 04:14:12']", + "CRBNK/TRANSACTIONS/t_ts: ['EQUAL', 2, 'DATEADD', 3, -1, 'days', 'DATETRUNC', 2, 'month', '__col__', '2019-10-31']", + }, + ], + id="cryptbank_agg_07", + ), + pytest.param( + "selected_accounts = accounts.WHERE(QUARTER(creation_timestamp) == DAY(creation_timestamp))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_accounts))", + [ + { + "CRBNK/ACCOUNTS/a_open_ts: ['EQUAL', 2, 'QUARTER', 1, '__col__', 'DAY', 1, '__col__']", + "DRY_RUN", + }, + { + "CRBNK/ACCOUNTS/a_open_ts: ['EQUAL', 2, 'QUARTER', 1, '__col__', 'DAY', 1, '__col__']" + }, + ], + id="cryptbank_filter_count_34", + ), + pytest.param( + "selected_customers = customers.WHERE(CONTAINS(JOIN_STRINGS('', '1-', phone_number), '1-5'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_phone: ['CONTAINS', 2, 'CONCAT', 2, '1-', '__col__', '1-5']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_phone: ['CONTAINS', 2, 'CONCAT', 2, '1-', '__col__', '1-5']" + }, + ], + id="cryptbank_filter_count_40", + ), + pytest.param( + "selected_customers = customers.WHERE(CONTAINS(JOIN_STRINGS('-', '1', phone_number), '1-5'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_phone: ['CONTAINS', 2, 'CONCAT', 3, '1', '-', '__col__', '1-5']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_phone: ['CONTAINS', 2, 'CONCAT', 3, '1', '-', '__col__', '1-5']" + }, + ], + id="cryptbank_filter_count_41", + ), + pytest.param( + "selected_customers = customers.WHERE(CONTAINS(JOIN_STRINGS('-', '1', phone_number, '1'), '5-1'))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_phone: ['CONTAINS', 2, 'CONCAT', 5, '1', '-', '__col__', '-', '1', '5-1']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_phone: ['CONTAINS', 2, 'CONCAT', 5, '1', '-', '__col__', '-', '1', '5-1']" + }, + ], + id="cryptbank_filter_count_42", + ), + pytest.param( + "selected_customers = customers.WHERE(JOIN_STRINGS(' ', first_name, last_name) == 'olivia anderson')\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [], + id="cryptbank_filter_count_43", + ), + pytest.param( + "selected_customers = customers.WHERE(CONTAINS('SLICE', UPPER(first_name[:1])))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + 
"CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, 'SLICE', 'UPPER', 1, 'SLICE', 3, '__col__', 0, 1]", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, 'SLICE', 'UPPER', 1, 'SLICE', 3, '__col__', 0, 1]" + }, + ], + id="cryptbank_filter_count_59", + ), + pytest.param( + "selected_customers = customers.WHERE(ISIN(first_name, ['Datediff', 'YEAR', 'IN', 'NOT IN', 'NEQ', 'NOT_EQUAL', 'lower']))\n" + "result = CRYPTBANK.CALCULATE(n=COUNT(selected_customers))", + [ + { + "CRBNK/CUSTOMERS/c_fname: ['IN', 8, '__col__', 'Datediff', 'YEAR', 'IN', 'NOT IN', 'NEQ', 'NOT_EQUAL', 'lower']", + "DRY_RUN", + }, + ], + id="cryptbank_quote_list", + ), + pytest.param( + "result = CRYPTBANK.CALCULATE(" + + ", ".join( + f"n{idx}=COUNT(customers.WHERE({cond}))" + for idx, cond in enumerate( + [ + "CONTAINS(first_name, 'a')", + "CONTAINS(first_name, 'e')", + "CONTAINS(first_name, 'i')", + "CONTAINS(first_name, 'o')", + "CONTAINS(first_name, 'u')", + ] + ) + ) + + ")", + [ + { + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + }, + ], + id="cryptbank_multi_fcount_01", + ), + pytest.param( + "result = CRYPTBANK.CALCULATE(" + + ", ".join( + f"n{idx}=COUNT(customers.WHERE({cond}))" + for idx, cond in enumerate( + [ + "CONTAINS(first_name, 'a') & CONTAINS(first_name, 'e')", + "CONTAINS(first_name, 'e') & CONTAINS(first_name, 'i')", + "CONTAINS(first_name, 'i') & CONTAINS(first_name, 'o')", + "CONTAINS(first_name, 'o') & CONTAINS(first_name, 'u')", + "CONTAINS(first_name, 'u') & CONTAINS(first_name, 'a')", + ] + ) + ) + + ")", + [ + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', " + "'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', " + "'CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'e', " + "'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'i', " + "'CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'o', " + "'CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'o', 'CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + }, + ], + 
id="cryptbank_multi_fcount_02", + ), + pytest.param( + "result = CRYPTBANK.CALCULATE(" + + ", ".join( + f"n{idx}=COUNT(customers.WHERE({cond}))" + for idx, cond in enumerate( + [ + "CONTAINS(first_name, 'a') & CONTAINS(first_name, 'e')", + "CONTAINS(first_name, 'a') & CONTAINS(first_name, 'i')", + "CONTAINS(first_name, 'a') & CONTAINS(first_name, 'o')", + "CONTAINS(first_name, 'a') & CONTAINS(first_name, 'u')", + "CONTAINS(first_name, 'a')", + ] + ) + ) + + ")", + [ + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + }, + ], + id="cryptbank_multi_fcount_03", + ), + pytest.param( + "result = CRYPTBANK.CALCULATE(" + + ", ".join( + f"n{idx}=COUNT(customers.WHERE({cond}))" + for idx, cond in enumerate( + [ + "CONTAINS(first_name, 'a') & CONTAINS(first_name, 'e')", + "CONTAINS(first_name, 'e') & CONTAINS(first_name, 'i')", + "CONTAINS(first_name, 'i') & CONTAINS(first_name, 'u')", + "CONTAINS(first_name, 'a') & CONTAINS(first_name, 'i')", + ] + ) + ) + + ")", + [ + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + }, + ], + id="cryptbank_multi_fcount_04", + ), + pytest.param( + "result = CRYPTBANK.CALCULATE(" + + ", ".join( + f"n{idx}=COUNT(customers.WHERE({cond}))" + for idx, cond in enumerate( + [ + "CONTAINS(first_name, 'a') & CONTAINS(first_name, 'e') & CONTAINS(first_name, 'i')", + "CONTAINS(first_name, 'e') & CONTAINS(first_name, 'i') & CONTAINS(first_name, 'o')", + "CONTAINS(first_name, 'i') & CONTAINS(first_name, 'o') & CONTAINS(first_name, 'u')", + 
] + ) + ) + + ")", + [ + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o', 'CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + }, + ], + id="cryptbank_multi_fcount_05", + ), + pytest.param( + "result = CRYPTBANK.CALCULATE(" + + ", ".join( + f"n{idx}=COUNT(customers.WHERE({cond}))" + for idx, cond in enumerate( + [ + "~(CONTAINS(first_name, 'a') & CONTAINS(first_name, 'e')) & CONTAINS(first_name, 'i')", + "~(CONTAINS(first_name, 'e') & CONTAINS(first_name, 'i')) & CONTAINS(first_name, 'o')", + ] + ) + ) + + ")", + [ + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'i', 'NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'o', 'NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + "DRY_RUN", + }, + { + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'i', 'NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", + }, + ], + id="cryptbank_multi_fcount_06", + ), + ], +) +def test_cryptbank_mask_server_logging( + pydough_code: str, + batch_requests: list[set[str]], + masked_graphs: graph_fetcher, + enable_mask_rewrites: str, + mock_server_info: MaskServerInfo, + caplog, +): + """ + Tests whether, during the conversion of the PyDough queries on the custom + cryptbank dataset into SQL text, the correct logging calls are made + regarding batches sent to the mask server. 
This is to ensure that the calls + are being batched as expected, the right calls are being sent to the server, + and expressions that are non-predicates are not being sent, even if they are + a valid sub-expression of a predicate that can be sent. + """ + # Obtain the graph and the unqualified node + graph: GraphMetadata = masked_graphs("CRYPTBANK") + root: UnqualifiedNode = transform_and_exec_pydough( + pydough_code, + masked_graphs("CRYPTBANK"), + {"datetime": datetime, "pd": pd}, ) + + # Convert the PyDough code to SQL text, while capturing + # stdout to avoid polluting the console with logging calls + with redirect_stdout(io.StringIO()): + to_sql(root, metadata=graph, mask_server=mock_server_info) + + # Retrieve the output from the captured logger output + batch_requests_made: list[set[str]] = extract_batch_requests_from_logs(caplog.text) + + # If in raw mode, make sure no requests were made. Otherwise, compare the + # expected batch requests to those made. + if enable_mask_rewrites == "raw": + assert batch_requests_made == [], ( + "Expected no batch requests to be made in 'raw' mode." + ) + else: + assert batch_requests_made == batch_requests, ( + "The batch requests made do not match the expected batch requests." + ) diff --git a/tests/test_metadata/masked_graphs.json b/tests/test_metadata/masked_graphs.json index ef9680335..7ff27c758 100644 --- a/tests/test_metadata/masked_graphs.json +++ b/tests/test_metadata/masked_graphs.json @@ -17,6 +17,7 @@ "unprotect protocol": "(42 - ({}))", "protect protocol": "(42 - ({}))", "server masked": true, + "server dataset id": "dummy_server", "description": "The unique key for the customer", "sample values": [1, 2, 10, 18], "synonyms": ["customer id"] @@ -29,6 +30,7 @@ "unprotect protocol": "LOWER({})", "protect protocol": "UPPER({})", "server masked": true, + "server dataset id": "dummy_server", "description": "The first name of the customer", "sample values": ["alice", "olivia", "queenie", "james"], "synonyms": ["customer name"] @@ -41,6 +43,7 @@ "unprotect protocol": "LOWER({})", "protect protocol": "UPPER({})", "server masked": true, + "server dataset id": "dummy_server", "description": "The last name of the customer", "sample values": ["smith", "johnson", "lee"], "synonyms": ["surname"] @@ -53,6 +56,7 @@ "unprotect protocol": "REPLACE(REPLACE(REPLACE({}, '9', '*'), '0', '9'), '*', '0')", "protect protocol": "REPLACE(REPLACE(REPLACE({}, '0', '*'), '9', '0'), '*', '9')", "server masked": true, + "server dataset id": "dummy_server", "description": "The phone number of the customer", "sample values": ["555-123-4567", "555-768-9012"], "synonyms": ["cell number", "contact phone number"] @@ -65,6 +69,7 @@ "unprotect protocol": "SUBSTRING({0}, -1) || SUBSTRING({0}, 1, LENGTH({0}) - 1)", "protect protocol": "SUBSTRING({0}, 2) || SUBSTRING({0}, 1, 1)", "server masked": true, + "server dataset id": "dummy_server", "description": "The email address of the customer", "sample values": ["alice_j@example.org", "m.gonzalez@ymail.com"], "synonyms": ["email address", "contact email"] @@ -77,6 +82,7 @@ "unprotect protocol": "SUBSTRING({0}, -1) || SUBSTRING({0}, 1, LENGTH({0}) - 1)", "protect protocol": "SUBSTRING({0}, 2) || SUBSTRING({0}, 1, 1)", "server masked": true, + "server dataset id": "dummy_server", "description": "The address of the customer, in the format 'street address;city;state;zip'", "sample values": ["123 Maple St;Portland;OR;97205", "654 Cedar Blvd;San Diego;CA;92101"], "synonyms": ["home address", "residential address", "residence", 
"location", "home"] @@ -89,6 +95,7 @@ "unprotect protocol": "DATE({}, '+472 days')", "protect protocol": "DATE({}, '-472 days')", "server masked": true, + "server dataset id": "dummy_server", "description": "The date the customer was born on.", "synonyms": ["birth date", "date of birth", "DOB"] } @@ -147,6 +154,7 @@ "unprotect protocol": "CASE WHEN {0} = 0 THEN 0 ELSE (CASE WHEN {0} > 0 THEN 1 ELSE -1 END) * CAST(SUBSTRING({0}, 1 + INSTR({0}, '-'), LENGTH({0}) / 2) AS INTEGER) END", "protect protocol": "CAST(STRING({0}) || STRING(ABS({0})) AS INTEGER)", "server masked": true, + "server dataset id": "dummy_server", "description": "The unique key for the account", "sample values": [5, 17, 35, 58], "synonyms": ["account id", "account number"] @@ -177,6 +185,7 @@ "unprotect protocol": "SQRT({})", "protect protocol": "({0} * {0})", "server masked": true, + "server dataset id": "dummy_server", "description": "The current balance of the account", "sample values": [2450.75, 520.10, 22500.00], "synonyms": ["money in account", "account balance", "funds available"] @@ -189,6 +198,7 @@ "unprotect protocol": "SUBSTRING({0}, -1) || SUBSTRING({0}, 1, LENGTH({0}) - 1)", "protect protocol": "SUBSTRING({0}, 2) || SUBSTRING({0}, 1, 1)", "server masked": true, + "server dataset id": "dummy_server", "description": "The type of account, which is one of: checking, savings, retirement, business, or mma", "sample values": ["checking", "savings", "retirement", "business", "mma"], "synonyms": ["category"] @@ -201,6 +211,7 @@ "unprotect protocol": "DATETIME({}, '+123456789 seconds')", "protect protocol": "DATETIME({}, '-123456789 seconds')", "server masked": true, + "server dataset id": "dummy_server", "description": "The timestamp when the account was opened at the branch", "synonyms": ["datetime of opening", "account creation date", "account open timestamp"] } @@ -249,6 +260,7 @@ "unprotect protocol": "(1025.67 - ({}))", "protect protocol": "(1025.67 - ({}))", "server masked": true, + "server dataset id": "dummy_server", "description": "The amount of money transferred in the transaction", "sample values": [2753.92, 322.67, 5278.45], "synonyms": ["amount wired", "transaction amount", "money transferred"] @@ -261,6 +273,7 @@ "unprotect protocol": "DATETIME({}, '+54321 seconds')", "protect protocol": "DATETIME({}, '-54321 seconds')", "server masked": true, + "server dataset id": "dummy_server", "description": "The timestamp when the transaction occurred", "synonyms": ["transaction datetime", "transaction time", "datetime of transfer"] } diff --git a/tests/test_metadata/sf_masked_examples.json b/tests/test_metadata/sf_masked_examples.json index 4232f56bd..fdba1e454 100644 --- a/tests/test_metadata/sf_masked_examples.json +++ b/tests/test_metadata/sf_masked_examples.json @@ -24,6 +24,7 @@ "column name": "customerid", "data type": "numeric", "server masked": true, + "server dataset id": "BODO.FSI.ACCOUNTS", "unprotect protocol": "PTY_UNPROTECT_ACCOUNT({})", "protect protocol": "PTY_PROTECT_ACCOUNT({})", "description": "ID of the customer who owns the account", @@ -36,6 +37,7 @@ "column name": "accounttype", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.ACCOUNTS", "unprotect protocol": "PTY_UNPROTECT({}, 'deAccount')", "protect protocol": "PTY_PROTECT({}, 'deAccount')", "description": "Type of the account (either 'Checking' or 'Savings')", @@ -57,6 +59,7 @@ "column name": "currency", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.ACCOUNTS", "unprotect 
protocol": "PTY_UNPROTECT_ACCOUNT({})", "protect protocol": "PTY_PROTECT({}, 'deAccount')", "description": "Currency of the account (either 'USD', 'EUR', or 'GBP')", @@ -69,6 +72,7 @@ "column name": "createddate", "data type": "datetime", "server masked": true, + "server dataset id": "BODO.FSI.ACCOUNTS", "unprotect protocol": "PTY_UNPROTECT_DOB({})", "protect protocol": "PTY_PROTECT_DOB({})", "description": "The date the account was created", @@ -81,6 +85,7 @@ "column name": "status", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.ACCOUNTS", "unprotect protocol": "PTY_UNPROTECT_ACCOUNT({})", "protect protocol": "PTY_PROTECT({}, 'deAccount')", "description": "The current status of the account (either 'Active' or 'Inactive')", @@ -205,6 +210,7 @@ "column name": "customerid", "data type": "numeric", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deAccount')", "protect protocol": "PTY_PROTECT_ACCOUNT({})", "description": "The unique id for the customer", @@ -217,6 +223,7 @@ "column name": "firstname", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deName')", "protect protocol": "PTY_PROTECT({}, 'deName')", "description": "The first name of the customer", @@ -229,6 +236,7 @@ "column name": "lastname", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deName')", "protect protocol": "PTY_PROTECT_NAME({})", "description": "The last name of the customer", @@ -241,6 +249,7 @@ "column name": "address", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT_ADDRESS({})", "protect protocol": "PTY_PROTECT({}, 'deAddress')", "description": "The address of the customer", @@ -255,6 +264,7 @@ "column name": "city", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT_ADDRESS({})", "protect protocol": "PTY_PROTECT_ADDRESS({})", "description": "The city where the customer resides", @@ -267,6 +277,7 @@ "column name": "state", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deAddress')", "protect protocol": "PTY_PROTECT({}, 'deAddress')", "description": "The state where the customer resides", @@ -288,6 +299,7 @@ "column name": "email", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deEmail')", "protect protocol": "PTY_PROTECT_EMAIL({})", "description": "The email address of the customer", @@ -301,6 +313,7 @@ "column name": "phonenumber", "data type": "string", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT_PHONE({})", "protect protocol": "PTY_PROTECT({}, 'dePhone')", "description": "The phone number of the customer", @@ -313,6 +326,7 @@ "column name": "dob", "data type": "datetime", "server masked": true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deDOB')", "protect protocol": "PTY_PROTECT({}, 'deDOB')", "description": "The date of birth of the customer", @@ -361,6 +375,7 @@ "column name": "creditcardnumber", "data type": "numeric", "server masked": 
true, + "server dataset id": "BODO.FSI.PROTECTED_CUSTOMERS", "unprotect protocol": "PTY_UNPROTECT_CCN({})", "protect protocol": "PTY_PROTECT_CCN({})", "description": "The credit card number of the customer", @@ -571,6 +586,7 @@ "column name": "patient_id", "data type": "numeric", "server masked": true, + "server dataset id": "BODO.HEALTH.CLAIMS", "unprotect protocol": "PTY_UNPROTECT({}, 'deAccount')", "protect protocol": "PTY_PROTECT_ACCOUNT({})", "description": "The id of the patient who filed the claim", @@ -583,6 +599,7 @@ "column name": "claim_date", "data type": "datetime", "server masked": true, + "server dataset id": "BODO.HEALTH.CLAIMS", "unprotect protocol": "PTY_UNPROTECT_DOB({})", "protect protocol": "PTY_PROTECT({}, 'deDOB')", "description": "The date when the claim was filed", @@ -595,6 +612,7 @@ "column name": "provider_name", "data type": "string", "server masked": true, + "server dataset id": "BODO.HEALTH.CLAIMS", "unprotect protocol": "PTY_UNPROTECT({}, 'deName')", "protect protocol": "PTY_PROTECT({}, 'deName')", "description": "The name of the healthcare provider", @@ -643,6 +661,7 @@ "column name": "claim_status", "data type": "string", "server masked": true, + "server dataset id": "BODO.HEALTH.CLAIMS", "unprotect protocol": "PTY_UNPROTECT({}, 'deAccount')", "protect protocol": "PTY_PROTECT_ACCOUNT({})", "description": "The current status of the claim (either 'Pending', 'Approved', or 'Denied')", @@ -735,6 +754,7 @@ "column name": "patient_id", "data type": "numeric", "server masked": true, + "server dataset id": "BODO.HEALTH.PROTECTED_PATIENTS", "unprotect protocol": "PTY_UNPROTECT_ACCOUNT({})", "protect protocol": "PTY_PROTECT_ACCOUNT({})", "description": "The unique id for the patient", @@ -747,6 +767,7 @@ "column name": "first_name", "data type": "string", "server masked": true, + "server dataset id": "BODO.HEALTH.PROTECTED_PATIENTS", "unprotect protocol": "PTY_UNPROTECT_NAME({})", "protect protocol": "PTY_PROTECT({}, 'deName')", "description": "The first name of the patient", @@ -759,6 +780,7 @@ "column name": "last_name", "data type": "string", "server masked": true, + "server dataset id": "BODO.HEALTH.PROTECTED_PATIENTS", "unprotect protocol": "PTY_UNPROTECT({}, 'deName')", "protect protocol": "PTY_PROTECT({}, 'deName')", "description": "The last name of the patient", @@ -771,6 +793,7 @@ "column name": "date_of_birth", "data type": "datetime", "server masked": true, + "server dataset id": "BODO.HEALTH.PROTECTED_PATIENTS", "unprotect protocol": "PTY_UNPROTECT({}, 'deDOB')", "protect protocol": "PTY_PROTECT_DOB({})", "description": "The date of birth of the patient", @@ -792,6 +815,7 @@ "column name": "ssn", "data type": "string", "server masked": true, + "server dataset id": "BODO.HEALTH.PROTECTED_PATIENTS", "unprotect protocol": "PTY_UNPROTECT_SSN({})", "protect protocol": "PTY_PROTECT_SSN({})", "description": "The social security number of the patient", @@ -804,6 +828,7 @@ "column name": "address", "data type": "string", "server masked": true, + "server dataset id": "BODO.HEALTH.PROTECTED_PATIENTS", "unprotect protocol": "PTY_UNPROTECT_ADDRESS({})", "protect protocol": "PTY_PROTECT({}, 'deAddress')", "description": "The address of the patient", @@ -818,6 +843,7 @@ "column name": "phone_number", "data type": "string", "server masked": true, + "server dataset id": "BODO.HEALTH.PROTECTED_PATIENTS", "unprotect protocol": "PTY_UNPROTECT({}, 'dePhone')", "protect protocol": "PTY_PROTECT({}, 'dePhone')", "description": "The phone number of the patient", @@ -830,6 +856,7 
@@ "column name": "email", "data type": "string", "server masked": true, + "server dataset id": "BODO.HEALTH.PROTECTED_PATIENTS", "unprotect protocol": "PTY_UNPROTECT({}, 'deEmail')", "protect protocol": "PTY_PROTECT({}, 'deEmail')", "description": "The email address of the patient", @@ -933,6 +960,7 @@ "column name": "first_name", "data type": "string", "server masked": true, + "server dataset id": "BODO.RETAIL.PROTECTED_LOYALTY_MEMBERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deName')", "protect protocol": "PTY_PROTECT_NAME({})", "description": "The first name of the loyalty member", @@ -945,6 +973,7 @@ "column name": "last_name", "data type": "string", "server masked": true, + "server dataset id": "BODO.RETAIL.PROTECTED_LOYALTY_MEMBERS", "unprotect protocol": "PTY_UNPROTECT_NAME({})", "protect protocol": "PTY_PROTECT({}, 'deName')", "description": "The last name of the loyalty member", @@ -957,6 +986,7 @@ "column name": "email", "data type": "string", "server masked": true, + "server dataset id": "BODO.RETAIL.PROTECTED_LOYALTY_MEMBERS", "unprotect protocol": "PTY_UNPROTECT_EMAIL({})", "protect protocol": "PTY_PROTECT_EMAIL({})", "description": "The email address of the loyalty member", @@ -978,6 +1008,7 @@ "column name": "address", "data type": "string", "server masked": true, + "server dataset id": "BODO.RETAIL.PROTECTED_LOYALTY_MEMBERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deAddress')", "protect protocol": "PTY_PROTECT({}, 'deAddress')", "description": "The address of the loyalty member", @@ -992,6 +1023,7 @@ "column name": "date_of_birth", "data type": "datetime", "server masked": true, + "server dataset id": "BODO.RETAIL.PROTECTED_LOYALTY_MEMBERS", "unprotect protocol": "PTY_UNPROTECT({}, 'deDOB')", "protect protocol": "PTY_PROTECT({}, 'deDOB')", "description": "Birthdate of the loyalty member", @@ -1062,6 +1094,7 @@ "column name": "transaction_date", "data type": "datetime", "server masked": true, + "server dataset id": "BODO.RETAIL.TRANSACTIONS", "unprotect protocol": "PTY_UNPROTECT_TS({})", "protect protocol": "PTY_PROTECT_TS({})", "description": "The date when the transaction occurred", @@ -1074,6 +1107,7 @@ "column name": "store_location", "data type": "string", "server masked": true, + "server dataset id": "BODO.RETAIL.TRANSACTIONS", "unprotect protocol": "PTY_UNPROTECT_ADDRESS({})", "protect protocol": "PTY_PROTECT({}, 'deAddress')", "description": "The location of the store where the transaction took place", @@ -1095,6 +1129,7 @@ "column name": "payment_method", "data type": "string", "server masked": true, + "server dataset id": "BODO.RETAIL.TRANSACTIONS", "unprotect protocol": "PTY_UNPROTECT_ACCOUNT({})", "protect protocol": "PTY_PROTECT_ACCOUNT({})", "description": "The method used for payment (either 'Cash', 'Credit Card', 'Gift Card', or 'Mobile Payment')", diff --git a/tests/test_mock_mask_server.py b/tests/test_mock_mask_server.py index 4d0e44479..8f2925f4b 100644 --- a/tests/test_mock_mask_server.py +++ b/tests/test_mock_mask_server.py @@ -2,7 +2,9 @@ Unit tests for the PyDough mask server module. 
""" +import io import re +from contextlib import redirect_stdout import pytest @@ -22,27 +24,43 @@ None, [ MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=["EQUAL", 2, "__col__", 0], ), MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=["OR", 2, "__col__", 5], ), MaskServerInput( - table_path="srv.db.orders", + dataset_id="dummy_server", + table_path="db/orders", column_name="order_date", - expression=["BETWEEN", 3, "__col__", "2025-01-01", "2025-02-01"], + expression=[ + "AND", + 2, + "LTE", + 2, + "2025-01-01", + "__col__", + "LTE", + 2, + "__col__", + "2025-02-01", + ], ), MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db/tbl", column_name="col", expression=["GT", 2, "__col__", 45.67], ), MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=["NOT_EQUAL", 2, "__col__", "LOWER", 1, "Smith"], ), @@ -84,7 +102,8 @@ None, [ MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=["EQUAL", 2, "__col__", 0], ), @@ -105,7 +124,8 @@ None, [ MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=["OR", 2, "__col__", 5], ), @@ -122,9 +142,21 @@ "test-token-123", [ MaskServerInput( - table_path="srv.db.orders", + dataset_id="dummy_server", + table_path="db.orders", column_name="order_date", - expression=["BETWEEN", 3, "__col__", "2025-01-01", "2025-02-01"], + expression=[ + "AND", + 2, + "LTE", + 2, + "2025-01-01", + "__col__", + "LTE", + 2, + "__col__", + "2025-02-01", + ], ), ], [ @@ -145,7 +177,8 @@ "test-token-123", [ MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=["NOT_EQUAL", 2, "__col__", True], ), @@ -162,12 +195,14 @@ None, [ MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=["LT", 2, "__col__", "123.654445"], ), MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=[ "AND", @@ -199,7 +234,8 @@ None, [ MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=[ "OR", @@ -246,10 +282,13 @@ def test_mock_mask_server( base_url="http://localhost:8000", token=token ) - # Doing the request - response: list[MaskServerOutput] = mask_server.simplify_simple_expression_batch( - batch=batch, - ) + # Capture stdout to avoid polluting the console with logging calls + with redirect_stdout(io.StringIO()): + # Doing the request + response: list[MaskServerOutput] = mask_server.simplify_simple_expression_batch( + batch=batch, + dry_run=False, + ) assert response == answer, ( f"Mismatch between the response {response!r} and the answer {answer!r}" @@ -272,7 +311,8 @@ def test_mock_mask_server( "bad_token_123", [ MaskServerInput( - table_path="srv.db.tbl", + dataset_id="dummy_server", + table_path="db.tbl", column_name="col", expression=["OR", 2, "__col__", 5], ) @@ -299,4 +339,5 @@ def test_mock_mask_server_errors( # Doing the request mask_server.simplify_simple_expression_batch( batch=batch, + dry_run=False, ) diff --git a/tests/test_plan_refsols/cryptbank_agg_01_rewrite.txt 
b/tests/test_plan_refsols/cryptbank_agg_01_rewrite.txt index 1dad4af9a..f91b82c0b 100644 --- a/tests/test_plan_refsols/cryptbank_agg_01_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_agg_01_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', ROUND(avg_unmask_t_amount, 2:numeric))], orderings=[]) AGGREGATE(keys={}, aggregations={'avg_unmask_t_amount': AVG(UNMASK::((1025.67 - ([t_amount]))))}) - FILTER(condition=MONTH(UNMASK::(DATETIME([t_ts], '+54321 seconds'))) == 6:numeric & YEAR(UNMASK::(DATETIME([t_ts], '+54321 seconds'))) == 2022:numeric, columns={'t_amount': t_amount}) + FILTER(condition=ISIN(t_ts, ['2022-06-03 05:08:58', '2022-06-12 00:24:06', '2022-06-13 05:50:39', '2022-06-14 19:08:57', '2022-06-16 03:15:13', '2022-06-18 03:37:49', '2022-06-27 06:08:04', '2022-06-28 15:35:47', '2022-06-29 05:40:38', '2022-06-29 19:53:42']:array[unknown]), columns={'t_amount': t_amount}) SCAN(table=CRBNK.TRANSACTIONS, columns={'t_amount': t_amount, 't_ts': t_ts}) diff --git a/tests/test_plan_refsols/cryptbank_agg_06_raw.txt b/tests/test_plan_refsols/cryptbank_agg_06_raw.txt new file mode 100644 index 000000000..2815f14ad --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_agg_06_raw.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('n_neg', DEFAULT_TO(sum_expr, 0:numeric)), ('n_positive', DEFAULT_TO(sum_expr_3, 0:numeric))], orderings=[]) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM(UNMASK::((1025.67 - ([t_amount]))) < 0:numeric), 'sum_expr_3': SUM(UNMASK::((1025.67 - ([t_amount]))) > 0:numeric)}) + SCAN(table=CRBNK.TRANSACTIONS, columns={'t_amount': t_amount}) diff --git a/tests/test_plan_refsols/cryptbank_agg_06_rewrite.txt b/tests/test_plan_refsols/cryptbank_agg_06_rewrite.txt new file mode 100644 index 000000000..c7c7e9e9a --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_agg_06_rewrite.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('n_neg', sum_expr), ('n_positive', sum_expr_3)], orderings=[]) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM(False:bool), 'sum_expr_3': SUM(True:bool)}) + SCAN(table=CRBNK.TRANSACTIONS, columns={}) diff --git a/tests/test_plan_refsols/cryptbank_agg_07_raw.txt b/tests/test_plan_refsols/cryptbank_agg_07_raw.txt new file mode 100644 index 000000000..e5ad88ef3 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_agg_07_raw.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('n_yr', DEFAULT_TO(sum_expr_31, 0:numeric)), ('n_qu', DEFAULT_TO(sum_expr_28, 0:numeric)), ('n_mo', DEFAULT_TO(sum_expr_26, 0:numeric)), ('n_we', DEFAULT_TO(sum_expr_30, 0:numeric)), ('n_da', DEFAULT_TO(sum_expr_39, 0:numeric)), ('n_ho', DEFAULT_TO(sum_expr_23, 0:numeric)), ('n_mi', DEFAULT_TO(sum_expr_25, 0:numeric)), ('n_se', DEFAULT_TO(sum_expr_29, 0:numeric)), ('n_cts', DEFAULT_TO(sum_expr_38, 0:numeric)), ('n_dts', DEFAULT_TO(sum_expr_22, 0:numeric)), ('n_nst', DEFAULT_TO(sum_expr_27, 0:numeric)), ('n_ayr', DEFAULT_TO(sum_expr_37, 0:numeric)), ('n_aqu', DEFAULT_TO(sum_expr_34, 0:numeric)), ('n_amo', DEFAULT_TO(sum_expr_33, 0:numeric)), ('n_awe', DEFAULT_TO(sum_expr_36, 0:numeric)), ('n_ada', DEFAULT_TO(sum_expr, 0:numeric)), ('n_aho', DEFAULT_TO(sum_expr_21, 0:numeric)), ('n_ami', DEFAULT_TO(sum_expr_32, 0:numeric)), ('n_ase', DEFAULT_TO(sum_expr_35, 0:numeric)), ('n_ldm', DEFAULT_TO(sum_expr_24, 0:numeric))], orderings=[]) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), '+10 DAYS':string) == '2019-11-21 18:00:52':string), 'sum_expr_21': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), '+1000 hour':string) == '2019-12-23 10:00:52':string), 
'sum_expr_22': SUM(UNMASK::(DATETIME([t_ts], '+54321 seconds')) == DATETIME(JOIN_STRINGS('-':string, '2025':string, '12':string, '31':string))), 'sum_expr_23': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of hour':string) == '2023-06-02 04:00:00':string), 'sum_expr_24': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of month':string, '-1 day':string) == '2019-10-31':string), 'sum_expr_25': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of minute':string) == '2023-06-02 04:55:00':string), 'sum_expr_26': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of month':string) == '2023-06-01':string), 'sum_expr_27': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of week':string, '+3 days':string) == '2023-05-31':string), 'sum_expr_28': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of quarter':string) == '2023-04-01':string), 'sum_expr_29': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of second':string) == '2023-06-02 04:55:31':string), 'sum_expr_30': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of week':string) == '2023-05-28':string), 'sum_expr_31': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of year':string) == '2023-01-01':string), 'sum_expr_32': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), '+10000 minute':string) == '2019-11-18 16:40:52':string), 'sum_expr_33': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), '-5 Mm':string) == '2019-06-11 18:00:52':string), 'sum_expr_34': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), '+2 q':string) == '2020-05-11 18:00:52':string), 'sum_expr_35': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), '-1000000 s':string) == '2019-10-31 04:14:12':string), 'sum_expr_36': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of day':string, '+1 week':string) == '2023-06-09':string), 'sum_expr_37': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), '+1 Y':string) == '2020-11-11 18:00:52':string), 'sum_expr_38': SUM(UNMASK::(DATETIME([t_ts], '+54321 seconds')) == DATETIME('now':string, 'start of day':string)), 'sum_expr_39': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of day':string) == '2023-06-02':string)}) + SCAN(table=CRBNK.TRANSACTIONS, columns={'t_ts': t_ts}) diff --git a/tests/test_plan_refsols/cryptbank_agg_07_rewrite.txt b/tests/test_plan_refsols/cryptbank_agg_07_rewrite.txt new file mode 100644 index 000000000..ed2497185 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_agg_07_rewrite.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('n_yr', DEFAULT_TO(sum_expr_31, 0:numeric)), ('n_qu', DEFAULT_TO(sum_expr_28, 0:numeric)), ('n_mo', DEFAULT_TO(sum_expr_26, 0:numeric)), ('n_we', DEFAULT_TO(sum_expr_30, 0:numeric)), ('n_da', DEFAULT_TO(sum_expr_39, 0:numeric)), ('n_ho', DEFAULT_TO(sum_expr_39, 0:numeric)), ('n_mi', DEFAULT_TO(sum_expr_39, 0:numeric)), ('n_se', DEFAULT_TO(sum_expr_29, 0:numeric)), ('n_cts', DEFAULT_TO(sum_expr_38, 0:numeric)), ('n_dts', DEFAULT_TO(sum_expr_22, 0:numeric)), ('n_nst', DEFAULT_TO(sum_expr_27, 0:numeric)), ('n_ayr', DEFAULT_TO(sum_expr, 0:numeric)), ('n_aqu', DEFAULT_TO(sum_expr, 0:numeric)), ('n_amo', DEFAULT_TO(sum_expr, 0:numeric)), ('n_awe', DEFAULT_TO(sum_expr_36, 0:numeric)), ('n_ada', DEFAULT_TO(sum_expr, 0:numeric)), ('n_aho', DEFAULT_TO(sum_expr, 0:numeric)), ('n_ami', DEFAULT_TO(sum_expr, 0:numeric)), ('n_ase', DEFAULT_TO(sum_expr, 0:numeric)), ('n_ldm', 
DEFAULT_TO(sum_expr_24, 0:numeric))], orderings=[]) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM(t_ts == '2019-11-11 02:55:31':unknown), 'sum_expr_22': SUM(UNMASK::(DATETIME([t_ts], '+54321 seconds')) == DATETIME(JOIN_STRINGS('-':string, '2025':string, '12':string, '31':string))), 'sum_expr_24': SUM(ISIN(t_ts, ['2019-11-02 11:58:37', '2019-11-02 12:54:09', '2019-11-11 02:55:31', '2019-11-11 15:44:22']:array[unknown])), 'sum_expr_26': SUM(ISIN(t_ts, ['2023-06-01 13:50:10', '2023-06-01 13:50:14', '2023-06-04 10:35:26', '2023-06-11 21:53:04', '2023-06-25 15:06:06', '2023-06-25 21:58:37', '2023-06-27 03:21:19', '2023-06-27 10:34:20']:array[unknown])), 'sum_expr_27': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of week':string, '+3 days':string) == '2023-05-31':string), 'sum_expr_28': SUM(ISIN(t_ts, ['2023-04-18 00:35:40', '2023-04-25 18:54:26', '2023-04-29 04:58:30', '2023-05-04 23:30:10', '2023-05-12 04:42:28', '2023-05-17 18:54:12', '2023-05-19 10:10:44', '2023-05-21 13:52:14', '2023-05-24 03:51:10', '2023-06-01 13:50:10', '2023-06-01 13:50:14', '2023-06-04 10:35:26', '2023-06-11 21:53:04', '2023-06-25 15:06:06', '2023-06-25 21:58:37', '2023-06-27 03:21:19', '2023-06-27 10:34:20']:array[unknown])), 'sum_expr_29': SUM(t_ts == '2023-06-01 13:50:10':unknown), 'sum_expr_30': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of week':string) == '2023-05-28':string), 'sum_expr_31': SUM(ISIN(t_ts, ['2022-12-31 17:42:54', '2023-01-04 12:05:15', '2023-01-07 22:11:27', '2023-01-20 04:38:03', '2023-01-20 16:40:54', '2023-01-27 15:13:18', '2023-01-30 19:58:26', '2023-02-02 19:12:58', '2023-02-11 11:13:53', '2023-02-11 12:32:55', '2023-02-15 21:54:29', '2023-02-16 14:18:36', '2023-02-28 07:11:29', '2023-03-07 01:26:10', '2023-03-08 18:58:18', '2023-03-14 14:23:33', '2023-03-16 06:17:44', '2023-03-17 08:48:16', '2023-03-24 03:33:40', '2023-03-26 06:52:52', '2023-04-18 00:35:40', '2023-04-25 18:54:26', '2023-04-29 04:58:30', '2023-05-04 23:30:10', '2023-05-12 04:42:28', '2023-05-17 18:54:12', '2023-05-19 10:10:44', '2023-05-21 13:52:14', '2023-05-24 03:51:10', '2023-06-01 13:50:10', '2023-06-01 13:50:14', '2023-06-04 10:35:26', '2023-06-11 21:53:04', '2023-06-25 15:06:06', '2023-06-25 21:58:37', '2023-06-27 03:21:19', '2023-06-27 10:34:20', '2023-06-30 15:27:03', '2023-07-07 15:17:47', '2023-07-17 03:23:15', '2023-07-18 14:41:26', '2023-08-03 20:24:35', '2023-08-11 20:25:39', '2023-08-29 03:07:18', '2023-09-01 16:50:48', '2023-09-08 09:30:23', '2023-09-13 06:42:39', '2023-09-15 09:00:02', '2023-09-30 08:57:30', '2023-10-15 02:47:04', '2023-10-19 09:40:06', '2023-10-30 00:20:45', '2023-11-08 12:52:24', '2023-11-10 17:20:29', '2023-11-16 11:30:24', '2023-11-21 15:17:10', '2023-11-28 06:34:03', '2023-12-07 14:11:33', '2023-12-15 05:57:23', '2023-12-16 00:51:23', '2023-12-23 07:54:22']:array[unknown])), 'sum_expr_36': SUM(DATETIME(UNMASK::(DATETIME([t_ts], '+54321 seconds')), 'start of day':string, '+1 week':string) == '2023-06-09':string), 'sum_expr_38': SUM(UNMASK::(DATETIME([t_ts], '+54321 seconds')) == DATETIME('now':string, 'start of day':string)), 'sum_expr_39': SUM(ISIN(t_ts, ['2023-06-01 13:50:10', '2023-06-01 13:50:14']:array[unknown]))}) + SCAN(table=CRBNK.TRANSACTIONS, columns={'t_ts': t_ts}) diff --git a/tests/test_plan_refsols/cryptbank_analysis_04_rewrite.txt b/tests/test_plan_refsols/cryptbank_analysis_04_rewrite.txt index 9f348a995..2d3442760 100644 --- a/tests/test_plan_refsols/cryptbank_analysis_04_rewrite.txt +++ 
b/tests/test_plan_refsols/cryptbank_analysis_04_rewrite.txt @@ -3,7 +3,7 @@ ROOT(columns=[('key', UNMASK::(CASE WHEN [anything_a_key] = 0 THEN 0 ELSE (CASE JOIN(condition=UNMASK::(CASE WHEN [t0.a_key] = 0 THEN 0 ELSE (CASE WHEN [t0.a_key] > 0 THEN 1 ELSE -1 END) * CAST(SUBSTRING([t0.a_key], 1 + INSTR([t0.a_key], '-'), LENGTH([t0.a_key]) / 2) AS INTEGER) END) == t1.t_sourceaccount, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'a_key': t0.a_key, 'c_fname': t0.c_fname, 'c_lname': t0.c_lname, 't_sourceaccount': t1.t_sourceaccount}) JOIN(condition=t0.a_custkey == UNMASK::((42 - ([t1.c_key]))), type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_FILTER, columns={'a_key': t0.a_key, 'c_fname': t1.c_fname, 'c_lname': t1.c_lname}) SCAN(table=CRBNK.ACCOUNTS, columns={'a_custkey': a_custkey, 'a_key': a_key}) - FILTER(condition=MONOTONIC(1980:numeric, YEAR(UNMASK::(DATE([c_birthday], '+472 days'))), 1985:numeric), columns={'c_fname': c_fname, 'c_key': c_key, 'c_lname': c_lname}) + FILTER(condition=ISIN(c_birthday, ['1980-01-18', '1981-07-21', '1981-11-15', '1982-11-07', '1983-12-27']:array[unknown]), columns={'c_fname': c_fname, 'c_key': c_key, 'c_lname': c_lname}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday, 'c_fname': c_fname, 'c_key': c_key, 'c_lname': c_lname}) - FILTER(condition=UNMASK::((1025.67 - ([t_amount]))) > 9000.0:numeric, columns={'t_sourceaccount': t_sourceaccount}) + FILTER(condition=ISIN(t_amount, [-8934.44, -8881.98, -8736.83, -8717.7, -8648.33, -8639.5, -8620.48, -8593.09, -8553.43, -8527.34, -8484.61, -8480.79, -8472.7, -8457.49, -8366.52, -8361.27, -8352.72, -8308.42, -8254.69, -8077.89, -8067.8]:array[unknown]), columns={'t_sourceaccount': t_sourceaccount}) SCAN(table=CRBNK.TRANSACTIONS, columns={'t_amount': t_amount, 't_sourceaccount': t_sourceaccount}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_01_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_01_rewrite.txt index f0cb980e1..bfeb90c10 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_01_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_01_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=c_lname == MASK::(UPPER(['lee':string])), columns={}) + FILTER(condition=c_lname == 'LEE':unknown, columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_02_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_02_rewrite.txt index 7558820f5..e771808ac 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_02_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_02_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=c_lname != MASK::(UPPER(['lee':string])), columns={}) + FILTER(condition=c_lname != 'LEE':unknown, columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_03_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_03_rewrite.txt index 96ddb590e..81051fdfa 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_03_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_03_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ISIN(c_lname, 
[Call(op=MASK, inputs=[Literal(value='lee', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='smith', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='rodriguez', type=UnknownType())], return_type=StringType())]:bool), columns={}) + FILTER(condition=ISIN(c_lname, ['LEE', 'SMITH', 'RODRIGUEZ']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_04_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_04_rewrite.txt index a8ebb4a29..733f70691 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_04_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_04_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=NOT(ISIN(c_lname, [Call(op=MASK, inputs=[Literal(value='lee', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='smith', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='rodriguez', type=UnknownType())], return_type=StringType())]:bool)), columns={}) + FILTER(condition=NOT(ISIN(c_lname, ['LEE', 'SMITH', 'RODRIGUEZ']:array[unknown])), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_05_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_05_rewrite.txt index 80d0ffef1..a8625f8fe 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_05_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_05_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=STARTSWITH(UNMASK::(REPLACE(REPLACE(REPLACE([c_phone], '9', '*'), '0', '9'), '*', '0')), '555-8':string), columns={}) + FILTER(condition=ISIN(c_phone, ['555-809-1234', '555-870-9123']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_phone': c_phone}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_06_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_06_rewrite.txt index 80671abda..889a1221e 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_06_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_06_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ENDSWITH(UNMASK::(SUBSTRING([c_email], -1) || SUBSTRING([c_email], 1, LENGTH([c_email]) - 1)), 'gmail.com':string), columns={}) + FILTER(condition=ISIN(c_email, ['livia.a22@gmail.como', 'ob.smith77@gmail.comb', 'ob_moore78@gmail.comr', 'opez.luke99@gmail.coml']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_email': c_email}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_07_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_07_rewrite.txt index 1311df68a..e688c6561 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_07_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_07_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=YEAR(UNMASK::(DATE([c_birthday], '+472 days'))) == 1978:numeric, columns={}) + FILTER(condition=ISIN(c_birthday, ['1976-10-27', '1976-12-02']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) 
diff --git a/tests/test_plan_refsols/cryptbank_filter_count_08_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_08_rewrite.txt index 4d56d54dc..d9a743560 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_08_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_08_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=c_birthday == MASK::(DATE(['1985-04-12':string], '-472 days')), columns={}) + FILTER(condition=c_birthday == '1983-12-27':unknown, columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_14_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_14_rewrite.txt index b623ae10e..5e7f02291 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_14_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_14_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ENDSWITH(UNMASK::(LOWER([c_fname])), 'e':string) | ENDSWITH(UNMASK::(LOWER([c_lname])), 'e':string), columns={}) + FILTER(condition=ISIN(c_fname, ['ALICE', 'GRACE', 'LUKE', 'QUEENIE']:array[unknown]) | ISIN(c_lname, ['LEE', 'MOORE']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname, 'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_16_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_16_rewrite.txt index 1b2fcbc78..47ef8d92a 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_16_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_16_rewrite.txt @@ -2,5 +2,5 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) JOIN(condition=UNMASK::((42 - ([t0.c_key]))) == t1.a_custkey, type=SEMI, columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_key': c_key}) - FILTER(condition=a_type != MASK::(SUBSTRING(['checking':string], 2) || SUBSTRING(['checking':string], 1, 1)) & a_type != MASK::(SUBSTRING(['savings':string], 2) || SUBSTRING(['savings':string], 1, 1)), columns={'a_custkey': a_custkey}) + FILTER(condition=NOT(ISIN(a_type, ['avingss', 'heckingc']:array[unknown])), columns={'a_custkey': a_custkey}) SCAN(table=CRBNK.ACCOUNTS, columns={'a_custkey': a_custkey, 'a_type': a_type}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_18_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_18_rewrite.txt index 935e582ed..0ff9c1100 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_18_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_18_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=LIKE(UNMASK::(SUBSTRING([c_email], -1) || SUBSTRING([c_email], 1, LENGTH([c_email]) - 1)), '%.%@%mail%':string), columns={}) + FILTER(condition=ISIN(c_email, ['ophia.jackson@mail.orgs', 'livia.a22@gmail.como', '.gonzalez@ymail.comm', 'opez.luke99@gmail.coml', 'enry.g@fastmail.comh', 'rank.k@protonmail.comf', 'mily.jones@mail.come', 'ob.smith77@gmail.comb']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_email': c_email}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_19_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_19_rewrite.txt index 3da1c4ddd..f32b625c3 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_19_rewrite.txt +++ 
b/tests/test_plan_refsols/cryptbank_filter_count_19_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=CONTAINS(UNMASK::(SUBSTRING([c_email], -1) || SUBSTRING([c_email], 1, LENGTH([c_email]) - 1)), 'mail':string), columns={}) + FILTER(condition=NOT(ISIN(c_email, ['homasl@outlook.comt', 'ueenie.t@outlook.netq', '.hernandez@icloud.comk', 'martinez94@outlook.orgj', 'sa.rodriguez@zoho.comi', '.brown88@yahoo.comd', '.lee@outlook.comc', 'lice_j@example.orga']:array[unknown])), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_email': c_email}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_20_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_20_rewrite.txt index 462814855..34cb8907f 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_20_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_20_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=UNMASK::(DATE([c_birthday], '+472 days')) > '1991-11-15':string, columns={}) + FILTER(condition=ISIN(c_birthday, ['1991-03-13', '1992-05-06', '1993-01-01', '1994-06-15']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_21_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_21_rewrite.txt index 9742c2261..1e32a7730 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_21_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_21_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=UNMASK::(DATE([c_birthday], '+472 days')) >= '1991-11-15':string, columns={}) + FILTER(condition=ISIN(c_birthday, ['1990-07-31', '1991-03-13', '1992-05-06', '1993-01-01', '1994-06-15']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_22_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_22_rewrite.txt index 13dfd466f..df32b5a3a 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_22_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_22_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=UNMASK::(DATE([c_birthday], '+472 days')) < '1991-11-15':string, columns={}) + FILTER(condition=NOT(ISIN(c_birthday, ['1990-07-31', '1991-03-13', '1992-05-06', '1993-01-01', '1994-06-15']:array[unknown])), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_23_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_23_rewrite.txt index 56faca00f..3b1ff1ebe 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_23_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_23_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=UNMASK::(DATE([c_birthday], '+472 days')) <= '1991-11-15':string, columns={}) + FILTER(condition=NOT(ISIN(c_birthday, ['1991-03-13', '1992-05-06', '1993-01-01', '1994-06-15']:array[unknown])), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git 
a/tests/test_plan_refsols/cryptbank_filter_count_25_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_25_rewrite.txt index 5face6b66..5890afa41 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_25_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_25_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ABSENT(UNMASK::(DATE([c_birthday], '+472 days'))) | c_birthday != MASK::(DATE(['1991-11-15':string], '-472 days')), columns={}) + FILTER(condition=ABSENT(UNMASK::(DATE([c_birthday], '+472 days'))) | c_birthday != '1990-07-31':unknown, columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_26_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_26_rewrite.txt index 34e03d1e0..9a06dd87d 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_26_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_26_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=c_phone == MASK::(REPLACE(REPLACE(REPLACE(['555-123-456':string], '0', '*'), '9', '0'), '*', '9')), columns={}) - SCAN(table=CRBNK.CUSTOMERS, columns={'c_phone': c_phone}) + FILTER(condition=False:bool, columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_27_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_27_rewrite.txt index 6aac164ff..6de5d9627 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_27_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_27_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=PRESENT(UNMASK::(SUBSTRING([c_addr], -1) || SUBSTRING([c_addr], 1, LENGTH([c_addr]) - 1))) & PRESENT(UNMASK::(DATE([c_birthday], '+472 days'))) & c_lname != MASK::(UPPER(['lopez':string])) & ENDSWITH(UNMASK::(LOWER([c_fname])), 'a':string) | ENDSWITH(UNMASK::(LOWER([c_fname])), 'e':string) | ENDSWITH(UNMASK::(LOWER([c_fname])), 's':string) | ABSENT(UNMASK::(DATE([c_birthday], '+472 days'))) & ENDSWITH(UNMASK::(REPLACE(REPLACE(REPLACE([c_phone], '9', '*'), '0', '9'), '*', '0')), '5':string), columns={}) + FILTER(condition=PRESENT(UNMASK::(SUBSTRING([c_addr], -1) || SUBSTRING([c_addr], 1, LENGTH([c_addr]) - 1))) & PRESENT(UNMASK::(DATE([c_birthday], '+472 days'))) & c_lname != 'LOPEZ':unknown & ISIN(c_fname, ['ALICE', 'GRACE', 'LUKE', 'MARIA', 'OLIVIA', 'QUEENIE', 'SOPHIA']:array[unknown]) | ISIN(c_fname, ['JAMES', 'NICHOLAS', 'THOMAS']:array[unknown]) | ABSENT(UNMASK::(DATE([c_birthday], '+472 days'))) & ISIN(c_phone, ['555-091-2345', '555-901-2345']:array[unknown]), columns={}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_addr': c_addr, 'c_birthday': c_birthday, 'c_fname': c_fname, 'c_lname': c_lname, 'c_phone': c_phone}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_28_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_28_rewrite.txt index 9526425f5..319cf85ff 100644 --- a/tests/test_plan_refsols/cryptbank_filter_count_28_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_filter_count_28_rewrite.txt @@ -1,7 +1,7 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) JOIN(condition=t0.a_custkey == UNMASK::((42 - ([t1.c_key]))), type=INNER, cardinality=SINGULAR_FILTER, 
reverse_cardinality=PLURAL_FILTER, columns={}) - FILTER(condition=YEAR(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))) < 2020:numeric & UNMASK::(SQRT([a_balance])) >= 5000:numeric & a_type == MASK::(SUBSTRING(['retirement':string], 2) || SUBSTRING(['retirement':string], 1, 1)) | a_type == MASK::(SUBSTRING(['savings':string], 2) || SUBSTRING(['savings':string], 1, 1)), columns={'a_custkey': a_custkey}) + FILTER(condition=YEAR(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))) < 2020:numeric & UNMASK::(SQRT([a_balance])) >= 5000:numeric & ISIN(a_type, ['avingss', 'etirementr']:array[unknown]), columns={'a_custkey': a_custkey}) SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance, 'a_custkey': a_custkey, 'a_open_ts': a_open_ts, 'a_type': a_type}) FILTER(condition=CONTAINS(UNMASK::(SUBSTRING([c_email], -1) || SUBSTRING([c_email], 1, LENGTH([c_email]) - 1)), 'outlook':string) | CONTAINS(UNMASK::(SUBSTRING([c_email], -1) || SUBSTRING([c_email], 1, LENGTH([c_email]) - 1)), 'gmail':string), columns={'c_key': c_key}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_email': c_email, 'c_key': c_key}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_29_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_29_raw.txt new file mode 100644 index 000000000..a59cfd88f --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_29_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=UNMASK::(DATE([c_birthday], '+472 days')) <= '1925-01-01':string, columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_29_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_29_rewrite.txt new file mode 100644 index 000000000..9a06dd87d --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_29_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=False:bool, columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_30_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_30_raw.txt new file mode 100644 index 000000000..68486227f --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_30_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(MONTH(UNMASK::(DATE([c_birthday], '+472 days'))) + 1:numeric, [2, 4, 6, 8, 10, 12]:array[unknown]) & ISIN(YEAR(UNMASK::(DATE([c_birthday], '+472 days'))) - 2:numeric, [1975, 1977, 1979, 1981, 1983, 1985, 1987, 1989, 1991, 1993]:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_30_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_30_rewrite.txt new file mode 100644 index 000000000..cea80fa2c --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_30_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(c_birthday, ['1980-01-18', '1981-11-15', '1990-07-31', '1994-06-15']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_31_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_31_raw.txt new file 
mode 100644 index 000000000..bbfe21d7c --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_31_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(UNMASK::(DATE([c_birthday], '+472 days')), [datetime.date(1991, 11, 15), datetime.date(1978, 2, 11), datetime.date(2005, 3, 14), datetime.date(1985, 4, 12)]:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_31_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_31_rewrite.txt new file mode 100644 index 000000000..924660224 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_31_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(c_birthday, ['1990-07-31', '1976-10-27', '1983-12-27']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_32_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_32_raw.txt new file mode 100644 index 000000000..3d0ca83f6 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_32_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=MONOTONIC(Timestamp('2020-03-28 09:20:00'):datetime, UNMASK::(DATETIME([a_open_ts], '+123456789 seconds')), datetime.datetime(2020, 9, 20, 8, 30):datetime), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_32_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_32_rewrite.txt new file mode 100644 index 000000000..506f379b7 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_32_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(a_open_ts, ['2016-04-29 11:46:51', '2016-06-10 12:56:51', '2016-07-20 15:46:51', '2016-08-22 10:41:51', '2016-09-03 12:01:51']:array[unknown]), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_33_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_33_raw.txt new file mode 100644 index 000000000..35df42fd5 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_33_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(MONTH(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))), [1, 2, 3]:array[numeric]), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_33_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_33_rewrite.txt new file mode 100644 index 000000000..1e78cde11 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_33_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(a_open_ts, ['2013-04-22 11:37:51', '2017-02-11 10:59:51', '2011-04-30 15:16:51', '2016-03-23 12:41:51', '2013-02-15 12:46:51', '2018-03-15 10:36:51', '2014-04-07 14:21:51', '2015-02-08 17:26:51', '2016-04-29 11:46:51', '2012-03-22 12:16:51', '2015-04-06 
13:46:51']:array[unknown]), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_34_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_34_raw.txt new file mode 100644 index 000000000..3ee213c72 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_34_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=QUARTER(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))) == DAY(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_34_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_34_rewrite.txt new file mode 100644 index 000000000..7aeed4c0f --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_34_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=a_open_ts == '2015-05-04 18:01:51':unknown, columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_35_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_35_raw.txt new file mode 100644 index 000000000..a07aea681 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_35_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=HOUR(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))) < 10:numeric & MINUTE(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))) < 20:numeric, columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_35_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_35_rewrite.txt new file mode 100644 index 000000000..4dd3363ae --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_35_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(a_open_ts, ['2013-04-22 11:37:51', '2017-09-15 11:26:51', '2018-03-15 10:36:51', '2014-05-23 11:31:51', '2016-08-22 10:41:51', '2014-08-15 11:31:51']:array[unknown]), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_36_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_36_raw.txt new file mode 100644 index 000000000..59a065082 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_36_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=SECOND(UNMASK::(DATETIME([t_ts], '+54321 seconds'))) == 23:numeric, columns={}) + SCAN(table=CRBNK.TRANSACTIONS, columns={'t_ts': t_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_36_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_36_rewrite.txt new file mode 100644 index 000000000..60678bf05 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_36_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(t_ts, ['2020-11-11 09:03:02', '2023-09-15 09:00:02', '2024-07-21 23:24:02']:array[unknown]), columns={}) + 
SCAN(table=CRBNK.TRANSACTIONS, columns={'t_ts': t_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_37_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_37_raw.txt new file mode 100644 index 000000000..3b9aa3400 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_37_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=MONOTONIC(200:numeric, ABS(UNMASK::(SQRT([a_balance])) - 7250:numeric), 600:numeric), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_37_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_37_rewrite.txt new file mode 100644 index 000000000..38eceea46 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_37_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(a_balance, [46240000.0, 57760000.0]:array[unknown]), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_38_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_38_raw.txt new file mode 100644 index 000000000..7b0df0c11 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_38_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=LARGEST(HOUR(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))), MINUTE(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))), SECOND(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds')))) == 10:numeric, columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_38_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_38_rewrite.txt new file mode 100644 index 000000000..10b036d4c --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_38_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(a_open_ts, ['2018-03-15 10:36:51', '2018-01-02 12:26:51']:array[unknown]), columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_39_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_39_raw.txt new file mode 100644 index 000000000..4fdbe5082 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_39_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=SMALLEST(HOUR(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))), MINUTE(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds')))) == 15:numeric, columns={}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_39_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_39_rewrite.txt new file mode 100644 index 000000000..223ee5fac --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_39_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(a_open_ts, ['2015-08-10 18:11:51', '2015-05-04 18:01:51', '2015-10-19 18:11:51', '2014-10-03 17:41:51']:array[unknown]), columns={}) + 
SCAN(table=CRBNK.ACCOUNTS, columns={'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_40_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_40_raw.txt new file mode 100644 index 000000000..9bf88a215 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_40_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(JOIN_STRINGS('':string, '1-':string, UNMASK::(REPLACE(REPLACE(REPLACE([c_phone], '9', '*'), '0', '9'), '*', '0'))), '1-5':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_phone': c_phone}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_40_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_40_rewrite.txt new file mode 100644 index 000000000..174a826e7 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_40_rewrite.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + SCAN(table=CRBNK.CUSTOMERS, columns={}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_41_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_41_raw.txt new file mode 100644 index 000000000..e12d9e85b --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_41_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(JOIN_STRINGS('-':string, '1':string, UNMASK::(REPLACE(REPLACE(REPLACE([c_phone], '9', '*'), '0', '9'), '*', '0'))), '1-5':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_phone': c_phone}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_41_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_41_rewrite.txt new file mode 100644 index 000000000..174a826e7 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_41_rewrite.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + SCAN(table=CRBNK.CUSTOMERS, columns={}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_42_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_42_raw.txt new file mode 100644 index 000000000..44f03c593 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_42_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(JOIN_STRINGS('-':string, '1':string, UNMASK::(REPLACE(REPLACE(REPLACE([c_phone], '9', '*'), '0', '9'), '*', '0')), '1':string), '5-1':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_phone': c_phone}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_42_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_42_rewrite.txt new file mode 100644 index 000000000..d88bab6e1 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_42_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(c_phone, ['555-112-3456', '555-901-2345', '555-091-2345', '555-123-4567']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_phone': c_phone}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_43_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_43_raw.txt new file mode 100644 index 000000000..5ee34a718 --- /dev/null +++ 
b/tests/test_plan_refsols/cryptbank_filter_count_43_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=JOIN_STRINGS(' ':string, UNMASK::(LOWER([c_fname])), UNMASK::(LOWER([c_lname]))) == 'olivia anderson':string, columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname, 'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_43_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_43_rewrite.txt new file mode 100644 index 000000000..5ee34a718 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_43_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=JOIN_STRINGS(' ':string, UNMASK::(LOWER([c_fname])), UNMASK::(LOWER([c_lname]))) == 'olivia anderson':string, columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname, 'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_44_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_44_raw.txt new file mode 100644 index 000000000..e2012c8b7 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_44_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(DEFAULT_TO(YEAR(UNMASK::(DATE([c_birthday], '+472 days'))), 1990:numeric), [1990, 1991]:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_44_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_44_rewrite.txt new file mode 100644 index 000000000..e09882c8f --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_44_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ABSENT(c_birthday) | ISIN(c_birthday, ['1990-07-31', '1989-04-07']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_45_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_45_raw.txt new file mode 100644 index 000000000..84908b7f1 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_45_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(DEFAULT_TO(YEAR(UNMASK::(DATE([c_birthday], '+472 days'))), 1990:numeric), [1990, 2005]:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_45_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_45_rewrite.txt new file mode 100644 index 000000000..0112adac3 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_45_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ABSENT(c_birthday) | c_birthday == '1989-04-07':unknown, columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_46_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_46_raw.txt new file mode 100644 index 000000000..3f3b65f41 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_46_raw.txt @@ -0,0 +1,4 @@ 
+ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(DEFAULT_TO(YEAR(UNMASK::(DATE([c_birthday], '+472 days'))), 2005:numeric), [2005, 2006]:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_46_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_46_rewrite.txt new file mode 100644 index 000000000..b9d6157ba --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_46_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ABSENT(c_birthday), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_47_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_47_raw.txt new file mode 100644 index 000000000..ccfac3d44 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_47_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=NOT(ISIN(DEFAULT_TO(YEAR(UNMASK::(DATE([c_birthday], '+472 days'))), 1990:numeric), [1990, 1991]:array[unknown])), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_47_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_47_rewrite.txt new file mode 100644 index 000000000..b1c4218ca --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_47_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=NOT(ISIN(c_birthday, ['1990-07-31', '1989-04-07']:array[unknown])) & PRESENT(c_birthday), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_48_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_48_raw.txt new file mode 100644 index 000000000..8937c18e3 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_48_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=NOT(ISIN(DEFAULT_TO(YEAR(UNMASK::(DATE([c_birthday], '+472 days'))), 1990:numeric), [1990, 2005]:array[unknown])), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_48_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_48_rewrite.txt new file mode 100644 index 000000000..8fdc5ff76 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_48_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=c_birthday != '1989-04-07':unknown & PRESENT(c_birthday), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_49_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_49_raw.txt new file mode 100644 index 000000000..c7d2ebb89 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_49_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=NOT(ISIN(DEFAULT_TO(YEAR(UNMASK::(DATE([c_birthday], '+472 days'))), 
2005:numeric), [2005, 2006]:array[unknown])), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_49_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_49_rewrite.txt new file mode 100644 index 000000000..429f74d7b --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_49_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=PRESENT(c_birthday), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_50_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_50_raw.txt new file mode 100644 index 000000000..c7d2ebb89 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_50_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=NOT(ISIN(DEFAULT_TO(YEAR(UNMASK::(DATE([c_birthday], '+472 days'))), 2005:numeric), [2005, 2006]:array[unknown])), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_50_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_50_rewrite.txt new file mode 100644 index 000000000..429f74d7b --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_50_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=PRESENT(c_birthday), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_birthday': c_birthday}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_51_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_51_raw.txt new file mode 100644 index 000000000..3aecc6a52 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_51_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(IFF(ISIN(SLICE(UNMASK::(LOWER([c_fname])), None:unknown, 1:numeric, None:unknown), ['q', 'r', 's']:array[unknown]), UNMASK::(LOWER([c_fname])), UNMASK::(LOWER([c_lname]))), 'ee':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname, 'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_51_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_51_rewrite.txt new file mode 100644 index 000000000..9abbfccd9 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_51_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(IFF(ISIN(c_fname, ['QUEENIE', 'ROBERT', 'SOPHIA']:array[unknown]), UNMASK::(LOWER([c_fname])), UNMASK::(LOWER([c_lname]))), 'ee':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname, 'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_52_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_52_raw.txt new file mode 100644 index 000000000..be00df52e --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_52_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(UNMASK::(LOWER([c_lname])), JOIN_STRINGS('':string, 'e':string, IFF(ISIN(SLICE(UNMASK::(LOWER([c_lname])), 
None:unknown, 1:numeric, None:unknown), ['q', 'r', 's']:array[unknown]), 'z':string, 'e':string))), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_52_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_52_rewrite.txt new file mode 100644 index 000000000..2945a7768 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_52_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(c_lname, ['LEE', 'RODRIGUEZ']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_lname': c_lname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_53_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_53_raw.txt new file mode 100644 index 000000000..425d5b7d0 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_53_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=SLICE(UNMASK::(LOWER([c_fname])), 0:numeric, 1:numeric, None:unknown) == 'i':string, columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_53_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_53_rewrite.txt new file mode 100644 index 000000000..6110686f1 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_53_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=c_fname == 'ISABEL':unknown, columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_54_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_54_raw.txt new file mode 100644 index 000000000..ac00003be --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_54_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(SLICE(UNMASK::(LOWER([c_fname])), -1:numeric, None:unknown, None:unknown), ['a', 'e', 'i', 'o', 'u']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_54_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_54_rewrite.txt new file mode 100644 index 000000000..ac00003be --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_54_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(SLICE(UNMASK::(LOWER([c_fname])), -1:numeric, None:unknown, None:unknown), ['a', 'e', 'i', 'o', 'u']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_55_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_55_raw.txt new file mode 100644 index 000000000..2565c41de --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_55_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(SLICE(UNMASK::(LOWER([c_fname])), 1:numeric, 3:numeric, None:unknown), ['ar', 'li', 'ra', 'to', 'am']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git 
a/tests/test_plan_refsols/cryptbank_filter_count_55_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_55_rewrite.txt new file mode 100644 index 000000000..66493452f --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_55_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(c_fname, ['ALICE', 'CAROL', 'FRANK', 'GRACE', 'JAMES', 'KAREN', 'MARIA', 'OLIVIA']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_56_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_56_raw.txt new file mode 100644 index 000000000..93d44675d --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_56_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(SLICE(UNMASK::(LOWER([c_fname])), -2:numeric, -1:numeric, None:unknown), ['a', 'c', 'l']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_56_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_56_rewrite.txt new file mode 100644 index 000000000..93d44675d --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_56_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(SLICE(UNMASK::(LOWER([c_fname])), -2:numeric, -1:numeric, None:unknown), ['a', 'c', 'l']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_57_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_57_raw.txt new file mode 100644 index 000000000..7a469fc93 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_57_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(SLICE(UNMASK::(LOWER([c_fname])), None:unknown, -1:numeric, None:unknown), 'e':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_57_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_57_rewrite.txt new file mode 100644 index 000000000..7a469fc93 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_57_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(SLICE(UNMASK::(LOWER([c_fname])), None:unknown, -1:numeric, None:unknown), 'e':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_58_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_58_raw.txt new file mode 100644 index 000000000..497690ba2 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_58_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(SLICE(UNMASK::(LOWER([c_fname])), 1:numeric, -1:numeric, None:unknown), 'e':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_58_rewrite.txt 
b/tests/test_plan_refsols/cryptbank_filter_count_58_rewrite.txt new file mode 100644 index 000000000..497690ba2 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_58_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS(SLICE(UNMASK::(LOWER([c_fname])), 1:numeric, -1:numeric, None:unknown), 'e':string), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_59_raw.txt b/tests/test_plan_refsols/cryptbank_filter_count_59_raw.txt new file mode 100644 index 000000000..9fecff9db --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_59_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS('SLICE':string, UPPER(SLICE(UNMASK::(LOWER([c_fname])), None:unknown, 1:numeric, None:unknown))), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_filter_count_59_rewrite.txt b/tests/test_plan_refsols/cryptbank_filter_count_59_rewrite.txt new file mode 100644 index 000000000..1c5d4dd33 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_filter_count_59_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(c_fname, ['CAROL', 'EMILY', 'ISABEL', 'LUKE', 'SOPHIA']:array[unknown]), columns={}) + SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname}) diff --git a/tests/test_plan_refsols/cryptbank_window_01_raw.txt b/tests/test_plan_refsols/cryptbank_window_01_raw.txt new file mode 100644 index 000000000..32267be3d --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_window_01_raw.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('account_type', UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))), ('key', UNMASK::(CASE WHEN [a_key] = 0 THEN 0 ELSE (CASE WHEN [a_key] > 0 THEN 1 ELSE -1 END) * CAST(SUBSTRING([a_key], 1 + INSTR([a_key], '-'), LENGTH([a_key]) / 2) AS INTEGER) END)), ('balance', UNMASK::(SQRT([a_balance])))], orderings=[(UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))):asc_first]) + FILTER(condition=RANKING(args=[], partition=[UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1)) == 'retirement':string | UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1)) == 'savings':string], order=[(UNMASK::(SQRT([a_balance]))):desc_first], allow_ties=False) == 1:numeric, columns={'a_balance': a_balance, 'a_key': a_key, 'a_type': a_type}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance, 'a_key': a_key, 'a_type': a_type}) diff --git a/tests/test_plan_refsols/cryptbank_window_01_rewrite.txt b/tests/test_plan_refsols/cryptbank_window_01_rewrite.txt new file mode 100644 index 000000000..48b7f6c42 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_window_01_rewrite.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('account_type', UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))), ('key', UNMASK::(CASE WHEN [a_key] = 0 THEN 0 ELSE (CASE WHEN [a_key] > 0 THEN 1 ELSE -1 END) * CAST(SUBSTRING([a_key], 1 + INSTR([a_key], '-'), LENGTH([a_key]) / 2) AS INTEGER) END)), ('balance', UNMASK::(SQRT([a_balance])))], orderings=[(UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))):asc_first]) + 
FILTER(condition=RANKING(args=[], partition=[ISIN(a_type, ['avingss', 'etirementr']:array[unknown])], order=[(UNMASK::(SQRT([a_balance]))):desc_first], allow_ties=False) == 1:numeric, columns={'a_balance': a_balance, 'a_key': a_key, 'a_type': a_type}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance, 'a_key': a_key, 'a_type': a_type}) diff --git a/tests/test_plan_refsols/cryptbank_window_02_raw.txt b/tests/test_plan_refsols/cryptbank_window_02_raw.txt new file mode 100644 index 000000000..dfeb1cc2a --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_window_02_raw.txt @@ -0,0 +1,6 @@ +ROOT(columns=[('branch_name', b_name), ('key', UNMASK::(CASE WHEN [a_key] = 0 THEN 0 ELSE (CASE WHEN [a_key] > 0 THEN 1 ELSE -1 END) * CAST(SUBSTRING([a_key], 1 + INSTR([a_key], '-'), LENGTH([a_key]) / 2) AS INTEGER) END)), ('creation_timestamp', UNMASK::(DATETIME([a_open_ts], '+123456789 seconds')))], orderings=[(b_name):asc_first]) + FILTER(condition=RANKING(args=[], partition=[a_branchkey], order=[(YEAR(UNMASK::(DATETIME([a_open_ts], '+123456789 seconds'))) == 2021:numeric):asc_last, (UNMASK::(CASE WHEN [a_key] = 0 THEN 0 ELSE (CASE WHEN [a_key] > 0 THEN 1 ELSE -1 END) * CAST(SUBSTRING([a_key], 1 + INSTR([a_key], '-'), LENGTH([a_key]) / 2) AS INTEGER) END)):asc_last], allow_ties=False) == 1:numeric, columns={'a_key': a_key, 'a_open_ts': a_open_ts, 'b_name': b_name}) + JOIN(condition=t0.b_key == t1.a_branchkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_FILTER, columns={'a_branchkey': t1.a_branchkey, 'a_key': t1.a_key, 'a_open_ts': t1.a_open_ts, 'b_name': t0.b_name}) + FILTER(condition=CONTAINS(b_addr, ';CA;':string), columns={'b_key': b_key, 'b_name': b_name}) + SCAN(table=CRBNK.BRANCHES, columns={'b_addr': b_addr, 'b_key': b_key, 'b_name': b_name}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_branchkey': a_branchkey, 'a_key': a_key, 'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/cryptbank_window_02_rewrite.txt b/tests/test_plan_refsols/cryptbank_window_02_rewrite.txt new file mode 100644 index 000000000..eeb528693 --- /dev/null +++ b/tests/test_plan_refsols/cryptbank_window_02_rewrite.txt @@ -0,0 +1,6 @@ +ROOT(columns=[('branch_name', b_name), ('key', UNMASK::(CASE WHEN [a_key] = 0 THEN 0 ELSE (CASE WHEN [a_key] > 0 THEN 1 ELSE -1 END) * CAST(SUBSTRING([a_key], 1 + INSTR([a_key], '-'), LENGTH([a_key]) / 2) AS INTEGER) END)), ('creation_timestamp', UNMASK::(DATETIME([a_open_ts], '+123456789 seconds')))], orderings=[(b_name):asc_first]) + FILTER(condition=RANKING(args=[], partition=[a_branchkey], order=[(ISIN(a_open_ts, ['2017-02-11 10:59:51', '2017-06-15 12:41:51', '2017-07-07 14:26:51', '2017-07-09 12:21:51', '2017-09-15 11:26:51', '2018-01-02 12:26:51']:array[unknown])):asc_last, (UNMASK::(CASE WHEN [a_key] = 0 THEN 0 ELSE (CASE WHEN [a_key] > 0 THEN 1 ELSE -1 END) * CAST(SUBSTRING([a_key], 1 + INSTR([a_key], '-'), LENGTH([a_key]) / 2) AS INTEGER) END)):asc_last], allow_ties=False) == 1:numeric, columns={'a_key': a_key, 'a_open_ts': a_open_ts, 'b_name': b_name}) + JOIN(condition=t0.b_key == t1.a_branchkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_FILTER, columns={'a_branchkey': t1.a_branchkey, 'a_key': t1.a_key, 'a_open_ts': t1.a_open_ts, 'b_name': t0.b_name}) + FILTER(condition=CONTAINS(b_addr, ';CA;':string), columns={'b_key': b_key, 'b_name': b_name}) + SCAN(table=CRBNK.BRANCHES, columns={'b_addr': b_addr, 'b_key': b_key, 'b_name': b_name}) + SCAN(table=CRBNK.ACCOUNTS, columns={'a_branchkey': a_branchkey, 'a_key': a_key, 
'a_open_ts': a_open_ts}) diff --git a/tests/test_plan_refsols/fsi_accounts_customers_compound_a_rewrite.txt b/tests/test_plan_refsols/fsi_accounts_customers_compound_a_rewrite.txt index f3451d2a7..b884e6bca 100644 --- a/tests/test_plan_refsols/fsi_accounts_customers_compound_a_rewrite.txt +++ b/tests/test_plan_refsols/fsi_accounts_customers_compound_a_rewrite.txt @@ -1,7 +1,7 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) JOIN(condition=UNMASK::(PTY_UNPROTECT_ACCOUNT([t0.customerid])) == UNMASK::(PTY_UNPROTECT([t1.customerid], 'deAccount')), type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_FILTER, columns={}) - FILTER(condition=currency != MASK::(PTY_PROTECT(['GBP':string], 'deAccount')) & balance < 20000:numeric, columns={'customerid': customerid}) + FILTER(condition=balance < 20000:numeric & ISIN(currency, ['jpb', 'gFr']:array[unknown]), columns={'customerid': customerid}) SCAN(table=bodo.fsi.accounts, columns={'balance': balance, 'currency': currency, 'customerid': customerid}) - FILTER(condition=state == MASK::(PTY_PROTECT(['California':string], 'deAddress')), columns={'customerid': customerid}) + FILTER(condition=state == 'V6kSQBaqGv':unknown, columns={'customerid': customerid}) SCAN(table=bodo.fsi.protected_customers, columns={'customerid': customerid, 'state': state}) diff --git a/tests/test_plan_refsols/fsi_accounts_customers_compound_b_rewrite.txt b/tests/test_plan_refsols/fsi_accounts_customers_compound_b_rewrite.txt index db5c10d1f..603d39a83 100644 --- a/tests/test_plan_refsols/fsi_accounts_customers_compound_b_rewrite.txt +++ b/tests/test_plan_refsols/fsi_accounts_customers_compound_b_rewrite.txt @@ -1,7 +1,7 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) JOIN(condition=UNMASK::(PTY_UNPROTECT_ACCOUNT([t0.customerid])) == UNMASK::(PTY_UNPROTECT([t1.customerid], 'deAccount')), type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_FILTER, columns={}) - FILTER(condition=YEAR(UNMASK::(PTY_UNPROTECT_DOB([createddate]))) <= 2022:numeric & ISIN(currency, [Call(op=MASK, inputs=[Literal(value='USD', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='GPB', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='EUR', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='JPY', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='AUD', type=UnknownType())], return_type=StringType())]:bool), columns={'customerid': customerid}) + FILTER(condition=YEAR(UNMASK::(PTY_UNPROTECT_DOB([createddate]))) <= 2022:numeric & ISIN(currency, ['jpb', 'gFr']:array[unknown]), columns={'customerid': customerid}) SCAN(table=bodo.fsi.accounts, columns={'createddate': createddate, 'currency': currency, 'customerid': customerid}) - FILTER(condition=ISIN(state, [Call(op=MASK, inputs=[Literal(value='Georgia', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Alabama', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Mississippi', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Arkansas', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Louisiana', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Florida', type=UnknownType())], return_type=StringType()), Call(op=MASK, 
inputs=[Literal(value='South Carolina', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='North Carolina', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Texas', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Tennessee', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Missouri', type=UnknownType())], return_type=StringType())]:bool) & NOT(ISIN(firstname, [Call(op=MASK, inputs=[Literal(value='Jennifer', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Julio', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Johnson', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Jameson', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Michael', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Robert', type=UnknownType())], return_type=StringType())]:bool)), columns={'customerid': customerid}) + FILTER(condition=ISIN(state, ['EdJ6cty', 'raXuWJGK', '4o0uuG1', 'FvlL1x8', 'TY84qyAxy', 'AqjyPuvoU8d', 'q6OaWD9X', 'MZBK0 U3nQzZbb', 'lN1sA AANifXzd', 'JXtZBpRhT', 'YYE75']:array[unknown]) & NOT(ISIN(firstname, ['tzuhpuCF', 'cPBnsOl', 'NVGimP']:array[unknown])), columns={'customerid': customerid}) SCAN(table=bodo.fsi.protected_customers, columns={'customerid': customerid, 'firstname': firstname, 'state': state}) diff --git a/tests/test_plan_refsols/fsi_accounts_customers_compound_c_rewrite.txt b/tests/test_plan_refsols/fsi_accounts_customers_compound_c_rewrite.txt index 6bc643316..a0c36bdbe 100644 --- a/tests/test_plan_refsols/fsi_accounts_customers_compound_c_rewrite.txt +++ b/tests/test_plan_refsols/fsi_accounts_customers_compound_c_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=MONOTONIC('2020-01-31':string, UNMASK::(PTY_UNPROTECT_DOB([createddate])), '2020-03-13':string) | MONOTONIC('2022-12-25':string, UNMASK::(PTY_UNPROTECT_DOB([createddate])), '2023-01-15':string) | MONOTONIC('2024-08-04':string, UNMASK::(PTY_UNPROTECT_DOB([createddate])), '2024-11-08':string) | MONOTONIC('2025-06-07':string, UNMASK::(PTY_UNPROTECT_DOB([createddate])), '2026-03-07':string), columns={}) + FILTER(condition=ISIN(createddate, ['3149-05-04', '1478-09-27', '2396-11-12', '0714-10-12', '2461-03-25', '2326-07-19', '2883-05-12', '1368-06-18', '2386-05-20', '2241-06-04', '2413-07-10', '1464-06-25', '2308-05-18', '2690-01-11', '0937-05-21', '0794-10-27', '2856-02-06', '1335-02-11', '1605-10-12', '2456-12-12', '1610-12-28', '1267-04-15', '2133-09-29', '3337-02-07', '1403-12-19', '1484-05-22']:array[unknown]), columns={}) SCAN(table=bodo.fsi.accounts, columns={'createddate': createddate}) diff --git a/tests/test_plan_refsols/fsi_customers_accounts_join_rewrite.txt b/tests/test_plan_refsols/fsi_customers_accounts_join_rewrite.txt index 0e96429f6..1cef79db3 100644 --- a/tests/test_plan_refsols/fsi_customers_accounts_join_rewrite.txt +++ b/tests/test_plan_refsols/fsi_customers_accounts_join_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('num_customers_checking_accounts', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=accounttype != MASK::(PTY_PROTECT(['checking':string], 'deAccount')), columns={}) + FILTER(condition=ISIN(accounttype, ['HPlnssRN', 
'XADfRcm']:array[unknown]), columns={})
 SCAN(table=bodo.fsi.accounts, columns={'accounttype': accounttype})
diff --git a/tests/test_plan_refsols/fsi_customers_filter_isin_rewrite.txt b/tests/test_plan_refsols/fsi_customers_filter_isin_rewrite.txt
index 2021c990a..561498fea 100644
--- a/tests/test_plan_refsols/fsi_customers_filter_isin_rewrite.txt
+++ b/tests/test_plan_refsols/fsi_customers_filter_isin_rewrite.txt
@@ -1,4 +1,4 @@
 ROOT(columns=[('n', n_rows)], orderings=[])
 AGGREGATE(keys={}, aggregations={'n_rows': COUNT()})
- FILTER(condition=ISIN(lastname, [Call(op=MASK, inputs=[Literal(value='Barnes', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Hernandez', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Moore', type=UnknownType())], return_type=StringType())]:bool), columns={})
+ FILTER(condition=ISIN(lastname, ['CyypMP', 'TlwQYRsjl', 'SmfgY']:array[unknown]), columns={})
 SCAN(table=bodo.fsi.protected_customers, columns={'lastname': lastname})
diff --git a/tests/test_plan_refsols/fsi_customers_filter_not_isin_rewrite.txt b/tests/test_plan_refsols/fsi_customers_filter_not_isin_rewrite.txt
index fdd20a45a..59eddfa22 100644
--- a/tests/test_plan_refsols/fsi_customers_filter_not_isin_rewrite.txt
+++ b/tests/test_plan_refsols/fsi_customers_filter_not_isin_rewrite.txt
@@ -1,4 +1,4 @@
 ROOT(columns=[('n', n_rows)], orderings=[])
 AGGREGATE(keys={}, aggregations={'n_rows': COUNT()})
- FILTER(condition=NOT(ISIN(lastname, [Call(op=MASK, inputs=[Literal(value='Barnes', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Hernandez', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Moore', type=UnknownType())], return_type=StringType())]:bool)), columns={})
+ FILTER(condition=NOT(ISIN(lastname, ['CyypMP', 'TlwQYRsjl', 'SmfgY']:array[unknown])), columns={})
 SCAN(table=bodo.fsi.protected_customers, columns={'lastname': lastname})
diff --git a/tests/test_plan_refsols/health_claims_filter_day_rewrite.txt b/tests/test_plan_refsols/health_claims_filter_day_rewrite.txt
index 5541f71b5..faec07875 100644
--- a/tests/test_plan_refsols/health_claims_filter_day_rewrite.txt
+++ b/tests/test_plan_refsols/health_claims_filter_day_rewrite.txt
@@ -1,4 +1,4 @@
 ROOT(columns=[('n', n_rows)], orderings=[])
 AGGREGATE(keys={}, aggregations={'n_rows': COUNT()})
- FILTER(condition=DAY(UNMASK::(PTY_UNPROTECT_DOB([claim_date]))) == 31:numeric, columns={})
+ FILTER(condition=ISIN(claim_date, ['2666-05-02', '2627-10-27', '2896-11-08', '0775-03-22', '1471-09-22', '3175-06-30', '1909-08-08', '2063-10-13', '3095-04-16', '1842-06-18', '1292-11-24', '1324-05-13', '2757-05-10', '1415-01-25']:array[unknown]), columns={})
 SCAN(table=bodo.health.claims, columns={'claim_date': claim_date})
diff --git a/tests/test_plan_refsols/retail_all_raw.txt b/tests/test_plan_refsols/retail_all_raw.txt
new file mode 100644
index 000000000..1d6902166
--- /dev/null
+++ b/tests/test_plan_refsols/retail_all_raw.txt
@@ -0,0 +1,4 @@
+ROOT(columns=[('n', n_rows)], orderings=[])
+ AGGREGATE(keys={}, aggregations={'n_rows': COUNT()})
+ FILTER(condition=YEAR(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))) < 2026:numeric, columns={})
+ SCAN(table=bodo.retail.protected_loyalty_members, columns={'date_of_birth': date_of_birth})
diff --git a/tests/test_plan_refsols/retail_all_rewrite.txt b/tests/test_plan_refsols/retail_all_rewrite.txt
new file mode 100644
index 000000000..b252b0c5e
--- /dev/null
+++ 
b/tests/test_plan_refsols/retail_all_rewrite.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + SCAN(table=bodo.retail.protected_loyalty_members, columns={}) diff --git a/tests/test_plan_refsols/retail_members_compound_a_rewrite.txt b/tests/test_plan_refsols/retail_members_compound_a_rewrite.txt index 1422ebb17..053492e14 100644 --- a/tests/test_plan_refsols/retail_members_compound_a_rewrite.txt +++ b/tests/test_plan_refsols/retail_members_compound_a_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB')) >= datetime.date(2002, 1, 1):datetime & ISIN(last_name, [Call(op=MASK, inputs=[Literal(value='Johnson', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Robinson', type=UnknownType())], return_type=StringType())]:bool), columns={}) + FILTER(condition=ISIN(date_of_birth, ['1922-08-06', '0913-06-11', '2142-09-01', '2006-03-03', '0915-05-17', '1823-12-26', '1722-02-13', '2208-12-06', '2350-04-16', '2973-02-23', '1484-10-19', '1924-10-25', '2544-09-01', '2363-10-31', '2685-03-23', '1040-04-02', '3136-09-15', '1569-07-03', '1804-07-19', '1543-07-16', '2478-02-14', '0983-02-13', '2243-03-06', '2628-10-02', '2064-12-22', '1463-05-18', '1078-01-28', '1125-11-24', '1405-11-12', '3290-02-08', '1278-11-09', '3093-06-09', '1464-06-16', '2613-07-13', '1964-08-20', '1061-01-22', '2797-05-10', '1905-02-26', '1938-07-08', '1535-05-03', '1289-11-13', '1818-01-12', '1073-07-09', '2605-10-18', '1711-07-03', '3018-03-01', '2830-08-29']:array[unknown]) & ISIN(last_name, ['xnUVZyS', 'UcoQBfzB']:array[unknown]), columns={}) SCAN(table=bodo.retail.protected_loyalty_members, columns={'date_of_birth': date_of_birth, 'last_name': last_name}) diff --git a/tests/test_plan_refsols/retail_members_compound_b_rewrite.txt b/tests/test_plan_refsols/retail_members_compound_b_rewrite.txt index 530b3f93d..203ac6a84 100644 --- a/tests/test_plan_refsols/retail_members_compound_b_rewrite.txt +++ b/tests/test_plan_refsols/retail_members_compound_b_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=last_name != MASK::(PTY_PROTECT(['Smith':string], 'deName')) & date_of_birth == MASK::(PTY_PROTECT([datetime.date(1979, 3, 7):datetime], 'deDOB')), columns={}) + FILTER(condition=last_name != MASK::(PTY_PROTECT(['Smith':string], 'deName')) & date_of_birth == '1622-10-03':unknown, columns={}) SCAN(table=bodo.retail.protected_loyalty_members, columns={'date_of_birth': date_of_birth, 'last_name': last_name}) diff --git a/tests/test_plan_refsols/retail_members_compound_e_rewrite.txt b/tests/test_plan_refsols/retail_members_compound_e_rewrite.txt index e14a2e9b0..a4c5878d8 100644 --- a/tests/test_plan_refsols/retail_members_compound_e_rewrite.txt +++ b/tests/test_plan_refsols/retail_members_compound_e_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB')) < datetime.date(1983, 1, 30):datetime & UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB')) >= datetime.date(1983, 1, 10):datetime, columns={}) + FILTER(condition=ISIN(date_of_birth, ['2637-10-01', '1403-11-22']:array[unknown]), columns={}) SCAN(table=bodo.retail.protected_loyalty_members, columns={'date_of_birth': 
date_of_birth}) diff --git a/tests/test_plan_refsols/retail_members_compound_f_rewrite.txt b/tests/test_plan_refsols/retail_members_compound_f_rewrite.txt index 65af4ef09..dc0445dc1 100644 --- a/tests/test_plan_refsols/retail_members_compound_f_rewrite.txt +++ b/tests/test_plan_refsols/retail_members_compound_f_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB')) <= datetime.date(1976, 7, 28):datetime & UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB')) > datetime.date(1976, 7, 1):datetime, columns={}) + FILTER(condition=ISIN(date_of_birth, ['1357-07-11', '0988-09-15']:array[unknown]), columns={}) SCAN(table=bodo.retail.protected_loyalty_members, columns={'date_of_birth': date_of_birth}) diff --git a/tests/test_plan_refsols/retail_members_compound_g_rewrite.txt b/tests/test_plan_refsols/retail_members_compound_g_rewrite.txt index ab64fa711..8e495c01b 100644 --- a/tests/test_plan_refsols/retail_members_compound_g_rewrite.txt +++ b/tests/test_plan_refsols/retail_members_compound_g_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=DAY(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))) <= 13:numeric & DAY(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))) > 3:numeric & ISIN(MONTH(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))), [1, 2, 5, 10, 12]:array[unknown]) & ISIN(YEAR(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))), [1960, 1970, 1980, 1990, 2000]:array[unknown]), columns={}) + FILTER(condition=DAY(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))) <= 13:numeric & ISIN(MONTH(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))), [1, 2, 5, 10, 12]:array[unknown]) & ISIN(YEAR(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))), [1960, 1970, 1980, 1990, 2000]:array[unknown]) & NOT(ISIN(date_of_birth, ['0897-01-11', '0681-01-31', '2337-11-25', '0765-06-07', '3270-09-07', '3114-01-23', '0946-07-28', '0671-06-23', '1030-02-26', '2892-07-01', '1787-09-06', '2191-11-24', '0912-05-28', '1828-09-20', '1318-12-03', '0660-08-20', '1546-05-12', '2064-12-18', '1664-12-03', '0627-05-21', '1348-11-22', '3202-05-20', '0959-04-01', '1397-05-24', '3184-08-05', '2207-02-22', '2388-11-19', '2563-07-20', '3159-09-21', '2692-10-23', '1365-12-07', '1712-02-18', '0846-08-04', '3332-01-06', '2501-07-04', '3297-10-03', '2235-01-19', '2006-03-03', '2544-09-01', '1543-07-16']:array[unknown])), columns={}) SCAN(table=bodo.retail.protected_loyalty_members, columns={'date_of_birth': date_of_birth}) diff --git a/tests/test_plan_refsols/retail_members_compound_h_rewrite.txt b/tests/test_plan_refsols/retail_members_compound_h_rewrite.txt index 88edd4783..e8d0806cb 100644 --- a/tests/test_plan_refsols/retail_members_compound_h_rewrite.txt +++ b/tests/test_plan_refsols/retail_members_compound_h_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB')) < datetime.date(2007, 1, 1):datetime & UNMASK::(PTY_UNPROTECT_NAME([last_name])) >= 'Cross':string, columns={}) + FILTER(condition=date_of_birth != '2605-10-18':unknown & UNMASK::(PTY_UNPROTECT_NAME([last_name])) >= 'Cross':string, columns={}) SCAN(table=bodo.retail.protected_loyalty_members, columns={'date_of_birth': date_of_birth, 'last_name': last_name}) diff --git 
a/tests/test_plan_refsols/retail_members_compound_j_rewrite.txt b/tests/test_plan_refsols/retail_members_compound_j_rewrite.txt index b9366f919..120646627 100644 --- a/tests/test_plan_refsols/retail_members_compound_j_rewrite.txt +++ b/tests/test_plan_refsols/retail_members_compound_j_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=CONTAINS(LOWER(UNMASK::(PTY_UNPROTECT_NAME([last_name]))), 'hu':string), columns={}) + FILTER(condition=ISIN(last_name, ['jNPacL', 'NIAZ', 'eIVERzXY', 'tREJmG', 'cxyIdcy']:array[unknown]), columns={}) SCAN(table=bodo.retail.protected_loyalty_members, columns={'last_name': last_name}) diff --git a/tests/test_plan_refsols/retail_members_filter_name_endswith_rewrite.txt b/tests/test_plan_refsols/retail_members_filter_name_endswith_rewrite.txt index 6c2b46896..98e3e2918 100644 --- a/tests/test_plan_refsols/retail_members_filter_name_endswith_rewrite.txt +++ b/tests/test_plan_refsols/retail_members_filter_name_endswith_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ENDSWITH(UNMASK::(PTY_UNPROTECT([first_name], 'deName')), 'e':string) | ENDSWITH(UNMASK::(PTY_UNPROTECT_NAME([last_name])), 'e':string), columns={}) + FILTER(condition=ISIN(first_name, ['CdCPvr', 'EKjOcM', 'euaiZD', 'DsOlJExPB', 'yyrzPqwYJ', 'veSbKfjZ', 'ltpzJF', 'QlYGrf', 'wrJPcBLnb', 'aPZukW', 'zQhHu', 'rBysMhdxNH', 'xSofz', 'CvHV', 'UhnVJm', 'zixlYsG', 'OXzucS', 'nRhMWQ', 'oKd', 'rASYq', 'mFtb', 'XdaEj', 'StqmwCvYW', 'zPgDshgP', 'OQMeTN', 'fcxiAcj', 'otHnLXhd', 'ZpEzCmV', 'pFTdpMJ', 'eMPChjxY', 'IzjmJq', 'wzDFEL', 'vhZhdhNRf', 'FoEhR', 'RxvEbkd', 'KrdDrun', 'sFBVM']:array[unknown]) | ISIN(last_name, ['XuCRC', 'vcVqo', 'xpFpz', 'rpnEFk', 'brcc', 'teibn', 'KLvNYE', 'OgARIx', 'aPZukW', 'RZnrOO', 'LFtAm', 'VTFJ', 'NaTJ', 'gYR', 'SUvctz', 'SmfgY', 'FgeTdq', 'EYAd', 'iPPF', 'LEcnd', 'YYsb', 'wlBDLGE', 'xAzGl']:array[unknown]), columns={}) SCAN(table=bodo.retail.protected_loyalty_members, columns={'first_name': first_name, 'last_name': last_name}) diff --git a/tests/test_plan_refsols/retail_names_analysis_a_raw.txt b/tests/test_plan_refsols/retail_names_analysis_a_raw.txt new file mode 100644 index 000000000..65b95754b --- /dev/null +++ b/tests/test_plan_refsols/retail_names_analysis_a_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS('GT':string, UPPER(SLICE(UNMASK::(PTY_UNPROTECT([first_name], 'deName')), None:unknown, 1:numeric, None:unknown))), columns={}) + SCAN(table=bodo.retail.protected_loyalty_members, columns={'first_name': first_name}) diff --git a/tests/test_plan_refsols/retail_names_analysis_a_rewrite.txt b/tests/test_plan_refsols/retail_names_analysis_a_rewrite.txt new file mode 100644 index 000000000..c24d5cbc7 --- /dev/null +++ b/tests/test_plan_refsols/retail_names_analysis_a_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(first_name, ['wrJPcBLnb', 'tfdP', 'aPZukW', 'FllTrn', 'dWFj', 'zQhHu', 'cLxbra', 'iShNn', 'nrvyDT', 'Eikudy', 'dDxAuD', 'RwQZcxw', 'RnYgnn', 'UMwsjSm', 'VjzF', 'lDKVA', 'DAzoEa', 'POnnEr', 'EGBa']:array[unknown]), columns={}) + SCAN(table=bodo.retail.protected_loyalty_members, columns={'first_name': first_name}) diff --git a/tests/test_plan_refsols/retail_names_analysis_b_raw.txt 
b/tests/test_plan_refsols/retail_names_analysis_b_raw.txt new file mode 100644 index 000000000..2e8548f25 --- /dev/null +++ b/tests/test_plan_refsols/retail_names_analysis_b_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=CONTAINS('day':string, LOWER(SLICE(UNMASK::(PTY_UNPROTECT([first_name], 'deName')), None:unknown, 2:numeric, None:unknown))), columns={}) + SCAN(table=bodo.retail.protected_loyalty_members, columns={'first_name': first_name}) diff --git a/tests/test_plan_refsols/retail_names_analysis_b_rewrite.txt b/tests/test_plan_refsols/retail_names_analysis_b_rewrite.txt new file mode 100644 index 000000000..7f144b658 --- /dev/null +++ b/tests/test_plan_refsols/retail_names_analysis_b_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(first_name, ['ZcH', 'MgEZTa', 'veSbKfjZ', 'HBRvO', 'jvUyLK', 'tdfnU']:array[unknown]), columns={}) + SCAN(table=bodo.retail.protected_loyalty_members, columns={'first_name': first_name}) diff --git a/tests/test_plan_refsols/retail_none_raw.txt b/tests/test_plan_refsols/retail_none_raw.txt new file mode 100644 index 000000000..93fbf94a8 --- /dev/null +++ b/tests/test_plan_refsols/retail_none_raw.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=YEAR(UNMASK::(PTY_UNPROTECT([date_of_birth], 'deDOB'))) >= 2026:numeric, columns={}) + SCAN(table=bodo.retail.protected_loyalty_members, columns={'date_of_birth': date_of_birth}) diff --git a/tests/test_plan_refsols/retail_none_rewrite.txt b/tests/test_plan_refsols/retail_none_rewrite.txt new file mode 100644 index 000000000..aef1db0af --- /dev/null +++ b/tests/test_plan_refsols/retail_none_rewrite.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n', n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=False:bool, columns={}) + SCAN(table=bodo.retail.protected_loyalty_members, columns={}) diff --git a/tests/test_plan_refsols/retail_transactions_payment_method_cmp_a_rewrite.txt b/tests/test_plan_refsols/retail_transactions_payment_method_cmp_a_rewrite.txt index ff57eb91f..65f17072e 100644 --- a/tests/test_plan_refsols/retail_transactions_payment_method_cmp_a_rewrite.txt +++ b/tests/test_plan_refsols/retail_transactions_payment_method_cmp_a_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=payment_method == MASK::(PTY_PROTECT_ACCOUNT(['Cash':string])), columns={}) + FILTER(condition=payment_method == 'CsNw':unknown, columns={}) SCAN(table=bodo.retail.transactions, columns={'payment_method': payment_method}) diff --git a/tests/test_plan_refsols/retail_transactions_payment_method_cmp_b_rewrite.txt b/tests/test_plan_refsols/retail_transactions_payment_method_cmp_b_rewrite.txt index 89904da39..787e165d3 100644 --- a/tests/test_plan_refsols/retail_transactions_payment_method_cmp_b_rewrite.txt +++ b/tests/test_plan_refsols/retail_transactions_payment_method_cmp_b_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=payment_method != MASK::(PTY_PROTECT_ACCOUNT(['Credit Card':string])), columns={}) + FILTER(condition=ISIN(payment_method, ['CsNw', 'DwXR YwQL', 'BaGWrt IqJfFoq']:array[unknown]), columns={}) 
SCAN(table=bodo.retail.transactions, columns={'payment_method': payment_method}) diff --git a/tests/test_plan_refsols/retail_transactions_payment_method_cmp_c_rewrite.txt b/tests/test_plan_refsols/retail_transactions_payment_method_cmp_c_rewrite.txt index 36ff8674c..62d7b5e54 100644 --- a/tests/test_plan_refsols/retail_transactions_payment_method_cmp_c_rewrite.txt +++ b/tests/test_plan_refsols/retail_transactions_payment_method_cmp_c_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ISIN(payment_method, [Call(op=MASK, inputs=[Literal(value='Cash', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Gift Card', type=UnknownType())], return_type=StringType())]:bool), columns={}) + FILTER(condition=ISIN(payment_method, ['CsNw', 'DwXR YwQL']:array[unknown]), columns={}) SCAN(table=bodo.retail.transactions, columns={'payment_method': payment_method}) diff --git a/tests/test_plan_refsols/retail_transactions_payment_method_cmp_d_rewrite.txt b/tests/test_plan_refsols/retail_transactions_payment_method_cmp_d_rewrite.txt index 4853a4398..5ad240062 100644 --- a/tests/test_plan_refsols/retail_transactions_payment_method_cmp_d_rewrite.txt +++ b/tests/test_plan_refsols/retail_transactions_payment_method_cmp_d_rewrite.txt @@ -1,4 +1,4 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=NOT(ISIN(payment_method, [Call(op=MASK, inputs=[Literal(value='Mobile Payment', type=UnknownType())], return_type=StringType()), Call(op=MASK, inputs=[Literal(value='Gift Card', type=UnknownType())], return_type=StringType())]:bool)), columns={}) + FILTER(condition=ISIN(payment_method, ['CsNw', 'JrVjGo Mdvt']:array[unknown]), columns={}) SCAN(table=bodo.retail.transactions, columns={'payment_method': payment_method}) diff --git a/tests/test_plan_refsols/retail_transactions_ts_raw.txt b/tests/test_plan_refsols/retail_transactions_ts_raw.txt new file mode 100644 index 000000000..46d76c896 --- /dev/null +++ b/tests/test_plan_refsols/retail_transactions_ts_raw.txt @@ -0,0 +1,20 @@ +ROOT(columns=[('n1', n_rows), ('n2', agg_1), ('n3', agg_2), ('n4', agg_3), ('n5', agg_4)], orderings=[]) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t0.agg_2, 'agg_3': t0.agg_3, 'agg_4': t1.n_rows, 'n_rows': t0.n_rows}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t0.agg_2, 'agg_3': t1.n_rows, 'n_rows': t0.n_rows}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t1.n_rows, 'n_rows': t0.n_rows}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t1.n_rows, 'n_rows': t0.n_rows}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 1:numeric & HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 7:numeric, columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 2:numeric & HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 7:numeric, columns={}) + 
SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 1:numeric & HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 8:numeric, columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 2:numeric & HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 8:numeric, columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 4:numeric & HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 3:numeric | MINUTE(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == SECOND(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) & HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 3:numeric, columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) diff --git a/tests/test_plan_refsols/retail_transactions_ts_rewrite.txt b/tests/test_plan_refsols/retail_transactions_ts_rewrite.txt new file mode 100644 index 000000000..a03d36ac8 --- /dev/null +++ b/tests/test_plan_refsols/retail_transactions_ts_rewrite.txt @@ -0,0 +1,20 @@ +ROOT(columns=[('n1', n_rows), ('n2', agg_1), ('n3', agg_2), ('n4', agg_3), ('n5', agg_4)], orderings=[]) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t0.agg_2, 'agg_3': t0.agg_3, 'agg_4': t1.n_rows, 'n_rows': t0.n_rows}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t0.agg_2, 'agg_3': t1.n_rows, 'n_rows': t0.n_rows}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t1.n_rows, 'n_rows': t0.n_rows}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t1.n_rows, 'n_rows': t0.n_rows}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(transaction_date, ['2178-03-20 07:19:29', '2825-09-23 07:37:08']:array[unknown]), columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(transaction_date, ['1010-12-08 07:23:35', '2328-01-19 07:33:25', '1577-03-20 07:41:29', '1345-03-06 07:41:47', '0937-05-21 07:27:48', '2176-01-07 07:07:03']:array[unknown]), columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(transaction_date, ['3120-07-22 08:30:44', '1890-02-18 08:21:13', '1890-02-18 08:46:51']:array[unknown]), columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(transaction_date, ['2052-11-18 08:24:33', '2052-11-18 08:32:00', '1577-03-20 08:03:51', '1577-03-20 08:32:17', '2550-01-17 08:56:44', '1551-03-04 08:36:08']:array[unknown]), columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) + 
AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=ISIN(transaction_date, ['2268-07-06 01:50:11', '3056-08-07 01:18:26', '3120-07-22 02:43:20', '1010-12-08 01:47:15', '1440-10-15 02:26:30', '3054-12-02 00:51:55', '3031-02-17 00:54:21', '1539-02-23 00:49:34', '2418-09-09 01:12:48', '2418-09-09 02:09:31', '2551-01-12 00:34:57', '3141-01-25 02:24:01', '2328-01-19 01:20:40', '1577-03-20 00:27:19', '1608-08-20 00:10:55', '1608-08-20 01:12:55', '1608-08-20 02:14:47', '2825-09-23 02:31:19', '1286-12-21 00:21:24', '1286-12-21 01:25:46', '3300-07-12 00:15:35', '2059-07-23 01:56:15', '2955-06-27 00:48:34', '2955-06-27 01:24:43', '0937-05-21 00:40:43', '0930-11-28 02:44:19', '1605-10-12 00:58:57', '0781-08-29 02:28:10', '2374-09-21 00:21:42', '2374-09-21 02:10:55', '3022-05-13 01:56:21', '3088-03-30 01:09:15', '3088-03-30 02:38:56', '1757-01-16 00:20:29', '3287-10-20 01:17:31', '2555-09-08 00:40:20', '2555-09-08 01:20:22', '2555-09-08 02:36:58', '2176-01-07 02:50:08', '2282-06-16 00:21:35', '2595-05-23 01:32:01', '3237-05-26 01:19:24', '3237-05-26 01:52:49', '2780-03-19 01:32:32', '2780-03-19 02:33:01']:array[unknown]) | ISIN(transaction_date, ['1752-07-20 01:18:18', '1880-04-06 00:47:47', '2956-09-24 00:03:03', '1868-06-13 01:22:22', '0780-03-23 01:14:14', '1598-04-24 01:11:11', '0763-04-15 00:16:16', '2780-03-19 01:32:32']:array[unknown]), columns={}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) diff --git a/tests/test_sql_refsols/cryptbank_agg_01_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_agg_01_rewrite_sqlite.sql index ac0ae3093..e8bb433e6 100644 --- a/tests/test_sql_refsols/cryptbank_agg_01_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_agg_01_rewrite_sqlite.sql @@ -4,5 +4,4 @@ SELECT )), 2) AS n FROM crbnk.transactions WHERE - CAST(STRFTIME('%Y', DATETIME(t_ts, '+54321 seconds')) AS INTEGER) = 2022 - AND CAST(STRFTIME('%m', DATETIME(t_ts, '+54321 seconds')) AS INTEGER) = 6 + t_ts IN ('2022-06-03 05:08:58', '2022-06-12 00:24:06', '2022-06-13 05:50:39', '2022-06-14 19:08:57', '2022-06-16 03:15:13', '2022-06-18 03:37:49', '2022-06-27 06:08:04', '2022-06-28 15:35:47', '2022-06-29 05:40:38', '2022-06-29 19:53:42') diff --git a/tests/test_sql_refsols/cryptbank_agg_06_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_agg_06_raw_sqlite.sql new file mode 100644 index 000000000..ed6bfdc6c --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_agg_06_raw_sqlite.sql @@ -0,0 +1,8 @@ +SELECT + COALESCE(SUM(( + 1025.67 - t_amount + ) < 0), 0) AS n_neg, + COALESCE(SUM(( + 1025.67 - t_amount + ) > 0), 0) AS n_positive +FROM crbnk.transactions diff --git a/tests/test_sql_refsols/cryptbank_agg_06_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_agg_06_rewrite_sqlite.sql new file mode 100644 index 000000000..104442af6 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_agg_06_rewrite_sqlite.sql @@ -0,0 +1,4 @@ +SELECT + SUM(FALSE) AS n_neg, + SUM(TRUE) AS n_positive +FROM crbnk.transactions diff --git a/tests/test_sql_refsols/cryptbank_agg_07_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_agg_07_raw_sqlite.sql new file mode 100644 index 000000000..d389c0bff --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_agg_07_raw_sqlite.sql @@ -0,0 +1,113 @@ +SELECT + COALESCE( + SUM(DATE(DATETIME(t_ts, '+54321 seconds'), 'start of year') = DATE('2023-01-01')), + 0 + ) AS n_yr, + COALESCE( + SUM( + DATE( + DATETIME(t_ts, '+54321 seconds'), + 'start of month', + '-' || CAST(( + ( + CAST(STRFTIME('%m', DATETIME(DATETIME(t_ts, '+54321 
seconds'))) AS INTEGER) - 1 + ) % 3 + ) AS TEXT) || ' months' + ) = DATE('2023-04-01') + ), + 0 + ) AS n_qu, + COALESCE( + SUM(DATE(DATETIME(t_ts, '+54321 seconds'), 'start of month') = DATE('2023-06-01')), + 0 + ) AS n_mo, + COALESCE( + SUM( + DATE( + DATETIME(t_ts, '+54321 seconds'), + '-' || CAST(CAST(STRFTIME('%w', DATETIME(DATETIME(t_ts, '+54321 seconds'))) AS INTEGER) AS TEXT) || ' days', + 'start of day' + ) = DATE('2023-05-28') + ), + 0 + ) AS n_we, + COALESCE( + SUM(DATE(DATETIME(t_ts, '+54321 seconds'), 'start of day') = DATE('2023-06-02')), + 0 + ) AS n_da, + COALESCE( + SUM( + STRFTIME('%Y-%m-%d %H:00:00', DATETIME(DATETIME(t_ts, '+54321 seconds'))) = '2023-06-02 04:00:00' + ), + 0 + ) AS n_ho, + COALESCE( + SUM( + STRFTIME('%Y-%m-%d %H:%M:00', DATETIME(DATETIME(t_ts, '+54321 seconds'))) = '2023-06-02 04:55:00' + ), + 0 + ) AS n_mi, + COALESCE( + SUM( + STRFTIME('%Y-%m-%d %H:%M:%S', DATETIME(DATETIME(t_ts, '+54321 seconds'))) = '2023-06-02 04:55:31' + ), + 0 + ) AS n_se, + COALESCE(SUM(DATE('now', 'start of day') = DATETIME(t_ts, '+54321 seconds')), 0) AS n_cts, + COALESCE(SUM(DATETIME('2025-12-31') = DATETIME(t_ts, '+54321 seconds')), 0) AS n_dts, + COALESCE( + SUM( + DATE( + DATETIME(t_ts, '+54321 seconds'), + '-' || CAST(CAST(STRFTIME('%w', DATETIME(DATETIME(t_ts, '+54321 seconds'))) AS INTEGER) AS TEXT) || ' days', + 'start of day', + '3 day' + ) = DATE('2023-05-31') + ), + 0 + ) AS n_nst, + COALESCE( + SUM(DATETIME(DATETIME(t_ts, '+54321 seconds'), '1 year') = '2020-11-11 18:00:52'), + 0 + ) AS n_ayr, + COALESCE( + SUM(DATETIME(DATETIME(t_ts, '+54321 seconds'), '6 month') = '2020-05-11 18:00:52'), + 0 + ) AS n_aqu, + COALESCE( + SUM(DATETIME(DATETIME(t_ts, '+54321 seconds'), '-5 month') = '2019-06-11 18:00:52'), + 0 + ) AS n_amo, + COALESCE( + SUM( + DATE(DATETIME(t_ts, '+54321 seconds'), 'start of day', '7 day') = DATE('2023-06-09') + ), + 0 + ) AS n_awe, + COALESCE( + SUM(DATETIME(DATETIME(t_ts, '+54321 seconds'), '10 day') = '2019-11-21 18:00:52'), + 0 + ) AS n_ada, + COALESCE( + SUM(DATETIME(DATETIME(t_ts, '+54321 seconds'), '1000 hour') = '2019-12-23 10:00:52'), + 0 + ) AS n_aho, + COALESCE( + SUM( + DATETIME(DATETIME(t_ts, '+54321 seconds'), '10000 minute') = '2019-11-18 16:40:52' + ), + 0 + ) AS n_ami, + COALESCE( + SUM( + DATETIME(DATETIME(t_ts, '+54321 seconds'), '-1000000 second') = '2019-10-31 04:14:12' + ), + 0 + ) AS n_ase, + COALESCE( + SUM( + DATE(DATETIME(t_ts, '+54321 seconds'), 'start of month', '-1 day') = DATE('2019-10-31') + ), + 0 + ) AS n_ldm +FROM crbnk.transactions diff --git a/tests/test_sql_refsols/cryptbank_agg_07_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_agg_07_rewrite_sqlite.sql new file mode 100644 index 000000000..85a40572e --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_agg_07_rewrite_sqlite.sql @@ -0,0 +1,66 @@ +SELECT + COALESCE( + SUM( + t_ts IN ('2022-12-31 17:42:54', '2023-01-04 12:05:15', '2023-01-07 22:11:27', '2023-01-20 04:38:03', '2023-01-20 16:40:54', '2023-01-27 15:13:18', '2023-01-30 19:58:26', '2023-02-02 19:12:58', '2023-02-11 11:13:53', '2023-02-11 12:32:55', '2023-02-15 21:54:29', '2023-02-16 14:18:36', '2023-02-28 07:11:29', '2023-03-07 01:26:10', '2023-03-08 18:58:18', '2023-03-14 14:23:33', '2023-03-16 06:17:44', '2023-03-17 08:48:16', '2023-03-24 03:33:40', '2023-03-26 06:52:52', '2023-04-18 00:35:40', '2023-04-25 18:54:26', '2023-04-29 04:58:30', '2023-05-04 23:30:10', '2023-05-12 04:42:28', '2023-05-17 18:54:12', '2023-05-19 10:10:44', '2023-05-21 13:52:14', '2023-05-24 03:51:10', '2023-06-01 
13:50:10', '2023-06-01 13:50:14', '2023-06-04 10:35:26', '2023-06-11 21:53:04', '2023-06-25 15:06:06', '2023-06-25 21:58:37', '2023-06-27 03:21:19', '2023-06-27 10:34:20', '2023-06-30 15:27:03', '2023-07-07 15:17:47', '2023-07-17 03:23:15', '2023-07-18 14:41:26', '2023-08-03 20:24:35', '2023-08-11 20:25:39', '2023-08-29 03:07:18', '2023-09-01 16:50:48', '2023-09-08 09:30:23', '2023-09-13 06:42:39', '2023-09-15 09:00:02', '2023-09-30 08:57:30', '2023-10-15 02:47:04', '2023-10-19 09:40:06', '2023-10-30 00:20:45', '2023-11-08 12:52:24', '2023-11-10 17:20:29', '2023-11-16 11:30:24', '2023-11-21 15:17:10', '2023-11-28 06:34:03', '2023-12-07 14:11:33', '2023-12-15 05:57:23', '2023-12-16 00:51:23', '2023-12-23 07:54:22') + ), + 0 + ) AS n_yr, + COALESCE( + SUM( + t_ts IN ('2023-04-18 00:35:40', '2023-04-25 18:54:26', '2023-04-29 04:58:30', '2023-05-04 23:30:10', '2023-05-12 04:42:28', '2023-05-17 18:54:12', '2023-05-19 10:10:44', '2023-05-21 13:52:14', '2023-05-24 03:51:10', '2023-06-01 13:50:10', '2023-06-01 13:50:14', '2023-06-04 10:35:26', '2023-06-11 21:53:04', '2023-06-25 15:06:06', '2023-06-25 21:58:37', '2023-06-27 03:21:19', '2023-06-27 10:34:20') + ), + 0 + ) AS n_qu, + COALESCE( + SUM( + t_ts IN ('2023-06-01 13:50:10', '2023-06-01 13:50:14', '2023-06-04 10:35:26', '2023-06-11 21:53:04', '2023-06-25 15:06:06', '2023-06-25 21:58:37', '2023-06-27 03:21:19', '2023-06-27 10:34:20') + ), + 0 + ) AS n_mo, + COALESCE( + SUM( + DATE( + DATETIME(t_ts, '+54321 seconds'), + '-' || CAST(CAST(STRFTIME('%w', DATETIME(DATETIME(t_ts, '+54321 seconds'))) AS INTEGER) AS TEXT) || ' days', + 'start of day' + ) = DATE('2023-05-28') + ), + 0 + ) AS n_we, + COALESCE(SUM(t_ts IN ('2023-06-01 13:50:10', '2023-06-01 13:50:14')), 0) AS n_da, + COALESCE(SUM(t_ts IN ('2023-06-01 13:50:10', '2023-06-01 13:50:14')), 0) AS n_ho, + COALESCE(SUM(t_ts IN ('2023-06-01 13:50:10', '2023-06-01 13:50:14')), 0) AS n_mi, + COALESCE(SUM(t_ts = '2023-06-01 13:50:10'), 0) AS n_se, + COALESCE(SUM(DATE('now', 'start of day') = DATETIME(t_ts, '+54321 seconds')), 0) AS n_cts, + COALESCE(SUM(DATETIME('2025-12-31') = DATETIME(t_ts, '+54321 seconds')), 0) AS n_dts, + COALESCE( + SUM( + DATE( + DATETIME(t_ts, '+54321 seconds'), + '-' || CAST(CAST(STRFTIME('%w', DATETIME(DATETIME(t_ts, '+54321 seconds'))) AS INTEGER) AS TEXT) || ' days', + 'start of day', + '3 day' + ) = DATE('2023-05-31') + ), + 0 + ) AS n_nst, + COALESCE(SUM(t_ts = '2019-11-11 02:55:31'), 0) AS n_ayr, + COALESCE(SUM(t_ts = '2019-11-11 02:55:31'), 0) AS n_aqu, + COALESCE(SUM(t_ts = '2019-11-11 02:55:31'), 0) AS n_amo, + COALESCE( + SUM( + DATE(DATETIME(t_ts, '+54321 seconds'), 'start of day', '7 day') = DATE('2023-06-09') + ), + 0 + ) AS n_awe, + COALESCE(SUM(t_ts = '2019-11-11 02:55:31'), 0) AS n_ada, + COALESCE(SUM(t_ts = '2019-11-11 02:55:31'), 0) AS n_aho, + COALESCE(SUM(t_ts = '2019-11-11 02:55:31'), 0) AS n_ami, + COALESCE(SUM(t_ts = '2019-11-11 02:55:31'), 0) AS n_ase, + COALESCE( + SUM( + t_ts IN ('2019-11-02 11:58:37', '2019-11-02 12:54:09', '2019-11-11 02:55:31', '2019-11-11 15:44:22') + ), + 0 + ) AS n_ldm +FROM crbnk.transactions diff --git a/tests/test_sql_refsols/cryptbank_analysis_04_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_analysis_04_rewrite_sqlite.sql index 82c586b20..999742bdc 100644 --- a/tests/test_sql_refsols/cryptbank_analysis_04_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_analysis_04_rewrite_sqlite.sql @@ -12,15 +12,12 @@ SELECT COUNT(*) AS n_trans FROM crbnk.accounts AS accounts JOIN crbnk.customers AS customers - ON 
CAST(STRFTIME('%Y', DATE(customers.c_birthday, '+472 days')) AS INTEGER) <= 1985 - AND CAST(STRFTIME('%Y', DATE(customers.c_birthday, '+472 days')) AS INTEGER) >= 1980 - AND accounts.a_custkey = ( + ON accounts.a_custkey = ( 42 - customers.c_key ) + AND customers.c_birthday IN ('1980-01-18', '1981-07-21', '1981-11-15', '1982-11-07', '1983-12-27') JOIN crbnk.transactions AS transactions - ON ( - 1025.67 - transactions.t_amount - ) > 9000.0 + ON transactions.t_amount IN (-8934.44, -8881.98, -8736.83, -8717.7, -8648.33, -8639.5, -8620.48, -8593.09, -8553.43, -8527.34, -8484.61, -8480.79, -8472.7, -8457.49, -8366.52, -8361.27, -8352.72, -8308.42, -8254.69, -8077.89, -8067.8) AND transactions.t_sourceaccount = CASE WHEN accounts.a_key = 0 THEN 0 diff --git a/tests/test_sql_refsols/cryptbank_filter_count_01_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_01_rewrite_sqlite.sql index bffd9c7c0..298dbab4e 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_01_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_01_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - c_lname = UPPER('lee') + c_lname = 'LEE' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_02_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_02_rewrite_sqlite.sql index f1f7b1c78..ec3a44be4 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_02_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_02_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - c_lname <> UPPER('lee') + c_lname <> 'LEE' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_03_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_03_rewrite_sqlite.sql index aa7550e49..a590ad01a 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_03_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_03_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - c_lname IN (UPPER('lee'), UPPER('smith'), UPPER('rodriguez')) + c_lname IN ('LEE', 'SMITH', 'RODRIGUEZ') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_04_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_04_rewrite_sqlite.sql index 6b329065c..5dc20fbac 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_04_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_04_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - NOT c_lname IN (UPPER('lee'), UPPER('smith'), UPPER('rodriguez')) + NOT c_lname IN ('LEE', 'SMITH', 'RODRIGUEZ') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_05_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_05_rewrite_sqlite.sql index 8205aea4b..983d9cffa 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_05_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_05_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - REPLACE(REPLACE(REPLACE(c_phone, '9', '*'), '0', '9'), '*', '0') LIKE '555-8%' + c_phone IN ('555-809-1234', '555-870-9123') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_06_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_06_rewrite_sqlite.sql index 6e69fc127..868bad685 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_06_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_06_rewrite_sqlite.sql @@ -2,6 +2,4 @@ SELECT COUNT(*) AS n FROM 
crbnk.customers WHERE - ( - SUBSTRING(c_email, -1) || SUBSTRING(c_email, 1, LENGTH(c_email) - 1) - ) LIKE '%gmail.com' + c_email IN ('livia.a22@gmail.como', 'ob.smith77@gmail.comb', 'ob_moore78@gmail.comr', 'opez.luke99@gmail.coml') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_07_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_07_rewrite_sqlite.sql index 4d7c59588..c6210227a 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_07_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_07_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER) = 1978 + c_birthday IN ('1976-10-27', '1976-12-02') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_08_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_08_rewrite_sqlite.sql index fc4234022..f334ffdeb 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_08_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_08_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - c_birthday = DATE('1985-04-12', '-472 days') + c_birthday = '1983-12-27' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_14_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_14_rewrite_sqlite.sql index d70b6decd..850d44d4a 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_14_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_14_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - LOWER(c_fname) LIKE '%e' OR LOWER(c_lname) LIKE '%e' + c_fname IN ('ALICE', 'GRACE', 'LUKE', 'QUEENIE') OR c_lname IN ('LEE', 'MOORE') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_16_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_16_rewrite_sqlite.sql index 41580c0f1..d14b6d3f4 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_16_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_16_rewrite_sqlite.sql @@ -3,12 +3,7 @@ WITH _u_0 AS ( a_custkey AS _u_1 FROM crbnk.accounts WHERE - a_type <> ( - SUBSTRING('checking', 2) || SUBSTRING('checking', 1, 1) - ) - AND a_type <> ( - SUBSTRING('savings', 2) || SUBSTRING('savings', 1, 1) - ) + NOT a_type IN ('avingss', 'heckingc') GROUP BY 1 ) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_18_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_18_rewrite_sqlite.sql index 598be0fcf..e27ab0f98 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_18_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_18_rewrite_sqlite.sql @@ -2,6 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - ( - SUBSTRING(c_email, -1) || SUBSTRING(c_email, 1, LENGTH(c_email) - 1) - ) LIKE '%.%@%mail%' + c_email IN ('ophia.jackson@mail.orgs', 'livia.a22@gmail.como', '.gonzalez@ymail.comm', 'opez.luke99@gmail.coml', 'enry.g@fastmail.comh', 'rank.k@protonmail.comf', 'mily.jones@mail.come', 'ob.smith77@gmail.comb') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_19_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_19_rewrite_sqlite.sql index 565b89e92..25e1ef8f2 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_19_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_19_rewrite_sqlite.sql @@ -2,6 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - ( - SUBSTRING(c_email, -1) || SUBSTRING(c_email, 1, LENGTH(c_email) - 1) - ) 
LIKE '%mail%' + NOT c_email IN ('homasl@outlook.comt', 'ueenie.t@outlook.netq', '.hernandez@icloud.comk', 'martinez94@outlook.orgj', 'sa.rodriguez@zoho.comi', '.brown88@yahoo.comd', '.lee@outlook.comc', 'lice_j@example.orga') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_20_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_20_rewrite_sqlite.sql index 5bf9a250d..1a0d73758 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_20_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_20_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - DATE(c_birthday, '+472 days') > DATE('1991-11-15') + c_birthday IN ('1991-03-13', '1992-05-06', '1993-01-01', '1994-06-15') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_21_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_21_rewrite_sqlite.sql index d20b706ad..24dcf01a2 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_21_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_21_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - DATE(c_birthday, '+472 days') >= DATE('1991-11-15') + c_birthday IN ('1990-07-31', '1991-03-13', '1992-05-06', '1993-01-01', '1994-06-15') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_22_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_22_rewrite_sqlite.sql index b370979ce..7a4ae7954 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_22_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_22_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - DATE(c_birthday, '+472 days') < DATE('1991-11-15') + NOT c_birthday IN ('1990-07-31', '1991-03-13', '1992-05-06', '1993-01-01', '1994-06-15') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_23_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_23_rewrite_sqlite.sql index 05f9bc494..156a1de25 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_23_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_23_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - DATE(c_birthday, '+472 days') <= DATE('1991-11-15') + NOT c_birthday IN ('1991-03-13', '1992-05-06', '1993-01-01', '1994-06-15') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_25_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_25_rewrite_sqlite.sql index 2ef5a72af..5b28def34 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_25_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_25_rewrite_sqlite.sql @@ -2,5 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - DATE(c_birthday, '+472 days') IS NULL - OR c_birthday <> DATE('1991-11-15', '-472 days') + DATE(c_birthday, '+472 days') IS NULL OR c_birthday <> '1990-07-31' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_26_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_26_rewrite_sqlite.sql index 1b6629cc3..853eb7d65 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_26_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_26_rewrite_sqlite.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM crbnk.customers WHERE - c_phone = REPLACE(REPLACE(REPLACE('555-123-456', '0', '*'), '9', '0'), '*', '9') + FALSE diff --git a/tests/test_sql_refsols/cryptbank_filter_count_27_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_27_rewrite_sqlite.sql 
index 8e7e601b6..f88bf4b3d 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_27_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_27_rewrite_sqlite.sql @@ -3,37 +3,34 @@ SELECT FROM crbnk.customers WHERE ( - DATE(c_birthday, '+472 days') IS NULL - OR LOWER(c_fname) LIKE '%a' - OR LOWER(c_fname) LIKE '%e' - OR LOWER(c_fname) LIKE '%s' - ) - AND ( DATE(c_birthday, '+472 days') IS NULL OR NOT ( SUBSTRING(c_addr, -1) || SUBSTRING(c_addr, 1, LENGTH(c_addr) - 1) ) IS NULL ) AND ( - DATE(c_birthday, '+472 days') IS NULL OR c_lname <> UPPER('lopez') + DATE(c_birthday, '+472 days') IS NULL + OR c_fname IN ('ALICE', 'GRACE', 'LUKE', 'MARIA', 'OLIVIA', 'QUEENIE', 'SOPHIA') + OR c_fname IN ('JAMES', 'NICHOLAS', 'THOMAS') ) AND ( - LOWER(c_fname) LIKE '%a' - OR LOWER(c_fname) LIKE '%e' - OR LOWER(c_fname) LIKE '%s' - OR REPLACE(REPLACE(REPLACE(c_phone, '9', '*'), '0', '9'), '*', '0') LIKE '%5' + DATE(c_birthday, '+472 days') IS NULL OR c_lname <> 'LOPEZ' ) AND ( NOT ( SUBSTRING(c_addr, -1) || SUBSTRING(c_addr, 1, LENGTH(c_addr) - 1) ) IS NULL - OR REPLACE(REPLACE(REPLACE(c_phone, '9', '*'), '0', '9'), '*', '0') LIKE '%5' + OR c_phone IN ('555-091-2345', '555-901-2345') ) AND ( NOT DATE(c_birthday, '+472 days') IS NULL - OR REPLACE(REPLACE(REPLACE(c_phone, '9', '*'), '0', '9'), '*', '0') LIKE '%5' + OR c_phone IN ('555-091-2345', '555-901-2345') + ) + AND ( + c_fname IN ('ALICE', 'GRACE', 'LUKE', 'MARIA', 'OLIVIA', 'QUEENIE', 'SOPHIA') + OR c_fname IN ('JAMES', 'NICHOLAS', 'THOMAS') + OR c_phone IN ('555-091-2345', '555-901-2345') ) AND ( - REPLACE(REPLACE(REPLACE(c_phone, '9', '*'), '0', '9'), '*', '0') LIKE '%5' - OR c_lname <> UPPER('lopez') + c_lname <> 'LOPEZ' OR c_phone IN ('555-091-2345', '555-901-2345') ) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_28_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_28_rewrite_sqlite.sql index 0be30d552..188eb8e4c 100644 --- a/tests/test_sql_refsols/cryptbank_filter_count_28_rewrite_sqlite.sql +++ b/tests/test_sql_refsols/cryptbank_filter_count_28_rewrite_sqlite.sql @@ -16,11 +16,4 @@ JOIN crbnk.customers AS customers WHERE CAST(STRFTIME('%Y', DATETIME(accounts.a_open_ts, '+123456789 seconds')) AS INTEGER) < 2020 AND SQRT(accounts.a_balance) >= 5000 - AND ( - accounts.a_type = ( - SUBSTRING('retirement', 2) || SUBSTRING('retirement', 1, 1) - ) - OR accounts.a_type = ( - SUBSTRING('savings', 2) || SUBSTRING('savings', 1, 1) - ) - ) + AND accounts.a_type IN ('avingss', 'etirementr') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_29_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_29_raw_sqlite.sql new file mode 100644 index 000000000..705cae931 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_29_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + DATE(c_birthday, '+472 days') <= DATE('1925-01-01') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_29_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_29_rewrite_sqlite.sql new file mode 100644 index 000000000..853eb7d65 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_29_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + FALSE diff --git a/tests/test_sql_refsols/cryptbank_filter_count_30_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_30_raw_sqlite.sql new file mode 100644 index 000000000..8567c4aab --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_30_raw_sqlite.sql @@ 
-0,0 +1,10 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + ( + CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER) - 2 + ) IN (1975, 1977, 1979, 1981, 1983, 1985, 1987, 1989, 1991, 1993) + AND ( + CAST(STRFTIME('%m', DATE(c_birthday, '+472 days')) AS INTEGER) + 1 + ) IN (2, 4, 6, 8, 10, 12) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_30_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_30_rewrite_sqlite.sql new file mode 100644 index 000000000..e10e5d7dd --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_30_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_birthday IN ('1980-01-18', '1981-11-15', '1990-07-31', '1994-06-15') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_31_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_31_raw_sqlite.sql new file mode 100644 index 000000000..8c81871f2 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_31_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + DATE(c_birthday, '+472 days') IN ('1991-11-15', '1978-02-11', '2005-03-14', '1985-04-12') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_31_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_31_rewrite_sqlite.sql new file mode 100644 index 000000000..b82d51921 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_31_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_birthday IN ('1990-07-31', '1976-10-27', '1983-12-27') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_32_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_32_raw_sqlite.sql new file mode 100644 index 000000000..24ffe436b --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_32_raw_sqlite.sql @@ -0,0 +1,6 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + DATETIME(a_open_ts, '+123456789 seconds') <= '2020-09-20 08:30:00' + AND DATETIME(a_open_ts, '+123456789 seconds') >= '2020-03-28 09:20:00' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_32_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_32_rewrite_sqlite.sql new file mode 100644 index 000000000..27e9db986 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_32_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + a_open_ts IN ('2016-04-29 11:46:51', '2016-06-10 12:56:51', '2016-07-20 15:46:51', '2016-08-22 10:41:51', '2016-09-03 12:01:51') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_33_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_33_raw_sqlite.sql new file mode 100644 index 000000000..bd44fe280 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_33_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) IN (1, 2, 3) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_33_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_33_rewrite_sqlite.sql new file mode 100644 index 000000000..2e606aacb --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_33_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + a_open_ts IN ('2013-04-22 11:37:51', '2017-02-11 10:59:51', '2011-04-30 15:16:51', '2016-03-23 12:41:51', '2013-02-15 12:46:51', '2018-03-15 10:36:51', '2014-04-07 14:21:51', '2015-02-08 17:26:51', 
'2016-04-29 11:46:51', '2012-03-22 12:16:51', '2015-04-06 13:46:51') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_34_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_34_raw_sqlite.sql new file mode 100644 index 000000000..d750b8bd4 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_34_raw_sqlite.sql @@ -0,0 +1,18 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + CASE + WHEN CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) <= 3 + AND CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) >= 1 + THEN 1 + WHEN CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) <= 6 + AND CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) >= 4 + THEN 2 + WHEN CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) <= 9 + AND CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) >= 7 + THEN 3 + WHEN CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) <= 12 + AND CAST(STRFTIME('%m', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) >= 10 + THEN 4 + END = CAST(STRFTIME('%d', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_34_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_34_rewrite_sqlite.sql new file mode 100644 index 000000000..30fcdcb3b --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_34_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + a_open_ts = '2015-05-04 18:01:51' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_35_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_35_raw_sqlite.sql new file mode 100644 index 000000000..00039a869 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_35_raw_sqlite.sql @@ -0,0 +1,6 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + CAST(STRFTIME('%H', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) < 10 + AND CAST(STRFTIME('%M', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) < 20 diff --git a/tests/test_sql_refsols/cryptbank_filter_count_35_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_35_rewrite_sqlite.sql new file mode 100644 index 000000000..600292e56 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_35_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + a_open_ts IN ('2013-04-22 11:37:51', '2017-09-15 11:26:51', '2018-03-15 10:36:51', '2014-05-23 11:31:51', '2016-08-22 10:41:51', '2014-08-15 11:31:51') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_36_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_36_raw_sqlite.sql new file mode 100644 index 000000000..b88b20918 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_36_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.transactions +WHERE + CAST(STRFTIME('%S', DATETIME(t_ts, '+54321 seconds')) AS INTEGER) = 23 diff --git a/tests/test_sql_refsols/cryptbank_filter_count_36_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_36_rewrite_sqlite.sql new file mode 100644 index 000000000..66a0fd2a5 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_36_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.transactions +WHERE + t_ts IN ('2020-11-11 09:03:02', '2023-09-15 09:00:02', '2024-07-21 23:24:02') diff --git 
a/tests/test_sql_refsols/cryptbank_filter_count_37_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_37_raw_sqlite.sql new file mode 100644 index 000000000..063179e95 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_37_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + ABS(SQRT(a_balance) - 7250) <= 600 AND ABS(SQRT(a_balance) - 7250) >= 200 diff --git a/tests/test_sql_refsols/cryptbank_filter_count_37_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_37_rewrite_sqlite.sql new file mode 100644 index 000000000..c80f475e1 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_37_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + a_balance IN (46240000.0, 57760000.0) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_38_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_38_raw_sqlite.sql new file mode 100644 index 000000000..d5c4aed7f --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_38_raw_sqlite.sql @@ -0,0 +1,14 @@ +WITH _t1 AS ( + SELECT + a_open_ts + FROM crbnk.accounts + WHERE + MAX( + CAST(STRFTIME('%H', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER), + CAST(STRFTIME('%M', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER), + CAST(STRFTIME('%S', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) + ) = 10 +) +SELECT + COUNT(*) AS n +FROM _t1 diff --git a/tests/test_sql_refsols/cryptbank_filter_count_38_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_38_rewrite_sqlite.sql new file mode 100644 index 000000000..d56c032b4 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_38_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + a_open_ts IN ('2018-03-15 10:36:51', '2018-01-02 12:26:51') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_39_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_39_raw_sqlite.sql new file mode 100644 index 000000000..5aab54336 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_39_raw_sqlite.sql @@ -0,0 +1,13 @@ +WITH _t1 AS ( + SELECT + a_open_ts + FROM crbnk.accounts + WHERE + MIN( + CAST(STRFTIME('%H', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER), + CAST(STRFTIME('%M', DATETIME(a_open_ts, '+123456789 seconds')) AS INTEGER) + ) = 15 +) +SELECT + COUNT(*) AS n +FROM _t1 diff --git a/tests/test_sql_refsols/cryptbank_filter_count_39_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_39_rewrite_sqlite.sql new file mode 100644 index 000000000..dad8534d8 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_39_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.accounts +WHERE + a_open_ts IN ('2015-08-10 18:11:51', '2015-05-04 18:01:51', '2015-10-19 18:11:51', '2014-10-03 17:41:51') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_40_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_40_raw_sqlite.sql new file mode 100644 index 000000000..f1c7b44c0 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_40_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + CONCAT_WS('', '1-', REPLACE(REPLACE(REPLACE(c_phone, '9', '*'), '0', '9'), '*', '0')) LIKE '%1-5%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_40_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_40_rewrite_sqlite.sql new file mode 100644 index 000000000..c9724c3af --- /dev/null +++ 
b/tests/test_sql_refsols/cryptbank_filter_count_40_rewrite_sqlite.sql @@ -0,0 +1,3 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers diff --git a/tests/test_sql_refsols/cryptbank_filter_count_41_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_41_raw_sqlite.sql new file mode 100644 index 000000000..3d4f0db7b --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_41_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + CONCAT_WS('-', '1', REPLACE(REPLACE(REPLACE(c_phone, '9', '*'), '0', '9'), '*', '0')) LIKE '%1-5%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_41_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_41_rewrite_sqlite.sql new file mode 100644 index 000000000..c9724c3af --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_41_rewrite_sqlite.sql @@ -0,0 +1,3 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers diff --git a/tests/test_sql_refsols/cryptbank_filter_count_42_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_42_raw_sqlite.sql new file mode 100644 index 000000000..f5b1f84a4 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_42_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + CONCAT_WS('-', '1', REPLACE(REPLACE(REPLACE(c_phone, '9', '*'), '0', '9'), '*', '0'), '1') LIKE '%5-1%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_42_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_42_rewrite_sqlite.sql new file mode 100644 index 000000000..2013f54c7 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_42_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_phone IN ('555-112-3456', '555-901-2345', '555-091-2345', '555-123-4567') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_43_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_43_raw_sqlite.sql new file mode 100644 index 000000000..54de7672d --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_43_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + CONCAT_WS(' ', LOWER(c_fname), LOWER(c_lname)) = 'olivia anderson' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_43_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_43_rewrite_sqlite.sql new file mode 100644 index 000000000..54de7672d --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_43_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + CONCAT_WS(' ', LOWER(c_fname), LOWER(c_lname)) = 'olivia anderson' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_44_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_44_raw_sqlite.sql new file mode 100644 index 000000000..8fd719c0c --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_44_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + COALESCE(CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER), 1990) IN (1990, 1991) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_44_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_44_rewrite_sqlite.sql new file mode 100644 index 000000000..c51703ad0 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_44_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_birthday IN ('1990-07-31', '1989-04-07') OR c_birthday IS NULL diff --git 
a/tests/test_sql_refsols/cryptbank_filter_count_45_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_45_raw_sqlite.sql new file mode 100644 index 000000000..919cc9063 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_45_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + COALESCE(CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER), 1990) IN (1990, 2005) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_45_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_45_rewrite_sqlite.sql new file mode 100644 index 000000000..14d064359 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_45_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_birthday = '1989-04-07' OR c_birthday IS NULL diff --git a/tests/test_sql_refsols/cryptbank_filter_count_46_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_46_raw_sqlite.sql new file mode 100644 index 000000000..0142b992f --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_46_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + COALESCE(CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER), 2005) IN (2005, 2006) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_46_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_46_rewrite_sqlite.sql new file mode 100644 index 000000000..344e0ac38 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_46_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_birthday IS NULL diff --git a/tests/test_sql_refsols/cryptbank_filter_count_47_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_47_raw_sqlite.sql new file mode 100644 index 000000000..1be1a96ad --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_47_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + NOT COALESCE(CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER), 1990) IN (1990, 1991) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_47_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_47_rewrite_sqlite.sql new file mode 100644 index 000000000..de41664e9 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_47_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + NOT c_birthday IN ('1990-07-31', '1989-04-07') AND NOT c_birthday IS NULL diff --git a/tests/test_sql_refsols/cryptbank_filter_count_48_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_48_raw_sqlite.sql new file mode 100644 index 000000000..fe7d8295f --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_48_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + NOT COALESCE(CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER), 1990) IN (1990, 2005) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_48_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_48_rewrite_sqlite.sql new file mode 100644 index 000000000..95c463ed7 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_48_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + NOT c_birthday IS NULL AND c_birthday <> '1989-04-07' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_49_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_49_raw_sqlite.sql new file mode 100644 
index 000000000..57cc1e62a --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_49_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + NOT COALESCE(CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER), 2005) IN (2005, 2006) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_49_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_49_rewrite_sqlite.sql new file mode 100644 index 000000000..2dfd1a393 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_49_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + NOT c_birthday IS NULL diff --git a/tests/test_sql_refsols/cryptbank_filter_count_50_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_50_raw_sqlite.sql new file mode 100644 index 000000000..57cc1e62a --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_50_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + NOT COALESCE(CAST(STRFTIME('%Y', DATE(c_birthday, '+472 days')) AS INTEGER), 2005) IN (2005, 2006) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_50_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_50_rewrite_sqlite.sql new file mode 100644 index 000000000..2dfd1a393 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_50_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + NOT c_birthday IS NULL diff --git a/tests/test_sql_refsols/cryptbank_filter_count_51_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_51_raw_sqlite.sql new file mode 100644 index 000000000..9a09159a5 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_51_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + IIF(SUBSTRING(LOWER(c_fname), 1, 1) IN ('q', 'r', 's'), LOWER(c_fname), LOWER(c_lname)) LIKE '%ee%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_51_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_51_rewrite_sqlite.sql new file mode 100644 index 000000000..f11735f74 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_51_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + IIF(c_fname IN ('QUEENIE', 'ROBERT', 'SOPHIA'), LOWER(c_fname), LOWER(c_lname)) LIKE '%ee%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_52_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_52_raw_sqlite.sql new file mode 100644 index 000000000..43699c0d4 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_52_raw_sqlite.sql @@ -0,0 +1,7 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + LOWER(c_lname) LIKE ( + '%' || CONCAT_WS('', 'e', IIF(SUBSTRING(LOWER(c_lname), 1, 1) IN ('q', 'r', 's'), 'z', 'e')) || '%' + ) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_52_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_52_rewrite_sqlite.sql new file mode 100644 index 000000000..c896fffa0 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_52_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_lname IN ('LEE', 'RODRIGUEZ') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_53_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_53_raw_sqlite.sql new file mode 100644 index 000000000..d91509f71 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_53_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + 
COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING(LOWER(c_fname), 1, 1) = 'i' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_53_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_53_rewrite_sqlite.sql new file mode 100644 index 000000000..3db7a0281 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_53_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_fname = 'ISABEL' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_54_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_54_raw_sqlite.sql new file mode 100644 index 000000000..7c261ae4b --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_54_raw_sqlite.sql @@ -0,0 +1,16 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING( + LOWER(c_fname), + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + 0 + ) < 1 + THEN 1 + ELSE ( + LENGTH(LOWER(c_fname)) + 0 + ) + END + ) IN ('a', 'e', 'i', 'o', 'u') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_54_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_54_rewrite_sqlite.sql new file mode 100644 index 000000000..7c261ae4b --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_54_rewrite_sqlite.sql @@ -0,0 +1,16 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING( + LOWER(c_fname), + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + 0 + ) < 1 + THEN 1 + ELSE ( + LENGTH(LOWER(c_fname)) + 0 + ) + END + ) IN ('a', 'e', 'i', 'o', 'u') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_55_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_55_raw_sqlite.sql new file mode 100644 index 000000000..e7a88d0e3 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_55_raw_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING(LOWER(c_fname), 2, 2) IN ('ar', 'li', 'ra', 'to', 'am') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_55_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_55_rewrite_sqlite.sql new file mode 100644 index 000000000..81431699b --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_55_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_fname IN ('ALICE', 'CAROL', 'FRANK', 'GRACE', 'JAMES', 'KAREN', 'MARIA', 'OLIVIA') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_56_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_56_raw_sqlite.sql new file mode 100644 index 000000000..61af191d7 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_56_raw_sqlite.sql @@ -0,0 +1,33 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING( + LOWER(c_fname), + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + -1 + ) < 1 + THEN 1 + ELSE ( + LENGTH(LOWER(c_fname)) + -1 + ) + END, + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + 0 + ) < 1 + THEN 0 + ELSE ( + LENGTH(LOWER(c_fname)) + 0 + ) - CASE + WHEN ( + LENGTH(LOWER(c_fname)) + -1 + ) < 1 + THEN 1 + ELSE ( + LENGTH(LOWER(c_fname)) + -1 + ) + END + END + ) IN ('a', 'c', 'l') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_56_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_56_rewrite_sqlite.sql new file mode 100644 index 000000000..61af191d7 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_56_rewrite_sqlite.sql @@ -0,0 +1,33 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING( + LOWER(c_fname), + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + -1 + ) < 1 + THEN 1 + 
ELSE ( + LENGTH(LOWER(c_fname)) + -1 + ) + END, + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + 0 + ) < 1 + THEN 0 + ELSE ( + LENGTH(LOWER(c_fname)) + 0 + ) - CASE + WHEN ( + LENGTH(LOWER(c_fname)) + -1 + ) < 1 + THEN 1 + ELSE ( + LENGTH(LOWER(c_fname)) + -1 + ) + END + END + ) IN ('a', 'c', 'l') diff --git a/tests/test_sql_refsols/cryptbank_filter_count_57_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_57_raw_sqlite.sql new file mode 100644 index 000000000..47c6438a4 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_57_raw_sqlite.sql @@ -0,0 +1,17 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING( + LOWER(c_fname), + 1, + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + -1 + ) < 0 + THEN 0 + ELSE ( + LENGTH(LOWER(c_fname)) + -1 + ) + END + ) LIKE '%e%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_57_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_57_rewrite_sqlite.sql new file mode 100644 index 000000000..47c6438a4 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_57_rewrite_sqlite.sql @@ -0,0 +1,17 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING( + LOWER(c_fname), + 1, + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + -1 + ) < 0 + THEN 0 + ELSE ( + LENGTH(LOWER(c_fname)) + -1 + ) + END + ) LIKE '%e%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_58_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_58_raw_sqlite.sql new file mode 100644 index 000000000..f2068d2ab --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_58_raw_sqlite.sql @@ -0,0 +1,25 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING( + LOWER(c_fname), + 2, + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + 0 + ) < 1 + THEN 0 + ELSE CASE + WHEN ( + ( + LENGTH(LOWER(c_fname)) + 0 + ) - 2 + ) <= 0 + THEN 0 + ELSE ( + LENGTH(LOWER(c_fname)) + 0 + ) - 2 + END + END + ) LIKE '%e%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_58_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_58_rewrite_sqlite.sql new file mode 100644 index 000000000..f2068d2ab --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_58_rewrite_sqlite.sql @@ -0,0 +1,25 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + SUBSTRING( + LOWER(c_fname), + 2, + CASE + WHEN ( + LENGTH(LOWER(c_fname)) + 0 + ) < 1 + THEN 0 + ELSE CASE + WHEN ( + ( + LENGTH(LOWER(c_fname)) + 0 + ) - 2 + ) <= 0 + THEN 0 + ELSE ( + LENGTH(LOWER(c_fname)) + 0 + ) - 2 + END + END + ) LIKE '%e%' diff --git a/tests/test_sql_refsols/cryptbank_filter_count_59_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_59_raw_sqlite.sql new file mode 100644 index 000000000..16eff450d --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_59_raw_sqlite.sql @@ -0,0 +1,7 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + 'SLICE' LIKE ( + '%' || UPPER(SUBSTRING(LOWER(c_fname), 1, 1)) || '%' + ) diff --git a/tests/test_sql_refsols/cryptbank_filter_count_59_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_filter_count_59_rewrite_sqlite.sql new file mode 100644 index 000000000..e35ec65cc --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_filter_count_59_rewrite_sqlite.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM crbnk.customers +WHERE + c_fname IN ('CAROL', 'EMILY', 'ISABEL', 'LUKE', 'SOPHIA') diff --git a/tests/test_sql_refsols/cryptbank_window_01_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_window_01_raw_sqlite.sql new file mode 100644 index 000000000..d394bcb5a --- /dev/null 
+++ b/tests/test_sql_refsols/cryptbank_window_01_raw_sqlite.sql @@ -0,0 +1,26 @@ +WITH _t AS ( + SELECT + a_balance, + a_key, + a_type, + ROW_NUMBER() OVER (PARTITION BY ( + SUBSTRING(a_type, -1) || SUBSTRING(a_type, 1, LENGTH(a_type) - 1) + ) = 'retirement' + OR ( + SUBSTRING(a_type, -1) || SUBSTRING(a_type, 1, LENGTH(a_type) - 1) + ) = 'savings' ORDER BY SQRT(a_balance) DESC) AS _w + FROM crbnk.accounts +) +SELECT + SUBSTRING(a_type, -1) || SUBSTRING(a_type, 1, LENGTH(a_type) - 1) AS account_type, + CASE + WHEN a_key = 0 + THEN 0 + ELSE CASE WHEN a_key > 0 THEN 1 ELSE -1 END * CAST(SUBSTRING(a_key, 1 + INSTR(a_key, '-'), CAST(LENGTH(a_key) AS REAL) / 2) AS INTEGER) + END AS key, + SQRT(a_balance) AS balance +FROM _t +WHERE + _w = 1 +ORDER BY + 1 diff --git a/tests/test_sql_refsols/cryptbank_window_01_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_window_01_rewrite_sqlite.sql new file mode 100644 index 000000000..8fa8b7309 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_window_01_rewrite_sqlite.sql @@ -0,0 +1,21 @@ +WITH _t AS ( + SELECT + a_balance, + a_key, + a_type, + ROW_NUMBER() OVER (PARTITION BY a_type IN ('avingss', 'etirementr') ORDER BY SQRT(a_balance) DESC) AS _w + FROM crbnk.accounts +) +SELECT + SUBSTRING(a_type, -1) || SUBSTRING(a_type, 1, LENGTH(a_type) - 1) AS account_type, + CASE + WHEN a_key = 0 + THEN 0 + ELSE CASE WHEN a_key > 0 THEN 1 ELSE -1 END * CAST(SUBSTRING(a_key, 1 + INSTR(a_key, '-'), CAST(LENGTH(a_key) AS REAL) / 2) AS INTEGER) + END AS key, + SQRT(a_balance) AS balance +FROM _t +WHERE + _w = 1 +ORDER BY + 1 diff --git a/tests/test_sql_refsols/cryptbank_window_02_raw_sqlite.sql b/tests/test_sql_refsols/cryptbank_window_02_raw_sqlite.sql new file mode 100644 index 000000000..f06faa358 --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_window_02_raw_sqlite.sql @@ -0,0 +1,33 @@ +WITH _t AS ( + SELECT + accounts.a_key, + accounts.a_open_ts, + branches.b_name, + ROW_NUMBER() OVER (PARTITION BY accounts.a_branchkey ORDER BY CAST(STRFTIME('%Y', DATETIME(accounts.a_open_ts, '+123456789 seconds')) AS INTEGER) = 2021, CASE + WHEN accounts.a_key = 0 + THEN 0 + ELSE CASE WHEN accounts.a_key > 0 THEN 1 ELSE -1 END * CAST(SUBSTRING( + accounts.a_key, + 1 + INSTR(accounts.a_key, '-'), + CAST(LENGTH(accounts.a_key) AS REAL) / 2 + ) AS INTEGER) + END) AS _w + FROM crbnk.branches AS branches + JOIN crbnk.accounts AS accounts + ON accounts.a_branchkey = branches.b_key + WHERE + branches.b_addr LIKE '%;CA;%' +) +SELECT + b_name AS branch_name, + CASE + WHEN a_key = 0 + THEN 0 + ELSE CASE WHEN a_key > 0 THEN 1 ELSE -1 END * CAST(SUBSTRING(a_key, 1 + INSTR(a_key, '-'), CAST(LENGTH(a_key) AS REAL) / 2) AS INTEGER) + END AS key, + DATETIME(a_open_ts, '+123456789 seconds') AS creation_timestamp +FROM _t +WHERE + _w = 1 +ORDER BY + 1 diff --git a/tests/test_sql_refsols/cryptbank_window_02_rewrite_sqlite.sql b/tests/test_sql_refsols/cryptbank_window_02_rewrite_sqlite.sql new file mode 100644 index 000000000..718ec27bc --- /dev/null +++ b/tests/test_sql_refsols/cryptbank_window_02_rewrite_sqlite.sql @@ -0,0 +1,33 @@ +WITH _t AS ( + SELECT + accounts.a_key, + accounts.a_open_ts, + branches.b_name, + ROW_NUMBER() OVER (PARTITION BY accounts.a_branchkey ORDER BY accounts.a_open_ts IN ('2017-02-11 10:59:51', '2017-06-15 12:41:51', '2017-07-07 14:26:51', '2017-07-09 12:21:51', '2017-09-15 11:26:51', '2018-01-02 12:26:51'), CASE + WHEN accounts.a_key = 0 + THEN 0 + ELSE CASE WHEN accounts.a_key > 0 THEN 1 ELSE -1 END * CAST(SUBSTRING( + accounts.a_key, + 1 + 
INSTR(accounts.a_key, '-'), + CAST(LENGTH(accounts.a_key) AS REAL) / 2 + ) AS INTEGER) + END) AS _w + FROM crbnk.branches AS branches + JOIN crbnk.accounts AS accounts + ON accounts.a_branchkey = branches.b_key + WHERE + branches.b_addr LIKE '%;CA;%' +) +SELECT + b_name AS branch_name, + CASE + WHEN a_key = 0 + THEN 0 + ELSE CASE WHEN a_key > 0 THEN 1 ELSE -1 END * CAST(SUBSTRING(a_key, 1 + INSTR(a_key, '-'), CAST(LENGTH(a_key) AS REAL) / 2) AS INTEGER) + END AS key, + DATETIME(a_open_ts, '+123456789 seconds') AS creation_timestamp +FROM _t +WHERE + _w = 1 +ORDER BY + 1 diff --git a/tests/test_sql_refsols/fsi_accounts_customers_compound_a_rewrite_snowflake.sql b/tests/test_sql_refsols/fsi_accounts_customers_compound_a_rewrite_snowflake.sql index fffd63196..8b1649d7a 100644 --- a/tests/test_sql_refsols/fsi_accounts_customers_compound_a_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/fsi_accounts_customers_compound_a_rewrite_snowflake.sql @@ -3,6 +3,6 @@ SELECT FROM bodo.fsi.accounts AS accounts JOIN bodo.fsi.protected_customers AS protected_customers ON PTY_UNPROTECT(protected_customers.customerid, 'deAccount') = PTY_UNPROTECT_ACCOUNT(accounts.customerid) - AND protected_customers.state = PTY_PROTECT('California', 'deAddress') + AND protected_customers.state = 'V6kSQBaqGv' WHERE - accounts.balance < 20000 AND accounts.currency <> PTY_PROTECT('GBP', 'deAccount') + accounts.balance < 20000 AND accounts.currency IN ('jpb', 'gFr') diff --git a/tests/test_sql_refsols/fsi_accounts_customers_compound_b_rewrite_snowflake.sql b/tests/test_sql_refsols/fsi_accounts_customers_compound_b_rewrite_snowflake.sql index 0e2d85f72..53ca14cce 100644 --- a/tests/test_sql_refsols/fsi_accounts_customers_compound_b_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/fsi_accounts_customers_compound_b_rewrite_snowflake.sql @@ -2,9 +2,9 @@ SELECT COUNT(*) AS n FROM bodo.fsi.accounts AS accounts JOIN bodo.fsi.protected_customers AS protected_customers - ON NOT protected_customers.firstname IN (PTY_PROTECT('Jennifer', 'deName'), PTY_PROTECT('Julio', 'deName'), PTY_PROTECT('Johnson', 'deName'), PTY_PROTECT('Jameson', 'deName'), PTY_PROTECT('Michael', 'deName'), PTY_PROTECT('Robert', 'deName')) + ON NOT protected_customers.firstname IN ('tzuhpuCF', 'cPBnsOl', 'NVGimP') AND PTY_UNPROTECT(protected_customers.customerid, 'deAccount') = PTY_UNPROTECT_ACCOUNT(accounts.customerid) - AND protected_customers.state IN (PTY_PROTECT('Georgia', 'deAddress'), PTY_PROTECT('Alabama', 'deAddress'), PTY_PROTECT('Mississippi', 'deAddress'), PTY_PROTECT('Arkansas', 'deAddress'), PTY_PROTECT('Louisiana', 'deAddress'), PTY_PROTECT('Florida', 'deAddress'), PTY_PROTECT('South Carolina', 'deAddress'), PTY_PROTECT('North Carolina', 'deAddress'), PTY_PROTECT('Texas', 'deAddress'), PTY_PROTECT('Tennessee', 'deAddress'), PTY_PROTECT('Missouri', 'deAddress')) + AND protected_customers.state IN ('EdJ6cty', 'raXuWJGK', '4o0uuG1', 'FvlL1x8', 'TY84qyAxy', 'AqjyPuvoU8d', 'q6OaWD9X', 'MZBK0 U3nQzZbb', 'lN1sA AANifXzd', 'JXtZBpRhT', 'YYE75') WHERE YEAR(CAST(PTY_UNPROTECT_DOB(accounts.createddate) AS TIMESTAMP)) <= 2022 - AND accounts.currency IN (PTY_PROTECT('USD', 'deAccount'), PTY_PROTECT('GPB', 'deAccount'), PTY_PROTECT('EUR', 'deAccount'), PTY_PROTECT('JPY', 'deAccount'), PTY_PROTECT('AUD', 'deAccount')) + AND accounts.currency IN ('jpb', 'gFr') diff --git a/tests/test_sql_refsols/fsi_accounts_customers_compound_c_rewrite_snowflake.sql b/tests/test_sql_refsols/fsi_accounts_customers_compound_c_rewrite_snowflake.sql index 8c5b82203..7074bd30f 100644 
--- a/tests/test_sql_refsols/fsi_accounts_customers_compound_c_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/fsi_accounts_customers_compound_c_rewrite_snowflake.sql @@ -2,21 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.fsi.accounts WHERE - ( - PTY_UNPROTECT_DOB(createddate) <= '2020-03-13' - OR PTY_UNPROTECT_DOB(createddate) >= '2022-12-25' - ) - AND ( - PTY_UNPROTECT_DOB(createddate) <= '2023-01-15' - OR PTY_UNPROTECT_DOB(createddate) >= '2024-08-04' - ) - AND ( - PTY_UNPROTECT_DOB(createddate) <= '2024-11-08' - OR PTY_UNPROTECT_DOB(createddate) >= '2022-12-25' - ) - AND ( - PTY_UNPROTECT_DOB(createddate) <= '2024-11-08' - OR PTY_UNPROTECT_DOB(createddate) >= '2025-06-07' - ) - AND PTY_UNPROTECT_DOB(createddate) <= '2026-03-07' - AND PTY_UNPROTECT_DOB(createddate) >= '2020-01-31' + createddate IN ('3149-05-04', '1478-09-27', '2396-11-12', '0714-10-12', '2461-03-25', '2326-07-19', '2883-05-12', '1368-06-18', '2386-05-20', '2241-06-04', '2413-07-10', '1464-06-25', '2308-05-18', '2690-01-11', '0937-05-21', '0794-10-27', '2856-02-06', '1335-02-11', '1605-10-12', '2456-12-12', '1610-12-28', '1267-04-15', '2133-09-29', '3337-02-07', '1403-12-19', '1484-05-22') diff --git a/tests/test_sql_refsols/fsi_customers_accounts_join_rewrite_snowflake.sql b/tests/test_sql_refsols/fsi_customers_accounts_join_rewrite_snowflake.sql index eb9530c20..2ddee36af 100644 --- a/tests/test_sql_refsols/fsi_customers_accounts_join_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/fsi_customers_accounts_join_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS num_customers_checking_accounts FROM bodo.fsi.accounts WHERE - accounttype <> PTY_PROTECT('checking', 'deAccount') + accounttype IN ('HPlnssRN', 'XADfRcm') diff --git a/tests/test_sql_refsols/fsi_customers_filter_isin_rewrite_snowflake.sql b/tests/test_sql_refsols/fsi_customers_filter_isin_rewrite_snowflake.sql index a3438c207..8a4faefad 100644 --- a/tests/test_sql_refsols/fsi_customers_filter_isin_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/fsi_customers_filter_isin_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.fsi.protected_customers WHERE - lastname IN (PTY_PROTECT_NAME('Barnes'), PTY_PROTECT_NAME('Hernandez'), PTY_PROTECT_NAME('Moore')) + lastname IN ('CyypMP', 'TlwQYRsjl', 'SmfgY') diff --git a/tests/test_sql_refsols/fsi_customers_filter_not_isin_rewrite_snowflake.sql b/tests/test_sql_refsols/fsi_customers_filter_not_isin_rewrite_snowflake.sql index 3c0e5c76e..35adebf6f 100644 --- a/tests/test_sql_refsols/fsi_customers_filter_not_isin_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/fsi_customers_filter_not_isin_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.fsi.protected_customers WHERE - NOT lastname IN (PTY_PROTECT_NAME('Barnes'), PTY_PROTECT_NAME('Hernandez'), PTY_PROTECT_NAME('Moore')) + NOT lastname IN ('CyypMP', 'TlwQYRsjl', 'SmfgY') diff --git a/tests/test_sql_refsols/health_claims_filter_day_rewrite_snowflake.sql b/tests/test_sql_refsols/health_claims_filter_day_rewrite_snowflake.sql index 0d2ef7a75..bdfbd0355 100644 --- a/tests/test_sql_refsols/health_claims_filter_day_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/health_claims_filter_day_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.health.claims WHERE - DAY(CAST(PTY_UNPROTECT_DOB(claim_date) AS TIMESTAMP)) = 31 + claim_date IN ('2666-05-02', '2627-10-27', '2896-11-08', '0775-03-22', '1471-09-22', '3175-06-30', '1909-08-08', '2063-10-13', '3095-04-16', '1842-06-18', '1292-11-24', '1324-05-13', '2757-05-10', 
'1415-01-25') diff --git a/tests/test_sql_refsols/retail_all_raw_snowflake.sql b/tests/test_sql_refsols/retail_all_raw_snowflake.sql new file mode 100644 index 000000000..d70eed690 --- /dev/null +++ b/tests/test_sql_refsols/retail_all_raw_snowflake.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM bodo.retail.protected_loyalty_members +WHERE + YEAR(CAST(PTY_UNPROTECT(date_of_birth, 'deDOB') AS TIMESTAMP)) < 2026 diff --git a/tests/test_sql_refsols/retail_all_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_all_rewrite_snowflake.sql new file mode 100644 index 000000000..232c2ec8a --- /dev/null +++ b/tests/test_sql_refsols/retail_all_rewrite_snowflake.sql @@ -0,0 +1,3 @@ +SELECT + COUNT(*) AS n +FROM bodo.retail.protected_loyalty_members diff --git a/tests/test_sql_refsols/retail_members_compound_a_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_members_compound_a_rewrite_snowflake.sql index 3098195a7..df43b2a35 100644 --- a/tests/test_sql_refsols/retail_members_compound_a_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_members_compound_a_rewrite_snowflake.sql @@ -2,5 +2,5 @@ SELECT COUNT(*) AS n FROM bodo.retail.protected_loyalty_members WHERE - PTY_UNPROTECT(date_of_birth, 'deDOB') >= CAST('2002-01-01' AS DATE) - AND last_name IN (PTY_PROTECT('Johnson', 'deName'), PTY_PROTECT('Robinson', 'deName')) + date_of_birth IN ('1922-08-06', '0913-06-11', '2142-09-01', '2006-03-03', '0915-05-17', '1823-12-26', '1722-02-13', '2208-12-06', '2350-04-16', '2973-02-23', '1484-10-19', '1924-10-25', '2544-09-01', '2363-10-31', '2685-03-23', '1040-04-02', '3136-09-15', '1569-07-03', '1804-07-19', '1543-07-16', '2478-02-14', '0983-02-13', '2243-03-06', '2628-10-02', '2064-12-22', '1463-05-18', '1078-01-28', '1125-11-24', '1405-11-12', '3290-02-08', '1278-11-09', '3093-06-09', '1464-06-16', '2613-07-13', '1964-08-20', '1061-01-22', '2797-05-10', '1905-02-26', '1938-07-08', '1535-05-03', '1289-11-13', '1818-01-12', '1073-07-09', '2605-10-18', '1711-07-03', '3018-03-01', '2830-08-29') + AND last_name IN ('xnUVZyS', 'UcoQBfzB') diff --git a/tests/test_sql_refsols/retail_members_compound_b_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_members_compound_b_rewrite_snowflake.sql index 6733bb292..10b3a685e 100644 --- a/tests/test_sql_refsols/retail_members_compound_b_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_members_compound_b_rewrite_snowflake.sql @@ -2,5 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.protected_loyalty_members WHERE - date_of_birth = PTY_PROTECT(CAST('1979-03-07' AS DATE), 'deDOB') - AND last_name <> PTY_PROTECT('Smith', 'deName') + date_of_birth = '1622-10-03' AND last_name <> PTY_PROTECT('Smith', 'deName') diff --git a/tests/test_sql_refsols/retail_members_compound_e_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_members_compound_e_rewrite_snowflake.sql index 99b1a295a..75eab3e4e 100644 --- a/tests/test_sql_refsols/retail_members_compound_e_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_members_compound_e_rewrite_snowflake.sql @@ -2,5 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.protected_loyalty_members WHERE - PTY_UNPROTECT(date_of_birth, 'deDOB') < CAST('1983-01-30' AS DATE) - AND PTY_UNPROTECT(date_of_birth, 'deDOB') >= CAST('1983-01-10' AS DATE) + date_of_birth IN ('2637-10-01', '1403-11-22') diff --git a/tests/test_sql_refsols/retail_members_compound_f_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_members_compound_f_rewrite_snowflake.sql index 0c84a8637..9b69600a5 100644 --- 
a/tests/test_sql_refsols/retail_members_compound_f_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_members_compound_f_rewrite_snowflake.sql @@ -2,5 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.protected_loyalty_members WHERE - PTY_UNPROTECT(date_of_birth, 'deDOB') <= CAST('1976-07-28' AS DATE) - AND PTY_UNPROTECT(date_of_birth, 'deDOB') > CAST('1976-07-01' AS DATE) + date_of_birth IN ('1357-07-11', '0988-09-15') diff --git a/tests/test_sql_refsols/retail_members_compound_g_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_members_compound_g_rewrite_snowflake.sql index 3da3ff9aa..f8d121533 100644 --- a/tests/test_sql_refsols/retail_members_compound_g_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_members_compound_g_rewrite_snowflake.sql @@ -3,6 +3,6 @@ SELECT FROM bodo.retail.protected_loyalty_members WHERE DAY(CAST(PTY_UNPROTECT(date_of_birth, 'deDOB') AS TIMESTAMP)) <= 13 - AND DAY(CAST(PTY_UNPROTECT(date_of_birth, 'deDOB') AS TIMESTAMP)) > 3 AND MONTH(CAST(PTY_UNPROTECT(date_of_birth, 'deDOB') AS TIMESTAMP)) IN (1, 2, 5, 10, 12) + AND NOT date_of_birth IN ('0897-01-11', '0681-01-31', '2337-11-25', '0765-06-07', '3270-09-07', '3114-01-23', '0946-07-28', '0671-06-23', '1030-02-26', '2892-07-01', '1787-09-06', '2191-11-24', '0912-05-28', '1828-09-20', '1318-12-03', '0660-08-20', '1546-05-12', '2064-12-18', '1664-12-03', '0627-05-21', '1348-11-22', '3202-05-20', '0959-04-01', '1397-05-24', '3184-08-05', '2207-02-22', '2388-11-19', '2563-07-20', '3159-09-21', '2692-10-23', '1365-12-07', '1712-02-18', '0846-08-04', '3332-01-06', '2501-07-04', '3297-10-03', '2235-01-19', '2006-03-03', '2544-09-01', '1543-07-16') AND YEAR(CAST(PTY_UNPROTECT(date_of_birth, 'deDOB') AS TIMESTAMP)) IN (1960, 1970, 1980, 1990, 2000) diff --git a/tests/test_sql_refsols/retail_members_compound_h_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_members_compound_h_rewrite_snowflake.sql index 9663cadc8..02fbeefe3 100644 --- a/tests/test_sql_refsols/retail_members_compound_h_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_members_compound_h_rewrite_snowflake.sql @@ -2,5 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.protected_loyalty_members WHERE - PTY_UNPROTECT(date_of_birth, 'deDOB') < CAST('2007-01-01' AS DATE) - AND PTY_UNPROTECT_NAME(last_name) >= 'Cross' + PTY_UNPROTECT_NAME(last_name) >= 'Cross' AND date_of_birth <> '2605-10-18' diff --git a/tests/test_sql_refsols/retail_members_compound_j_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_members_compound_j_rewrite_snowflake.sql index 7e1177fd7..372fff9b1 100644 --- a/tests/test_sql_refsols/retail_members_compound_j_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_members_compound_j_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.protected_loyalty_members WHERE - CONTAINS(LOWER(PTY_UNPROTECT_NAME(last_name)), 'hu') + last_name IN ('jNPacL', 'NIAZ', 'eIVERzXY', 'tREJmG', 'cxyIdcy') diff --git a/tests/test_sql_refsols/retail_members_filter_name_endswith_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_members_filter_name_endswith_rewrite_snowflake.sql index 898dc0225..a8d47e303 100644 --- a/tests/test_sql_refsols/retail_members_filter_name_endswith_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_members_filter_name_endswith_rewrite_snowflake.sql @@ -2,5 +2,5 @@ SELECT COUNT(*) AS n FROM bodo.retail.protected_loyalty_members WHERE - ENDSWITH(PTY_UNPROTECT(first_name, 'deName'), 'e') - OR ENDSWITH(PTY_UNPROTECT_NAME(last_name), 'e') + first_name IN ('CdCPvr', 'EKjOcM', 
'euaiZD', 'DsOlJExPB', 'yyrzPqwYJ', 'veSbKfjZ', 'ltpzJF', 'QlYGrf', 'wrJPcBLnb', 'aPZukW', 'zQhHu', 'rBysMhdxNH', 'xSofz', 'CvHV', 'UhnVJm', 'zixlYsG', 'OXzucS', 'nRhMWQ', 'oKd', 'rASYq', 'mFtb', 'XdaEj', 'StqmwCvYW', 'zPgDshgP', 'OQMeTN', 'fcxiAcj', 'otHnLXhd', 'ZpEzCmV', 'pFTdpMJ', 'eMPChjxY', 'IzjmJq', 'wzDFEL', 'vhZhdhNRf', 'FoEhR', 'RxvEbkd', 'KrdDrun', 'sFBVM') + OR last_name IN ('XuCRC', 'vcVqo', 'xpFpz', 'rpnEFk', 'brcc', 'teibn', 'KLvNYE', 'OgARIx', 'aPZukW', 'RZnrOO', 'LFtAm', 'VTFJ', 'NaTJ', 'gYR', 'SUvctz', 'SmfgY', 'FgeTdq', 'EYAd', 'iPPF', 'LEcnd', 'YYsb', 'wlBDLGE', 'xAzGl') diff --git a/tests/test_sql_refsols/retail_names_analysis_a_raw_snowflake.sql b/tests/test_sql_refsols/retail_names_analysis_a_raw_snowflake.sql new file mode 100644 index 000000000..b90077927 --- /dev/null +++ b/tests/test_sql_refsols/retail_names_analysis_a_raw_snowflake.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM bodo.retail.protected_loyalty_members +WHERE + CONTAINS('GT', UPPER(SUBSTRING(PTY_UNPROTECT(first_name, 'deName'), 1, 1))) diff --git a/tests/test_sql_refsols/retail_names_analysis_a_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_names_analysis_a_rewrite_snowflake.sql new file mode 100644 index 000000000..2a85ffb2e --- /dev/null +++ b/tests/test_sql_refsols/retail_names_analysis_a_rewrite_snowflake.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM bodo.retail.protected_loyalty_members +WHERE + first_name IN ('wrJPcBLnb', 'tfdP', 'aPZukW', 'FllTrn', 'dWFj', 'zQhHu', 'cLxbra', 'iShNn', 'nrvyDT', 'Eikudy', 'dDxAuD', 'RwQZcxw', 'RnYgnn', 'UMwsjSm', 'VjzF', 'lDKVA', 'DAzoEa', 'POnnEr', 'EGBa') diff --git a/tests/test_sql_refsols/retail_names_analysis_b_raw_snowflake.sql b/tests/test_sql_refsols/retail_names_analysis_b_raw_snowflake.sql new file mode 100644 index 000000000..116928e29 --- /dev/null +++ b/tests/test_sql_refsols/retail_names_analysis_b_raw_snowflake.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM bodo.retail.protected_loyalty_members +WHERE + CONTAINS('day', LOWER(SUBSTRING(PTY_UNPROTECT(first_name, 'deName'), 1, 2))) diff --git a/tests/test_sql_refsols/retail_names_analysis_b_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_names_analysis_b_rewrite_snowflake.sql new file mode 100644 index 000000000..394bfbb0d --- /dev/null +++ b/tests/test_sql_refsols/retail_names_analysis_b_rewrite_snowflake.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM bodo.retail.protected_loyalty_members +WHERE + first_name IN ('ZcH', 'MgEZTa', 'veSbKfjZ', 'HBRvO', 'jvUyLK', 'tdfnU') diff --git a/tests/test_sql_refsols/retail_none_raw_snowflake.sql b/tests/test_sql_refsols/retail_none_raw_snowflake.sql new file mode 100644 index 000000000..477aa0fb8 --- /dev/null +++ b/tests/test_sql_refsols/retail_none_raw_snowflake.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM bodo.retail.protected_loyalty_members +WHERE + YEAR(CAST(PTY_UNPROTECT(date_of_birth, 'deDOB') AS TIMESTAMP)) >= 2026 diff --git a/tests/test_sql_refsols/retail_none_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_none_rewrite_snowflake.sql new file mode 100644 index 000000000..f797002a5 --- /dev/null +++ b/tests/test_sql_refsols/retail_none_rewrite_snowflake.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS n +FROM bodo.retail.protected_loyalty_members +WHERE + FALSE diff --git a/tests/test_sql_refsols/retail_transactions_payment_method_cmp_a_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_transactions_payment_method_cmp_a_rewrite_snowflake.sql index caea10607..3153704a5 100644 --- 
a/tests/test_sql_refsols/retail_transactions_payment_method_cmp_a_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_transactions_payment_method_cmp_a_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.transactions WHERE - payment_method = PTY_PROTECT_ACCOUNT('Cash') + payment_method = 'CsNw' diff --git a/tests/test_sql_refsols/retail_transactions_payment_method_cmp_b_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_transactions_payment_method_cmp_b_rewrite_snowflake.sql index 7277069b6..6a6c763ee 100644 --- a/tests/test_sql_refsols/retail_transactions_payment_method_cmp_b_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_transactions_payment_method_cmp_b_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.transactions WHERE - payment_method <> PTY_PROTECT_ACCOUNT('Credit Card') + payment_method IN ('CsNw', 'DwXR YwQL', 'BaGWrt IqJfFoq') diff --git a/tests/test_sql_refsols/retail_transactions_payment_method_cmp_c_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_transactions_payment_method_cmp_c_rewrite_snowflake.sql index f65b42330..a4b1115bd 100644 --- a/tests/test_sql_refsols/retail_transactions_payment_method_cmp_c_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_transactions_payment_method_cmp_c_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.transactions WHERE - payment_method IN (PTY_PROTECT_ACCOUNT('Cash'), PTY_PROTECT_ACCOUNT('Gift Card')) + payment_method IN ('CsNw', 'DwXR YwQL') diff --git a/tests/test_sql_refsols/retail_transactions_payment_method_cmp_d_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_transactions_payment_method_cmp_d_rewrite_snowflake.sql index 73128145f..68a137bb0 100644 --- a/tests/test_sql_refsols/retail_transactions_payment_method_cmp_d_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_transactions_payment_method_cmp_d_rewrite_snowflake.sql @@ -2,4 +2,4 @@ SELECT COUNT(*) AS n FROM bodo.retail.transactions WHERE - NOT payment_method IN (PTY_PROTECT_ACCOUNT('Mobile Payment'), PTY_PROTECT_ACCOUNT('Gift Card')) + payment_method IN ('CsNw', 'JrVjGo Mdvt') diff --git a/tests/test_sql_refsols/retail_transactions_ts_raw_snowflake.sql b/tests/test_sql_refsols/retail_transactions_ts_raw_snowflake.sql new file mode 100644 index 000000000..f4a984e71 --- /dev/null +++ b/tests/test_sql_refsols/retail_transactions_ts_raw_snowflake.sql @@ -0,0 +1,50 @@ +WITH _s0 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 1 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 7 +), _s1 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 2 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 7 +), _s3 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 1 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 8 +), _s5 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 2 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 8 +), _s7 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + ( + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) < 4 + OR MINUTE(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 
SECOND(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) + ) + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) < 3 +) +SELECT + _s0.n_rows AS n1, + _s1.n_rows AS n2, + _s3.n_rows AS n3, + _s5.n_rows AS n4, + _s7.n_rows AS n5 +FROM _s0 AS _s0 +CROSS JOIN _s1 AS _s1 +CROSS JOIN _s3 AS _s3 +CROSS JOIN _s5 AS _s5 +CROSS JOIN _s7 AS _s7 diff --git a/tests/test_sql_refsols/retail_transactions_ts_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_transactions_ts_rewrite_snowflake.sql new file mode 100644 index 000000000..b8cd69ee9 --- /dev/null +++ b/tests/test_sql_refsols/retail_transactions_ts_rewrite_snowflake.sql @@ -0,0 +1,43 @@ +WITH _s0 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + transaction_date IN ('2178-03-20 07:19:29', '2825-09-23 07:37:08') +), _s1 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + transaction_date IN ('1010-12-08 07:23:35', '2328-01-19 07:33:25', '1577-03-20 07:41:29', '1345-03-06 07:41:47', '0937-05-21 07:27:48', '2176-01-07 07:07:03') +), _s3 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + transaction_date IN ('3120-07-22 08:30:44', '1890-02-18 08:21:13', '1890-02-18 08:46:51') +), _s5 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + transaction_date IN ('2052-11-18 08:24:33', '2052-11-18 08:32:00', '1577-03-20 08:03:51', '1577-03-20 08:32:17', '2550-01-17 08:56:44', '1551-03-04 08:36:08') +), _s7 AS ( + SELECT + COUNT(*) AS n_rows + FROM bodo.retail.transactions + WHERE + transaction_date IN ('1752-07-20 01:18:18', '1880-04-06 00:47:47', '2956-09-24 00:03:03', '1868-06-13 01:22:22', '0780-03-23 01:14:14', '1598-04-24 01:11:11', '0763-04-15 00:16:16', '2780-03-19 01:32:32') + OR transaction_date IN ('2268-07-06 01:50:11', '3056-08-07 01:18:26', '3120-07-22 02:43:20', '1010-12-08 01:47:15', '1440-10-15 02:26:30', '3054-12-02 00:51:55', '3031-02-17 00:54:21', '1539-02-23 00:49:34', '2418-09-09 01:12:48', '2418-09-09 02:09:31', '2551-01-12 00:34:57', '3141-01-25 02:24:01', '2328-01-19 01:20:40', '1577-03-20 00:27:19', '1608-08-20 00:10:55', '1608-08-20 01:12:55', '1608-08-20 02:14:47', '2825-09-23 02:31:19', '1286-12-21 00:21:24', '1286-12-21 01:25:46', '3300-07-12 00:15:35', '2059-07-23 01:56:15', '2955-06-27 00:48:34', '2955-06-27 01:24:43', '0937-05-21 00:40:43', '0930-11-28 02:44:19', '1605-10-12 00:58:57', '0781-08-29 02:28:10', '2374-09-21 00:21:42', '2374-09-21 02:10:55', '3022-05-13 01:56:21', '3088-03-30 01:09:15', '3088-03-30 02:38:56', '1757-01-16 00:20:29', '3287-10-20 01:17:31', '2555-09-08 00:40:20', '2555-09-08 01:20:22', '2555-09-08 02:36:58', '2176-01-07 02:50:08', '2282-06-16 00:21:35', '2595-05-23 01:32:01', '3237-05-26 01:19:24', '3237-05-26 01:52:49', '2780-03-19 01:32:32', '2780-03-19 02:33:01') +) +SELECT + _s0.n_rows AS n1, + _s1.n_rows AS n2, + _s3.n_rows AS n3, + _s5.n_rows AS n4, + _s7.n_rows AS n5 +FROM _s0 AS _s0 +CROSS JOIN _s1 AS _s1 +CROSS JOIN _s3 AS _s3 +CROSS JOIN _s5 AS _s5 +CROSS JOIN _s7 AS _s7 diff --git a/tests/testing_utilities.py b/tests/testing_utilities.py index 229c70918..85c87d05c 100644 --- a/tests/testing_utilities.py +++ b/tests/testing_utilities.py @@ -23,13 +23,18 @@ "TableCollectionInfo", "TopKInfo", "WhereInfo", + "extract_batch_requests_from_logs", "graph_fetcher", "map_over_dict_values", + "temp_env_override", ] import datetime +import os +import re from abc import ABC, abstractmethod from collections.abc import Callable +from contextlib import contextmanager from dataclasses 
import dataclass from decimal import Decimal from typing import Any @@ -45,6 +50,7 @@ from pydough.database_connectors import DatabaseContext from pydough.errors import PyDoughTestingException from pydough.evaluation.evaluate_unqualified import _load_column_selection +from pydough.mask_server import MaskServerInfo from pydough.metadata import GraphMetadata from pydough.pydough_operators import get_operator_by_name from pydough.qdag import ( @@ -76,6 +82,39 @@ graph_fetcher = Callable[[str], GraphMetadata] +@contextmanager +def temp_env_override(env_vars: dict[str, str | None]): + """Update the current environment variables with key-value pairs provided + in a dictionary, then restore the previous values afterwards. + + Args: + env_vars (dict(str, str or None)): A dictionary of environment variables to set. + A value of None indicates a variable should be removed. + """ + + def update_env_vars(env_vars): + old_env_vars: dict[str, str | None] = {} + for k, v in env_vars.items(): + if k in os.environ: + old_env_vars[k] = os.environ[k] + else: + old_env_vars[k] = None + + if v is None: + if k in os.environ: + del os.environ[k] + else: + os.environ[k] = v + return old_env_vars + + old_env = {} + try: + old_env = update_env_vars(env_vars) + yield + finally: + update_env_vars(old_env) + + def map_over_dict_values( dictionary: dict[Any, Any], func: Callable[[Any], Any] ) -> dict[Any, Any]: @@ -1148,6 +1187,7 @@ def run_relational_test( file_path: str, update: bool, config: PyDoughConfigs | None = None, + mask_server: MaskServerInfo | None = None, ) -> None: """ Runs a test on the relational plan code generated by the PyDough code, @@ -1162,6 +1202,7 @@ def run_relational_test( plan text, otherwise compares the generated relational plan text against the expected relational plan text in the file. `config`: The PyDough configuration to use for the test, if any. + `mask_server`: The mask server to use for the test, if any. """ # Skip if indicated. if self.skip_relational: @@ -1178,6 +1219,7 @@ def run_relational_test( session: PyDoughSession = PyDoughSession() session.metadata = graph session.config = config if config is not None else pydough.active_session.config + session.mask_server = mask_server qualified: PyDoughQDAG = qualify_node(root, session) assert isinstance(qualified, PyDoughCollectionQDAG), ( "Expected qualified answer to be a collection, not an expression" ) @@ -1205,6 +1247,7 @@ def run_sql_test( update: bool, database: DatabaseContext, config: PyDoughConfigs | None = None, + mask_server: MaskServerInfo | None = None, max_rows: int | None = None, ) -> None: """ @@ -1222,6 +1265,7 @@ def run_sql_test( `database`: The database context to determine what dialect of SQL to use when generating the SQL test. `config`: The PyDough configuration to use for the test, if any. + `mask_server`: The mask server to use for the test, if any. `max_rows`: The maximum number of rows to return from the query. """ # Skip if indicated.
@@ -1244,6 +1288,8 @@ def run_sql_test( call_kwargs["config"] = config if self.columns is not None: call_kwargs["columns"] = self.columns + if mask_server is not None: + call_kwargs["mask_server"] = mask_server sql_text: str = to_sql(root, **call_kwargs) # Either update the reference solution, or compare the generated sql @@ -1265,6 +1311,7 @@ def run_e2e_test( config: PyDoughConfigs | None = None, display_sql: bool = False, coerce_types: bool = False, + mask_server: MaskServerInfo | None = None, max_rows: int | None = None, ): """ @@ -1298,6 +1345,8 @@ def run_e2e_test( call_kwargs["config"] = config if self.columns is not None: call_kwargs["columns"] = self.columns + if mask_server is not None: + call_kwargs["mask_server"] = mask_server result: pd.DataFrame = to_df(root, **call_kwargs) # Extract the reference solution from the function refsol: pd.DataFrame = self.pd_function() @@ -1494,3 +1543,66 @@ def run_e2e_error_test( if columns is not None: call_kwargs["columns"] = columns to_df(root, **call_kwargs) + + +def extract_batch_requests_from_logs(log_str: str) -> list[set[str]]: + """ + Extracts the batch requests made to a mask server from the provided log + string. Each batch request will have a corresponding sequence of log lines + in the following format (the phrase "Batch request" is sometimes followed by + the text "(dry run)" if the mask server is in dry run mode): + + ``` + INFO pydough.mask_server.mask_server:mask_server.py:149 Batch request to Mask Server (2 items): + INFO pydough.mask_server.mask_server:mask_server.py:151 (1) CRBNK/CUSTOMERS/c_lname: ['EQUAL', 2, '__col__', 'lee'] + INFO pydough.mask_server.mask_server:mask_server.py:151 (2) CRBNK/CUSTOMERS/c_birthday: ['EQUAL', 2, 'YEAR', 1, '__col__', 1980] + ``` + + A log message string with those lines would produce the following list of + sets (if doing a dry run, then "DRY_RUN" is also included in the set): + + ``` + [ + { + "CRBNK/CUSTOMERS/c_lname: ['EQUAL', 2, '__col__', 'lee']", + "CRBNK/CUSTOMERS/c_birthday: ['EQUAL', 2, 'YEAR', 1, '__col__', 1980]", + } + ] + ``` + + Args: + `log_str`: The log string to extract batch requests from. + + Returns: + A list of sets, each set indicating one of the batch requests made to + the mask server during the conversion process, as captured in the + provided log string. The format for each set entry is + `db_name/table_name/column_name: [expression_list]`. + """ + header_pattern: re.Pattern = re.compile( + r"Batch request( \(dry run\))? to Mask Server \((\d+) items?\):" + ) + entry_pattern: re.Pattern = re.compile(r"\(\d+\) (.+)") + result: list[set[str]] = [] + current_set: set[str] = set() + lines_remaining: int = 0 + for line in log_str.splitlines(): + header_match = re.findall(header_pattern, line) + if header_match: + assert lines_remaining == 0, ( + "Malformed log: new batch request started before previous one ended." + ) + current_set = set() + if bool(header_match[0][0]): + current_set.add("DRY_RUN") + lines_remaining = int(header_match[0][1]) + result.append(current_set) + elif lines_remaining > 0: + entry_match = re.findall(entry_pattern, line) + assert entry_match, "Malformed log: expected batch request entry line." + current_set.add(entry_match[0]) + lines_remaining -= 1 + assert lines_remaining == 0, ( + "Malformed log: batch request did not have expected number of entries." + ) + return result
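A minimal usage sketch of the `temp_env_override` helper added in this patch, based only on its definition above. The import path and the environment values are illustrative assumptions, not part of the change itself:

```python
import os

# Assumes the test utilities module is importable as a package module.
from tests.testing_utilities import temp_env_override

# Hypothetical values: point the mask-server path at a placeholder and make
# sure an unrelated variable is absent for the duration of the block.
with temp_env_override(
    {"PYDOUGH_MASK_SERVER_PATH": "/tmp/fake_mask_server", "SOME_OTHER_VAR": None}
):
    assert os.environ["PYDOUGH_MASK_SERVER_PATH"] == "/tmp/fake_mask_server"
    assert "SOME_OTHER_VAR" not in os.environ
# On exit, both variables are restored to (or removed back to) their prior state.
```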
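Likewise, a small sketch of how `extract_batch_requests_from_logs` behaves on log text in the documented format. The log lines are copied from the function's own docstring example, not from a real mask-server run:

```python
from tests.testing_utilities import extract_batch_requests_from_logs

# Log text in the format emitted by the mask server logger (per the docstring).
log_text = "\n".join(
    [
        "INFO pydough.mask_server.mask_server:mask_server.py:149 Batch request to Mask Server (2 items):",
        "INFO pydough.mask_server.mask_server:mask_server.py:151 (1) CRBNK/CUSTOMERS/c_lname: ['EQUAL', 2, '__col__', 'lee']",
        "INFO pydough.mask_server.mask_server:mask_server.py:151 (2) CRBNK/CUSTOMERS/c_birthday: ['EQUAL', 2, 'YEAR', 1, '__col__', 1980]",
    ]
)

batches = extract_batch_requests_from_logs(log_text)
assert len(batches) == 1
assert "CRBNK/CUSTOMERS/c_lname: ['EQUAL', 2, '__col__', 'lee']" in batches[0]
assert "DRY_RUN" not in batches[0]  # only present for "(dry run)" batch headers
```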