From 63fec36a966f404bca18f2aa7e505c02d758e717 Mon Sep 17 00:00:00 2001 From: Hannah Ramadan Date: Wed, 3 Sep 2025 09:29:34 -0700 Subject: [PATCH 1/6] Feat: db.query.summary parser --- .../opentelemetry/helpers/query_summary.rb | 34 ++ .../helpers/query_summary/cache.rb | 48 +++ .../helpers/query_summary/parser.rb | 134 +++++++ .../helpers/query_summary/tokenizer.rb | 58 +++ .../test/fixtures/query_summary.json | 373 ++++++++++++++++++ .../test/helpers/query_summary_test.rb | 35 ++ 6 files changed, 682 insertions(+) create mode 100644 helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary.rb create mode 100644 helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb create mode 100644 helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb create mode 100644 helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb create mode 100644 helpers/sql-obfuscation/test/fixtures/query_summary.json create mode 100644 helpers/sql-obfuscation/test/helpers/query_summary_test.rb diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary.rb new file mode 100644 index 0000000000..f8146f65e0 --- /dev/null +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +# Copyright The OpenTelemetry Authors +# +# SPDX-License-Identifier: Apache-2.0 + +require_relative 'query_summary/cache' +require_relative 'query_summary/tokenizer' +require_relative 'query_summary/parser' + +module OpenTelemetry + module Helpers + # QuerySummary generates high-level summaries of SQL queries, made up of + # key operations and table names. 
+ # + # Example: + # QuerySummary.generate_summary("SELECT * FROM users WHERE id = 1") + # # => "SELECT users" + module QuerySummary + def self.configure_cache(size: Cache::DEFAULT_SIZE) + Cache.configure(size: size) + end + + def self.generate_summary(query) + Cache.fetch(query) do + tokens = Tokenizer.tokenize(query) + Parser.build_summary_from_tokens(tokens) + end + rescue StandardError + 'UNKNOWN' + end + end + end +end diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb new file mode 100644 index 0000000000..5f70bae1d9 --- /dev/null +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +# Copyright The OpenTelemetry Authors +# +# SPDX-License-Identifier: Apache-2.0module OpenTelemetry + +module OpenTelemetry + module Helpers + module QuerySummary + # Cache provides thread-safe, size-bounded FIFO caching for query summaries. + # + # Stores generated query summaries to avoid reprocessing identical queries. + # Uses mutex synchronization for thread safety. 
+ # + # @example + # Cache.fetch("SELECT * FROM users") { "SELECT users" } # => "SELECT users" + class Cache + DEFAULT_SIZE = 1000 + + @cache = {} + @cache_mutex = Mutex.new + @cache_size = DEFAULT_SIZE + + def self.fetch(key) + return @cache[key] if @cache.key?(key) + + result = yield + store(key, result) + result + end + + def self.configure(size: DEFAULT_SIZE) + @cache_mutex.synchronize do + @cache_size = size + @cache.clear if @cache.size > size + end + end + + def self.store(key, value) + @cache_mutex.synchronize do + @cache.shift if @cache.size >= @cache_size + @cache[key] = value + end + end + end + end + end +end diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb new file mode 100644 index 0000000000..866761ee12 --- /dev/null +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb @@ -0,0 +1,134 @@ +# frozen_string_literal: true + +# Copyright The OpenTelemetry Authors +# +# SPDX-License-Identifier: Apache-2.0 + +module OpenTelemetry + module Helpers + module QuerySummary + # Parser builds high-level SQL query summaries from tokenized input. + # + # Processes tokens to extract key operations and table names, creating + # summaries like "SELECT users" or "INSERT orders". 
+ # + # @example + # tokens = [Token.new(:keyword, "SELECT"), Token.new(:identifier, "users")] + # Parser.build_summary_from_tokens(tokens) # => "SELECT users" + class Parser + def self.build_summary_from_tokens(tokens) + summary_parts = [] + state = :default # Either :default or :expect_collection + skip_until = 0 # Next token index to process; allows skipping tokens already consumed by previous operations + + tokens.each_with_index do |token, index| + next if index < skip_until # Skip already processed tokens + + result = process_token(token, tokens, index, state) + + summary_parts.concat(result[:parts]) + state = result[:new_state] + skip_until = result[:next_index] + end + + summary_parts.join(' ') + end + + def self.process_token(token, tokens, index, state) + operation_result = process_main_operation(token, tokens, index, state) + return operation_result if operation_result[:processed] + + collection_result = process_collection_token(token, tokens, index, state) + return collection_result if collection_result[:processed] + + { processed: false, parts: [], new_state: state, next_index: index + 1 } + end + + def self.process_main_operation(token, tokens, index, current_state) + case token.value.upcase + when 'SELECT', 'INSERT', 'DELETE' + add_to_summary(token.value, :default, index + 1) + when 'WITH', 'UPDATE' + add_to_summary(token.value, :expect_collection, index + 1) + when 'FROM', 'INTO', 'JOIN', 'IN' + trigger_collection_mode(index + 1) + when 'CREATE', 'ALTER', 'DROP', 'TRUNCATE' + handle_table_operation(token, tokens, index) + when 'UNION' + handle_union(token, tokens, index) + else + not_processed(current_state, index + 1) + end + end + + def self.process_collection_token(token, tokens, index, state) + return { processed: false, parts: [], new_state: state, next_index: index + 1 } unless state == :expect_collection + + upcased_value = token.value.upcase + + if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value)) + 
skip_count = calculate_alias_skip(tokens, index) + new_state = tokens[index + 1 + skip_count]&.value == ',' ? :expect_collection : :default + skip_count += 1 if tokens[index + 1 + skip_count]&.value == ',' + + { processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count } + elsif token.value == '(' || token.type == :operator + { processed: true, parts: [], new_state: state, next_index: index + 1 } + else + { processed: true, parts: [], new_state: :default, next_index: index + 1 } + end + end + + def self.identifier_like?(token) + %i[identifier quoted_identifier string].include?(token.type) + end + + def self.can_be_table_name?(upcased_value) + # Keywords that can also be used as table/object names in certain contexts + %w[TABLE INDEX PROCEDURE VIEW DATABASE].include?(upcased_value) + end + + def self.calculate_alias_skip(tokens, index) + if tokens[index + 1]&.value&.upcase == 'AS' + 2 # Skip 'AS' and the alias + elsif tokens[index + 1]&.type == :identifier + 1 # Skip the alias + else + 0 + end + end + + def self.add_to_summary(part, new_state, next_index) + { processed: true, parts: [part], new_state: new_state, next_index: next_index } + end + + def self.trigger_collection_mode(next_index) + { processed: true, parts: [], new_state: :expect_collection, next_index: next_index } + end + + def self.not_processed(current_state, next_index) + { processed: false, parts: [], new_state: current_state, next_index: next_index } + end + + def self.handle_union(token, tokens, index) + if tokens[index + 1]&.value&.upcase == 'ALL' + { processed: true, parts: ["#{token.value} #{tokens[index + 1].value}"], new_state: :default, next_index: index + 2 } + else + add_to_summary(token.value, :default, index + 1) + end + end + + def self.handle_table_operation(token, tokens, index) + next_token = tokens[index + 1]&.value&.upcase + + case next_token + when 'TABLE', 'INDEX', 'PROCEDURE', 'VIEW', 'DATABASE' + { processed: true, parts: ["#{token.value} 
#{next_token}"], new_state: :expect_collection, next_index: index + 2 } + else + add_to_summary(token.value, :default, index + 1) + end + end + end + end + end +end diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb new file mode 100644 index 0000000000..19029e20ea --- /dev/null +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +# Copyright The OpenTelemetry Authors +# +# SPDX-License-Identifier: Apache-2.0 + +require 'strscan' + +module OpenTelemetry + module Helpers + module QuerySummary + # Tokenizer breaks down SQL queries into structured tokens for analysis. + # + # Parses SQL query strings into typed tokens (keywords, identifiers, operators, literals) + # for generating query summaries while filtering out sensitive data. + # + # @example + # tokens = Tokenizer.tokenize("SELECT * FROM users WHERE id = 1") + # # Returns tokens: [keyword: SELECT], [operator: *], [keyword: FROM], etc. + class Tokenizer + # Token holds the type (e.g., :keyword) and value (e.g., "SELECT") + Token = Struct.new(:type, :value) + + # The order of token matching is important for correct parsing, + # as more specific patterns should be matched before more general ones. 
+ TOKEN_REGEX = { + whitespace: /\s+/, + comment: %r{--[^\r\n]*|\/\*.*?\*\/}m, + numeric: /[+-]?(?:0x[0-9a-fA-F]+|\d+\.?\d*(?:[eE][+-]?\d+)?|\.\d+(?:[eE][+-]?\d+)?)/, + string: /'(?:''|[^'\r\n])*'?/, + quoted_identifier: /"(?:""|[^"\r\n])*"|`(?:``|[^`\r\n])*`|\[(?:[^\]\r\n])*\]/, + keyword: /\b(?:SELECT|INSERT|UPDATE|DELETE|FROM|INTO|JOIN|CREATE|ALTER|DROP|TRUNCATE|WITH|UNION|TABLE|INDEX|PROCEDURE|VIEW|DATABASE)\b/i, + identifier: /[a-zA-Z_][a-zA-Z0-9_.]*/, + operator: /<=|>=|<>|!=|[=<>+\-*\/%,;()!?]/ + }.freeze + + EXCLUDED_TYPES = %i[whitespace comment].freeze + + def self.tokenize(query) + scanner = StringScanner.new(query) + tokens = [] + + until scanner.eos? + matched = TOKEN_REGEX.any? do |type, regex| + next unless (value = scanner.scan(regex)) + + tokens << Token.new(type, value) unless EXCLUDED_TYPES.include?(type) + true + end + scanner.getch unless matched + end + + tokens + end + end + end + end +end diff --git a/helpers/sql-obfuscation/test/fixtures/query_summary.json b/helpers/sql-obfuscation/test/fixtures/query_summary.json new file mode 100644 index 0000000000..bf16e40acc --- /dev/null +++ b/helpers/sql-obfuscation/test/fixtures/query_summary.json @@ -0,0 +1,373 @@ +[ + { + "name": "numeric_literal_integers", + "input": { + "query": "SELECT 12, -12, +12" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "caching_query_summaries", + "input": { + "query": "SELECT 12, -12, +12" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "numeric_literal_with_decimal_point", + "input": { + "query": "SELECT 12.34, -12.34, +12.34, .01, -.01" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "numeric_literal_exponential", + "input": { + "query": "SELECT 12.34e56, -12.34e56, +12.34e56" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "numeric_literal_negative_exponential", + "input": { + "query": "SELECT 12.34e-56, -12.34e-56, +12.34e-56" + }, + "expected": { + 
"db.query.summary": "SELECT" + } + }, + { + "name": "arithmetic_on_numeric_literals", + "input": { + "query": "SELECT 99+100" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "hex_literal", + "input": { + "query": "SELECT 0xDEADBEEF, 0XdeadBEEF" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "string_literal", + "input": { + "query": "SELECT 'hello'" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "string_literal_escaped_single_quote", + "input": { + "query": "SELECT 'My name''s not important'" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "string_with_embedded_newline", + "input": { + "query": "SELECT 'My name is \n not important'" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "numbers_in_identifiers", + "input": { + "query": "SELECT c3po, r2d2 FROM covid19 WHERE n1h1=1234" + }, + "expected": { + "db.query.summary": "SELECT covid19" + } + }, + { + "name": "periods_in_identifiers", + "input": { + "query": "SELECT a FROM dbo.Table JOIN dbo.AnotherTable" + }, + "expected": { + "db.query.summary": "SELECT dbo.Table dbo.AnotherTable" + } + }, + { + "name": "insert_into", + "input": { + "query": "INSERT INTO X VALUES(1, 23456, 123.456, 99+100)" + }, + "expected": { + "db.query.summary": "INSERT X" + } + }, + { + "name": "uuid", + "input": { + "query": "SELECT { guid '01234567-89ab-cdef-0123-456789abcdef' }" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "in_clause", + "input": { + "query": "SELECT * FROM table WHERE value IN (123, 456, 'abc')" + }, + "expected": { + "db.query.summary": "SELECT table" + } + }, + { + "name": "comments", + "input": { + "query": "SELECT column -- end of line comment\nFROM /* block \n comment */ table" + }, + "expected": { + "db.query.summary": "SELECT table" + } + }, + { + "name": "insert_into_select", + "input": { + "query": "INSERT INTO 
shipping_details\n(order_id,\naddress)\nSELECT order_id,\naddress\nFROM orders\nWHERE order_id = 1" + }, + "expected": { + "db.query.summary": "INSERT shipping_details SELECT orders" + } + }, + { + "name": "select_nested_query", + "input": { + "query": "SELECT order_date\nFROM (SELECT *\nFROM orders o\nJOIN customers c\nON o.customer_id = c.customer_id)" + }, + "expected": { + "db.query.summary": "SELECT SELECT orders customers" + } + }, + { + "name": "select_nested_query_case_preserved", + "input": { + "query": "SELEcT order_date\nFROM (sELECT *\nFROM orders o\nJOIN customers c\nON o.customer_id = c.customer_id)" + }, + "expected": { + "db.query.summary": "SELEcT sELECT orders customers" + } + }, + { + "name": "case_preserved", + "input": { + "query": "SELEcT order_date\nFROM ORders" + }, + "expected": { + "db.query.summary": "SELEcT ORders" + } + }, + { + "name": "cross_join", + "input": { + "query": "SELECT * FROM Orders o CROSS JOIN OrderDetails od" + }, + "expected": { + "db.query.summary": "SELECT Orders OrderDetails" + } + }, + { + "name": "cross_join_comma_separated_syntax", + "input": { + "query": "SELECT * FROM Orders o, OrderDetails od" + }, + "expected": { + "db.query.summary": "SELECT Orders OrderDetails" + } + }, + { + "name": "left_outer_join", + "input": { + "query": "SELECT c.name, o.id FROM customers c LEFT JOIN orders o ON c.id = o.customer_id" + }, + "expected": { + "db.query.summary": "SELECT customers orders" + } + }, + { + "name": "create_table", + "input": { + "query": "CREATE TABLE MyTable (\n ID NOT NULL IDENTITY(1,1) PRIMARY KEY\n)" + }, + "expected": { + "db.query.summary": "CREATE TABLE MyTable" + } + }, + { + "name": "alter_table", + "input": { + "query": "ALTER TABLE MyTable ADD Name varchar(255)" + }, + "expected": { + "db.query.summary": "ALTER TABLE MyTable" + } + }, + { + "name": "drop_table", + "input": { + "query": "DROP TABLE MyTable" + }, + "expected": { + "db.query.summary": "DROP TABLE MyTable" + } + }, + { + "name": 
"query_that_performs_multiple_operations", + "input": { + "query": "INSERT INTO shipping_details(order_id, address) SELECT order_id, address FROM orders WHERE order_id = ?" + }, + "expected": { + "db.query.summary": "INSERT shipping_details SELECT orders" + } + }, + { + "name": "query_that_performs_an_operation_thats_applied_to_multiple_collections", + "input": { + "db.system.name": "other_sql", + "query": "SELECT * FROM songs, artists WHERE songs.artist_id == artists.id" + }, + "expected": { + "db.query.summary": "SELECT songs artists" + } + }, + { + "name": "query_that_performs_operation_on_multiple_collections_with_double-quotes_or_other_punctuation", + "input": { + "query": "SELECT * FROM \"song list\", 'artists'" + }, + "expected": { + "db.query.summary": "SELECT \"song list\" 'artists'" + } + }, + { + "name": "update_statement", + "input": { + "query": "UPDATE Customers SET ContactName = 'Alfred Schmidt', City= 'Frankfurt' WHERE CustomerID = 1" + }, + "expected": { + "db.query.summary": "UPDATE Customers" + } + }, + { + "name": "delete_statement", + "input": { + "query": "DELETE FROM Customers WHERE CustomerName='Alfreds Futterkiste'" + }, + "expected": { + "db.query.summary": "DELETE Customers" + } + }, + { + "name": "truncate_table_statement", + "input": { + "query": "TRUNCATE TABLE Customers" + }, + "expected": { + "db.query.summary": "TRUNCATE TABLE Customers" + } + }, + { + "name": "with_clause_cte", + "input": { + "query": "WITH regional_sales AS (SELECT region, SUM(amount) AS total_sales FROM orders GROUP BY region) SELECT region, total_sales FROM regional_sales WHERE total_sales > 1000" + }, + "expected": { + "db.query.summary": "WITH regional_sales SELECT orders SELECT regional_sales" + } + }, + { + "name": "union_statement", + "input": { + "query": "SELECT City FROM Customers UNION ALL SELECT City FROM Suppliers ORDER BY City" + }, + "expected": { + "db.query.summary": "SELECT Customers UNION ALL SELECT Suppliers" + } + }, + { + "name": 
"group_by_and_having_clauses", + "input": { + "query": "SELECT COUNT(CustomerID), Country FROM Customers WHERE Country != 'USA' GROUP BY Country HAVING COUNT(CustomerID) > 5" + }, + "expected": { + "db.query.summary": "SELECT Customers" + } + }, + { + "name": "boolean_and_null_literals", + "input": { + "query": "SELECT * FROM my_table WHERE a IS NOT NULL AND b = TRUE AND c = FALSE" + }, + "expected": { + "db.query.summary": "SELECT my_table" + } + }, + { + "name": "multiple_joins_and_aliases", + "input": { + "query": "SELECT o.OrderID, c.CustomerName, s.ShipperName FROM ((Orders AS o INNER JOIN Customers AS c ON o.CustomerID = c.CustomerID) INNER JOIN Shippers AS s ON o.ShipperID = s.ShipperID)" + }, + "expected": { + "db.query.summary": "SELECT Orders Customers Shippers" + } + }, + { + "name": "window_function_over_partition", + "input": { + "query": "SELECT name, salary, ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) as rank FROM employees" + }, + "expected": { + "db.query.summary": "SELECT employees" + } + }, + { + "name": "case_statement", + "input": { + "query": "SELECT OrderID, Quantity, CASE WHEN Quantity > 30 THEN 'Large' WHEN Quantity > 10 THEN 'Medium' ELSE 'Small' END AS QuantityText FROM OrderDetails" + }, + "expected": { + "db.query.summary": "SELECT OrderDetails" + } + }, + { + "name": "like_predicate", + "input": { + "query": "SELECT * FROM products WHERE product_name LIKE 'Chai%'" + }, + "expected": { + "db.query.summary": "SELECT products" + } + }, + { + "name": "between_predicate", + "input": { + "query": "SELECT * FROM products WHERE price BETWEEN 10 AND 20" + }, + "expected": { + "db.query.summary": "SELECT products" + } + } +] + diff --git a/helpers/sql-obfuscation/test/helpers/query_summary_test.rb b/helpers/sql-obfuscation/test/helpers/query_summary_test.rb new file mode 100644 index 0000000000..1efeed7f9d --- /dev/null +++ b/helpers/sql-obfuscation/test/helpers/query_summary_test.rb @@ -0,0 +1,35 @@ +# 
frozen_string_literal: true + +# Copyright The OpenTelemetry Authors +# +# SPDX-License-Identifier: Apache-2.0 + +require_relative '../test_helper' +require_relative '../../lib/opentelemetry/helpers/query_summary' + +class QuerySummaryTest < Minitest::Test + def self.load_fixture + data = File.read("#{Dir.pwd}/test/fixtures/query_summary.json") + JSON.parse(data) + end + + def build_failure_message(query, expected_summary, actual_summary) + "Failed to generate query summary correctly.\n" \ + "Input: #{query}\n" \ + "Expected: #{expected_summary}\n" \ + "Actual: #{actual_summary}\n" + end + + load_fixture.each do |test_case| + name = test_case['name'] + query = test_case['input']['query'] + expected_summary = test_case['expected']['db.query.summary'] + + define_method(:"test_query_summary_#{name}") do + actual_summary = OpenTelemetry::Helpers::QuerySummary.generate_summary(query) + message = build_failure_message(query, expected_summary, actual_summary) + + assert_equal(expected_summary, actual_summary, message) + end + end +end From 08a896404c34ab621ee53e547d45ff2a010e4b08 Mon Sep 17 00:00:00 2001 From: Hannah Ramadan Date: Wed, 3 Sep 2025 15:21:42 -0700 Subject: [PATCH 2/6] Code and test updates --- .../helpers/query_summary/cache.rb | 11 ++- .../helpers/query_summary/parser.rb | 78 ++++++++++++------- .../helpers/query_summary/tokenizer.rb | 20 ++--- .../test/fixtures/query_summary.json | 45 +++++++++++ 4 files changed, 115 insertions(+), 39 deletions(-) diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb index 5f70bae1d9..3546a40f4b 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb @@ -22,11 +22,14 @@ class Cache @cache_size = DEFAULT_SIZE def self.fetch(key) - return @cache[key] if @cache.key?(key) + @cache_mutex.synchronize do + return 
@cache[key] if @cache.key?(key) - result = yield - store(key, result) - result + result = yield + @cache.shift if @cache.size >= @cache_size + @cache[key] = result + result + end end def self.configure(size: DEFAULT_SIZE) diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb index 866761ee12..bf8e4adc7f 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb @@ -16,9 +16,18 @@ module QuerySummary # tokens = [Token.new(:keyword, "SELECT"), Token.new(:identifier, "users")] # Parser.build_summary_from_tokens(tokens) # => "SELECT users" class Parser + DEFAULT_STATE = :default + EXPECT_COLLECTION_STATE = :expect_collection + + MAIN_OPERATIONS = %w[SELECT INSERT DELETE].freeze + COLLECTION_OPERATIONS = %w[WITH UPDATE].freeze + TRIGGER_COLLECTION = %w[FROM INTO JOIN IN].freeze + TABLE_OPERATIONS = %w[CREATE ALTER DROP TRUNCATE].freeze + TABLE_OBJECTS = %w[TABLE INDEX PROCEDURE VIEW DATABASE].freeze + def self.build_summary_from_tokens(tokens) summary_parts = [] - state = :default # Either :default or :expect_collection + state = DEFAULT_STATE skip_until = 0 # Next token index to process; allows skipping tokens already consumed by previous operations tokens.each_with_index do |token, index| @@ -45,14 +54,16 @@ def self.process_token(token, tokens, index, state) end def self.process_main_operation(token, tokens, index, current_state) - case token.value.upcase - when 'SELECT', 'INSERT', 'DELETE' - add_to_summary(token.value, :default, index + 1) - when 'WITH', 'UPDATE' - add_to_summary(token.value, :expect_collection, index + 1) - when 'FROM', 'INTO', 'JOIN', 'IN' + upcased_value = token.value.upcase + + case upcased_value + when *MAIN_OPERATIONS + add_to_summary(token.value, DEFAULT_STATE, index + 1) + when *COLLECTION_OPERATIONS + 
add_to_summary(token.value, EXPECT_COLLECTION_STATE, index + 1) + when *TRIGGER_COLLECTION trigger_collection_mode(index + 1) - when 'CREATE', 'ALTER', 'DROP', 'TRUNCATE' + when *TABLE_OPERATIONS handle_table_operation(token, tokens, index) when 'UNION' handle_union(token, tokens, index) @@ -62,36 +73,49 @@ def self.process_main_operation(token, tokens, index, current_state) end def self.process_collection_token(token, tokens, index, state) - return { processed: false, parts: [], new_state: state, next_index: index + 1 } unless state == :expect_collection + return not_processed(state, index + 1) unless state == EXPECT_COLLECTION_STATE upcased_value = token.value.upcase if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value)) - skip_count = calculate_alias_skip(tokens, index) - new_state = tokens[index + 1 + skip_count]&.value == ',' ? :expect_collection : :default - skip_count += 1 if tokens[index + 1 + skip_count]&.value == ',' - - { processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count } + handle_collection_identifier(token, tokens, index) elsif token.value == '(' || token.type == :operator - { processed: true, parts: [], new_state: state, next_index: index + 1 } + handle_collection_operator(token, state, index) else - { processed: true, parts: [], new_state: :default, next_index: index + 1 } + handle_collection_default(token, index) end end + def self.handle_collection_identifier(token, tokens, index) + skip_count = calculate_alias_skip(tokens, index) + new_state = tokens[index + 1 + skip_count]&.value == ',' ? 
EXPECT_COLLECTION_STATE : DEFAULT_STATE + skip_count += 1 if tokens[index + 1 + skip_count]&.value == ',' + + { processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count } + end + + def self.handle_collection_operator(token, state, index) + { processed: true, parts: [], new_state: state, next_index: index + 1 } + end + + def self.handle_collection_default(token, index) + { processed: true, parts: [], new_state: DEFAULT_STATE, next_index: index + 1 } + end + def self.identifier_like?(token) %i[identifier quoted_identifier string].include?(token.type) end def self.can_be_table_name?(upcased_value) # Keywords that can also be used as table/object names in certain contexts - %w[TABLE INDEX PROCEDURE VIEW DATABASE].include?(upcased_value) + TABLE_OBJECTS.include?(upcased_value) end def self.calculate_alias_skip(tokens, index) - if tokens[index + 1]&.value&.upcase == 'AS' + next_token = tokens[index + 1] + if next_token && next_token.value&.upcase == 'AS' 2 # Skip 'AS' and the alias - elsif tokens[index + 1]&.type == :identifier + elsif next_token && next_token.type == :identifier 1 # Skip the alias else 0 @@ -103,7 +127,7 @@ def self.add_to_summary(part, new_state, next_index) end def self.trigger_collection_mode(next_index) - { processed: true, parts: [], new_state: :expect_collection, next_index: next_index } + { processed: true, parts: [], new_state: EXPECT_COLLECTION_STATE, next_index: next_index } end def self.not_processed(current_state, next_index) @@ -111,21 +135,23 @@ def self.not_processed(current_state, next_index) end def self.handle_union(token, tokens, index) - if tokens[index + 1]&.value&.upcase == 'ALL' - { processed: true, parts: ["#{token.value} #{tokens[index + 1].value}"], new_state: :default, next_index: index + 2 } + next_token = tokens[index + 1] + if next_token && next_token.value&.upcase == 'ALL' + { processed: true, parts: ["#{token.value} #{next_token.value}"], new_state: DEFAULT_STATE, next_index: index + 
2 } else - add_to_summary(token.value, :default, index + 1) + add_to_summary(token.value, DEFAULT_STATE, index + 1) end end def self.handle_table_operation(token, tokens, index) - next_token = tokens[index + 1]&.value&.upcase + next_token_obj = tokens[index + 1] + next_token = next_token_obj&.value&.upcase case next_token when 'TABLE', 'INDEX', 'PROCEDURE', 'VIEW', 'DATABASE' - { processed: true, parts: ["#{token.value} #{next_token}"], new_state: :expect_collection, next_index: index + 2 } + { processed: true, parts: ["#{token.value} #{next_token}"], new_state: EXPECT_COLLECTION_STATE, next_index: index + 2 } else - add_to_summary(token.value, :default, index + 1) + add_to_summary(token.value, DEFAULT_STATE, index + 1) end end end diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb index 19029e20ea..c5f247a46d 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb @@ -40,18 +40,20 @@ def self.tokenize(query) scanner = StringScanner.new(query) tokens = [] - until scanner.eos? - matched = TOKEN_REGEX.any? do |type, regex| - next unless (value = scanner.scan(regex)) - - tokens << Token.new(type, value) unless EXCLUDED_TYPES.include?(type) - true - end - scanner.getch unless matched - end + scan_next_token(scanner, tokens) until scanner.eos? tokens end + + def self.scan_next_token(scanner, tokens) + matched = TOKEN_REGEX.any? 
do |type, regex| + next unless (value = scanner.scan(regex)) + + tokens << Token.new(type, value) unless EXCLUDED_TYPES.include?(type) + true + end + scanner.getch unless matched + end end end end diff --git a/helpers/sql-obfuscation/test/fixtures/query_summary.json b/helpers/sql-obfuscation/test/fixtures/query_summary.json index bf16e40acc..3578004738 100644 --- a/helpers/sql-obfuscation/test/fixtures/query_summary.json +++ b/helpers/sql-obfuscation/test/fixtures/query_summary.json @@ -17,6 +17,24 @@ "db.query.summary": "SELECT" } }, + { + "name": "nil_input", + "input": { + "query": null + }, + "expected": { + "db.query.summary": "UNKNOWN" + } + }, + { + "name": "deeply_nested_subqueries", + "input": { + "query": "SELECT * FROM (SELECT * FROM (SELECT * FROM my_table))" + }, + "expected": { + "db.query.summary": "SELECT SELECT SELECT my_table" + } + }, { "name": "numeric_literal_with_decimal_point", "input": { @@ -368,6 +386,33 @@ "expected": { "db.query.summary": "SELECT products" } + }, + { + "name": "create_index", + "input": { + "query": "CREATE INDEX idx_name ON MyTable (column1)" + }, + "expected": { + "db.query.summary": "CREATE INDEX idx_name" + } + }, + { + "name": "create_database", + "input": { + "query": "CREATE DATABASE my_db" + }, + "expected": { + "db.query.summary": "CREATE DATABASE my_db" + } + }, + { + "name": "create_procedure", + "input": { + "query": "CREATE PROCEDURE my_proc AS BEGIN SELECT * FROM MyTable END" + }, + "expected": { + "db.query.summary": "CREATE PROCEDURE my_proc SELECT MyTable" + } } ] From 56be5816ba6a7e6bd1857e225b3fc735fccff0cb Mon Sep 17 00:00:00 2001 From: Hannah Ramadan Date: Wed, 3 Sep 2025 15:28:28 -0700 Subject: [PATCH 3/6] rubocop edit --- .../lib/opentelemetry/helpers/query_summary/tokenizer.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb 
index c5f247a46d..16c4a56955 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb @@ -31,7 +31,7 @@ class Tokenizer quoted_identifier: /"(?:""|[^"\r\n])*"|`(?:``|[^`\r\n])*`|\[(?:[^\]\r\n])*\]/, keyword: /\b(?:SELECT|INSERT|UPDATE|DELETE|FROM|INTO|JOIN|CREATE|ALTER|DROP|TRUNCATE|WITH|UNION|TABLE|INDEX|PROCEDURE|VIEW|DATABASE)\b/i, identifier: /[a-zA-Z_][a-zA-Z0-9_.]*/, - operator: /<=|>=|<>|!=|[=<>+\-*\/%,;()!?]/ + operator: %r{<=|>=|<>|!=|[=<>+\-*\/%,;()!?]} }.freeze EXCLUDED_TYPES = %i[whitespace comment].freeze From 784f1a41d3b05997b142f2ca4132f9cb42eb4881 Mon Sep 17 00:00:00 2001 From: Hannah Ramadan Date: Thu, 4 Sep 2025 15:52:32 -0700 Subject: [PATCH 4/6] Refactoring, cache tests --- .../opentelemetry/helpers/query_summary.rb | 28 ++- .../helpers/query_summary/cache.rb | 40 ++-- .../helpers/query_summary/parser.rb | 200 +++++++++--------- .../helpers/query_summary/tokenizer.rb | 26 +-- .../test/fixtures/query_summary.json | 11 +- .../test/helpers/query_summary/cache_test.rb | 20 ++ .../{ => query_summary}/query_summary_test.rb | 4 +- 7 files changed, 187 insertions(+), 142 deletions(-) create mode 100644 helpers/sql-obfuscation/test/helpers/query_summary/cache_test.rb rename helpers/sql-obfuscation/test/helpers/{ => query_summary}/query_summary_test.rb (90%) diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary.rb index f8146f65e0..b2fd844699 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary.rb @@ -4,8 +4,8 @@ # # SPDX-License-Identifier: Apache-2.0 -require_relative 'query_summary/cache' require_relative 'query_summary/tokenizer' +require_relative 'query_summary/cache' require_relative 'query_summary/parser' module OpenTelemetry @@ -17,17 +17,25 @@ 
module Helpers # QuerySummary.generate_summary("SELECT * FROM users WHERE id = 1") # # => "SELECT users" module QuerySummary - def self.configure_cache(size: Cache::DEFAULT_SIZE) - Cache.configure(size: size) - end + class << self + def configure_cache(size: Cache::DEFAULT_SIZE) + cache_instance.configure(size: size) + end + + def generate_summary(query) + cache_instance.fetch(query) do + tokens = Tokenizer.tokenize(query) + Parser.build_summary_from_tokens(tokens) + end + rescue StandardError + 'UNKNOWN' + end + + private - def self.generate_summary(query) - Cache.fetch(query) do - tokens = Tokenizer.tokenize(query) - Parser.build_summary_from_tokens(tokens) + def cache_instance + @cache_instance ||= Cache.new end - rescue StandardError - 'UNKNOWN' end end end diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb index 3546a40f4b..56486faccc 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/cache.rb @@ -2,7 +2,7 @@ # Copyright The OpenTelemetry Authors # -# SPDX-License-Identifier: Apache-2.0module OpenTelemetry +# SPDX-License-Identifier: Apache-2.0 module OpenTelemetry module Helpers @@ -13,39 +13,43 @@ module QuerySummary # Uses mutex synchronization for thread safety. 
# # @example - # Cache.fetch("SELECT * FROM users") { "SELECT users" } # => "SELECT users" + # cache = Cache.new + # cache.fetch("SELECT * FROM users") { "SELECT users" } # => "SELECT users" class Cache DEFAULT_SIZE = 1000 - @cache = {} - @cache_mutex = Mutex.new - @cache_size = DEFAULT_SIZE + def initialize(size: DEFAULT_SIZE) + @cache = {} + @cache_mutex = Mutex.new + @cache_size = size + end - def self.fetch(key) + def fetch(key) @cache_mutex.synchronize do return @cache[key] if @cache.key?(key) result = yield - @cache.shift if @cache.size >= @cache_size + evict_if_needed @cache[key] = result result end end - def self.configure(size: DEFAULT_SIZE) - @cache_mutex.synchronize do - @cache_size = size - @cache.clear if @cache.size > size - end + private + + def configure(size: DEFAULT_SIZE) + @cache_size = size + @cache.clear if @cache.size > size end - def self.store(key, value) - @cache_mutex.synchronize do - @cache.shift if @cache.size >= @cache_size - @cache[key] = value - end + def clear + @cache.clear + end + + def evict_if_needed + @cache.shift if @cache.size >= @cache_size end end end end -end +end \ No newline at end of file diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb index bf8e4adc7f..c851dc63ce 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb @@ -25,133 +25,135 @@ class Parser TABLE_OPERATIONS = %w[CREATE ALTER DROP TRUNCATE].freeze TABLE_OBJECTS = %w[TABLE INDEX PROCEDURE VIEW DATABASE].freeze - def self.build_summary_from_tokens(tokens) - summary_parts = [] - state = DEFAULT_STATE - skip_until = 0 # Next token index to process; allows skipping tokens already consumed by previous operations + class << self + def build_summary_from_tokens(tokens) + summary_parts = [] + state = DEFAULT_STATE + skip_until = 0 # Next token 
index to process; allows skipping tokens already consumed by previous operations - tokens.each_with_index do |token, index| - next if index < skip_until # Skip already processed tokens + tokens.each_with_index do |token, index| + next if index < skip_until # Skip already processed tokens - result = process_token(token, tokens, index, state) + result = process_token(token, tokens, index, state) - summary_parts.concat(result[:parts]) - state = result[:new_state] - skip_until = result[:next_index] - end + summary_parts.concat(result[:parts]) + state = result[:new_state] + skip_until = result[:next_index] + end - summary_parts.join(' ') - end + summary_parts.join(' ') + end - def self.process_token(token, tokens, index, state) - operation_result = process_main_operation(token, tokens, index, state) - return operation_result if operation_result[:processed] + def process_token(token, tokens, index, state) + operation_result = process_main_operation(token, tokens, index, state) + return operation_result if operation_result[:processed] - collection_result = process_collection_token(token, tokens, index, state) - return collection_result if collection_result[:processed] + collection_result = process_collection_token(token, tokens, index, state) + return collection_result if collection_result[:processed] - { processed: false, parts: [], new_state: state, next_index: index + 1 } - end + { processed: false, parts: [], new_state: state, next_index: index + 1 } + end - def self.process_main_operation(token, tokens, index, current_state) - upcased_value = token.value.upcase - - case upcased_value - when *MAIN_OPERATIONS - add_to_summary(token.value, DEFAULT_STATE, index + 1) - when *COLLECTION_OPERATIONS - add_to_summary(token.value, EXPECT_COLLECTION_STATE, index + 1) - when *TRIGGER_COLLECTION - trigger_collection_mode(index + 1) - when *TABLE_OPERATIONS - handle_table_operation(token, tokens, index) - when 'UNION' - handle_union(token, tokens, index) - else - 
not_processed(current_state, index + 1) + def process_main_operation(token, tokens, index, current_state) + upcased_value = token.value.upcase + + case upcased_value + when *MAIN_OPERATIONS + add_to_summary(token.value, DEFAULT_STATE, index + 1) + when *COLLECTION_OPERATIONS + add_to_summary(token.value, EXPECT_COLLECTION_STATE, index + 1) + when *TRIGGER_COLLECTION + trigger_collection_mode(index + 1) + when *TABLE_OPERATIONS + handle_table_operation(token, tokens, index) + when 'UNION' + handle_union(token, tokens, index) + else + not_processed(current_state, index + 1) + end end - end - def self.process_collection_token(token, tokens, index, state) - return not_processed(state, index + 1) unless state == EXPECT_COLLECTION_STATE + def process_collection_token(token, tokens, index, state) + return not_processed(state, index + 1) unless state == EXPECT_COLLECTION_STATE - upcased_value = token.value.upcase + upcased_value = token.value.upcase - if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value)) - handle_collection_identifier(token, tokens, index) - elsif token.value == '(' || token.type == :operator - handle_collection_operator(token, state, index) - else - handle_collection_default(token, index) + if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value)) + handle_collection_identifier(token, tokens, index) + elsif token.value == '(' || token.type == :operator + handle_collection_operator(token, state, index) + else + handle_collection_default(token, index) + end end - end - def self.handle_collection_identifier(token, tokens, index) - skip_count = calculate_alias_skip(tokens, index) - new_state = tokens[index + 1 + skip_count]&.value == ',' ? 
EXPECT_COLLECTION_STATE : DEFAULT_STATE - skip_count += 1 if tokens[index + 1 + skip_count]&.value == ',' + def handle_collection_identifier(token, tokens, index) + skip_count = calculate_alias_skip(tokens, index) + new_state = tokens[index + 1 + skip_count]&.value == ',' ? EXPECT_COLLECTION_STATE : DEFAULT_STATE + skip_count += 1 if tokens[index + 1 + skip_count]&.value == ',' - { processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count } - end + { processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count } + end - def self.handle_collection_operator(token, state, index) - { processed: true, parts: [], new_state: state, next_index: index + 1 } - end + def handle_collection_operator(token, state, index) + { processed: true, parts: [], new_state: state, next_index: index + 1 } + end - def self.handle_collection_default(token, index) - { processed: true, parts: [], new_state: DEFAULT_STATE, next_index: index + 1 } - end + def handle_collection_default(token, index) + { processed: true, parts: [], new_state: DEFAULT_STATE, next_index: index + 1 } + end - def self.identifier_like?(token) - %i[identifier quoted_identifier string].include?(token.type) - end + def identifier_like?(token) + %i[identifier quoted_identifier string].include?(token.type) + end - def self.can_be_table_name?(upcased_value) - # Keywords that can also be used as table/object names in certain contexts - TABLE_OBJECTS.include?(upcased_value) - end + def can_be_table_name?(upcased_value) + # Keywords that can also be used as table/object names in certain contexts + TABLE_OBJECTS.include?(upcased_value) + end - def self.calculate_alias_skip(tokens, index) - next_token = tokens[index + 1] - if next_token && next_token.value&.upcase == 'AS' - 2 # Skip 'AS' and the alias - elsif next_token && next_token.type == :identifier - 1 # Skip the alias - else - 0 + def calculate_alias_skip(tokens, index) + next_token = tokens[index + 1] 
+ if next_token && next_token.value&.upcase == 'AS' + 2 # Skip 'AS' and the alias + elsif next_token && next_token.type == :identifier + 1 # Skip the alias + else + 0 + end end - end - def self.add_to_summary(part, new_state, next_index) - { processed: true, parts: [part], new_state: new_state, next_index: next_index } - end + def add_to_summary(part, new_state, next_index) + { processed: true, parts: [part], new_state: new_state, next_index: next_index } + end - def self.trigger_collection_mode(next_index) - { processed: true, parts: [], new_state: EXPECT_COLLECTION_STATE, next_index: next_index } - end + def trigger_collection_mode(next_index) + { processed: true, parts: [], new_state: EXPECT_COLLECTION_STATE, next_index: next_index } + end - def self.not_processed(current_state, next_index) - { processed: false, parts: [], new_state: current_state, next_index: next_index } - end + def not_processed(current_state, next_index) + { processed: false, parts: [], new_state: current_state, next_index: next_index } + end - def self.handle_union(token, tokens, index) - next_token = tokens[index + 1] - if next_token && next_token.value&.upcase == 'ALL' - { processed: true, parts: ["#{token.value} #{next_token.value}"], new_state: DEFAULT_STATE, next_index: index + 2 } - else - add_to_summary(token.value, DEFAULT_STATE, index + 1) + def handle_union(token, tokens, index) + next_token = tokens[index + 1] + if next_token && next_token.value&.upcase == 'ALL' + { processed: true, parts: ["#{token.value} #{next_token.value}"], new_state: DEFAULT_STATE, next_index: index + 2 } + else + add_to_summary(token.value, DEFAULT_STATE, index + 1) + end end - end - def self.handle_table_operation(token, tokens, index) - next_token_obj = tokens[index + 1] - next_token = next_token_obj&.value&.upcase + def handle_table_operation(token, tokens, index) + next_token_obj = tokens[index + 1] + next_token = next_token_obj&.value&.upcase - case next_token - when 'TABLE', 'INDEX', 'PROCEDURE', 
'VIEW', 'DATABASE' - { processed: true, parts: ["#{token.value} #{next_token}"], new_state: EXPECT_COLLECTION_STATE, next_index: index + 2 } - else - add_to_summary(token.value, DEFAULT_STATE, index + 1) + case next_token + when 'TABLE', 'INDEX', 'PROCEDURE', 'VIEW', 'DATABASE' + { processed: true, parts: ["#{token.value} #{next_token}"], new_state: EXPECT_COLLECTION_STATE, next_index: index + 2 } + else + add_to_summary(token.value, DEFAULT_STATE, index + 1) + end end end end diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb index 16c4a56955..6bad41cd8f 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/tokenizer.rb @@ -36,23 +36,25 @@ class Tokenizer EXCLUDED_TYPES = %i[whitespace comment].freeze - def self.tokenize(query) - scanner = StringScanner.new(query) - tokens = [] + class << self + def tokenize(query) + scanner = StringScanner.new(query) + tokens = [] - scan_next_token(scanner, tokens) until scanner.eos? + scan_next_token(scanner, tokens) until scanner.eos? - tokens - end + tokens + end - def self.scan_next_token(scanner, tokens) - matched = TOKEN_REGEX.any? do |type, regex| - next unless (value = scanner.scan(regex)) + def scan_next_token(scanner, tokens) + matched = TOKEN_REGEX.any? 
do |type, regex| + next unless (value = scanner.scan(regex)) - tokens << Token.new(type, value) unless EXCLUDED_TYPES.include?(type) - true + tokens << Token.new(type, value) unless EXCLUDED_TYPES.include?(type) + true + end + scanner.getch unless matched end - scanner.getch unless matched end end end diff --git a/helpers/sql-obfuscation/test/fixtures/query_summary.json b/helpers/sql-obfuscation/test/fixtures/query_summary.json index 3578004738..97f8c77b1f 100644 --- a/helpers/sql-obfuscation/test/fixtures/query_summary.json +++ b/helpers/sql-obfuscation/test/fixtures/query_summary.json @@ -413,6 +413,15 @@ "expected": { "db.query.summary": "CREATE PROCEDURE my_proc SELECT MyTable" } - } + }, + { + "name": "oracle_angle_quote", + "input": { + "query": "select * from foo where bar=q'' and x=5" + }, + "expected": { + "db.query.summary": "select foo" + } + } ] diff --git a/helpers/sql-obfuscation/test/helpers/query_summary/cache_test.rb b/helpers/sql-obfuscation/test/helpers/query_summary/cache_test.rb new file mode 100644 index 0000000000..2d401b5b06 --- /dev/null +++ b/helpers/sql-obfuscation/test/helpers/query_summary/cache_test.rb @@ -0,0 +1,20 @@ +require_relative '../../test_helper' +require_relative '../../../lib/opentelemetry/helpers/query_summary/cache' + +class CacheTest < Minitest::Test + def setup + @cache = OpenTelemetry::Helpers::QuerySummary::Cache.new + end + + def test_fetch_returns_new_value_when_key_does_not_exist + result = @cache.fetch('key1') { 'value1' } + assert_equal 'value1', result + end + + def test_fetch_returns_value_when_key_exists + @cache.fetch('key1') { 'value1' } + result = @cache.fetch('key1') { 'different_value' } + + assert_equal 'value1', result + end +end diff --git a/helpers/sql-obfuscation/test/helpers/query_summary_test.rb b/helpers/sql-obfuscation/test/helpers/query_summary/query_summary_test.rb similarity index 90% rename from helpers/sql-obfuscation/test/helpers/query_summary_test.rb rename to 
helpers/sql-obfuscation/test/helpers/query_summary/query_summary_test.rb index 1efeed7f9d..7009eac028 100644 --- a/helpers/sql-obfuscation/test/helpers/query_summary_test.rb +++ b/helpers/sql-obfuscation/test/helpers/query_summary/query_summary_test.rb @@ -4,8 +4,8 @@ # # SPDX-License-Identifier: Apache-2.0 -require_relative '../test_helper' -require_relative '../../lib/opentelemetry/helpers/query_summary' +require_relative '../../test_helper' +require_relative '../../../lib/opentelemetry/helpers/query_summary' class QuerySummaryTest < Minitest::Test def self.load_fixture From ee6ae1d167ac0b51ebed894488d5e97e15a9f69d Mon Sep 17 00:00:00 2001 From: Hannah Ramadan Date: Mon, 8 Sep 2025 14:18:33 -0700 Subject: [PATCH 5/6] Add tests for cache --- .../test/helpers/query_summary/cache_test.rb | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/helpers/sql-obfuscation/test/helpers/query_summary/cache_test.rb b/helpers/sql-obfuscation/test/helpers/query_summary/cache_test.rb index 2d401b5b06..a194b2aad5 100644 --- a/helpers/sql-obfuscation/test/helpers/query_summary/cache_test.rb +++ b/helpers/sql-obfuscation/test/helpers/query_summary/cache_test.rb @@ -17,4 +17,39 @@ def test_fetch_returns_value_when_key_exists assert_equal 'value1', result end + + def test_eviction_when_cache_size_exceeded + small_cache = OpenTelemetry::Helpers::QuerySummary::Cache.new(size: 2) + + small_cache.fetch('key1') { 'value1' } + small_cache.fetch('key2') { 'value2' } + small_cache.fetch('key3') { 'value3' } + + result = small_cache.fetch('key1') { 'new_value1' } + assert_equal 'new_value1', result + end + + def test_cache_thread_safety + threads = Array.new(10) do |i| + Thread.new do + @cache.fetch('shared_key') { "thread_#{i}_value" } + end + end + + results = threads.map(&:value) + + assert_equal 1, results.uniq.size + end + + def test_empty_string + @cache.fetch('') { 'empty_string_value' } + + assert_equal 'empty_string_value', @cache.fetch('') + end + + def test_nil + 
@cache.fetch(nil) { 'nil_value' } + + assert_equal 'nil_value', @cache.fetch(nil) + end end From c296a46c47142aa7504a978154d720f7d0182310 Mon Sep 17 00:00:00 2001 From: Hannah Ramadan Date: Mon, 8 Sep 2025 14:55:51 -0700 Subject: [PATCH 6/6] Add comments to parser --- .../helpers/query_summary/parser.rb | 53 +- .../test/fixtures/query_summary.json | 833 +++++++++--------- 2 files changed, 450 insertions(+), 436 deletions(-) diff --git a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb index c851dc63ce..f275661366 100644 --- a/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb +++ b/helpers/sql-obfuscation/lib/opentelemetry/helpers/query_summary/parser.rb @@ -16,23 +16,24 @@ module QuerySummary # tokens = [Token.new(:keyword, "SELECT"), Token.new(:identifier, "users")] # Parser.build_summary_from_tokens(tokens) # => "SELECT users" class Parser - DEFAULT_STATE = :default + # Two states: normal parsing vs. 
waiting for table names + PARSING_STATE = :parsing EXPECT_COLLECTION_STATE = :expect_collection - MAIN_OPERATIONS = %w[SELECT INSERT DELETE].freeze - COLLECTION_OPERATIONS = %w[WITH UPDATE].freeze - TRIGGER_COLLECTION = %w[FROM INTO JOIN IN].freeze - TABLE_OPERATIONS = %w[CREATE ALTER DROP TRUNCATE].freeze - TABLE_OBJECTS = %w[TABLE INDEX PROCEDURE VIEW DATABASE].freeze + MAIN_OPERATIONS = %w[SELECT INSERT DELETE].freeze # Operations that start queries and need table names + COLLECTION_OPERATIONS = %w[WITH UPDATE].freeze # Operations that work with existing data and expect table names to follow + TRIGGER_COLLECTION = %w[FROM INTO JOIN IN].freeze # Keywords that signal a table name is coming next + TABLE_OPERATIONS = %w[CREATE ALTER DROP TRUNCATE].freeze # Database structure operations that create, modify, or remove objects + TABLE_OBJECTS = %w[TABLE INDEX PROCEDURE VIEW DATABASE].freeze # Types of database objects that can be created, modified, or removed class << self def build_summary_from_tokens(tokens) summary_parts = [] - state = DEFAULT_STATE - skip_until = 0 # Next token index to process; allows skipping tokens already consumed by previous operations + state = PARSING_STATE + skip_until = 0 # Skip tokens we've already processed when looking ahead tokens.each_with_index do |token, index| - next if index < skip_until # Skip already processed tokens + next if index < skip_until result = process_token(token, tokens, index, state) @@ -59,11 +60,11 @@ def process_main_operation(token, tokens, index, current_state) case upcased_value when *MAIN_OPERATIONS - add_to_summary(token.value, DEFAULT_STATE, index + 1) + add_to_summary(token.value, PARSING_STATE, index + 1) when *COLLECTION_OPERATIONS add_to_summary(token.value, EXPECT_COLLECTION_STATE, index + 1) when *TRIGGER_COLLECTION - trigger_collection_mode(index + 1) + expect_table_names_next(index + 1) when *TABLE_OPERATIONS handle_table_operation(token, tokens, index) when 'UNION' @@ -79,17 +80,19 @@ def 
process_collection_token(token, tokens, index, state) upcased_value = token.value.upcase if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value)) - handle_collection_identifier(token, tokens, index) + process_table_name_and_alias(token, tokens, index) elsif token.value == '(' || token.type == :operator handle_collection_operator(token, state, index) else - handle_collection_default(token, index) + return_to_normal_parsing(token, index) end end - def handle_collection_identifier(token, tokens, index) + def process_table_name_and_alias(token, tokens, index) + # Look ahead to skip table aliases (e.g., "users u" or "users AS u") skip_count = calculate_alias_skip(tokens, index) - new_state = tokens[index + 1 + skip_count]&.value == ',' ? EXPECT_COLLECTION_STATE : DEFAULT_STATE + # Check if there's a comma - if so, expect more table names in the list + new_state = tokens[index + 1 + skip_count]&.value == ',' ? EXPECT_COLLECTION_STATE : PARSING_STATE skip_count += 1 if tokens[index + 1 + skip_count]&.value == ',' { processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count } @@ -99,8 +102,8 @@ def handle_collection_operator(token, state, index) { processed: true, parts: [], new_state: state, next_index: index + 1 } end - def handle_collection_default(token, index) - { processed: true, parts: [], new_state: DEFAULT_STATE, next_index: index + 1 } + def return_to_normal_parsing(token, index) + { processed: true, parts: [], new_state: PARSING_STATE, next_index: index + 1 } end def identifier_like?(token) @@ -108,16 +111,17 @@ def identifier_like?(token) end def can_be_table_name?(upcased_value) - # Keywords that can also be used as table/object names in certain contexts + # Object types that can appear after DDL operations TABLE_OBJECTS.include?(upcased_value) end def calculate_alias_skip(tokens, index) + # Handle both "table AS alias" and "table alias" patterns next_token = tokens[index + 1] if 
next_token && next_token.value&.upcase == 'AS' - 2 # Skip 'AS' and the alias + 2 elsif next_token && next_token.type == :identifier - 1 # Skip the alias + 1 else 0 end @@ -127,7 +131,7 @@ def add_to_summary(part, new_state, next_index) { processed: true, parts: [part], new_state: new_state, next_index: next_index } end - def trigger_collection_mode(next_index) + def expect_table_names_next(next_index) { processed: true, parts: [], new_state: EXPECT_COLLECTION_STATE, next_index: next_index } end @@ -138,13 +142,14 @@ def not_processed(current_state, next_index) def handle_union(token, tokens, index) next_token = tokens[index + 1] if next_token && next_token.value&.upcase == 'ALL' - { processed: true, parts: ["#{token.value} #{next_token.value}"], new_state: DEFAULT_STATE, next_index: index + 2 } + { processed: true, parts: ["#{token.value} #{next_token.value}"], new_state: PARSING_STATE, next_index: index + 2 } else - add_to_summary(token.value, DEFAULT_STATE, index + 1) + add_to_summary(token.value, PARSING_STATE, index + 1) end end def handle_table_operation(token, tokens, index) + # Combine DDL operations with object types: "CREATE TABLE", "DROP INDEX", etc. 
next_token_obj = tokens[index + 1] next_token = next_token_obj&.value&.upcase @@ -152,7 +157,7 @@ def handle_table_operation(token, tokens, index) when 'TABLE', 'INDEX', 'PROCEDURE', 'VIEW', 'DATABASE' { processed: true, parts: ["#{token.value} #{next_token}"], new_state: EXPECT_COLLECTION_STATE, next_index: index + 2 } else - add_to_summary(token.value, DEFAULT_STATE, index + 1) + add_to_summary(token.value, PARSING_STATE, index + 1) end end end diff --git a/helpers/sql-obfuscation/test/fixtures/query_summary.json b/helpers/sql-obfuscation/test/fixtures/query_summary.json index 97f8c77b1f..ec0d505a4b 100644 --- a/helpers/sql-obfuscation/test/fixtures/query_summary.json +++ b/helpers/sql-obfuscation/test/fixtures/query_summary.json @@ -1,427 +1,436 @@ [ - { - "name": "numeric_literal_integers", - "input": { - "query": "SELECT 12, -12, +12" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "caching_query_summaries", - "input": { - "query": "SELECT 12, -12, +12" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "nil_input", - "input": { - "query": null - }, - "expected": { - "db.query.summary": "UNKNOWN" - } - }, - { - "name": "deeply_nested_subqueries", - "input": { - "query": "SELECT * FROM (SELECT * FROM (SELECT * FROM my_table))" - }, - "expected": { - "db.query.summary": "SELECT SELECT SELECT my_table" - } - }, - { - "name": "numeric_literal_with_decimal_point", - "input": { - "query": "SELECT 12.34, -12.34, +12.34, .01, -.01" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "numeric_literal_exponential", - "input": { - "query": "SELECT 12.34e56, -12.34e56, +12.34e56" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "numeric_literal_negative_exponential", - "input": { - "query": "SELECT 12.34e-56, -12.34e-56, +12.34e-56" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "arithmetic_on_numeric_literals", - "input": { - "query": "SELECT 
99+100" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "hex_literal", - "input": { - "query": "SELECT 0xDEADBEEF, 0XdeadBEEF" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "string_literal", - "input": { - "query": "SELECT 'hello'" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "string_literal_escaped_single_quote", - "input": { - "query": "SELECT 'My name''s not important'" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "string_with_embedded_newline", - "input": { - "query": "SELECT 'My name is \n not important'" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "numbers_in_identifiers", - "input": { - "query": "SELECT c3po, r2d2 FROM covid19 WHERE n1h1=1234" - }, - "expected": { - "db.query.summary": "SELECT covid19" - } - }, - { - "name": "periods_in_identifiers", - "input": { - "query": "SELECT a FROM dbo.Table JOIN dbo.AnotherTable" - }, - "expected": { - "db.query.summary": "SELECT dbo.Table dbo.AnotherTable" - } - }, - { - "name": "insert_into", - "input": { - "query": "INSERT INTO X VALUES(1, 23456, 123.456, 99+100)" - }, - "expected": { - "db.query.summary": "INSERT X" - } - }, - { - "name": "uuid", - "input": { - "query": "SELECT { guid '01234567-89ab-cdef-0123-456789abcdef' }" - }, - "expected": { - "db.query.summary": "SELECT" - } - }, - { - "name": "in_clause", - "input": { - "query": "SELECT * FROM table WHERE value IN (123, 456, 'abc')" - }, - "expected": { - "db.query.summary": "SELECT table" - } - }, - { - "name": "comments", - "input": { - "query": "SELECT column -- end of line comment\nFROM /* block \n comment */ table" - }, - "expected": { - "db.query.summary": "SELECT table" - } - }, - { - "name": "insert_into_select", - "input": { - "query": "INSERT INTO shipping_details\n(order_id,\naddress)\nSELECT order_id,\naddress\nFROM orders\nWHERE order_id = 1" - }, - "expected": { - "db.query.summary": "INSERT 
shipping_details SELECT orders" - } - }, - { - "name": "select_nested_query", - "input": { - "query": "SELECT order_date\nFROM (SELECT *\nFROM orders o\nJOIN customers c\nON o.customer_id = c.customer_id)" - }, - "expected": { - "db.query.summary": "SELECT SELECT orders customers" - } - }, - { - "name": "select_nested_query_case_preserved", - "input": { - "query": "SELEcT order_date\nFROM (sELECT *\nFROM orders o\nJOIN customers c\nON o.customer_id = c.customer_id)" - }, - "expected": { - "db.query.summary": "SELEcT sELECT orders customers" - } - }, - { - "name": "case_preserved", - "input": { - "query": "SELEcT order_date\nFROM ORders" - }, - "expected": { - "db.query.summary": "SELEcT ORders" - } - }, - { - "name": "cross_join", - "input": { - "query": "SELECT * FROM Orders o CROSS JOIN OrderDetails od" - }, - "expected": { - "db.query.summary": "SELECT Orders OrderDetails" - } - }, - { - "name": "cross_join_comma_separated_syntax", - "input": { - "query": "SELECT * FROM Orders o, OrderDetails od" - }, - "expected": { - "db.query.summary": "SELECT Orders OrderDetails" - } - }, - { - "name": "left_outer_join", - "input": { - "query": "SELECT c.name, o.id FROM customers c LEFT JOIN orders o ON c.id = o.customer_id" - }, - "expected": { - "db.query.summary": "SELECT customers orders" - } - }, - { - "name": "create_table", - "input": { - "query": "CREATE TABLE MyTable (\n ID NOT NULL IDENTITY(1,1) PRIMARY KEY\n)" - }, - "expected": { - "db.query.summary": "CREATE TABLE MyTable" - } - }, - { - "name": "alter_table", - "input": { - "query": "ALTER TABLE MyTable ADD Name varchar(255)" - }, - "expected": { - "db.query.summary": "ALTER TABLE MyTable" - } - }, - { - "name": "drop_table", - "input": { - "query": "DROP TABLE MyTable" - }, - "expected": { - "db.query.summary": "DROP TABLE MyTable" - } - }, - { - "name": "query_that_performs_multiple_operations", - "input": { - "query": "INSERT INTO shipping_details(order_id, address) SELECT order_id, address FROM orders WHERE 
order_id = ?" - }, - "expected": { - "db.query.summary": "INSERT shipping_details SELECT orders" - } - }, - { - "name": "query_that_performs_an_operation_thats_applied_to_multiple_collections", - "input": { - "db.system.name": "other_sql", - "query": "SELECT * FROM songs, artists WHERE songs.artist_id == artists.id" - }, - "expected": { - "db.query.summary": "SELECT songs artists" - } - }, - { - "name": "query_that_performs_operation_on_multiple_collections_with_double-quotes_or_other_punctuation", - "input": { - "query": "SELECT * FROM \"song list\", 'artists'" - }, - "expected": { - "db.query.summary": "SELECT \"song list\" 'artists'" - } - }, - { - "name": "update_statement", - "input": { - "query": "UPDATE Customers SET ContactName = 'Alfred Schmidt', City= 'Frankfurt' WHERE CustomerID = 1" - }, - "expected": { - "db.query.summary": "UPDATE Customers" - } - }, - { - "name": "delete_statement", - "input": { - "query": "DELETE FROM Customers WHERE CustomerName='Alfreds Futterkiste'" - }, - "expected": { - "db.query.summary": "DELETE Customers" - } - }, - { - "name": "truncate_table_statement", - "input": { - "query": "TRUNCATE TABLE Customers" - }, - "expected": { - "db.query.summary": "TRUNCATE TABLE Customers" - } - }, - { - "name": "with_clause_cte", - "input": { - "query": "WITH regional_sales AS (SELECT region, SUM(amount) AS total_sales FROM orders GROUP BY region) SELECT region, total_sales FROM regional_sales WHERE total_sales > 1000" - }, - "expected": { - "db.query.summary": "WITH regional_sales SELECT orders SELECT regional_sales" - } - }, - { - "name": "union_statement", - "input": { - "query": "SELECT City FROM Customers UNION ALL SELECT City FROM Suppliers ORDER BY City" - }, - "expected": { - "db.query.summary": "SELECT Customers UNION ALL SELECT Suppliers" - } - }, - { - "name": "group_by_and_having_clauses", - "input": { - "query": "SELECT COUNT(CustomerID), Country FROM Customers WHERE Country != 'USA' GROUP BY Country HAVING COUNT(CustomerID) > 
5" - }, - "expected": { - "db.query.summary": "SELECT Customers" - } - }, - { - "name": "boolean_and_null_literals", - "input": { - "query": "SELECT * FROM my_table WHERE a IS NOT NULL AND b = TRUE AND c = FALSE" - }, - "expected": { - "db.query.summary": "SELECT my_table" - } - }, - { - "name": "multiple_joins_and_aliases", - "input": { - "query": "SELECT o.OrderID, c.CustomerName, s.ShipperName FROM ((Orders AS o INNER JOIN Customers AS c ON o.CustomerID = c.CustomerID) INNER JOIN Shippers AS s ON o.ShipperID = s.ShipperID)" - }, - "expected": { - "db.query.summary": "SELECT Orders Customers Shippers" - } - }, - { - "name": "window_function_over_partition", - "input": { - "query": "SELECT name, salary, ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) as rank FROM employees" - }, - "expected": { - "db.query.summary": "SELECT employees" - } - }, - { - "name": "case_statement", - "input": { - "query": "SELECT OrderID, Quantity, CASE WHEN Quantity > 30 THEN 'Large' WHEN Quantity > 10 THEN 'Medium' ELSE 'Small' END AS QuantityText FROM OrderDetails" - }, - "expected": { - "db.query.summary": "SELECT OrderDetails" - } - }, - { - "name": "like_predicate", - "input": { - "query": "SELECT * FROM products WHERE product_name LIKE 'Chai%'" - }, - "expected": { - "db.query.summary": "SELECT products" - } - }, - { - "name": "between_predicate", - "input": { - "query": "SELECT * FROM products WHERE price BETWEEN 10 AND 20" - }, - "expected": { - "db.query.summary": "SELECT products" - } - }, - { - "name": "create_index", + { + "name": "numeric_literal_integers", "input": { - "query": "CREATE INDEX idx_name ON MyTable (column1)" + "query": "SELECT 12, -12, +12" }, "expected": { - "db.query.summary": "CREATE INDEX idx_name" + "db.query.summary": "SELECT" } - }, - { - "name": "create_database", + }, + { + "name": "caching_query_summaries", "input": { - "query": "CREATE DATABASE my_db" + "query": "SELECT 12, -12, +12" }, "expected": { - "db.query.summary": "CREATE 
DATABASE my_db" + "db.query.summary": "SELECT" } - }, - { - "name": "create_procedure", + }, + { + "name": "nil_input", "input": { - "query": "CREATE PROCEDURE my_proc AS BEGIN SELECT * FROM MyTable END" + "query": null }, "expected": { - "db.query.summary": "CREATE PROCEDURE my_proc SELECT MyTable" + "db.query.summary": "UNKNOWN" } - }, - { - "name": "oracle_angle_quote", - "input": { - "query": "select * from foo where bar=q'' and x=5" - }, - "expected": { - "db.query.summary": "select foo" - } - } + }, + { + "name": "deeply_nested_subqueries", + "input": { + "query": "SELECT * FROM (SELECT * FROM (SELECT * FROM my_table))" + }, + "expected": { + "db.query.summary": "SELECT SELECT SELECT my_table" + } + }, + { + "name": "numeric_literal_with_decimal_point", + "input": { + "query": "SELECT 12.34, -12.34, +12.34, .01, -.01" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "numeric_literal_exponential", + "input": { + "query": "SELECT 12.34e56, -12.34e56, +12.34e56" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "numeric_literal_negative_exponential", + "input": { + "query": "SELECT 12.34e-56, -12.34e-56, +12.34e-56" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "arithmetic_on_numeric_literals", + "input": { + "query": "SELECT 99+100" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "hex_literal", + "input": { + "query": "SELECT 0xDEADBEEF, 0XdeadBEEF" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "string_literal", + "input": { + "query": "SELECT 'hello'" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "string_literal_escaped_single_quote", + "input": { + "query": "SELECT 'My name''s not important'" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "string_with_embedded_newline", + "input": { + "query": "SELECT 'My name is \n not important'" + }, + "expected": { + 
"db.query.summary": "SELECT" + } + }, + { + "name": "numbers_in_identifiers", + "input": { + "query": "SELECT c3po, r2d2 FROM covid19 WHERE n1h1=1234" + }, + "expected": { + "db.query.summary": "SELECT covid19" + } + }, + { + "name": "periods_in_identifiers", + "input": { + "query": "SELECT a FROM dbo.Table JOIN dbo.AnotherTable" + }, + "expected": { + "db.query.summary": "SELECT dbo.Table dbo.AnotherTable" + } + }, + { + "name": "insert_into", + "input": { + "query": "INSERT INTO X VALUES(1, 23456, 123.456, 99+100)" + }, + "expected": { + "db.query.summary": "INSERT X" + } + }, + { + "name": "uuid", + "input": { + "query": "SELECT { guid '01234567-89ab-cdef-0123-456789abcdef' }" + }, + "expected": { + "db.query.summary": "SELECT" + } + }, + { + "name": "in_clause", + "input": { + "query": "SELECT * FROM table WHERE value IN (123, 456, 'abc')" + }, + "expected": { + "db.query.summary": "SELECT table" + } + }, + { + "name": "comments", + "input": { + "query": "SELECT column -- end of line comment\nFROM /* block \n comment */ table" + }, + "expected": { + "db.query.summary": "SELECT table" + } + }, + { + "name": "insert_into_select", + "input": { + "query": "INSERT INTO shipping_details\n(order_id,\naddress)\nSELECT order_id,\naddress\nFROM orders\nWHERE order_id = 1" + }, + "expected": { + "db.query.summary": "INSERT shipping_details SELECT orders" + } + }, + { + "name": "select_nested_query", + "input": { + "query": "SELECT order_date\nFROM (SELECT *\nFROM orders o\nJOIN customers c\nON o.customer_id = c.customer_id)" + }, + "expected": { + "db.query.summary": "SELECT SELECT orders customers" + } + }, + { + "name": "select_nested_query_case_preserved", + "input": { + "query": "SELEcT order_date\nFROM (sELECT *\nFROM orders o\nJOIN customers c\nON o.customer_id = c.customer_id)" + }, + "expected": { + "db.query.summary": "SELEcT sELECT orders customers" + } + }, + { + "name": "case_preserved", + "input": { + "query": "SELEcT order_date\nFROM ORders" + }, + 
"expected": { + "db.query.summary": "SELEcT ORders" + } + }, + { + "name": "cross_join", + "input": { + "query": "SELECT * FROM Orders o CROSS JOIN OrderDetails od" + }, + "expected": { + "db.query.summary": "SELECT Orders OrderDetails" + } + }, + { + "name": "cross_join_comma_separated_syntax", + "input": { + "query": "SELECT * FROM Orders o, OrderDetails od" + }, + "expected": { + "db.query.summary": "SELECT Orders OrderDetails" + } + }, + { + "name": "left_outer_join", + "input": { + "query": "SELECT c.name, o.id FROM customers c LEFT JOIN orders o ON c.id = o.customer_id" + }, + "expected": { + "db.query.summary": "SELECT customers orders" + } + }, + { + "name": "create_table", + "input": { + "query": "CREATE TABLE MyTable (\n ID NOT NULL IDENTITY(1,1) PRIMARY KEY\n)" + }, + "expected": { + "db.query.summary": "CREATE TABLE MyTable" + } + }, + { + "name": "alter_table", + "input": { + "query": "ALTER TABLE MyTable ADD Name varchar(255)" + }, + "expected": { + "db.query.summary": "ALTER TABLE MyTable" + } + }, + { + "name": "drop_table", + "input": { + "query": "DROP TABLE MyTable" + }, + "expected": { + "db.query.summary": "DROP TABLE MyTable" + } + }, + { + "name": "query_that_performs_multiple_operations", + "input": { + "query": "INSERT INTO shipping_details(order_id, address) SELECT order_id, address FROM orders WHERE order_id = ?" 
+ }, + "expected": { + "db.query.summary": "INSERT shipping_details SELECT orders" + } + }, + { + "name": "query_that_performs_an_operation_thats_applied_to_multiple_collections", + "input": { + "db.system.name": "other_sql", + "query": "SELECT * FROM songs, artists WHERE songs.artist_id == artists.id" + }, + "expected": { + "db.query.summary": "SELECT songs artists" + } + }, + { + "name": "query_that_performs_operation_on_multiple_collections_with_double-quotes_or_other_punctuation", + "input": { + "query": "SELECT * FROM \"song list\", 'artists'" + }, + "expected": { + "db.query.summary": "SELECT \"song list\" 'artists'" + } + }, + { + "name": "update_statement", + "input": { + "query": "UPDATE Customers SET ContactName = 'Alfred Schmidt', City= 'Frankfurt' WHERE CustomerID = 1" + }, + "expected": { + "db.query.summary": "UPDATE Customers" + } + }, + { + "name": "delete_statement", + "input": { + "query": "DELETE FROM Customers WHERE CustomerName='Alfreds Futterkiste'" + }, + "expected": { + "db.query.summary": "DELETE Customers" + } + }, + { + "name": "truncate_table_statement", + "input": { + "query": "TRUNCATE TABLE Customers" + }, + "expected": { + "db.query.summary": "TRUNCATE TABLE Customers" + } + }, + { + "name": "with_clause_cte", + "input": { + "query": "WITH regional_sales AS (SELECT region, SUM(amount) AS total_sales FROM orders GROUP BY region) SELECT region, total_sales FROM regional_sales WHERE total_sales > 1000" + }, + "expected": { + "db.query.summary": "WITH regional_sales SELECT orders SELECT regional_sales" + } + }, + { + "name": "union_statement", + "input": { + "query": "SELECT City FROM Customers UNION ALL SELECT City FROM Suppliers ORDER BY City" + }, + "expected": { + "db.query.summary": "SELECT Customers UNION ALL SELECT Suppliers" + } + }, + { + "name": "group_by_and_having_clauses", + "input": { + "query": "SELECT COUNT(CustomerID), Country FROM Customers WHERE Country != 'USA' GROUP BY Country HAVING COUNT(CustomerID) > 5" + }, + 
"expected": { + "db.query.summary": "SELECT Customers" + } + }, + { + "name": "boolean_and_null_literals", + "input": { + "query": "SELECT * FROM my_table WHERE a IS NOT NULL AND b = TRUE AND c = FALSE" + }, + "expected": { + "db.query.summary": "SELECT my_table" + } + }, + { + "name": "multiple_joins_and_aliases", + "input": { + "query": "SELECT o.OrderID, c.CustomerName, s.ShipperName FROM ((Orders AS o INNER JOIN Customers AS c ON o.CustomerID = c.CustomerID) INNER JOIN Shippers AS s ON o.ShipperID = s.ShipperID)" + }, + "expected": { + "db.query.summary": "SELECT Orders Customers Shippers" + } + }, + { + "name": "window_function_over_partition", + "input": { + "query": "SELECT name, salary, ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) as rank FROM employees" + }, + "expected": { + "db.query.summary": "SELECT employees" + } + }, + { + "name": "case_statement", + "input": { + "query": "SELECT OrderID, Quantity, CASE WHEN Quantity > 30 THEN 'Large' WHEN Quantity > 10 THEN 'Medium' ELSE 'Small' END AS QuantityText FROM OrderDetails" + }, + "expected": { + "db.query.summary": "SELECT OrderDetails" + } + }, + { + "name": "like_predicate", + "input": { + "query": "SELECT * FROM products WHERE product_name LIKE 'Chai%'" + }, + "expected": { + "db.query.summary": "SELECT products" + } + }, + { + "name": "between_predicate", + "input": { + "query": "SELECT * FROM products WHERE price BETWEEN 10 AND 20" + }, + "expected": { + "db.query.summary": "SELECT products" + } + }, + { + "name": "create_index", + "input": { + "query": "CREATE INDEX idx_name ON MyTable (column1)" + }, + "expected": { + "db.query.summary": "CREATE INDEX idx_name" + } + }, + { + "name": "create_database", + "input": { + "query": "CREATE DATABASE my_db" + }, + "expected": { + "db.query.summary": "CREATE DATABASE my_db" + } + }, + { + "name": "create_procedure", + "input": { + "query": "CREATE PROCEDURE my_proc AS BEGIN SELECT * FROM MyTable END" + }, + "expected": { + 
"db.query.summary": "CREATE PROCEDURE my_proc SELECT MyTable" + } + }, + { + "name": "oracle_angle_quote", + "input": { + "query": "select * from foo where bar=q'' and x=5" + }, + "expected": { + "db.query.summary": "select foo" + } + }, + { + "name": "cassandra_blobs", + "input" : { + "query": "select * from foo where bar=0xabcdef123 and x=5" + }, + "expected": { + "db.query.summary": "select foo" + } + } ]