-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Fix SELECT DISTINCT with delimited alias in ORDER BY bug #27004
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,20 +25,27 @@ | |
| public class CanonicalizationAware<T extends Node> | ||
| { | ||
| private final T node; | ||
| private final boolean ignoreCase; | ||
|
|
||
| // Updates to this field are thread-safe despite benign data race due to: | ||
| // 1. idempotent hash computation | ||
| // 2. atomic updates to int fields per JMM | ||
| private int hashCode; | ||
|
|
||
| private CanonicalizationAware(T node) | ||
| private CanonicalizationAware(T node, boolean ignoreCase) | ||
| { | ||
| this.node = requireNonNull(node, "node is null"); | ||
| this.ignoreCase = ignoreCase; | ||
| } | ||
|
|
||
| public static <T extends Node> CanonicalizationAware<T> canonicalizationAwareKey(T node) | ||
| { | ||
| return new CanonicalizationAware<T>(node); | ||
| return new CanonicalizationAware<T>(node, false); | ||
| } | ||
|
|
||
| public static <T extends Node> CanonicalizationAware<T> canonicalizationAwareKey(T node, boolean ignoreCase) | ||
| { | ||
| return new CanonicalizationAware<T>(node, ignoreCase); | ||
| } | ||
|
|
||
| public T getNode() | ||
|
|
@@ -51,7 +58,8 @@ public int hashCode() | |
| { | ||
| int hash = hashCode; | ||
| if (hash == 0) { | ||
| hash = treeHash(node, CanonicalizationAware::canonicalizationAwareHash); | ||
| hash = treeHash(node, | ||
| ignoreCase ? CanonicalizationAware::canonicalizationAwareIgnoreCaseHash : CanonicalizationAware::canonicalizationAwareHash); | ||
| if (hash == 0) { | ||
| hash = 1; | ||
| } | ||
|
|
@@ -73,7 +81,8 @@ public boolean equals(Object o) | |
| } | ||
|
|
||
| CanonicalizationAware<T> other = (CanonicalizationAware<T>) o; | ||
| return treeEqual(node, other.node, CanonicalizationAware::canonicalizationAwareComparison); | ||
| return treeEqual(node, other.node, | ||
| ignoreCase ? CanonicalizationAware::canonicalizationAwareIgnoreCaseComparison : CanonicalizationAware::canonicalizationAwareComparison); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -91,6 +100,15 @@ public static Boolean canonicalizationAwareComparison(Node left, Node right) | |
| return null; | ||
| } | ||
|
|
||
| public static Boolean canonicalizationAwareIgnoreCaseComparison(Node left, Node right) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we are ignoring case comparison - Say if we have a |
||
| { | ||
| if (left instanceof Identifier leftIdentifier && right instanceof Identifier rightIdentifier) { | ||
| return leftIdentifier.getCanonicalValue(true).equals(rightIdentifier.getCanonicalValue(true)); | ||
| } | ||
|
|
||
| return null; | ||
| } | ||
|
|
||
| public static OptionalInt canonicalizationAwareHash(Node node) | ||
| { | ||
| if (node instanceof Identifier identifier) { | ||
|
|
@@ -101,4 +119,15 @@ public static OptionalInt canonicalizationAwareHash(Node node) | |
| } | ||
| return OptionalInt.empty(); | ||
| } | ||
|
|
||
| public static OptionalInt canonicalizationAwareIgnoreCaseHash(Node node) | ||
| { | ||
| if (node instanceof Identifier identifier) { | ||
| return OptionalInt.of(identifier.getCanonicalValue(true).hashCode()); | ||
| } | ||
| if (node.getChildren().isEmpty()) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. issue (bug_risk): Including node.hashCode() for leaf nodes in canonicalizationAwareIgnoreCaseHash may introduce inconsistency. This mismatch between hashCode and equals for non-Identifier leaf nodes may cause issues in hash-based collections. Please ensure both methods are consistent. |
||
| return OptionalInt.of(node.hashCode()); | ||
| } | ||
| return OptionalInt.empty(); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5764,7 +5764,7 @@ private void verifySelectDistinct(QuerySpecification node, List<Expression> orde | |
| // SELECT a FROM t ORDER BY a | ||
| // the "a" in the SELECT clause is bound to the FROM scope, while the "a" in ORDER BY clause is bound | ||
| // to the "a" from the SELECT clause, so we can't compare by field id / relation id. | ||
| if (expression instanceof Identifier && aliases.contains(canonicalizationAwareKey(expression))) { | ||
| if (expression instanceof Identifier && aliases.contains(canonicalizationAwareKey(expression, true))) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question (bug_risk): The use of ignoreCase=true for all alias comparisons may affect queries with delimited identifiers. This change may result in incorrect behavior for queries using case-sensitive delimited identifiers. Please review whether this could impact query accuracy. |
||
| continue; | ||
| } | ||
|
|
||
|
|
@@ -5787,13 +5787,13 @@ private Set<CanonicalizationAware<Identifier>> getAliases(Select node) | |
| if (item instanceof SingleColumn column) { | ||
| Optional<Identifier> alias = column.getAlias(); | ||
| if (alias.isPresent()) { | ||
| aliases.add(canonicalizationAwareKey(alias.get())); | ||
| aliases.add(canonicalizationAwareKey(alias.get(), true)); | ||
| } | ||
| else if (column.getExpression() instanceof Identifier identifier) { | ||
| aliases.add(canonicalizationAwareKey(identifier)); | ||
| aliases.add(canonicalizationAwareKey(identifier, true)); | ||
| } | ||
| else if (column.getExpression() instanceof DereferenceExpression dereferenceExpression) { | ||
| aliases.add(canonicalizationAwareKey(dereferenceExpression.getField().orElseThrow())); | ||
| aliases.add(canonicalizationAwareKey(dereferenceExpression.getField().orElseThrow(), true)); | ||
| } | ||
| } | ||
| else if (item instanceof AllColumns allColumns) { | ||
|
|
@@ -5803,10 +5803,10 @@ else if (item instanceof AllColumns allColumns) { | |
| Field field = fields.get(i); | ||
|
|
||
| if (!allColumns.getAliases().isEmpty()) { | ||
| aliases.add(canonicalizationAwareKey(allColumns.getAliases().get(i))); | ||
| aliases.add(canonicalizationAwareKey(allColumns.getAliases().get(i), true)); | ||
| } | ||
| else if (field.getName().isPresent()) { | ||
| aliases.add(canonicalizationAwareKey(new Identifier(field.getName().get()))); | ||
| aliases.add(canonicalizationAwareKey(new Identifier(field.getName().get()), true)); | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't right. The whole purpose of this class is to compare identifiers taking into account SQL canonicalization rules. I.e., delimited identifiers are kept as is, while non-delimited identifiers are canonicalized (to upper-case, per the SQL standard) before comparison.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, if the adopted convention is similar to Oracle (non-delimited identifiers converted to uppercase, delimited identifiers kept as they are), then the changes in this PR are not ok.
The changes in this PR try to follow the Trino documentation https://trino.io/docs/current/language/reserved.html#language-identifiers
"Identifiers are not treated as case sensitive."
I read the "Identifiers" as meaning all identifiers, non-delimited and delimited.
The SELECT (without DISTINCT) follows the convention in the online documentation, by ignoring the case of all identifiers.
The changes here try to bring the SELECT DISTINCT to be similar to SELECT.
I modified the CanonicalizationAware, instead of removing its usage from SELECT DISTINCT, because the Identifier's equals and hashCode consider the case sensitive value.
I will copy this comment on the issue #26755, as the issue is more visible, and this PR seems to be wrong.