From d58531062de4fe18e5c475cdd845ea0579f6e40b Mon Sep 17 00:00:00 2001 From: YANGDB Date: Sat, 24 Aug 2024 23:42:09 -0700 Subject: [PATCH] Add support of GroK command including default patterns Signed-off-by: YANGDB Signed-off-by: YANGDB --- .../spark/ppl/FlintSparkPPLGrokITSuite.scala | 17 +- .../opensearch/sql/ppl/utils/ParseUtils.java | 394 +++++++++--------- ...PLLogicalPlanGrokTranslatorTestSuite.scala | 92 ++-- 3 files changed, 272 insertions(+), 231 deletions(-) diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGrokITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGrokITSuite.scala index e9a43e10d..b2ac390d3 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGrokITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGrokITSuite.scala @@ -5,11 +5,11 @@ package org.opensearch.flint.spark.ppl +import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} import org.apache.spark.sql.catalyst.expressions.{Alias, Coalesce, Descending, GreaterThan, Literal, NullsLast, RegExpExtract, SortOrder} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.streaming.StreamTest -import org.apache.spark.sql.{QueryTest, Row} class FlintSparkPPLGrokITSuite extends QueryTest @@ -67,8 +67,13 @@ class FlintSparkPPLGrokITSuite val emailAttribute = UnresolvedAttribute("email") val hostAttribute = UnresolvedAttribute("host") val hostExpression = Alias( - Coalesce(Seq(RegExpExtract(emailAttribute, Literal(".+@(\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b))"), - Literal("1")))), + Coalesce( + Seq( + RegExpExtract( + emailAttribute, + Literal( + ".+@(\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b))"), + Literal("1")))), "host")() val expectedPlan = Project( Seq(emailAttribute, hostAttribute), @@ -80,7 +85,7 @@ class FlintSparkPPLGrokITSuite test("test parse email expressions parsing filter & sort by age") { val frame = sql(s""" - | source = $testTable| parse email '.+@(?.+)' | where age > 45 | sort - age | fields age, email, host ; + | source = $testTable| grok email '.+@%{HOSTNAME:host}' | where age > 45 | sort - age | fields age, email, host ; | """.stripMargin) // Retrieve the results @@ -119,7 +124,7 @@ class FlintSparkPPLGrokITSuite test("test parse email expressions and group by count host ") { val frame = sql(s""" - | source = $testTable| parse email '.+@(?.+)' | stats count() by host + | source = $testTable| grok email '.+@%{HOSTNAME:host}' | stats count() by host | """.stripMargin) // Retrieve the results @@ -167,7 +172,7 @@ class FlintSparkPPLGrokITSuite test("test parse email expressions and top count_host ") { val frame = sql(s""" - | source = $testTable| parse email '.+@(?.+)' | top 1 host + | source = $testTable| grok email '.+@%{HOSTNAME:host}' | top 1 host | """.stripMargin) // Retrieve the results diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseUtils.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseUtils.java index 48f8b00fd..dea62cfd7 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseUtils.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseUtils.java @@ -28,94 +28,27 @@ import java.util.stream.Collectors; public class ParseUtils { - private static final Pattern GROUP_PATTERN = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>"); - private static final String NEW_FIELD_KEY = "new_field"; + private static final Pattern GROUP_PATTERN = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>"); + private static final String NEW_FIELD_KEY = "new_field"; - /** - * Construct corresponding ParseExpression by {@link ParseMethod}. - * - * @param parseMethod method used to parse - * @param pattern pattern used for parsing - * @param identifier derived field - * @return {@link ParseExpression} - */ - public static ParseExpression createParseExpression( - ParseMethod parseMethod, String pattern, String identifier) { - switch (parseMethod) { - case GROK: return new GrokExpression(pattern, identifier); - case PATTERNS: return new PatternsExpression(pattern, identifier); - default: return new RegexExpression(pattern, identifier); - } - } - - /** - * Get list of derived fields based on parse pattern. - * - * @param pattern pattern used for parsing - * @return list of names of the derived fields - */ - public static List getNamedGroupCandidates( - ParseMethod parseMethod, String pattern, Map arguments) { - switch (parseMethod) { - case REGEX: - return RegexExpression.getNamedGroupCandidates(pattern); - case GROK: - return GrokExpression.getNamedGroupCandidates(pattern); - default: - return GrokExpression.getNamedGroupCandidates( - arguments.containsKey(NEW_FIELD_KEY) - ? (String) arguments.get(NEW_FIELD_KEY).getValue() - : null); - } - } - - /** - * extract the cleaner pattern without the additional fields - * @param parseMethod - * @param pattern - * @param columns - * @return - */ - public static String extractPatterns( - ParseMethod parseMethod, String pattern, List columns) { - switch (parseMethod) { - case REGEX: - return RegexExpression.extractPattern(pattern, columns); - case GROK: - return GrokExpression.extractPattern(pattern, columns); - default: - return PatternsExpression.extractPattern(pattern, columns); - } - } - - public static String extractPattern(String patterns, List columns) { - StringBuilder result = new StringBuilder(); - Matcher matcher = GROUP_PATTERN.matcher(patterns); - - int lastEnd = 0; - while (matcher.find()) { - String groupName = matcher.group(1); - if (columns.contains(groupName)) { - result.append(patterns, lastEnd, matcher.start()); - result.append("("); - lastEnd = matcher.end(); - } - } - result.append(patterns.substring(lastEnd)); - return result.toString(); - } - - public static abstract class ParseExpression { - abstract String parseValue(String value); - } - - public static class RegexExpression extends ParseExpression{ - private final Pattern regexPattern; - protected final String identifier; - - public RegexExpression(String patterns, String identifier) { - this.regexPattern = Pattern.compile(patterns); - this.identifier = identifier; + /** + * Construct corresponding ParseExpression by {@link ParseMethod}. + * + * @param parseMethod method used to parse + * @param pattern pattern used for parsing + * @param identifier derived field + * @return {@link ParseExpression} + */ + public static ParseExpression createParseExpression( + ParseMethod parseMethod, String pattern, String identifier) { + switch (parseMethod) { + case GROK: + return new GrokExpression(pattern, identifier); + case PATTERNS: + return new PatternsExpression(pattern, identifier); + default: + return new RegexExpression(pattern, identifier); + } } /** @@ -124,137 +57,202 @@ public RegexExpression(String patterns, String identifier) { * @param pattern pattern used for parsing * @return list of names of the derived fields */ - public static List getNamedGroupCandidates(String pattern) { - ImmutableList.Builder namedGroups = ImmutableList.builder(); - Matcher m = GROUP_PATTERN.matcher(pattern); - while (m.find()) { - namedGroups.add(m.group(1)); - } - return namedGroups.build(); + public static List getNamedGroupCandidates( + ParseMethod parseMethod, String pattern, Map arguments) { + switch (parseMethod) { + case REGEX: + return RegexExpression.getNamedGroupCandidates(pattern); + case GROK: + return GrokExpression.getNamedGroupCandidates(pattern); + default: + return GrokExpression.getNamedGroupCandidates( + arguments.containsKey(NEW_FIELD_KEY) + ? (String) arguments.get(NEW_FIELD_KEY).getValue() + : null); + } } - @Override - public String parseValue(String value) { - Matcher matcher = regexPattern.matcher(value); - if (matcher.matches()) { - return matcher.group(identifier); - } - return ""; + /** + * extract the cleaner pattern without the additional fields + * + * @param parseMethod + * @param pattern + * @param columns + * @return + */ + public static String extractPatterns( + ParseMethod parseMethod, String pattern, List columns) { + switch (parseMethod) { + case REGEX: + return RegexExpression.extractPattern(pattern, columns); + case GROK: + return GrokExpression.extractPattern(pattern, columns); + default: + return PatternsExpression.extractPattern(pattern, columns); + } } - + public static String extractPattern(String patterns, List columns) { - return ParseUtils.extractPattern(patterns, columns); + AtomicReference patternsContainer = new AtomicReference<>(patterns); + // Iterate through each column (which is actually a group name) + for (String column : columns) { + patternsContainer.set(patternsContainer.get().replace("?<"+column+">", "")); + } + return patternsContainer.get(); } - } - - public static class GrokExpression extends ParseExpression { - private static final GrokCompiler grokCompiler = GrokCompiler.newInstance(); - static { - grokCompiler.registerDefaultPatterns(); + public static abstract class ParseExpression { + abstract String parseValue(String value); } - private final Grok grok; - private final String identifier; + public static class RegexExpression extends ParseExpression { + private final Pattern regexPattern; + protected final String identifier; - public GrokExpression(String pattern, String identifier) { - this.grok = grokCompiler.compile(pattern); - this.identifier = identifier; - } + public RegexExpression(String patterns, String identifier) { + this.regexPattern = Pattern.compile(patterns); + this.identifier = identifier; + } - @Override - public String parseValue(String value) { - Match grokMatch = grok.match(value); - Map capture = grokMatch.capture(); - Object match = capture.get(identifier); - if (match != null) { - return match.toString(); - } - return ""; - } + /** + * Get list of derived fields based on parse pattern. + * + * @param pattern pattern used for parsing + * @return list of names of the derived fields + */ + public static List getNamedGroupCandidates(String pattern) { + ImmutableList.Builder namedGroups = ImmutableList.builder(); + Matcher m = GROUP_PATTERN.matcher(pattern); + while (m.find()) { + namedGroups.add(m.group(1)); + } + return namedGroups.build(); + } - /** - * Get list of derived fields based on parse pattern. - * - * @param pattern pattern used for parsing - * @return list of names of the derived fields - */ - public static List getNamedGroupCandidates(String pattern) { - Grok grok = grokCompiler.compile(pattern); - return grok.namedGroups.stream() - .map(grok::getNamedRegexCollectionById) - .filter(group -> !group.equals("UNWANTED")) - .collect(Collectors.toUnmodifiableList()); - } + @Override + public String parseValue(String value) { + Matcher matcher = regexPattern.matcher(value); + if (matcher.matches()) { + return matcher.group(identifier); + } + return ""; + } - public static String extractPattern(final String patterns, List columns) { - AtomicReference cleanedPattern = new AtomicReference<>(patterns); - Grok grok = grokCompiler.compile(patterns); - columns.forEach(group -> { - if (grok.getNamedRegexCollection().containsValue(group)) { - String groupName = GrokUtils.getKeyByValue(grok.getNamedRegexCollection(), group); - cleanedPattern.set(ParseUtils.extractPattern(grok.getNamedRegex(), List.of(groupName))); + public static String extractPattern(String patterns, List columns) { + return ParseUtils.extractPattern(patterns, columns); } - }); - return cleanedPattern.get(); } - } - - public static class PatternsExpression extends ParseExpression{ - public static final String DEFAULT_NEW_FIELD = "patterns_field"; - private static final ImmutableSet DEFAULT_IGNORED_CHARS = - ImmutableSet.copyOf( - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" - .chars() - .mapToObj(c -> (char) c) - .toArray(Character[]::new)); - private final boolean useCustomPattern; - private Pattern pattern; + public static class GrokExpression extends ParseExpression { + private static final GrokCompiler grokCompiler = GrokCompiler.newInstance(); - /** - * PatternsExpression. - * - * @param pattern pattern used for parsing - * @param identifier derived field - */ - public PatternsExpression(String pattern, String identifier) { - useCustomPattern = !pattern.isEmpty(); - if (useCustomPattern) { - this.pattern = Pattern.compile(pattern); - } - } + static { + grokCompiler.registerDefaultPatterns(); + } - @Override - public String parseValue(String value) { - if (useCustomPattern) { - return pattern.matcher(value).replaceAll(""); - } + private final Grok grok; + private final String identifier; - char[] chars = value.toCharArray(); - int pos = 0; - for (int i = 0; i < chars.length; i++) { - if (!DEFAULT_IGNORED_CHARS.contains(chars[i])) { - chars[pos++] = chars[i]; + public GrokExpression(String pattern, String identifier) { + this.grok = grokCompiler.compile(pattern); + this.identifier = identifier; } - } - return new String(chars, 0, pos); - } - /** - * Get list of derived fields. - * - * @param identifier identifier used to generate the field name - * @return list of names of the derived fields - */ - public static List getNamedGroupCandidates(String identifier) { - return ImmutableList.of(Objects.requireNonNullElse(identifier, DEFAULT_NEW_FIELD)); + @Override + public String parseValue(String value) { + Match grokMatch = grok.match(value); + Map capture = grokMatch.capture(); + Object match = capture.get(identifier); + if (match != null) { + return match.toString(); + } + return ""; + } + + /** + * Get list of derived fields based on parse pattern. + * + * @param pattern pattern used for parsing + * @return list of names of the derived fields + */ + public static List getNamedGroupCandidates(String pattern) { + Grok grok = grokCompiler.compile(pattern); + return grok.namedGroups.stream() + .map(grok::getNamedRegexCollectionById) + .filter(group -> !group.equals("UNWANTED")) + .collect(Collectors.toUnmodifiableList()); + } + + public static String extractPattern(final String patterns, List columns) { + Grok grok = grokCompiler.compile(patterns); + AtomicReference cleanedPattern = new AtomicReference<>(grok.getNamedRegex()); + columns.stream() + .filter(group -> !group.equals("UNWANTED")) + .forEach(group -> { + if (grok.getNamedRegexCollection().containsValue(group)) { + String groupName = GrokUtils.getKeyByValue(grok.getNamedRegexCollection(), group); + cleanedPattern.set(ParseUtils.extractPattern(cleanedPattern.get(), List.of(groupName))); + } + }); + return cleanedPattern.get(); + } } - - public static String extractPattern(String patterns, List columns) { - //todo implement - return patterns; + + public static class PatternsExpression extends ParseExpression { + public static final String DEFAULT_NEW_FIELD = "patterns_field"; + + private static final ImmutableSet DEFAULT_IGNORED_CHARS = + ImmutableSet.copyOf( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + .chars() + .mapToObj(c -> (char) c) + .toArray(Character[]::new)); + private final boolean useCustomPattern; + private Pattern pattern; + + /** + * PatternsExpression. + * + * @param pattern pattern used for parsing + * @param identifier derived field + */ + public PatternsExpression(String pattern, String identifier) { + useCustomPattern = !pattern.isEmpty(); + if (useCustomPattern) { + this.pattern = Pattern.compile(pattern); + } + } + + @Override + public String parseValue(String value) { + if (useCustomPattern) { + return pattern.matcher(value).replaceAll(""); + } + + char[] chars = value.toCharArray(); + int pos = 0; + for (int i = 0; i < chars.length; i++) { + if (!DEFAULT_IGNORED_CHARS.contains(chars[i])) { + chars[pos++] = chars[i]; + } + } + return new String(chars, 0, pos); + } + + /** + * Get list of derived fields. + * + * @param identifier identifier used to generate the field name + * @return list of names of the derived fields + */ + public static List getNamedGroupCandidates(String identifier) { + return ImmutableList.of(Objects.requireNonNullElse(identifier, DEFAULT_NEW_FIELD)); + } + + public static String extractPattern(String patterns, List columns) { + //todo implement + return patterns; + } } - } - + } diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGrokTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGrokTranslatorTestSuite.scala index e323eaf27..8a0928969 100644 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGrokTranslatorTestSuite.scala +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGrokTranslatorTestSuite.scala @@ -5,18 +5,19 @@ package org.opensearch.flint.spark.ppl -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, Coalesce, Descending, GreaterThan, Literal, NullsFirst, NullsLast, RegExpExtract, SortOrder} -import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical._ +import java.util +import java.util.Map + import org.opensearch.flint.spark.ppl.PlaneUtils.plan import org.opensearch.sql.common.grok.{Grok, GrokCompiler, Match} import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor} import org.scalatest.matchers.should.Matchers -import java.util -import java.util.Map +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, Coalesce, Descending, GreaterThan, Literal, NullsFirst, NullsLast, RegExpExtract, SortOrder} +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical._ class PPLLogicalPlanGrokTranslatorTestSuite extends SparkFunSuite @@ -31,10 +32,10 @@ class PPLLogicalPlanGrokTranslatorTestSuite val grokCompiler = GrokCompiler.newInstance grokCompiler.registerDefaultPatterns() - /* Grok pattern to compile, here httpd logs *//* Grok pattern to compile, here httpd logs */ + /* Grok pattern to compile, here httpd logs */ /* Grok pattern to compile, here httpd logs */ val grok = grokCompiler.compile(".+@%{HOSTNAME:host}") - /* Line of log to match *//* Line of log to match */ + /* Line of log to match */ /* Line of log to match */ val log = "iii@gmail.com" val gm = grok.`match`(log) @@ -52,7 +53,13 @@ class PPLLogicalPlanGrokTranslatorTestSuite val emailAttribute = UnresolvedAttribute("email") val hostAttribute = UnresolvedAttribute("host") val hostExpression = Alias( - Coalesce(Seq(RegExpExtract(emailAttribute, Literal(".+@(\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b))"), Literal("1")))), + Coalesce( + Seq( + RegExpExtract( + emailAttribute, + Literal( + ".+@(\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b))"), + Literal("1")))), "host")() val expectedPlan = Project( Seq(emailAttribute, hostAttribute), @@ -66,18 +73,22 @@ class PPLLogicalPlanGrokTranslatorTestSuite val context = new CatalystPlanContext val logPlan = planTransformer.visit( - plan(pplParser, "source=apache | grok message '%{COMMONAPACHELOG}' | fields COMMONAPACHELOG, timestamp, response, bytes", false), + plan( + pplParser, + "source=apache | grok message '%{COMMONAPACHELOG}' | fields COMMONAPACHELOG, timestamp, response, bytes", + false), context) - val emailAttribute = UnresolvedAttribute("email") - val hostExpression = Alias( - Coalesce(Seq(RegExpExtract(emailAttribute, Literal(".+@(.+)"), Literal("1")))), - "email")() + val logAttribute = UnresolvedAttribute("COMMONAPACHELOG") + val timestampAttribute = UnresolvedAttribute("timestamp") + val responseAttribute = UnresolvedAttribute("response") + val bytesAttribute = UnresolvedAttribute("bytes") + val logExpression = Alias( + Coalesce(Seq(RegExpExtract(logAttribute, Literal(".+@(.+)"), Literal("1")))), + "COMMONAPACHELOG")() val expectedPlan = Project( - Seq(emailAttribute), - Project( - Seq(emailAttribute, hostExpression, UnresolvedStar(None)), - UnresolvedRelation(Seq("t")))) + Seq(logAttribute, timestampAttribute, responseAttribute, bytesAttribute), + Project(Seq(logExpression, UnresolvedStar(None)), UnresolvedRelation(Seq("t")))) assert(compareByString(expectedPlan) === compareByString(logPlan)) } @@ -95,7 +106,13 @@ class PPLLogicalPlanGrokTranslatorTestSuite val emailAttribute = UnresolvedAttribute("email") val ageAttribute = UnresolvedAttribute("age") val hostExpression = Alias( - Coalesce(Seq(RegExpExtract(emailAttribute, Literal(".+@(.+)"), Literal(1)))), + Coalesce( + Seq( + RegExpExtract( + emailAttribute, + Literal( + ".+@(\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b))"), + Literal(1)))), "host")() // Define the corrected expected plan @@ -108,7 +125,7 @@ class PPLLogicalPlanGrokTranslatorTestSuite GreaterThan(ageAttribute, Literal(45)), Project( Seq(emailAttribute, hostExpression, UnresolvedStar(None)), - UnresolvedRelation(Seq("t")))))) + UnresolvedRelation(Seq("accounts")))))) assert(compareByString(expectedPlan) === compareByString(logPlan)) } @@ -127,7 +144,13 @@ class PPLLogicalPlanGrokTranslatorTestSuite val evalResultAttribute = UnresolvedAttribute("eval_result") val hostExpression = Alias( - Coalesce(Seq(RegExpExtract(emailAttribute, Literal(".+@(.+)"), Literal("1")))), + Coalesce( + Seq( + RegExpExtract( + emailAttribute, + Literal( + ".+@(\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b))"), + Literal("1")))), "host")() val evalResultExpression = Alias(Literal(1), "eval_result")() @@ -138,21 +161,30 @@ class PPLLogicalPlanGrokTranslatorTestSuite Seq(UnresolvedStar(None), evalResultExpression), Project( Seq(emailAttribute, hostExpression, UnresolvedStar(None)), - UnresolvedRelation(Seq("t"))))) + UnresolvedRelation(Seq("accounts"))))) assert(compareByString(expectedPlan) === compareByString(logPlan)) } - + test("test parse email expressions and group by count host ") { val context = new CatalystPlanContext val logPlan = planTransformer.visit( - plan(pplParser, "source=t | grok email '.+@%{HOSTNAME:host}' | stats count() by host", false), + plan( + pplParser, + "source=t | grok email '.+@%{HOSTNAME:host}' | stats count() by host", + false), context) val emailAttribute = UnresolvedAttribute("email") val hostAttribute = UnresolvedAttribute("host") val hostExpression = Alias( - Coalesce(Seq(RegExpExtract(emailAttribute, Literal(".+@(.+)"), Literal(1)))), + Coalesce( + Seq( + RegExpExtract( + emailAttribute, + Literal( + ".+@(\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b))"), + Literal(1)))), "host")() // Define the corrected expected plan @@ -183,7 +215,13 @@ class PPLLogicalPlanGrokTranslatorTestSuite val emailAttribute = UnresolvedAttribute("email") val hostAttribute = UnresolvedAttribute("host") val hostExpression = Alias( - Coalesce(Seq(RegExpExtract(emailAttribute, Literal(".+@(.+)"), Literal(1)))), + Coalesce( + Seq( + RegExpExtract( + emailAttribute, + Literal( + ".+@(\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b))"), + Literal(1)))), "host")() val sortedPlan = Sort(