Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract all fields from filter:includeRegex and filter:excludeRegex … #2450

Merged
merged 3 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import java.util.List;
import java.util.Set;

import org.apache.commons.jexl3.parser.ASTAndNode;
import org.apache.commons.jexl3.parser.ASTFunctionNode;
import org.apache.commons.jexl3.parser.ASTIdentifier;
import org.apache.commons.jexl3.parser.ASTOrNode;
Expand All @@ -21,8 +20,6 @@
import datawave.query.jexl.JexlASTHelper;
import datawave.query.jexl.JexlNodeFactory;
import datawave.query.jexl.functions.FunctionJexlNodeVisitor;
import datawave.query.language.functions.jexl.Jexl;
import datawave.query.language.tree.FunctionNode;
import datawave.query.parser.JavaRegexAnalyzer;
import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException;
import datawave.query.util.MetadataHelper;
Expand All @@ -41,16 +38,16 @@ public class RegexFunctionVisitor extends FunctionIndexQueryExpansionVisitor {

private static final Logger log = ThreadConfigurableLogger.getLogger(RegexFunctionVisitor.class);

protected Set<String> nonEventFields;
protected Set<String> indexOnlyFields;

public RegexFunctionVisitor(ShardQueryConfiguration config, MetadataHelper metadataHelper, Set<String> nonEventFields) {
public RegexFunctionVisitor(ShardQueryConfiguration config, MetadataHelper metadataHelper, Set<String> indexOnlyFields) {
super(config, metadataHelper, null);
this.nonEventFields = nonEventFields;
this.indexOnlyFields = indexOnlyFields;
}

@SuppressWarnings("unchecked")
public static <T extends JexlNode> T expandRegex(ShardQueryConfiguration config, MetadataHelper metadataHelper, Set<String> nonEventFields, T script) {
RegexFunctionVisitor visitor = new RegexFunctionVisitor(config, metadataHelper, nonEventFields);
public static <T extends JexlNode> T expandRegex(ShardQueryConfiguration config, MetadataHelper metadataHelper, Set<String> indexOnlyFields, T script) {
RegexFunctionVisitor visitor = new RegexFunctionVisitor(config, metadataHelper, indexOnlyFields);
JexlNode root = (T) script.jjtAccept(visitor, null);
root = TreeFlatteningRebuildingVisitor.flatten(root);
return (T) root;
Expand All @@ -67,66 +64,59 @@ public Object visit(ASTFunctionNode node, Object data) {

List<ASTIdentifier> identifiers = JexlASTHelper.getIdentifiers(arguments.get(0));
List<JexlNode> extractChildren = new ArrayList<>(identifiers.size());
List<JexlNode> keepChildren = new ArrayList<>(identifiers.size());

boolean extractFields = false;
for (ASTIdentifier identifier : identifiers) {
JexlNode regexNode = buildRegexNode(identifier, functionMetadata.name(), JexlNodes.getIdentifierOrLiteralAsString(arguments.get(1)));
if (regexNode != null) {
extractChildren.add(regexNode);
} else {
keepChildren.add(JexlNodeFactory.buildIdentifier(identifier.getName()));
String field = JexlASTHelper.deconstructIdentifier(identifier.getName());
// if the function contains any index-only fields, extract them all from the function
if (indexOnlyFields.contains(field.toUpperCase())) {
extractFields = true;
break;
}

}

if (extractChildren.size() == 0) {
// nothing to rewrite
if (!extractFields) {
return returnNode;
} else if (keepChildren.size() == 0) {
// rewrite all nodes
if (identifiers.size() == 1) {
// we've already rewritten our one node
returnNode = extractChildren.get(0);
} else {
if (functionMetadata.name().equals(INCLUDE_REGEX) && arguments.get(0) instanceof ASTOrNode) {
returnNode = JexlNodeFactory.createOrNode(extractChildren);
} else {
// build an AND node because of how DeMorgan's law works with expanding negations
returnNode = JexlNodeFactory.createAndNode(extractChildren);
}
}
} else {
// construct each and put it all together
JexlNode extractNode;
List<JexlNode> joint = new ArrayList<>();
List<JexlNode> newArgs = new ArrayList<>();

if (functionMetadata.name().equals(INCLUDE_REGEX) && arguments.get(0) instanceof ASTOrNode) {
newArgs.add(JexlNodeFactory.createOrNode(keepChildren));
extractNode = JexlNodeFactory.createOrNode(extractChildren);
} else {
newArgs.add(JexlNodeFactory.createAndNode(keepChildren));
extractNode = JexlNodeFactory.createAndNode(extractChildren);
}

newArgs.add(arguments.get(1));
JexlNode newFunc = FunctionJexlNodeVisitor.makeFunctionFrom(functionMetadata.namespace(), functionMetadata.name(),
newArgs.toArray(new JexlNode[0]));

joint.add(extractNode);
joint.add(newFunc);
for (ASTIdentifier identifier : identifiers) {
JexlNode regexNode = buildRegexNode(identifier, functionMetadata.name(), JexlNodes.getIdentifierOrLiteralAsString(arguments.get(1)));
if (regexNode != null) {
extractChildren.add(regexNode);
}
}

if (functionMetadata.name().equals(INCLUDE_REGEX) && arguments.get(0) instanceof ASTOrNode) {
returnNode = JexlNodeFactory.createOrNode(joint);
if (extractChildren.size() == 0) {
// nothing to rewrite
return returnNode;
} else {
returnNode = JexlNodeFactory.createAndNode(joint);
// rewrite all nodes
if (identifiers.size() == 1) {
// we've already rewritten our one node
returnNode = extractChildren.get(0);
} else {
if (functionMetadata.name().equals(INCLUDE_REGEX)) {
if (arguments.get(0) instanceof ASTOrNode) {
returnNode = JexlNodeFactory.createOrNode(extractChildren);
} else {
returnNode = JexlNodeFactory.createAndNode(extractChildren);
}
} else {
// Follow DeMorgan's law when expanding negations
if (arguments.get(0) instanceof ASTOrNode) {
returnNode = JexlNodeFactory.createAndNode(extractChildren);
} else {
returnNode = JexlNodeFactory.createOrNode(extractChildren);
}
}
}
}
}
}

if (log.isTraceEnabled()) {
log.trace("Rewrote \"" + JexlStringBuildingVisitor.buildQueryWithoutParse(node) + "\" into \""
+ JexlStringBuildingVisitor.buildQueryWithoutParse(returnNode) + "\"");
if (log.isTraceEnabled()) {
log.trace("Rewrote \"" + JexlStringBuildingVisitor.buildQueryWithoutParse(node) + "\" into \""
+ JexlStringBuildingVisitor.buildQueryWithoutParse(returnNode) + "\"");
}
}
return returnNode;
}
Expand All @@ -144,21 +134,20 @@ public Object visit(ASTFunctionNode node, Object data) {
*/
private JexlNode buildRegexNode(ASTIdentifier identifier, String functionName, String regex) {
String field = JexlASTHelper.deconstructIdentifier(identifier.getName());
if (nonEventFields.contains(field.toUpperCase())) {
try {
JavaRegexAnalyzer jra = new JavaRegexAnalyzer(regex);
if (!jra.isNgram()) {
if (functionName.equals(INCLUDE_REGEX)) {
return JexlNodeFactory.buildERNode(field, regex);
} else {
return JexlNodeFactory.buildNRNode(field, regex);
}
try {
JavaRegexAnalyzer jra = new JavaRegexAnalyzer(regex);
if (!jra.isNgram()) {
if (functionName.equals(INCLUDE_REGEX)) {
return JexlNodeFactory.buildERNode(field, regex);
} else {
return JexlNodeFactory.buildNRNode(field, regex);
}
} catch (JavaRegexParseException e) {
QueryException qe = new QueryException(DatawaveErrorCode.INVALID_REGEX);
throw new DatawaveFatalQueryException(qe);
}
} catch (JavaRegexParseException e) {
QueryException qe = new QueryException(DatawaveErrorCode.INVALID_REGEX);
throw new DatawaveFatalQueryException(qe);
}

return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,22 @@ public void testFilterFunctionsInvalidatedByIndexOnlyFields() throws ParseExcept
}
}

@Test
public void testMultiFieldInclude() throws Exception {
eventQueryLogic.setParser(new LuceneToJexlQueryParser());
Map<String,String> extraParameters = new HashMap<>();
// @formatter:off
String[] queryStrings = {
"UUID:SOPRANO AND #INCLUDE(LOCATION || POSIZIONE || NAME, 'newjersey')"
};
@SuppressWarnings("unchecked")
List<String>[] expectedLists = new List[] {Collections.singletonList("SOPRANO")};
for (int i = 0; i < queryStrings.length; i++) {
runTestQuery(expectedLists[i], queryStrings[i], format.parse("20091231"), format.parse("20150101"), extraParameters);
}

}

@Test
public void testDelayedExceededValueThresholdRegexTFField() throws Exception {
Map<String,String> extraParameters = new HashMap();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,59 +70,59 @@ public void testRewriteMultiFieldedIncludeRegex() throws ParseException {
public void testRewriteMultiHeteroIndexOnlyFieldsIncludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDB");
String query = "FOO == 'bar' && filter:includeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && (FIELDB =~ 'ba.*' || filter:includeRegex(FIELDA || FIELDC, 'ba.*'))";
String expected = "FOO == 'bar' && (FIELDA =~ 'ba.*' || FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:includeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDB =~ 'ba.*' || filter:includeRegex(FIELDA || FIELDC, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA =~ 'ba.*' || FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

@Test
public void testRewriteMultiHeteroIndexOnlyFieldsIncludeRegex2() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDB", "FIELDC");
String query = "FOO == 'bar' && filter:includeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && (FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*' || filter:includeRegex(FIELDA, 'ba.*'))";
String expected = "FOO == 'bar' && (FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*' || FIELDA =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:includeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*' || filter:includeRegex(FIELDA, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*' || FIELDA =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

@Test
public void testRewriteHeteroIndexOnlyFieldsExcludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA", "FIELDB");
String query = "FOO == 'bar' && filter:excludeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && filter:excludeRegex(FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && FIELDC !~ 'ba.*'";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:excludeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && filter:excludeRegex(FIELDC, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && FIELDC !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

@Test
public void testANDIncludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA", "FIELDB");
String query = "FOO == 'bar' && filter:includeRegex(FIELDA&&FIELDB&&FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA =~ 'ba.*' && FIELDB =~ 'ba.*' && filter:includeRegex(FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA =~ 'ba.*' && FIELDB =~ 'ba.*' && FIELDC =~ 'ba.*'";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:includeRegex(FIELDA&&FIELDB&&FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDA =~ 'ba.*' && FIELDB =~ 'ba.*' && filter:includeRegex(FIELDC, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA =~ 'ba.*' && FIELDB =~ 'ba.*' && FIELDC =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

@Test
public void testANDExcludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA", "FIELDB");
String query = "FOO == 'bar' && filter:excludeRegex(FIELDA&&FIELDB&&FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && filter:excludeRegex(FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && (FIELDA !~ 'ba.*' || FIELDB !~ 'ba.*' || FIELDC !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:excludeRegex(FIELDA&&FIELDB&&FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && filter:excludeRegex(FIELDC, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' || FIELDB !~ 'ba.*' || FIELDC !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

Expand All @@ -142,11 +142,11 @@ public void testRewriteMultiFieldedExcludeRegex() throws ParseException {
public void testRewriteMultiHeteroFieldedExcludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA");
String query = "FOO == 'bar' && filter:excludeRegex(FIELDA||FIELDB, 'ba.*')";
String expected = "FOO == 'bar' && (FIELDA !~ 'ba.*' && filter:excludeRegex(FIELDB, 'ba.*'))";
String expected = "FOO == 'bar' && (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:excludeRegex(FIELDA||FIELDB, 'ba.*')";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && filter:excludeRegex(FIELDB, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

Expand All @@ -166,7 +166,7 @@ public void testEndWildCardNotIndexOnly() throws ParseException {
public void testMixedEventNonEvent() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("NON_EVENT_FIELD");
String query = "filter:includeRegex(EVENT_FIELD || NON_EVENT_FIELD,'all_.*?')";
String expected = " NON_EVENT_FIELD =~ 'all_.*?' || filter:includeRegex(EVENT_FIELD, 'all_.*?')";
String expected = "EVENT_FIELD =~ 'all_.*?' || NON_EVENT_FIELD =~ 'all_.*?'";
assertVisitorResult(query, expected, indexOnlyFields);
}

Expand All @@ -186,6 +186,15 @@ public void testDoubleEndedWildCard2() throws ParseException {
assertVisitorResult(query, query, indexOnlyFields);
}

@Test
public void testMultiFieldDoubleEndedWildCard() throws ParseException {
hlgp marked this conversation as resolved.
Show resolved Hide resolved
// we cannot extract the index only field because we don't support double ended wildcards against the index. Queries like this will fail in a later
// visitor
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA");
String query = "FOO == 'bar' && filter:includeRegex(FIELDA||FIELDB,'.*all_.*')";
assertVisitorResult(query, query, indexOnlyFields);
}

@Test
public void testBadRegex() throws ParseException {
exception.expect(DatawaveFatalQueryException.class);
Expand Down
Loading