From 6e27e5ad8e37944605bef61e14f1972f440e4724 Mon Sep 17 00:00:00 2001 From: Naomichi Yamakita Date: Wed, 14 Aug 2024 06:25:57 +0900 Subject: [PATCH] Fix parseToken array popping (#13620) * Fix parseToken array popping Signed-off-by: naomichi-y * Add fix description Signed-off-by: naomichi-y * Refactor JsonToStringXContentParser#parseToken This method was pretty complicated, hard to follow, and therefore prone to bugs. This change introduces a proper stack (rather than a StringBuilder) to keep track of fields under fields. When ever we parse a field name, we push it onto the stack, recursively parse its value (an object, an array, or a primitive value), then we pop (guaranteeing pushes and pops are balanced). Signed-off-by: Michael Froh * Remove use of org.apache.logging.log4j.util.Strings Signed-off-by: Michael Froh * Make sure JsonToStringXContentParser ends on END_OBJECT There is logic in DocumentParser that expects any object parser to start with currentToken() pointing to START_OBJECT, and return with currentToken() pointing to END_OBJECT. My previous commits were stepping over the start/end, and returning on the token following the end. Signed-off-by: Michael Froh * Throw exception when flat object is null The existing behavior throws an exception on null flat object by advancing the parser beyond the end of the flat object input. We could simply skip a null flat_object field, but this would be a behavioral change from 2.7. Signed-off-by: Michael Froh --------- Signed-off-by: naomichi-y Signed-off-by: Michael Froh Co-authored-by: Michael Froh --- CHANGELOG.md | 1 + .../xcontent/JsonToStringXContentParser.java | 122 ++++++++---------- .../index/mapper/FlatObjectFieldMapper.java | 9 +- .../JsonToStringXContentParserTests.java | 57 +++++++- 4 files changed, 117 insertions(+), 72 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34cd4c2097e48..cf0d032cc75ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix missing value of FieldSort for unsigned_long ([#14963](https://github.com/opensearch-project/OpenSearch/pull/14963)) - Fix delete index template failed when the index template matches a data stream but is unused ([#15080](https://github.com/opensearch-project/OpenSearch/pull/15080)) - Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126)) +- Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620)) ### Security diff --git a/server/src/main/java/org/opensearch/common/xcontent/JsonToStringXContentParser.java b/server/src/main/java/org/opensearch/common/xcontent/JsonToStringXContentParser.java index 998122d9e5c43..d24571fc5778d 100644 --- a/server/src/main/java/org/opensearch/common/xcontent/JsonToStringXContentParser.java +++ b/server/src/main/java/org/opensearch/common/xcontent/JsonToStringXContentParser.java @@ -9,6 +9,7 @@ package org.opensearch.common.xcontent; import org.opensearch.common.xcontent.json.JsonXContent; +import org.opensearch.core.common.Strings; import org.opensearch.core.common.bytes.BytesReference; import org.opensearch.core.xcontent.AbstractXContentParser; import org.opensearch.core.xcontent.DeprecationHandler; @@ -23,6 +24,9 @@ import java.math.BigInteger; import java.nio.CharBuffer; import java.util.ArrayList; +import java.util.Collections; +import java.util.Deque; +import java.util.LinkedList; /** * JsonToStringParser is the main parser class to transform JSON into stringFields in a XContentParser @@ -32,21 +36,20 @@ */ public class JsonToStringXContentParser extends AbstractXContentParser { private final String fieldTypeName; - private XContentParser parser; + private final XContentParser parser; - private ArrayList valueList = new ArrayList<>(); - private ArrayList valueAndPathList = new ArrayList<>(); - private ArrayList keyList = new ArrayList<>(); + private final ArrayList valueList = new ArrayList<>(); + private final ArrayList valueAndPathList = new ArrayList<>(); + private final ArrayList keyList = new ArrayList<>(); - private XContentBuilder builder = XContentBuilder.builder(JsonXContent.jsonXContent); + private final XContentBuilder builder = XContentBuilder.builder(JsonXContent.jsonXContent); - private NamedXContentRegistry xContentRegistry; + private final NamedXContentRegistry xContentRegistry; - private DeprecationHandler deprecationHandler; + private final DeprecationHandler deprecationHandler; private static final String VALUE_AND_PATH_SUFFIX = "._valueAndPath"; private static final String VALUE_SUFFIX = "._value"; - private static final String DOT_SYMBOL = "."; private static final String EQUAL_SYMBOL = "="; public JsonToStringXContentParser( @@ -63,9 +66,14 @@ public JsonToStringXContentParser( } public XContentParser parseObject() throws IOException { + assert currentToken() == Token.START_OBJECT; + parser.nextToken(); // Skip the outer START_OBJECT. Need to return on END_OBJECT. + builder.startObject(); - StringBuilder path = new StringBuilder(fieldTypeName); - parseToken(path, null); + LinkedList path = new LinkedList<>(Collections.singleton(fieldTypeName)); + while (currentToken() != Token.END_OBJECT) { + parseToken(path); + } builder.field(this.fieldTypeName, keyList); builder.field(this.fieldTypeName + VALUE_SUFFIX, valueList); builder.field(this.fieldTypeName + VALUE_AND_PATH_SUFFIX, valueAndPathList); @@ -74,75 +82,55 @@ public XContentParser parseObject() throws IOException { return JsonXContent.jsonXContent.createParser(this.xContentRegistry, this.deprecationHandler, String.valueOf(jString)); } - private void parseToken(StringBuilder path, String currentFieldName) throws IOException { - - while (this.parser.nextToken() != Token.END_OBJECT) { - if (this.parser.currentName() != null) { - currentFieldName = this.parser.currentName(); + private void parseToken(Deque path) throws IOException { + if (this.parser.currentToken() == Token.FIELD_NAME) { + String fieldName = this.parser.currentName(); + path.addLast(fieldName); // Pushing onto the stack *must* be matched by pop + String parts = fieldName; + while (parts.contains(".")) { // Extract the intermediate keys maybe present in fieldName + int dotPos = parts.indexOf('.'); + String part = parts.substring(0, dotPos); + this.keyList.add(part); + parts = parts.substring(dotPos + 1); } - StringBuilder parsedFields = new StringBuilder(); - - if (this.parser.currentToken() == Token.FIELD_NAME) { - path.append(DOT_SYMBOL).append(currentFieldName); - int dotIndex = currentFieldName.indexOf(DOT_SYMBOL); - String fieldNameSuffix = currentFieldName; - // The field name may be of the form foo.bar.baz - // If that's the case, each "part" is a key. - while (dotIndex >= 0) { - String fieldNamePrefix = fieldNameSuffix.substring(0, dotIndex); - if (!fieldNamePrefix.isEmpty()) { - this.keyList.add(fieldNamePrefix); - } - fieldNameSuffix = fieldNameSuffix.substring(dotIndex + 1); - dotIndex = fieldNameSuffix.indexOf(DOT_SYMBOL); - } - if (!fieldNameSuffix.isEmpty()) { - this.keyList.add(fieldNameSuffix); - } - } else if (this.parser.currentToken() == Token.START_ARRAY) { - parseToken(path, currentFieldName); - break; - } else if (this.parser.currentToken() == Token.END_ARRAY) { - // skip - } else if (this.parser.currentToken() == Token.START_OBJECT) { - parseToken(path, currentFieldName); - int dotIndex = path.lastIndexOf(DOT_SYMBOL, path.length()); - - if (dotIndex != -1 && path.length() > currentFieldName.length()) { - path.setLength(path.length() - currentFieldName.length() - 1); - } - } else { - if (!path.toString().contains(currentFieldName)) { - path.append(DOT_SYMBOL).append(currentFieldName); - } - parseValue(parsedFields); - this.valueList.add(parsedFields.toString()); - this.valueAndPathList.add(path + EQUAL_SYMBOL + parsedFields); - int dotIndex = path.lastIndexOf(DOT_SYMBOL, path.length()); - if (dotIndex != -1 && path.length() > currentFieldName.length()) { - path.setLength(path.length() - currentFieldName.length() - 1); - } + this.keyList.add(parts); // parts has no dot, so either it's the original fieldName or it's the last part + this.parser.nextToken(); // advance to the value of fieldName + parseToken(path); // parse the value for fieldName (which will be an array, an object, or a primitive value) + path.removeLast(); // Here is where we pop fieldName from the stack (since we're done with the value of fieldName) + // Note that whichever other branch we just passed through has already ended with nextToken(), so we + // don't need to call it. + } else if (this.parser.currentToken() == Token.START_ARRAY) { + parser.nextToken(); + while (this.parser.currentToken() != Token.END_ARRAY) { + parseToken(path); } - + this.parser.nextToken(); + } else if (this.parser.currentToken() == Token.START_OBJECT) { + parser.nextToken(); + while (this.parser.currentToken() != Token.END_OBJECT) { + parseToken(path); + } + this.parser.nextToken(); + } else if (this.parser.currentToken().isValue()) { + String parsedValue = parseValue(); + if (parsedValue != null) { + this.valueList.add(parsedValue); + this.valueAndPathList.add(Strings.collectionToDelimitedString(path, ".") + EQUAL_SYMBOL + parsedValue); + } + this.parser.nextToken(); } } - private void parseValue(StringBuilder parsedFields) throws IOException { + private String parseValue() throws IOException { switch (this.parser.currentToken()) { case VALUE_BOOLEAN: case VALUE_NUMBER: case VALUE_STRING: case VALUE_NULL: - parsedFields.append(this.parser.textOrNull()); - break; + return this.parser.textOrNull(); // Handle other token types as needed - case FIELD_NAME: - case VALUE_EMBEDDED_OBJECT: - case END_ARRAY: - case START_ARRAY: - break; default: - throw new IOException("Unsupported token type [" + parser.currentToken() + "]"); + throw new IOException("Unsupported value token type [" + parser.currentToken() + "]"); } } diff --git a/server/src/main/java/org/opensearch/index/mapper/FlatObjectFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/FlatObjectFieldMapper.java index 9a3f2595a7c9e..b82fa3999612a 100644 --- a/server/src/main/java/org/opensearch/index/mapper/FlatObjectFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/FlatObjectFieldMapper.java @@ -568,8 +568,12 @@ protected void parseCreateField(ParseContext context) throws IOException { if (context.externalValueSet()) { String value = context.externalValue().toString(); parseValueAddFields(context, value, fieldType().name()); + } else if (context.parser().currentToken() == XContentParser.Token.VALUE_NULL) { + context.parser().nextToken(); // This triggers an exception in DocumentParser. + // We could remove the above nextToken() call to skip the null value, but the existing + // behavior (since 2.7) throws the exception. } else { - JsonToStringXContentParser JsonToStringParser = new JsonToStringXContentParser( + JsonToStringXContentParser jsonToStringParser = new JsonToStringXContentParser( NamedXContentRegistry.EMPTY, DeprecationHandler.IGNORE_DEPRECATIONS, context.parser(), @@ -579,7 +583,7 @@ protected void parseCreateField(ParseContext context) throws IOException { JsonToStringParser is the main parser class to transform JSON into stringFields in a XContentParser It reads the JSON object and parsed to a list of string */ - XContentParser parser = JsonToStringParser.parseObject(); + XContentParser parser = jsonToStringParser.parseObject(); XContentParser.Token currentToken; while ((currentToken = parser.nextToken()) != XContentParser.Token.END_OBJECT) { @@ -594,7 +598,6 @@ protected void parseCreateField(ParseContext context) throws IOException { } } - } } diff --git a/server/src/test/java/org/opensearch/common/xcontent/JsonToStringXContentParserTests.java b/server/src/test/java/org/opensearch/common/xcontent/JsonToStringXContentParserTests.java index 0feb7bcd1ceec..a0f5150981a08 100644 --- a/server/src/test/java/org/opensearch/common/xcontent/JsonToStringXContentParserTests.java +++ b/server/src/test/java/org/opensearch/common/xcontent/JsonToStringXContentParserTests.java @@ -19,7 +19,6 @@ public class JsonToStringXContentParserTests extends OpenSearchTestCase { private String flattenJsonString(String fieldName, String in) throws IOException { - String transformed; try ( XContentParser parser = JsonXContent.jsonXContent.createParser( xContentRegistry(), @@ -33,10 +32,11 @@ private String flattenJsonString(String fieldName, String in) throws IOException parser, fieldName ); - // Skip the START_OBJECT token: + // Point to the first token (should be START_OBJECT) jsonToStringXContentParser.nextToken(); XContentParser transformedParser = jsonToStringXContentParser.parseObject(); + assertSame(XContentParser.Token.END_OBJECT, jsonToStringXContentParser.currentToken()); try (XContentBuilder jsonBuilder = XContentFactory.jsonBuilder()) { jsonBuilder.copyCurrentStructure(transformedParser); return jsonBuilder.toString(); @@ -110,4 +110,57 @@ public void testNestChildObjectWithDotsAndFieldWithDots() throws IOException { ); } + public void testArrayOfObjects() throws IOException { + String jsonExample = "{" + + "\"field\": {" + + " \"detail\": {" + + " \"foooooooooooo\": [" + + " {\"name\":\"baz\"}," + + " {\"name\":\"baz\"}" + + " ]" + + " }" + + "}}"; + + assertEquals( + "{" + + "\"flat\":[\"field\",\"detail\",\"foooooooooooo\",\"name\",\"name\"]," + + "\"flat._value\":[\"baz\",\"baz\"]," + + "\"flat._valueAndPath\":[" + + "\"flat.field.detail.foooooooooooo.name=baz\"," + + "\"flat.field.detail.foooooooooooo.name=baz\"" + + "]}", + flattenJsonString("flat", jsonExample) + ); + } + + public void testArraysOfObjectsAndValues() throws IOException { + String jsonExample = "{" + + "\"field\": {" + + " \"detail\": {" + + " \"foooooooooooo\": [" + + " {\"name\":\"baz\"}," + + " {\"name\":\"baz\"}" + + " ]" + + " }," + + " \"numbers\" : [" + + " 1," + + " 2," + + " 3" + + " ]" + + "}}"; + + assertEquals( + "{" + + "\"flat\":[\"field\",\"detail\",\"foooooooooooo\",\"name\",\"name\",\"numbers\"]," + + "\"flat._value\":[\"baz\",\"baz\",\"1\",\"2\",\"3\"]," + + "\"flat._valueAndPath\":[" + + "\"flat.field.detail.foooooooooooo.name=baz\"," + + "\"flat.field.detail.foooooooooooo.name=baz\"," + + "\"flat.field.numbers=1\"," + + "\"flat.field.numbers=2\"," + + "\"flat.field.numbers=3\"" + + "]}", + flattenJsonString("flat", jsonExample) + ); + } }