From af402d8f5ea4cb698cd2156fddbe511fb7f1a831 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 22 Apr 2024 08:15:51 -0500 Subject: [PATCH 1/2] Let big data gen set nullability recursively Signed-off-by: Robert (Bobby) Evans --- .../spark/sql/tests/datagen/bigDataGen.scala | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala index da8f9461e2e..91335afe4e6 100644 --- a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala +++ b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala @@ -609,6 +609,15 @@ abstract class DataGen(var conf: ColumnConf, this } + def setNullProbabilityRecursively(probability: Double): DataGen = { + this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) + children.foreach { + case (_, dataGen) => + dataGen.setNullProbabilityRecursively(probability) + } + this + } + /** * Set a specific location to seed mapping for the value generation. */ @@ -672,6 +681,7 @@ abstract class DataGen(var conf: ColumnConf, * Get the default value generator for this specific data gen. */ protected def getValGen: GeneratorFunction + def children: Seq[(String, DataGen)] /** * Get the final ready to use GeneratorFunction for the data generator. @@ -823,6 +833,8 @@ class BooleanGen(conf: ColumnConf, override def dataType: DataType = BooleanType override protected def getValGen: GeneratorFunction = BooleanGenFunc() + + override def children: Seq[(String, DataGen)] = Seq.empty } /** @@ -878,6 +890,8 @@ class ByteGen(conf: ColumnConf, extends DataGen(conf, defaultValueRange) { override def getValGen: GeneratorFunction = ByteGenFunc() override def dataType: DataType = ByteType + + override def children: Seq[(String, DataGen)] = Seq.empty } /** @@ -935,6 +949,8 @@ class ShortGen(conf: ColumnConf, override def getValGen: GeneratorFunction = ShortGenFunc() override def dataType: DataType = ShortType + + override def children: Seq[(String, DataGen)] = Seq.empty } /** @@ -991,6 +1007,8 @@ class IntGen(conf: ColumnConf, override def getValGen: GeneratorFunction = IntGenFunc() override def dataType: DataType = IntegerType + + override def children: Seq[(String, DataGen)] = Seq.empty } /** @@ -1045,6 +1063,8 @@ class LongGen(conf: ColumnConf, override def getValGen: GeneratorFunction = LongGenFunc() override def dataType: DataType = LongType + + override def children: Seq[(String, DataGen)] = Seq.empty } case class Decimal32GenFunc( @@ -1284,6 +1304,8 @@ class DecimalGen(dt: DecimalType, val max = DecimalGen.genMaxUnscaled(dt.precision) DecimalGenFunc(dt.precision, dt.scale, -max, max) } + + override def children: Seq[(String, DataGen)] = Seq.empty } /** @@ -1341,6 +1363,8 @@ class TimestampGen(conf: ColumnConf, override protected def getValGen: GeneratorFunction = TimestampGenFunc() override def dataType: DataType = TimestampType + + override def children: Seq[(String, DataGen)] = Seq.empty } object BigDataGenConsts { @@ -1418,6 +1442,8 @@ class DateGen(conf: ColumnConf, override protected def getValGen: GeneratorFunction = DateGenFunc() override def dataType: DataType = DateType + + override def children: Seq[(String, DataGen)] = Seq.empty } /** @@ -1440,6 +1466,8 @@ class DoubleGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) override def dataType: DataType = DoubleType override protected def getValGen: GeneratorFunction = DoubleGenFunc() + + 
override def children: Seq[(String, DataGen)] = Seq.empty } /** @@ -1462,6 +1490,8 @@ class FloatGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) override def dataType: DataType = FloatType override protected def getValGen: GeneratorFunction = FloatGenFunc() + + override def children: Seq[(String, DataGen)] = Seq.empty } trait JSONType { @@ -1648,6 +1678,8 @@ class StringGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) override def dataType: DataType = StringType override protected def getValGen: GeneratorFunction = ASCIIGenFunc() + + override def children: Seq[(String, DataGen)] = Seq.empty } case class StructGenFunc(childGens: Array[GeneratorFunction]) extends GeneratorFunction { @@ -1752,6 +1784,8 @@ class ArrayGen(child: DataGen, None } } + + override def children: Seq[(String, DataGen)] = Seq(("data", child)) } case class MapGenFunc( @@ -1816,6 +1850,8 @@ class MapGen(key: DataGen, None } } + + override def children: Seq[(String, DataGen)] = Seq(("key", key), ("value", value)) } @@ -1864,6 +1900,11 @@ class ColumnGen(val dataGen: DataGen) { this } + def setNullProbabilityRecursively(probability: Double): ColumnGen = { + dataGen.setNullProbabilityRecursively(probability) + this + } + def setNullGen(f: NullGeneratorFunction): ColumnGen = { dataGen.setNullGen(f) this @@ -1973,6 +2014,14 @@ class TableGen(val columns: Seq[(String, ColumnGen)], numRows: Long) { this } + def setNullProbabilityRecursively(probability: Double): TableGen = { + columns.foreach { + case (_, columnGen) => + columnGen.setNullProbabilityRecursively(probability) + } + this + } + /** * Convert this table into a `DataFrame` that can be * written out or used directly. Writing it out to parquet From 967001b7508257caf510d5c5d3049b441d8d26c4 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 22 Apr 2024 08:28:22 -0500 Subject: [PATCH 2/2] Add in some benchmarks for get_json_object Signed-off-by: Robert (Bobby) Evans --- benchark/get_json_object_stress_gen.scala | 258 ++++++++++++++++++++++ benchark/get_json_object_stress_run.scala | 92 ++++++++ 2 files changed, 350 insertions(+) create mode 100644 benchark/get_json_object_stress_gen.scala create mode 100644 benchark/get_json_object_stress_run.scala diff --git a/benchark/get_json_object_stress_gen.scala b/benchark/get_json_object_stress_gen.scala new file mode 100644 index 00000000000..a6e7948f767 --- /dev/null +++ b/benchark/get_json_object_stress_gen.scala @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +spark.conf.set("spark.rapids.sql.enabled", false) + +import org.apache.spark.sql.tests.datagen._ +import org.apache.spark.sql.types._ + +val numRows = 3000000 +//val nullProbability = 0.1 +val nullProbability = 0.0001 +val output = "/data/tmp/SCALE_FROM_JSON" + +def doIt(): Unit = { + val ts_0 = StructField("test_string_0", StringType) + val ts_1 = StructField("test_string_1", StringType) + val ts_2 = StructField("test_string_2", StringType) + val ts_3 = StructField("test_string_3", StringType) + val ts_4 = StructField("test_string_4", StringType) + val ts_5 = StructField("test_string_5", StringType) + val ts_6 = StructField("test_string_6", StringType) + val ts_7 = StructField("test_string_7", StringType) + + val ti_0 = StructField("test_int_0", IntegerType) + + val tl_0 = StructField("test_long_0", LongType) + + val item001 = StructField("item001", StructType(Seq(ts_0, ts_1, ti_0))) + val item005 = StructField("item005", FloatType) + val item004 = StructField("item004", StructType(Seq(item005, ts_0))) + val t_struct_0 = StructField("test_struct_0", StructType(Seq(item005, ts_0))) + val item003 = StructField("item003", StructType(Seq(item004, t_struct_0, ts_2))) + val item002 = StructField("item002", ArrayType(StructType(Seq(item003)))) + val item006 = StructField("item006", StringType) + val t_struct_1 = StructField("test_struct_1", StructType(Seq(ts_0, ts_1, ts_2, ti_0))) + val t_struct_2 = StructField("test_struct_2", StructType(Seq(ts_0, ts_1, ts_2, ts_3, ts_4))) + val t_struct_3 = StructField("test_struct_3", StructType(Seq(ts_0, ts_1, ts_2, ts_3, ts_4, ts_5, ts_6))) + val t_struct_4 = StructField("test_struct_4", StructType(Seq(ts_0, ts_1, ts_2, ts_3, ts_4, ts_5, ts_6))) + val t_struct_5 = StructField("test_struct_5", StructType(Seq(t_struct_4))) + val item008 = StructField("item008", ArrayType(StructType(Seq(t_struct_2, t_struct_3)))) + val item007 = StructField("item007", StructType(Seq(t_struct_1, item008, t_struct_5))) + val item013 = StructField("item013", StringType) + val item014 = StructField("item014", StringType) + val item015 = StructField("item015", StringType) + val item012 = StructField("item012", StructType(Seq(item013, ts_0, item014, ts_1, ts_2, ts_3, ts_4, item015, ts_5, ts_6, ts_7))) + val t_struct_7 = StructField("test_struct_7", StructType(Seq(ts_0))) + val t_struct_8 = StructField("test_struct_8", StructType(Seq(ts_0, ts_1))) + val t_array_1 = StructField("test_array_1", ArrayType(StructType(Seq(t_struct_8)))) + val t_struct_6 = StructField("test_struct_6", StructType(Seq(t_struct_7, t_array_1))) + val item011 = StructField("item011", StringType) + val item084 = StructField("item084", ArrayType(StructType(Seq(item011)))) + val item010 = StructField("item010", StructType(Seq(t_struct_7, item084))) + val t_struct_9 = StructField("test_struct_9", StructType(Seq(t_struct_7, item084))) + val item009 = StructField("item009", StructType(Seq(t_struct_6, item010, t_struct_9))) + val item018 = StructField("item018", StringType) + val item030 = StructField("item030", LongType) + val item051 = StructField("item051", StringType) + val item052 = StructField("item052", StringType) + val item085 = StructField("item085", ArrayType(StructType(Seq(item018, item030, item051, item052)))) + val item028 = StructField("item028", StringType) + val item029 = StructField("item029", StringType) + val item027 = StructField("item027", StructType(Seq(item028, item029))) + val item017 = StructField("item017", StructType(Seq(item027, item085))) + val item019 = StructField("item019", 
StructType(Seq(item027, item085))) + val item020 = StructField("item020", StructType(Seq(item027, item085))) + val item021 = StructField("item021", StructType(Seq(item027, item085))) + val item022 = StructField("item022", StructType(Seq(item027, item085))) + val item023 = StructField("item023", StructType(Seq(item027, item085))) + val item024 = StructField("item024", StructType(Seq(item027, item085))) + val item025 = StructField("item025", StructType(Seq(item027, item085))) + val item026 = StructField("item026", StructType(Seq(item027, item085))) + val item031 = StructField("item031", StructType(Seq(item027, item085))) + val item032 = StructField("item032", StructType(Seq(item027, item085))) + val item033 = StructField("item033", StructType(Seq(item027, item085))) + val item034 = StructField("item034", StructType(Seq(item027, item085))) + val item035 = StructField("item035", StructType(Seq(item027, item085))) + val item036 = StructField("item036", StructType(Seq(item027, item085))) + val item037 = StructField("item037", StructType(Seq(item027, item085))) + val item038 = StructField("item038", StructType(Seq(item027, item085))) + val item039 = StructField("item039", StructType(Seq(item027, item085))) + val item040 = StructField("item040", StructType(Seq(item027, item085))) + val item041 = StructField("item041", StructType(Seq(item027, item085))) + val item042 = StructField("item042", StructType(Seq(item027, item085))) + val item043 = StructField("item043", StructType(Seq(item027, item085))) + val item044 = StructField("item044", StructType(Seq(item027, item085))) + val item045 = StructField("item045", StructType(Seq(item027, item085))) + val item046 = StructField("item046", StructType(Seq(item027, item085))) + val item047 = StructField("item047", StructType(Seq(item027, item085))) + val item048 = StructField("item048", StructType(Seq(item027, item085))) + val item049 = StructField("item049", StructType(Seq(item027, item085))) + val item050 = StructField("item050", StructType(Seq(item027, item085))) + val item053 = StructField("item053", StructType(Seq(item027, item085))) + val item054 = StructField("item054", StructType(Seq(item027, item085))) + val item055 = StructField("item055", StructType(Seq(item027, item085))) + val item056 = StructField("item056", StructType(Seq(item027, item085))) + val item057 = StructField("item057", StructType(Seq(item027, item085))) + val item058 = StructField("item058", StructType(Seq(item027, item085))) + val item059 = StructField("item059", StructType(Seq(item027, item085))) + val item060 = StructField("item060", StructType(Seq(item027, item085))) + val item061 = StructField("item061", StructType(Seq(item027, item085))) + val item062 = StructField("item062", StructType(Seq(item027, item085))) + val item016 = StructField("item016", StructType(Seq(item017, item019, + item020, item021, item022, item023, item024, item025, item026, + item031, item032, item033, item034, item035, item036, item037, item038, item039, + item040, item041, item042, item043, item044, item045, item046, item047, item048, item049, + item050, item053, item054, item055, item056, item057, item058, item059, + item060, item061, item062))) + + val item063 = StructField("item063", StringType) + + val item065 = StructField("item065", StringType) + val item064 = StructField("item064", ArrayType(StructType(Seq(ti_0, ts_0, ts_1, item065, tl_0)))) + + val columnA = StructField("columnA", StructType(Seq(item001))) + val columnB = StructField("columnB", ArrayType(StructType(Seq(item002)))) + val columnC = 
StructField("columnC", StructType(Seq(ts_0, item063, ts_2, ti_0, tl_0, ts_3, item006, t_struct_4, item016, item012, item009, item007, item064))) + + val jsonTable = DBGen().addTable("json_data", StructType(Seq(columnA, columnB, columnC)), numRows) + + jsonTable("columnB").setLength(1,3) + jsonTable("columnB")("data")("item002").setLength(1,2) + jsonTable("columnB")("data")("item002")("data")("item003")("item004")("test_string_0").setLength(2) + jsonTable("columnB")("data")("item002")("data")("item003")("test_struct_0")("test_string_0").setLength(2) + + jsonTable("columnC")("item007")("item008").setLength(2) + jsonTable("columnC")("item007")("item008")("data")("test_struct_2")("test_string_0").setLength(140,200) + jsonTable("columnC")("item007")("item008")("data")("test_struct_2")("test_string_1").setLength(140,200) + jsonTable("columnC")("item007")("item008")("data")("test_struct_2")("test_string_2").setLength(140,200) + jsonTable("columnC")("item007")("item008")("data")("test_struct_2")("test_string_3").setLength(140,200) + jsonTable("columnC")("item007")("item008")("data")("test_struct_2")("test_string_4").setLength(140,200) + + jsonTable("columnC")("item009")("item010")("item084").setLength(2,5) + jsonTable("columnC")("item009")("test_struct_9")("item084").setLength(2,4) + jsonTable("columnC")("item016")("item017")("item085").setLength(2,3) + jsonTable("columnC")("item016")("item019")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item020")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item021")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item022")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item023")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item024")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item025")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item026")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item031")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item032")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item033")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item034")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item035")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item036")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item037")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item038")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item039")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item040")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item041")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item042")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item043")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item044")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item045")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item046")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item047")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item048")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item049")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item050")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item053")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item054")("item085").setLength(1,2) + 
jsonTable("columnC")("item016")("item055")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item056")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item057")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item058")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item059")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item060")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item061")("item085").setLength(1,2) + jsonTable("columnC")("item016")("item062")("item085").setLength(1,2) + + jsonTable.setNullProbabilityRecursively(nullProbability) + + jsonTable.toDF(spark).selectExpr("to_json(columnA) as columnA", "to_json(columnB) as columnB", + "to_json(columnC) as columnC").write.mode("overwrite").parquet(output) + + spark.time(spark.read.parquet(output).selectExpr("get_json_object(columnA, '$.item001') as A_001").show()) + spark.time(spark.read.parquet(output).selectExpr("get_json_object(columnB, '$[0].item002[0].item003.item004.item005') as B_005").show()) + spark.time(spark.read.parquet(output).selectExpr("get_json_object(columnC, '$.item006') as C_006", + "get_json_object(columnC, '$.item007.item008') as C_008", + "get_json_object(columnC, '$.item009.item010.item084[0].item011') as C_011", + "get_json_object(columnC, '$.item012.item013') as C_013", + "get_json_object(columnC, '$.item012.item014') as C_014", + "get_json_object(columnC, '$.item012.item015') as C_015", + "get_json_object(columnC, '$.item016.item017.item085[*].item018') as C_017", + "get_json_object(columnC, '$.item016.item019.item085[0].item018') as C_019", + "get_json_object(columnC, '$.item016.item020.item085[0].item018') as C_020", + "get_json_object(columnC, '$.item016.item021.item085[0].item018') as C_021", + "get_json_object(columnC, '$.item016.item022.item085[0].item018') as C_022", + "get_json_object(columnC, '$.item016.item023.item085[0].item018') as C_023", + "get_json_object(columnC, '$.item016.item024.item085[0].item018') as C_024", + "get_json_object(columnC, '$.item016.item025.item085[0].item018') as C_025", + "get_json_object(columnC, '$.item016.item026.item085[0].item018') as C_026_18", + "get_json_object(columnC, '$.item016.item026.item085[0].item030') as C_026_30", + "get_json_object(columnC, '$.item016.item026.item027.item028') as C_028", + "get_json_object(columnC, '$.item016.item026.item027.item029') as C_029", + "get_json_object(columnC, '$.item016.item031.item085[0].item018') as C_031", + "get_json_object(columnC, '$.item016.item032.item085[0].item018') as C_032", + "get_json_object(columnC, '$.item016.item033.item085[0].item018') as C_033", + "get_json_object(columnC, '$.item016.item034.item085[0].item018') as C_034", + "get_json_object(columnC, '$.item016.item035.item085[0].item018') as C_035", + "get_json_object(columnC, '$.item016.item036.item085[0].item018') as C_036", + "get_json_object(columnC, '$.item016.item037.item085[0].item018') as C_037", + "get_json_object(columnC, '$.item016.item038.item085[0].item018') as C_038", + "get_json_object(columnC, '$.item016.item039.item085[0].item018') as C_039", + "get_json_object(columnC, '$.item016.item040.item085[0].item018') as C_040", + "get_json_object(columnC, '$.item016.item041.item085[0].item018') as C_041", + "get_json_object(columnC, '$.item016.item042.item085[0].item018') as C_042", + "get_json_object(columnC, '$.item016.item043.item085[0].item018') as C_043", + "get_json_object(columnC, '$.item016.item044.item085[0].item018') as C_044", + "get_json_object(columnC, 
'$.item016.item045.item085[0].item018') as C_045", + "get_json_object(columnC, '$.item016.item046.item085[0].item018') as C_046", + "get_json_object(columnC, '$.item016.item047.item085[0].item018') as C_047", + "get_json_object(columnC, '$.item016.item048.item085[0].item018') as C_048", + "get_json_object(columnC, '$.item016.item049.item085[0].item018') as C_049", + "get_json_object(columnC, '$.item016.item050.item085[0].item051') as C_051", + "get_json_object(columnC, '$.item016.item050.item085[0].item052') as C_052", + "get_json_object(columnC, '$.item016.item053.item085[0].item018') as C_053", + "get_json_object(columnC, '$.item016.item054.item085[0].item018') as C_054", + "get_json_object(columnC, '$.item016.item055.item027.item028') as C_055_28", + "get_json_object(columnC, '$.item016.item055.item027.item029') as C_055_29", + "get_json_object(columnC, '$.item016.item055.item085[0].item030') as C_055_30", + "get_json_object(columnC, '$.item016.item055.item085[0].item018') as C_055_18", + "get_json_object(columnC, '$.item016.item056.item027.item028') as C_056_28", + "get_json_object(columnC, '$.item016.item056.item027.item029') as C_056_29", + "get_json_object(columnC, '$.item016.item056[0].item018') as C_056_29", + "get_json_object(columnC, '$.item016.item057.item085[0].item018') as C_057", + "get_json_object(columnC, '$.item016.item058.item085[0].item018') as C_058", + "get_json_object(columnC, '$.item016.item059.item085[0].item018') as C_059", + "get_json_object(columnC, '$.item016.item060.item085[0].item051') as C_60_51", + "get_json_object(columnC, '$.item016.item060.item085[0].item052') as C_60_52", + "get_json_object(columnC, '$.item016.item061.item085[0].item018') as C_061", + "get_json_object(columnC, '$.item016.item062.item085[0].item018') as C_062", + "get_json_object(columnC, '$.item063') as C_063", + "get_json_object(columnC, '$.item064[*].item065') as C_065" + ).show()) + + spark.read.parquet(output).selectExpr("AVG(octet_length(columnA))", "AVG(octet_length(columnB))", + "AVG(octet_length(columnC))").show() +} + +doIt() + diff --git a/benchark/get_json_object_stress_run.scala b/benchark/get_json_object_stress_run.scala new file mode 100644 index 00000000000..6b05cd14789 --- /dev/null +++ b/benchark/get_json_object_stress_run.scala @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +val input = "/data/tmp/SCALE_FROM_JSON" +val iters = 3 + +def doIt(): Unit = { + (0 until iters).foreach { _ => + spark.time(spark.read.parquet(input).selectExpr( + "SUM(octet_length(get_json_object(columnA, '$.item001'))) as A_001", + "SUM(octet_length(get_json_object(columnB, '$[0].item002[0].item003.item004.item005'))) as B_005", + "SUM(octet_length(get_json_object(columnC, '$.item006'))) as C_006", + "SUM(octet_length(get_json_object(columnC, '$.item007.item008'))) as C_008", + "SUM(octet_length(get_json_object(columnC, '$.item009.item010.item084[0].item011'))) as C_011", + "SUM(octet_length(get_json_object(columnC, '$.item012.item013'))) as C_013", + "SUM(octet_length(get_json_object(columnC, '$.item012.item014'))) as C_014", + "SUM(octet_length(get_json_object(columnC, '$.item012.item015'))) as C_015", + "SUM(octet_length(get_json_object(columnC, '$.item016.item017.item085[*].item018'))) as C_017", + "SUM(octet_length(get_json_object(columnC, '$.item016.item019.item085[0].item018'))) as C_019", + "SUM(octet_length(get_json_object(columnC, '$.item016.item020.item085[0].item018'))) as C_020", + "SUM(octet_length(get_json_object(columnC, '$.item016.item021.item085[0].item018'))) as C_021", + "SUM(octet_length(get_json_object(columnC, '$.item016.item022.item085[0].item018'))) as C_022", + "SUM(octet_length(get_json_object(columnC, '$.item016.item023.item085[0].item018'))) as C_023", + "SUM(octet_length(get_json_object(columnC, '$.item016.item024.item085[0].item018'))) as C_024", + "SUM(octet_length(get_json_object(columnC, '$.item016.item025.item085[0].item018'))) as C_025", + "SUM(octet_length(get_json_object(columnC, '$.item016.item026.item085[0].item018'))) as C_026_18", + "SUM(octet_length(get_json_object(columnC, '$.item016.item026.item085[0].item030'))) as C_026_30", + "SUM(octet_length(get_json_object(columnC, '$.item016.item026.item027.item028'))) as C_028", + "SUM(octet_length(get_json_object(columnC, '$.item016.item026.item027.item029'))) as C_029", + "SUM(octet_length(get_json_object(columnC, '$.item016.item031.item085[0].item018'))) as C_031", + "SUM(octet_length(get_json_object(columnC, '$.item016.item032.item085[0].item018'))) as C_032", + "SUM(octet_length(get_json_object(columnC, '$.item016.item033.item085[0].item018'))) as C_033", + "SUM(octet_length(get_json_object(columnC, '$.item016.item034.item085[0].item018'))) as C_034", + "SUM(octet_length(get_json_object(columnC, '$.item016.item035.item085[0].item018'))) as C_035", + "SUM(octet_length(get_json_object(columnC, '$.item016.item036.item085[0].item018'))) as C_036", + "SUM(octet_length(get_json_object(columnC, '$.item016.item037.item085[0].item018'))) as C_037", + "SUM(octet_length(get_json_object(columnC, '$.item016.item038.item085[0].item018'))) as C_038", + "SUM(octet_length(get_json_object(columnC, '$.item016.item039.item085[0].item018'))) as C_039", + "SUM(octet_length(get_json_object(columnC, '$.item016.item040.item085[0].item018'))) as C_040", + "SUM(octet_length(get_json_object(columnC, '$.item016.item041.item085[0].item018'))) as C_041", + "SUM(octet_length(get_json_object(columnC, '$.item016.item042.item085[0].item018'))) as C_042", + "SUM(octet_length(get_json_object(columnC, '$.item016.item043.item085[0].item018'))) as C_043", + "SUM(octet_length(get_json_object(columnC, '$.item016.item044.item085[0].item018'))) as C_044", + "SUM(octet_length(get_json_object(columnC, '$.item016.item045.item085[0].item018'))) as C_045", + "SUM(octet_length(get_json_object(columnC, 
'$.item016.item046.item085[0].item018'))) as C_046", + "SUM(octet_length(get_json_object(columnC, '$.item016.item047.item085[0].item018'))) as C_047", + "SUM(octet_length(get_json_object(columnC, '$.item016.item048.item085[0].item018'))) as C_048", + "SUM(octet_length(get_json_object(columnC, '$.item016.item049.item085[0].item018'))) as C_049", + "SUM(octet_length(get_json_object(columnC, '$.item016.item050.item085[0].item051'))) as C_051", + "SUM(octet_length(get_json_object(columnC, '$.item016.item050.item085[0].item052'))) as C_052", + "SUM(octet_length(get_json_object(columnC, '$.item016.item053.item085[0].item018'))) as C_053", + "SUM(octet_length(get_json_object(columnC, '$.item016.item054.item085[0].item018'))) as C_054", + "SUM(octet_length(get_json_object(columnC, '$.item016.item055.item027.item028'))) as C_055_28", + "SUM(octet_length(get_json_object(columnC, '$.item016.item055.item027.item029'))) as C_055_29", + "SUM(octet_length(get_json_object(columnC, '$.item016.item055.item085[0].item030'))) as C_055_30", + "SUM(octet_length(get_json_object(columnC, '$.item016.item055.item085[0].item018'))) as C_055_18", + "SUM(octet_length(get_json_object(columnC, '$.item016.item056.item027.item028'))) as C_056_28", + "SUM(octet_length(get_json_object(columnC, '$.item016.item056.item027.item029'))) as C_056_29", + "SUM(octet_length(get_json_object(columnC, '$.item016.item056[0].item018'))) as C_056_29", + "SUM(octet_length(get_json_object(columnC, '$.item016.item057.item085[0].item018'))) as C_057", + "SUM(octet_length(get_json_object(columnC, '$.item016.item058.item085[0].item018'))) as C_058", + "SUM(octet_length(get_json_object(columnC, '$.item016.item059.item085[0].item018'))) as C_059", + "SUM(octet_length(get_json_object(columnC, '$.item016.item060.item085[0].item051'))) as C_60_51", + "SUM(octet_length(get_json_object(columnC, '$.item016.item060.item085[0].item052'))) as C_60_52", + "SUM(octet_length(get_json_object(columnC, '$.item016.item061.item085[0].item018'))) as C_061", + "SUM(octet_length(get_json_object(columnC, '$.item016.item062.item085[0].item018'))) as C_062", + "SUM(octet_length(get_json_object(columnC, '$.item063'))) as C_063", + "SUM(octet_length(get_json_object(columnC, '$.item064[*].item065'))) as C_065" + ).show()) + } + + (0 until iters).foreach { _ => + spark.time(spark.read.parquet(input).selectExpr("SUM(octet_length(columnA))", "SUM(octet_length(columnB))", + "SUM(octet_length(columnC))").show()) + } +} + +doIt() +
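
A minimal usage sketch of the recursive null-probability API introduced in PATCH 1/2, assuming only the datagen calls already exercised by the benchmark script above (DBGen().addTable, table("column"), ColumnGen.dataGen); the table name "null_demo", the toy schema, and the probabilities below are illustrative and are not part of either patch:

import org.apache.spark.sql.tests.datagen._
import org.apache.spark.sql.types._

// Illustrative nested schema: an array of structs under a single top-level column.
val schema = StructType(Seq(
  StructField("colA", ArrayType(StructType(Seq(
    StructField("name", StringType),
    StructField("score", IntegerType)))))))

val demoTable = DBGen().addTable("null_demo", schema, 1000)

// Table-wide: every column, and every nested child generator, gets the same null probability.
demoTable.setNullProbabilityRecursively(0.05)

// Per-column: the probability is pushed down through this column's array and struct
// children via the new `children` hook on each DataGen.
demoTable("colA").setNullProbabilityRecursively(0.1)

// The `children` method also allows walking the nested generators directly.
demoTable("colA").dataGen.children.foreach { case (name, child) =>
  println(s"$name -> ${child.dataType}")
}

Both stress scripts expect an interactive `spark` session with the datagen classes on the classpath (for example a spark-shell started with the datagen jar and the script passed via -i); that invocation, like the /data/tmp/SCALE_FROM_JSON output/input paths, is an environment-specific assumption rather than something the patches pin down.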