From f298cba340b00d8f35e199c0b79ce9117b88bac6 Mon Sep 17 00:00:00 2001 From: huangjianmin <531493269@qq.com> Date: Tue, 22 Nov 2022 18:40:19 +0800 Subject: [PATCH 01/18] spark benchmark optimize --- docker/benchmark/.env | 3 +- docker/benchmark/build-image.sh | 8 +- docker/benchmark/docker-compose.yml | 13 +- .../benchmark-config/spark_arctic_config.xml | 132 ++++++++ .../benchmark-config/spark_hudi_config.xml | 132 ++++++++ .../benchmark-config/spark_iceberg_config.xml | 132 ++++++++ .../images/benchmark-spark.Dockerfile | 48 +-- .../benchmark/images/scripts/arctic-init.sql | 281 ------------------ .../scripts/benchmark-ams-entrypoint.sh | 3 - .../scripts/benchmark-spark-entrypoint.sh | 15 + .../images/spark-config/spark-defaults.conf | 1 + 11 files changed, 460 insertions(+), 308 deletions(-) create mode 100644 docker/benchmark/images/benchmark-config/spark_arctic_config.xml create mode 100644 docker/benchmark/images/benchmark-config/spark_hudi_config.xml create mode 100644 docker/benchmark/images/benchmark-config/spark_iceberg_config.xml delete mode 100644 docker/benchmark/images/scripts/arctic-init.sql delete mode 100644 docker/benchmark/images/scripts/benchmark-ams-entrypoint.sh create mode 100644 docker/benchmark/images/scripts/benchmark-spark-entrypoint.sh diff --git a/docker/benchmark/.env b/docker/benchmark/.env index 4e0ea639..610070e5 100644 --- a/docker/benchmark/.env +++ b/docker/benchmark/.env @@ -27,5 +27,6 @@ MYSQL_VERSION=8.0.30 SCALA_BINARY_VERSION=2.12 SPARK_VERSION=3.1.3 SPARK_BINARY_VERSION=3.1 -SPARK_HADOOP_VERSION=3.2.0 ZOOKEEPER_VERSION=3.6.3 + +tpcc_name_prefix=arctic. diff --git a/docker/benchmark/build-image.sh b/docker/benchmark/build-image.sh index f633b0c4..7c7a94c2 100755 --- a/docker/benchmark/build-image.sh +++ b/docker/benchmark/build-image.sh @@ -84,7 +84,13 @@ ${BUILD_CMD} \ "${SELF_DIR}/images" $@ ${BUILD_CMD} \ - --build-arg BENCHMARK_VERSION=${BENCHMARK_VERSION} \ + --build-arg APACHE_MIRROR=${APACHE_MIRROR} \ + --build-arg MAVEN_MIRROR=${MAVEN_MIRROR} \ + --build-arg SPARK_VERSION=${SPARK_VERSION} \ + --build-arg SPARK_BINARY_VERSION=${SPARK_BINARY_VERSION} \ + --build-arg ARCTIC_VERSION=${ARCTIC_VERSION} \ + --build-arg ARCTIC_RELEASE=${ARCTIC_RELEASE} \ + --build-arg SCALA_BINARY_VERSION=${SCALA_BINARY_VERSION} \ --file "${SELF_DIR}/images/benchmark-spark.Dockerfile" \ --tag arctic163/benchmark-spark:${BENCHMARK_VERSION} \ "${SELF_DIR}/images" $@ diff --git a/docker/benchmark/docker-compose.yml b/docker/benchmark/docker-compose.yml index f7c4dadd..a25eb11e 100644 --- a/docker/benchmark/docker-compose.yml +++ b/docker/benchmark/docker-compose.yml @@ -16,7 +16,6 @@ services: volumes: - ./images/scripts/hive-schema-2.3.0.mysql.sql:/docker-entrypoint-initdb.d/hive-schema-2.3.0.mysql.sql - ./images/scripts/hive-txn-schema-2.3.0.mysql.sql:/docker-entrypoint-initdb.d/hive-txn-schema-2.3.0.mysql.sql - - ./images/scripts/arctic-init.sql:/docker-entrypoint-initdb.d/arctic-init.sql metastore: image: arctic163/benchmark-metastore:${BENCHMARK_VERSION} @@ -77,12 +76,14 @@ services: image: arctic163/benchmark-spark:${BENCHMARK_VERSION} container_name: spark hostname: spark - command: ["bash","-c","./sbin/start-thriftserver.sh && tail -f /dev/null"] + environment: + MYSQL_USER: root + MYSQL_PASSWORD: password volumes: - ./hive:/tmp/hive/warehouse - ./arctic:/tmp/arctic/warehouse - - ./images/hive-config/hive-site.xml:/usr/local/arctic_spark/spark-3.1.3-bin-hadoop2.7/conf/hive-site.xml - - 
./images/spark-config/spark-defaults.conf:/usr/local/arctic_spark/spark-3.1.3-bin-hadoop2.7/conf/spark-defaults.conf + - ./images/hive-config/hive-site.xml:/opt/spark/conf/hive-site.xml + - ./images/spark-config/spark-defaults.conf:/opt/spark/conf/spark-defaults.conf ports: - 10000:10000 depends_on: @@ -98,9 +99,13 @@ services: - ./images/benchmark-config/presto_hudi_config.xml:/usr/lib/lakehouse-benchmark/config/trino/presto_hudi_config.xml - ./images/benchmark-config/trino_arctic_config.xml:/usr/lib/lakehouse-benchmark/config/trino/trino_arctic_config.xml - ./images/benchmark-config/trino_iceberg_config.xml:/usr/lib/lakehouse-benchmark/config/trino/trino_iceberg_config.xml + - ./images/benchmark-config/spark_arctic_config.xml:/usr/lib/lakehouse-benchmark/config/spark/spark_arctic_config.xml + - ./images/benchmark-config/spark_hudi_config.xml:/usr/lib/lakehouse-benchmark/config/spark/spark_hudi_config.xml + - ./images/benchmark-config/spark_iceberg_config.xml:/usr/lib/lakehouse-benchmark/config/spark/spark_iceberg_config.xml depends_on: - trino - mysql + - spark tty: true stdin_open: true diff --git a/docker/benchmark/images/benchmark-config/spark_arctic_config.xml b/docker/benchmark/images/benchmark-config/spark_arctic_config.xml new file mode 100644 index 00000000..55117f26 --- /dev/null +++ b/docker/benchmark/images/benchmark-config/spark_arctic_config.xml @@ -0,0 +1,132 @@ + + + + + HIVE + org.apache.hive.jdbc.HiveDriver + jdbc:hive2://spark:10000/local_catalog + hdfs + + TRANSACTION_READ_UNCOMMITTED + 128 + + + 1 + + + + + 1 + + + + + + + + + + + + + + + + + 200 + disabled + unlimited + + 45,43,4,4,4 + 3, 2, 3, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 + + + + + + + + + Q1 + + + Q2 + + + Q3 + + + Q4 + + + Q5 + + + Q6 + + + Q7 + + + Q8 + + + Q9 + + + Q10 + + + Q11 + + + Q12 + + + Q13 + + + Q14 + + + Q15 + + + Q17 + + + Q18 + + + Q19 + + + Q20 + + + Q21 + + + Q22 + + + + + + + NewOrder + + + Payment + + + OrderStatus + + + Delivery + + + StockLevel + + + \ No newline at end of file diff --git a/docker/benchmark/images/benchmark-config/spark_hudi_config.xml b/docker/benchmark/images/benchmark-config/spark_hudi_config.xml new file mode 100644 index 00000000..3d23b5c5 --- /dev/null +++ b/docker/benchmark/images/benchmark-config/spark_hudi_config.xml @@ -0,0 +1,132 @@ + + + + + HIVE + org.apache.hive.jdbc.HiveDriver + jdbc:hive2://spark:10000/hudi + hdfs + + TRANSACTION_READ_UNCOMMITTED + 128 + + + 1 + + + + + 1 + + + + + + + + + + + + + + + + + 200 + disabled + unlimited + + 45,43,4,4,4 + 3, 2, 3, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 + + + + + + + + + Q1 + + + Q2 + + + Q3 + + + Q4 + + + Q5 + + + Q6 + + + Q7 + + + Q8 + + + Q9 + + + Q10 + + + Q11 + + + Q12 + + + Q13 + + + Q14 + + + Q15 + + + Q17 + + + Q18 + + + Q19 + + + Q20 + + + Q21 + + + Q22 + + + + + + + NewOrder + + + Payment + + + OrderStatus + + + Delivery + + + StockLevel + + + \ No newline at end of file diff --git a/docker/benchmark/images/benchmark-config/spark_iceberg_config.xml b/docker/benchmark/images/benchmark-config/spark_iceberg_config.xml new file mode 100644 index 00000000..2457ce3d --- /dev/null +++ b/docker/benchmark/images/benchmark-config/spark_iceberg_config.xml @@ -0,0 +1,132 @@ + + + + + HIVE + org.apache.hive.jdbc.HiveDriver + jdbc:hive2://spark:10000/iceberg_catalog + hdfs + + TRANSACTION_READ_UNCOMMITTED + 128 + + + 1 + + + + + 1 + + + + + + + + + + + + + + + + + 200 + disabled + unlimited + + 45,43,4,4,4 + 3, 2, 3, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 + + + + + + + + 
+ Q1 + + + Q2 + + + Q3 + + + Q4 + + + Q5 + + + Q6 + + + Q7 + + + Q8 + + + Q9 + + + Q10 + + + Q11 + + + Q12 + + + Q13 + + + Q14 + + + Q15 + + + Q17 + + + Q18 + + + Q19 + + + Q20 + + + Q21 + + + Q22 + + + + + + + NewOrder + + + Payment + + + OrderStatus + + + Delivery + + + StockLevel + + + \ No newline at end of file diff --git a/docker/benchmark/images/benchmark-spark.Dockerfile b/docker/benchmark/images/benchmark-spark.Dockerfile index f59a89ba..b91eb9d4 100644 --- a/docker/benchmark/images/benchmark-spark.Dockerfile +++ b/docker/benchmark/images/benchmark-spark.Dockerfile @@ -1,27 +1,39 @@ FROM openjdk:8u332-jdk -WORKDIR /usr/local/arctic_spark - -RUN apt update \ - && apt-get install -y vim \ - && apt-get install -y net-tools \ - && apt-get install -y telnet - -RUN wget https://dlcdn.apache.org/spark/spark-3.1.3/spark-3.1.3-bin-hadoop2.7.tgz && \ - tar -zxvf spark-3.1.3-bin-hadoop2.7.tgz && \ - rm -rf spark-3.1.3-bin-hadoop2.7.tgz - -ENV SPARK_HOME=/usr/local/arctic_spark/spark-3.1.3-bin-hadoop2.7 \ - PATH=${PATH}:${SPARK_HOME}/bin - -RUN wget https://github.com/NetEase/arctic/releases/download/v0.3.2-rc1/arctic-spark_3.1-runtime-0.3.2.jar -P ${SPARK_HOME}/jars && \ - wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.1_2.12/1.0.0/iceberg-spark-runtime-3.1_2.12-1.0.0.jar -P ${SPARK_HOME}/jars && \ - wget https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark3.1-bundle_2.12/0.12.1/hudi-spark3.1-bundle_2.12-0.12.1.jar -P ${SPARK_HOME}/jars && \ - wget https://repo1.maven.org/maven2/io/delta/delta-core_2.12/1.0.0/delta-core_2.12-1.0.0.jar -P ${SPARK_HOME}/jars +ARG APACHE_MIRROR +ARG MAVEN_MIRROR +ARG SPARK_VERSION +ARG SPARK_BINARY_VERSION +ARG ARCTIC_VERSION +ARG ARCTIC_RELEASE +ARG SCALA_BINARY_VERSION + +ENV SPARK_HOME=/opt/spark + +RUN apt update && \ + apt-get install -y vim && \ + apt-get install -y net-tools && \ + apt-get install -y telnet && \ + apt-get install -y default-mysql-client && \ + apt-get clean + +RUN wget ${APACHE_MIRROR}/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.2.tgz && \ + tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.2.tgz -C /opt && \ + ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop3.2 ${SPARK_HOME} && \ + rm spark-${SPARK_VERSION}-bin-hadoop3.2.tgz + +RUN wget https://github.com/NetEase/arctic/releases/download/${ARCTIC_RELEASE}/arctic-spark_3.1-runtime-${ARCTIC_VERSION}.jar -P ${SPARK_HOME}/jars && \ + wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${SPARK_BINARY_VERSION}_${SCALA_BINARY_VERSION}/1.0.0/iceberg-spark-runtime-${SPARK_BINARY_VERSION}_${SCALA_BINARY_VERSION}-1.0.0.jar -P ${SPARK_HOME}/jars && \ + wget https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark${SPARK_BINARY_VERSION}-bundle_${SCALA_BINARY_VERSION}/0.12.1/hudi-spark${SPARK_BINARY_VERSION}-bundle_${SCALA_BINARY_VERSION}-0.12.1.jar -P ${SPARK_HOME}/jars && \ + wget https://repo1.maven.org/maven2/io/delta/delta-core_${SCALA_BINARY_VERSION}/1.0.0/delta-core_${SCALA_BINARY_VERSION}-1.0.0.jar -P ${SPARK_HOME}/jars RUN mkdir -p -m 777 /tmp/hive RUN mkdir -p -m 777 /tmp/arctic WORKDIR ${SPARK_HOME} +COPY scripts/benchmark-spark-entrypoint.sh ${SPARK_HOME} +RUN chmod a+x ${SPARK_HOME}/benchmark-spark-entrypoint.sh + +CMD ["bash","-c","/opt/spark/benchmark-spark-entrypoint.sh && tail -f /dev/null"] diff --git a/docker/benchmark/images/scripts/arctic-init.sql b/docker/benchmark/images/scripts/arctic-init.sql deleted file mode 100644 index 8fd9eba9..00000000 --- a/docker/benchmark/images/scripts/arctic-init.sql +++ 
/dev/null @@ -1,281 +0,0 @@ -CREATE DATABASE IF NOT EXISTS arctic; - -USE arctic; - -CREATE TABLE `catalog_metadata` -( - `catalog_id` int(11) NOT NULL AUTO_INCREMENT, - `catalog_name` varchar(64) NOT NULL COMMENT 'catalog name', - `display_name` varchar(64) DEFAULT NULL COMMENT 'display name of catalog', - `catalog_type` varchar(64) NOT NULL COMMENT 'catalog type like hive/hadoop', - `storage_configs` mediumtext COMMENT 'base64 code of storage configs', - `auth_configs` mediumtext COMMENT 'base64 code of auth configs', - `catalog_properties` mediumtext COMMENT 'catalog properties', - PRIMARY KEY (`catalog_id`), - UNIQUE KEY `catalog_metadata_catalog_name_uindex` (`catalog_name`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'catalog metadata'; - -CREATE TABLE `container_metadata` -( - `name` varchar(64) NOT NULL COMMENT 'container name', - `type` varchar(64) NOT NULL COMMENT 'container type like flink/local', - `properties` mediumtext COMMENT 'container properties', - PRIMARY KEY (`name`,`type`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'container metadata'; - -CREATE TABLE `database_metadata` -( - `db_id` int(11) NOT NULL AUTO_INCREMENT, - `catalog_name` varchar(64) NOT NULL COMMENT 'catalog name', - `db_name` varchar(64) NOT NULL COMMENT 'database name', - PRIMARY KEY (`db_id`), - UNIQUE KEY `database_name_uindex` (`catalog_name`,`db_name`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'database metadata'; - -CREATE TABLE `file_info_cache` -( - `primary_key_md5` varchar(64) NOT NULL COMMENT 'generate md5 with table_identifier+inner_table+file_path+partition_name as primary_key', - `table_identifier` varchar(384) NOT NULL COMMENT 'table full name with catalog.db.table', - `add_snapshot_id` bigint(20) NOT NULL COMMENT 'the snapshot id who add this file', - `parent_snapshot_id` bigint(20) NOT NULL COMMENT 'parent snapshot of add_snapshot_id', - `delete_snapshot_id` bigint(20) DEFAULT NULL COMMENT 'the snapshot id who delete this file', - `inner_table` varchar(64) DEFAULT NULL COMMENT 'table type like change/base', - `file_path` varchar(400) NOT NULL COMMENT 'table type like change/base', - `file_type` varchar(64) DEFAULT NULL COMMENT 'absolute file path', - `file_size` bigint(20) DEFAULT NULL COMMENT 'file size', - `file_mask` bigint(20) DEFAULT NULL COMMENT 'file mask', - `file_index` bigint(20) DEFAULT NULL COMMENT 'file index', - `spec_id` bigint(20) DEFAULT NULL COMMENT 'file spec id', - `record_count` bigint(20) DEFAULT NULL COMMENT 'file record count', - `partition_name` varchar(256) DEFAULT NULL COMMENT 'the partition name which file belongs to', - `action` varchar(64) DEFAULT NULL COMMENT 'snapshot type', - `commit_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'file commit time', - `watermark` timestamp NULL DEFAULT NULL COMMENT 'file max event time', - `producer` varchar(64) NOT NULL DEFAULT 'INGESTION' COMMENT 'who produce this snapshot', - PRIMARY KEY (`primary_key_md5`), - KEY `table_snap_index` (`table_identifier`,`add_snapshot_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'cache files info of table'; - -CREATE TABLE `optimize_file` -( - `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'Auto increment id', - `optimize_type` varchar(10) NOT NULL COMMENT 'Optimize type: Major, Minor, FullMajor', - `trace_id` varchar(40) NOT NULL COMMENT 'Optimize task unique id', - `file_type` varchar(16) NOT NULL COMMENt 'File type: BASE_FILE, INSERT_FILE, EQ_DELETE_FILE, POS_DELETE_FILE', - `is_target` tinyint(4) DEFAULT '0' COMMENT 'Is file newly generated by 
optimizing', - `file_content` varbinary(60000) DEFAULT NULL COMMENT 'File bytes after serialization', - PRIMARY KEY (`id`), - KEY `compact_task_id` (`optimize_type`,`trace_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'Optimize files for Optimize task'; - -CREATE TABLE `optimize_history` -( - `history_id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'History auto increment id', - `catalog_name` varchar(64) NOT NULL COMMENT 'Catalog name', - `db_name` varchar(64) NOT NULL COMMENT 'Database name', - `table_name` varchar(64) NOT NULL COMMENT 'Table name', - `optimize_range` varchar(10) NOT NULL COMMENT 'Optimize Range: Table, Partition, Node', - `visible_time` datetime(3) DEFAULT NULL COMMENT 'Latest visible time', - `commit_time` datetime(3) DEFAULT NULL COMMENT 'Commit time', - `plan_time` datetime(3) DEFAULT NULL COMMENT 'First plan time', - `duration` bigint(20) DEFAULT NULL COMMENT 'Execute cost time', - `total_file_cnt_before` int(11) NOT NULL COMMENT 'Total file cnt before optimizing', - `total_file_size_before` bigint(20) NOT NULL COMMENT 'Total file size in bytes before optimizing', - `insert_file_cnt_before` int(11) NOT NULL COMMENT 'Insert file cnt before optimizing', - `insert_file_size_before` bigint(20) NOT NULL COMMENT 'Insert file size in bytes before optimizing', - `delete_file_cnt_before` int(11) NOT NULL COMMENT 'Delete file cnt before optimizing', - `delete_file_size_before` bigint(20) NOT NULL COMMENT 'Delete file size in bytes before optimizing', - `base_file_cnt_before` int(11) NOT NULL COMMENT 'Base file cnt before optimizing', - `base_file_size_before` bigint(20) NOT NULL COMMENT 'Base file size in bytes before optimizing', - `pos_delete_file_cnt_before` int(11) NOT NULL COMMENT 'Pos-Delete file cnt before optimizing', - `pos_delete_file_size_before` bigint(20) NOT NULL COMMENT 'Pos-Delete file size in bytes before optimizing', - `total_file_cnt_after` int(11) NOT NULL COMMENT 'Total file cnt after optimizing', - `total_file_size_after` bigint(20) NOT NULL COMMENT 'Total file cnt after optimizing', - `snapshot_id` bigint(20) DEFAULT NULL COMMENT 'Snapshot id after commit', - `total_size` bigint(20) DEFAULT NULL COMMENT 'Total size of the snapshot', - `added_files` int(11) DEFAULT NULL COMMENT 'Added files cnt of the snapshot', - `removed_files` int(11) DEFAULT NULL COMMENT 'Removed files cnt of the snapshot', - `added_records` bigint(20) DEFAULT NULL COMMENT 'Added records of the snapshot', - `removed_records` bigint(20) DEFAULT NULL COMMENT 'Removed records of the snapshot', - `added_files_size` bigint(20) DEFAULT NULL COMMENT 'Added files size of the snapshot', - `removed_files_size` bigint(20) DEFAULT NULL COMMENT 'Removed files size of the snapshot', - `total_files` bigint(20) DEFAULT NULL COMMENT 'Total file size of the snapshot', - `total_records` bigint(20) DEFAULT NULL COMMENT 'Total records of the snapshot', - `partition_cnt` int(11) NOT NULL COMMENT 'Partition cnt for this optimizing', - `partitions` text COMMENT 'Partitions', - `max_change_transaction_id` mediumtext COMMENT 'Max change transaction id of these tasks', - `optimize_type` varchar(10) NOT NULL COMMENT 'Optimize type: Major, Minor', - PRIMARY KEY (`history_id`), - KEY `table_name_record` (`catalog_name`,`db_name`,`table_name`,`history_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'History of optimizing after each commit'; - -CREATE TABLE `optimizer` -( - `optimizer_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, - `optimizer_name` varchar(1024) DEFAULT NULL COMMENT 'optimizer 
name', - `queue_id` int(11) DEFAULT NULL COMMENT 'queue id', - `queue_name` varchar(1024) DEFAULT NULL COMMENT 'queue name', - `optimizer_start_time` varchar(1024) DEFAULT NULL COMMENT 'optimizer start time', - `optimizer_fail_time` varchar(1024) DEFAULT NULL COMMENT 'optimizer fail time', - `optimizer_status` varchar(16) DEFAULT NULL COMMENT 'optimizer status', - `core_number` int(11) DEFAULT NULL COMMENT 'total number of all CPU resources', - `memory` bigint(30) DEFAULT NULL COMMENT 'optimizer use memory size', - `parallelism` int(11) DEFAULT NULL COMMENT 'optimizer parallelism', - `jobmanager_url` varchar(1024) DEFAULT NULL COMMENT 'jobmanager url', - `optimizer_instance` blob COMMENT 'optimizer instance bytes, use to deserialize optimizer instance', - `optimizer_state_info` mediumtext COMMENT 'optimizer state info, contains like yarn application id and flink job id', - `container` varchar(50) DEFAULT '' COMMENT 'name of container which this optimizer belongs to', - `update_time` timestamp not null default CURRENT_TIMESTAMP COMMENT 'update time', - PRIMARY KEY (`optimizer_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'Optimizer info'; - -CREATE TABLE `optimize_group` -( - `group_id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'Optimize group unique id', - `name` varchar(50) NOT NULL COMMENT 'Optimize group name', - `properties` mediumtext COMMENT 'Properties', - `container` varchar(100) DEFAULT NULL COMMENT 'Container: local, flink', - PRIMARY KEY (`group_id`), - UNIQUE KEY `uniqueName` (`name`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'Group to divide optimize resources'; - -CREATE TABLE `optimize_task` -( - `trace_id` varchar(40) NOT NULL COMMENT 'Optimize task uuid', - `optimize_type` varchar(10) NOT NULL COMMENT 'Optimize type: Major, Minor', - `catalog_name` varchar(64) NOT NULL COMMENT 'Catalog name', - `db_name` varchar(64) NOT NULL COMMENT 'Database name', - `table_name` varchar(64) NOT NULL COMMENT 'Table name', - `partition` varchar(128) DEFAULT NULL COMMENT 'Partition', - `task_commit_group` varchar(40) DEFAULT NULL COMMENT 'UUID. Commit group of task, task of one commit group should commit together', - `max_change_transaction_id` bigint(20) NOT NULL DEFAULT '-1' COMMENT 'Max change transaction id', - `create_time` datetime(3) DEFAULT NULL COMMENT 'Task create time', - `properties` text COMMENT 'Task properties', - `queue_id` int(11) NOT NULL COMMENT 'Task group id', - `insert_files` int(11) DEFAULT NULL COMMENT 'Insert file cnt', - `delete_files` int(11) DEFAULT NULL COMMENT 'Delete file cnt', - `base_files` int(11) DEFAULT NULL COMMENT 'Base file cnt', - `pos_delete_files` int(11) DEFAULT NULL COMMENT 'Pos-Delete file cnt', - `insert_file_size` bigint(20) DEFAULT NULL COMMENT 'Insert file size in bytes', - `delete_file_size` bigint(20) DEFAULT NULL COMMENT 'Delete file size in bytes', - `base_file_size` bigint(20) DEFAULT NULL COMMENT 'Base file size in bytes', - `pos_delete_file_size` bigint(20) DEFAULT NULL COMMENT 'Pos-Delete file size in bytes', - `source_nodes` varchar(2048) DEFAULT NULL COMMENT 'Source nodes of task', - `task_plan_group` varchar(40) DEFAULT NULL COMMENT 'UUID. 
Plan group of task, task of one plan group are planned together', - `status` varchar(16) DEFAULT NULL COMMENT 'Optimize Status: Init, Pending, Executing, Failed, Prepared, Committed', - `pending_time` datetime(3) DEFAULT NULL COMMENT 'Time when task start waiting to execute', - `execute_time` datetime(3) DEFAULT NULL COMMENT 'Time when task start executing', - `prepared_time` datetime(3) DEFAULT NULL COMMENT 'Time when task finish executing', - `report_time` datetime(3) DEFAULT NULL COMMENT 'Time when task report result', - `commit_time` datetime(3) DEFAULT NULL COMMENT 'Time when task committed', - `job_type` varchar(16) DEFAULT NULL COMMENT 'Job type', - `job_id` varchar(32) DEFAULT NULL COMMENT 'Job id', - `attempt_id` varchar(40) DEFAULT NULL COMMENT 'Attempt id', - `retry` int(11) DEFAULT NULL COMMENT 'Retry times', - `fail_reason` varchar(4096) DEFAULT NULL COMMENT 'Error message after task failed', - `fail_time` datetime(3) DEFAULT NULL COMMENT 'Fail time', - `new_file_size` bigint(20) DEFAULT NULL COMMENT 'File size generated by task executing', - `new_file_cnt` int(11) DEFAULT NULL COMMENT 'File cnt generated by task executing', - `cost_time` bigint(20) DEFAULT NULL COMMENT 'Task Execute cost time', - PRIMARY KEY (`trace_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'Optimize task basic information'; - -CREATE TABLE `snapshot_info_cache` -( - `table_identifier` varchar(384) NOT NULL COMMENT 'table full name with catalog.db.table', - `snapshot_id` bigint(20) NOT NULL COMMENT 'snapshot id', - `parent_snapshot_id` bigint(20) NOT NULL COMMENT 'parent snapshot id', - `action` varchar(64) DEFAULT NULL COMMENT 'snapshot type', - `inner_table` varchar(64) NOT NULL COMMENT 'table type like change/base', - `producer` varchar(64) NOT NULL DEFAULT 'INGESTION' COMMENT 'who produce this snapshot', - `file_size` bigint(20) NOT NULL DEFAULT 0 COMMENT 'file size', - `file_count` int(11) NOT NULL DEFAULT 0 COMMENT 'file count', - `commit_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'snapshot commit time', - PRIMARY KEY (`table_identifier`,`inner_table`,`snapshot_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'cache snapshot info of table'; - -CREATE TABLE `table_metadata` -( - `catalog_name` varchar(64) NOT NULL COMMENT 'Catalog name', - `db_name` varchar(64) NOT NULL COMMENT 'Database name', - `table_name` varchar(64) NOT NULL COMMENT 'Table name', - `primary_key` varchar(256) DEFAULT NULL COMMENT 'Primary key', - `sort_key` varchar(256) DEFAULT NULL COMMENT 'Sort key', - `table_location` varchar(256) DEFAULT NULL COMMENT 'Table location', - `base_location` varchar(256) DEFAULT NULL COMMENT 'Base table location', - `delta_location` varchar(256) DEFAULT NULL COMMENT 'change table location', - `properties` text COMMENT 'Table properties', - `meta_store_site` mediumtext COMMENT 'base64 code of meta store site', - `hdfs_site` mediumtext COMMENT 'base64 code of hdfs site', - `core_site` mediumtext COMMENT 'base64 code of core site', - `hbase_site` mediumtext COMMENT 'base64 code of hbase site', - `auth_method` varchar(32) DEFAULT NULL COMMENT 'auth method like KERBEROS/SIMPLE', - `hadoop_username` varchar(64) DEFAULT NULL COMMENT 'hadpp username when auth method is SIMPLE', - `krb_keytab` text COMMENT 'kerberos keytab when auth method is KERBEROS', - `krb_conf` text COMMENT 'kerberos conf when auth method is KERBEROS', - `krb_principal` text COMMENT 'kerberos principal when auth method is KERBEROS', - `current_tx_id` bigint(20) DEFAULT NULL COMMENT 'current transaction id', 
- `cur_schema_id` int(11) NOT NULL DEFAULT 0 COMMENT 'current schema id', - PRIMARY KEY `table_name_index` (`catalog_name`,`db_name`,`table_name`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'Table metadata'; - -CREATE TABLE `optimize_table_runtime` -( - `catalog_name` varchar(64) NOT NULL COMMENT 'Catalog name', - `db_name` varchar(64) NOT NULL COMMENT 'Database name', - `table_name` varchar(64) NOT NULL COMMENT 'Table name', - `current_snapshot_id` bigint(20) NOT NULL DEFAULT '-1' COMMENT 'Base table current snapshot id', - `latest_major_optimize_time` mediumtext COMMENT 'Latest Major Optimize time for all partitions', - `latest_minor_optimize_time` mediumtext COMMENT 'Latest Minor Optimize time for all partitions', - `latest_task_plan_group` varchar(40) DEFAULT NULL COMMENT 'Latest task plan group', - `optimize_status` varchar(20) DEFAULT 'Idle' COMMENT 'Table optimize status: MajorOptimizing, MinorOptimizing, Pending, Idle', - `optimize_status_start_time` datetime(3) DEFAULT NULL COMMENT 'Table optimize status start time', - `current_change_snapshotId` bigint(20) DEFAULT NULL COMMENT 'Change table current snapshot id', - `latest_full_optimize_time` MEDIUMTEXT NULL COMMENT 'Latest Full Optimize time for all partitions', - PRIMARY KEY `table_name_index` (`catalog_name`,`db_name`,`table_name`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'Optimize running information of each table'; - -CREATE TABLE `optimize_task_history` -( - `task_trace_id` varchar(50) NOT NULL COMMENT 'Optimize task uuid', - `retry` int(11) NOT NULL COMMENT 'Retry times for the same task_trace_id', - `task_plan_group` varchar(40) NOT NULL COMMENT 'Plan group of task, task of one plan group are planned together', - `catalog_name` varchar(64) NOT NULL COMMENT 'Catalog name', - `db_name` varchar(64) NOT NULL COMMENT 'Database name', - `table_name` varchar(64) NOT NULL COMMENT 'Table name', - `start_time` datetime(3) DEFAULT NULL COMMENT 'Task start time', - `end_time` datetime(3) DEFAULT NULL COMMENT 'Task end time', - `cost_time` bigint(20) DEFAULT NULL COMMENT 'Task cost time', - `queue_id` int(11) DEFAULT NULL COMMENT 'Queue id which execute task', - PRIMARY KEY (`task_trace_id`, `retry`), - KEY `table_end_time_plan_group_index` (`catalog_name`, `db_name`, `table_name`, `end_time`, `task_plan_group`), - KEY `table_plan_group_index` (`catalog_name`, `db_name`, `table_name`, `task_plan_group`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'History of each optimize task execute'; - -CREATE TABLE `table_transaction_meta` -( - `table_identifier` varchar(384) NOT NULL COMMENT 'table full name with catalog.db.table', - `transaction_id` bigint(20) NOT NULL COMMENT 'allocated transaction id', - `signature` varchar(256) NOT NULL COMMENT 'transaction request signature', - `commit_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'transaction allocate time', - PRIMARY KEY (`table_identifier`,`transaction_id`), - UNIQUE KEY `signature_unique` (`table_identifier`,`signature`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'transaction meta info of table'; - -CREATE TABLE `api_tokens` -( - `id` int(11) NOT NULL AUTO_INCREMENT, - `apikey` varchar(256) NOT NULL COMMENT 'openapi client public key', - `secret` varchar(256) NOT NULL COMMENT 'The key used by the client to generate the request signature', - `apply_time` datetime DEFAULT NULL COMMENT 'apply time', - PRIMARY KEY (`id`) USING BTREE, - UNIQUE KEY `account_unique` (`apikey`) USING BTREE COMMENT 'account unique' -) ENGINE=InnoDB AUTO_INCREMENT=33 DEFAULT 
CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC COMMENT='Openapi secret';
-
-CREATE TABLE `ddl_record`
-(
-    `table_identifier` varchar(384) NOT NULL COMMENT 'table full name with catalog.db.table',
-    `ddl`              mediumtext COMMENT 'ddl',
-    `ddl_type`         varchar(256) NOT NULL COMMENT 'ddl type',
-    `commit_time`      timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'ddl commit time'
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'ddl record of table';
\ No newline at end of file
diff --git a/docker/benchmark/images/scripts/benchmark-ams-entrypoint.sh b/docker/benchmark/images/scripts/benchmark-ams-entrypoint.sh
deleted file mode 100644
index 768f07c7..00000000
--- a/docker/benchmark/images/scripts/benchmark-ams-entrypoint.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-sleep 20
-/opt/arctic/bin/ams.sh start
\ No newline at end of file
diff --git a/docker/benchmark/images/scripts/benchmark-spark-entrypoint.sh b/docker/benchmark/images/scripts/benchmark-spark-entrypoint.sh
new file mode 100644
index 00000000..003a963d
--- /dev/null
+++ b/docker/benchmark/images/scripts/benchmark-spark-entrypoint.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+SQL_RESULT=""
+while [[ "${SQL_RESULT}" != 1 ]]
+do
+    echo "Wait for table initialization to complete..."
+    sleep 1
+    SQL_RESULT=$(mysql -h mysql -P 3306 --database=metastore \
+        --user="${MYSQL_USER}" --password="${MYSQL_PASSWORD}" \
+        --execute="select VER_ID from VERSION;" \
+        -s -N)
+done
+echo "MySQL initialization is successful, starting Spark Thrift Server..."
+
+sbin/start-thriftserver.sh
\ No newline at end of file
diff --git a/docker/benchmark/images/spark-config/spark-defaults.conf b/docker/benchmark/images/spark-config/spark-defaults.conf
index 5f717d8e..aba9ba5d 100644
--- a/docker/benchmark/images/spark-config/spark-defaults.conf
+++ b/docker/benchmark/images/spark-config/spark-defaults.conf
@@ -5,6 +5,7 @@ spark.sql.catalog.local_catalog=com.netease.arctic.spark.ArcticSparkCatalog
 spark.sql.catalog.local_catalog.url=thrift://ams:1260/local_catalog
 
 #Iceberg
+spark.sql.iceberg.handle-timestamp-without-timezone=true
 spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
 spark.sql.catalog.iceberg_catalog=org.apache.iceberg.spark.SparkCatalog
 spark.sql.catalog.iceberg_catalog.type=hive

From bc7d84f42279148320e0121941c001cf46b5d9f2 Mon Sep 17 00:00:00 2001
From: huangjianmin <531493269@qq.com>
Date: Wed, 23 Nov 2022 14:51:04 +0800
Subject: [PATCH 02/18] polish document and change some settings in config

---
 docker/benchmark/README.md                     | 155 ++++++++++++------
 docker/benchmark/docker-compose.yml            |   4 +-
 .../benchmark-config/spark_arctic_config.xml   |   4 +-
 .../benchmark-config/spark_iceberg_config.xml  |   2 +-
 .../images/spark-config/spark-defaults.conf    |   4 -
 5 files changed, 108 insertions(+), 61 deletions(-)

diff --git a/docker/benchmark/README.md b/docker/benchmark/README.md
index 2177b457..1e8ee98e 100644
--- a/docker/benchmark/README.md
+++ b/docker/benchmark/README.md
@@ -1,13 +1,14 @@
 ## 介绍
-Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用户熟悉 Benchamrk 流程。其中 Hdfs 文件系统用本地文件系统代替,所以确保运行目录有足够存储空间
+Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用户熟悉 Benchmark 流程。其中 HDFS 文件系统用本地文件系统代替,所以确保运行目录有足够存储空间。相关配置文件已挂载到宿主机,修改主机中的配置文件即可同步到容器。
 ## 使用
+本 Docker 环境使用介绍仅为引导用户进行简单入门,熟悉之后建议配合 Arctic 官方 [Benchmark流程](https://github.com/NetEase/arctic/blob/master/site/docs/ch/benchmark/benchmark-step.md) 文档以及本项目主页文档进行更为深入的测试与使用。
 进入 docker/benchmark 目录下
-如果需要自己build镜像需要先执行
+远程仓库 DockerHub 中已上传构建好的镜像,如需自己构建镜像需要先执行一下命令构建镜像:
 ```
 ./build-image.sh
 ```
-构建镜像
+
 如果不构建镜像也可以从远程仓库下载,不过需要初始化环境变量
 ```
source .env @@ -17,11 +18,10 @@ Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用 ``` docker-compose up -d ``` -即可通过 docker-compose (如果没有 docker-compose 那么需要安装)拉起全部容器,主要有 mysql,hive,ams,trino,presto,spark,lakehouse-benchmark,lakehouse-benchmark-ingestion -其中 hive 是测试 iceberg 和 hudi 时需要的,presto 是专门测试 hudi 用的。其中 lakehouse-benchmark,lakehouse-benchmark-ingestion 两个是静态容器 -只有配置好的环境信息,用户需要执行特定的命令触发运行。 +即可通过 docker-compose (如果没有 docker-compose 那么需要安装)拉起全部容器,主要有 mysql,hive, ams, trino, presto, spark, lakehouse-benchmark, lakehouse-benchmark-ingestion +其中 hive 是测试 iceberg 和 hudi 时需要的,presto 是专门测试 hudi 用的。 - - 使用如下命令进入 + - 使用如下命令生成静态数据到 mysql : ``` docker exec -it lakehouse-benchmark \ java -jar lakehouse-benchmark.jar \ @@ -29,8 +29,8 @@ Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用 -c config/mysql/sample_chbenchmark_config.xml \ --create=true --load=true ``` - 生成静态数据进入 mysql。 - - 使用如下命令进入 + + - 使用如下命令开启数据同步程序,将数据库的数据实时同步到数据湖 ``` docker exec -it benchmark-lakehouse-ingestion \ java -cp lakehouse-benchmark-ingestion-1.0-SNAPSHOT.jar \ @@ -39,10 +39,11 @@ Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用 -sinkType [arctic/iceberg/hudi] \ -sinkDatabase [arctic/iceberg/hudi] ``` - 上述命令需要选择 sinkType 及 sinkDatabase 参数,命令行参数的具体说明请参考[lakehouse-benchmark-ingestion](https://github.com/NetEase/lakehouse-benchmark-ingestion) - ingestion 服务可以通过宿主机上的localhost:8081页面打开 Flink Web UI,观察数据同步情况。 - - 等 lakehouse-benchmark-ingestion 容器同步完数据以后在进入lakehouse-benchmark 容器,进行静态数据查询性能测试 - - arctic + 上述命令需要选择 sinkType 及 sinkDatabase 参数,命令行参数的具体说明请参考 [lakehouse-benchmark-ingestion](https://github.com/NetEase/lakehouse-benchmark-ingestion)。 + 可以通过宿主机上的 `localhost:8081` 页面打开 [Flink Web UI](localhost:8081),观察数据同步情况。 + 观察 Flink Web UI ,通过 source 算子的 Records Sent 指标观察数据同步的情况,当该指标不再增加时,表示全量数据同步完成。 + - 等 lakehouse-benchmark-ingestion 容器同步完数据以后,保留此窗口以便后续使用以及观察日志。再新建一个窗口执行命令进入lakehouse-benchmark 容器,进行静态数据查询性能测试,推荐使用 Trino 进行测试: + - Arctic ``` docker exec -it lakehouse-benchmark \ java -jar lakehouse-benchmark.jar \ @@ -50,7 +51,7 @@ Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用 -c config/trino/trino_arctic_config.xml \ --create=false --load=false --execute=true ``` - - iceberg + - Iceberg ``` docker exec -it lakehouse-benchmark \ java -jar lakehouse-benchmark.jar \ @@ -58,7 +59,7 @@ Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用 -c config/trino/trino_iceberg_config.xml \ --create=false --load=false --execute=true ``` - - hudi + - Hudi ``` docker exec -it lakehouse-benchmark \ java -jar lakehouse-benchmark.jar \ @@ -66,40 +67,90 @@ Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用 -c config/trino/presto_hudi_config.xml \ --create=false --load=false --execute=true ``` - - 上述测试的是静态数据,数据中不包含 update,delete,如果想测试动态数据需要边向 Mysql 造数据边测试查询,进入 lakehouse-benchmark 容器 - 先执行产生tpcc数据的命令: - ``` - docker exec -it lakehouse-benchmark \ - nohup java -jar lakehouse-benchmark.jar \ - -b tpcc,chbenchmark \ - -c config/mysql/sample_chbenchmark_config.xml \ - --execute=true -s 5 >> run.log1 2>&1 & - ``` - 然后同时执行tpch性能查询的命令: - - arctic - ``` - docker exec -it lakehouse-benchmark \ - nohup java -jar lakehouse-benchmark.jar \ - -b chbenchmarkForTrino \ - -c config/trino/trino_arctic_config.xml \ - --create=false --load=false --execute=true >> run.log2 2>&1 & - ``` - - iceberg - ``` - docker exec -it lakehouse-benchmark \ - nohup java -jar lakehouse-benchmark.jar \ - -b chbenchmarkForTrino \ - -c config/trino/trino_iceberg_config.xml \ - --create=false --load=false --execute=true >> run.log2 2>&1 & - ``` - - hudi - ``` - docker exec -it lakehouse-benchmark \ - nohup java -jar lakehouse-benchmark.jar \ - -b 
chbenchmarkForTrino \ - -c config/trino/presto_hudi_config.xml \ - --create=false --load=false --execute=true >> run.log2 2>&1 & - ``` - - 进入 lakehouse-benchmark 容器,配置放在 config 目录下,如果想要测试 hudi 需要使用 config/trino/presto_chbenchmark_config.xml 配置文件, - 如果需要测试 iceberg 等库需要修改 config/trino/trino_chbenchmark_config.xml 里面的 catalog 和 database 名称。 \ No newline at end of file + - 本 Docker 环境也支持使用 Spark 进行测试: + - Arctic + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_arctic_config.xml \ + --create=false --load=false --execute=true + ``` + - Iceberg + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_iceberg_config.xml \ + --create=false --load=false --execute=true + ``` + - Hudi + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_hudi_config.xml \ + --create=false --load=false --execute=true + ``` + + - 上述测试的是静态数据,数据中不包含 update,delete,如果想测试动态数据需要边向 Mysql 造数据边测试查询, + 进入 lakehouse-benchmark 容器执行命令向 Mysql 里生产增量数据,这些数据会通过已经运行的数据同步工具源源不断写入数据湖: + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b tpcc,chbenchmark \ + -c config/mysql/sample_chbenchmark_config.xml \ + --execute=true -s 5 + ``` + 再新建一个窗口,然后同时执行 TPCH 性能查询的命令 (Trino) : + - Arctic + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForTrino \ + -c config/trino/trino_arctic_config.xml \ + --create=false --load=false --execute=true + ``` + - Iceberg + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForTrino \ + -c config/trino/trino_iceberg_config.xml \ + --create=false --load=false --execute=true + ``` + - Hudi + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForTrino \ + -c config/trino/presto_hudi_config.xml \ + --create=false --load=false --execute=true + ``` + 也可以使用 Spark : + - Arctic + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_arctic_config.xml \ + --create=false --load=false --execute=true + ``` + - Iceberg + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_iceberg_config.xml \ + --create=false --load=false --execute=true + ``` + - Hudi + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_hudi_config.xml \ + --create=false --load=false --execute=true + ``` +## 测试结果 +测试跑完以后会在 `lakehouse-benchmark` 容器 `/usr/lib/lakehouse-benchmark/` 目录下生成一个 `results` 目录,测试结果都在里面,主要关注两个文件,第一:`xxx.summary.json` 文件, 这里面的 Average Latency 项显示的是本次性能测试的平均响应时间,第二:`xxx.statistic.csv` 文件,里面记录了每个 Query 类型的最大,最小,平均耗时。 \ No newline at end of file diff --git a/docker/benchmark/docker-compose.yml b/docker/benchmark/docker-compose.yml index a25eb11e..674fcfaa 100644 --- a/docker/benchmark/docker-compose.yml +++ b/docker/benchmark/docker-compose.yml @@ -54,7 +54,7 @@ services: - ./images/trino-presto-config/iceberg.properties:/etc/trino/catalog/iceberg.properties - ./images/trino-presto-config/delta-lake.properties:/etc/trino/catalog/delta-lake.properties ports: - - 9001:8080 + - 9003:8080 depends_on: - ams - metastore @@ -67,7 +67,7 @@ services: - ./hive:/tmp/hive/warehouse - 
./images/trino-presto-config/hudi.properties:/opt/presto-server/etc/catalog/hudi.properties ports: - - 9002:8080 + - 9004:8080 depends_on: - ams - metastore diff --git a/docker/benchmark/images/benchmark-config/spark_arctic_config.xml b/docker/benchmark/images/benchmark-config/spark_arctic_config.xml index 55117f26..3f0678b0 100644 --- a/docker/benchmark/images/benchmark-config/spark_arctic_config.xml +++ b/docker/benchmark/images/benchmark-config/spark_arctic_config.xml @@ -4,7 +4,7 @@ HIVE org.apache.hive.jdbc.HiveDriver - jdbc:hive2://spark:10000/local_catalog + jdbc:hive2://spark:10000/local_catalog.arctic hdfs TRANSACTION_READ_UNCOMMITTED @@ -129,4 +129,4 @@ StockLevel - \ No newline at end of file + diff --git a/docker/benchmark/images/benchmark-config/spark_iceberg_config.xml b/docker/benchmark/images/benchmark-config/spark_iceberg_config.xml index 2457ce3d..7fbcfa37 100644 --- a/docker/benchmark/images/benchmark-config/spark_iceberg_config.xml +++ b/docker/benchmark/images/benchmark-config/spark_iceberg_config.xml @@ -4,7 +4,7 @@ HIVE org.apache.hive.jdbc.HiveDriver - jdbc:hive2://spark:10000/iceberg_catalog + jdbc:hive2://spark:10000/iceberg_catalog.iceberg hdfs TRANSACTION_READ_UNCOMMITTED diff --git a/docker/benchmark/images/spark-config/spark-defaults.conf b/docker/benchmark/images/spark-config/spark-defaults.conf index aba9ba5d..0f7903a2 100644 --- a/docker/benchmark/images/spark-config/spark-defaults.conf +++ b/docker/benchmark/images/spark-config/spark-defaults.conf @@ -14,7 +14,3 @@ spark.sql.catalog.iceberg_catalog.uri=thrift://metastore:9083 #hudi spark.serializer=org.apache.spark.serializer.KryoSerializer spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension - -#delta -spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension -spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog From 3d0b85070bb6659f6b631965f6bcdb6764677482 Mon Sep 17 00:00:00 2001 From: huangjianmin <531493269@qq.com> Date: Wed, 23 Nov 2022 14:51:52 +0800 Subject: [PATCH 03/18] polish document and change some settings in config --- docker/benchmark/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/benchmark/docker-compose.yml b/docker/benchmark/docker-compose.yml index 674fcfaa..a25eb11e 100644 --- a/docker/benchmark/docker-compose.yml +++ b/docker/benchmark/docker-compose.yml @@ -54,7 +54,7 @@ services: - ./images/trino-presto-config/iceberg.properties:/etc/trino/catalog/iceberg.properties - ./images/trino-presto-config/delta-lake.properties:/etc/trino/catalog/delta-lake.properties ports: - - 9003:8080 + - 9001:8080 depends_on: - ams - metastore @@ -67,7 +67,7 @@ services: - ./hive:/tmp/hive/warehouse - ./images/trino-presto-config/hudi.properties:/opt/presto-server/etc/catalog/hudi.properties ports: - - 9004:8080 + - 9002:8080 depends_on: - ams - metastore From 30cbcf251217eedae613edc723c48d911319c3be Mon Sep 17 00:00:00 2001 From: huangjianmin <531493269@qq.com> Date: Wed, 23 Nov 2022 14:59:22 +0800 Subject: [PATCH 04/18] polish document and change some settings in config --- docker/benchmark/README.md | 100 ++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/docker/benchmark/README.md b/docker/benchmark/README.md index 1e8ee98e..8e073499 100644 --- a/docker/benchmark/README.md +++ b/docker/benchmark/README.md @@ -93,64 +93,64 @@ Docker 的全套 Benchmark 容器只支持单机版本,主要是为了让用 --create=false --load=false --execute=true ``` - - 上述测试的是静态数据,数据中不包含 
update,delete,如果想测试动态数据需要边向 Mysql 造数据边测试查询, - 进入 lakehouse-benchmark 容器执行命令向 Mysql 里生产增量数据,这些数据会通过已经运行的数据同步工具源源不断写入数据湖: - ``` - docker exec -it lakehouse-benchmark \ - java -jar lakehouse-benchmark.jar \ - -b tpcc,chbenchmark \ - -c config/mysql/sample_chbenchmark_config.xml \ - --execute=true -s 5 - ``` - 再新建一个窗口,然后同时执行 TPCH 性能查询的命令 (Trino) : - - Arctic + - 上述测试的是静态数据,数据中不包含 update,delete,如果想测试动态数据需要边向 Mysql 造数据边测试查询, + 进入 lakehouse-benchmark 容器执行命令向 Mysql 里生产增量数据,这些数据会通过已经运行的数据同步工具源源不断写入数据湖: + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b tpcc,chbenchmark \ + -c config/mysql/sample_chbenchmark_config.xml \ + --execute=true -s 5 + ``` + - 再新建一个窗口,然后同时执行 TPCH 性能查询的命令 (Trino) : + - Arctic ``` docker exec -it lakehouse-benchmark \ - java -jar lakehouse-benchmark.jar \ - -b chbenchmarkForTrino \ - -c config/trino/trino_arctic_config.xml \ - --create=false --load=false --execute=true + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForTrino \ + -c config/trino/trino_arctic_config.xml \ + --create=false --load=false --execute=true ``` - - Iceberg + - Iceberg ``` docker exec -it lakehouse-benchmark \ - java -jar lakehouse-benchmark.jar \ - -b chbenchmarkForTrino \ - -c config/trino/trino_iceberg_config.xml \ - --create=false --load=false --execute=true + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForTrino \ + -c config/trino/trino_iceberg_config.xml \ + --create=false --load=false --execute=true ``` - - Hudi + - Hudi ``` docker exec -it lakehouse-benchmark \ - java -jar lakehouse-benchmark.jar \ - -b chbenchmarkForTrino \ - -c config/trino/presto_hudi_config.xml \ - --create=false --load=false --execute=true + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForTrino \ + -c config/trino/presto_hudi_config.xml \ + --create=false --load=false --execute=true ``` - 也可以使用 Spark : - - Arctic - ``` - docker exec -it lakehouse-benchmark \ - java -jar lakehouse-benchmark.jar \ - -b chbenchmarkForSpark \ - -c config/spark/spark_arctic_config.xml \ - --create=false --load=false --execute=true - ``` - - Iceberg - ``` - docker exec -it lakehouse-benchmark \ - java -jar lakehouse-benchmark.jar \ - -b chbenchmarkForSpark \ - -c config/spark/spark_iceberg_config.xml \ - --create=false --load=false --execute=true - ``` - - Hudi - ``` - docker exec -it lakehouse-benchmark \ - java -jar lakehouse-benchmark.jar \ - -b chbenchmarkForSpark \ - -c config/spark/spark_hudi_config.xml \ - --create=false --load=false --execute=true - ``` + - 也可以使用 Spark : + - Arctic + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_arctic_config.xml \ + --create=false --load=false --execute=true + ``` + - Iceberg + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_iceberg_config.xml \ + --create=false --load=false --execute=true + ``` + - Hudi + ``` + docker exec -it lakehouse-benchmark \ + java -jar lakehouse-benchmark.jar \ + -b chbenchmarkForSpark \ + -c config/spark/spark_hudi_config.xml \ + --create=false --load=false --execute=true + ``` ## 测试结果 测试跑完以后会在 `lakehouse-benchmark` 容器 `/usr/lib/lakehouse-benchmark/` 目录下生成一个 `results` 目录,测试结果都在里面,主要关注两个文件,第一:`xxx.summary.json` 文件, 这里面的 Average Latency 项显示的是本次性能测试的平均响应时间,第二:`xxx.statistic.csv` 文件,里面记录了每个 Query 类型的最大,最小,平均耗时。 \ No newline at end of file From f73576cd80015ac34e49aa77191e449b28099666 Mon Sep 17 00:00:00 2001 From: huangjianmin <531493269@qq.com> Date: 
Wed, 30 Nov 2022 15:01:17 +0800
Subject: [PATCH 05/18] polish document and change some settings in config

---
 .run/chbenchmark - spark.run.xml               |  2 +-
 docker/benchmark/README.md                     | 62 ++++++++++---------
 .../benchmark-lakehouse-ingestion.Dockerfile   |  2 +-
 3 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/.run/chbenchmark - spark.run.xml b/.run/chbenchmark - spark.run.xml
index 7194ee85..2d7a84e7 100644
--- a/.run/chbenchmark - spark.run.xml
+++ b/.run/chbenchmark - spark.run.xml
@@ -1,7 +1,7 @@
-
+