Drop spark31x shims [databricks] #11159

Merged
merged 9 commits into from Jul 12, 2024
Changes from 4 commits
70 changes: 1 addition & 69 deletions aggregator/pom.xml
@@ -252,79 +252,11 @@

<profiles>
<profile>
<id>release311</id>
<id>release320</id>
<activation>
<!-- #if scala-2.12 -->
<activeByDefault>true</activeByDefault>
<!-- #endif scala-2.12 -->
<property>
<name>buildver</name>
<value>311</value>
</property>
</activation>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-delta-stub_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
</dependency>
</dependencies>
</profile>
<profile>
<id>release312</id>
<activation>
<property>
<name>buildver</name>
<value>312</value>
</property>
</activation>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-delta-stub_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
</dependency>
</dependencies>
</profile>
<profile>
<id>release313</id>
<activation>
<property>
<name>buildver</name>
<value>313</value>
</property>
</activation>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-delta-stub_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
</dependency>
</dependencies>
</profile>
<profile>
<id>release314</id>
<activation>
<property>
<name>buildver</name>
<value>314</value>
</property>
</activation>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-delta-stub_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
</dependency>
</dependencies>
</profile>
<profile>
<id>release320</id>
<activation>
<property>
<name>buildver</name>
<value>320</value>
4 changes: 2 additions & 2 deletions api_validation/auditAllVersions.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,4 +14,4 @@
# limitations under the License.
set -ex

mvn scala:run -P spark311
mvn scala:run -P spark320
6 changes: 3 additions & 3 deletions build/buildall
@@ -274,8 +274,8 @@ export -f build_single_shim
# Install all the versions for DIST_PROFILE

# First build the aggregator module for all SPARK_SHIM_VERSIONS in parallel skipping expensive plugins that
# - either deferred to 311 because the check is identical in all shim profiles such as scalastyle
# - or deferred to 311 because we currently don't require it per shim such as scaladoc generation
# - either deferred to 320 because the check is identical in all shim profiles such as scalastyle
# - or deferred to 320 because we currently don't require it per shim such as scaladoc generation
# - or there is a dedicated step to run against a particular shim jar such as unit tests, in
# the near future we will run unit tests against a combined multi-shim jar to catch classloading
# regressions even before pytest-based integration_tests
@@ -296,7 +296,7 @@ time (
fi
# This used to resume from dist. However, without including aggregator in the build
# the build does not properly initialize spark.version property via buildver profiles
# in the root pom, and we get a missing spark311 dependency even for --profile=312,321
# in the root pom, and we get a missing spark320 dependency even for --profile=320,321
# where the build does not require it. Moving it to aggregator resolves this issue with
# a negligible increase of the build time by ~2 seconds.
joinShimBuildFrom="aggregator"
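For orientation, the parallel per-shim aggregator build described in the comments above can be sketched roughly as below. This is an illustration only, not the buildall implementation; the `-Dskip.expensive.checks` property is a hypothetical stand-in for whatever properties the real profiles use to defer scalastyle/scaladoc to the 320 build.

```python
# Sketch only: fan out one Maven invocation per shim, mirroring the strategy
# described in the buildall comments above.
import subprocess
from concurrent.futures import ThreadPoolExecutor

BASE_SHIM = "320"  # checks that are identical across shims run only here

def build_shim(buildver):
    cmd = ["mvn", "install", "-pl", "aggregator", "-am", "-Dbuildver=" + buildver]
    if buildver != BASE_SHIM:
        cmd.append("-Dskip.expensive.checks=true")  # hypothetical placeholder property
    subprocess.run(cmd, check=True)

def build_all(shim_versions):
    # the per-shim builds are independent, so they can run concurrently
    with ThreadPoolExecutor(max_workers=4) as pool:
        list(pool.map(build_shim, shim_versions))
```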
6 changes: 3 additions & 3 deletions build/make-scala-version-build-files.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
#
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@ set -e

VALID_VERSIONS=( 2.13 )
declare -A DEFAULT_SPARK
DEFAULT_SPARK[2.12]="spark311"
DEFAULT_SPARK[2.12]="spark320"
DEFAULT_SPARK[2.13]="spark330"

usage() {
@@ -94,4 +94,4 @@ sed_i '/<spark\-rapids\-jni\.version>/,/<scala\.binary\.version>[0-9]*\.[0-9]*</
# Match any scala version to ensure idempotency
SCALA_VERSION=$(mvn help:evaluate -Pscala-${TO_VERSION} -Dexpression=scala.version -q -DforceStdout)
sed_i '/<spark\-rapids\-jni\.version>/,/<scala.version>[0-9]*\.[0-9]*\.[0-9]*</s/<scala\.version>[0-9]*\.[0-9]*\.[0-9]*</<scala.version>'$SCALA_VERSION'</' \
"$TO_DIR/pom.xml"
"$TO_DIR/pom.xml"
8 changes: 4 additions & 4 deletions build/shimplify.py
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -67,15 +67,15 @@
Each line is assumed to be a JSON to keep it extensible.

/*** spark-rapids-shim-json-lines
{"spark": "312"}
{"spark": "320"}
{"spark": "323"}
spark-rapids-shim-json-lines ***/

The canonical location of a source file shared by multiple shims is
src/main/<top_buildver_in_the_comment>

You can find all shim files for a particular shim, e.g. 312, easily by executing:
git grep '{"spark": "312"}' '*.java' '*.scala'
You can find all shim files for a particular shim, e.g. 320, easily by executing:
git grep '{"spark": "320"}' '*.java' '*.scala'
"""

import errno
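As a side note, the shim-tagging block documented above is simple to parse outside of shimplify itself. A minimal, hypothetical helper (not part of shimplify.py) that lists which buildvers a source file is tagged with:

```python
# Hypothetical illustration only; shimplify.py has its own parsing logic.
# Assumes the spark-rapids-shim-json-lines comment format shown above.
import json
import re

SHIM_BLOCK = re.compile(
    r"/\*\*\* spark-rapids-shim-json-lines\n(.*?)\nspark-rapids-shim-json-lines \*\*\*/",
    re.DOTALL)

def shims_for_file(path):
    with open(path, encoding="utf-8") as f:
        match = SHIM_BLOCK.search(f.read())
    if match is None:
        return []
    # every line inside the block is a standalone JSON object, e.g. {"spark": "320"}
    return [json.loads(line)["spark"] for line in match.group(1).splitlines()]

# e.g. shims_for_file("GpuSomethingShims.scala") might return ["320", "323"]
```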

This file was deleted.

8 changes: 4 additions & 4 deletions dist/build/package-parallel-worlds.py
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -73,8 +73,8 @@ def shell_exec(shell_cmd):
shell_exec(mvn_cmd)

dist_dir = os.sep.join([source_basedir, 'dist'])
with open(os.sep.join([dist_dir, 'unshimmed-common-from-spark311.txt']), 'r') as f:
from_spark311 = f.read().splitlines()
with open(os.sep.join([dist_dir, 'unshimmed-common-from-spark320.txt']), 'r') as f:
from_spark320 = f.read().splitlines()
with open(os.sep.join([dist_dir, 'unshimmed-from-each-spark3xx.txt']), 'r') as f:
from_each = f.read().splitlines()
with zipfile.ZipFile(os.sep.join([deps_dir, art_jar]), 'r') as zip_handle:
@@ -88,7 +88,7 @@ def shell_exec(shell_cmd):
# TODO deprecate
namelist = zip_handle.namelist()
matching_members = []
glob_list = from_spark311 + from_each if bv == buildver_list[0] else from_each
glob_list = from_spark320 + from_each if bv == buildver_list[0] else from_each
for pat in glob_list:
new_matches = fnmatch.filter(namelist, pat)
matching_members += new_matches
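The member-selection step in the diff above boils down to shell-style glob matching over the jar's name list. A small sketch, with illustrative file names and patterns rather than the project's actual lists:

```python
# Illustration of the fnmatch-based filtering used by package-parallel-worlds.py;
# the jar path and patterns below are examples, not the real build inputs.
import fnmatch
import zipfile

def matching_members(jar_path, patterns):
    with zipfile.ZipFile(jar_path, "r") as zf:
        names = zf.namelist()
    matches = []
    for pat in patterns:
        # each pattern is a glob such as "com/nvidia/spark/rapids/*.class"
        matches += fnmatch.filter(names, pat)
    return matches
```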
2 changes: 1 addition & 1 deletion dist/maven-antrun/build-parallel-worlds.xml
@@ -132,7 +132,7 @@
<!-- Remove the explicitly unshimmed files from the common directory -->
<delete>
<fileset dir="${project.build.directory}/parallel-world/spark-shared"
includesfile="${spark.rapids.source.basedir}/${rapids.module}/unshimmed-common-from-spark311.txt"/>
includesfile="${spark.rapids.source.basedir}/${rapids.module}/unshimmed-common-from-spark320.txt"/>
</delete>
</target>
<target name="remove-dependencies-from-pom" depends="build-parallel-worlds">
3 changes: 1 addition & 2 deletions dist/pom.xml
@@ -145,7 +145,6 @@
<id>minimumFeatureVersionMix</id>
<properties>
<included_buildvers>
312,
320,
321cdh,
330,
@@ -389,7 +388,7 @@ self.log("... OK")
<target>
<taskdef resource="net/sf/antcontrib/antcontrib.properties"/>
<ac:if xmlns:ac="antlib:net.sf.antcontrib">
<equals arg1="spark311" arg2="${spark.version.classifier}"/>
<equals arg1="spark320" arg2="${spark.version.classifier}"/>
<ac:then>
<java classname="com.nvidia.spark.rapids.RapidsConf" failonerror="true">
<arg value="${project.basedir}/../docs/configs.md"/>
8 changes: 4 additions & 4 deletions dist/scripts/binary-dedupe.sh
@@ -131,14 +131,14 @@ mv "$SPARK_SHARED_DIR" parallel-world/
# identical regardless of the Spark-version-specific jar.
#
# At this point the duplicate classes have not been removed from version-specific jar
# locations such as parallel-world/spark312.
# locations such as parallel-world/spark321.
# For each unshimmed class file look for all of its copies inside /spark[34]* and
# and count the number of distinct checksums. There are two representative cases
# 1) The class is contributed to the unshimmed location via the unshimmed-from-each-spark34 list. These are classes
# carrying the shim classifier in their package name such as
# com.nvidia.spark.rapids.spark312.RapidsShuffleManager. They are unique by construction,
# and will have zero copies in any non-spark312 shims. Although such classes are currently excluded from
# being copied to the /spark312 Parallel World we keep the algorithm below general without assuming this.
# com.nvidia.spark.rapids.spark321.RapidsShuffleManager. They are unique by construction,
# and will have zero copies in any non-spark321 shims. Although such classes are currently excluded from
# being copied to the /spark321 Parallel World we keep the algorithm below general without assuming this.
#
# 2) The class is contributed to the unshimmed location via unshimmed-common. These are classes that
# that have the same package and class name across all parallel worlds.
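The checksum test described in the comments above can be pictured with a short sketch: for one unshimmed class, count how many distinct digests its copies have across the per-shim parallel worlds. Paths and layout here are assumptions; the real logic is the bash in binary-dedupe.sh.

```python
# Illustrative only; binary-dedupe.sh performs this with md5sum over the
# parallel-world/spark* directories produced by the dist build.
import glob
import hashlib
import os

def distinct_checksums(class_rel_path, parallel_world="parallel-world"):
    digests = set()
    for shim_dir in glob.glob(os.path.join(parallel_world, "spark[34]*")):
        candidate = os.path.join(shim_dir, class_rel_path)
        if os.path.isfile(candidate):
            with open(candidate, "rb") as f:
                digests.add(hashlib.md5(f.read()).hexdigest())
    # 0 copies: class unique to the unshimmed location (case 1 above);
    # 1 distinct checksum: bitwise-identical everywhere, safe to dedupe (case 2)
    return len(digests)
```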
14 changes: 7 additions & 7 deletions docs/dev/shimplify.md
@@ -28,7 +28,7 @@ time of this writing) have a new set of special sibling directories
`src/(main|test)/spark${buildver}`.

Previous `src/(main|test)/${buildver}` and
version-range-with-exceptions directories such as `src/main/311until340-non330db` are deprecated and
version-range-with-exceptions directories such as `src/main/320until340-non330db` are deprecated and
are being removed as a result of the conversion to the new structure.

`shimplify` changes the way the source code is shared among shims by using an explicit
@@ -37,7 +37,7 @@ in a source-code level comment instead of the shared directories.

```scala
/*** spark-rapids-shim-json-lines
{"spark": "312"}
{"spark": "320"}
{"spark": "323"}
spark-rapids-shim-json-lines ***/
```
@@ -155,7 +155,7 @@ It is not expected to be really necessary but it is possible to convert a subset

* Either by adding -Dshimplify.shims=buildver1,buildver2,... to the commands above
* Or by specifying a list of directories you would like to delete to have a simpler directory
-Dshimplify.dirs=311until340-non330db,320until330-noncdh
-Dshimplify.dirs=320until340-non330db,320until330-noncdh

The latter is just a minor twist on the former. Instead of having an explicit list of shims, it
first computes the list of all `buildver` values using provided directories. After this *all* the
@@ -202,15 +202,15 @@ work on resolving potential compilation failures manually.

## Deleting a Shim

Every Spark build is de-supported eventually. To drop a build say 311 you can run
Every Spark build is de-supported eventually. To drop a build say 320 you can run

```bash
mvn generate-sources -Dshimplify=true -Dshimplify.move=true \
-Dshimplify.remove.shim=311
-Dshimplify.remove.shim=320
```

This command will remove the comment line `{"spark": "311"}` from all source files contributing to
the 311 shim. If a file belongs exclusively to 311 it will be removed.
This command will remove the comment line `{"spark": "320"}` from all source files contributing to
the 320 shim. If a file belongs exclusively to 320 it will be removed.

After adding or deleting shims you should sanity-check the diff in the local git repo and
run the integration tests above.
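The behaviour described for `-Dshimplify.remove.shim` (strip the shim's JSON line, delete files that belong only to that shim) can be outlined roughly as follows. This is a hypothetical sketch, not shimplify's implementation, and it ignores the follow-up step of moving surviving files to their new canonical directories.

```python
# Hypothetical sketch of dropping one buildver from a converted source file.
import os

def remove_shim(path, buildver):
    shim_line = '{"spark": "%s"}' % buildver
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()
    if not any(l.strip() == shim_line for l in lines):
        return  # file does not contribute to this shim
    kept = [l for l in lines if l.strip() != shim_line]
    if any(l.strip().startswith('{"spark":') for l in kept):
        # other shims still share this file, so only the one JSON line goes
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(kept)
    else:
        # the file contributed exclusively to the dropped shim
        os.remove(path)
```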
2 changes: 1 addition & 1 deletion jenkins/spark-nightly-build.sh
@@ -34,7 +34,7 @@ MVN="mvn -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3

DIST_PL="dist"
function mvnEval {
$MVN help:evaluate -q -pl $DIST_PL $MVN_URM_MIRROR -Prelease311 -Dmaven.repo.local=$M2DIR -DforceStdout -Dexpression=$1
$MVN help:evaluate -q -pl $DIST_PL $MVN_URM_MIRROR -Prelease320 -Dmaven.repo.local=$M2DIR -DforceStdout -Dexpression=$1
}

ART_ID=$(mvnEval project.artifactId)
4 changes: 2 additions & 2 deletions jenkins/spark-premerge-build.sh
@@ -83,12 +83,12 @@ mvn_verify() {
# The jacoco coverage should have been collected, but because of how the shade plugin
# works and jacoco we need to clean some things up so jacoco will only report for the
# things we care about
SPK_VER=${JACOCO_SPARK_VER:-"311"}
SPK_VER=${JACOCO_SPARK_VER:-"320"}
mkdir -p target/jacoco_classes/
FILE=$(ls dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)
UDF_JAR=$(ls ./udf-compiler/target/spark${SPK_VER}/rapids-4-spark-udf_2.12-*-spark${SPK_VER}.jar | grep -v test | xargs readlink -f)
pushd target/jacoco_classes/
jar xf $FILE com org rapids spark-shared "spark${JACOCO_SPARK_VER:-311}/"
jar xf $FILE com org rapids spark-shared "spark${JACOCO_SPARK_VER:-320}/"
# extract the .class files in udf jar and replace the existing ones in spark3xx-ommon and spark$SPK_VER
# because the class files in udf jar will be modified in aggregator's shade phase
jar xf "$UDF_JAR" com/nvidia/spark/udf
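The coverage preparation above (unpack the dist jar's classes, then overwrite the UDF classes with their pre-shade versions from the UDF jar) can be outlined as below, using zipfile and made-up jar paths purely for illustration; the script itself uses the `jar` CLI.

```python
# Sketch of the jacoco class-overlay step; jar names here are placeholders.
import zipfile

def extract_members(jar_path, dest, prefixes):
    with zipfile.ZipFile(jar_path) as zf:
        members = [n for n in zf.namelist() if n.startswith(prefixes)]
        zf.extractall(dest, members)

# dist jar first, then overwrite with the pre-shade classes from the UDF jar
extract_members("dist.jar", "target/jacoco_classes",
                ("com", "org", "rapids", "spark-shared", "spark320/"))
extract_members("udf.jar", "target/jacoco_classes",
                ("com/nvidia/spark/udf",))
```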
2 changes: 1 addition & 1 deletion jenkins/spark-tests.sh
@@ -59,7 +59,7 @@ $MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER -Dclassifier=pytest -Dpackaging=tar.gz

RAPIDS_INT_TESTS_HOME="$ARTF_ROOT/integration_tests/"
# The version of pytest.tar.gz that is uploaded is the one built against spark311 but its being pushed without classifier for now
# The version of pytest.tar.gz that is uploaded is the one built against spark320 but its being pushed without classifier for now
RAPIDS_INT_TESTS_TGZ="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_TEST_VER-pytest.tar.gz"

tmp_info=${TMP_INFO_FILE:-'/tmp/artifacts-build.info'}
2 changes: 1 addition & 1 deletion jenkins/version-def.sh
@@ -85,7 +85,7 @@ fi
# PHASE_TYPE: CICD phase at which the script is called, to specify Spark shim versions.
# regular: noSnapshots + snapshots
# pre-release: noSnapshots only
# *: shim versions to build, e.g., PHASE_TYPE="311 321"
# *: shim versions to build, e.g., PHASE_TYPE="320 321"
PHASE_TYPE=${PHASE_TYPE:-"regular"}
case $PHASE_TYPE in
# SPARK_SHIM_VERSIONS will be used for nightly artifact build