Metaclient: Drop Hadoop 2 support (#6740)
johnnyaug authored Oct 10, 2023
1 parent eb62d5d commit 9926364
Showing 13 changed files with 129 additions and 336 deletions.
36 changes: 10 additions & 26 deletions .github/workflows/esti.yaml
@@ -622,13 +622,6 @@ jobs:
name: Build metadata client for Spark 3.x
runs-on: ubuntu-latest-8-cores
needs: check-secrets
strategy:
matrix:
spark:
- project-variable: core3
project-suffix: "-301"
- project-variable: core
project-suffix: ""
env:
TAG: ${{ needs.deploy-image.outputs.tag }}
REPO: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.us-east-1.amazonaws.com
@@ -640,7 +633,7 @@ jobs:
id: restore-cache
with:
path: ${{ github.workspace }}/test/spark/metaclient
key: metadata-client-${{ matrix.spark.project-variable }}-${{ hashFiles('./clients/spark/**') }}
key: metadata-client-core-${{ hashFiles('./clients/spark/**') }}

- uses: actions/setup-java@v3
if: steps.restore-cache.outputs.cache-hit != 'true'
@@ -653,7 +646,7 @@ jobs:
if: steps.restore-cache.outputs.cache-hit != 'true'
working-directory: clients/spark
run: |
sbt 'set ${{ matrix.spark.project-variable }} / assembly / test := {}' lakefs-spark-client${{ matrix.spark.project-suffix }}/assembly
sbt 'set assembly / test := {}' assembly
- name: Prepare Metaclient location for export
if: steps.restore-cache.outputs.cache-hit != 'true'
@@ -663,23 +656,14 @@
working-directory: clients/spark
run: |
mkdir -p ${{ github.workspace }}/test/spark/metaclient
cp target/core${{ matrix.spark.project-suffix }}/scala-2.12/lakefs-spark-client${{ matrix.spark.project-suffix }}-assembly*.jar ${{ github.workspace }}/test/spark/metaclient/spark-assembly-${{ matrix.spark.project-variable }}.jar
cp target/core/scala-2.12/lakefs-spark-client-assembly*.jar ${{ github.workspace }}/test/spark/metaclient/spark-assembly-core.jar
metadata-client-export-spark3:
name: Test lakeFS metadata client export with Spark 3.x
needs: [deploy-image, build-spark3-metadata-client]
runs-on: ubuntu-20.04
strategy:
matrix:
spark:
- version: 3.2.1
project: "core"
- version: 3.1.2
project: "core3"
- version: 3.0.2
project: "core3"
env:
SPARK_TAG: ${{ matrix.spark.version }}
SPARK_TAG: 3.2.1
REPO: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.us-east-1.amazonaws.com
TAG: ${{ needs.deploy-image.outputs.tag }}
steps:
@@ -690,7 +674,7 @@ jobs:
id: restore-cache
with:
path: ${{ github.workspace }}/test/spark/metaclient
key: metadata-client-${{ matrix.spark.project }}-${{ hashFiles('./clients/spark/**') }}
key: metadata-client-core-${{ hashFiles('./clients/spark/**') }}

- name: Generate uniquifying value
id: unique
@@ -709,21 +693,21 @@ jobs:
LAKEFS_BLOCKSTORE_S3_CREDENTIALS_SECRET_ACCESS_KEY: ${{ secrets.ESTI_AWS_SECRET_ACCESS_KEY }}

- name: Copy repository ref
run: aws s3 cp --recursive s3://esti-system-testing-data/golden-files/gc-test-data s3://esti-system-testing/${{ github.run_number }}-spark${{ matrix.spark.version }}-metaclient/exporter/${{ steps.unique.outputs.value }}
run: aws s3 cp --recursive s3://esti-system-testing-data/golden-files/gc-test-data s3://esti-system-testing/${{ github.run_number }}-spark3.2.1-metaclient/exporter/${{ steps.unique.outputs.value }}

- name: Setup Exporter tests
env:
STORAGE_NAMESPACE: s3://esti-system-testing/${{ github.run_number }}-spark${{ matrix.spark.version }}-metaclient/exporter/${{ steps.unique.outputs.value }}
STORAGE_NAMESPACE: s3://esti-system-testing/${{ github.run_number }}-spark3.2.1-metaclient/exporter/${{ steps.unique.outputs.value }}
REPOSITORY: test-data-exporter
working-directory: test/spark
run: ./setup-exporter-test.sh

- name: Test Exporter with Spark 3.x
env:
STORAGE_NAMESPACE: s3://esti-system-testing/${{ github.run_number }}-spark${{ matrix.spark.version }}-metaclient/exporter/${{ steps.unique.outputs.value }}
STORAGE_NAMESPACE: s3://esti-system-testing/${{ github.run_number }}-spark3.2.1-metaclient/exporter/${{ steps.unique.outputs.value }}
REPOSITORY: test-data-exporter
CLIENT_JAR: ${{ github.workspace }}/test/spark/metaclient/spark-assembly-${{ matrix.spark.project }}.jar
EXPORT_LOCATION: s3://esti-system-testing/${{ github.run_number }}-spark${{ matrix.spark.version }}-client-export/${{ steps.unique.outputs.value }}
CLIENT_JAR: ${{ github.workspace }}/test/spark/metaclient/spark-assembly-core.jar
EXPORT_LOCATION: s3://esti-system-testing/${{ github.run_number }}-spark3.2.1-client-export/${{ steps.unique.outputs.value }}
working-directory: test/spark
run: ./run-exporter-test.sh

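To reproduce the simplified build step outside CI, a rough local sketch based on the workflow change above (the jar path mirrors the `cp` command in the diff; the exact assembly file name depends on the client version):

```
cd clients/spark
# Skip tests during assembly, as the workflow now does for the single core project
sbt 'set assembly / test := {}' assembly
# The assembled jar lands under target/core, matching the cache/export steps above
ls target/core/scala-2.12/lakefs-spark-client-assembly-*.jar
```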
2 changes: 1 addition & 1 deletion Makefile
@@ -312,7 +312,7 @@ proto: tools ## Build proto (Protocol Buffers) files
$(PROTOC) --proto_path=pkg/kv/kvtest --go_out=pkg/kv/kvtest --go_opt=paths=source_relative test_model.proto

publish-scala: ## sbt publish spark client jars to nexus and s3 bucket
cd clients/spark && sbt assembly && sbt s3Upload && sbt publishSigned
cd clients/spark && sbt assembly && sbt s3Upload && sbt "project root" publishSigned
aws s3 cp --recursive --acl public-read $(CLIENT_JARS_BUCKET) $(CLIENT_JARS_BUCKET) --metadata-directive REPLACE

publish-lakefsfs-test: ## sbt publish spark lakefsfs test jars to s3 bucket
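With the aggregate projects gone, `publishSigned` now runs against the root project directly. A sketch of the publish flow as wired in the Makefile target above (Sonatype signing and S3 credentials are assumed to be configured in the environment):

```
# Sketch of the publish flow from the Makefile target above; credentials assumed configured
cd clients/spark
sbt assembly && sbt s3Upload && sbt "project root" publishSigned
```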
24 changes: 3 additions & 21 deletions clients/spark/README.md
@@ -13,35 +13,17 @@ _Please note that starting version 0.9.0, Spark 2 is not supported with the lake

### Uber-jar
The Uber-Jar can be found on a public S3 location:

It should be used when running into conflicting dependencies on environments like EMR, Databricks, etc.

For Spark for Hadoop 3:
http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/${CLIENT_VERSION}/lakefs-spark-client-assembly-${CLIENT_VERSION}.jar

For Spark for Hadoop 2 (deprecated):
http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client-301/${CLIENT_VERSION}/lakefs-spark-client-301-assembly-${CLIENT_VERSION}.jar


### Maven
Otherwise, the client can be included using Maven coordinates:

For Spark for Hadoop 3:
```
io.lakefs:lakefs-spark-client_2.12:<version>
```
[See available versions](https://mvnrepository.com/artifact/io.lakefs/lakefs-spark-client_2.12).

For Spark for Hadoop 2 (deprecated):
```
io.lakefs:lakefs-spark-client-301_2.12:<version>
io.lakefs:lakefs-spark-client_2.12:${CLIENT_VERSION}
```
[See available versions](https://mvnrepository.com/artifact/io.lakefs/lakefs-spark-client-301_2.12).

## Usage Examples
### Export using spark-submit

Replace `<version>` below with the latest version available. See available versions for [Spark for Hadoop 3](https://mvnrepository.com/artifact/io.lakefs/lakefs-spark-client_2.12) or [Spark for Hadoop 2](https://mvnrepository.com/artifact/io.lakefs/lakefs-spark-client-301_2.12) (deprecated).
Replace `<version>` below with the latest version available. See [available versions](https://mvnrepository.com/artifact/io.lakefs/lakefs-spark-client_2.12).

```
CLIENT_VERSION=0.10.0
@@ -58,7 +40,7 @@ spark-submit --conf spark.hadoop.lakefs.api.url=https://lakefs.example.com/api/v

### Export using spark-submit (uber-jar)

Replace `<version>` below with the latest version available. See available versions for [Spark for Hadoop 3](https://mvnrepository.com/artifact/io.lakefs/lakefs-spark-client_2.12) or [Spark for Hadoop 2](https://mvnrepository.com/artifact/io.lakefs/lakefs-spark-client-301_2.12) (deprecated).
Replace `<version>` below with the latest version available. See [available versions](https://mvnrepository.com/artifact/io.lakefs/lakefs-spark-client_2.12).
```
CLIENT_VERSION=0.10.0
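As a quick illustration of the consolidated artifact described in the README diff above, fetching the single remaining Spark 3 / Hadoop 3 uber-jar might look like this (a sketch only; the URL pattern is the one quoted in the README, and the version is a placeholder):

```
# Illustrative only: download the remaining uber-jar from the public location listed above
CLIENT_VERSION=0.10.0
curl -O "http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/${CLIENT_VERSION}/lakefs-spark-client-assembly-${CLIENT_VERSION}.jar"
```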
133 changes: 35 additions & 98 deletions clients/spark/build.sbt
@@ -1,102 +1,54 @@
import build.BuildType

lazy val baseName = "lakefs-spark"
lazy val projectVersion = "0.10.0"
lazy val hadoopVersion = "3.2.1"

ThisBuild / isSnapshot := false
ThisBuild / scalaVersion := "2.12.12"

def settingsToCompileIn(dir: String, flavour: String = "") = {
lazy val allSettings = Seq(
def settingsToCompileIn(dir: String) = {
Seq(
Compile / scalaSource := (ThisBuild / baseDirectory).value / dir / "src" / "main" / "scala",
Test / scalaSource := (ThisBuild / baseDirectory).value / dir / "src" / "test" / "scala",
Compile / resourceDirectory := (ThisBuild / baseDirectory).value / dir / "src" / "main" / "resources",
Compile / PB.includePaths += (Compile / resourceDirectory).value,
Compile / PB.protoSources += (Compile / resourceDirectory).value
)
lazy val flavourSettings =
if (flavour != "")
Seq(
Compile / unmanagedSourceDirectories += (ThisBuild / baseDirectory).value / dir / "src" / "main" / flavour / "scala"
)
else
Seq()
allSettings ++ flavourSettings
}

def generateCoreProject(buildType: BuildType) = {
Project(s"${baseName}-client${buildType.suffix}", file("core"))
.settings(
sharedSettings,
if (buildType.hadoopFlavour == "hadoop2") hadoop2ShadingSettings
else hadoop3ShadingSettings,
s3UploadSettings,
settingsToCompileIn("core", buildType.hadoopFlavour),
semanticdbEnabled := true, // enable SemanticDB
semanticdbVersion := scalafixSemanticdb.revision,
scalacOptions += "-Ywarn-unused-import",
Compile / PB.targets := Seq(
scalapb.gen() -> (Compile / sourceManaged).value / "scalapb"
),
libraryDependencies ++= getSharedLibraryDependencies(buildType)
++ getLibraryDependenciesByHadoopFlavour(buildType.hadoopFlavour),
testFrameworks += new TestFramework("org.scalameter.ScalaMeterFramework"),
Test / logBuffered := false,
// Uncomment to get accurate benchmarks with just "sbt test".
// Otherwise tell sbt to
// "testOnly io.treeverse.clients.ReadSSTableBenchmark"
// (or similar).
//
// Test / parallelExecution := false,

// Uncomment to get (very) full stacktraces in test:
// Test / testOptions += Tests.Argument("-oF"),
target := file(s"target/core${buildType.suffix}/"),
buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion),
buildInfoPackage := "io.treeverse.clients"
)
.enablePlugins(S3Plugin, BuildInfoPlugin)
}
def generateExamplesProject(buildType: BuildType) =
Project(s"${baseName}-examples${buildType.suffix}", file(s"examples"))
.settings(
sharedSettings,
settingsToCompileIn("examples", buildType.hadoopFlavour),
semanticdbEnabled := true, // enable SemanticDB
semanticdbVersion := scalafixSemanticdb.revision,
scalacOptions += "-Ywarn-unused-import",
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-sql" % buildType.sparkVersion % "provided",
"com.amazonaws" % "aws-java-sdk-bundle" % "1.12.194"
),
assembly / mainClass := Some("io.treeverse.examples.List"),
target := file(s"target/examples${buildType.suffix}/"),
run / fork := false // https://stackoverflow.com/questions/44298847/sbt-spark-fork-in-run
)

lazy val spark3Type =
new BuildType("-301", "3.0.1", "0.10.11", "hadoop2", "hadoop2-2.0.1")

// EMR-6.5.0 beta, managed GC
lazy val spark312Type =
new BuildType("-312-hadoop3", "3.1.2", "0.10.11", "hadoop3", "hadoop3-2.0.1")

lazy val coreType =
new BuildType("", "3.1.2", "0.10.11", "hadoop3", "hadoop3-2.0.1")
lazy val core = generateCoreProject(coreType)
lazy val core3 = generateCoreProject(spark3Type)
lazy val core312 = generateCoreProject(spark312Type)
lazy val examples3 = generateExamplesProject(spark3Type).dependsOn(core3)
lazy val examples312 = generateExamplesProject(spark312Type).dependsOn(core312)

lazy val root =
(project in file(".")).aggregate(core, core3, core312, examples3, examples312)
lazy val root = (project in file("core"))
.settings(
name := "lakefs-spark-client",
sharedSettings,
hadoop3ShadingSettings,
s3UploadSettings,
settingsToCompileIn("core"),
semanticdbEnabled := true, // enable SemanticDB
semanticdbVersion := scalafixSemanticdb.revision,
scalacOptions += "-Ywarn-unused-import",
Compile / PB.targets := Seq(
scalapb.gen() -> (Compile / sourceManaged).value / "scalapb"
),
libraryDependencies ++= getSharedLibraryDependencies(),
testFrameworks += new TestFramework("org.scalameter.ScalaMeterFramework"),
Test / logBuffered := false,
// Uncomment to get accurate benchmarks with just "sbt test".
// Otherwise tell sbt to
// "testOnly io.treeverse.clients.ReadSSTableBenchmark"
// (or similar).
//
// Test / parallelExecution := false,

// Uncomment to get (very) full stacktraces in test:
// Test / testOptions += Tests.Argument("-oF"),
buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion),
buildInfoPackage := "io.treeverse.clients",
target := file(s"target/core/")
)
.enablePlugins(S3Plugin, BuildInfoPlugin)

def getSharedLibraryDependencies(buildType: BuildType): Seq[ModuleID] = {
def getSharedLibraryDependencies(): Seq[ModuleID] = {
Seq(
"io.lakefs" % "api-client" % "0.91.0",
"org.apache.spark" %% "spark-sql" % buildType.sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % "3.1.2" % "provided",
"com.thesamet.scalapb" %% "scalapb-runtime" % scalapb.compiler.Version.scalapbVersion % "protobuf",
"org.apache.hadoop" % "hadoop-aws" % hadoopVersion % "provided",
"org.apache.hadoop" % "hadoop-common" % hadoopVersion % "provided",
@@ -107,6 +59,7 @@ def getSharedLibraryDependencies(buildType: BuildType): Seq[ModuleID] = {
"com.azure" % "azure-storage-blob" % "12.9.0",
"com.azure" % "azure-storage-blob-batch" % "12.7.0",
"com.azure" % "azure-identity" % "1.2.0",
"com.amazonaws" % "aws-java-sdk-bundle" % "1.12.194" % "provided",
// Snappy is JNI :-(. However it does claim to work with
// ClassLoaders, and (even more importantly!) using a preloaded JNI
// version will probably continue to work because the C language API
@@ -129,17 +82,6 @@ def getSharedLibraryDependencies(buildType: BuildType): Seq[ModuleID] = {
)
}

def getLibraryDependenciesByHadoopFlavour(hadoopFlavour: String): Seq[ModuleID] = {
if (hadoopFlavour == "hadoop2") {
// hadoop-aws provides AWS SDK at version >= 1.7.4. So declare this
// version, but ask to use whatever is provided so we do not
// override what it selects.
Seq("com.amazonaws" % "aws-java-sdk-bundle" % "1.12.194")
} else {
Seq("com.amazonaws" % "aws-java-sdk-bundle" % "1.12.194" % "provided")
}
}

def rename(prefix: String) = ShadeRule.rename(prefix -> "io.lakefs.spark.shade.@0")

// We are using the default sbt assembly merge strategy https://github.com/sbt/sbt-assembly#merge-strategy with a change
@@ -178,10 +120,7 @@ lazy val sharedShadeRules = Seq(
rename("reactor.util.**").inAll
)

lazy val hadoop2ShadeRules = sharedShadeRules ++ Seq(rename("com.amazonaws.**").inAll)
lazy val hadoop3ShadeRules = sharedShadeRules

lazy val hadoop2ShadingSettings = assembly / assemblyShadeRules := hadoop2ShadeRules
lazy val hadoop3ShadingSettings = assembly / assemblyShadeRules := hadoop3ShadeRules

// Upload assembly jars to S3
@@ -194,8 +133,6 @@ lazy val s3UploadSettings = Seq(
s3Upload / s3Progress := true
)

// Don't publish root project
root / publish / skip := true

lazy val commonSettings = Seq(
version := projectVersion,

