[SPARK-34624][CORE] Exclude non-jar dependencies of ivy/maven packages
### What changes were proposed in this pull request?
Exclude non-jar dependencies of the ivy/maven packages we want to resolve, since our current dependency resolution code assumes every artifact is a jar. See https://github.com/apache/spark/blob/17601e014c6ccb48958d35ffb04bedeac8cfc66a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala#L1215 and https://github.com/apache/spark/blob/17601e014c6ccb48958d35ffb04bedeac8cfc66a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala#L318
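
In essence, `resolveDependencyPaths` now filters resolved artifacts by extension before mapping them to classpath entries. A condensed sketch of that filter (mirroring the diff below; the patch logs via `logInfo` from the `Logging` trait that `SparkSubmitUtils` now extends, a standalone `println` is used here instead):

```scala
import org.apache.ivy.core.module.descriptor.Artifact

// Keep only artifacts whose extension is "jar" and drop everything else
// (e.g. pom-typed dependencies), since only jars can go on the classpath.
def jarArtifactsOnly(artifacts: Seq[Artifact]): Seq[Artifact] =
  artifacts.filter { artifactInfo =>
    if (artifactInfo.getExt == "jar") {
      true
    } else {
      println(s"Skipping non-jar dependency ${artifactInfo.getId}") // logInfo in the patch
      false
    }
  }
```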

### Why are the changes needed?
Some maven artifacts declare non-jar dependencies. One such example is `hive-exec`'s dependency on the `pom` of `apache-curator`: https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.8/hive-exec-2.3.8.pom

Today, trying to depend on such an artifact using `--packages` prints an error but continues without including the non-jar dependency. Doing the same using `spark.sql("ADD JAR ivy://org.apache.hive:hive-exec:2.3.8?exclude=org.pentaho:pentaho-aggdesigner-algorithm")` causes a failure. Detailed stacktraces can be found in SPARK-34624.
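
For illustration, Ivy surfaces a pom-typed dependency as an `Artifact` whose extension is `"pom"` rather than `"jar"`. A minimal sketch using Ivy's `DefaultArtifact` (the module id and version here are hypothetical placeholders, not taken from the patch):

```scala
import java.util.Date

import org.apache.ivy.core.module.descriptor.DefaultArtifact
import org.apache.ivy.core.module.id.ModuleRevisionId

// Hypothetical stand-in for hive-exec's transitive dependency on apache-curator.
val mrid = ModuleRevisionId.newInstance("org.apache.curator", "apache-curator", "2.7.1")
val jarArtifact = new DefaultArtifact(mrid, new Date(), "apache-curator", "jar", "jar")
val pomArtifact = new DefaultArtifact(mrid, new Date(), "apache-curator", "pom", "pom")

// A pom-typed artifact has no jar file behind it, so mapping it to a
// "<name>-<version>.jar" path in the local cache cannot succeed.
println(Seq(jarArtifact, pomArtifact).map(_.getExt)) // List(jar, pom)
```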

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Added a unit test. Also retried in `spark-shell` the example that previously produced the stacktrace in the JIRA.

Closes apache#31741 from shardulm94/add-jar-filter-poms.

Authored-by: Shardul Mahadik <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
shardulm94 authored and dongjoon-hyun committed Mar 5, 2021
1 parent c91a756 commit 1fd7368
Showing 2 changed files with 41 additions and 7 deletions.
15 changes: 11 additions & 4 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

@@ -1089,7 +1089,7 @@ object SparkSubmit extends CommandLineUtils with Logging {
 }
 
 /** Provides utility functions to be used inside SparkSubmit. */
-private[spark] object SparkSubmitUtils {
+private[spark] object SparkSubmitUtils extends Logging {
 
   // Exposed for testing
   var printStream = SparkSubmit.printStream

@@ -1203,9 +1203,16 @@ private[spark] object SparkSubmitUtils {
   def resolveDependencyPaths(
       artifacts: Array[AnyRef],
       cacheDirectory: File): Seq[String] = {
-    artifacts.map { artifactInfo =>
-      val artifact = artifactInfo.asInstanceOf[Artifact].getModuleRevisionId
-      val extraAttrs = artifactInfo.asInstanceOf[Artifact].getExtraAttributes
+    artifacts.map(_.asInstanceOf[Artifact]).filter { artifactInfo =>
+      if (artifactInfo.getExt == "jar") {
+        true
+      } else {
+        logInfo(s"Skipping non-jar dependency ${artifactInfo.getId}")
+        false
+      }
+    }.map { artifactInfo =>
+      val artifact = artifactInfo.getModuleRevisionId
+      val extraAttrs = artifactInfo.getExtraAttributes
       val classifier = if (extraAttrs.containsKey("classifier")) {
         "-" + extraAttrs.get("classifier")
       } else {
33 changes: 30 additions & 3 deletions core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala

@@ -18,11 +18,13 @@
 package org.apache.spark.deploy
 
 import java.io.{File, OutputStream, PrintStream}
+import java.net.URI
 import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Paths}
 
+import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
 
-import com.google.common.io.Files
 import org.apache.ivy.core.module.descriptor.MDArtifact
 import org.apache.ivy.core.settings.IvySettings
 import org.apache.ivy.plugins.resolver.{AbstractResolver, ChainResolver, FileSystemResolver, IBiblioResolver}

@@ -245,8 +247,8 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
       |</ivysettings>
       |""".stripMargin
 
-    val settingsFile = new File(tempIvyPath, "ivysettings.xml")
-    Files.write(settingsText, settingsFile, StandardCharsets.UTF_8)
+    val settingsFile = Paths.get(tempIvyPath, "ivysettings.xml")
+    Files.write(settingsFile, settingsText.getBytes(StandardCharsets.UTF_8))
     val settings = SparkSubmitUtils.loadIvySettings(settingsFile.toString, None, None)
     settings.setDefaultIvyUserDir(new File(tempIvyPath)) // NOTE - can't set this through file

@@ -277,4 +279,29 @@
         .exists(r.findFirstIn(_).isDefined), "resolution files should be cleaned")
     }
   }
+
+  test("SPARK-34624: should ignore non-jar dependencies") {
+    val main = MavenCoordinate("my.great.lib", "mylib", "0.1")
+    val dep = "my.great.dep:mydep:0.1"
+
+    IvyTestUtils.withRepository(main, Some(dep), None) { repo =>
+      // IvyTestUtils.withRepository does not have an easy way of creating non-jar dependencies,
+      // so we let it create the jar dependency in `mylib-0.1.pom` and then modify the pom
+      // to change the type of the transitive dependency to `pom`.
+      val mainPom = Paths.get(URI.create(repo)).resolve("my/great/lib/mylib/0.1/mylib-0.1.pom")
+      val lines = Files.lines(mainPom).iterator.asScala
+        .map(l => if (l.trim == "<artifactId>mydep</artifactId>") s"$l<type>pom</type>" else l)
+        .toList
+      Files.write(mainPom, lines.asJava)
+
+      val ivySettings = SparkSubmitUtils.buildIvySettings(Some(repo), Some(tempIvyPath))
+      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(
+        main.toString,
+        ivySettings,
+        transitive = true,
+        isTest = true)
+      assert(!jarPath.exists(_.indexOf("mydep") >= 0), "should not find pom dependency." +
+        s" Resolved jars are: $jarPath")
+    }
+  }
 }
