Skip to content

Commit

Permalink
[SPARK-45225][SQL] XML: XSD file URL support
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Add support to read XSD file URL.

### Why are the changes needed?
Previously the XSD row-validation path could only be a local file (or a file distributed via `sc.addFile`). Opening the XSD through the Hadoop FileSystem API lets users reference an XSD at any Hadoop-supported URL (e.g. `hdfs://`, `s3://`) as well.

### Does this PR introduce _any_ user-facing change?
Yes

### How was this patch tested?
Unit test
Manual test

### Was this patch authored or co-authored using generative AI tooling?
No

Closes apache#43000 from sandip-db/xml-xsd-url-master.

Authored-by: Sandip Agarwala <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
  • Loading branch information
sandip-db authored and HyukjinKwon committed Sep 20, 2023
1 parent 48b1a28 commit 6039320
Showing 1 changed file with 20 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,43 @@
*/
package org.apache.spark.sql.catalyst.xml

import java.nio.file.Paths
import javax.xml.XMLConstants
import javax.xml.transform.stream.StreamSource
import javax.xml.validation.{Schema, SchemaFactory}

import com.google.common.cache.{CacheBuilder, CacheLoader}
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkFiles
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.util.Utils

/**
* Utilities for working with XSD validation.
*/
private[sql] object ValidatorUtil {

// Parsing XSDs may be slow, so cache them by path:

private val cache = CacheBuilder.newBuilder().softValues().build(
new CacheLoader[String, Schema] {
override def load(key: String): Schema = {
// Handle case where file exists as specified
var path = Paths.get(key)
if (!path.toFile.exists()) {
// Handle case where it was added with sc.addFile
path = Paths.get(SparkFiles.get(key))
val in = try {
// Handle case where file exists as specified
val fs = Utils.getHadoopFileSystem(key, SparkHadoopUtil.get.conf)
fs.open(new Path(key))
} catch {
case _: Throwable =>
// Handle case where it was added with sc.addFile
val addFileUrl = SparkFiles.get(key)
val fs = Utils.getHadoopFileSystem(addFileUrl, SparkHadoopUtil.get.conf)
fs.open(new Path(addFileUrl))
}
try {
val schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
schemaFactory.newSchema(new StreamSource(in))
} finally {
in.close()
}
val schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
schemaFactory.newSchema(path.toFile)
}
})

Expand Down

0 comments on commit 6039320

Please sign in to comment.