
Commit e93b8f0

gengliangwang authored and cloud-fan committed
[SPARK-32539][INFRA] Disallow FileSystem.get(Configuration conf) in style check by default
### What changes were proposed in this pull request?

Disallow `FileSystem.get(Configuration conf)` in the Scala style check by default and suggest that developers use `FileSystem.get(URI uri, Configuration conf)` or `Path.getFileSystem()` instead.

### Why are the changes needed?

The method `FileSystem.get(Configuration conf)` returns a default FileSystem instance if the conf `fs.file.impl` is not set. This can cause a file-not-found exception when reading a target path on a non-default file system, e.g. S3. Such a mistake is hard to discover via unit tests. If we disallow it in the Scala style check by default and suggest that developers use `FileSystem.get(URI uri, Configuration conf)` or `Path.getFileSystem(Configuration conf)`, we can reduce potential regressions and PR review effort.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Manually ran the Scala style check and tests.

Closes apache#29357 from gengliangwang/newStyleRule.

Authored-by: Gengliang Wang <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 7f275ee commit e93b8f0
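
Editor's note: to make the recommendation concrete, below is a minimal, self-contained sketch of the two call patterns the commit message suggests. The object name, the local `file:///` path, and the `println` calls are illustrative assumptions; only the `Path.getFileSystem(Configuration)` and `FileSystem.get(URI, Configuration)` APIs come from the change itself.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object FileSystemGetExample {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()

    // Hypothetical target path; with a scheme such as s3a:// the default
    // file system returned by FileSystem.get(hadoopConf) would not match it.
    val path = new Path("file:///tmp/example-data")

    // Preferred: resolve the FileSystem from the path itself ...
    val fsFromPath: FileSystem = path.getFileSystem(hadoopConf)

    // ... or pass the URI explicitly.
    val fsFromUri: FileSystem = FileSystem.get(path.toUri, hadoopConf)

    println(s"FileSystem from path: ${fsFromPath.getUri}")
    println(s"FileSystem from URI:  ${fsFromUri.getUri}")
  }
}
```

Both variants pin the returned FileSystem to the scheme and authority of the path being accessed, which is exactly the mismatch the single-argument overload can hide.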

File tree

5 files changed: +23, -1 lines


core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala

+2
@@ -139,7 +139,9 @@ private[deploy] object HadoopFSDelegationTokenProvider {
   def hadoopFSsToAccess(
       sparkConf: SparkConf,
       hadoopConf: Configuration): Set[FileSystem] = {
+    // scalastyle:off FileSystemGet
     val defaultFS = FileSystem.get(hadoopConf)
+    // scalastyle:on FileSystemGet

     val filesystemsToAccess = sparkConf.get(KERBEROS_FILESYSTEMS_TO_ACCESS)
       .map(new Path(_).getFileSystem(hadoopConf))

core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala

+4
@@ -222,7 +222,9 @@ class HadoopMapRedWriteConfigUtil[K, V: ClassTag](conf: SerializableJobConf)
     if (path != null) {
       path.getFileSystem(getConf)
     } else {
+      // scalastyle:off FileSystemGet
       FileSystem.get(getConf)
+      // scalastyle:on FileSystemGet
     }
   }

@@ -285,7 +287,9 @@ class HadoopMapRedWriteConfigUtil[K, V: ClassTag](conf: SerializableJobConf)

     if (SparkHadoopWriterUtils.isOutputSpecValidationEnabled(conf)) {
       // FileOutputFormat ignores the filesystem parameter
+      // scalastyle:off FileSystemGet
       val ignoredFs = FileSystem.get(getConf)
+      // scalastyle:on FileSystemGet
       getOutputFormat().checkOutputSpecs(ignoredFs, getConf)
     }
   }

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala

+2
@@ -181,10 +181,12 @@ private[spark] class Client(

     // The app staging dir based on the STAGING_DIR configuration if configured
     // otherwise based on the users home directory.
+    // scalastyle:off FileSystemGet
     val appStagingBaseDir = sparkConf.get(STAGING_DIR)
       .map { new Path(_, UserGroupInformation.getCurrentUser.getShortUserName) }
       .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory())
     stagingDirPath = new Path(appStagingBaseDir, getAppStagingDir(appId))
+    // scalastyle:on FileSystemGet

     new CallerContext("CLIENT", sparkConf.get(APP_CALLER_CONTEXT),
       Option(appId.toString)).setCurrentContext()

scalastyle-config.xml

+14
@@ -264,6 +264,20 @@ This file is divided into 3 sections:
        of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage>
   </check>

+  <check customId="FileSystemGet" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter name="regex">FileSystem.get\([a-zA-Z_$][a-zA-Z_$0-9]*\)</parameter></parameters>
+    <customMessage><![CDATA[
+      Are you sure that you want to use "FileSystem.get(Configuration conf)"? If the input
+      configuration is not set properly, a default FileSystem instance will be returned. It can
+      lead to errors when you deal with multiple file systems. Please consider using
+      "FileSystem.get(URI uri, Configuration conf)" or "Path.getFileSystem(Configuration conf)" instead.
+      If you must use the method "FileSystem.get(Configuration conf)", wrap the code block with
+      // scalastyle:off FileSystemGet
+      FileSystem.get(...)
+      // scalastyle:on FileSystemGet
+    ]]></customMessage>
+  </check>
+
   <check customId="extractopt" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
     <parameters><parameter name="regex">extractOpt</parameter></parameters>
     <customMessage>Use jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter
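
Editor's note: as a rough sketch of what the new `FileSystemGet` rule accepts and rejects (the `DefaultFsResolver` class below is hypothetical; only the regex and the `// scalastyle:off` / `// scalastyle:on FileSystemGet` markers come from the configuration above):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

// Hypothetical class, used only to illustrate the new check.
class DefaultFsResolver(hadoopConf: Configuration) {

  // Allowed: the two-argument overload does not match the regex, because the
  // first identifier inside the parentheses is followed by a comma rather
  // than a closing parenthesis.
  def defaultFs: FileSystem =
    FileSystem.get(FileSystem.getDefaultUri(hadoopConf), hadoopConf)

  // Allowed only because it is wrapped in the suppression markers from the
  // custom message; without them the style check reports an error.
  // scalastyle:off FileSystemGet
  def deliberateDefaultFs: FileSystem = FileSystem.get(hadoopConf)
  // scalastyle:on FileSystemGet
}
```

Because the check is a file-level RegexChecker, it scans raw source text, so a matching call inside a comment would also be flagged unless it sits between the off/on markers.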

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala

+1 -1
@@ -165,7 +165,7 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS
     // the assumption on column stats, and also the end-to-end behavior.

     val hadoopConf = spark.sessionState.newHadoopConf()
-    val fs = FileSystem.get(hadoopConf)
+    val fs = new Path(tableDir.getAbsolutePath).getFileSystem(hadoopConf)
     val parts = fs.listStatus(new Path(tableDir.getAbsolutePath), new PathFilter {
       override def accept(path: Path): Boolean = !path.getName.startsWith("_")
     })
