Error: runtime error with pyspark quickstart notebook #24

Open
silpara opened this issue May 10, 2023 · 2 comments

silpara commented May 10, 2023

I installed the conda environment using conda env create -f envs/pyspark-330-delta-220 and tried running the notebook 01_quickstart.ipynb, but I get the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[3], line 1
----> 1 spark = configure_spark_with_delta_pip(builder).getOrCreate()

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\sql\session.py:269, in SparkSession.Builder.getOrCreate(self)
    267     sparkConf.set(key, value)
    268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
    270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
    271 # by all sessions.
    272 session = SparkSession(sc, options=self._options)

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:483, in SparkContext.getOrCreate(cls, conf)
    481 with SparkContext._lock:
    482     if SparkContext._active_spark_context is None:
--> 483         SparkContext(conf=conf or SparkConf())
    484     assert SparkContext._active_spark_context is not None
    485     return SparkContext._active_spark_context

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:195, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
    189 if gateway is not None and gateway.gateway_parameters.auth_token is None:
    190     raise ValueError(
    191         "You are trying to pass an insecure Py4j gateway to Spark. This"
    192         " is not allowed as it is a security risk."
    193     )
--> 195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    196 try:
    197     self._do_init(
    198         master,
    199         appName,
   (...)
    208         udf_profiler_cls,
    209     )

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:417, in SparkContext._ensure_initialized(cls, instance, gateway, conf)
    415 with SparkContext._lock:
    416     if not SparkContext._gateway:
--> 417         SparkContext._gateway = gateway or launch_gateway(conf)
    418         SparkContext._jvm = SparkContext._gateway.jvm
    420     if instance:

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\java_gateway.py:106, in launch_gateway(conf, popen_kwargs)
    103     time.sleep(0.1)
    105 if not os.path.isfile(conn_info_file):
--> 106     raise RuntimeError("Java gateway process exited before sending its port number")
    108 with open(conn_info_file, "rb") as info:
    109     gateway_port = read_int(info)

RuntimeError: Java gateway process exited before sending its port number

I am on a Windows 11 machine.
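For reference, the failing cell is presumably the standard delta-spark quickstart session setup (a minimal sketch, assuming the notebook follows the usual pattern from the Delta Lake docs); the "Java gateway process exited before sending its port number" error generally means PySpark could not launch a JVM, i.e. no Java runtime was found via PATH or JAVA_HOME:

```python
# Minimal sketch of the presumed failing cell (standard delta-spark quickstart pattern).
import pyspark
from delta import configure_spark_with_delta_pip

builder = (
    pyspark.sql.SparkSession.builder.appName("quickstart")
    .master("local[*]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip adds the Delta jars via spark.jars.packages;
# getOrCreate() then launches the Java gateway that exits here when no JVM can be started.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
```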


silpara commented May 10, 2023

Update: I installed Java from https://download.oracle.com/java/20/latest/jdk-20_windows-x64_bin.exe, set the JAVA_HOME environment variable, and tried again. Now I get the following error:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
Cell In[3], line 1
----> 1 spark = configure_spark_with_delta_pip(builder).getOrCreate()

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\sql\session.py:269, in SparkSession.Builder.getOrCreate(self)
    267     sparkConf.set(key, value)
    268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
    270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
    271 # by all sessions.
    272 session = SparkSession(sc, options=self._options)

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:483, in SparkContext.getOrCreate(cls, conf)
    481 with SparkContext._lock:
    482     if SparkContext._active_spark_context is None:
--> 483         SparkContext(conf=conf or SparkConf())
    484     assert SparkContext._active_spark_context is not None
    485     return SparkContext._active_spark_context

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:197, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
    195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    196 try:
--> 197     self._do_init(
    198         master,
    199         appName,
    200         sparkHome,
    201         pyFiles,
    202         environment,
    203         batchSize,
    204         serializer,
    205         conf,
    206         jsc,
    207         profiler_cls,
    208         udf_profiler_cls,
    209     )
    210 except BaseException:
    211     # If an error occurs, clean up in order to allow future SparkContext creation:
    212     self.stop()

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:282, in SparkContext._do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls, udf_profiler_cls)
    279 self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
    281 # Create the Java SparkContext through Py4J
--> 282 self._jsc = jsc or self._initialize_context(self._conf._jconf)
    283 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
    284 self._conf = SparkConf(_jconf=self._jsc.sc().conf())

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:402, in SparkContext._initialize_context(self, jconf)
    398 """
    399 Initialize SparkContext in function to allow subclass specific initialization
    400 """
    401 assert self._jvm is not None
--> 402 return self._jvm.JavaSparkContext(jconf)

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\py4j\java_gateway.py:1585, in JavaClass.__call__(self, *args)
   1579 command = proto.CONSTRUCTOR_COMMAND_NAME +\
   1580     self._command_header +\
   1581     args_command +\
   1582     proto.END_COMMAND_PART
   1584 answer = self._gateway_client.send_command(command)
-> 1585 return_value = get_return_value(
   1586     answer, self._gateway_client, None, self._fqn)
   1588 for temp_arg in temp_args:
   1589     temp_arg._detach()

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
    324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325 if answer[1] == REFERENCE_TYPE:
--> 326     raise Py4JJavaError(
    327         "An error occurred while calling {0}{1}{2}.\n".
    328         format(target_id, ".", name), value)
    329 else:
    330     raise Py4JError(
    331         "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
    332         format(target_id, ".", name, value))

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:1108)
	at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:1094)
	at org.apache.spark.util.Utils$.fetchFile(Utils.scala:579)
	at org.apache.spark.SparkContext.addFile(SparkContext.scala:1647)
	at org.apache.spark.SparkContext.$anonfun$new$13(SparkContext.scala:514)
	at org.apache.spark.SparkContext.$anonfun$new$13$adapted(SparkContext.scala:514)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:514)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.DirectConstructorHandleAccessor.newInstance(DirectConstructorHandleAccessor.java:67)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:484)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:343)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 22 more
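This is a different failure from the first one: the JVM now starts, but Hadoop's Shell class cannot locate winutils.exe because neither HADOOP_HOME nor hadoop.home.dir is set, which only affects Spark running on Windows. A quick way to confirm the state of the environment from the notebook (a hypothetical diagnostic cell, not part of the quickstart):

```python
# Hypothetical diagnostic cell: show which of the relevant variables/binaries are visible.
import os
import shutil

print("JAVA_HOME   =", os.environ.get("JAVA_HOME"))
print("HADOOP_HOME =", os.environ.get("HADOOP_HOME"))       # None here, which triggers the error above
print("java on PATH:    ", shutil.which("java"))
print("winutils on PATH:", shutil.which("winutils"))
```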

handreassa commented

@silpara in case this is not fixed yet, try installing Hadoop directly as described in their docs (https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems):

How to fix a missing WINUTILS.EXE

You can fix this problem in two ways:

1. Install a full native Windows Hadoop version. The ASF does not currently (September 2015) release such a version; releases are available externally.
2. Or: get the WINUTILS.EXE binary from a Hadoop redistribution. There is a repository of this for some Hadoop versions on GitHub.

Then:

- Set the environment variable %HADOOP_HOME% to point to the directory above the BIN dir containing WINUTILS.EXE.
- Or: run the Java process with the system property hadoop.home.dir set to the home directory.
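A minimal sketch of applying that from within the notebook, assuming winutils.exe has been unpacked to C:\hadoop\bin (the path is an assumption; use wherever the binary actually lives):

```python
# Hypothetical fix cell: point HADOOP_HOME at the directory above the bin dir
# containing winutils.exe (C:\hadoop is an assumed location, adjust as needed).
import os

os.environ["HADOOP_HOME"] = r"C:\hadoop"
os.environ["PATH"] = os.environ["HADOOP_HOME"] + r"\bin;" + os.environ["PATH"]

# These must be set before the first SparkContext is created, so restart the
# kernel and re-run the quickstart cell after executing this.
```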
