diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md
index d57a62dbe..559b10f0b 100755
--- a/kedro-datasets/RELEASE.md
+++ b/kedro-datasets/RELEASE.md
@@ -16,6 +16,7 @@
 - Fixed `polars.CSVDataset` `save` method on Windows using `utf-8` as default encoding.
 - Made `table_name` a keyword argument in the `ibis.FileDataset` implementation to be compatible with Ibis 10.0.
 - Fixed how sessions are handled in the `snowflake.SnowflakeTableDataset` implementation.
+- Added an enhanced error message for Spark sessions created via `databricks-connect` when the builder arguments are incomplete.
 - Fixed credentials handling in `pandas.GBQQueryDataset` and `pandas.GBQTableDataset`
 
 ## Breaking Changes
diff --git a/kedro-datasets/kedro_datasets/_utils/spark_utils.py b/kedro-datasets/kedro_datasets/_utils/spark_utils.py
index e55012275..4ff079354 100644
--- a/kedro-datasets/kedro_datasets/_utils/spark_utils.py
+++ b/kedro-datasets/kedro_datasets/_utils/spark_utils.py
@@ -1,25 +1,15 @@
-from typing import TYPE_CHECKING, Union
-
 from pyspark.sql import SparkSession
 
-if TYPE_CHECKING:
-    from databricks.connect import DatabricksSession
-
 
-def get_spark() -> Union[SparkSession, "DatabricksSession"]:
+def get_spark() -> SparkSession:
     """
     Returns the SparkSession. In case databricks-connect is available we use it for
     extended configuration mechanisms and notebook compatibility,
     otherwise we use classic pyspark.
     """
     try:
-        # When using databricks-connect >= 13.0.0 (a.k.a databricks-connect-v2)
-        # the remote session is instantiated using the databricks module
-        # If the databricks-connect module is installed, we use a remote session
-        from databricks.connect import DatabricksSession
-
         # We can't test this as there's no Databricks test env available
-        spark = DatabricksSession.builder.getOrCreate()  # pragma: no cover
+        spark = _create_databricks_session()  # pragma: no cover
 
     except ImportError:
         # For "normal" spark sessions that don't use databricks-connect
@@ -27,3 +17,27 @@ def get_spark() -> Union[SparkSession, "DatabricksSession"]:
         spark = SparkSession.builder.getOrCreate()
 
     return spark
+
+
+def _create_databricks_session() -> SparkSession:
+    # When using databricks-connect >= 13.0.0 (a.k.a. databricks-connect-v2),
+    # the remote session is instantiated using the databricks module.
+    # If the databricks-connect module is installed, we use a remote session.
+    from databricks.connect import DatabricksSession
+
+    try:
+        return DatabricksSession.builder.getOrCreate()
+    # This can't be narrowed down, since databricks-connect raises a plain Exception
+    except Exception as exception:
+        if (
+            str(exception)
+            == "Cluster id or serverless are required but were not specified."
+        ):
+            raise type(exception)(
+                "DatabricksSession is expected to behave as a singleton, but it didn't. "
+                "Either set the DATABRICKS_CONFIG_PROFILE or the DATABRICKS_PROFILE and DATABRICKS_SERVERLESS_COMPUTE_ID "
+                "environment variables in your hooks before using the Spark session. "
+                "Read more about these variables here: "
+                "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var"
+            ) from exception
+        raise
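
For context on the new error message: a minimal sketch (not part of this diff) of how a project could export these variables from a Kedro hook before the first Spark-backed dataset reaches `get_spark()`. The hook class name and the `"my-profile"` value are illustrative assumptions; only the environment variable names come from the message above.

```python
import os

from kedro.framework.hooks import hook_impl


class DatabricksConnectHook:
    """Illustrative sketch: export databricks-connect settings before any
    Spark-backed dataset triggers session creation (assumption, not this PR)."""

    @hook_impl
    def after_context_created(self, context) -> None:
        # Point databricks-connect at a named profile in ~/.databrickscfg
        # ("my-profile" is a hypothetical placeholder).
        os.environ.setdefault("DATABRICKS_CONFIG_PROFILE", "my-profile")
        # Or target serverless compute instead, per the linked Databricks docs:
        # os.environ.setdefault("DATABRICKS_SERVERLESS_COMPUTE_ID", "auto")
```

Registered via `HOOKS = (DatabricksConnectHook(),)` in the project's `settings.py`, this runs before dataset I/O, so `DatabricksSession.builder.getOrCreate()` can resolve the configuration instead of hitting the error handled in `_create_databricks_session`.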