
Add warning when SparkDataSet is used on Databricks without a valid file path #114

Merged · 27 commits · Mar 6, 2023

Changes from 19 commits

Commits:
6e5373e  Add databricks deployment check and automatic DBFS path addition (jmholzer, Feb 3, 2023)
bdb6958  Add newline at end of file (jmholzer, Feb 3, 2023)
50c72d3  Remove spurious 'not' (jmholzer, Feb 6, 2023)
6fc8ca6  Move dbfs utility functions from SparkDataSet (jmholzer, Feb 6, 2023)
868913c  Add edge case logic to _build_dbfs_path (jmholzer, Feb 6, 2023)
85c5862  Add test for dbfs path construction (jmholzer, Feb 6, 2023)
7f5bcda  Linting (jmholzer, Feb 6, 2023)
f6f60c1  Remove spurious print statement :) (jmholzer, Feb 6, 2023)
da9bb60  Add pylint disable too-many-public-methods (jmholzer, Feb 6, 2023)
8d4a7f2  Merge branch 'main' into fix/versioned-spark-dataset (jmholzer, Feb 7, 2023)
3d5116c  Move tests into single method to appease linter (jmholzer, Feb 15, 2023)
d43a53f  Modify prefix check to /dbfs/ (jmholzer, Feb 15, 2023)
ee43277  Modify prefix check to /dbfs/ (jmholzer, Feb 15, 2023)
8390ef7  Merge branch 'fix/versioned-spark-dataset' of github.com:kedro-org/ke… (jmholzer, Feb 15, 2023)
271c76c  Make warning message clearer (jmholzer, Feb 15, 2023)
c1cce42  Add release note (jmholzer, Feb 15, 2023)
40b90f7  Fix linting (jmholzer, Feb 15, 2023)
086fa08  Update warning message (jmholzer, Feb 15, 2023)
307f8c4  Modify log warning level to error (jmholzer, Feb 15, 2023)
d3a0a80  Modify message back to warning, refer to undefined behaviour (jmholzer, Feb 16, 2023)
c349fa8  Modify required prefix to /dbfs/ (jmholzer, Feb 16, 2023)
d882b7b  Modify doc string (jmholzer, Feb 16, 2023)
44b16f5  Modify warning message (jmholzer, Feb 16, 2023)
017d3fa  Split tests and add filepath to warning (jmholzer, Feb 20, 2023)
b448297  Modify f string in logging call (jmholzer, Feb 20, 2023)
37d1bdd  Fix tests (jmholzer, Feb 20, 2023)
8c5f92c  Lint (jmholzer, Feb 20, 2023)
3 changes: 3 additions & 0 deletions kedro-datasets/RELEASE.md
@@ -1,6 +1,9 @@

# Upcoming Release:

## Bug fixes and other changes
* Added a warning when the user tries to use `SparkDataSet` on Databricks without specifying a file path with the `/dbfs/` prefix.

# Release 1.0.2:

## Bug fixes and other changes
24 changes: 23 additions & 1 deletion kedro-datasets/kedro_datasets/spark/spark_dataset.py
@@ -2,6 +2,8 @@
``pyspark``
"""
import json
import logging
import os
from copy import deepcopy
from fnmatch import fnmatch
from functools import partial
@@ -23,6 +25,8 @@
from pyspark.sql.utils import AnalysisException
from s3fs import S3FileSystem

logger = logging.getLogger(__name__)


def _parse_glob_pattern(pattern: str) -> str:
    special = ("*", "?", "[")
@@ -114,6 +118,20 @@ def _dbfs_exists(pattern: str, dbutils: Any) -> bool:
    return False


def _deployed_on_databricks() -> bool:
"""Check if running on Databricks."""

Contributor Author (jmholzer):

Do you mean we should add a function to Kedro in a separate PR and import that here?

Contributor:

Yes, since they are doing the exact same thing, we should just keep the logic consistent. If this is a more pressing bug fix, I think it's fine to keep this in the PR, but we should still refactor it in Kedro later.

Contributor:

While I am generally very much pro-DRY, in this case I would prefer we just kept this duplicated, since:

  • it's not clear where such a function would live, and presumably it would be a private function, in which case kedro-datasets would become coupled to a private function in kedro core;
  • it's a very small and simple piece of logic which is unlikely to change any time soon;
  • the "check if we are on databricks" logic in kedro core is almost certainly going to move anyway as part of "Introduce our own rich console logging handler" (kedro#2277).

Overall, I'd go for the principle that "duplication is cheaper than getting it wrong" here and so just leave it duplicated.

return "DATABRICKS_RUNTIME_VERSION" in os.environ


def _path_has_dbfs_prefix(path: str) -> bool:
"""Check if a file path has a valid dbfs prefix.

Args:
path: File path to check.
"""
return path.startswith("/dbfs/")
antonymilne marked this conversation as resolved.
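As a quick illustration of the check's behaviour (a sketch only; the example paths are made up and the helper is the one defined above), note that only the full /dbfs/ prefix counts:

    # Lookalike prefixes and Spark-style URIs are rejected by design.
    assert _path_has_dbfs_prefix("/dbfs/my_project/data")
    assert not _path_has_dbfs_prefix("/dbfs-mount/my_project/data")  # "/dbfs" alone is not enough
    assert not _path_has_dbfs_prefix("dbfs:/my_project/data")  # Spark URI, not a POSIX path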


class KedroHdfsInsecureClient(InsecureClient):
"""Subclasses ``hdfs.InsecureClient`` and implements ``hdfs_exists``
and ``hdfs_glob`` methods required by ``SparkDataSet``"""
@@ -304,7 +322,11 @@ def __init__(  # pylint: disable=too-many-arguments

        else:
            path = PurePosixPath(filepath)

        if _deployed_on_databricks() and not _path_has_dbfs_prefix(filepath):
            logger.error(
                "Using SparkDataSet on Databricks without the `/dbfs` prefix in the "
                "filepath raises an error. Add this prefix to fix the error."
            )
Contributor:

Should this be a DataSetError, since this is considered an invalid config? What kind of error are we getting now? Is it some FileNotFound error due to an unexpected path?

Contributor Author (jmholzer):

A DataSetError gets raised on trying to save with versioning enabled, but it is not clear from the message why this is. This message is supposed to inform the user. Do you think we should raise an error with a clearer message instead?

Contributor:

Yes, I think if this is an invalid code path and we know that it's going to fail, we should raise the error earlier. What's the point of delaying until later?

Contributor Author (jmholzer):

I changed this back to a warning in d3a0a80 and refer to 'undefined behaviour' instead.

I also modified the docstring to make it clearer to users that they need to specify the /dbfs/ prefix.

Contributor Author (jmholzer):

> Yes, I think if this is an invalid code path and we know that it's going to fail, we should raise the error earlier. What's the point of delaying until later?

Ah sorry, I didn't see this reply before I wrote my last message.

I would agree with you, but there is one subtlety: if you specify a version in SparkDataSet, file path resolution will work. I wouldn't like to raise an error here, as this would be a breaking change for the users who do specify a version. Currently, it is only the automatic resolution of the latest version that is broken.

The alternative would be to add another condition to the if statement, checking whether a version has been specified and raising an error only in the case that it wasn't. However, I don't like this, as it leads the user into thinking it is OK to leave out the /dbfs/ prefix when a version is specified. As far as I understand, they should always be using this prefix. Therefore, I would rather display a warning whenever a SparkDataSet is constructed incorrectly, in every case.

Contributor:

I see, in that case I agree we should not break the existing behaviors and I think the current solution with warning is fine.
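For illustration, a minimal standalone sketch of the version-conditional alternative discussed (and rejected) above. The function name check_dbfs_filepath is hypothetical, the helpers are repeated so the snippet runs on its own, and ValueError stands in for the DataSetError the real code base would use:

    import os
    from typing import Optional

    def _deployed_on_databricks() -> bool:
        return "DATABRICKS_RUNTIME_VERSION" in os.environ

    def _path_has_dbfs_prefix(path: str) -> bool:
        return path.startswith("/dbfs/")

    def check_dbfs_filepath(filepath: str, version: Optional[str]) -> None:
        # Raise only when automatic latest-version resolution would break:
        # on Databricks, without the /dbfs/ prefix, and with no explicit
        # version given.
        if (
            _deployed_on_databricks()
            and not _path_has_dbfs_prefix(filepath)
            and version is None
        ):
            raise ValueError(
                f"Cannot resolve the latest version of '{filepath}' on "
                "Databricks without the `/dbfs/` prefix."
            )

The PR deliberately avoids this shape: warning unconditionally means users are never led to believe the prefix is optional just because a version is given.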

        if filepath.startswith("/dbfs"):
            dbutils = _get_dbutils(self._get_spark())
            if dbutils:
21 changes: 21 additions & 0 deletions kedro-datasets/tests/spark/test_spark_dataset.py
@@ -1,3 +1,4 @@
# pylint: disable=too-many-lines
import re
import sys
import tempfile
@@ -161,6 +162,7 @@ def isDir(self):
        return "." not in self.path.split("/")[-1]


# pylint: disable=too-many-public-methods
class TestSparkDataSet:
    def test_load_parquet(self, tmp_path, sample_pandas_df):
        temp_path = (tmp_path / "data").as_posix()
@@ -440,6 +442,25 @@ def test_copy(self):
        assert spark_dataset_copy._file_format == "csv"
        assert spark_dataset_copy._save_args == {"mode": "overwrite"}

    def test_dbfs_prefix_warning(self, monkeypatch, caplog):
Contributor:

A bit fussy, but I think this should be 3 separate tests or 3 separate cases of a parameterised test. Even though they're small, each assert tests a different case here.

Contributor Author (jmholzer):

You are right about this, though the linter complains that the file is long when these verbose tests are added (I originally had three separate tests). I would turn the warning off, but honestly I am a bit tired of ignoring the linter. There's not much point to using it if we ignore it every time it tells us off. My solution would be to add this slimmer method for now as this is quite urgent, then open an issue to split the test file. What do you think?

Contributor Author (jmholzer):

Hmm, a better solution would be to split this back into three tests and add an ignore directive, then split the file in a new issue and remove the ignore directive 🤔. This is what I will do.

Contributor:

@jmholzer agree with this. Maybe we should just add more global ignores to the linter configuration for tests to make it less strict if you're finding that you continually need to add inline ignores.
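For context, such a global ignore might look like the following in a .pylintrc (a sketch only; the two rules are the ones this PR suppresses inline, and whether to relax them repo-wide is exactly the judgement call above):

    [MESSAGES CONTROL]
    # Rules the test suite currently suppresses with inline directives:
    disable=too-many-lines,
            too-many-public-methods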

        expected_message = (
            "Using SparkDataSet on Databricks without the `/dbfs` prefix in the "
            "filepath raises an error. Add this prefix to fix the error."
        )

        # test that warning is not raised when not on Databricks
        SparkDataSet(filepath="my_project/data/02_intermediate/processed_data")
        assert expected_message not in caplog.text

        # test that warning is not raised when on Databricks and filepath has /dbfs prefix
        monkeypatch.setenv("DATABRICKS_RUNTIME_VERSION", "7.3")
        SparkDataSet(filepath="/dbfs/my_project/data/02_intermediate/processed_data")
        assert expected_message not in caplog.text

        # test that warning is raised when on Databricks and filepath does not have /dbfs prefix
        SparkDataSet(filepath="my_project/data/02_intermediate/processed_data")
        assert expected_message in caplog.text
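For reference, a minimal sketch of the parameterised variant suggested in the review thread above. It reuses the SparkDataSet import and message wording from this test file; the parameter names are made up, and inside the test class the function would also take self:

    import pytest

    @pytest.mark.parametrize(
        "on_databricks,filepath,warning_expected",
        [
            # not on Databricks: no warning
            (False, "my_project/data/02_intermediate/processed_data", False),
            # on Databricks, filepath has the /dbfs/ prefix: no warning
            (True, "/dbfs/my_project/data/02_intermediate/processed_data", False),
            # on Databricks, filepath lacks the /dbfs/ prefix: warning
            (True, "my_project/data/02_intermediate/processed_data", True),
        ],
    )
    def test_dbfs_prefix_warning(monkeypatch, caplog, on_databricks, filepath, warning_expected):
        expected_message = (
            "Using SparkDataSet on Databricks without the `/dbfs` prefix in the "
            "filepath raises an error. Add this prefix to fix the error."
        )
        if on_databricks:
            monkeypatch.setenv("DATABRICKS_RUNTIME_VERSION", "7.3")
        SparkDataSet(filepath=filepath)
        assert (expected_message in caplog.text) == warning_expected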


class TestSparkDataSetVersionedLocal:
    def test_no_version(self, versioned_dataset_local):