logicalclocks · moritzmeister · Aug 18, 2020 · Aug 16, 2020 · Aug 17, 2020 · Aug 17, 2020
diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py
@@ -14,6 +14,8 @@
 #   limitations under the License.
 #
 
+import os
+
 import pandas as pd
 import numpy as np
 
@@ -40,6 +42,10 @@ def __init__(self):
         self._spark_session.conf.set("hive.exec.dynamic.partition", "true")
         self._spark_session.conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
 
+        if not os.path.exists("/dbfs/"):
+            # Only set up Pydoop outside Databricks (no /dbfs/ mount): Pydoop is not available on Databricks and cannot be easily installed there.
+            self._setup_pydoop()
+
     def sql(self, sql_query, feature_store, online_conn, dataframe_type):
         if not online_conn:
             result_df = self._sql_offline(sql_query, feature_store)
@@ -272,6 +278,39 @@ def _setup_s3(self, storage_connector, path):
             )
         return path.replace("s3", "s3a", 1)
 
+    def _setup_pydoop(self):
+        """Patch Pydoop's path splitter so it accepts `hopsfs` as well as `hdfs`."""
+        # Imported lazily: some execution environments (e.g. Sagemaker) lack Pydoop.
+        from pydoop import hdfs
+
+        class _HopsFSPathSplitter(hdfs.path._HdfsPathSplitter):
+            @classmethod
+            def split(cls, hdfs_path, user):
+                if not hdfs_path:
+                    cls.raise_bad_path(hdfs_path, "empty")
+                scheme, netloc, path = cls.parse(hdfs_path)
+                if not scheme:
+                    # `hdfs_fs` is not bound in this scope; reach the fs module via `hdfs`.
+                    scheme = "file" if hdfs.fs.default_is_local() else "hdfs"
+                if scheme in ("hdfs", "hopsfs"):
+                    if not path:
+                        cls.raise_bad_path(hdfs_path, "path part is empty")
+                    if ":" in path:
+                        cls.raise_bad_path(
+                            hdfs_path, "':' not allowed outside netloc part"
+                        )
+                    hostname, port = cls.split_netloc(netloc)
+                    if not path.startswith("/"):
+                        path = "/user/%s/%s" % (user, path)
+                elif scheme == "file":
+                    hostname, port, path = "", 0, netloc + path
+                else:
+                    cls.raise_bad_path(hdfs_path, "unsupported scheme %r" % scheme)
+                return hostname, port, path
+
+        # Monkey patch Pydoop so every path split goes through the subclass above.
+        hdfs.path._HdfsPathSplitter = _HopsFSPathSplitter
+
 
 class SchemaError(Exception):
     """Thrown when schemas don't match"""
diff --git a/python/setup.py b/python/setup.py
@@ -4,7 +4,8 @@
 
 
 __version__ = imp.load_source(
-    'hsfs.version', os.path.join('hsfs', 'version.py')).__version__
+    "hsfs.version", os.path.join("hsfs", "version.py")
+).__version__
 
 
 def read(fname):
@@ -24,14 +25,9 @@ def read(fname):
         "pyhopshive[thrift]",
         "PyMySQL",
         "pyjks",
-        "sqlalchemy"
+        "sqlalchemy",
     ],
-    extras_require={
-        "dev": [
-            "pytest",
-            "flake8",
-            "black"]
-    },
+    extras_require={"dev": ["pytest", "flake8", "black"]},
     author="Moritz Meister",
     author_email="moritz@logicalclocks.com",
     description="",