Fix Windows compatibility issues #776

Merged
merged 4 commits into from
Oct 25, 2022
10 changes: 6 additions & 4 deletions docs/how-to-guides/feathr-configuration-and-env.md
@@ -38,24 +38,25 @@ Feathr will get the configurations in the following order:
| AZURE_CLIENT_ID | Client ID for authentication into Azure Services. Read [here](https://docs.microsoft.com/en-us/python/api/azure-identity/azure.identity.environmentcredential?view=azure-python) for more details. | This is required if you are using Service Principal to login with Feathr. |
| AZURE_TENANT_ID | Tenant ID for authentication into Azure Services. Read [here](https://docs.microsoft.com/en-us/python/api/azure-identity/azure.identity.environmentcredential?view=azure-python) for more details. | This is required if you are using Service Principal to login with Feathr. |
| AZURE_CLIENT_SECRET | Client secret for authentication into Azure Services. Read [here](https://docs.microsoft.com/en-us/python/api/azure-identity/azure.identity.environmentcredential?view=azure-python) for more details. | This is required if you are using Service Principal to login with Feathr. |
| OFFLINE_STORE__ADLS__ADLS_ENABLED | Whether to enable ADLS as offline store or not. | Optional |
| OFFLINE_STORE__ADLS__ADLS_ENABLED | Whether to enable ADLS as offline store or not. Available value: "True" or "False". Equivalent to "False" if not set. | Optional |
| ADLS_ACCOUNT | ADLS account to connect to. | Required if using ADLS as an offline store. |
| ADLS_KEY | ADLS key used to connect. | Required if using ADLS as an offline store. |
| OFFLINE_STORE__WASB__WASB_ENABLED | Whether to enable Azure BLOB storage as offline store or not. |
| OFFLINE_STORE__WASB__WASB_ENABLED | Whether to enable Azure BLOB storage as offline store or not. Available value: "True" or "False". Equivalent to "False" if not set. | Optional |
| WASB_ACCOUNT | Azure BLOB Storage account to connect to. | Required if using Azure BLOB Storage as an offline store. |
| WASB_KEY | Azure BLOB Storage key used to connect. | Required if using Azure BLOB Storage as an offline store. |
| S3_ACCESS_KEY | AWS S3 access key for the S3 account. | Required if using AWS S3 Storage as an offline store. |
| S3_SECRET_KEY | AWS S3 secret key for the S3 account. | Required if using AWS S3 Storage as an offline store. |
| OFFLINE_STORE__S3__S3_ENABLED | Whether to enable S3 as offline store or not. | Optional |
| OFFLINE_STORE__S3__S3_ENABLED | Whether to enable S3 as offline store or not. Available value: "True" or "False". Equivalent to "False" if not set. | Optional |
| OFFLINE_STORE__S3__S3_ENDPOINT | S3 endpoint. If you use an S3 endpoint, you also need to provide the access key and secret key as environment variables. | Required if using AWS S3 Storage as an offline store. |
| OFFLINE_STORE__JDBC__JDBC_ENABLED | Whether to enable JDBC as offline store or not. | Optional |
| OFFLINE_STORE__JDBC__JDBC_ENABLED | Whether to enable JDBC as offline store or not. Available value: "True" or "False". Equivalent to "False" if not set. | Optional |
| OFFLINE_STORE__JDBC__JDBC_DATABASE | If using JDBC endpoint as offline store, this config specifies the JDBC database to read from. | Required if using JDBC sources as offline store |
| OFFLINE_STORE__JDBC__JDBC_TABLE | If using JDBC endpoint as offline store, this config specifies the JDBC table to read from. Same as `JDBC_TABLE`. | Required if using JDBC sources as offline store |
| JDBC_TABLE | If using JDBC endpoint as offline store, this config specifies the JDBC table to read from. | Required if using JDBC sources as offline store |
| JDBC_USER | If using JDBC endpoint as offline store, this config specifies the JDBC user. | Required if using JDBC sources as offline store |
| JDBC_PASSWORD | If using JDBC endpoint as offline store, this config specifies the JDBC password. | Required if using JDBC sources as offline store |
| KAFKA_SASL_JAAS_CONFIG | See [here](#KAFKA_SASL_JAAS_CONFIG) for more details. | Required if using Kafka/EventHub as streaming source input. |
| PROJECT_CONFIG__PROJECT_NAME | Configures the project name. | Required |
| OFFLINE_STORE__SNOWFLAKE__SNOWFLAKE_ENABLED | Configures whether Snowflake as offline store is enabled or not. Available value: "True" or "False". Equivalent to "False" if not set. | Required if using Snowflake as an offline store. |
| OFFLINE_STORE__SNOWFLAKE__URL | Configures the Snowflake URL. Usually it's something like `dqllago-ol19457.snowflakecomputing.com`. | Required if using Snowflake as an offline store. |
| OFFLINE_STORE__SNOWFLAKE__USER | Configures the Snowflake user. | Required if using Snowflake as an offline store. |
| OFFLINE_STORE__SNOWFLAKE__ROLE | Configures the Snowflake role. Usually it's something like `ACCOUNTADMIN`. | Required if using Snowflake as an offline store. |
@@ -72,6 +73,7 @@ Feathr will get the configurations in the following order:
| SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE | Config string including run time information, spark version, machine size, etc. See [below](#SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE) for more details. | Required if using Databricks |
| SPARK_CONFIG__DATABRICKS__WORK_DIR | Workspace dir for storing all the required configuration files and the jar resources. All the feature definitions will be uploaded here. | Required if using Databricks |
| SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION | Feathr runtime location. Supports local paths, paths starting with `http(s)://`, and paths starting with `dbfs:/`. If not set, the [Feathr package published in Maven](https://search.maven.org/artifact/com.linkedin.feathr/feathr_2.12) will be used. | Required if using Databricks |
| DATABRICKS_WORKSPACE_TOKEN_VALUE | Token value used to access the Databricks workspace. More details can be found at [Authentication using Databricks personal access tokens](https://docs.databricks.com/dev-tools/api/latest/authentication.html). | Required if using Databricks |
| ONLINE_STORE__REDIS__HOST | Redis host name to access Redis cluster. | Required if using Redis as online store. |
| ONLINE_STORE__REDIS__PORT | Redis port number to access Redis cluster. | Required if using Redis as online store. |
| ONLINE_STORE__REDIS__SSL_ENABLED | Whether SSL is enabled to access Redis cluster. | Required if using Redis as online store. |
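The toggles above are plain string environment variables. As a minimal illustration (not part of this PR, and assuming Feathr reads them from `os.environ` as the table describes; the account, user, and token names below are placeholders), they can be set before running a job like this:

```python
import os

# Enable Snowflake as the offline store; the value is the string "True", not a boolean.
os.environ["OFFLINE_STORE__SNOWFLAKE__SNOWFLAKE_ENABLED"] = "True"
os.environ["OFFLINE_STORE__SNOWFLAKE__URL"] = "<account>.snowflakecomputing.com"  # placeholder account
os.environ["OFFLINE_STORE__SNOWFLAKE__USER"] = "feathr_user"                      # hypothetical user

# Leave ADLS disabled; unset is equivalent to "False", so this line is optional.
os.environ["OFFLINE_STORE__ADLS__ADLS_ENABLED"] = "False"

# Databricks personal access token, read here from another variable for illustration only.
os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = os.getenv("MY_DATABRICKS_PAT", "")
```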
36 changes: 20 additions & 16 deletions feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -69,23 +69,25 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str):
"""
src_parse_result = urlparse(local_path_or_http_path)
file_name = os.path.basename(local_path_or_http_path)
# returned paths for the uploaded file
returned_path = os.path.join(self.databricks_work_dir, file_name)
# Returned path for the uploaded file. Note that we cannot use os.path.join here, since on Windows it yields paths like
# dbfs:/feathrazure_cijob_snowflake_9_30_157692\auto_generated_derived_features.conf, where the path separators are mixed and Databricks cannot parse the path.
# So we force the path to be POSIX (forward-slash) style here.
cloud_dest_path = self.databricks_work_dir + "/" + file_name
if src_parse_result.scheme.startswith('http'):
with urlopen(local_path_or_http_path) as f:
# use REST API to avoid local temp file
data = f.read()
files = {'file': data}
# for DBFS APIs, see: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/dbfs
r = requests.post(url=self.workspace_instance_url+'/api/2.0/dbfs/put',
headers=self.auth_headers, files=files, data={'overwrite': 'true', 'path': returned_path})
headers=self.auth_headers, files=files, data={'overwrite': 'true', 'path': cloud_dest_path})
logger.info('{} is downloaded and then uploaded to location: {}',
local_path_or_http_path, returned_path)
local_path_or_http_path, cloud_dest_path)
elif src_parse_result.scheme.startswith('dbfs'):
# passed a cloud path
logger.info(
'Skip uploading file {} as the file starts with dbfs:/', local_path_or_http_path)
returned_path = local_path_or_http_path
cloud_dest_path = local_path_or_http_path
elif src_parse_result.scheme.startswith(('wasb','s3','gs')):
# if the path starts with a location that's not a local path
logger.error("File {} cannot be downloaded. Please upload the file to dbfs manually.", local_path_or_http_path)
@@ -96,27 +98,29 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str):
logger.info("Uploading folder {}", local_path_or_http_path)
dest_paths = []
for item in Path(local_path_or_http_path).glob('**/*.conf'):
returned_path = self.upload_local_file(item.resolve())
dest_paths.extend([returned_path])
returned_path = ','.join(dest_paths)
cloud_dest_path = self._upload_local_file_to_workspace(item.resolve())
dest_paths.extend([cloud_dest_path])
cloud_dest_path = ','.join(dest_paths)
else:
returned_path = self.upload_local_file(local_path_or_http_path)
return returned_path
cloud_dest_path = self._upload_local_file_to_workspace(local_path_or_http_path)
return cloud_dest_path

def upload_local_file(self, local_path: str) -> str:
def _upload_local_file_to_workspace(self, local_path: str) -> str:
"""
Supports transferring a file from a local path to cloud working storage.
"""
file_name = os.path.basename(local_path)
# returned paths for the uploaded file
returned_path = os.path.join(self.databricks_work_dir, file_name)
# Returned path for the uploaded file. Note that we cannot use os.path.join here, since on Windows it yields paths like
# dbfs:/feathrazure_cijob_snowflake_9_30_157692\auto_generated_derived_features.conf, where the path separators are mixed and Databricks cannot parse the path.
# So we force the path to be POSIX (forward-slash) style here.
cloud_dest_path = self.databricks_work_dir + "/" + file_name
# `local_path` will be either a string or a pathlib.Path object, so normalize it to a string
local_path = str(local_path)
try:
DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=local_path, dst=returned_path)
DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=local_path, dst=cloud_dest_path)
except RuntimeError as e:
raise RuntimeError(f"The source path: {local_path}, or the destination path: {returned_path}, is/are not valid.") from e
return returned_path
raise RuntimeError(f"The source path: {local_path}, or the destination path: {cloud_dest_path}, is/are not valid.") from e
return cloud_dest_path

def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}):
"""