Adding dummy entries to feathr_config.yaml file for additional library entries (#1012)

* Removing double quotes

* Commenting out

* Bringing maven jar back

* Adding double quotes

* Reverting changes

* Adding instructions to add additional entries to the Databricks Spark job config, and also adding additional entries to the reference feathr_config.yaml file so the array is initialized as size 2 if users are just copying from it. Also made a note of the error in the code
jainr authored Jan 27, 2023
1 parent 305ef73 commit 304de03
Showing 3 changed files with 11 additions and 3 deletions.
10 changes: 9 additions & 1 deletion feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -231,7 +231,15 @@ def submit_feathr_job(
"coordinates": get_maven_artifact_fullname()}
# Add json-schema dependency
# TODO: find a proper way to deal with unresolved dependencies
submission_params["libraries"][1]["maven"]= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
# Since we are adding another entry to the config, make sure that the spark config passed as part of execution also contains a "libraries" array of at least size 2,
# otherwise you will get an IndexError ("list index out of range")
# Example from feathr_config.yaml -
# config_template: {"run_name":"FEATHR_FILL_IN", ..., "libraries":[{}, {}], ...}

submission_params["libraries"][1]["maven"]= {
"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
"repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"
}
else:
submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(
main_jar_path)
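The new comments above warn about a real failure mode for users who copy an older feathr_config.yaml. As a minimal sketch (not part of this commit; the template below is abbreviated, and the padding loop is a hypothetical caller-side guard rather than Feathr code), this shows why the index-1 assignment needs a two-entry array and how a copied template could be padded defensively:

import json

# Abbreviated template whose "libraries" array has only one entry,
# as older copies of feathr_config.yaml shipped it.
config_template = (
    '{"run_name": "FEATHR_FILL_IN",'
    ' "libraries": [{"jar": "FEATHR_FILL_IN"}],'
    ' "spark_jar_task": {"main_class_name": "FEATHR_FILL_IN"}}'
)
submission_params = json.loads(config_template)

# Assigning submission_params["libraries"][1]["maven"] right away would
# raise IndexError ("list index out of range"), since the array has a
# single element. Padding it to size 2 first avoids the crash:
libraries = submission_params.setdefault("libraries", [])
while len(libraries) < 2:
    libraries.append({})

submission_params["libraries"][1]["maven"] = {
    "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
    "repo": "https://repository.mulesoft.org/nexus/content/repositories/public/",
}
print(json.dumps(submission_params, indent=2))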
2 changes: 1 addition & 1 deletion feathr_config.yaml
@@ -92,7 +92,7 @@ spark_config:
# config string including run time information, spark version, machine size, etc.
# the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6
# The fields marked as "FEATHR_FILL_IN" will be managed by Feathr. Other parameters can be customized. For example, you can customize the node type, spark version, number of workers, instance pools, timeout, etc.
config_template: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","node_type_id":"Standard_D3_v2","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"}},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
config_template: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","node_type_id":"Standard_D3_v2","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"}},"libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
# workspace dir for storing all the required configuration files and the jar resources. All the feature definitions will be uploaded here
work_dir: "dbfs:/feathr_getting_started"
# This is the location of the runtime jar for Spark job submission. If you have compiled the runtime yourself, you need to specify this location.
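A quick sanity check (a sketch, not a Feathr API) for anyone copying the quoted template string above: parse it with json.loads and confirm the two-entry "libraries" array survived the copy.

import json

config_template = '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","node_type_id":"Standard_D3_v2","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"}},"libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
template = json.loads(config_template)
assert len(template["libraries"]) >= 2, "libraries needs at least 2 entries, or job submission raises IndexError"
print(template["libraries"])  # [{'jar': 'FEATHR_FILL_IN'}, {'maven': 'FEATHR_FILL_IN'}]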
2 changes: 1 addition & 1 deletion feathr_project/test/test_user_workspace/feathr_config.yaml
@@ -90,7 +90,7 @@ spark_config:
workspace_token_value: ''
# config string including run time information, spark version, machine size, etc.
# the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs
config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}
config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}
# Feathr Job location. Supports local paths, paths starting with http(s)://, and paths starting with dbfs:/
work_dir: 'dbfs:/feathr_getting_started'
# this is the default location so end users don't have to compile the runtime again.
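The same check can be run against the test workspace file itself (a sketch assuming PyYAML and the spark_config -> databricks layout shown above; in this file config_template is inline YAML, so it parses straight to a dict rather than to a string):

import yaml

# Path and key layout are taken from the diff above; adjust if your
# checkout places the test workspace elsewhere.
with open("feathr_project/test/test_user_workspace/feathr_config.yaml") as f:
    cfg = yaml.safe_load(f)

template = cfg["spark_config"]["databricks"]["config_template"]
# Expect the padded two-entry array: [{"jar": ...}, {"maven": ...}]
assert len(template["libraries"]) >= 2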
