Add crawler to Glue workflow

Sage-Bionetworks · Sep 5, 2023 · 8941997
1 parent 0a04e3b
commit 8941997
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 3 deletions.
diff --git a/config/develop/namespaced/glue-workflow.yaml b/config/develop/namespaced/glue-workflow.yaml
@@ -6,11 +6,13 @@ dependencies:
   - develop/namespaced/glue-job-S3ToJsonS3.yaml
   - develop/namespaced/glue-job-JSONToParquet.yaml
   - develop/namespaced/glue-job-compare-parquet.yaml
+  - develop/glue-job-role.yaml
 parameters:
   Namespace: {{ stack_group_config.namespace }}
   JsonBucketName: {{ stack_group_config.intermediate_bucket_name }}
   ParquetBucketName: {{ stack_group_config.processed_data_bucket_name }}
   GlueDatabase: !stack_output_external "{{ stack_group_config.namespace }}-glue-tables::DatabaseName"
+  CrawlerRole: !stack_output_external glue-job-role::RoleArn
   S3ToJsonJobName: !stack_output_external "{{ stack_group_config.namespace }}-glue-job-S3ToJsonS3::JobName"
   CompareParquetStagingNamespace: {{ stack_group_config.namespace }}
   CompareParquetMainNamespace: "main"

diff --git a/config/prod/namespaced/glue-workflow.yaml b/config/prod/namespaced/glue-workflow.yaml
@@ -6,11 +6,13 @@ dependencies:
   - prod/namespaced/glue-job-S3ToJsonS3.yaml
   - prod/namespaced/glue-job-JSONToParquet.yaml
   - prod/namespaced/glue-job-compare-parquet.yaml
+  - prod/glue-job-role.yaml
 parameters:
   Namespace: {{ stack_group_config.namespace }}
   JsonBucketName: {{ stack_group_config.intermediate_bucket_name }}
   ParquetBucketName: {{ stack_group_config.processed_data_bucket_name }}
   GlueDatabase: !stack_output_external "{{ stack_group_config.namespace }}-glue-tables::DatabaseName"
+  CrawlerRole: !stack_output_external glue-job-role::RoleArn
   S3ToJsonJobName: !stack_output_external "{{ stack_group_config.namespace }}-glue-job-S3ToJsonS3::JobName"
   CompareParquetStagingNamespace: "staging"
   CompareParquetMainNamespace: "main"

diff --git a/templates/glue-workflow.j2 b/templates/glue-workflow.j2
@@ -42,6 +42,10 @@ Parameters:
     Description: >-
         Glue database containing Glue tables for use in JSON to Parquet job.
 
+  CrawlerRole:
+    Type: String
+    Description: ARN of the IAM Role the crawler will assume.
+
   S3ToJsonJobName:
     Type: String
     Description: The name of the S3 To JSON Job
@@ -97,18 +101,54 @@ Resources:
     Type: AWS::Glue::Trigger
     Properties:
       Name: !Sub "${Namespace}-S3ToJsonCompleteTrigger"
+      Actions:
+        - CrawlerName: !Ref StandardCrawler
+      Description: This trigger starts the crawler.
+      Type: CONDITIONAL
+      Predicate:
+        Conditions:
+        - JobName: !Ref S3ToJsonJobName
+          State: SUCCEEDED
+          LogicalOperator: EQUALS
+      StartOnCreation: true
+      WorkflowName: !Ref PrimaryWorkflow
+
+  StandardCrawler:
+    Type: AWS::Glue::Crawler
+    Properties:
+      Configuration: '{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"}},"Grouping":{"TableGroupingPolicy":"CombineCompatibleSchemas"}}'
+      DatabaseName: !Ref GlueDatabase
+      Name: !Sub ${Namespace}-standard
+      RecrawlPolicy:
+        RecrawlBehavior: CRAWL_EVERYTHING
+      Role: !Ref CrawlerRole
+      SchemaChangePolicy:
+        DeleteBehavior: LOG
+        UpdateBehavior: LOG
+      Targets:
+        CatalogTargets:
+        - DatabaseName: !Ref GlueDatabase
+          Tables:
+          {% for data_type in sceptre_user_data.dataset_schemas.tables.keys() %}
+          - dataset_{{ data_type.lower() }}
+          {% endfor %}
+
+  CrawlerCompleteTrigger:
+    Type: AWS::Glue::Trigger
+    Properties:
+      Name: !Sub "${Namespace}-CrawlerCompleteTrigger"
       Actions:
         {% for dataset in datasets if not "HealthKit" in dataset["data_type"] and not "Fitbit" in dataset["data_type"] and not "Google" in dataset["data_type"] and not "Garmin" in dataset["data_type"] %}
         - JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
           Arguments: {"--glue-table": {{ "{}".format(dataset["table_name"]) }} }
         {% endfor %}
-      Description: This trigger kicks off every JSON to Parquet job which is not associated with a device and runs after completion of the S3 to JSON job.
+      Description: This trigger kicks off every JSON to Parquet job which is not associated with a device and runs after completion of the crawler.
       Type: CONDITIONAL
       Predicate:
         Conditions:
-        - JobName: !Ref S3ToJsonJobName
-          State: SUCCEEDED
+        - CrawlerName: !Ref StandardCrawler
           LogicalOperator: EQUALS
+          CrawlState: SUCCEEDED
       StartOnCreation: true
       WorkflowName: !Ref PrimaryWorkflow