Skip to content

Commit

Permalink
Add crawler to Glue workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
philerooski committed Sep 5, 2023
1 parent 0a04e3b commit 8941997
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 3 deletions.
2 changes: 2 additions & 0 deletions config/develop/namespaced/glue-workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ dependencies:
- develop/namespaced/glue-job-S3ToJsonS3.yaml
- develop/namespaced/glue-job-JSONToParquet.yaml
- develop/namespaced/glue-job-compare-parquet.yaml
- develop/glue-job-role.yaml
parameters:
Namespace: {{ stack_group_config.namespace }}
JsonBucketName: {{ stack_group_config.intermediate_bucket_name }}
ParquetBucketName: {{ stack_group_config.processed_data_bucket_name }}
GlueDatabase: !stack_output_external "{{ stack_group_config.namespace }}-glue-tables::DatabaseName"
CrawlerRole: !stack_output_external glue-job-role::RoleArn
S3ToJsonJobName: !stack_output_external "{{ stack_group_config.namespace }}-glue-job-S3ToJsonS3::JobName"
CompareParquetStagingNamespace: {{ stack_group_config.namespace }}
CompareParquetMainNamespace: "main"
Expand Down
2 changes: 2 additions & 0 deletions config/prod/namespaced/glue-workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ dependencies:
- prod/namespaced/glue-job-S3ToJsonS3.yaml
- prod/namespaced/glue-job-JSONToParquet.yaml
- prod/namespaced/glue-job-compare-parquet.yaml
- prod/glue-job-role.yaml
parameters:
Namespace: {{ stack_group_config.namespace }}
JsonBucketName: {{ stack_group_config.intermediate_bucket_name }}
ParquetBucketName: {{ stack_group_config.processed_data_bucket_name }}
GlueDatabase: !stack_output_external "{{ stack_group_config.namespace }}-glue-tables::DatabaseName"
CrawlerRole: !stack_output_external glue-job-role::RoleArn
S3ToJsonJobName: !stack_output_external "{{ stack_group_config.namespace }}-glue-job-S3ToJsonS3::JobName"
CompareParquetStagingNamespace: "staging"
CompareParquetMainNamespace: "main"
Expand Down
46 changes: 43 additions & 3 deletions templates/glue-workflow.j2
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ Parameters:
Description: >-
Glue database containing Glue tables for use in JSON to Parquet job.

CrawlerRole:
Type: String
Description: ARN of the IAM Role the crawler will assume.

S3ToJsonJobName:
Type: String
Description: The name of the S3 To JSON Job
Expand Down Expand Up @@ -97,18 +101,54 @@ Resources:
Type: AWS::Glue::Trigger
Properties:
Name: !Sub "${Namespace}-S3ToJsonCompleteTrigger"
Actions:
- CrawlerName: !Ref StandardCrawler
Description: This trigger starts the crawler.
Type: CONDITIONAL
Predicate:
Conditions:
- JobName: !Ref S3ToJsonJobName
State: SUCCEEDED
LogicalOperator: EQUALS
StartOnCreation: true
WorkflowName: !Ref PrimaryWorkflow

# Glue crawler named "<Namespace>-standard". It runs under the role passed in
# the CrawlerRole parameter and crawls existing catalog tables (CatalogTargets)
# in the database given by the GlueDatabase parameter.
StandardCrawler:
  Type: AWS::Glue::Crawler
  Properties:
    Name: !Sub "${Namespace}-standard"
    Role: !Ref CrawlerRole
    DatabaseName: !Ref GlueDatabase
    # Crawler configuration is passed as a JSON string: partitions use
    # AddOrUpdateBehavior=InheritFromTable and table grouping uses
    # CombineCompatibleSchemas.
    Configuration: '{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"}},"Grouping":{"TableGroupingPolicy":"CombineCompatibleSchemas"}}'
    RecrawlPolicy:
      RecrawlBehavior: CRAWL_EVERYTHING
    # Both schema-change behaviors are set to LOG.
    SchemaChangePolicy:
      UpdateBehavior: LOG
      DeleteBehavior: LOG
    Targets:
      CatalogTargets:
        - DatabaseName: !Ref GlueDatabase
          # One "dataset_<lowercased key>" table per key of
          # sceptre_user_data.dataset_schemas.tables.
          Tables:
            {% for schema_key in sceptre_user_data.dataset_schemas.tables %}
            - dataset_{{ schema_key | lower }}
            {% endfor %}

# Conditional trigger in PrimaryWorkflow. When StandardCrawler reaches
# CrawlState SUCCEEDED, it starts one "<Namespace>-<stackname_prefix>-Job"
# action per selected dataset, passing the dataset's table name as the
# --glue-table argument.
CrawlerCompleteTrigger:
  Type: AWS::Glue::Trigger
  Properties:
    Name: !Sub "${Namespace}-CrawlerCompleteTrigger"
    Actions:
      # Datasets whose data_type contains HealthKit, Fitbit, Google or Garmin
      # are filtered out of this trigger by the loop condition below.
      {% for dataset in datasets if not "HealthKit" in dataset["data_type"] and not "Fitbit" in dataset["data_type"] and not "Google" in dataset["data_type"] and not "Garmin" in dataset["data_type"] %}
      - JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
        Arguments: {"--glue-table": {{ "{}".format(dataset["table_name"]) }} }
      {% endfor %}
    # Single Description key: the stale "S3 to JSON job" wording is removed so
    # the mapping no longer contains a duplicate key.
    Description: >-
      This trigger kicks off every JSON to Parquet job which is not associated
      with a device and runs after completion of the crawler.
    Type: CONDITIONAL
    Predicate:
      Conditions:
        # The condition watches the crawler only. The leftover JobName/State
        # pair from the S3-to-JSON trigger is dropped so that job fields and
        # crawler fields are not mixed in one condition.
        - CrawlerName: !Ref StandardCrawler
          LogicalOperator: EQUALS
          CrawlState: SUCCEEDED
    StartOnCreation: true
    WorkflowName: !Ref PrimaryWorkflow

Expand Down

0 comments on commit 8941997

Please sign in to comment.