From 2eca073f69f99a53417034bdb5bd3cd4685d106f Mon Sep 17 00:00:00 2001 From: "T. Thyer" Date: Tue, 28 Sep 2021 16:56:51 -0700 Subject: [PATCH 1/3] Add glue tables stack --- config/develop/glue-database.yaml | 1 + templates/glue-database.yaml | 577 +++++++++++++++++++++++++++++- 2 files changed, 577 insertions(+), 1 deletion(-) diff --git a/config/develop/glue-database.yaml b/config/develop/glue-database.yaml index bf1c446..fca7f76 100644 --- a/config/develop/glue-database.yaml +++ b/config/develop/glue-database.yaml @@ -2,5 +2,6 @@ template_path: glue-database.yaml stack_name: glue-database parameters: DatabaseName: bridge-downstream-etl + JsonBucketName: !stack_output_external bridge-downstream-dev-intermediate-bucket::BucketName stack_tags: {{ stack_group_config.default_stack_tags }} diff --git a/templates/glue-database.yaml b/templates/glue-database.yaml index dae342a..1ee3436 100644 --- a/templates/glue-database.yaml +++ b/templates/glue-database.yaml @@ -1,9 +1,19 @@ AWSTemplateFormatVersion: '2010-09-09' -Description: Groups metadata tables for Glue. +Description: Glue database and tables Parameters: DatabaseName: Type: String + Description: Name of the Glue database + + JsonBucketName: + Type: String + Description: Name of the S3 bucket storing the datasets + + JsonPrefix: + Type: String + Description: Prefix of the object keys + Default: raw_json Resources: @@ -14,6 +24,571 @@ Resources: DatabaseInput: Name: !Ref DatabaseName + AnswersTable: + Type: AWS::Glue::Table + Properties: + CatalogId: !Ref AWS::AccountId + DatabaseName: !Ref GlueDatabase + TableInput: + Name: answers + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + PartitionKeys: + - Name: taskidentifier + Type: string + - Name: year + Type: string + - Name: month + Type: string + - Name: day + Type: string + - Name: recordid + Type: string + Retention: 0 + StorageDescriptor: + Columns: + - Name: distractions + Type: boolean + - Name: taskidentifier + Type: string + - Name: year + Type: int + - Name: month + Type: int + - Name: day + Type: int + - Name: recordid + Type: string + - Name: number match_abandonassessment + Type: boolean + Compressed: false + InputFormat: org.apache.hadoop.mapred.TextInputFormat + Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=answers/ + OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + SerdeInfo: + Parameters: + paths: Number Match_abandonAssessment,day,distractions,month,recordId,taskIdentifier,year + SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + StoredAsSubDirectories: false + TableType: EXTERNAL_TABLE + + InfoTable: + Type: AWS::Glue::Table + Properties: + CatalogId: !Ref AWS::AccountId + DatabaseName: !Ref GlueDatabase + TableInput: + Name: info + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + PartitionKeys: + - Name: taskidentifier + Type: string + - Name: year + Type: string + - Name: month + Type: string + - Name: day + Type: string + - Name: recordid + Type: string + Retention: 0 + StorageDescriptor: + Columns: + - Name: format + Type: string + - Name: appversion + Type: string + - Name: appname + Type: string + - Name: datafilename + Type: string + - Name: schemarevision + Type: int + - Name: phoneinfo + Type: string + - Name: files + Type: array> + - Name: item + Type: string + - Name: taskidentifier + Type: string + - Name: year + Type: int + - Name: month + Type: int + - Name: day + Type: int + - Name: recordid + Type: string + Compressed: false + InputFormat: org.apache.hadoop.mapred.TextInputFormat + Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=info/ + OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + SerdeInfo: + Parameters: + paths: appName,appVersion,dataFilename,day,files,format,item,month,phoneInfo,recordId,schemaRevision,taskIdentifier,year + SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + SortColumns: [] + StoredAsSubDirectories: false + TableType: EXTERNAL_TABLE + + MetadataTable: + Type: AWS::Glue::Table + Properties: + CatalogId: !Ref AWS::AccountId + DatabaseName: !Ref GlueDatabase + TableInput: + Name: metadata + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + PartitionKeys: + - Name: taskidentifier + Type: string + - Name: year + Type: string + - Name: month + Type: string + - Name: day + Type: string + - Name: recordid + Type: string + Retention: 0 + StorageDescriptor: + Columns: + - Name: datagroups + Type: string + - Name: deviceinfo + Type: string + - Name: rsdframeworkversion + Type: string + - Name: taskidentifier + Type: string + - Name: files + Type: array> + - Name: startdate + Type: string + - Name: taskrunuuid + Type: string + - Name: devicetypeidentifier + Type: string + - Name: appname + Type: string + - Name: appversion + Type: string + - Name: enddate + Type: string + - Name: year + Type: int + - Name: month + Type: int + - Name: day + Type: int + - Name: recordid + Type: string + - Name: healthcode + Type: string + - Name: createdon + Type: string + - Name: scheduleidentifier + Type: string + - Name: activitylabel + Type: string + - Name: scheduledactivityguid + Type: string + - Name: scheduledon + Type: string + Compressed: false + InputFormat: org.apache.hadoop.mapred.TextInputFormat + Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=metadata/ + OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + SerdeInfo: + Parameters: + paths: activityLabel,appName,appVersion,createdon,dataGroups,day,deviceInfo,deviceTypeIdentifier,endDate,files,healthcode,month,recordId,rsdFrameworkVersion,scheduleIdentifier,scheduledActivityGuid,scheduledOn,startDate,taskIdentifier,taskRunUUID,year + SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + StoredAsSubDirectories: false + TableType: EXTERNAL_TABLE + + MicrophoneLevelsTable: + Type: AWS::Glue::Table + Properties: + CatalogId: !Ref AWS::AccountId + DatabaseName: !Ref GlueDatabase + TableInput: + Name: microphone_levels + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + PartitionKeys: + - Name: taskidentifier + Type: string + - Name: year + Type: string + - Name: month + Type: string + - Name: day + Type: string + - Name: recordid + Type: string + Retention: 0 + StorageDescriptor: + Columns: + - Name: uptime + Type: double + - Name: unit + Type: string + - Name: peak + Type: double + - Name: average + Type: double + - Name: steppath + Type: string + - Name: timeinterval + Type: int + - Name: timestamp + Type: double + - Name: taskidentifier + Type: string + - Name: year + Type: int + - Name: month + Type: int + - Name: day + Type: int + - Name: recordid + Type: string + Compressed: false + InputFormat: org.apache.hadoop.mapred.TextInputFormat + Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=microphone_levels/ + OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + SerdeInfo: + Parameters: + paths: average,day,month,peak,recordId,stepPath,taskIdentifier,timeInterval,timestamp,unit,uptime,year + SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + StoredAsSubDirectories: false + TableType: EXTERNAL_TABLE + + MotionTable: + Type: AWS::Glue::Table + Properties: + CatalogId: !Ref AWS::AccountId + DatabaseName: !Ref GlueDatabase + TableInput: + Name: motion + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + PartitionKeys: + - Name: taskidentifier + Type: string + - Name: year + Type: string + - Name: month + Type: string + - Name: day + Type: string + - Name: recordid + Type: string + Retention: 0 + StorageDescriptor: + Columns: + - Name: uptime + Type: double + - Name: timestamp + Type: double + - Name: timestampdate + Type: string + - Name: steppath + Type: string + - Name: taskidentifier + Type: string + - Name: year + Type: int + - Name: month + Type: int + - Name: day + Type: int + - Name: recordid + Type: string + - Name: x + Type: double + - Name: y + Type: double + - Name: sensortype + Type: string + - Name: z + Type: double + Compressed: false + InputFormat: org.apache.hadoop.mapred.TextInputFormat + Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=motion/ + OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + SerdeInfo: + Parameters: + paths: day,month,recordId,sensorType,stepPath,taskIdentifier,timestamp,timestampDate,uptime,x,y,year,z + SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + StoredAsSubDirectories: false + TableType: EXTERNAL_TABLE + + TaskDataTable: + Type: AWS::Glue::Table + Properties: + CatalogId: !Ref AWS::AccountId + DatabaseName: !Ref GlueDatabase + TableInput: + Name: taskdata + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + PartitionKeys: + - Name: taskidentifier + Type: string + - Name: year + Type: string + - Name: month + Type: string + - Name: day + Type: string + - Name: recordid + Type: string + Retention: 0 + StorageDescriptor: + Columns: + - Name: locale + Type: string + - Name: testversion + Type: string + - Name: enddate + Type: string + - Name: startdate + Type: string + - Name: steps + Type: array>,startDate:string,identifier:string,type:string,instruction:boolean,response:int,score:int,responseTime:bigint,timeRemaining:int>> + - Name: taskname + Type: string + - Name: userinteractions + Type: array> + - Name: stephistory + Type: array>,startDate:string,endDate:string,identifier:string,type:string,instruction:boolean,response:int,score:int,responseTime:bigint,timeRemaining:int>> + - Name: taskstatus + Type: string + - Name: taskidentifier + Type: string + - Name: year + Type: int + - Name: month + Type: int + - Name: day + Type: int + - Name: recordid + Type: string + Compressed: false + InputFormat: org.apache.hadoop.mapred.TextInputFormat + Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=taskdata/ + OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + SerdeInfo: + Parameters: + paths: day,endDate,locale,month,recordId,startDate,stepHistory,steps,taskIdentifier,taskName,taskStatus,testVersion,userInteractions,year + SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + StoredAsSubDirectories: false + TableType: EXTERNAL_TABLE + + TaskResultTable: + Type: AWS::Glue::Table + Properties: + CatalogId: !Ref AWS::AccountId + DatabaseName: !Ref GlueDatabase + TableInput: + Name: taskresult + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + PartitionKeys: + - Name: taskidentifier + Type: string + - Name: year + Type: string + - Name: month + Type: string + - Name: day + Type: string + - Name: recordid + Type: string + Retention: 0 + StorageDescriptor: + Columns: + - Name: asyncresults + Type: array,seaLevelPressure:int,temperature:double,identifier:string,type:string,rain:string,groundLevelPressure:int>,airQuality:struct,provider:string,startDate:string,type:string,identifier:string,aqi:int>,startDate:string,type:string,identifier:string,endDate:string,contentType:string,startUptime:double,relativePath:string,errorDescription:string,errorDomain:string,errorCode:int>> + - Name: schemainfo + Type: struct + - Name: enddate + Type: string + - Name: startdate + Type: string + - Name: stephistory + Type: array>,startDate:string,identifier:string,type:string,instruction:boolean,response:int,score:int,responseTime:bigint,timeRemaining:int>>,taskName:string,userInteractions:array>,stepHistory:array>,startDate:string,endDate:string,identifier:string,type:string,instruction:boolean,response:int,score:int,responseTime:bigint,timeRemaining:int>>,taskStatus:string,answerType:struct,value:boolean,type:string,identifier:string>>,schemaInfo:struct,endDate:string,startDate:string,stepHistory:array>,startDate:string,endDate:string,identifier:string,type:string,instruction:boolean,response:int,score:int,responseTime:bigint,timeRemaining:int,skipToIdentifier:string,inputResults:array>,startDate:string,endDate:string,identifier:string,type:string,instruction:boolean>>>>,taskRunUUID:string,identifier:string,type:string,nodePath:array,assessmentIdentifier:string,schemaIdentifier:string,answerType:struct,value:boolean,questionText:string>> + - Name: taskrunuuid + Type: string + - Name: identifier + Type: string + - Name: type + Type: string + - Name: nodepath + Type: array + - Name: assessmentidentifier + Type: string + - Name: taskidentifier + Type: string + - Name: year + Type: int + - Name: month + Type: int + - Name: day + Type: int + - Name: recordid + Type: string + Compressed: false + InputFormat: org.apache.hadoop.mapred.TextInputFormat + Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=taskresult/ + OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + SerdeInfo: + Parameters: + paths: assessmentIdentifier,asyncResults,day,endDate,identifier,month,nodePath,recordId,schemaInfo,startDate,stepHistory,taskIdentifier,taskRunUUID,type,year + SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + StoredAsSubDirectories: false + TableType: EXTERNAL_TABLE + + WeatherTable: + Type: AWS::Glue::Table + Properties: + CatalogId: !Ref AWS::AccountId + DatabaseName: !Ref GlueDatabase + TableInput: + Name: weather + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + PartitionKeys: + - Name: taskidentifier + Type: string + - Name: year + Type: string + - Name: month + Type: string + - Name: day + Type: string + - Name: recordid + Type: string + Retention: 0 + StorageDescriptor: + Columns: + - Name: weather + Type: struct,seaLevelPressure:int,temperature:double,identifier:string,type:string,rain:string,groundLevelPressure:int> + - Name: airquality + Type: struct,provider:string,startDate:string,type:string,identifier:string,aqi:int> + - Name: startdate + Type: string + - Name: type + Type: string + - Name: identifier + Type: string + - Name: enddate + Type: string + - Name: taskidentifier + Type: string + - Name: year + Type: int + - Name: month + Type: int + - Name: day + Type: int + - Name: recordid + Type: string + Compressed: false + InputFormat: org.apache.hadoop.mapred.TextInputFormat + Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=weather/ + OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Parameters: + CrawlerSchemaDeserializerVersion: '1.0' + CrawlerSchemaSerializerVersion: '1.0' + classification: json + compressionType: none + typeOfData: file + SerdeInfo: + Parameters: + paths: airQuality,day,endDate,identifier,month,recordId,startDate,taskIdentifier,type,weather,year + SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + StoredAsSubDirectories: false + TableType: EXTERNAL_TABLE + Outputs: DatabaseName: From d6da17c58c2d2572baec879dd48830ddc2870340 Mon Sep 17 00:00:00 2001 From: "T. Thyer" Date: Wed, 6 Oct 2021 12:56:14 -0700 Subject: [PATCH 2/3] Update tables based on columns in mtb_construct database --- templates/glue-database.yaml | 84 +++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/templates/glue-database.yaml b/templates/glue-database.yaml index 1ee3436..8352778 100644 --- a/templates/glue-database.yaml +++ b/templates/glue-database.yaml @@ -63,6 +63,10 @@ Resources: Type: int - Name: recordid Type: string + - Name: dimensional change card sort_abandonassessment + Type: boolean + - Name: flanker inhibitory control_abandonassessment + Type: boolean - Name: number match_abandonassessment Type: boolean Compressed: false @@ -77,7 +81,7 @@ Resources: typeOfData: file SerdeInfo: Parameters: - paths: Number Match_abandonAssessment,day,distractions,month,recordId,taskIdentifier,year + paths: Dimensional Change Card Sort_abandonAssessment,Flanker Inhibitory Control_abandonAssessment,Number Match_abandonAssessment,day,distractions,month,recordId,taskIdentifier,year SerializationLibrary: org.openx.data.jsonserde.JsonSerDe StoredAsSubDirectories: false TableType: EXTERNAL_TABLE @@ -180,27 +184,35 @@ Resources: Retention: 0 StorageDescriptor: Columns: - - Name: datagroups + - Name: scheduledactivityguid + Type: string + - Name: appname Type: string - Name: deviceinfo Type: string + - Name: scheduledon + Type: string - Name: rsdframeworkversion Type: string + - Name: appversion + Type: string - Name: taskidentifier Type: string + - Name: enddate + Type: string + - Name: datagroups + Type: string + - Name: activitylabel + Type: string - Name: files - Type: array> - - Name: startdate + Type: array> + - Name: scheduleidentifier Type: string - Name: taskrunuuid Type: string - Name: devicetypeidentifier Type: string - - Name: appname - Type: string - - Name: appversion - Type: string - - Name: enddate + - Name: startdate Type: string - Name: year Type: int @@ -210,18 +222,12 @@ Resources: Type: int - Name: recordid Type: string + - Name: substudymemberships + Type: string - Name: healthcode Type: string - Name: createdon Type: string - - Name: scheduleidentifier - Type: string - - Name: activitylabel - Type: string - - Name: scheduledactivityguid - Type: string - - Name: scheduledon - Type: string Compressed: false InputFormat: org.apache.hadoop.mapred.TextInputFormat Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=metadata/ @@ -404,24 +410,32 @@ Resources: Retention: 0 StorageDescriptor: Columns: - - Name: locale + - Name: taskstatus Type: string + - Name: accuracy + Type: double - Name: testversion Type: string + - Name: nanticipationpractice + Type: int + - Name: stephistory + Type: array>,startDate:string,endDate:string,identifier:string,type:string,instruction:boolean,response:string,score:int,failRuleSection:string,responseTime:bigint,anticipationError:int,stepGroup:string,se:double,theta:double,practice:boolean,timeRemaining:int>> + - Name: locale + Type: string - Name: enddate Type: string + - Name: nanticipationlive + Type: int - Name: startdate Type: string - - Name: steps - Type: array>,startDate:string,identifier:string,type:string,instruction:boolean,response:int,score:int,responseTime:bigint,timeRemaining:int>> - Name: taskname Type: string - Name: userinteractions - Type: array> - - Name: stephistory - Type: array>,startDate:string,endDate:string,identifier:string,type:string,instruction:boolean,response:int,score:int,responseTime:bigint,timeRemaining:int>> - - Name: taskstatus - Type: string + Type: array> + - Name: rawscore + Type: int + - Name: steps + Type: array>,startDate:string,identifier:string,type:string,instruction:boolean,response:string,failRuleSection:string,score:int,responseTime:bigint,anticipationError:int,stepGroup:string,theta:double,se:double,practice:boolean,timeRemaining:int>> - Name: taskidentifier Type: string - Name: year @@ -432,6 +446,24 @@ Resources: Type: int - Name: recordid Type: string + - Name: totalerrors + Type: int + - Name: ratescore + Type: double + - Name: consideredsteps + Type: array> + - Name: starttheta + Type: int + - Name: startse + Type: int + - Name: itemcount + Type: int + - Name: administeredsteps + Type: array> + - Name: finalse + Type: double + - Name: finaltheta + Type: double Compressed: false InputFormat: org.apache.hadoop.mapred.TextInputFormat Location: !Sub s3://${JsonBucketName}/${JsonPrefix}/dataset=taskdata/ @@ -444,7 +476,7 @@ Resources: typeOfData: file SerdeInfo: Parameters: - paths: day,endDate,locale,month,recordId,startDate,stepHistory,steps,taskIdentifier,taskName,taskStatus,testVersion,userInteractions,year + paths: accuracy,administeredSteps,consideredSteps,day,endDate,finalSE,finalTheta,itemCount,locale,month,nAnticipationLive,nAnticipationPractice,rateScore,rawScore,recordId,startDate,startSE,startTheta,stepHistory,steps,taskIdentifier,taskName,taskStatus,testVersion,totalErrors,userInteractions,year SerializationLibrary: org.openx.data.jsonserde.JsonSerDe StoredAsSubDirectories: false TableType: EXTERNAL_TABLE From 440fe81f2d2823464b5c6f668c92a53c2553d027 Mon Sep 17 00:00:00 2001 From: "T. Thyer" Date: Wed, 6 Oct 2021 13:50:10 -0700 Subject: [PATCH 3/3] Add a key for the source bucket --- config/develop/glue-workflows.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/develop/glue-workflows.yaml b/config/develop/glue-workflows.yaml index 0156d34..f96f720 100644 --- a/config/develop/glue-workflows.yaml +++ b/config/develop/glue-workflows.yaml @@ -7,7 +7,7 @@ dependencies: - develop/glue-database.yaml parameters: SourceBucketName: !stack_output_external bridge-downstream-dev-source-bucket::BucketName - SourceKey: '' + SourceKey: 'mtb-sample-data' JsonBucketName: !stack_output_external bridge-downstream-dev-intermediate-bucket::BucketName ParquetBucketName: !stack_output_external bridge-downstream-dev-parquet-bucket::BucketName GlueDatabaseName: !stack_output_external glue-database::DatabaseName