Skip to content

Commit

Permalink
Data dump apps (#611)
Browse files Browse the repository at this point in the history
* (infra): use analytics read replica for data dumps jobs

* (infra): split data dump jobs / apps

* (infra): split data dump jobs / apps
  • Loading branch information
larisa17 authored Jun 10, 2024
1 parent 0c7faeb commit 3368ed9
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 29 deletions.
71 changes: 44 additions & 27 deletions infra/aws/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,7 @@ export const dailyDataDumpTaskDefinition = createScheduledTask(
"--s3-uri=s3://passport-scorer/daily_data_dumps/",
"--batch-size=20000",
].join(" "),
scheduleExpression: "cron(30 0 ? * * *)", // Run the task daily at 00:30 UTC
scheduleExpression: "cron(30 0 ? * * *)", // Run the task daily at 00:45 UTC
alertTopic: pagerdutyTopic,
},
{
Expand All @@ -1076,32 +1076,49 @@ export const dailyDataDumpTaskDefinition = createScheduledTask(
false
);

export const dailyDataDumpTaskDefinitionParquet = createScheduledTask(
"daily-data-dump-parquet",
{
...baseScorerServiceConfig,
cpu: 1024,
memory: 2048,
securityGroup: secgrp,
ephemeralStorageSizeInGiB: 100,
command: [
"python",
"manage.py",
"scorer_dump_data_parquet",
"--database=read_replica_0",
"--apps=registry,ceramic_cache,account,scorer_weighted,trusta_labs",
"--s3-uri=s3://passport-scorer/daily_data_dumps/",
"--batch-size=20000",
].join(" "),
scheduleExpression: "cron(30 0 ? * * *)", // Run the task daily at 00:30 UTC
alertTopic: pagerdutyTopic,
},
{
...envConfig,
readReplicaConnectionUrl: readreplicaAnalyticsConnectionUrl,
},
86400, // 24h max period
false
// Apps: registry, ceramic_cache, account, scorer_weighted, trusta_labs
// Split the data dump into one scheduled task per app, so that a single failing app does not cause the whole dump to fail.

/**
 * App names passed (one at a time) to the `--apps` flag of the parquet
 * data-dump command; one scheduled task is created per entry.
 */
const dailyDataDumpApps = [
  "registry",
  "ceramic_cache",
  "account",
  "scorer_weighted",
  "trusta_labs",
];

/**
 * Scheduled parquet data-dump tasks, one per entry in `dailyDataDumpApps`.
 *
 * Each task runs `scorer_dump_data_parquet` for a single app against the
 * `read_replica_0` database and writes under the shared
 * `s3://passport-scorer/daily_data_dumps/` prefix. Dumping each app in its
 * own task means a failure in one app does not fail the dump of the others.
 *
 * The array holds whatever `createScheduledTask` returns, in the same order
 * as `dailyDataDumpApps`.
 */
export const dailyDataDumpTaskDefinitionParquetList = dailyDataDumpApps.map(
  (app: string) =>
    createScheduledTask(
      // The app name is part of the task name, so each task gets distinct
      // resource names.
      `daily-data-dump-parquet-${app}`,
      {
        ...baseScorerServiceConfig,
        cpu: 1024,
        memory: 2048,
        securityGroup: secgrp,
        ephemeralStorageSizeInGiB: 100,
        command: [
          "python",
          "manage.py",
          "scorer_dump_data_parquet",
          "--database=read_replica_0",
          `--apps=${app}`, // dump only this task's app
          "--s3-uri=s3://passport-scorer/daily_data_dumps/",
          "--batch-size=20000",
        ].join(" "),
        scheduleExpression: "cron(45 0 ? * * *)", // Run the task daily at 00:45 UTC
        alertTopic: pagerdutyTopic,
      },
      {
        ...envConfig,
        // Override the read replica URL with the analytics replica's URL.
        readReplicaConnectionUrl: readreplicaAnalyticsConnectionUrl,
      },
      86400, // 24h max period
      false
    )
);

/*
Expand Down
4 changes: 2 additions & 2 deletions infra/lib/scorer/scheduledTasks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ export function createScheduledTask(
scheduleExpression,
});

const eventsStsAssumeRole = new aws.iam.Role(`${name}-eventsStsAssumeRole`, {
const eventsStsAssumeRole = new aws.iam.Role(`${name}-eventsRole`, {
assumeRolePolicy: JSON.stringify({
Version: "2012-10-17",
Statement: [
Expand Down Expand Up @@ -193,7 +193,7 @@ export function createScheduledTask(
},
});

new aws.cloudwatch.EventTarget(`scheduledEventTarget-${name}`, {
new aws.cloudwatch.EventTarget(`scheduled-${name}`, {
rule: scheduledEventRule.name,
arn: cluster.arn,
roleArn: eventsStsAssumeRole.arn,
Expand Down

0 comments on commit 3368ed9

Please sign in to comment.