Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OC-926: Make ECS task run incremental ARI import #747

Merged
merged 18 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ coverage
# VSC settings
.vscode
.env
*.code-workspace
*.code-workspace

# Docker
volume
49 changes: 36 additions & 13 deletions api/scripts/ariImport.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,29 +27,43 @@ const checkBooleanArgValue = (arg: string): void => {
* - full: If "true", the script will import all ARIs from the ARI DB, instead of stopping when it
* thinks it has found all the new ones (the incremental way).
* - Default: false
* - reportFormat: Controls how the output of the job is reported. Can be "email" or "file". Emails
* are sent to the addresses listed in the INGEST_REPORT_RECIPIENTS environment variable. Files are
* written to "ari-import-report.txt".
* - Default: "file"
*
* e.g.:
* npm run ariImport -- allDepartments=true full=true
*/
const parseArguments = (): { importAllDepartments: boolean; dryRun: boolean; full: boolean } => {
const parseArguments = (): {
importAllDepartments: boolean;
dryRun: boolean;
full: boolean;
reportFormat: I.IngestReportFormat;
} => {
const args = Helpers.parseNpmScriptArgs();

for (const arg of Object.keys(args)) {
if (!['allDepartments', 'dryRun', 'full'].includes(arg)) {
if (!['allDepartments', 'dryRun', 'full', 'reportFormat'].includes(arg)) {
throw new Error(`Unexpected argument: ${arg}`);
}
}

const { allDepartments: allDepartmentsArg, dryRun: dryRunArg, full: fullArg } = args;
const { allDepartments: allDepartmentsArg, dryRun: dryRunArg, full: fullArg, reportFormat: reportFormatArg } = args;

for (const arg of [allDepartmentsArg, dryRunArg, fullArg]) {
checkBooleanArgValue(arg);
}

if (reportFormatArg && !(reportFormatArg === 'email' || reportFormatArg === 'file')) {
throw new Error(`"reportFormat" must be "email" or "file"`);
}

return {
importAllDepartments: !!allDepartmentsArg,
finlay-jisc marked this conversation as resolved.
Show resolved Hide resolved
dryRun: !!dryRunArg,
full: !!fullArg
importAllDepartments: allDepartmentsArg === 'true',
dryRun: dryRunArg === 'true',
full: fullArg === 'true',
reportFormat: reportFormatArg ? (reportFormatArg as I.IngestReportFormat) : 'file'
};
};

Expand All @@ -58,7 +72,11 @@ const parseArguments = (): { importAllDepartments: boolean; dryRun: boolean; ful
* Differs from incremental ingest by fetching all ARIs before processing them.
* It will not stop until all ARIs have been processed.
*/
export const fullAriIngest = async (allDepartments: boolean, dryRun: boolean): Promise<string> => {
export const fullAriIngest = async (
allDepartments: boolean,
dryRun: boolean,
reportFormat: I.IngestReportFormat
): Promise<string> => {
const startTime = performance.now();

// Collect all ARIs in a variable.
Expand Down Expand Up @@ -176,7 +194,7 @@ export const fullAriIngest = async (allDepartments: boolean, dryRun: boolean): P
const durationSeconds = Math.round((endTime - startTime) / 100) / 10;

// Write report file.
await ariUtils.ingestReport('file', {
await ariUtils.ingestReport(reportFormat, {
checkedCount: aris.length,
durationSeconds,
createdCount,
Expand All @@ -192,16 +210,21 @@ export const fullAriIngest = async (allDepartments: boolean, dryRun: boolean): P
} ARIs in ${durationSeconds} seconds.`;
};

const ariImport = async (allDepartments: boolean, dryRun: boolean, full: boolean): Promise<string> => {
const ariImport = async (
allDepartments: boolean,
dryRun: boolean,
full: boolean,
reportFormat: I.IngestReportFormat
): Promise<string> => {
if (!full) {
return await integrationService.incrementalAriIngest(dryRun, 'file');
return await integrationService.incrementalAriIngest(dryRun, reportFormat);
} else {
return await fullAriIngest(allDepartments, dryRun);
return await fullAriIngest(allDepartments, dryRun, reportFormat);
}
};

const { importAllDepartments, dryRun, full } = parseArguments();
const { importAllDepartments, dryRun, full, reportFormat } = parseArguments();

ariImport(importAllDepartments, dryRun, full)
ariImport(importAllDepartments, dryRun, full, reportFormat)
.then((message) => console.log(message))
.catch((err) => console.log(err));
8 changes: 8 additions & 0 deletions api/src/components/integration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@ Publications imported via an integration with another system should have the fol

They should also always be owned by an organisational user account. That is, a user with the value `ORGANISATION` for the `role` field.

### Where/how does this run?

On deployed environments, integrations are run in containers on AWS Elastic Container Service. These containers are defined in the infrastructure code (see [Dockerfile](../../../../infra/docker/ariImportRunner/Dockerfile)), so they can be built and tested locally from the `infra/docker/ariImportRunner` directory with `docker compose up` (see [compose.yml](../../../../infra/docker/ariImportRunner/compose.yml)).

They can also be run ad hoc on the local environment via npm scripts, for example (from the `api` directory):

`npm run ariImport -- dryRun=true allDepartments=true full=false`

## Specific integrations

### ARI DB
Expand Down
2 changes: 1 addition & 1 deletion api/src/components/integration/ariUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ export const getParticipatingDepartmentNames = async (): Promise<string[]> => {
};

export const ingestReport = async (
format: 'email' | 'file',
format: I.IngestReportFormat,
ingestDetails: {
checkedCount: number;
durationSeconds: number;
Expand Down
3 changes: 2 additions & 1 deletion api/src/components/integration/service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import * as ariUtils from 'integration/ariUtils';
import * as ecs from 'lib/ecs';
import * as ingestLogService from 'ingestLog/service';
import * as Helpers from 'lib/helpers';
import * as I from 'interface';

/**
* Incremental ARI ingest.
Expand All @@ -12,7 +13,7 @@ import * as Helpers from 'lib/helpers';
* - It encounters an ARI with dateUpdated before the start time of the most
* recent successful ingest (if this start time is available).
*/
export const incrementalAriIngest = async (dryRun: boolean, reportFormat: 'email' | 'file'): Promise<string> => {
export const incrementalAriIngest = async (dryRun: boolean, reportFormat: I.IngestReportFormat): Promise<string> => {
const start = new Date();
const MAX_UNCHANGED_STREAK = 5;
// Get most start time of last successful run to help us know when to stop.
Expand Down
2 changes: 2 additions & 0 deletions api/src/lib/interface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1068,3 +1068,5 @@ export interface HandledARI {
unrecognisedDepartment?: string;
unrecognisedTopics?: string[];
}

export type IngestReportFormat = 'email' | 'file';
13 changes: 13 additions & 0 deletions infra/docker/ariImportRunner/Dockerfile
finlay-jisc marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
FROM public.ecr.aws/docker/library/node:20-alpine
RUN apk add \
ca-certificates \
curl \
gnupg \
git \
openssl \
openssl-dev \
libc6-compat
RUN git clone https://github.com/JiscSD/octopus /app
finlay-jisc marked this conversation as resolved.
Show resolved Hide resolved
WORKDIR /app/api
RUN npm i
CMD ["npm", "run", "ariImport", "--", "dryRun=false", "reportFormat=email"]
10 changes: 10 additions & 0 deletions infra/docker/ariImportRunner/compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# For testing the ARI import runner docker image locally.
services:
ari-import-runner:
build:
context: .
command:
["/bin/sh", "-c", "/app/infra/docker/ariImportRunner/local-ari-import.sh"]
finlay-jisc marked this conversation as resolved.
Show resolved Hide resolved
env_file: ../../../api/.env
# So it can access the DB and opensearch that are exposed on host ports.
network_mode: host
4 changes: 4 additions & 0 deletions infra/docker/ariImportRunner/local-ari-import.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
cd /app/api
npm run ariImport -- dryRun=false reportFormat=file
cat ari-import-report.txt
2 changes: 0 additions & 2 deletions infra/docker/poc/Dockerfile

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ phases:
build:
commands:
- echo Building docker image...
- docker build --platform linux/arm64 --tag $IMAGE_NAME ./infra/docker/poc
- docker build --platform linux/arm64 --tag $IMAGE_NAME ./infra/docker/ariImportRunner
- docker tag $IMAGE_NAME:latest $ACCOUNT_ID.dkr.ecr.$DEFAULT_REGION.amazonaws.com/$PROJECT_NAME-$ENVIRONMENT:$IMAGE_NAME
- docker tag $IMAGE_NAME:latest $ACCOUNT_ID.dkr.ecr.$DEFAULT_REGION.amazonaws.com/$PROJECT_NAME-$ENVIRONMENT:$COMMIT_ID
post_build:
Expand Down
4 changes: 2 additions & 2 deletions infra/modules/codepipeline/codebuild.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ resource "aws_codebuild_project" "deploy-docker-image" {

environment {
compute_type = "BUILD_GENERAL1_SMALL"
image = "aws/codebuild/standard:5.0"
type = "LINUX_CONTAINER"
image = "aws/codebuild/amazonlinux-aarch64-standard:3.0"
finlay-jisc marked this conversation as resolved.
Show resolved Hide resolved
type = "ARM_CONTAINER"
privileged_mode = true

environment_variable {
Expand Down
40 changes: 37 additions & 3 deletions infra/modules/ecs/iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,31 @@ data "aws_iam_policy_document" "task-exec-policy" {
]
resources = ["*"]
}
statement {
effect = "Allow"
actions = [
"ssm:GetParameters"
]
resources = [
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/base_url_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/datacite_endpoint_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/datacite_password_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/datacite_user_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/doi_prefix_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/db_connection_string_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/elastic_search_protocol_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/elasticsearch_user_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/elasticsearch_password_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/elasticsearch_endpoint_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/email_sender_address_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/ingest_report_recipients_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/mail_server_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/participating_ari_user_ids_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/queue_url_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/slack_channel_email_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/sqs_endpoint_${var.environment}_${var.project_name}"
]
}
}

resource "aws_iam_role_policy" "task-exec-policy" {
Expand All @@ -33,7 +58,7 @@ resource "aws_iam_role_policy" "task-exec-policy" {
policy = data.aws_iam_policy_document.task-exec-policy.json
}

data "aws_iam_policy_document" "ecs-task-role-policy" {
data "aws_iam_policy_document" "ecs-task-assume-role-policy" {
statement {
effect = "Allow"
actions = [
Expand All @@ -45,10 +70,9 @@ data "aws_iam_policy_document" "ecs-task-role-policy" {
}
}
}

resource "aws_iam_role" "ecs-task-role" {
name = "${var.project_name}-ecs-task-role-${var.environment}"
assume_role_policy = data.aws_iam_policy_document.ecs-task-role-policy.json
assume_role_policy = data.aws_iam_policy_document.ecs-task-assume-role-policy.json
}

data "aws_iam_policy_document" "task-policy" {
Expand All @@ -62,6 +86,16 @@ data "aws_iam_policy_document" "task-policy" {
]
resources = ["*"]
}
statement {
effect = "Allow"
actions = [
"ses:SendEmail",
"ses:SendRawEmail"
]
resources = [
"arn:aws:ses:eu-west-1:${local.account_id}:identity/*"
]
}
}

resource "aws_iam_role_policy" "task-policy" {
Expand Down
2 changes: 1 addition & 1 deletion infra/modules/ecs/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
output "task_security_group_id" {
value = aws_security_group.hello-world-task-sg.id
value = aws_security_group.ari-import-task-sg.id
}
Loading
Loading