Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OC-926: Make ECS task run incremental ARI import #747

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ coverage
# VSC settings
.vscode
.env
*.code-workspace
*.code-workspace

# Docker
volume
49 changes: 36 additions & 13 deletions api/scripts/ariImport.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,29 +27,43 @@ const checkBooleanArgValue = (arg: string): void => {
* - full: If "true", the script will import all ARIs from the ARI DB, instead of stopping when it
* thinks it has found all the new ones (the incremental way).
* - Default: false
* - reportFormat: Controls how the output of the job is reported. Can be "email" or "file". Emails
* are sent to the addresses listed in the INGEST_REPORT_RECIPIENTS environment variable. Files are
* written to "ari-import-report.txt".
* - Default: "file"
*
* e.g.:
* npm run ariImport -- allDepartments=true full=true
*/
const parseArguments = (): { importAllDepartments: boolean; dryRun: boolean; full: boolean } => {
const parseArguments = (): {
importAllDepartments: boolean;
dryRun: boolean;
full: boolean;
reportFormat: I.IngestReportFormat;
} => {
const args = Helpers.parseNpmScriptArgs();

for (const arg of Object.keys(args)) {
if (!['allDepartments', 'dryRun', 'full'].includes(arg)) {
if (!['allDepartments', 'dryRun', 'full', 'reportFormat'].includes(arg)) {
throw new Error(`Unexpected argument: ${arg}`);
}
}

const { allDepartments: allDepartmentsArg, dryRun: dryRunArg, full: fullArg } = args;
const { allDepartments: allDepartmentsArg, dryRun: dryRunArg, full: fullArg, reportFormat: reportFormatArg } = args;

for (const arg of [allDepartmentsArg, dryRunArg, fullArg]) {
checkBooleanArgValue(arg);
}

if (reportFormatArg && !(reportFormatArg === 'email' || reportFormatArg === 'file')) {
throw new Error(`"reportFormat" must be "email" or "file"`);
}

return {
importAllDepartments: !!allDepartmentsArg,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These were actually always setting the property to true, so I changed the way they're set.

dryRun: !!dryRunArg,
full: !!fullArg
importAllDepartments: allDepartmentsArg === 'true',
dryRun: dryRunArg === 'true',
full: fullArg === 'true',
reportFormat: reportFormatArg ? (reportFormatArg as I.IngestReportFormat) : 'file'
};
};

Expand All @@ -58,7 +72,11 @@ const parseArguments = (): { importAllDepartments: boolean; dryRun: boolean; ful
* Differs from incremental ingest by fetching all ARIs before processing them.
* It will not stop until all ARIs have been processed.
*/
export const fullAriIngest = async (allDepartments: boolean, dryRun: boolean): Promise<string> => {
export const fullAriIngest = async (
allDepartments: boolean,
dryRun: boolean,
reportFormat: I.IngestReportFormat
): Promise<string> => {
const startTime = performance.now();

// Collect all ARIs in a variable.
Expand Down Expand Up @@ -176,7 +194,7 @@ export const fullAriIngest = async (allDepartments: boolean, dryRun: boolean): P
const durationSeconds = Math.round((endTime - startTime) / 100) / 10;

// Write report file.
await ariUtils.ingestReport('file', {
await ariUtils.ingestReport(reportFormat, {
checkedCount: aris.length,
durationSeconds,
createdCount,
Expand All @@ -192,16 +210,21 @@ export const fullAriIngest = async (allDepartments: boolean, dryRun: boolean): P
} ARIs in ${durationSeconds} seconds.`;
};

const ariImport = async (allDepartments: boolean, dryRun: boolean, full: boolean): Promise<string> => {
const ariImport = async (
allDepartments: boolean,
dryRun: boolean,
full: boolean,
reportFormat: I.IngestReportFormat
): Promise<string> => {
if (!full) {
return await integrationService.incrementalAriIngest(dryRun, 'file');
return await integrationService.incrementalAriIngest(dryRun, reportFormat);
} else {
return await fullAriIngest(allDepartments, dryRun);
return await fullAriIngest(allDepartments, dryRun, reportFormat);
}
};

const { importAllDepartments, dryRun, full } = parseArguments();
const { importAllDepartments, dryRun, full, reportFormat } = parseArguments();

ariImport(importAllDepartments, dryRun, full)
ariImport(importAllDepartments, dryRun, full, reportFormat)
.then((message) => console.log(message))
.catch((err) => console.log(err));
8 changes: 8 additions & 0 deletions api/src/components/integration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@ Publications imported via an integration with another system should have the fol

They should also always be owned by an organisational user account. That is, a user with the value `ORGANISATION` for the `role` field.

### Where/how does this run?

On deployed environments, integrations are run in containers on AWS Elastic Container Service. These containers are defined in the infrastructure code (see [Dockerfile](../../../../infra/docker/ariImportRunner/Dockerfile)), so they can be built and tested locally from the `infra/docker/ariImportRunner` directory with `docker compose up` (see [compose.yml](../../../../infra/docker/ariImportRunner/compose.yml)).

They can also be run ad hoc on the local environment via npm scripts, for example (from the `api` directory):

`npm run ariImport -- dryRun=true allDepartments=true full=false`

## Specific integrations

### ARI DB
Expand Down
2 changes: 1 addition & 1 deletion api/src/components/integration/ariUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ export const getParticipatingDepartmentNames = async (): Promise<string[]> => {
};

export const ingestReport = async (
format: 'email' | 'file',
format: I.IngestReportFormat,
ingestDetails: {
checkedCount: number;
durationSeconds: number;
Expand Down
3 changes: 2 additions & 1 deletion api/src/components/integration/service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import * as ariUtils from 'integration/ariUtils';
import * as ecs from 'lib/ecs';
import * as ingestLogService from 'ingestLog/service';
import * as Helpers from 'lib/helpers';
import * as I from 'interface';

/**
* Incremental ARI ingest.
Expand All @@ -12,7 +13,7 @@ import * as Helpers from 'lib/helpers';
* - It encounters an ARI with dateUpdated before the start time of the most
* recent successful ingest (if this start time is available).
*/
export const incrementalAriIngest = async (dryRun: boolean, reportFormat: 'email' | 'file'): Promise<string> => {
export const incrementalAriIngest = async (dryRun: boolean, reportFormat: I.IngestReportFormat): Promise<string> => {
const start = new Date();
const MAX_UNCHANGED_STREAK = 5;
// Get most start time of last successful run to help us know when to stop.
Expand Down
2 changes: 2 additions & 0 deletions api/src/lib/interface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1068,3 +1068,5 @@ export interface HandledARI {
unrecognisedDepartment?: string;
unrecognisedTopics?: string[];
}

export type IngestReportFormat = 'email' | 'file';
16 changes: 16 additions & 0 deletions infra/docker/ariImportRunner/Dockerfile
Copy link

@adwearing-jisc adwearing-jisc Jan 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# syntax=docker/dockerfile:1.7-labs
FROM public.ecr.aws/docker/library/node:20-alpine

RUN apk add \
    ca-certificates \
    curl \
    gnupg \
    git \
    openssl \
    openssl-dev \
    libc6-compat

WORKDIR /app

COPY package.json .
RUN npm i
COPY . .
CMD ["npm", "run", "ariImport", "--", "dryRun=false", "reportFormat=email"]

Alternative way of doing it, removes the mkdir command, WORKDIR does this for you. Splits things into multiple steps so you don't have to redo npm i everytime a file changes. Also seems to be better to set context root to the api folder, and only copy over the api stuff rather than everything (the above example assumes this change).

Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# syntax=docker/dockerfile:1.7-labs

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haven't seen this before, what does it do?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was needed to use the --exclude flag with COPY.

FROM public.ecr.aws/docker/library/node:20-alpine
RUN apk add \
ca-certificates \
curl \
gnupg \
git \
openssl \
openssl-dev \
libc6-compat
RUN mkdir /app
# Build context is project root
COPY --exclude=**/node_modules . /app
WORKDIR /app/api
RUN npm i
CMD ["npm", "run", "ariImport", "--", "dryRun=false", "reportFormat=email"]
11 changes: 11 additions & 0 deletions infra/docker/ariImportRunner/compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# For testing the ARI import runner docker image locally.
services:
ari-import-runner:
build:
context: ../../..
dockerfile: infra/docker/ariImportRunner/Dockerfile
command:
["/bin/sh", "-c", "/app/infra/docker/ariImportRunner/local-ari-import.sh"]
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Override the container's command locally - only difference is to change the report format to a file because the container can't send an email.

env_file: ../../../api/.env
# So it can access the DB and opensearch that are exposed on host ports.
network_mode: host
4 changes: 4 additions & 0 deletions infra/docker/ariImportRunner/local-ari-import.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
cd /app/api
npm run ariImport -- dryRun=false reportFormat=file
cat ari-import-report.txt
2 changes: 0 additions & 2 deletions infra/docker/poc/Dockerfile

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ phases:
build:
commands:
- echo Building docker image...
- docker build --platform linux/arm64 --tag $IMAGE_NAME ./infra/docker/poc
- docker build --platform linux/arm64 --tag $IMAGE_NAME -f ./infra/docker/ariImportRunner/Dockerfile .
- docker tag $IMAGE_NAME:latest $ACCOUNT_ID.dkr.ecr.$DEFAULT_REGION.amazonaws.com/$PROJECT_NAME-$ENVIRONMENT:$IMAGE_NAME
- docker tag $IMAGE_NAME:latest $ACCOUNT_ID.dkr.ecr.$DEFAULT_REGION.amazonaws.com/$PROJECT_NAME-$ENVIRONMENT:$COMMIT_ID
post_build:
Expand Down
4 changes: 2 additions & 2 deletions infra/modules/codepipeline/codebuild.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ resource "aws_codebuild_project" "deploy-docker-image" {

environment {
compute_type = "BUILD_GENERAL1_SMALL"
image = "aws/codebuild/standard:5.0"
type = "LINUX_CONTAINER"
image = "aws/codebuild/amazonlinux-aarch64-standard:3.0"
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The codepipeline to build the image wasn't working until I changed this to build with an ARM based environment.

type = "ARM_CONTAINER"
privileged_mode = true

environment_variable {
Expand Down
40 changes: 37 additions & 3 deletions infra/modules/ecs/iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,31 @@ data "aws_iam_policy_document" "task-exec-policy" {
]
resources = ["*"]
}
statement {
effect = "Allow"
actions = [
"ssm:GetParameters"
]
resources = [
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/base_url_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/datacite_endpoint_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/datacite_password_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/datacite_user_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/doi_prefix_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/db_connection_string_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/elastic_search_protocol_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/elasticsearch_user_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/elasticsearch_password_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/elasticsearch_endpoint_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/email_sender_address_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/ingest_report_recipients_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/mail_server_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/participating_ari_user_ids_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/queue_url_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/slack_channel_email_${var.environment}_${var.project_name}",
"arn:aws:ssm:${local.region_name}:${local.account_id}:parameter/sqs_endpoint_${var.environment}_${var.project_name}"
]
}
}

resource "aws_iam_role_policy" "task-exec-policy" {
Expand All @@ -33,7 +58,7 @@ resource "aws_iam_role_policy" "task-exec-policy" {
policy = data.aws_iam_policy_document.task-exec-policy.json
}

data "aws_iam_policy_document" "ecs-task-role-policy" {
data "aws_iam_policy_document" "ecs-task-assume-role-policy" {
statement {
effect = "Allow"
actions = [
Expand All @@ -45,10 +70,9 @@ data "aws_iam_policy_document" "ecs-task-role-policy" {
}
}
}

resource "aws_iam_role" "ecs-task-role" {
name = "${var.project_name}-ecs-task-role-${var.environment}"
assume_role_policy = data.aws_iam_policy_document.ecs-task-role-policy.json
assume_role_policy = data.aws_iam_policy_document.ecs-task-assume-role-policy.json
}

data "aws_iam_policy_document" "task-policy" {
Expand All @@ -62,6 +86,16 @@ data "aws_iam_policy_document" "task-policy" {
]
resources = ["*"]
}
statement {
effect = "Allow"
actions = [
"ses:SendEmail",
"ses:SendRawEmail"
]
resources = [
"arn:aws:ses:eu-west-1:${local.account_id}:identity/*"
]
}
}

resource "aws_iam_role_policy" "task-policy" {
Expand Down
2 changes: 1 addition & 1 deletion infra/modules/ecs/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
output "task_security_group_id" {
value = aws_security_group.hello-world-task-sg.id
value = aws_security_group.ari-import-task-sg.id
}
Loading
Loading