Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OC-927: Convert incremental ARI ingest into a standalone script #728

Merged
merged 11 commits into from
Nov 26, 2024
2 changes: 1 addition & 1 deletion api/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ prod.env
coverage
Octopus.postman_collection.json
dist
full-ari-import-report.txt
ari-import-report.txt
3 changes: 1 addition & 2 deletions api/jest-setup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,5 @@
// way serverless interprets env files.

import * as dotenv from 'dotenv';
import { expand } from 'dotenv-expand';

expand(dotenv.config());
dotenv.config();
16 changes: 0 additions & 16 deletions api/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@
"test:local": "STAGE=local jest --runInBand --testTimeout=60000",
"test:watch": "npm run test:local -- --watch",
"type": "tsc --noEmit",
"reindex": "ts-node scripts/reindex.ts",
"ariImport": "ts-node -r tsconfig-paths/register scripts/ariImport.ts",
finlay-jisc marked this conversation as resolved.
Show resolved Hide resolved
"createOrganisationalAccounts": "ts-node -r tsconfig-paths/register scripts/createOrganisationalAccounts.ts",
"createTopicMappings": "ts-node -r tsconfig-paths/register scripts/createTopicMappings.ts",
"createTopics": "ts-node -r tsconfig-paths/register scripts/createTopics.ts",
"createUserMappings": "ts-node -r tsconfig-paths/register scripts/createUserMappings.ts",
"fullAriImport": "ts-node -r tsconfig-paths/register scripts/fullAriImport.ts",
"reindex": "ts-node scripts/reindex.ts",
"updateOrganisationalAccounts": "ts-node -r tsconfig-paths/register scripts/updateOrganisationalAccounts.ts"
},
"dependencies": {
Expand All @@ -54,7 +54,6 @@
"aws-lambda": "^1.0.7",
"cheerio": "^1.0.0-rc.12",
"dotenv": "16.4.5",
"dotenv-expand": "11.0.7",
"html-to-text": "^9.0.5",
"isomorphic-dompurify": "^2.12.0",
"jsonwebtoken": "^9.0.2",
Expand Down
110 changes: 73 additions & 37 deletions api/scripts/fullAriImport.ts → api/scripts/ariImport.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,64 @@
import axios from 'axios';
import * as fs from 'fs/promises';
import * as dotenv from 'dotenv';
import { expand } from 'dotenv-expand';

// Important to do this so that environment variables are treated the same as in deployed code.
expand(dotenv.config());
finlay-jisc marked this conversation as resolved.
Show resolved Hide resolved
dotenv.config();

import * as ariUtils from 'integration/ariUtils';
import * as Helpers from 'lib/helpers';
import * as I from 'interface';
import * as integrationService from 'integration/service';

// Can take an argument to run for all departments, rather than just the ones specified in the
// PARTICIPATING_ARI_USER_IDS environment variable.
// npm run fullAriImport -- allDepartments=true
const parseArguments = (): { importAllDepartments: boolean } => {
const checkBooleanArgValue = (arg: string): void => {
if (arg && !(arg === 'true' || arg === 'false')) {
throw new Error(`"${arg}" must be "true" or "false"`);
}
};

/**
* Can take the following arguments:
* - allDepartments: If "true", the script will run for all departments,
* rather than just the ones specified in the PARTICIPATING_ARI_USER_IDS environment variable.
* - Has no effect unless "full" is "true".
* - Default: false
* - dryRun: If "true", the script will not actually create or update any publications,
* and instead report on what it would have done.
* - Default: false
* - full: If "true", the script will import all ARIs from the ARI DB, instead of stopping when it
* thinks it has found all the new ones (the incremental way).
* - Default: false
*
* e.g.:
* npm run ariImport -- allDepartments=true full=true
*/
const parseArguments = (): { importAllDepartments: boolean; dryRun: boolean; full: boolean } => {
const args = Helpers.parseNpmScriptArgs();

for (const arg of Object.keys(args)) {
if (!['allDepartments'].includes(arg)) {
if (!['allDepartments', 'dryRun', 'full'].includes(arg)) {
throw new Error(`Unexpected argument: ${arg}`);
}
}

const allDepartmentsArg = args.allDepartments;
const { allDepartments: allDepartmentsArg, dryRun: dryRunArg, full: fullArg } = args;

if (allDepartmentsArg && !(allDepartmentsArg === 'true' || allDepartmentsArg === 'false')) {
throw new Error('allDepartments must be "true" or "false"');
for (const arg of [allDepartmentsArg, dryRunArg, fullArg]) {
checkBooleanArgValue(arg);
}

return {
importAllDepartments: !!allDepartmentsArg
importAllDepartments: !!allDepartmentsArg,
dryRun: !!dryRunArg,
full: !!fullArg
};
};

const fullAriImport = async (allDepartments?: boolean): Promise<string> => {
/**
* Full ARI ingest.
* Differs from incremental ingest by fetching all ARIs before processing them.
* It will not stop until all ARIs have been processed.
*/
export const fullAriIngest = async (allDepartments: boolean, dryRun: boolean): Promise<string> => {
const startTime = performance.now();

// Collect all ARIs in a variable.
Expand Down Expand Up @@ -87,7 +112,7 @@ const fullAriImport = async (allDepartments?: boolean): Promise<string> => {
const unrecognisedTopics = new Set<string>();

for (const ari of aris) {
const handleAri = await ariUtils.handleIncomingARI(ari);
const handleAri = await ariUtils.handleIncomingARI(ari, dryRun);

if (handleAri.unrecognisedDepartment) {
unrecognisedDepartments.add(handleAri.unrecognisedDepartment);
Expand All @@ -101,16 +126,24 @@ const fullAriImport = async (allDepartments?: boolean): Promise<string> => {
switch (handleAri.actionTaken) {
case 'create':
createdCount++;

// Datacite test has a firewall that only 750 request per IP across a 5 minute period.
// https://support.datacite.org/docs/is-there-a-rate-limit-for-making-requests-against-the-datacite-apis
// 5 minutes = 300 seconds. 300 / 750 = 0.4 seconds per request, so we take minimum 0.5s per hit to be safe.
// We hit datacite twice when creating an ARI, so wait 1 second.
await new Promise((resolve) => setTimeout(resolve, 1000));
if (!dryRun) {
await new Promise((resolve) => setTimeout(resolve, 1000));
}

break;
case 'update':
updatedCount++;

// We hit datacite once when updating an ARI in place, so wait half a second.
await new Promise((resolve) => setTimeout(resolve, 500));
if (!dryRun) {
await new Promise((resolve) => setTimeout(resolve, 500));
}

break;
case 'none':
skippedCount++;
Expand Down Expand Up @@ -140,32 +173,35 @@ const fullAriImport = async (allDepartments?: boolean): Promise<string> => {
}

const endTime = performance.now();
const durationSeconds = Math.round((endTime - startTime) / 100) / 10;

// Write report file.
const duration = ((endTime - startTime) / 1000).toFixed(1);
const unrecognisedDepartmentsArray = Array.from(unrecognisedDepartments).sort();
const unrecognisedTopicsArray = Array.from(unrecognisedTopics).sort();
const reportBody = `\
Duration: ${duration} seconds.
Publications created: ${createdCount}.
Publications updated: ${updatedCount}.
Publications skipped: ${skippedCount}.\
${
unrecognisedDepartmentsArray.length
? '\nUnrecognised departments: "' + unrecognisedDepartmentsArray.join('", "') + '".'
: ''
}\
${unrecognisedTopicsArray.length ? '\nUnrecognised topics: "' + unrecognisedTopicsArray.join('", "') + '".' : ''}
`;
await fs.writeFile('full-ari-import-report.txt', reportBody);

return `Finished. Successfully handled ${aris.length - failed.length} of ${
await ariUtils.ingestReport('file', {
checkedCount: aris.length,
durationSeconds,
createdCount,
updatedCount,
unrecognisedDepartments: Array.from(unrecognisedDepartments).sort(),
unrecognisedTopics: Array.from(unrecognisedTopics).sort(),
dryRun,
full: true
});

return `${dryRun ? 'Dry run' : 'Real run'} finished. Successfully handled ${aris.length - failed.length} of ${
aris.length
} ARIs in ${duration} seconds.`;
} ARIs in ${durationSeconds} seconds.`;
};

const ariImport = async (allDepartments: boolean, dryRun: boolean, full: boolean): Promise<string> => {
if (!full) {
return await integrationService.incrementalAriIngest(dryRun, 'file');
} else {
return await fullAriIngest(allDepartments, dryRun);
}
};

const { importAllDepartments } = parseArguments();
const { importAllDepartments, dryRun, full } = parseArguments();

fullAriImport(importAllDepartments)
ariImport(importAllDepartments, dryRun, full)
.then((message) => console.log(message))
.catch((err) => console.log(err));
87 changes: 87 additions & 0 deletions api/src/components/integration/ariUtils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as client from 'lib/client';
import * as coAuthorService from 'coAuthor/service';
import * as config from 'config';
import * as email from 'email';
import * as Helpers from 'lib/helpers';
import * as I from 'interface';
import * as publicationService from 'publication/service';
Expand All @@ -9,6 +10,7 @@ import * as topicMappingService from 'topicMapping/service';
import * as userMappingService from 'userMapping/service';
import * as userService from 'user/service';

import * as fs from 'fs/promises';
import { Prisma } from '@prisma/client';

const parseAriTextField = (value: string): string => {
Expand Down Expand Up @@ -408,3 +410,88 @@ export const getParticipatingDepartmentNames = async (): Promise<string[]> => {

return queryResults.flatMap((userMappings) => userMappings.map((userMapping) => userMapping.value));
};

export const ingestReport = async (
format: 'email' | 'file',
ingestDetails: {
checkedCount: number;
durationSeconds: number;
createdCount: number;
updatedCount: number;
unrecognisedDepartments: string[];
unrecognisedTopics: string[];
dryRun: boolean;
full: boolean;
}
): Promise<void> => {
const {
checkedCount,
durationSeconds,
createdCount,
updatedCount,
unrecognisedDepartments,
unrecognisedTopics,
dryRun,
full
} = ingestDetails;
const intro = `${full ? 'Full' : 'Incremental'} ARI import ${dryRun ? 'dry ' : ''}run completed.`;
const timingInfo =
`Duration: ${durationSeconds} seconds.` +
(dryRun && (createdCount || updatedCount)
? ` A real run would have taken a minimum of ${
createdCount + updatedCount / 2
} additional seconds due to datacite API rate limits while creating/updating publications.`
: '');
const detailsPrefix = `The ${dryRun ? 'simulated ' : ''}results of this run are as follows.`;
const text = `
${intro}
${timingInfo}
${detailsPrefix}
ARIs checked: ${checkedCount}.
Publications created: ${createdCount}.
Publications updated: ${updatedCount}.
${unrecognisedDepartments.length ? 'Unrecognised departments: "' + unrecognisedDepartments.join('", "') + '".' : ''}
${unrecognisedTopics.length ? 'Unrecognised topics: "' + unrecognisedTopics.join('", "') + '".' : ''}`;

if (format === 'file') {
const fileName = 'ari-import-report.txt';
await fs.writeFile(fileName, text);
console.log(`Report file written to ${fileName}.`);

return;
}

if (format === 'email') {
const cleanDepartments = unrecognisedDepartments.map((department) => Helpers.getSafeHTML(department));
const cleanTopics = unrecognisedTopics.map((topic) => Helpers.getSafeHTML(topic));
const html = `
<html>
<body>
<p>${intro}</p>
<p>${timingInfo}</p>
<p>${detailsPrefix}</p>
<ul>
<li>ARIs checked: ${checkedCount}</li>
<li>Publications created: ${createdCount}</li>
<li>Publications updated: ${updatedCount}</li>
${
cleanDepartments.length
? '<li>Unrecognised departments: <ul><li>' +
cleanDepartments.join('</li><li>') +
'</li></ul></li>'
: ''
}
${
cleanTopics.length
? '<li>Unrecognised topics: <ul><li>' + cleanTopics.join('</li><li>') + '</li></ul></li>'
: ''
}
</ul>
</body>
</html>
`;
await email.ariIngestReport(html, text);

return;
}
};
2 changes: 1 addition & 1 deletion api/src/components/integration/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ export const incrementalAriIngest = async (
}

try {
const ingestResult = await integrationService.incrementalAriIngest(dryRun);
const ingestResult = await integrationService.incrementalAriIngest(dryRun, 'email');

return response.json(
200,
Expand Down
8 changes: 4 additions & 4 deletions api/src/components/integration/service.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import axios from 'axios';
import * as ariUtils from 'integration/ariUtils';
import * as email from 'lib/email';
import * as ingestLogService from 'ingestLog/service';

/**
Expand All @@ -11,7 +10,7 @@ import * as ingestLogService from 'ingestLog/service';
* - It encounters an ARI with dateUpdated before the start time of the most
* recent successful ingest (if this start time is available).
*/
export const incrementalAriIngest = async (dryRun: boolean): Promise<string> => {
export const incrementalAriIngest = async (dryRun: boolean, reportFormat: 'email' | 'file'): Promise<string> => {
const start = new Date();
const MAX_UNCHANGED_STREAK = 5;
// Get most start time of last successful run to help us know when to stop.
Expand Down Expand Up @@ -135,14 +134,15 @@ export const incrementalAriIngest = async (dryRun: boolean): Promise<string> =>
await ingestLogService.setEndTime(logId, end);
}

await email.incrementalAriIngestReport({
await ariUtils.ingestReport(reportFormat, {
checkedCount,
durationSeconds,
createdCount,
updatedCount,
unrecognisedDepartments: Array.from(unrecognisedDepartments).sort(),
unrecognisedTopics: Array.from(unrecognisedTopics).sort(),
dryRun
dryRun,
full: false
});

const writeCount = createdCount + updatedCount;
Expand Down
Loading