Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into OC-923
Browse files Browse the repository at this point in the history
  • Loading branch information
finlay-jisc committed Nov 27, 2024
2 parents 233e080 + 4b2f94c commit 9b2efff
Show file tree
Hide file tree
Showing 11 changed files with 192 additions and 140 deletions.
2 changes: 1 addition & 1 deletion api/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ prod.env
coverage
Octopus.postman_collection.json
dist
full-ari-import-report.txt
ari-import-report.txt
3 changes: 1 addition & 2 deletions api/jest-setup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,5 @@
// way serverless interprets env files.

import * as dotenv from 'dotenv';
import { expand } from 'dotenv-expand';

expand(dotenv.config());
dotenv.config();
17 changes: 1 addition & 16 deletions api/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 10 additions & 10 deletions api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"node": "20.x"
},
"prisma": {
"seed": "ts-node -r tsconfig-paths/register prisma/seed.ts"
"seed": "tsx prisma/seed.ts"
},
"scripts": {
"build": "swc src -d dist --copy-files -s false",
Expand All @@ -32,13 +32,13 @@
"test:local": "STAGE=local jest --runInBand --testTimeout=60000",
"test:watch": "npm run test:local -- --watch",
"type": "tsc --noEmit",
"reindex": "ts-node scripts/reindex.ts",
"createOrganisationalAccounts": "ts-node -r tsconfig-paths/register scripts/createOrganisationalAccounts.ts",
"createTopicMappings": "ts-node -r tsconfig-paths/register scripts/createTopicMappings.ts",
"createTopics": "ts-node -r tsconfig-paths/register scripts/createTopics.ts",
"createUserMappings": "ts-node -r tsconfig-paths/register scripts/createUserMappings.ts",
"fullAriImport": "ts-node -r tsconfig-paths/register scripts/fullAriImport.ts",
"updateOrganisationalAccounts": "ts-node -r tsconfig-paths/register scripts/updateOrganisationalAccounts.ts"
"ariImport": "tsx scripts/ariImport.ts",
"createOrganisationalAccounts": "tsx scripts/createOrganisationalAccounts.ts",
"createTopicMappings": "tsx scripts/createTopicMappings.ts",
"createTopics": "tsx scripts/createTopics.ts",
"createUserMappings": "tsx scripts/createUserMappings.ts",
"reindex": "tsx scripts/reindex.ts",
"updateOrganisationalAccounts": "tsx scripts/updateOrganisationalAccounts.ts"
},
"dependencies": {
"@middy/core": "^4.7.0",
Expand All @@ -50,11 +50,10 @@
"@sparticuz/chromium": "^119.0.2",
"ajv": "^8.12.0",
"ajv-formats": "^2.1.1",
"axios": "^1.7.4",
"aws-lambda": "^1.0.7",
"axios": "^1.7.4",
"cheerio": "^1.0.0-rc.12",
"dotenv": "16.4.5",
"dotenv-expand": "11.0.7",
"html-to-text": "^9.0.5",
"isomorphic-dompurify": "^2.12.0",
"jsonwebtoken": "^9.0.2",
Expand Down Expand Up @@ -98,6 +97,7 @@
"serverless-prune-plugin": "^2.1.0",
"ts-jest": "^29.1.1",
"ts-loader": "^9.4.4",
"tsx": "^4.19.2",
"typescript": "^5.2.2"
},
"optionalDependencies": {
Expand Down
110 changes: 73 additions & 37 deletions api/scripts/fullAriImport.ts → api/scripts/ariImport.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,64 @@
import axios from 'axios';
import * as fs from 'fs/promises';
import * as dotenv from 'dotenv';
import { expand } from 'dotenv-expand';

// Important to do this so that environment variables are treated the same as in deployed code.
expand(dotenv.config());
dotenv.config();

import * as ariUtils from 'integration/ariUtils';
import * as Helpers from 'lib/helpers';
import * as I from 'interface';
import * as integrationService from 'integration/service';

// Can take an argument to run for all departments, rather than just the ones specified in the
// PARTICIPATING_ARI_USER_IDS environment variable.
// npm run fullAriImport -- allDepartments=true
const parseArguments = (): { importAllDepartments: boolean } => {
const checkBooleanArgValue = (arg: string): void => {
if (arg && !(arg === 'true' || arg === 'false')) {
throw new Error(`"${arg}" must be "true" or "false"`);
}
};

/**
* Can take the following arguments:
* - allDepartments: If "true", the script will run for all departments,
* rather than just the ones specified in the PARTICIPATING_ARI_USER_IDS environment variable.
* - Has no effect unless "full" is "true".
* - Default: false
* - dryRun: If "true", the script will not actually create or update any publications,
* and instead report on what it would have done.
* - Default: false
* - full: If "true", the script will import all ARIs from the ARI DB, instead of stopping when it
* thinks it has found all the new ones (the incremental way).
* - Default: false
*
* e.g.:
* npm run ariImport -- allDepartments=true full=true
*/
const parseArguments = (): { importAllDepartments: boolean; dryRun: boolean; full: boolean } => {
const args = Helpers.parseNpmScriptArgs();

for (const arg of Object.keys(args)) {
if (!['allDepartments'].includes(arg)) {
if (!['allDepartments', 'dryRun', 'full'].includes(arg)) {
throw new Error(`Unexpected argument: ${arg}`);
}
}

const allDepartmentsArg = args.allDepartments;
const { allDepartments: allDepartmentsArg, dryRun: dryRunArg, full: fullArg } = args;

if (allDepartmentsArg && !(allDepartmentsArg === 'true' || allDepartmentsArg === 'false')) {
throw new Error('allDepartments must be "true" or "false"');
for (const arg of [allDepartmentsArg, dryRunArg, fullArg]) {
checkBooleanArgValue(arg);
}

return {
importAllDepartments: !!allDepartmentsArg
importAllDepartments: !!allDepartmentsArg,
dryRun: !!dryRunArg,
full: !!fullArg
};
};

const fullAriImport = async (allDepartments?: boolean): Promise<string> => {
/**
* Full ARI ingest.
* Differs from incremental ingest by fetching all ARIs before processing them.
* It will not stop until all ARIs have been processed.
*/
export const fullAriIngest = async (allDepartments: boolean, dryRun: boolean): Promise<string> => {
const startTime = performance.now();

// Collect all ARIs in a variable.
Expand Down Expand Up @@ -87,7 +112,7 @@ const fullAriImport = async (allDepartments?: boolean): Promise<string> => {
const unrecognisedTopics = new Set<string>();

for (const ari of aris) {
const handleAri = await ariUtils.handleIncomingARI(ari);
const handleAri = await ariUtils.handleIncomingARI(ari, dryRun);

if (handleAri.unrecognisedDepartment) {
unrecognisedDepartments.add(handleAri.unrecognisedDepartment);
Expand All @@ -101,16 +126,24 @@ const fullAriImport = async (allDepartments?: boolean): Promise<string> => {
switch (handleAri.actionTaken) {
case 'create':
createdCount++;

// Datacite test has a firewall that only 750 request per IP across a 5 minute period.
// https://support.datacite.org/docs/is-there-a-rate-limit-for-making-requests-against-the-datacite-apis
// 5 minutes = 300 seconds. 300 / 750 = 0.4 seconds per request, so we take minimum 0.5s per hit to be safe.
// We hit datacite twice when creating an ARI, so wait 1 second.
await new Promise((resolve) => setTimeout(resolve, 1000));
if (!dryRun) {
await new Promise((resolve) => setTimeout(resolve, 1000));
}

break;
case 'update':
updatedCount++;

// We hit datacite once when updating an ARI in place, so wait half a second.
await new Promise((resolve) => setTimeout(resolve, 500));
if (!dryRun) {
await new Promise((resolve) => setTimeout(resolve, 500));
}

break;
case 'none':
skippedCount++;
Expand Down Expand Up @@ -140,32 +173,35 @@ const fullAriImport = async (allDepartments?: boolean): Promise<string> => {
}

const endTime = performance.now();
const durationSeconds = Math.round((endTime - startTime) / 100) / 10;

// Write report file.
const duration = ((endTime - startTime) / 1000).toFixed(1);
const unrecognisedDepartmentsArray = Array.from(unrecognisedDepartments).sort();
const unrecognisedTopicsArray = Array.from(unrecognisedTopics).sort();
const reportBody = `\
Duration: ${duration} seconds.
Publications created: ${createdCount}.
Publications updated: ${updatedCount}.
Publications skipped: ${skippedCount}.\
${
unrecognisedDepartmentsArray.length
? '\nUnrecognised departments: "' + unrecognisedDepartmentsArray.join('", "') + '".'
: ''
}\
${unrecognisedTopicsArray.length ? '\nUnrecognised topics: "' + unrecognisedTopicsArray.join('", "') + '".' : ''}
`;
await fs.writeFile('full-ari-import-report.txt', reportBody);

return `Finished. Successfully handled ${aris.length - failed.length} of ${
await ariUtils.ingestReport('file', {
checkedCount: aris.length,
durationSeconds,
createdCount,
updatedCount,
unrecognisedDepartments: Array.from(unrecognisedDepartments).sort(),
unrecognisedTopics: Array.from(unrecognisedTopics).sort(),
dryRun,
full: true
});

return `${dryRun ? 'Dry run' : 'Real run'} finished. Successfully handled ${aris.length - failed.length} of ${
aris.length
} ARIs in ${duration} seconds.`;
} ARIs in ${durationSeconds} seconds.`;
};

const ariImport = async (allDepartments: boolean, dryRun: boolean, full: boolean): Promise<string> => {
if (!full) {
return await integrationService.incrementalAriIngest(dryRun, 'file');
} else {
return await fullAriIngest(allDepartments, dryRun);
}
};

const { importAllDepartments } = parseArguments();
const { importAllDepartments, dryRun, full } = parseArguments();

fullAriImport(importAllDepartments)
ariImport(importAllDepartments, dryRun, full)
.then((message) => console.log(message))
.catch((err) => console.log(err));
87 changes: 87 additions & 0 deletions api/src/components/integration/ariUtils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as client from 'lib/client';
import * as coAuthorService from 'coAuthor/service';
import * as config from 'config';
import * as email from 'email';
import * as Helpers from 'lib/helpers';
import * as I from 'interface';
import * as publicationService from 'publication/service';
Expand All @@ -9,6 +10,7 @@ import * as topicMappingService from 'topicMapping/service';
import * as userMappingService from 'userMapping/service';
import * as userService from 'user/service';

import * as fs from 'fs/promises';
import { Prisma } from '@prisma/client';

const parseAriTextField = (value: string): string => {
Expand Down Expand Up @@ -408,3 +410,88 @@ export const getParticipatingDepartmentNames = async (): Promise<string[]> => {

return queryResults.flatMap((userMappings) => userMappings.map((userMapping) => userMapping.value));
};

export const ingestReport = async (
format: 'email' | 'file',
ingestDetails: {
checkedCount: number;
durationSeconds: number;
createdCount: number;
updatedCount: number;
unrecognisedDepartments: string[];
unrecognisedTopics: string[];
dryRun: boolean;
full: boolean;
}
): Promise<void> => {
const {
checkedCount,
durationSeconds,
createdCount,
updatedCount,
unrecognisedDepartments,
unrecognisedTopics,
dryRun,
full
} = ingestDetails;
const intro = `${full ? 'Full' : 'Incremental'} ARI import ${dryRun ? 'dry ' : ''}run completed.`;
const timingInfo =
`Duration: ${durationSeconds} seconds.` +
(dryRun && (createdCount || updatedCount)
? ` A real run would have taken a minimum of ${
createdCount + updatedCount / 2
} additional seconds due to datacite API rate limits while creating/updating publications.`
: '');
const detailsPrefix = `The ${dryRun ? 'simulated ' : ''}results of this run are as follows.`;
const text = `
${intro}
${timingInfo}
${detailsPrefix}
ARIs checked: ${checkedCount}.
Publications created: ${createdCount}.
Publications updated: ${updatedCount}.
${unrecognisedDepartments.length ? 'Unrecognised departments: "' + unrecognisedDepartments.join('", "') + '".' : ''}
${unrecognisedTopics.length ? 'Unrecognised topics: "' + unrecognisedTopics.join('", "') + '".' : ''}`;

if (format === 'file') {
const fileName = 'ari-import-report.txt';
await fs.writeFile(fileName, text);
console.log(`Report file written to ${fileName}.`);

return;
}

if (format === 'email') {
const cleanDepartments = unrecognisedDepartments.map((department) => Helpers.getSafeHTML(department));
const cleanTopics = unrecognisedTopics.map((topic) => Helpers.getSafeHTML(topic));
const html = `
<html>
<body>
<p>${intro}</p>
<p>${timingInfo}</p>
<p>${detailsPrefix}</p>
<ul>
<li>ARIs checked: ${checkedCount}</li>
<li>Publications created: ${createdCount}</li>
<li>Publications updated: ${updatedCount}</li>
${
cleanDepartments.length
? '<li>Unrecognised departments: <ul><li>' +
cleanDepartments.join('</li><li>') +
'</li></ul></li>'
: ''
}
${
cleanTopics.length
? '<li>Unrecognised topics: <ul><li>' + cleanTopics.join('</li><li>') + '</li></ul></li>'
: ''
}
</ul>
</body>
</html>
`;
await email.ariIngestReport(html, text);

return;
}
};
2 changes: 1 addition & 1 deletion api/src/components/integration/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ export const incrementalAriIngest = async (
}

try {
const ingestResult = await integrationService.incrementalAriIngest(dryRun);
const ingestResult = await integrationService.incrementalAriIngest(dryRun, 'email');

return response.json(
200,
Expand Down
Loading

0 comments on commit 9b2efff

Please sign in to comment.