Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Archive article and reply text to Wayback Machine #344

Merged
merged 9 commits into from
Sep 4, 2024
6 changes: 6 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,9 @@ LOG_REQUESTS=
# It will create the topic, subscription and schema if not exists.
#
ADMIN_PUBSUB_TOPIC=

# Internet Archive S3-Like API key and secret from https://archive.org/account/s3.php
# They are used to call Save Page Now 2 Public API
#
INTERNET_ARCHIVE_S3_ACCESS_KEY=
INTERNET_ARCHIVE_S3_SECRET_KEY=
45 changes: 35 additions & 10 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
"@babel/preset-env": "^7.16.11",
"@babel/preset-typescript": "^7.24.1",
"@google-cloud/storage": "^6.11.0",
"@types/node": "^18",
"@typescript-eslint/eslint-plugin": "^5.56.0",
"@typescript-eslint/parser": "^5.56.0",
"apollo-server-testing": "^2.18.2",
Expand Down
10 changes: 9 additions & 1 deletion src/graphql/mutations/CreateArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import scrapUrls from 'util/scrapUrls';
import { ArticleReferenceInput } from 'graphql/models/ArticleReference';
import MutationResult from 'graphql/models/MutationResult';
import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

/* Instantiate hash function */
const xxhash64 = h64();
Expand Down Expand Up @@ -45,7 +46,9 @@ async function createNewArticle({ text, reference: originalReference, user }) {
appId: user.appId,
};

await client.update({
const {
body: { result },
} = await client.update({
index: 'articles',
type: 'doc',
id: articleId,
Expand Down Expand Up @@ -85,6 +88,11 @@ async function createNewArticle({ text, reference: originalReference, user }) {
refresh: 'true', // Make sure the data is indexed when we create ReplyRequest
});

if (result === 'created') {
// Archive URLs in article and don't wait for the result
archiveUrlsFromText(text);
}

return articleId;
}

Expand Down
5 changes: 5 additions & 0 deletions src/graphql/mutations/CreateMediaArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { ArticleReferenceInput } from 'graphql/models/ArticleReference';
import MutationResult from 'graphql/models/MutationResult';
import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest';
import ArticleTypeEnum from 'graphql/models/ArticleTypeEnum';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

const METADATA = {
cacheControl: 'public, max-age=31536000, immutable',
Expand Down Expand Up @@ -273,6 +274,10 @@ export default {
if (!aiResponse) {
throw new Error('AI transcript not found');
}

// Archive URLs in transcript; don't wait for it
archiveUrlsFromText(aiResponse.text);

return writeAITranscript(articleId, aiResponse.text);
})
.then(() => {
Expand Down
8 changes: 8 additions & 0 deletions src/graphql/mutations/CreateReply.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { assertUser } from 'util/user';

import client from 'util/client';
import scrapUrls from 'util/scrapUrls';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

import ReplyTypeEnum from 'graphql/models/ReplyTypeEnum';
import MutationResult from 'graphql/models/MutationResult';
Expand Down Expand Up @@ -90,6 +91,13 @@ export default {
return _id;
});

// Archive both text and reference.
// No need to wait for the result.
//
newReplyPromise.then(() =>
Promise.all([archiveUrlsFromText(text), archiveUrlsFromText(reference)])
);

const scrapPromise = scrapUrls(`${text} ${reference}`, {
cacheLoader: loaders.urlLoader,
client,
Expand Down
17 changes: 16 additions & 1 deletion src/graphql/mutations/__tests__/CreateArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,15 @@ import MockDate from 'mockdate';
import fixtures, { fixture1Text } from '../__fixtures__/CreateArticle';
import { getReplyRequestId } from '../CreateOrUpdateReplyRequest';
import { getArticleId } from 'graphql/mutations/CreateArticle';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

describe('creation', () => {
beforeEach(() => loadFixtures(fixtures));
beforeEach(async () => {
archiveUrlsFromText.mockClear();
await loadFixtures(fixtures);
});
afterEach(() => unloadFixtures(fixtures));

it('creates articles and a reply request and fills in URLs', async () => {
Expand Down Expand Up @@ -47,6 +53,15 @@ describe('creation', () => {
expect(article.replyRequestCount).toBe(1);
expect(article).toMatchSnapshot();

// Make sure archiveUrlsFromText is called with article text
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"FOO FOO http://foo.com/article/1",
],
]
`);

const replyRequestId = getReplyRequestId({
articleId: data.CreateArticle.id,
userId,
Expand Down
12 changes: 12 additions & 0 deletions src/graphql/mutations/__tests__/CreateMediaArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@ import client from 'util/client';
import fixtures from '../__fixtures__/CreateMediaArticle';
import { getReplyRequestId } from '../CreateOrUpdateReplyRequest';
import mediaManager from 'util/mediaManager';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

jest.mock('util/mediaManager');
jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

describe('creation', () => {
beforeAll(() => loadFixtures(fixtures));
beforeEach(() => {
mediaManager.insert.mockClear();
archiveUrlsFromText.mockClear();
});
afterAll(() => unloadFixtures(fixtures));

Expand Down Expand Up @@ -68,6 +71,15 @@ describe('creation', () => {
]
`);

// Expect archiveUrlsFromText is called with OCR result
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"OCR result of output image",
],
]
`);

const {
body: { _source: article },
} = await client.get({
Expand Down
18 changes: 18 additions & 0 deletions src/graphql/mutations/__tests__/CreateReply.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
jest.mock('util/grpc');
jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

import gql from 'util/GraphQL';
import { loadFixtures, unloadFixtures, resetFrom } from 'util/fixtures';
Expand All @@ -7,9 +8,13 @@ import MockDate from 'mockdate';
import fixtures from '../__fixtures__/CreateReply';
import resolveUrl from 'util/grpc';
import delayForMs from 'util/delayForMs';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

describe('CreateReply', () => {
beforeAll(() => loadFixtures(fixtures));
beforeEach(() => {
archiveUrlsFromText.mockClear();
});

it('creates replies and associates itself with specified article', async () => {
MockDate.set(1485593157011);
Expand Down Expand Up @@ -66,6 +71,19 @@ describe('CreateReply', () => {
});
expect(article._source.articleReplies[0].replyId).toBe(replyId);

// Make sure archiveUrlsFromText is called with text and reference
//
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"FOO FOO",
],
Array [
"http://shouldscrap.com/",
],
]
`);

// Wait until urls are resolved
await delayForMs(1000);
MockDate.reset();
Expand Down
Loading
Loading