From 6885f8946bec80152402b4894a4d74c766b870a6 Mon Sep 17 00:00:00 2001 From: Pavlo Karatsiuba Date: Tue, 4 Apr 2023 16:50:42 +0200 Subject: [PATCH 1/3] To download media files used additional redis storage. Clearing redis media storage after each dump --- src/RedisStore.ts | 22 ++++- src/mwoffliner.lib.ts | 6 +- src/types.d.ts | 1 + src/util/saveArticles.ts | 6 +- test/e2e/multimediaContent.test.ts | 124 +++++++++++++++++++++++++++++ test/unit/bootstrap.ts | 4 +- 6 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 test/e2e/multimediaContent.test.ts diff --git a/src/RedisStore.ts b/src/RedisStore.ts index 8206e27ae..bef017b97 100644 --- a/src/RedisStore.ts +++ b/src/RedisStore.ts @@ -8,6 +8,7 @@ class RedisStore implements RS { private storesReady: boolean private _filesToDownloadXPath: RKVS + private _mediaToDownloadXPath: RKVS private _filesToRetryXPath: RKVS private _articleDetailXId: RKVS private _redirectsXId: RKVS @@ -58,8 +59,15 @@ class RedisStore implements RS { } } + public async flushMediaToDownloadXPath() { + if (this._client.isReady && this.storesReady) { + logger.log('Flushing Redis DB for storing media') + await this._mediaToDownloadXPath.flush() + } + } + public async checkForExistingStores() { - const patterns = ['*-media', '*-media-retry', '*-detail', '*-redirect'] + const patterns = ['*-media', '*-files', '*-media-retry', '*-detail', '*-redirect'] let keys: string[] = [] for (const pattern of patterns) { keys = keys.concat(await this._client.keys(pattern)) @@ -77,7 +85,13 @@ class RedisStore implements RS { } private async populateStores() { - this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, { + this._mediaToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, { + u: 'url', + n: 'namespace', + m: 'mult', + w: 'width', + }) + this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-files`, { u: 'url', n: 'namespace', m: 'mult', @@ -119,6 +133,10 @@ class RedisStore implements RS { return this._filesToDownloadXPath } + public get mediaToDownloadXPath(): RKVS { + return this._mediaToDownloadXPath + } + public get filesToRetryXPath(): RKVS { return this._filesToRetryXPath } diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index 980d72d99..818cf4054 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -208,7 +208,7 @@ async function execute(argv: any) { const redisStore = new RedisStore(argv.redis || config.defaults.redisPath) await redisStore.connect() - const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore + const { articleDetailXId, filesToDownloadXPath, mediaToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore // Output directory const outputDirectory = path.isAbsolute(_outputDirectory || '') ? _outputDirectory : path.join(process.cwd(), _outputDirectory || 'out') @@ -346,6 +346,7 @@ async function execute(argv: any) { } else { try { await doDump(dump) + await mediaToDownloadXPath.flush() } catch (err) { debugger throw err @@ -440,6 +441,7 @@ async function execute(argv: any) { ) await downloadFiles(filesToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader) + await downloadFiles(mediaToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader) logger.log('Writing Article Redirects') await writeArticleRedirects(downloader, dump, zimCreator) @@ -616,7 +618,7 @@ async function execute(argv: any) { articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I') await Promise.all([ - filesToDownloadXPath.set(path, { url: downloader.serializeUrl(suitableResUrl), mult, width } as FileDetail), + mediaToDownloadXPath.set(path, { url: downloader.serializeUrl(suitableResUrl), mult, width } as FileDetail), articleDetailXId.set(articleId, articleDetail), ]) articlesWithImages++ diff --git a/src/types.d.ts b/src/types.d.ts index 50ffb8156..cb891729d 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -69,6 +69,7 @@ interface RKVS { // RedisStore Interface interface RS { readonly client: any // RedisClientType + readonly mediaToDownloadXPath: RKVS readonly filesToDownloadXPath: RKVS readonly filesToRetryXPath: RKVS readonly articleDetailXId: RKVS diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index 93bfb5db9..f4724b0a0 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -194,20 +194,21 @@ async function saveArticle( const { finalHTML, mediaDependencies, subtitles } = await processArticleHtml(articleHtml, redisStore, mw, dump, articleId, articleDetail, _moduleDependencies, downloader.webp) const filesToDownload: KVS = {} + const mediaToDownload: KVS = {} subtitles.forEach((s) => { filesToDownload[s.path] = { url: s.url, namespace: '-' } }) if (mediaDependencies.length) { - const existingVals = await redisStore.filesToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path)) + const existingVals = await redisStore.mediaToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path)) for (const dep of mediaDependencies) { const { mult, width } = getSizeFromUrl(dep.url) const existingVal = existingVals[dep.path] const currentDepIsHigherRes = !existingVal || existingVal.width < (width || 10e6) || existingVal.mult < (mult || 1) if (currentDepIsHigherRes) { - filesToDownload[dep.path] = { + mediaToDownload[dep.path] = { url: downloader.serializeUrl(dep.url), mult, width, @@ -216,6 +217,7 @@ async function saveArticle( } } + await redisStore.mediaToDownloadXPath.setMany(mediaToDownload) await redisStore.filesToDownloadXPath.setMany(filesToDownload) const zimArticle = new ZimArticle({ diff --git a/test/e2e/multimediaContent.test.ts b/test/e2e/multimediaContent.test.ts new file mode 100644 index 000000000..511a3280f --- /dev/null +++ b/test/e2e/multimediaContent.test.ts @@ -0,0 +1,124 @@ +import * as mwoffliner from '../../src/mwoffliner.lib.js' +import { execa } from 'execa' +import rimraf from 'rimraf' +import { zimcheckAvailable, zimcheck, zimdumpAvailable, zimdump } from '../util.js' +import 'dotenv/config' +import { jest } from '@jest/globals' + +jest.setTimeout(60000) + +describe('Multimedia', () => { + const now = new Date() + const testId = `mwo-test-${+now}` + + const parameters = { + mwUrl: 'https://en.m.wikipedia.org', + adminEmail: 'test@kiwix.org', + articleList: 'User:Kelson/MWoffliner_CI_reference', + outputDirectory: testId, + redis: process.env.REDIS, + customZimDescription: 'Example of the description', + } + + test('check multimedia content from wikipedia test page', async () => { + await execa('redis-cli flushall', { shell: true }) + + const [dump] = await mwoffliner.execute(parameters) + + expect(dump.status.articles.success).toEqual(1) + expect(dump.status.articles.fail).toEqual(0) + + if (await zimcheckAvailable()) { + await expect(zimcheck(dump.outFile)).resolves.not.toThrowError() + } else { + console.log('Zimcheck not installed, skipping test') + } + + if (await zimdumpAvailable()) { + const mediaFiles = await zimdump(`list --ns I ${dump.outFile}`) + + expect(mediaFiles.split('\n').sort()).toEqual( + [ + 'I/Kiwix_-_WikiArabia_Cairo_2017.pdf', + 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.120p.vp9.webm', + 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.jpg', + 'I/Kiwix_icon.svg.png', + 'I/Local_Forecast_-_Elevator_(ISRC_USUAN1300012).mp3.ogg', + 'I/page1-120px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg', + 'I/page1-1500px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg', + ].sort(), + ) + } else { + console.log('Zimcheck not installed, skipping test') + } + + rimraf.sync(`./${testId}`) + const redisScan = await execa('redis-cli --scan', { shell: true }) + // Redis has been cleared + expect(redisScan.stdout).toEqual('') + }) + + test('check multimedia content from wikipedia test page with different formates', async () => { + await execa('redis-cli flushall', { shell: true }) + const dumps = await mwoffliner.execute({ ...parameters, format: ['nopic', 'novid', 'nopdf', 'nodet'] }) + + expect(dumps).toHaveLength(4) + for (const dump of dumps) { + expect(dump.status.articles.success).toEqual(1) + expect(dump.status.articles.fail).toEqual(0) + + if (await zimcheckAvailable()) { + await expect(zimcheck(dump.outFile)).resolves.not.toThrowError() + } else { + console.log('Zimcheck not installed, skipping test') + } + + if (await zimdumpAvailable()) { + const mediaFiles = await zimdump(`list --ns I ${dump.outFile}`) + if (dump.nopic) { + expect(mediaFiles.split('\n').sort()).toEqual( + [ + 'I/Kiwix_-_WikiArabia_Cairo_2017.pdf', + // 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.120p.vp9.webm', // these files were omitted by nopic parameter + // 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.jpg', + // 'I/Kiwix_icon.svg.png', + // 'I/Local_Forecast_-_Elevator_(ISRC_USUAN1300012).mp3.ogg', + // 'I/page1-120px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg', + // 'I/page1-1500px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg', + ].sort(), + ) + } else if (dump.novid) { + expect(mediaFiles.split('\n').sort()).toEqual( + [ + 'I/Kiwix_-_WikiArabia_Cairo_2017.pdf', + // 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.120p.vp9.webm', // these files were omitted by novid parameter + // 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.jpg', + 'I/Kiwix_icon.svg.png', + // 'I/Local_Forecast_-_Elevator_(ISRC_USUAN1300012).mp3.ogg', + 'I/page1-120px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg', + 'I/page1-1500px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg', + ].sort(), + ) + } else if (dump.nopdf) { + expect(mediaFiles.split('\n').sort()).toEqual( + [ + // 'I/Kiwix_-_WikiArabia_Cairo_2017.pdf', // this file was omitted by nopdf parameter + 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.120p.vp9.webm', + 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.jpg', + 'I/Kiwix_icon.svg.png', + 'I/Local_Forecast_-_Elevator_(ISRC_USUAN1300012).mp3.ogg', + 'I/page1-120px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg', + 'I/page1-1500px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg', + ].sort(), + ) + } + } else { + console.log('Zimcheck not installed, skipping test') + } + } + rimraf.sync(`./${testId}`) + const redisScan = await execa('redis-cli --scan', { shell: true }) + // Redis has been cleared + expect(redisScan.stdout).toEqual('') + }) +}) diff --git a/test/unit/bootstrap.ts b/test/unit/bootstrap.ts index 9875b089c..10b0c18ec 100644 --- a/test/unit/bootstrap.ts +++ b/test/unit/bootstrap.ts @@ -9,8 +9,8 @@ export const redisStore = new RedisStore(process.env.REDIS || config.defaults.re export const startRedis = async () => { await redisStore.connect() - const { articleDetailXId, redirectsXId, filesToDownloadXPath, filesToRetryXPath } = redisStore - await Promise.all([articleDetailXId.flush(), redirectsXId.flush(), filesToDownloadXPath.flush(), filesToRetryXPath.flush()]) + const { articleDetailXId, redirectsXId, filesToDownloadXPath, mediaToDownloadXPath, filesToRetryXPath } = redisStore + await Promise.all([articleDetailXId.flush(), redirectsXId.flush(), mediaToDownloadXPath.flush(), filesToDownloadXPath.flush(), filesToRetryXPath.flush()]) } export const stopRedis = async () => { From 2a562a3a6d85623b407c772615177c279125da43 Mon Sep 17 00:00:00 2001 From: Pavlo Karatsiuba Date: Mon, 10 Apr 2023 18:29:22 +0200 Subject: [PATCH 2/3] Save subtitles also to the Redis media storage. --- src/util/saveArticles.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index f4724b0a0..76f24a97f 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -193,11 +193,10 @@ async function saveArticle( try { const { finalHTML, mediaDependencies, subtitles } = await processArticleHtml(articleHtml, redisStore, mw, dump, articleId, articleDetail, _moduleDependencies, downloader.webp) - const filesToDownload: KVS = {} const mediaToDownload: KVS = {} subtitles.forEach((s) => { - filesToDownload[s.path] = { url: s.url, namespace: '-' } + mediaToDownload[s.path] = { url: s.url, namespace: '-' } }) if (mediaDependencies.length) { @@ -218,7 +217,6 @@ async function saveArticle( } await redisStore.mediaToDownloadXPath.setMany(mediaToDownload) - await redisStore.filesToDownloadXPath.setMany(filesToDownload) const zimArticle = new ZimArticle({ url: articleId, From a1c0eb712fb832fa0187c8f716189ef90f2b0e12 Mon Sep 17 00:00:00 2001 From: Pavlo Karatsiuba Date: Wed, 12 Apr 2023 14:12:58 +0200 Subject: [PATCH 3/3] Clearing the Redis storage with media files and CSS after each iteration. --- src/RedisStore.ts | 22 ++-------------------- src/mwoffliner.lib.ts | 7 +++---- src/types.d.ts | 1 - src/util/dump.ts | 2 +- src/util/saveArticles.ts | 10 +++++----- test/unit/bootstrap.ts | 4 ++-- 6 files changed, 13 insertions(+), 33 deletions(-) diff --git a/src/RedisStore.ts b/src/RedisStore.ts index bef017b97..8206e27ae 100644 --- a/src/RedisStore.ts +++ b/src/RedisStore.ts @@ -8,7 +8,6 @@ class RedisStore implements RS { private storesReady: boolean private _filesToDownloadXPath: RKVS - private _mediaToDownloadXPath: RKVS private _filesToRetryXPath: RKVS private _articleDetailXId: RKVS private _redirectsXId: RKVS @@ -59,15 +58,8 @@ class RedisStore implements RS { } } - public async flushMediaToDownloadXPath() { - if (this._client.isReady && this.storesReady) { - logger.log('Flushing Redis DB for storing media') - await this._mediaToDownloadXPath.flush() - } - } - public async checkForExistingStores() { - const patterns = ['*-media', '*-files', '*-media-retry', '*-detail', '*-redirect'] + const patterns = ['*-media', '*-media-retry', '*-detail', '*-redirect'] let keys: string[] = [] for (const pattern of patterns) { keys = keys.concat(await this._client.keys(pattern)) @@ -85,13 +77,7 @@ class RedisStore implements RS { } private async populateStores() { - this._mediaToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, { - u: 'url', - n: 'namespace', - m: 'mult', - w: 'width', - }) - this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-files`, { + this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, { u: 'url', n: 'namespace', m: 'mult', @@ -133,10 +119,6 @@ class RedisStore implements RS { return this._filesToDownloadXPath } - public get mediaToDownloadXPath(): RKVS { - return this._mediaToDownloadXPath - } - public get filesToRetryXPath(): RKVS { return this._filesToRetryXPath } diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index 818cf4054..4c4c6445c 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -208,7 +208,7 @@ async function execute(argv: any) { const redisStore = new RedisStore(argv.redis || config.defaults.redisPath) await redisStore.connect() - const { articleDetailXId, filesToDownloadXPath, mediaToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore + const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore // Output directory const outputDirectory = path.isAbsolute(_outputDirectory || '') ? _outputDirectory : path.join(process.cwd(), _outputDirectory || 'out') @@ -346,7 +346,7 @@ async function execute(argv: any) { } else { try { await doDump(dump) - await mediaToDownloadXPath.flush() + await filesToDownloadXPath.flush() } catch (err) { debugger throw err @@ -441,7 +441,6 @@ async function execute(argv: any) { ) await downloadFiles(filesToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader) - await downloadFiles(mediaToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader) logger.log('Writing Article Redirects') await writeArticleRedirects(downloader, dump, zimCreator) @@ -618,7 +617,7 @@ async function execute(argv: any) { articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I') await Promise.all([ - mediaToDownloadXPath.set(path, { url: downloader.serializeUrl(suitableResUrl), mult, width } as FileDetail), + filesToDownloadXPath.set(path, { url: downloader.serializeUrl(suitableResUrl), mult, width } as FileDetail), articleDetailXId.set(articleId, articleDetail), ]) articlesWithImages++ diff --git a/src/types.d.ts b/src/types.d.ts index cb891729d..50ffb8156 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -69,7 +69,6 @@ interface RKVS { // RedisStore Interface interface RS { readonly client: any // RedisClientType - readonly mediaToDownloadXPath: RKVS readonly filesToDownloadXPath: RKVS readonly filesToRetryXPath: RKVS readonly articleDetailXId: RKVS diff --git a/src/util/dump.ts b/src/util/dump.ts index 24fef9605..37e82e36b 100644 --- a/src/util/dump.ts +++ b/src/util/dump.ts @@ -45,7 +45,7 @@ export async function getAndProcessStylesheets(downloader: Downloader, redisStor while ((match = cssUrlRegexp.exec(body))) { let url = match[1] - /* Avoid 'data', so no url dependency */ + /* Avoid 'data', so no URL dependency */ if (!url.match('^data')) { const filePathname = urlParser.parse(url, false, true).pathname if (filePathname) { diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index 76f24a97f..93bfb5db9 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -193,21 +193,21 @@ async function saveArticle( try { const { finalHTML, mediaDependencies, subtitles } = await processArticleHtml(articleHtml, redisStore, mw, dump, articleId, articleDetail, _moduleDependencies, downloader.webp) - const mediaToDownload: KVS = {} + const filesToDownload: KVS = {} subtitles.forEach((s) => { - mediaToDownload[s.path] = { url: s.url, namespace: '-' } + filesToDownload[s.path] = { url: s.url, namespace: '-' } }) if (mediaDependencies.length) { - const existingVals = await redisStore.mediaToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path)) + const existingVals = await redisStore.filesToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path)) for (const dep of mediaDependencies) { const { mult, width } = getSizeFromUrl(dep.url) const existingVal = existingVals[dep.path] const currentDepIsHigherRes = !existingVal || existingVal.width < (width || 10e6) || existingVal.mult < (mult || 1) if (currentDepIsHigherRes) { - mediaToDownload[dep.path] = { + filesToDownload[dep.path] = { url: downloader.serializeUrl(dep.url), mult, width, @@ -216,7 +216,7 @@ async function saveArticle( } } - await redisStore.mediaToDownloadXPath.setMany(mediaToDownload) + await redisStore.filesToDownloadXPath.setMany(filesToDownload) const zimArticle = new ZimArticle({ url: articleId, diff --git a/test/unit/bootstrap.ts b/test/unit/bootstrap.ts index 10b0c18ec..9875b089c 100644 --- a/test/unit/bootstrap.ts +++ b/test/unit/bootstrap.ts @@ -9,8 +9,8 @@ export const redisStore = new RedisStore(process.env.REDIS || config.defaults.re export const startRedis = async () => { await redisStore.connect() - const { articleDetailXId, redirectsXId, filesToDownloadXPath, mediaToDownloadXPath, filesToRetryXPath } = redisStore - await Promise.all([articleDetailXId.flush(), redirectsXId.flush(), mediaToDownloadXPath.flush(), filesToDownloadXPath.flush(), filesToRetryXPath.flush()]) + const { articleDetailXId, redirectsXId, filesToDownloadXPath, filesToRetryXPath } = redisStore + await Promise.all([articleDetailXId.flush(), redirectsXId.flush(), filesToDownloadXPath.flush(), filesToRetryXPath.flush()]) } export const stopRedis = async () => {