Skip to content

Commit

Permalink
Clearing the Redis storage with media files and CSS after each iteration.
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-karatsiuba authored and kelson42 committed May 23, 2023
1 parent 2a562a3 commit 082c542
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 33 deletions.
22 changes: 2 additions & 20 deletions src/RedisStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ class RedisStore implements RS {
private storesReady: boolean

private _filesToDownloadXPath: RKVS<FileDetail>
private _mediaToDownloadXPath: RKVS<FileDetail>
private _filesToRetryXPath: RKVS<FileDetail>
private _articleDetailXId: RKVS<ArticleDetail>
private _redirectsXId: RKVS<ArticleRedirect>
Expand Down Expand Up @@ -59,15 +58,8 @@ class RedisStore implements RS {
}
}

public async flushMediaToDownloadXPath() {
if (this._client.isReady && this.storesReady) {
logger.log('Flushing Redis DB for storing media')
await this._mediaToDownloadXPath.flush()
}
}

public async checkForExistingStores() {
const patterns = ['*-media', '*-files', '*-media-retry', '*-detail', '*-redirect']
const patterns = ['*-media', '*-media-retry', '*-detail', '*-redirect']
let keys: string[] = []
for (const pattern of patterns) {
keys = keys.concat(await this._client.keys(pattern))
Expand All @@ -85,13 +77,7 @@ class RedisStore implements RS {
}

private async populateStores() {
this._mediaToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, {
u: 'url',
n: 'namespace',
m: 'mult',
w: 'width',
})
this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-files`, {
this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, {
u: 'url',
n: 'namespace',
m: 'mult',
Expand Down Expand Up @@ -133,10 +119,6 @@ class RedisStore implements RS {
return this._filesToDownloadXPath
}

public get mediaToDownloadXPath(): RKVS<FileDetail> {
return this._mediaToDownloadXPath
}

public get filesToRetryXPath(): RKVS<FileDetail> {
return this._filesToRetryXPath
}
Expand Down
7 changes: 3 additions & 4 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ async function execute(argv: any) {

const redisStore = new RedisStore(argv.redis || config.defaults.redisPath)
await redisStore.connect()
const { articleDetailXId, filesToDownloadXPath, mediaToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore
const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore

// Output directory
const outputDirectory = path.isAbsolute(_outputDirectory || '') ? _outputDirectory : path.join(process.cwd(), _outputDirectory || 'out')
Expand Down Expand Up @@ -346,7 +346,7 @@ async function execute(argv: any) {
} else {
try {
await doDump(dump)
await mediaToDownloadXPath.flush()
await filesToDownloadXPath.flush()
} catch (err) {
debugger
throw err
Expand Down Expand Up @@ -441,7 +441,6 @@ async function execute(argv: any) {
)

await downloadFiles(filesToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader)
await downloadFiles(mediaToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader)

logger.log('Writing Article Redirects')
await writeArticleRedirects(downloader, dump, zimCreator)
Expand Down Expand Up @@ -618,7 +617,7 @@ async function execute(argv: any) {
articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I')

await Promise.all([
mediaToDownloadXPath.set(path, { url: downloader.serializeUrl(suitableResUrl), mult, width } as FileDetail),
filesToDownloadXPath.set(path, { url: downloader.serializeUrl(suitableResUrl), mult, width } as FileDetail),
articleDetailXId.set(articleId, articleDetail),
])
articlesWithImages++
Expand Down
1 change: 0 additions & 1 deletion src/types.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ interface RKVS<T> {
// RedisStore Interface
interface RS {
readonly client: any // RedisClientType
readonly mediaToDownloadXPath: RKVS<FileDetail>
readonly filesToDownloadXPath: RKVS<FileDetail>
readonly filesToRetryXPath: RKVS<FileDetail>
readonly articleDetailXId: RKVS<ArticleDetail>
Expand Down
3 changes: 2 additions & 1 deletion src/util/dump.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ export async function getAndProcessStylesheets(downloader: Downloader, redisStor
while ((match = cssUrlRegexp.exec(body))) {
let url = match[1]

/* Avoid 'data', so no url dependency */
/* Avoid 'data', so no URL dependency */
if (!url.match('^data')) {
const filePathname = urlParser.parse(url, false, true).pathname
if (filePathname) {
Expand All @@ -60,6 +60,7 @@ export async function getAndProcessStylesheets(downloader: Downloader, redisStor

/* Download CSS dependency, but avoid duplicate calls */
if (!downloader.cssDependenceUrls.hasOwnProperty(url) && filename) {
if (filename) {
downloader.cssDependenceUrls[url] = true
filesToDownloadXPath.set(config.output.dirs.mediawiki + '/' + filename, { url: downloader.serializeUrl(url), namespace: '-' })
}
Expand Down
10 changes: 5 additions & 5 deletions src/util/saveArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -193,21 +193,21 @@ async function saveArticle(
try {
const { finalHTML, mediaDependencies, subtitles } = await processArticleHtml(articleHtml, redisStore, mw, dump, articleId, articleDetail, _moduleDependencies, downloader.webp)

const mediaToDownload: KVS<FileDetail> = {}
const filesToDownload: KVS<FileDetail> = {}

subtitles.forEach((s) => {
mediaToDownload[s.path] = { url: s.url, namespace: '-' }
filesToDownload[s.path] = { url: s.url, namespace: '-' }
})

if (mediaDependencies.length) {
const existingVals = await redisStore.mediaToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path))
const existingVals = await redisStore.filesToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path))

for (const dep of mediaDependencies) {
const { mult, width } = getSizeFromUrl(dep.url)
const existingVal = existingVals[dep.path]
const currentDepIsHigherRes = !existingVal || existingVal.width < (width || 10e6) || existingVal.mult < (mult || 1)
if (currentDepIsHigherRes) {
mediaToDownload[dep.path] = {
filesToDownload[dep.path] = {
url: downloader.serializeUrl(dep.url),
mult,
width,
Expand All @@ -216,7 +216,7 @@ async function saveArticle(
}
}

await redisStore.mediaToDownloadXPath.setMany(mediaToDownload)
await redisStore.filesToDownloadXPath.setMany(filesToDownload)

const zimArticle = new ZimArticle({
url: articleId,
Expand Down
4 changes: 2 additions & 2 deletions test/unit/bootstrap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ export const redisStore = new RedisStore(process.env.REDIS || config.defaults.re

export const startRedis = async () => {
await redisStore.connect()
const { articleDetailXId, redirectsXId, filesToDownloadXPath, mediaToDownloadXPath, filesToRetryXPath } = redisStore
await Promise.all([articleDetailXId.flush(), redirectsXId.flush(), mediaToDownloadXPath.flush(), filesToDownloadXPath.flush(), filesToRetryXPath.flush()])
const { articleDetailXId, redirectsXId, filesToDownloadXPath, filesToRetryXPath } = redisStore
await Promise.all([articleDetailXId.flush(), redirectsXId.flush(), filesToDownloadXPath.flush(), filesToRetryXPath.flush()])
}

export const stopRedis = async () => {
Expand Down

0 comments on commit 082c542

Please sign in to comment.