From 71676283d05f2be8014035b63141a1bfd4d6e0bc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 4 Sep 2024 22:59:37 -0700 Subject: [PATCH 1/4] warcwriter fixes: - ensure WARC rollover happens only after response/request + cdx or single record + cdx have been written - ensure request payload is buffered for POST request indexing - update to warcio 2.3.1 for POST request case-insensitive 'content-type' check - recorder: remove unused 'tempdir', no longer used as warcio chooses a temp file on it's own --- package.json | 2 +- src/crawler.ts | 4 ---- src/util/recorder.ts | 27 ++++----------------------- src/util/warcwriter.ts | 14 +++++++++++++- 4 files changed, 18 insertions(+), 29 deletions(-) diff --git a/package.json b/package.json index 08442000..552fd331 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ "tsc": "^2.0.4", "undici": "^6.18.2", "uuid": "8.3.2", - "warcio": "^2.3.0", + "warcio": "git+https://github.com/webrecorder/warcio.js.git#indexer-fix-content-type", "ws": "^7.4.4", "yargs": "^17.7.2" }, diff --git a/src/crawler.ts b/src/crawler.ts index 1033df86..54bd77ae 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -156,7 +156,6 @@ export class Crawler { otherPagesFile: string; archivesDir: string; - tempdir: string; warcCdxDir: string; indexesDir: string; @@ -295,7 +294,6 @@ export class Crawler { // archives dir this.archivesDir = path.join(this.collDir, "archive"); - this.tempdir = path.join(os.tmpdir(), "tmp-dl"); // indexes dirs this.warcCdxDir = path.join(this.collDir, "warc-cdx"); @@ -480,7 +478,6 @@ export class Crawler { if (!this.params.dryRun) { await fsp.mkdir(this.archivesDir, { recursive: true }); - await fsp.mkdir(this.tempdir, { recursive: true }); await fsp.mkdir(this.warcCdxDir, { recursive: true }); } @@ -2581,7 +2578,6 @@ self.__bx_behaviors.selectMainBehavior(); workerid: id, crawler: this, writer, - tempdir: this.tempdir, }); this.browser.recorders.push(res); diff --git a/src/util/recorder.ts b/src/util/recorder.ts index f9481f2a..2acdda9c 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -1,11 +1,7 @@ -import path from "path"; - -import { v4 as uuidv4 } from "uuid"; - import PQueue from "p-queue"; import { logger, formatErr } from "./logger.js"; -import { sleep, timedRun, timestampNow } from "./timing.js"; +import { sleep, timedRun } from "./timing.js"; import { RequestResponseInfo, isHTMLMime, @@ -142,8 +138,6 @@ export class Recorder { logDetails: Record = {}; skipping = false; - tempdir: string; - gzip = true; writer: WARCWriter; @@ -157,12 +151,10 @@ export class Recorder { workerid, writer, crawler, - tempdir, }: { workerid: WorkerId; writer: WARCWriter; crawler: Crawler; - tempdir: string; }) { this.workerid = workerid; this.crawler = crawler; @@ -170,8 +162,6 @@ export class Recorder { this.writer = writer; - this.tempdir = tempdir; - this.fetcherQ = new PQueue({ concurrency: 1 }); this.frameIdToExecId = null; @@ -1272,9 +1262,6 @@ class AsyncFetcher { recorder: Recorder; - tempdir: string; - filename: string; - manualRedirect = false; constructor({ @@ -1297,19 +1284,13 @@ class AsyncFetcher { this.recorder = recorder; - this.tempdir = recorder.tempdir; - this.filename = path.join( - this.tempdir, - `${timestampNow()}-${uuidv4()}.data`, - ); - this.maxFetchSize = maxFetchSize; this.manualRedirect = manualRedirect; } async load() { - const { reqresp, recorder, networkId, filename } = this; + const { reqresp, recorder, networkId } = this; const { url, status } = reqresp; const { pageid, crawlState, gzip, logDetails } = recorder; @@ -1359,7 +1340,7 @@ class AsyncFetcher { } catch (e) { logger.error( "Error reading + digesting payload", - { url, filename, ...formatErr(e), ...logDetails }, + { url, ...formatErr(e), ...logDetails }, "recorder", ); } @@ -1434,7 +1415,7 @@ class AsyncFetcher { } logger.debug( "Streaming Fetch Error", - { url, networkId, filename, ...formatErr(e), ...logDetails }, + { url, networkId, ...formatErr(e), ...logDetails }, "recorder", ); // indicate response is ultimately not valid diff --git a/src/util/warcwriter.ts b/src/util/warcwriter.ts index 2cd62a89..aed643a9 100644 --- a/src/util/warcwriter.ts +++ b/src/util/warcwriter.ts @@ -155,6 +155,10 @@ export class WARCWriter implements IndexerOffsetLength { this._writeCDX(responseRecord); + if (requestRecord.httpHeaders?.method !== "GET") { + await requestRecord.readFully(false); + } + const requestSerializer = new WARCSerializer(requestRecord, opts); this.recordLength = await this._writeRecord( requestRecord, @@ -162,6 +166,10 @@ export class WARCWriter implements IndexerOffsetLength { ); this._writeCDX(requestRecord); + + if (this.offset >= this.rolloverSize) { + this.fh = await this.initFH(); + } } private addToQueue( @@ -197,6 +205,10 @@ export class WARCWriter implements IndexerOffsetLength { this.recordLength = await this._writeRecord(record, requestSerializer); this._writeCDX(record); + + if (this.offset >= this.rolloverSize) { + this.fh = await this.initFH(); + } } writeNewResourceRecord( @@ -257,7 +269,7 @@ export class WARCWriter implements IndexerOffsetLength { let total = 0; const url = record.warcTargetURI; - if (!this.fh || this.offset >= this.rolloverSize) { + if (!this.fh) { this.fh = await this.initFH(); } From 0c9114768c0409a96ccc65920a1b708bf1ff9472 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 4 Sep 2024 23:16:41 -0700 Subject: [PATCH 2/4] update yarn.lock for gh warcio 2.3.1 --- yarn.lock | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yarn.lock b/yarn.lock index 2e276aec..6cb31654 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5277,10 +5277,9 @@ walker@^1.0.8: dependencies: makeerror "1.0.12" -warcio@^2.3.0: - version "2.3.0" - resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.3.0.tgz#a655df9b5986a53e5d05aa68cda51bfefdfa8347" - integrity sha512-PCHcZ/fDE5+QECOFe/n/vzyDmAITJ1mvLx1jVONJ0uaV9OwcTbIWoh7Z0+OQwQdq8Wr1Nnb2hwhtHJ7J+9rHIQ== +warcio@^2.3.0, "warcio@git+https://github.com/webrecorder/warcio.js.git#indexer-fix-content-type": + version "2.3.1" + resolved "git+https://github.com/webrecorder/warcio.js.git#f1b5a0d57660d2f5c00b54982f409fde073c0122" dependencies: "@types/pako" "^1.0.7" "@types/stream-buffers" "^3.0.7" From cc7066efdb388e5c80e77fd69977fb67100da437 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 5 Sep 2024 10:45:51 -0700 Subject: [PATCH 3/4] update warcio.js to 2.3.1 --- package.json | 2 +- yarn.lock | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 552fd331..1868e9b5 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ "tsc": "^2.0.4", "undici": "^6.18.2", "uuid": "8.3.2", - "warcio": "git+https://github.com/webrecorder/warcio.js.git#indexer-fix-content-type", + "warcio": "^2.3.1", "ws": "^7.4.4", "yargs": "^17.7.2" }, diff --git a/yarn.lock b/yarn.lock index 6cb31654..64cf43d7 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5277,7 +5277,7 @@ walker@^1.0.8: dependencies: makeerror "1.0.12" -warcio@^2.3.0, "warcio@git+https://github.com/webrecorder/warcio.js.git#indexer-fix-content-type": +warcio@^2.3.0: version "2.3.1" resolved "git+https://github.com/webrecorder/warcio.js.git#f1b5a0d57660d2f5c00b54982f409fde073c0122" dependencies: @@ -5290,6 +5290,20 @@ warcio@^2.3.0, "warcio@git+https://github.com/webrecorder/warcio.js.git#indexer- uuid-random "^1.3.2" yargs "^17.6.2" +warcio@^2.3.1: + version "2.3.1" + resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.3.1.tgz#8ac9de897de1a556161168f2a3938b60929908ca" + integrity sha512-PjcWqzXfs6HdWfHi1V/i8MoMmV5M0Csg3rOa2mqCJ1dmCJXswVfQ0VXbEVumwavNIW2oFFj6LJoCHHeL4Ls/zw== + dependencies: + "@types/pako" "^1.0.7" + "@types/stream-buffers" "^3.0.7" + base32-encode "^2.0.0" + hash-wasm "^4.9.0" + pako "^1.0.11" + tempy "^3.1.0" + uuid-random "^1.3.2" + yargs "^17.6.2" + web-encoding@^1.1.5: version "1.1.5" resolved "https://registry.yarnpkg.com/web-encoding/-/web-encoding-1.1.5.tgz#fc810cf7667364a6335c939913f5051d3e0c4864" From 3a28911492d3d1a5ac39cb158e38c3b98c0c6cae Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 5 Sep 2024 10:48:27 -0700 Subject: [PATCH 4/4] pin warcio --- package.json | 3 ++- yarn.lock | 15 +-------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/package.json b/package.json index 1868e9b5..b2c658bb 100644 --- a/package.json +++ b/package.json @@ -65,6 +65,7 @@ "testTimeout": 90000 }, "resolutions": { - "wrap-ansi": "7.0.0" + "wrap-ansi": "7.0.0", + "warcio": "^2.3.1" } } diff --git a/yarn.lock b/yarn.lock index 64cf43d7..98e3d2c1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5277,20 +5277,7 @@ walker@^1.0.8: dependencies: makeerror "1.0.12" -warcio@^2.3.0: - version "2.3.1" - resolved "git+https://github.com/webrecorder/warcio.js.git#f1b5a0d57660d2f5c00b54982f409fde073c0122" - dependencies: - "@types/pako" "^1.0.7" - "@types/stream-buffers" "^3.0.7" - base32-encode "^2.0.0" - hash-wasm "^4.9.0" - pako "^1.0.11" - tempy "^3.1.0" - uuid-random "^1.3.2" - yargs "^17.6.2" - -warcio@^2.3.1: +warcio@^2.3.0, warcio@^2.3.1: version "2.3.1" resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.3.1.tgz#8ac9de897de1a556161168f2a3938b60929908ca" integrity sha512-PjcWqzXfs6HdWfHi1V/i8MoMmV5M0Csg3rOa2mqCJ1dmCJXswVfQ0VXbEVumwavNIW2oFFj6LJoCHHeL4Ls/zw==