From 4499294e034e46c161f354b8b68a7c9edc0d74b8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 5 Nov 2024 15:56:04 -0800 Subject: [PATCH 1/7] support indexing HTTP request (and WARC) records: - support customizing --fields for cdxj indexing - support 'req.*' fields which only apply to request records, other headers apply to response/main record - support 'referrer' as special shortcut for 'req.http:referer' - tests: update tests to include 'req.http:cookie' include in cdx - tests: update tests to include 'referrer' in cdx version: bump to 2.4.0 --- package.json | 2 +- src/commands/args.ts | 5 ++++ src/lib/index.ts | 9 ++++++- src/lib/indexer.ts | 55 +++++++++++++++++++++++++++++++--------- src/lib/utils.ts | 10 +++++--- test/testIndexer.test.ts | 35 +++++++++++++++++++++---- test/testUtils.test.ts | 6 ++--- 7 files changed, 97 insertions(+), 25 deletions(-) diff --git a/package.json b/package.json index 643803b..4a1766a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "warcio", - "version": "2.3.1", + "version": "2.4.0", "keywords": [ "WARC", "web archiving" diff --git a/src/commands/args.ts b/src/commands/args.ts index dd8e526..a4a98e9 100644 --- a/src/commands/args.ts +++ b/src/commands/args.ts @@ -42,6 +42,11 @@ export const cdxIndexCommandArgs = (yarg: yargs.Argv) => { describe: "Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)", type: "boolean", + }) + .option("fields", { + alias: "f", + describe: "fields to include in index", + type: "string", }); }; diff --git a/src/lib/index.ts b/src/lib/index.ts index 2235a91..d060b6f 100644 --- a/src/lib/index.ts +++ b/src/lib/index.ts @@ -17,7 +17,14 @@ export type { WARCSerializerOpts } from "./warcserializer"; export { WARCRecord, WARC_1_1, WARC_1_0 } from "./warcrecord"; export type { WARCRecordOpts, WARCType } from "./warcrecord"; -export { Indexer, CDXIndexer, CDXAndRecordIndexer } from "./indexer"; +export { + Indexer, + CDXIndexer, + CDXAndRecordIndexer, + DEFAULT_FIELDS, + DEFAULT_CDX_FIELDS, + DEFAULT_LEGACY_CDX_FIELDS, +} from "./indexer"; export { postToGetUrl, diff --git a/src/lib/indexer.ts b/src/lib/indexer.ts index 8219547..5253c12 100644 --- a/src/lib/indexer.ts +++ b/src/lib/indexer.ts @@ -10,17 +10,27 @@ import { type IndexerOffsetLength, } from "./types"; -const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"]; +export const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"]; // =========================================================================== abstract class BaseIndexer { opts: Partial; fields: string[]; + reqFields: string[]; parseHttp: boolean; - constructor(opts: Partial = {}) { + constructor( + opts: Partial = {}, + defaultFields: string[] = DEFAULT_FIELDS, + ) { this.opts = opts; - this.fields = opts.fields ? opts.fields.split(",") : DEFAULT_FIELDS; + if (opts.fields) { + this.fields = opts.fields.split(","); + this.reqFields = this.fields.filter((x) => isRequestHeader(x)); + } else { + this.fields = defaultFields; + this.reqFields = []; + } this.parseHttp = false; } @@ -109,6 +119,15 @@ abstract class BaseIndexer { field: string, record: WARCRecord, ): string | number | null | undefined { + // only handle req. fields for 'request' records + if (field.startsWith("req.")) { + if (record.warcType === "request") { + field = field.slice(4); + } else { + return null; + } + } + if (field === "http:status") { if ( record.httpHeaders && @@ -136,8 +155,8 @@ abstract class BaseIndexer { // =========================================================================== export class Indexer extends BaseIndexer { - constructor(opts?: Partial) { - super(opts); + constructor(opts?: Partial, defaultFields?: string[]) { + super(opts, defaultFields); for (const field of this.fields) { if (field.startsWith("http:")) { @@ -149,9 +168,9 @@ export class Indexer extends BaseIndexer { } // =========================================================================== -const DEFAULT_CDX_FIELDS = +export const DEFAULT_CDX_FIELDS = "urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(","); -const DEFAULT_LEGACY_CDX_FIELDS = +export const DEFAULT_LEGACY_CDX_FIELDS = "urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split( ",", ); @@ -172,10 +191,9 @@ export class CDXIndexer extends Indexer { _lastRecord: WARCRecord | null; constructor(opts?: Partial) { - super(opts); + super(opts, DEFAULT_CDX_FIELDS); this.includeAll = Boolean(opts?.all); this.overrideIndexForAll = Boolean(opts?.all); - this.fields = DEFAULT_CDX_FIELDS; this.parseHttp = true; this.noSurt = Boolean(opts?.noSurt); this._lastRecord = null; @@ -322,6 +340,12 @@ export class CDXIndexer extends Indexer { if (requestBody) { res["requestBody"] = requestBody; } + + if (reqRecord && this.reqFields.length) { + for (const field of this.reqFields) { + this.setField(field, reqRecord, res); + } + } } return res; @@ -334,12 +358,12 @@ export class CDXIndexer extends Indexer { delete result["timestamp"]; // eslint-disable-next-line @typescript-eslint/no-explicit-any - const replacer = (key: string, value: any) : any => { + const replacer = (key: string, value: any): any => { if (["offset", "length", "status"].includes(key)) { return value === null || value === undefined ? "" : "" + value; } return value; - } + }; return `${urlkey} ${timestamp} ${JSON.stringify(result, replacer)}\n`; } @@ -389,12 +413,15 @@ export class CDXIndexer extends Indexer { case "status": return super.getField("http:status", record); + case "referrer": + return super.getField("req.http:referer", record); + case "digest": value = record.warcPayloadDigest; return value ? value.split(":", 2)[1] : null; default: - return null; + return super.getField(field, record); } } } @@ -416,3 +443,7 @@ export class CDXAndRecordIndexer extends CDXIndexer { return cdx && { cdx, record, reqRecord }; } } + +export function isRequestHeader(header: string) { + return header.startsWith("req.") || header.toLowerCase() === "referrer"; +} diff --git a/src/lib/utils.ts b/src/lib/utils.ts index 7687923..bb3f3a6 100644 --- a/src/lib/utils.ts +++ b/src/lib/utils.ts @@ -62,7 +62,7 @@ export function postToGetUrl(request: Request) { return false; } - const getContentType = (headers: Headers | Map) : string => { + const getContentType = (headers: Headers | Map): string => { const ct = headers.get("content-type"); if (ct) { return ct; @@ -75,7 +75,7 @@ export function postToGetUrl(request: Request) { } } return ""; - } + }; const contentType = getContentType(headers); @@ -124,7 +124,11 @@ export function postToGetUrl(request: Request) { } if (query != null) { - request.url = appendRequestQuery(request.url, decodeURI(query), request.method); + request.url = appendRequestQuery( + request.url, + decodeURI(query), + request.method, + ); request.method = "GET"; request.requestBody = query; return true; diff --git a/test/testIndexer.test.ts b/test/testIndexer.test.ts index 657844e..08a92b7 100644 --- a/test/testIndexer.test.ts +++ b/test/testIndexer.test.ts @@ -1,7 +1,12 @@ import fs from "fs"; import { jest } from "@jest/globals"; import { main } from "../src/commands"; -import { Indexer, CDXIndexer, CDXAndRecordIndexer } from "../src/lib"; +import { + Indexer, + CDXIndexer, + CDXAndRecordIndexer, + DEFAULT_CDX_FIELDS, +} from "../src/lib"; import { WritableStreamBuffer } from "stream-buffers"; function get_warc_path(filename: string) { @@ -90,6 +95,21 @@ com,example)/ 20170306040348 {"url":"http://example.com/","mime":"warc/revisit", ); }); + test("cdxj warc.gz with referrer", async () => { + await index( + [ + "cdx-index", + get_warc_path("data/example.warc.gz"), + "--fields", + [...DEFAULT_CDX_FIELDS, "referrer"].join(","), + ], + `\ +com,example)/ 20170306040206 {"url":"http://example.com/","mime":"text/html","status":"200","digest":"G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK","length":"1228","offset":"784","filename":"example.warc.gz","referrer":"https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/"} +com,example)/ 20170306040348 {"url":"http://example.com/","mime":"warc/revisit","status":"200","digest":"G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK","length":"586","offset":"2621","filename":"example.warc.gz","referrer":"https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/"} +`, + ); + }); + test("cdx11 warc.gz", async () => { await index( ["cdx-index", get_warc_path("data/example.warc.gz"), "--format", "cdx"], @@ -154,11 +174,16 @@ com,example)/ 20170306040348 http://example.com/ warc/revisit 200 G7HRM7BGOKSKMS test("post append", async () => { await index( - ["cdx-index", get_warc_path("data/post-test.warc.gz")], + [ + "cdx-index", + get_warc_path("data/post-test.warc.gz"), + "--fields", + [...DEFAULT_CDX_FIELDS, "req.http:cookie"].join(","), + ], `\ -org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc"} -org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3"} -org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^"} +org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc","req.http:cookie":"Max-Age=3600; Path=/"} +org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3","req.http:cookie":"Max-Age=3600; Path=/"} +org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^","req.http:cookie":"Max-Age=3600; Path=/"} `, ); }); diff --git a/test/testUtils.test.ts b/test/testUtils.test.ts index 1b8b92e..a5c59a4 100644 --- a/test/testUtils.test.ts +++ b/test/testUtils.test.ts @@ -153,8 +153,8 @@ describe("utils", () => { }); test("surt with space", () => { - expect(getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d ")).toBe( - "com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h=" - ); + expect( + getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d "), + ).toBe("com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h="); }); }); From 9687ec3ba19bf56f84e50d1924aa10d18c657faf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 5 Nov 2024 17:49:28 -0800 Subject: [PATCH 2/7] add exports --- src/index.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/index.ts b/src/index.ts index 951ae7f..ad62684 100644 --- a/src/index.ts +++ b/src/index.ts @@ -14,6 +14,9 @@ export { Indexer, CDXIndexer, CDXAndRecordIndexer, + DEFAULT_FIELDS, + DEFAULT_CDX_FIELDS, + DEFAULT_LEGACY_CDX_FIELDS, postToGetUrl, getSurt, appendRequestQuery, From a9c3906f36f2ac94c13749fc77978a7e1d8872d8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 5 Nov 2024 18:03:43 -0800 Subject: [PATCH 3/7] set version to 2.4.0-beta.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 4a1766a..7ba4ca1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "warcio", - "version": "2.4.0", + "version": "2.4.0-beta.0", "keywords": [ "WARC", "web archiving" From 3eb1784e1beda52862936ca2c30380888ab4375a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 5 Nov 2024 18:14:24 -0800 Subject: [PATCH 4/7] allow passing string[] to 'fields' param --- src/lib/indexer.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lib/indexer.ts b/src/lib/indexer.ts index 5253c12..e3378cf 100644 --- a/src/lib/indexer.ts +++ b/src/lib/indexer.ts @@ -25,7 +25,8 @@ abstract class BaseIndexer { ) { this.opts = opts; if (opts.fields) { - this.fields = opts.fields.split(","); + this.fields = + typeof opts.fields === "string" ? opts.fields.split(",") : opts.fields; this.reqFields = this.fields.filter((x) => isRequestHeader(x)); } else { this.fields = defaultFields; From 9b5b46e5666912d8c9996d87eeb2f5f68df96d1b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Nov 2024 18:19:27 -0800 Subject: [PATCH 5/7] cli: convert 'fields' param to array, add coerce to flatten multiple params types: add types for cli params --- src/commands/args.ts | 27 +++++++++++++++++---------- src/lib/indexer.ts | 3 +-- test/testIndexer.test.ts | 2 +- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/commands/args.ts b/src/commands/args.ts index a4a98e9..057d0fe 100644 --- a/src/commands/args.ts +++ b/src/commands/args.ts @@ -1,5 +1,10 @@ +import { DEFAULT_CDX_FIELDS, DEFAULT_FIELDS } from "../lib/indexer"; import type yargs from "yargs"; +const coerce = (array: string[]): string[] => { + return array.flatMap((v) => v.split(",")).filter((x) => !!x); +}; + export const indexCommandArgs = (yarg: yargs.Argv) => { return yarg .positional("filenames", { @@ -11,14 +16,15 @@ export const indexCommandArgs = (yarg: yargs.Argv) => { .option("fields", { alias: "f", describe: "fields to include in index", - type: "string", + type: "array", + default: DEFAULT_FIELDS, + coerce, }); }; -//export type IndexCommandArgs = Awaited; -// todo: fix types? -// eslint-disable-next-line @typescript-eslint/no-explicit-any -export type IndexCommandArgs = any; +export type IndexCommandArgs = Awaited< + ReturnType["argv"] +>; export const cdxIndexCommandArgs = (yarg: yargs.Argv) => { return yarg @@ -46,11 +52,12 @@ export const cdxIndexCommandArgs = (yarg: yargs.Argv) => { .option("fields", { alias: "f", describe: "fields to include in index", - type: "string", + type: "array", + default: DEFAULT_CDX_FIELDS, + coerce, }); }; -//export type CdxIndexCommandArgs = Awaited; -// todo: fix types? -// eslint-disable-next-line @typescript-eslint/no-explicit-any -export type CdxIndexCommandArgs = any; //ReturnType; +export type CdxIndexCommandArgs = Awaited< + ReturnType["argv"] +>; diff --git a/src/lib/indexer.ts b/src/lib/indexer.ts index e3378cf..11971a9 100644 --- a/src/lib/indexer.ts +++ b/src/lib/indexer.ts @@ -25,8 +25,7 @@ abstract class BaseIndexer { ) { this.opts = opts; if (opts.fields) { - this.fields = - typeof opts.fields === "string" ? opts.fields.split(",") : opts.fields; + this.fields = opts.fields; this.reqFields = this.fields.filter((x) => isRequestHeader(x)); } else { this.fields = defaultFields; diff --git a/test/testIndexer.test.ts b/test/testIndexer.test.ts index 08a92b7..604b22f 100644 --- a/test/testIndexer.test.ts +++ b/test/testIndexer.test.ts @@ -333,7 +333,7 @@ com,example,some:8080)/ 20200405201750 {"url":"http://some.example.com:8080/","m test("test custom Indexer", async () => { const entries = []; - const indexer = new Indexer({ fields: "warc-type,warc-target-uri" }); + const indexer = new Indexer({ fields: ["warc-type", "warc-target-uri"] }); const files = [ { From f88186b5d04cc7bb9a7c19f80a8d90e26e4885ea Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Nov 2024 18:29:43 -0800 Subject: [PATCH 6/7] bump version to 2.4.0-beta.1 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 7ba4ca1..c5558ad 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "warcio", - "version": "2.4.0-beta.0", + "version": "2.4.0-beta.1", "keywords": [ "WARC", "web archiving" From 749c51af0504485371e61880cf136e213df709bf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Nov 2024 19:01:02 -0800 Subject: [PATCH 7/7] import cleanup, beta 2 --- package.json | 5 ++--- src/commands/args.ts | 6 +++--- yarn.lock | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/package.json b/package.json index c5558ad..a8be6a0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "warcio", - "version": "2.4.0-beta.1", + "version": "2.4.0-beta.2", "keywords": [ "WARC", "web archiving" @@ -58,12 +58,11 @@ "pako": "^1.0.11", "tempy": "^3.1.0", "uuid-random": "^1.3.2", - "yargs": "^17.6.2" + "yargs": "^17.7.2" }, "devDependencies": { "@types/jest": "^29.2.3", "@types/node": "^18.11.9", - "@types/yargs": "^17.0.17", "@typescript-eslint/eslint-plugin": "^8.2.0", "@typescript-eslint/parser": "^8.2.0", "cross-fetch": "^4.0.0", diff --git a/src/commands/args.ts b/src/commands/args.ts index 057d0fe..5ed36fb 100644 --- a/src/commands/args.ts +++ b/src/commands/args.ts @@ -1,11 +1,11 @@ import { DEFAULT_CDX_FIELDS, DEFAULT_FIELDS } from "../lib/indexer"; -import type yargs from "yargs"; +import { type Argv } from "yargs"; const coerce = (array: string[]): string[] => { return array.flatMap((v) => v.split(",")).filter((x) => !!x); }; -export const indexCommandArgs = (yarg: yargs.Argv) => { +export const indexCommandArgs = (yarg: Argv) => { return yarg .positional("filenames", { describe: "WARC file(s) to index", @@ -26,7 +26,7 @@ export type IndexCommandArgs = Awaited< ReturnType["argv"] >; -export const cdxIndexCommandArgs = (yarg: yargs.Argv) => { +export const cdxIndexCommandArgs = (yarg: Argv) => { return yarg .positional("filenames", { describe: "WARC file(s) to index", diff --git a/yarn.lock b/yarn.lock index 14a72af..3e40554 100644 --- a/yarn.lock +++ b/yarn.lock @@ -870,7 +870,7 @@ resolved "https://registry.yarnpkg.com/@types/yargs-parser/-/yargs-parser-21.0.0.tgz#0c60e537fa790f5f9472ed2776c2b71ec117351b" integrity sha512-iO9ZQHkZxHn4mSakYV0vFHAVDyEOIJQrV2uZ06HxEPcx+mt8swXoZHIbaaJ2crJYFfErySgktuTZ3BeLz+XmFA== -"@types/yargs@^17.0.17", "@types/yargs@^17.0.8": +"@types/yargs@^17.0.8": version "17.0.17" resolved "https://registry.yarnpkg.com/@types/yargs/-/yargs-17.0.17.tgz#5672e5621f8e0fca13f433a8017aae4b7a2a03e7" integrity sha512-72bWxFKTK6uwWJAVT+3rF6Jo6RTojiJ27FQo8Rf60AL+VZbzoVPnMFhKsUnbjR8A3BTCYQ7Mv3hnl8T0A+CX9g== @@ -3301,7 +3301,7 @@ yargs-parser@^21.0.1, yargs-parser@^21.1.1: resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-21.1.1.tgz#9096bceebf990d21bb31fa9516e0ede294a77d35" integrity sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw== -yargs@^17.3.1, yargs@^17.6.2: +yargs@^17.3.1: version "17.6.2" resolved "https://registry.yarnpkg.com/yargs/-/yargs-17.6.2.tgz#2e23f2944e976339a1ee00f18c77fedee8332541" integrity sha512-1/9UrdHjDZc0eOU0HxOHoS78C69UD3JRMvzlJ7S79S2nTaWRA/whGCTV8o9e/N/1Va9YIV7Q4sOxD8VV4pCWOw== @@ -3314,6 +3314,19 @@ yargs@^17.3.1, yargs@^17.6.2: y18n "^5.0.5" yargs-parser "^21.1.1" +yargs@^17.7.2: + version "17.7.2" + resolved "https://registry.yarnpkg.com/yargs/-/yargs-17.7.2.tgz#991df39aca675a192b816e1e0363f9d75d2aa269" + integrity sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w== + dependencies: + cliui "^8.0.1" + escalade "^3.1.1" + get-caller-file "^2.0.5" + require-directory "^2.1.1" + string-width "^4.2.3" + y18n "^5.0.5" + yargs-parser "^21.1.1" + yocto-queue@^0.1.0: version "0.1.0" resolved "https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-0.1.0.tgz#0294eb3dee05028d31ee1a5fa2c556a6aaf10a1b"