Skip to content

Commit

Permalink
support indexing request WARC records: (#82)
Browse files Browse the repository at this point in the history
- support the `--fields` option for the cdx-index command
- cli: add typing to yargs cli args, treat 'fields' param as array, support multiple fields param or comma-separated param
- support `req.*` fields, which apply only to request records (both WARC
and HTTP headers); other header fields apply to the response/main record
- support `referrer` as special shortcut for `req.http:referer`
- tests: update tests to include 'req.http:cookie' in cdx
- tests: update tests to include 'referrer' in cdx
- compatibility with python cdxj-indexer
- version: bump to 2.4.0
  • Loading branch information
ikreymer authored Nov 7, 2024
1 parent fb1ff9c commit 1e53bbb
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 42 deletions.
5 changes: 2 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "warcio",
"version": "2.3.1",
"version": "2.4.0-beta.2",
"keywords": [
"WARC",
"web archiving"
Expand Down Expand Up @@ -58,12 +58,11 @@
"pako": "^1.0.11",
"tempy": "^3.1.0",
"uuid-random": "^1.3.2",
"yargs": "^17.6.2"
"yargs": "^17.7.2"
},
"devDependencies": {
"@types/jest": "^29.2.3",
"@types/node": "^18.11.9",
"@types/yargs": "^17.0.17",
"@typescript-eslint/eslint-plugin": "^8.2.0",
"@typescript-eslint/parser": "^8.2.0",
"cross-fetch": "^4.0.0",
Expand Down
36 changes: 24 additions & 12 deletions src/commands/args.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import type yargs from "yargs";
import { DEFAULT_CDX_FIELDS, DEFAULT_FIELDS } from "../lib/indexer";
import { type Argv } from "yargs";

export const indexCommandArgs = (yarg: yargs.Argv) => {
/**
 * Normalizes the raw `--fields` values into a flat list of field names.
 *
 * The option may be repeated (`-f a -f b`), given as a comma-separated list
 * (`-f a,b`), or both; every entry is split on commas and flattened.
 *
 * @param array - values collected for the `fields` array option
 * @returns the individual field names, trimmed, with empty entries removed
 */
const coerce = (array: string[]): string[] => {
  return (
    array
      // Array options may yield non-string entries at runtime (e.g. a
      // numeric-looking argument parsed as a number), so stringify before
      // splitting instead of assuming `.split` exists.
      .flatMap((v) => String(v).split(","))
      // "a, b" should yield "b", not " b" (which could never match a field).
      .map((x) => x.trim())
      .filter((x) => x.length > 0)
  );
};

export const indexCommandArgs = (yarg: Argv) => {
return yarg
.positional("filenames", {
describe: "WARC file(s) to index",
Expand All @@ -11,16 +16,17 @@ export const indexCommandArgs = (yarg: yargs.Argv) => {
.option("fields", {
alias: "f",
describe: "fields to include in index",
type: "string",
type: "array",
default: DEFAULT_FIELDS,
coerce,
});
};

//export type IndexCommandArgs = Awaited<typeof indexCommandArgs.argv>;
// todo: fix types?
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export type IndexCommandArgs = any;
/**
 * Parsed argument shape for the index command, derived from the return type
 * of `indexCommandArgs` so it follows the options declared there.
 * `Awaited` unwraps `argv` in case yargs types it as a promise.
 */
export type IndexCommandArgs = Awaited<
ReturnType<typeof indexCommandArgs>["argv"]
>;

export const cdxIndexCommandArgs = (yarg: yargs.Argv) => {
export const cdxIndexCommandArgs = (yarg: Argv) => {
return yarg
.positional("filenames", {
describe: "WARC file(s) to index",
Expand All @@ -42,10 +48,16 @@ export const cdxIndexCommandArgs = (yarg: yargs.Argv) => {
describe:
"Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)",
type: "boolean",
})
.option("fields", {
alias: "f",
describe: "fields to include in index",
type: "array",
default: DEFAULT_CDX_FIELDS,
coerce,
});
};

//export type CdxIndexCommandArgs = Awaited<typeof cdxIndexCommandArgs.argv>;
// todo: fix types?
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export type CdxIndexCommandArgs = any; //ReturnType<cdxIndexCommandArgs>;
/**
 * Parsed argument shape for the cdx-index command, derived from the return
 * type of `cdxIndexCommandArgs` so it follows the options declared there.
 * `Awaited` unwraps `argv` in case yargs types it as a promise.
 */
export type CdxIndexCommandArgs = Awaited<
ReturnType<typeof cdxIndexCommandArgs>["argv"]
>;
3 changes: 3 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ export {
Indexer,
CDXIndexer,
CDXAndRecordIndexer,
DEFAULT_FIELDS,
DEFAULT_CDX_FIELDS,
DEFAULT_LEGACY_CDX_FIELDS,
postToGetUrl,
getSurt,
appendRequestQuery,
Expand Down
9 changes: 8 additions & 1 deletion src/lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ export type { WARCSerializerOpts } from "./warcserializer";
export { WARCRecord, WARC_1_1, WARC_1_0 } from "./warcrecord";
export type { WARCRecordOpts, WARCType } from "./warcrecord";

export { Indexer, CDXIndexer, CDXAndRecordIndexer } from "./indexer";
export {
Indexer,
CDXIndexer,
CDXAndRecordIndexer,
DEFAULT_FIELDS,
DEFAULT_CDX_FIELDS,
DEFAULT_LEGACY_CDX_FIELDS,
} from "./indexer";

export {
postToGetUrl,
Expand Down
55 changes: 43 additions & 12 deletions src/lib/indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,27 @@ import {
type IndexerOffsetLength,
} from "./types";

const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"];
// Field list used when no explicit `fields` option is supplied: the default
// for the BaseIndexer constructor and for the index command's --fields option.
export const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"];

// ===========================================================================
abstract class BaseIndexer {
opts: Partial<IndexCommandArgs>;
fields: string[];
reqFields: string[];
parseHttp: boolean;

constructor(opts: Partial<IndexCommandArgs> = {}) {
constructor(
opts: Partial<IndexCommandArgs> = {},
defaultFields: string[] = DEFAULT_FIELDS,
) {
this.opts = opts;
this.fields = opts.fields ? opts.fields.split(",") : DEFAULT_FIELDS;
if (opts.fields) {
this.fields = opts.fields;
this.reqFields = this.fields.filter((x) => isRequestHeader(x));
} else {
this.fields = defaultFields;
this.reqFields = [];
}
this.parseHttp = false;
}

Expand Down Expand Up @@ -109,6 +119,15 @@ abstract class BaseIndexer {
field: string,
record: WARCRecord,
): string | number | null | undefined {
// only handle req. fields for 'request' records
if (field.startsWith("req.")) {
if (record.warcType === "request") {
field = field.slice(4);
} else {
return null;
}
}

if (field === "http:status") {
if (
record.httpHeaders &&
Expand Down Expand Up @@ -136,8 +155,8 @@ abstract class BaseIndexer {

// ===========================================================================
export class Indexer extends BaseIndexer {
constructor(opts?: Partial<IndexCommandArgs>) {
super(opts);
constructor(opts?: Partial<IndexCommandArgs>, defaultFields?: string[]) {
super(opts, defaultFields);

for (const field of this.fields) {
if (field.startsWith("http:")) {
Expand All @@ -149,9 +168,9 @@ export class Indexer extends BaseIndexer {
}

// ===========================================================================
const DEFAULT_CDX_FIELDS =
// Default field list for CDXIndexer (passed to the base constructor) and for
// the cdx-index command's --fields option.
export const DEFAULT_CDX_FIELDS =
"urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(",");
const DEFAULT_LEGACY_CDX_FIELDS =
// Same fields as DEFAULT_CDX_FIELDS with `redirect` and `meta` added after
// `digest`. NOTE(review): presumably the column set for the legacy plain-text
// cdx format — the use site is not visible here; confirm.
export const DEFAULT_LEGACY_CDX_FIELDS =
"urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(
",",
);
Expand All @@ -172,10 +191,9 @@ export class CDXIndexer extends Indexer {
_lastRecord: WARCRecord | null;

constructor(opts?: Partial<CdxIndexCommandArgs>) {
super(opts);
super(opts, DEFAULT_CDX_FIELDS);
this.includeAll = Boolean(opts?.all);
this.overrideIndexForAll = Boolean(opts?.all);
this.fields = DEFAULT_CDX_FIELDS;
this.parseHttp = true;
this.noSurt = Boolean(opts?.noSurt);
this._lastRecord = null;
Expand Down Expand Up @@ -322,6 +340,12 @@ export class CDXIndexer extends Indexer {
if (requestBody) {
res["requestBody"] = requestBody;
}

if (reqRecord && this.reqFields.length) {
for (const field of this.reqFields) {
this.setField(field, reqRecord, res);
}
}
}

return res;
Expand All @@ -334,12 +358,12 @@ export class CDXIndexer extends Indexer {
delete result["timestamp"];

// eslint-disable-next-line @typescript-eslint/no-explicit-any
const replacer = (key: string, value: any) : any => {
const replacer = (key: string, value: any): any => {
if (["offset", "length", "status"].includes(key)) {
return value === null || value === undefined ? "" : "" + value;
}
return value;
}
};

return `${urlkey} ${timestamp} ${JSON.stringify(result, replacer)}\n`;
}
Expand Down Expand Up @@ -389,12 +413,15 @@ export class CDXIndexer extends Indexer {
case "status":
return super.getField("http:status", record);

case "referrer":
return super.getField("req.http:referer", record);

case "digest":
value = record.warcPayloadDigest;
return value ? value.split(":", 2)[1] : null;

default:
return null;
return super.getField(field, record);
}
}
}
Expand All @@ -416,3 +443,7 @@ export class CDXAndRecordIndexer extends CDXIndexer {
return cdx && { cdx, record, reqRecord };
}
}

/**
 * Tells whether an index field name is one that is read from the request
 * record rather than the response/main record.
 *
 * A field qualifies when it carries the `req.` prefix (matched
 * case-sensitively) or when it is the `referrer` shortcut (matched
 * case-insensitively).
 *
 * @param header - field name as given in the `fields` option
 * @returns true when the field is a request-record field
 */
export function isRequestHeader(header: string) {
  if (header.startsWith("req.")) {
    return true;
  }

  const normalized = header.toLowerCase();
  return normalized === "referrer";
}
10 changes: 7 additions & 3 deletions src/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export function postToGetUrl(request: Request) {
return false;
}

const getContentType = (headers: Headers | Map<string, string>) : string => {
const getContentType = (headers: Headers | Map<string, string>): string => {
const ct = headers.get("content-type");
if (ct) {
return ct;
Expand All @@ -75,7 +75,7 @@ export function postToGetUrl(request: Request) {
}
}
return "";
}
};

const contentType = getContentType(headers);

Expand Down Expand Up @@ -124,7 +124,11 @@ export function postToGetUrl(request: Request) {
}

if (query != null) {
request.url = appendRequestQuery(request.url, decodeURI(query), request.method);
request.url = appendRequestQuery(
request.url,
decodeURI(query),
request.method,
);
request.method = "GET";
request.requestBody = query;
return true;
Expand Down
37 changes: 31 additions & 6 deletions test/testIndexer.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import fs from "fs";
import { jest } from "@jest/globals";
import { main } from "../src/commands";
import { Indexer, CDXIndexer, CDXAndRecordIndexer } from "../src/lib";
import {
Indexer,
CDXIndexer,
CDXAndRecordIndexer,
DEFAULT_CDX_FIELDS,
} from "../src/lib";
import { WritableStreamBuffer } from "stream-buffers";

function get_warc_path(filename: string) {
Expand Down Expand Up @@ -90,6 +95,21 @@ com,example)/ 20170306040348 {"url":"http://example.com/","mime":"warc/revisit",
);
});

// Appends the `referrer` field to the default CDX fields via a single
// comma-separated --fields value and expects each CDXJ line to end with a
// "referrer" key holding the referrer URL.
test("cdxj warc.gz with referrer", async () => {
await index(
[
"cdx-index",
get_warc_path("data/example.warc.gz"),
"--fields",
[...DEFAULT_CDX_FIELDS, "referrer"].join(","),
],
`\
com,example)/ 20170306040206 {"url":"http://example.com/","mime":"text/html","status":"200","digest":"G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK","length":"1228","offset":"784","filename":"example.warc.gz","referrer":"https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/"}
com,example)/ 20170306040348 {"url":"http://example.com/","mime":"warc/revisit","status":"200","digest":"G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK","length":"586","offset":"2621","filename":"example.warc.gz","referrer":"https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/"}
`,
);
});

test("cdx11 warc.gz", async () => {
await index(
["cdx-index", get_warc_path("data/example.warc.gz"), "--format", "cdx"],
Expand Down Expand Up @@ -154,11 +174,16 @@ com,example)/ 20170306040348 http://example.com/ warc/revisit 200 G7HRM7BGOKSKMS

test("post append", async () => {
await index(
["cdx-index", get_warc_path("data/post-test.warc.gz")],
[
"cdx-index",
get_warc_path("data/post-test.warc.gz"),
"--fields",
[...DEFAULT_CDX_FIELDS, "req.http:cookie"].join(","),
],
`\
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc"}
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3"}
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^"}
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc","req.http:cookie":"Max-Age=3600; Path=/"}
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3","req.http:cookie":"Max-Age=3600; Path=/"}
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^","req.http:cookie":"Max-Age=3600; Path=/"}
`,
);
});
Expand Down Expand Up @@ -308,7 +333,7 @@ com,example,some:8080)/ 20200405201750 {"url":"http://some.example.com:8080/","m

test("test custom Indexer", async () => {
const entries = [];
const indexer = new Indexer({ fields: "warc-type,warc-target-uri" });
const indexer = new Indexer({ fields: ["warc-type", "warc-target-uri"] });

const files = [
{
Expand Down
6 changes: 3 additions & 3 deletions test/testUtils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ describe("utils", () => {
});

test("surt with space", () => {
expect(getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d ")).toBe(
"com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h="
);
expect(
getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d "),
).toBe("com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h=");
});
});
17 changes: 15 additions & 2 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@
resolved "https://registry.yarnpkg.com/@types/yargs-parser/-/yargs-parser-21.0.0.tgz#0c60e537fa790f5f9472ed2776c2b71ec117351b"
integrity sha512-iO9ZQHkZxHn4mSakYV0vFHAVDyEOIJQrV2uZ06HxEPcx+mt8swXoZHIbaaJ2crJYFfErySgktuTZ3BeLz+XmFA==

"@types/yargs@^17.0.17", "@types/yargs@^17.0.8":
"@types/yargs@^17.0.8":
version "17.0.17"
resolved "https://registry.yarnpkg.com/@types/yargs/-/yargs-17.0.17.tgz#5672e5621f8e0fca13f433a8017aae4b7a2a03e7"
integrity sha512-72bWxFKTK6uwWJAVT+3rF6Jo6RTojiJ27FQo8Rf60AL+VZbzoVPnMFhKsUnbjR8A3BTCYQ7Mv3hnl8T0A+CX9g==
Expand Down Expand Up @@ -3301,7 +3301,7 @@ yargs-parser@^21.0.1, yargs-parser@^21.1.1:
resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-21.1.1.tgz#9096bceebf990d21bb31fa9516e0ede294a77d35"
integrity sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==

yargs@^17.3.1, yargs@^17.6.2:
yargs@^17.3.1:
version "17.6.2"
resolved "https://registry.yarnpkg.com/yargs/-/yargs-17.6.2.tgz#2e23f2944e976339a1ee00f18c77fedee8332541"
integrity sha512-1/9UrdHjDZc0eOU0HxOHoS78C69UD3JRMvzlJ7S79S2nTaWRA/whGCTV8o9e/N/1Va9YIV7Q4sOxD8VV4pCWOw==
Expand All @@ -3314,6 +3314,19 @@ yargs@^17.3.1, yargs@^17.6.2:
y18n "^5.0.5"
yargs-parser "^21.1.1"

yargs@^17.7.2:
version "17.7.2"
resolved "https://registry.yarnpkg.com/yargs/-/yargs-17.7.2.tgz#991df39aca675a192b816e1e0363f9d75d2aa269"
integrity sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==
dependencies:
cliui "^8.0.1"
escalade "^3.1.1"
get-caller-file "^2.0.5"
require-directory "^2.1.1"
string-width "^4.2.3"
y18n "^5.0.5"
yargs-parser "^21.1.1"

yocto-queue@^0.1.0:
version "0.1.0"
resolved "https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-0.1.0.tgz#0294eb3dee05028d31ee1a5fa2c556a6aaf10a1b"
Expand Down

0 comments on commit 1e53bbb

Please sign in to comment.