Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support indexing request WARC records: #82

Merged
merged 7 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "warcio",
"version": "2.3.1",
"version": "2.4.0-beta.0",
"keywords": [
"WARC",
"web archiving"
Expand Down
5 changes: 5 additions & 0 deletions src/commands/args.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ export const cdxIndexCommandArgs = (yarg: yargs.Argv) => {
describe:
"Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)",
type: "boolean",
})
.option("fields", {
alias: "f",
describe: "fields to include in index",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
describe: "fields to include in index",
describe: "comma-separated list of fields to include in index",

Since we're not using yargs' array type, it might be good to be explicit about the expected format of the input.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, maybe this should use the array type; I'll take a look.

type: "string",
});
};

Expand Down
3 changes: 3 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ export {
Indexer,
CDXIndexer,
CDXAndRecordIndexer,
DEFAULT_FIELDS,
DEFAULT_CDX_FIELDS,
DEFAULT_LEGACY_CDX_FIELDS,
postToGetUrl,
getSurt,
appendRequestQuery,
Expand Down
9 changes: 8 additions & 1 deletion src/lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ export type { WARCSerializerOpts } from "./warcserializer";
export { WARCRecord, WARC_1_1, WARC_1_0 } from "./warcrecord";
export type { WARCRecordOpts, WARCType } from "./warcrecord";

export { Indexer, CDXIndexer, CDXAndRecordIndexer } from "./indexer";
export {
Indexer,
CDXIndexer,
CDXAndRecordIndexer,
DEFAULT_FIELDS,
DEFAULT_CDX_FIELDS,
DEFAULT_LEGACY_CDX_FIELDS,
} from "./indexer";

export {
postToGetUrl,
Expand Down
56 changes: 44 additions & 12 deletions src/lib/indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,28 @@ import {
type IndexerOffsetLength,
} from "./types";

const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"];
export const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"];

// ===========================================================================
abstract class BaseIndexer {
opts: Partial<IndexCommandArgs>;
fields: string[];
reqFields: string[];
parseHttp: boolean;

constructor(opts: Partial<IndexCommandArgs> = {}) {
constructor(
opts: Partial<IndexCommandArgs> = {},
defaultFields: string[] = DEFAULT_FIELDS,
) {
this.opts = opts;
this.fields = opts.fields ? opts.fields.split(",") : DEFAULT_FIELDS;
if (opts.fields) {
this.fields =
typeof opts.fields === "string" ? opts.fields.split(",") : opts.fields;
this.reqFields = this.fields.filter((x) => isRequestHeader(x));
} else {
this.fields = defaultFields;
this.reqFields = [];
}
this.parseHttp = false;
}

Expand Down Expand Up @@ -109,6 +120,15 @@ abstract class BaseIndexer {
field: string,
record: WARCRecord,
): string | number | null | undefined {
// only handle req. fields for 'request' records
if (field.startsWith("req.")) {
if (record.warcType === "request") {
field = field.slice(4);
} else {
return null;
}
}

if (field === "http:status") {
if (
record.httpHeaders &&
Expand Down Expand Up @@ -136,8 +156,8 @@ abstract class BaseIndexer {

// ===========================================================================
export class Indexer extends BaseIndexer {
constructor(opts?: Partial<IndexCommandArgs>) {
super(opts);
constructor(opts?: Partial<IndexCommandArgs>, defaultFields?: string[]) {
super(opts, defaultFields);

for (const field of this.fields) {
if (field.startsWith("http:")) {
Expand All @@ -149,9 +169,9 @@ export class Indexer extends BaseIndexer {
}

// ===========================================================================
const DEFAULT_CDX_FIELDS =
export const DEFAULT_CDX_FIELDS =
"urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(",");
const DEFAULT_LEGACY_CDX_FIELDS =
export const DEFAULT_LEGACY_CDX_FIELDS =
"urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(
",",
);
Expand All @@ -172,10 +192,9 @@ export class CDXIndexer extends Indexer {
_lastRecord: WARCRecord | null;

constructor(opts?: Partial<CdxIndexCommandArgs>) {
super(opts);
super(opts, DEFAULT_CDX_FIELDS);
this.includeAll = Boolean(opts?.all);
this.overrideIndexForAll = Boolean(opts?.all);
this.fields = DEFAULT_CDX_FIELDS;
this.parseHttp = true;
this.noSurt = Boolean(opts?.noSurt);
this._lastRecord = null;
Expand Down Expand Up @@ -322,6 +341,12 @@ export class CDXIndexer extends Indexer {
if (requestBody) {
res["requestBody"] = requestBody;
}

if (reqRecord && this.reqFields.length) {
for (const field of this.reqFields) {
this.setField(field, reqRecord, res);
}
}
}

return res;
Expand All @@ -334,12 +359,12 @@ export class CDXIndexer extends Indexer {
delete result["timestamp"];

// eslint-disable-next-line @typescript-eslint/no-explicit-any
const replacer = (key: string, value: any) : any => {
const replacer = (key: string, value: any): any => {
if (["offset", "length", "status"].includes(key)) {
return value === null || value === undefined ? "" : "" + value;
}
return value;
}
};

return `${urlkey} ${timestamp} ${JSON.stringify(result, replacer)}\n`;
}
Expand Down Expand Up @@ -389,12 +414,15 @@ export class CDXIndexer extends Indexer {
case "status":
return super.getField("http:status", record);

case "referrer":
return super.getField("req.http:referer", record);

case "digest":
value = record.warcPayloadDigest;
return value ? value.split(":", 2)[1] : null;

default:
return null;
return super.getField(field, record);
}
}
}
Expand All @@ -416,3 +444,7 @@ export class CDXAndRecordIndexer extends CDXIndexer {
return cdx && { cdx, record, reqRecord };
}
}

/**
 * Reports whether an index field name denotes a request-side field.
 *
 * A name qualifies when it begins with the literal, case-sensitive prefix
 * `req.`, or when it equals `referrer` compared case-insensitively.
 *
 * @param header - the index field name to classify
 * @returns `true` for a request-side field name, `false` otherwise
 */
export function isRequestHeader(header: string) {
  // Explicitly prefixed request fields, e.g. "req.http:cookie".
  if (header.startsWith("req.")) {
    return true;
  }

  // "referrer" is accepted in any letter case.
  return header.toLowerCase() === "referrer";
}
10 changes: 7 additions & 3 deletions src/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export function postToGetUrl(request: Request) {
return false;
}

const getContentType = (headers: Headers | Map<string, string>) : string => {
const getContentType = (headers: Headers | Map<string, string>): string => {
const ct = headers.get("content-type");
if (ct) {
return ct;
Expand All @@ -75,7 +75,7 @@ export function postToGetUrl(request: Request) {
}
}
return "";
}
};

const contentType = getContentType(headers);

Expand Down Expand Up @@ -124,7 +124,11 @@ export function postToGetUrl(request: Request) {
}

if (query != null) {
request.url = appendRequestQuery(request.url, decodeURI(query), request.method);
request.url = appendRequestQuery(
request.url,
decodeURI(query),
request.method,
);
request.method = "GET";
request.requestBody = query;
return true;
Expand Down
35 changes: 30 additions & 5 deletions test/testIndexer.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import fs from "fs";
import { jest } from "@jest/globals";
import { main } from "../src/commands";
import { Indexer, CDXIndexer, CDXAndRecordIndexer } from "../src/lib";
import {
Indexer,
CDXIndexer,
CDXAndRecordIndexer,
DEFAULT_CDX_FIELDS,
} from "../src/lib";
import { WritableStreamBuffer } from "stream-buffers";

function get_warc_path(filename: string) {
Expand Down Expand Up @@ -90,6 +95,21 @@ com,example)/ 20170306040348 {"url":"http://example.com/","mime":"warc/revisit",
);
});

test("cdxj warc.gz with referrer", async () => {
await index(
[
"cdx-index",
get_warc_path("data/example.warc.gz"),
"--fields",
[...DEFAULT_CDX_FIELDS, "referrer"].join(","),
],
`\
com,example)/ 20170306040206 {"url":"http://example.com/","mime":"text/html","status":"200","digest":"G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK","length":"1228","offset":"784","filename":"example.warc.gz","referrer":"https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/"}
com,example)/ 20170306040348 {"url":"http://example.com/","mime":"warc/revisit","status":"200","digest":"G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK","length":"586","offset":"2621","filename":"example.warc.gz","referrer":"https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/"}
`,
);
});

test("cdx11 warc.gz", async () => {
await index(
["cdx-index", get_warc_path("data/example.warc.gz"), "--format", "cdx"],
Expand Down Expand Up @@ -154,11 +174,16 @@ com,example)/ 20170306040348 http://example.com/ warc/revisit 200 G7HRM7BGOKSKMS

test("post append", async () => {
await index(
["cdx-index", get_warc_path("data/post-test.warc.gz")],
[
"cdx-index",
get_warc_path("data/post-test.warc.gz"),
"--fields",
[...DEFAULT_CDX_FIELDS, "req.http:cookie"].join(","),
],
`\
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc"}
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3"}
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^"}
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc","req.http:cookie":"Max-Age=3600; Path=/"}
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3","req.http:cookie":"Max-Age=3600; Path=/"}
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^","req.http:cookie":"Max-Age=3600; Path=/"}
`,
);
});
Expand Down
6 changes: 3 additions & 3 deletions test/testUtils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ describe("utils", () => {
});

test("surt with space", () => {
expect(getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d ")).toBe(
"com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h="
);
expect(
getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d "),
).toBe("com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h=");
});
});
Loading