Skip to content

Commit

Permalink
Support for multi-value headers (#84)
Browse files Browse the repository at this point in the history
- add new HeadersMultiMap to support set-cookie, as well as
warc-concurrent-to headers with multiple values (store internally as
map, convert to array for multi value headers, override iterator)
- update tests to check for multiple warc-concurrent-to
- also ensure multiple Set-Cookie works with case sensitive headers
- fixes #32
- ready to support warc-protocol from
iipc/warc-specifications#42
  • Loading branch information
ikreymer authored Nov 14, 2024
1 parent d20880d commit 1883c33
Show file tree
Hide file tree
Showing 8 changed files with 245 additions and 40 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "warcio",
"version": "2.4.0-beta.2",
"version": "2.4.0-beta.3",
"keywords": [
"WARC",
"web archiving"
Expand Down
1 change: 1 addition & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export {
NoConcatInflator,
StatusAndHeadersParser,
StatusAndHeaders,
HeadersMultiMap,
WARCParser,
WARCSerializer,
BaseSerializerBuffer,
Expand Down
1 change: 1 addition & 0 deletions src/lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ export {
mfdToQueryString,
concatChunks,
splitChunk,
HeadersMultiMap,
} from "./utils";

export type {
Expand Down
34 changes: 15 additions & 19 deletions src/lib/statusandheaders.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { concatChunks, splitChunk } from "./utils";
import { concatChunks, HeadersMultiMap, splitChunk } from "./utils";
import { type AsyncIterReader } from "./readers";

export const CRLF = new Uint8Array([13, 10]);
Expand All @@ -9,16 +9,16 @@ const decoder = new TextDecoder("utf-8");
// ===========================================================================
export class StatusAndHeaders {
statusline: string;
headers: Map<string, string> | Headers;
private readonly reencodeHeaders;
headers: HeadersMultiMap | Headers;
readonly reencodeHeaders?: Set<string>;

constructor({
statusline,
headers,
reencodeHeaders
reencodeHeaders,
}: {
statusline: string;
headers: Map<string, string> | Headers;
headers: HeadersMultiMap | Headers;
reencodeHeaders?: Set<string>;
}) {
this.statusline = statusline;
Expand Down Expand Up @@ -116,8 +116,11 @@ export class StatusAndHeadersParser {
{
headersClass,
firstLine,
}: { firstLine?: string; headersClass: typeof Map | typeof Headers } = {
headersClass: Map,
}: {
firstLine?: string;
headersClass: typeof HeadersMultiMap | typeof Headers;
} = {
headersClass: HeadersMultiMap,
},
) {
const fullStatusLine = firstLine ? firstLine : await reader.readline();
Expand Down Expand Up @@ -163,7 +166,6 @@ export class StatusAndHeadersParser {
value = headerBuff
.slice(valueStart, valueEnd < 0 ? undefined : valueEnd)
.trim();

} else {
value = null;
}
Expand All @@ -183,26 +185,20 @@ export class StatusAndHeadersParser {
return new StatusAndHeaders({
statusline,
headers,
reencodeHeaders: this.reencodeHeaders
reencodeHeaders: this.reencodeHeaders,
});
}

setHeader(
name: string,
value: string,
headers: Headers | Map<string, string>,
headers: Headers | HeadersMultiMap,
reencoded = false,
) {
try {
const isHeaders = headers instanceof Headers;
const nameLower = name.toLowerCase();
if (isHeaders && nameLower === "set-cookie") {
headers.append(name, value);
} else {
headers.set(name, value);
}
if (isHeaders && reencoded) {
this.reencodeHeaders.add(nameLower);
headers.append(name, value);
if (headers instanceof Headers && reencoded) {
this.reencodeHeaders.add(name.toLowerCase());
}
} catch (_e) {
if (!reencoded) {
Expand Down
58 changes: 58 additions & 0 deletions src/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,61 @@ export function splitChunk(
): [Uint8Array, Uint8Array] {
return [chunk.slice(0, inx), chunk.slice(inx)];
}

// ===========================================================================
// headers multi map
/** Lower-cased names of the headers that are allowed to carry several values. */
const MULTI_VALUE_ALLOWED = ["set-cookie", "warc-concurrent-to"];

// Separator used to pack the values of a multi-value header into one string.
// Something other than a comma is used to reduce the chance of any collisions
// with actual data; in theory, a collision is still possible with an arbitrary
// cookie value.
const JOIN_MARKER = ",,,";

/** Returns true when `name` (compared case-insensitively) may hold multiple values. */
function isMultiValueHeader(name: string): boolean {
  return MULTI_VALUE_ALLOWED.includes(name.toLowerCase());
}

/**
 * A case-preserving header map that supports multiple values for the headers
 * listed in `MULTI_VALUE_ALLOWED`.
 *
 * Values of a multi-value header are stored internally as a single string
 * joined with `JOIN_MARKER`. Only the default iterator (`for..of`, spread) and
 * `getMultiple()` split them back apart; `get()`, `entries()`, `values()` and
 * `forEach()` are inherited from `Map` unchanged and therefore expose the
 * joined form.
 *
 * Keys are stored exactly as given, so names differing only in case are
 * separate entries.
 */
export class HeadersMultiMap extends Map<string, string> {
  /**
   * @param headersInit Optional initial headers: an array of `[name, value]`
   *   pairs, a plain object of name/value properties, or any iterable of
   *   pairs such as a `Headers` instance. Every pair is added via `append()`,
   *   so repeated multi-value headers accumulate. Non-array items of an
   *   iterable are skipped.
   */
  constructor(headersInit?: HeadersInit) {
    super();
    if (!headersInit) {
      return;
    }

    // A Headers instance has no own enumerable properties, so it must be
    // iterated rather than passed through Object.entries(), which would
    // yield nothing and silently drop every header.
    const pairs: Iterable<unknown> =
      Symbol.iterator in headersInit
        ? (headersInit as Iterable<unknown>)
        : Object.entries(headersInit);

    for (const pair of pairs) {
      if (Array.isArray(pair)) {
        const [name, value] = pair as [string, string];
        this.append(name, value);
      }
    }
  }

  /**
   * Returns all values stored for `name`.
   *
   * @param name Header name, looked up with the exact case given.
   * @returns The individual values for a multi-value header, a one-element
   *   array for any other header, or `undefined` when the header is absent.
   *   A header that is present with an empty value yields `[""]`.
   */
  getMultiple(name: string): string[] | undefined {
    const value = super.get(name);
    // Compare against undefined explicitly so that a present-but-empty
    // header value is not mistaken for a missing header.
    if (value === undefined) {
      return undefined;
    }
    return isMultiValueHeader(name) ? value.split(JOIN_MARKER) : [value];
  }

  /**
   * Adds a header value. For multi-value headers the value is appended to any
   * existing ones; for every other header it replaces the previous value.
   *
   * @param name Header name (stored with the case given).
   * @param value Header value.
   */
  append(name: string, value: string) {
    if (!isMultiValueHeader(name)) {
      this.set(name, value);
      return;
    }
    const prev = this.get(name);
    this.set(name, prev !== undefined ? prev + JOIN_MARKER + value : value);
  }

  /**
   * Iterates `[name, value]` pairs in insertion order, yielding one pair per
   * individual value of a multi-value header.
   */
  override *[Symbol.iterator](): IterableIterator<[string, string]> {
    for (const [name, value] of super[Symbol.iterator]()) {
      if (isMultiValueHeader(name)) {
        for (const v of value.split(JOIN_MARKER)) {
          yield [name, v];
        }
      } else {
        yield [name, value];
      }
    }
  }
}
5 changes: 3 additions & 2 deletions src/lib/warcparser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
import { WARCRecord } from "./warcrecord";
import { AsyncIterReader, LimitReader } from "./readers";
import { type Source, type IndexerOffsetLength } from "./types";
import { HeadersMultiMap } from "./utils";

const decoder = new TextDecoder();
const EMPTY = new Uint8Array([]);
Expand All @@ -27,7 +28,7 @@ export class WARCParser implements IndexerOffsetLength {
_offset: number;
_warcHeadersLength: number;

_headersClass: typeof Map | typeof Headers;
_headersClass: typeof HeadersMultiMap | typeof Headers;
_parseHttp: boolean;

_reader: AsyncIterReader;
Expand All @@ -41,7 +42,7 @@ export class WARCParser implements IndexerOffsetLength {
this._offset = 0;
this._warcHeadersLength = 0;

this._headersClass = keepHeadersCase ? Map : Headers;
this._headersClass = keepHeadersCase ? HeadersMultiMap : Headers;
this._parseHttp = parseHttp;

if (!(source instanceof AsyncIterReader)) {
Expand Down
55 changes: 37 additions & 18 deletions src/lib/warcrecord.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import uuid from "uuid-random";
import { BaseAsyncIterReader, AsyncIterReader, LimitReader } from "./readers";
import { StatusAndHeaders } from "./statusandheaders";
import { type Source } from "./types";
import { HeadersMultiMap } from "./utils";

const decoder = new TextDecoder("utf-8");
const encoder = new TextEncoder();
Expand Down Expand Up @@ -37,7 +38,7 @@ export type WARCRecordOpts = {
url?: string;
date?: string;
type?: WARCType;
warcHeaders?: Record<string, string>;
warcHeaders?: Record<string, string> | [string, string][];
filename?: string;
httpHeaders?: HeadersInit;
statusline?: string;
Expand Down Expand Up @@ -79,39 +80,43 @@ export class WARCRecord extends BaseAsyncIterReader {

date = checkDate(date || new Date().toISOString());

warcHeaders = { ...warcHeaders };
const warcHeadersMap = new HeadersMultiMap(warcHeaders);

if (type === "warcinfo") {
if (filename) {
warcHeaders["WARC-Filename"] = filename;
warcHeadersMap.set("WARC-Filename", filename);
}
} else if (url) {
try {
warcHeaders["WARC-Target-URI"] = new URL(url).href;
warcHeadersMap.set("WARC-Target-URI", new URL(url).href);
} catch (_e) {
warcHeaders["WARC-Target-URI"] = url;
warcHeadersMap.set("WARC-Target-URI", url);
}
}

warcHeaders["WARC-Date"] = date;
warcHeadersMap.set("WARC-Date", date);

if (type) {
warcHeaders["WARC-Type"] = type;
warcHeadersMap.set("WARC-Type", type);
}

if (type === "revisit") {
warcHeaders["WARC-Profile"] =
warcVersion === WARC_1_1 ? REVISIT_PROFILE_1_1 : REVISIT_PROFILE_1_0;
warcHeadersMap.set(
"WARC-Profile",
warcVersion === WARC_1_1 ? REVISIT_PROFILE_1_1 : REVISIT_PROFILE_1_0,
);
if (refersToUrl) {
warcHeaders["WARC-Refers-To-Target-URI"] = refersToUrl;
warcHeaders["WARC-Refers-To-Date"] = checkDate(
refersToDate || new Date().toISOString(),
warcHeadersMap.set("WARC-Refers-To-Target-URI", refersToUrl);
warcHeadersMap.set(
"WARC-Refers-To-Date",
checkDate(refersToDate || new Date().toISOString()),
);
}
}

const warcHeadersObj = new StatusAndHeaders({
statusline: warcVersion,
headers: new Map(Object.entries(warcHeaders)),
headers: warcHeadersMap,
});

if (!warcHeadersObj.headers.get("WARC-Record-ID")) {
Expand All @@ -130,19 +135,24 @@ export class WARCRecord extends BaseAsyncIterReader {
}

const record = new WARCRecord({ warcHeaders: warcHeadersObj, reader });
let headers: Map<string, string> | Headers | null = null;
let entries: [string, string][] = [];
let headers: HeadersMultiMap | Headers | null = null;
let isEmpty = false;

switch (type) {
case "response":
case "request":
case "revisit":
entries = Object.entries(httpHeaders);
headers = keepHeadersCase ? new Map(entries) : new Headers(httpHeaders);
if (keepHeadersCase) {
headers = new HeadersMultiMap(httpHeaders);
isEmpty = !headers.size;
} else {
headers = new Headers(httpHeaders);
isEmpty = !Object.entries(httpHeaders).length;
}

// for revisit records, if there are no http headers, don't add statusline
// for other request/response, add an empty statusline-only block
if (entries.length > 0 || type !== "revisit") {
if (!isEmpty || type !== "revisit") {
record.httpHeaders = new StatusAndHeaders({ statusline, headers });
}
break;
Expand Down Expand Up @@ -395,6 +405,15 @@ export class WARCRecord extends BaseAsyncIterReader {
get warcContentLength() {
return Number(this.warcHeaders.headers.get("Content-Length"));
}

/**
 * The values of the record's `WARC-Concurrent-To` header(s).
 *
 * When the WARC headers are a `HeadersMultiMap`, this is the result of
 * `getMultiple()`, which is `undefined` if the header is absent. Otherwise
 * the single combined value returned by `get()` is split on commas, and an
 * empty array is returned if the header is absent.
 */
get warcConcurrentTo() {
  const { headers } = this.warcHeaders;
  if (headers instanceof HeadersMultiMap) {
    return headers.getMultiple("WARC-Concurrent-To");
  }
  const combined = headers.get("WARC-Concurrent-To");
  // Headers.get() combines repeated values with ", " (comma + space), so each
  // piece is trimmed to avoid a leading space on every value after the first.
  return combined ? combined.split(",").map((id) => id.trim()) : [];
}
}

// ===========================================================================
Expand Down
Loading

0 comments on commit 1883c33

Please sign in to comment.