Skip to content

Commit

Permalink
statusandheaders: if setting a header fails on Headers object, retry …
Browse files Browse the repository at this point in the history
…with encoded header (#83)

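Instead of skipping headers whose values the Headers object rejects:
- re-encode the header value (its UTF-8 bytes mapped one-to-one onto latin-1 characters) for use with the Headers object
- decode it back to the original text when serializing
Fixes #81.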
  • Loading branch information
ikreymer authored Nov 11, 2024
1 parent f45d747 commit d20880d
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 20 deletions.
80 changes: 60 additions & 20 deletions src/lib/statusandheaders.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,33 @@ const decoder = new TextDecoder("utf-8");
export class StatusAndHeaders {
statusline: string;
headers: Map<string, string> | Headers;
private readonly reencodeHeaders;

/**
 * Stores the status line and header collection for later serialization.
 *
 * `reencodeHeaders` is optional; when supplied it is kept as-is and consulted
 * by `toString()` to decide which header values need decoding.
 */
constructor(init: {
  statusline: string;
  headers: Map<string, string> | Headers;
  reencodeHeaders?: Set<string>;
}) {
  this.statusline = init.statusline;
  this.headers = init.headers;
  this.reencodeHeaders = init.reencodeHeaders;
}

toString() {
const buff = [this.statusline];

const isHeaders = this.headers instanceof Headers;

for (const [name, value] of this.headers) {
buff.push(`${name}: ${value}`);
if (isHeaders && this.reencodeHeaders?.has(name)) {
buff.push(`${name}: ${latin1ToUTF(value)}`);
} else {
buff.push(`${name}: ${value}`);
}
}

return buff.join("\r\n") + "\r\n";
Expand Down Expand Up @@ -99,6 +109,8 @@ export class StatusAndHeaders {

// ===========================================================================
export class StatusAndHeadersParser {
reencodeHeaders = new Set<string>();

async parse(
reader: AsyncIterReader,
{
Expand All @@ -121,7 +133,6 @@ export class StatusAndHeadersParser {
}

const headers = new headersClass();
const canAppend = headers instanceof Headers;

const headerBuff = await readToDoubleCRLF(reader);

Expand All @@ -139,15 +150,7 @@ export class StatusAndHeadersParser {
.trimEnd();
} else {
if (value) {
try {
if (canAppend && name.toLowerCase() === "set-cookie") {
headers.append(name, value);
} else {
headers.set(name, value);
}
} catch (_e) {
// ignore
}
this.setHeader(name, value, headers);
value = null;
}

Expand All @@ -160,6 +163,7 @@ export class StatusAndHeadersParser {
value = headerBuff
.slice(valueStart, valueEnd < 0 ? undefined : valueEnd)
.trim();

} else {
value = null;
}
Expand All @@ -173,22 +177,40 @@ export class StatusAndHeadersParser {
}

if (value) {
try {
if (canAppend && name.toLowerCase() === "set-cookie") {
headers.append(name, value);
} else {
headers.set(name, value);
}
} catch (_e) {
// ignore
}
this.setHeader(name, value, headers);
}

return new StatusAndHeaders({
statusline,
headers,
reencodeHeaders: this.reencodeHeaders
});
}

/**
 * Stores one header in `headers`, retrying once with a re-encoded value if
 * the first attempt throws.
 *
 * For a `Headers` container, `set-cookie` is appended (so repeated cookies
 * are kept) and every other name is set. For a `Map`, the header is always
 * set under the name as given.
 *
 * When the store throws and this call is not already the retry, the value is
 * passed through `UTFToLatin1` and stored again with `reencoded = true`.
 * A successful retry on a `Headers` container records the lower-cased name
 * in `this.reencodeHeaders`. A failure of the retry itself is dropped.
 *
 * NOTE(review): `reencodeHeaders` is a single Set on the parser instance and
 * entries are keyed by header name only — confirm that reusing one parser
 * for several records, or mixing re-encoded and raw values under the same
 * name, cannot mark a raw value for decoding.
 *
 * @param name - header name as it appeared in the input
 * @param value - header value to store
 * @param headers - destination container
 * @param reencoded - true when `value` has already been re-encoded
 */
setHeader(
  name: string,
  value: string,
  headers: Headers | Map<string, string>,
  reencoded = false,
) {
  try {
    if (headers instanceof Headers) {
      const lowered = name.toLowerCase();
      if (lowered === "set-cookie") {
        headers.append(name, value);
      } else {
        headers.set(name, value);
      }
      // only remember names whose stored value is the re-encoded form
      if (reencoded) {
        this.reencodeHeaders.add(lowered);
      }
    } else {
      headers.set(name, value);
    }
  } catch (_e) {
    // the retry already failed: give up on this header
    if (reencoded) {
      return;
    }
    // first failure: try again with the value re-encoded as latin1
    this.setHeader(name, UTFToLatin1(value), headers, true);
  }
}
}

// ===========================================================================
Expand All @@ -202,6 +224,24 @@ function splitRemainder(str: string, sep: string, limit: number) {
return newParts;
}

// ===========================================================================
/**
 * Encodes `value` as UTF-8 and returns a string with one character per
 * encoded byte, each character's code equal to that byte (0-255).
 */
function UTFToLatin1(value: string) {
  const utf8Bytes = new TextEncoder().encode(value);
  return Array.from(utf8Bytes, (byte) => String.fromCharCode(byte)).join("");
}

// ===========================================================================
/**
 * Takes the low 8 bits of every UTF-16 code unit in `str` as a byte and
 * decodes the resulting byte sequence with a default `TextDecoder`.
 */
function latin1ToUTF(str: string) {
  // index by code unit (not code point) so the byte count equals str.length
  const bytes = new Uint8Array(str.length).map(
    (_, idx) => str.charCodeAt(idx) & 0xff,
  );
  return new TextDecoder().decode(bytes);
}

// ===========================================================================
export async function indexOfDoubleCRLF(
buffer: Uint8Array,
Expand Down
28 changes: 28 additions & 0 deletions test/testWARCParser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,34 @@ test("StatusAndHeaders test empty", async () => {
expect(result).toBe(null);
});

// Parses an HTTP header block whose Content-Location and Metadata values
// contain non-ASCII text (Cyrillic and Japanese) into a Headers object, then
// checks that toString() emits those values unchanged.
test("StatusAndHeaders test non-ascii", async () => {
const parser = new StatusAndHeadersParser();
// headersClass: Headers makes the parser store headers in a Headers object
// rather than a Map
const result = await parser.parse(
new AsyncIterReader(
getReader([
"\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Content-Location: https://example.com/example-испытание\r\n\
Other: Value\r\n\
Metadata: Test-メタデータ\r\n\
More: Values\r\n\
\r\n\
Body",
]),
),
{ headersClass: Headers },
);
// The expected serialization lists header names lower-cased and in
// alphabetical order, with the non-ASCII values intact; the "Body" text
// after the blank line is not part of it.
expect(result?.toString()).toBe(`\
HTTP/1.0 200 OK\r\n\
content-location: https://example.com/example-испытание\r\n\
content-type: ABC\r\n\
metadata: Test-メタデータ\r\n\
more: Values\r\n\
other: Value\r\n\
`);
});

test("Load WARC Records", async () => {
const input =
'\
Expand Down

0 comments on commit d20880d

Please sign in to comment.