diff --git a/src/lib/statusandheaders.ts b/src/lib/statusandheaders.ts index f40787c..a101b94 100644 --- a/src/lib/statusandheaders.ts +++ b/src/lib/statusandheaders.ts @@ -10,23 +10,33 @@ const decoder = new TextDecoder("utf-8"); export class StatusAndHeaders { statusline: string; headers: Map | Headers; + private readonly reencodeHeaders; constructor({ statusline, headers, + reencodeHeaders }: { statusline: string; headers: Map | Headers; + reencodeHeaders?: Set; }) { this.statusline = statusline; this.headers = headers; + this.reencodeHeaders = reencodeHeaders; } toString() { const buff = [this.statusline]; + const isHeaders = this.headers instanceof Headers; + for (const [name, value] of this.headers) { - buff.push(`${name}: ${value}`); + if (isHeaders && this.reencodeHeaders?.has(name)) { + buff.push(`${name}: ${latin1ToUTF(value)}`); + } else { + buff.push(`${name}: ${value}`); + } } return buff.join("\r\n") + "\r\n"; @@ -99,6 +109,8 @@ export class StatusAndHeaders { // =========================================================================== export class StatusAndHeadersParser { + reencodeHeaders = new Set(); + async parse( reader: AsyncIterReader, { @@ -121,7 +133,6 @@ export class StatusAndHeadersParser { } const headers = new headersClass(); - const canAppend = headers instanceof Headers; const headerBuff = await readToDoubleCRLF(reader); @@ -139,15 +150,7 @@ export class StatusAndHeadersParser { .trimEnd(); } else { if (value) { - try { - if (canAppend && name.toLowerCase() === "set-cookie") { - headers.append(name, value); - } else { - headers.set(name, value); - } - } catch (_e) { - // ignore - } + this.setHeader(name, value, headers); value = null; } @@ -160,6 +163,7 @@ export class StatusAndHeadersParser { value = headerBuff .slice(valueStart, valueEnd < 0 ? undefined : valueEnd) .trim(); + } else { value = null; } @@ -173,22 +177,40 @@ export class StatusAndHeadersParser { } if (value) { - try { - if (canAppend && name.toLowerCase() === "set-cookie") { - headers.append(name, value); - } else { - headers.set(name, value); - } - } catch (_e) { - // ignore - } + this.setHeader(name, value, headers); } return new StatusAndHeaders({ statusline, headers, + reencodeHeaders: this.reencodeHeaders }); } + + setHeader( + name: string, + value: string, + headers: Headers | Map, + reencoded = false, + ) { + try { + const isHeaders = headers instanceof Headers; + const nameLower = name.toLowerCase(); + if (isHeaders && nameLower === "set-cookie") { + headers.append(name, value); + } else { + headers.set(name, value); + } + if (isHeaders && reencoded) { + this.reencodeHeaders.add(nameLower); + } + } catch (_e) { + if (!reencoded) { + // if haven't reencoded already, try reencoding as latin1 before saving + this.setHeader(name, UTFToLatin1(value), headers, true); + } + } + } } // =========================================================================== @@ -202,6 +224,24 @@ function splitRemainder(str: string, sep: string, limit: number) { return newParts; } +// =========================================================================== +function UTFToLatin1(value: string) { + const buf = new TextEncoder().encode(value); + + let str = ""; + buf.forEach((x) => (str += String.fromCharCode(x))); + return str; +} + +// =========================================================================== +function latin1ToUTF(str: string) { + const buf = new Uint8Array(str.length); + for (let i = 0; i < str.length; i++) { + buf[i] = str.charCodeAt(i) & 0xff; + } + return new TextDecoder().decode(buf); +} + // =========================================================================== export async function indexOfDoubleCRLF( buffer: Uint8Array, diff --git a/test/testWARCParser.test.ts b/test/testWARCParser.test.ts index e930e17..291e9ef 100644 --- a/test/testWARCParser.test.ts +++ b/test/testWARCParser.test.ts @@ -102,6 +102,34 @@ test("StatusAndHeaders test empty", async () => { expect(result).toBe(null); }); +test("StatusAndHeaders test non-ascii", async () => { + const parser = new StatusAndHeadersParser(); + const result = await parser.parse( + new AsyncIterReader( + getReader([ + "\ +HTTP/1.0 200 OK\r\n\ +Content-Type: ABC\r\n\ +Content-Location: https://example.com/example-испытание\r\n\ +Other: Value\r\n\ +Metadata: Test-メタデータ\r\n\ +More: Values\r\n\ +\r\n\ +Body", + ]), + ), + { headersClass: Headers }, + ); + expect(result?.toString()).toBe(`\ +HTTP/1.0 200 OK\r\n\ +content-location: https://example.com/example-испытание\r\n\ +content-type: ABC\r\n\ +metadata: Test-メタデータ\r\n\ +more: Values\r\n\ +other: Value\r\n\ +`); +}); + test("Load WARC Records", async () => { const input = '\