Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fetch fails on ByteString header with non-ASCII chars #1317

Closed
AlttiRi opened this issue Mar 31, 2022 · 13 comments · Fixed by #1560
Closed

fetch fails on ByteString header with non-ASCII chars #1317

AlttiRi opened this issue Mar 31, 2022 · 13 comments · Fixed by #1560
Labels
bug Something isn't working

Comments

@AlttiRi
Copy link

AlttiRi commented Mar 31, 2022

fetch fails on ByteString header if the header contains char codes 128 and over.

The code:

server.js
import http from "http";

const host = "localhost";
const port = 8000;
const server = http.createServer(requestListener);
server.listen(port, host, () => {
    console.log(`Server is running on http://${host}:${port}`);
});

const names = [
    `rock&roll.png`,
    `rock'n'roll.png`,
    `image — copy (1).png`,
    `_圖片_🖼_image_.png`,
    `100 % loading&perf.png`,
];

const CD1 = str2BStr(`inline; filename=${names[0]}`);
const CD2 = str2BStr(`inline; filename="${names[1]}"`);
const CD3 = str2BStr(`inline; filename="${names[2]}"; filename*=UTF-8''${encodeURIComponent(names[2])}`);
const CD4 = str2BStr(`inline; filename="${names[3]}"; filename*=UTF-8''${encodeURIComponent(names[3])}`);
const CD5 = str2BStr(`inline; filename="${names[4]}"; filename*=UTF-8''${encodeURIComponent(names[4])}`);


function requestListener(req, res) {
    res.setHeader("Content-Type", "text/html; charset=utf-8");
    res.setHeader("Content-Disposition-1", CD1);
    res.setHeader("Content-Disposition-2", CD2);
    res.setHeader("Content-Disposition-3", CD3);
    res.setHeader("Content-Disposition-4", CD4);
    res.setHeader("Content-Disposition-5", CD5);
    res.writeHead(200);
    res.end(names.map(name => `<li>${name}</li>`).join(""));
}


// --- Util ---
function str2BStr(string) {
    return arrayBufferToBinaryString(new TextEncoder().encode(string));
}
function bSrt2Str(bString) {
    return new TextDecoder().decode(binaryStringToArrayBuffer(bString));
}
function arrayBufferToBinaryString(arrayBuffer) {
    return arrayBuffer.reduce((accumulator, byte) => accumulator + String.fromCharCode(byte), "");
}
function binaryStringToArrayBuffer(binaryString) {
    const u8Array = new Uint8Array(binaryString.length);
    for (let i = 0; i < binaryString.length; i++) {
        u8Array[i] = binaryString.charCodeAt(i);
    }
    return u8Array;
}
client.js
import {fetch} from "undici";
// import fetch from "node-fetch";

let result = [...(await fetch("http://localhost:8000/", {method: "head"})).headers.entries()]
    .filter(([k, v]) => k.startsWith("content-disposition"))
    .map(([k, v]) => v)
console.log(result);
console.log(result.map(v => v.length));


// import {contentDispositionFilename} from "./util.js";
// console.log(result.map(v => contentDispositionFilename(v)));
util.js
export {contentDispositionFilename};

/* Using:
const cd = response.headers.get("content-disposition");
const name = contentDispositionFilename(cd);
 */
// RFC 5987:
// [1] inline; filename="file.jpg"; filename*=UTF-8''file.jpg
// Quoted:
// [2] inline; filename="file.jpg"
// Without quotes:
// [3] attachment; filename=file.jpg
//
// `filename=` in rare cases may be also encoded as URIComponent
function contentDispositionFilename(headerByteString, decode = false) {
    if (!headerByteString) {
        return;
    }

    const headerString = byteStringToString(headerByteString);
    if (headerString !== headerByteString) {
        console.log("contentDispositionHeaderByteString:", headerByteString);
        console.log("contentDispositionHeaderString:",     headerString);
    }

    let result;
    const encodedFilename = headerString.match(/(?<=filename\*=UTF-8'')[^;]+(?=;?$)/)?.[0]; // [1]
    if (encodedFilename) {
        result = decodeURIComponent(encodedFilename);
    } else {
        const filename = headerString.match(/(?<=filename=").+(?="$)/)?.[0] // [2]
                      || headerString.match(/(?<=filename=).+$/)[0];        // [3]

        if (decode) {
            result = decodeURIComponent(filename);
        } else {
            result = filename;
        }
    }
    console.log("contentDispositionFilename:", result);

    return result;
}
function byteStringToString(byteString) {
    const chars = [...byteString];
    const isBinaryString = chars.every(ch => ch.charCodeAt(0) < 256);
    if (isBinaryString) {
        console.log("isBinaryString", isBinaryString);

        // It's optional, just to skip `TextDecoder.decode`
        const isASCII = chars.every(ch => ch.charCodeAt(0) < 128);
        if (isASCII) {
            console.log("isASCII", isBinaryString);
            return byteString;
        }

        const bytes = new Uint8Array(chars.map(ch => ch.charCodeAt(0)));
        return new TextDecoder().decode(bytes);
    }
    return byteString;
}

Result

...\node_modules\undici\lib\fetch\index.js:186
        Object.assign(new TypeError('fetch failed'), { cause: response.error })
                      ^

TypeError: fetch failed
    at Object.processResponse (...\node_modules\undici\lib\fetch\index.js:186:23)
    at Fetch.fetchFinale (...\node_modules\undici\lib\fetch\index.js:940:17)
    at Fetch.mainFetch (...\node_modules\undici\lib\fetch\index.js:747:17)
    at processTicksAndRejections (node:internal/process/task_queues:96:5) {
  cause: TypeError [ERR_INVALID_CHAR]: Invalid character in header content ["Content-Disposition-3"]
      at normalizeAndValidateHeaderValue (...\node_modules\undici\lib\fetch\headers.js:48:3)
      at HeadersList.append (...\node_modules\undici\lib\fetch\headers.js:98:29)
      at Headers.append (...\node_modules\undici\lib\fetch\headers.js:220:31)
      at Object.onHeaders (...\node_modules\undici\lib\fetch\index.js:1844:21)
      at Request.onHeaders (...\node_modules\undici\lib\core\request.js:176:27)
      at Parser.onHeadersComplete (...\node_modules\undici\lib\client.js:859:23)
      at wasm_on_headers_complete (...\node_modules\undici\lib\client.js:458:30)
      at wasm://wasm/0002afd2:wasm-function[11]:0x427
      at wasm://wasm/0002afd2:wasm-function[44]:0x8ad
      at wasm://wasm/0002afd2:wasm-function[56]:0x5c62 {
    code: 'ERR_INVALID_CHAR'
  }
}

The expected result

Deno:
image

Browser console:
image

Browser:
image

Deno, with 2 last lines uncommented:
image

Node.js v17.5.0
undici v4.16.0

BTW, HTTP headers are ByteString (BinaryString).

https://developer.mozilla.org/en-US/docs/Web/API/DOMString/Binary
https://webidl.spec.whatwg.org/#idl-ByteString

Hm... someone have deleted the article about ByteString:
https://web.archive.org/web/20210608032047/https://developer.mozilla.org/en-US/docs/Web/API/ByteString
and replaced ByteString with String:
https://web.archive.org/web/20210731105134/https://developer.mozilla.org/en-US/docs/Web/API/Headers/get
That's not OK.

@AlttiRi AlttiRi added the bug Something isn't working label Mar 31, 2022
@AlttiRi
Copy link
Author

AlttiRi commented Mar 31, 2022

The online link for a quick look:
Rock & roll 音楽 («🎵🎶»).png
https://xenforo.com/community/attachments/rock-roll-音楽-«🎵🎶»-png.266784/

¡«£»ÿ.png
https://xenforo.com/community/attachments/¡«£»ÿ-png.266785/
It "works", but the header is not the same (byte equal as from the native fetch and other libs).
It "works" only because it's Latin 1, which is subset of UTF-8. (ASCII is also subset of UTF-8, however in case ASCII toByteString(ascii) === ascii)

But headers are not UTF-8 characters.
So, this const key = headersList[n + 1].toString() is wrong. (Buffer.toString by default uses "utf8")
Headers are UTF-8 bytes within a String object, or "some other 8-bit-per-code-unit encoding, although this is not required" [link] — is this case character out of ASCII will be platform's language dependent (different results for different local charset).

Headers are not UTF-8 characters.
Headers are UTF-8 bytes within a String object.

It's correct.
You can use headersList[n + 1].toString("utf8"), but it may return a String with non Latin 1 characters — with 256 char codes and more, so normalizeAndValidateHeaderValue will fail, obviously.
You can do it, but you must not, because headers are ByteString. And such string will pass normalizeAndValidateHeaderValue successfully.


UPD: Oops, these links requires an account).
So, if you will use them with undici.fetch don't forget about cookie.
(If you have an account, or just use the my demo server above.)


import {fetch as undiciFetch} from "undici";
import nodeFetch from "node-fetch";

// const headers = {
//     "cookie": "..." // It's required
// };

const url = "https://xenforo.com/community/attachments/¡«£»ÿ-png.266785/";
const nodeFetchCdHeader = (await nodeFetch(url, {headers})).headers.get("content-disposition");
const undiciFetchCdHeader = (await undiciFetch(url, {headers})).headers.get("content-disposition");
const toString = bStr => Buffer.from(bStr, "binary").toString("utf8");

console.log(nodeFetchCdHeader.length);      // OK (86)
console.log(nodeFetchCdHeader);             // OK (..."¡«£»ÿ.png"...)
console.log(toString(nodeFetchCdHeader));   // OK (..."¡«£»ÿ.png"...)

console.log(undiciFetchCdHeader.length);    // Not OK (81)
console.log(undiciFetchCdHeader);           // Not OK (..."¡«£»ÿ.png"...)
console.log(toString(undiciFetchCdHeader)); // Not OK (..."�����.png"...)

const url2 = "https://xenforo.com/community/attachments/rock-roll-音楽-«🎵🎶»-png.266784/";
await nodeFetch(url2, {headers});   // OK
await undiciFetch(url2, {headers}); // Error

Also you can test it in the browser's console:

function bSrt2Str(bString) {
    return new TextDecoder().decode(binaryStringToArrayBuffer(bString));
}
function binaryStringToArrayBuffer(binaryString) {
    const u8Array = new Uint8Array(binaryString.length);
    for (let i = 0; i < binaryString.length; i++) {
        u8Array[i] = binaryString.charCodeAt(i);
    }
    return u8Array;
}
// on https://xenforo.com/community/attachments/¡«£»ÿ-png.266785/ page
const fetchCdHeader = (await fetch("")).headers.get("Content-Disposition");
console.log(fetchCdHeader.length);    // (86)
console.log(fetchCdHeader);           // (..."¡«£»ÿ.png"...)
console.log(bSrt2Str(fetchCdHeader)); // (..."¡«£»ÿ.png"...)

@AlttiRi
Copy link
Author

AlttiRi commented Mar 31, 2022

undici/lib/fetch/index.js

Lines 1950 to 1951 in a211b18

const key = headersList[n + 0].toString()
const val = headersList[n + 1].toString()

The fix:

            const key = headersList[n + 0].toString('binary')
            const val = headersList[n + 1].toString('binary')

Since headersList is Buffer[].

Thx to console.log, since it has no JSDocs and dispatcher.d.ts is wrong:

onHeaders?(statusCode: number, headers: string[] | null, resume: () => void): boolean;

@ronag
Copy link
Member

ronag commented Apr 1, 2022

@Ethan-Arrowood

@Ethan-Arrowood
Copy link
Collaborator

@AlttiRi would you like to send your fix as a PR?

@AlttiRi
Copy link
Author

AlttiRi commented Apr 5, 2022

The additional note about undici.request, but not about undici.fetch:

undici.request returns headers as a String — it automatically converts UTF-8 bytes to a String.
For a library that is not going to be "fetch compatible" it's probably OK.
However, I think you should at least document this detail.

Setting header with and non-Latin1 (and non-ASCII) encoding in undici.request is allowed (no errors are thrown), but it produces a wrong result (I can't parse on the server the header received from undici.request correctly), I need to manually convert the header string to UTF-8 ByteString before I put it to undici.request.

It's looks strange.
It (undici.request) automatically converts utf-8 bytes to String on reading, but it does not convert a String to utf-8 bytes on header setting. Well, I can work with it.
Although in some cases it can be even useful if some library creates and returns a header in ByteString form, for example.

Just a note.

@AlttiRi
Copy link
Author

AlttiRi commented Apr 5, 2022

Also you can take a look at this issue:
nodejs/node#42579
There I wrote in more details about the ByteString header representation.

@ShogunPanda
Copy link
Contributor

Please see my comment on nodejs/node#42579 - I think they also apply for undici.

@AlttiRi
Copy link
Author

AlttiRi commented Apr 6, 2022

Not for this issue, definitely.

This issue is about that undici.fetch produces the different results, or does not work at all with real URLs in comparison with browsers fetch/xhr, node-fetch, deno's fetch.

@AlttiRi
Copy link
Author

AlttiRi commented Apr 21, 2022

So, what?

I just can't use undici.fetch in my media downloader because of:

cause: TypeError [ERR_INVALID_CHAR]: Invalid character in header content ["Content-Disposition"]

(And because of possible wrong names too.)


BTW, my pull request was automatically removed because of I have archived the forked repo?
I don't more see it here https://github.com/nodejs/undici/pulls.

fix(fetch): return headers as is (as ByteString)

Return headers in the same form as it do browsers, node-fetch, Deno's fetch.

  • Fix the error when a header contains UTF-8 bytes of a text with char codes are over 255.
  • Fix the different result for a header with non-ASCII character(s) when the header contains bytes of some 8-bits encoding.

The related issue: #1317

Note: headersList type is Buffer[].

image


UPD.
Okay, I have just removed the fork at all.
Do the pull request by yourself.

@mcollina
Copy link
Member

What does UPD mean?

Given you are not interested in fixing this, could we close the issue then?

@AlttiRi
Copy link
Author

AlttiRi commented Apr 21, 2022

What does UPD mean?

"Update"/"Edit". (I have appended the additional information by post editing)

Given you are not interested in fixing this

I'm very interested in fixing this. This bug is critical.


I just removed my fork because my pull request has disappeared here from the list (that was unexpectedly for me).
Well, anyway it were just 2 lines of changes, so anyone can recreate it instead without any problem.

Also I'm not interesting in a "contribution" by sending pull requests.
I'm only interested in this bug fixing (It's why I created this issue and spent a lot of time by testing it.).

@AlttiRi
Copy link
Author

AlttiRi commented Jul 19, 2022

Oh, was it fixed?

Finally, I will be able to use this library in a downloader program.

BTW, a real life (with no authentication needed) URL for tests:
https://web.archive.org/web/1/https://download1084.mediafire.com/z7j89kwbqrzg/gphbkhjqqx8aesq/✅⏳🔥💧.txt

(Web Archive is just for URL safety, it correctly returns the same headers as the original service (Mediafire).)

Currently 5.8.0 fails on this URL:

cause: TypeError: Argument is not a ByteString
      at webidl.converters.ByteString (...\node_modules\undici\lib\fetch\webidl.js:403:11)
      at Headers.append (...\node_modules\undici\lib\fetch\headers.js:201:31)
      at Object.onHeaders (...\node_modules\undici\lib\fetch\index.js:1955:21)
      at Request.onHeaders (...\node_modules\undici\lib\core\request.js:229:27)
...

I'm waiting for the next release with this bug fixed.

@AlttiRi
Copy link
Author

AlttiRi commented Aug 25, 2022

5.10.0 works as expected:

import {fetch} from "undici";


// Use this persistent Web Archive URL,
// or take the direct URL for your IP here: 
// https://www.mediafire.com/file/gphbkhjqqx8aesq/%E2%9C%85%E2%8F%B3%F0%9F%94%A5%F0%9F%92%A7.txt
// from the "DOWNLOAD (14B)" button element
const url = "https://web.archive.org/web/1/https://download1084.mediafire.com/z7j89kwbqrzg/gphbkhjqqx8aesq/✅⏳🔥💧.txt";

const resp = await fetch(url);

if (!resp.ok) { // status === 503 if "Internet Archive services are temporarily offline."
    throw "status: " + resp.status;
}

const cdHeader = Buffer.from(resp.headers.get("content-disposition"), "latin1").toString();
const {filename} = cdHeader.match(/filename="(?<filename>.+)"/)?.groups || {};
console.log(filename);

console.log(filename === "✅⏳🔥💧.txt");      // true
console.log(await resp.text() === "✅⏳🔥💧"); // true

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

Successfully merging a pull request may close this issue.

5 participants