-
Notifications
You must be signed in to change notification settings - Fork 3.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
dom/nodes/Document-characterSet-normalization.html has a ton of casing issues #2453
Comments
@annevk can you describe what should happen in this case, when one of the replacement labels is used? [save using UTF-8 without BOM] <!DOCTYPE html>
<html>
<head>
<meta charset="iso-2022-cn">
<title>Encoding test</title>
</head>
<body>
<script>
// Collect the three encoding-related Document attributes as display lines,
// then write them into the page in order.
var encodingReport = [
  "document.characterSet: " + document.characterSet + "<br>",
  "document.charset: " + document.charset + "<br>",
  "document.inputEncoding: " + document.inputEncoding
];
encodingReport.forEach(function(line) {
  document.write(line);
});
</script>
</body>
</html> When I use labels for UTF-16LE/BE or x-user-defined, they are mapped to UTF-8 and windows-1252 respectively, but for replacement labels browsers (Firefox and Chrome) return � as the content and report that the replacement encoding is used. Should this also be UTF-8, per "get an output encoding"? Here is an online link to a page whose document is encoded as UTF-8 but declares a different encoding in its meta element (read the encoding using right click and "View Page Info"): http://w3c-test.org/dom/nodes/encoding.py?label=csiso2022kr And another question around HTML: which algorithm has the final say on the encoding that is used, the encoding sniffing algorithm or decode from the Encoding spec? I ask because I see this: "Usually, the encoding sniffing algorithm defined below is used to determine the character encoding." "The document's character encoding must immediately be set to the value returned from this algorithm, at the same time as the user agent uses the returned value to select the decoder to use for the input byte stream." << description of the sniffing algorithm. So does the sniffing algorithm only return an encoding that is then passed to the decode algorithm (which may make some corrections), or does the final encoding come from sniffing? Basically I am trying to figure out where exactly in the HTML or Encoding spec the document's encoding is set when the input has a BOM, like here: [save using UTF-16LE with BOM] <!DOCTYPE html>
<html>
<head>
<meta charset="iso-8859-2">
<title>Encoding test</title>
</head>
<body>
<script>
// Collect the three encoding-related Document attributes as display lines,
// then write them into the page in order.
var encodingReport = [
  "document.characterSet: " + document.characterSet + "<br>",
  "document.charset: " + document.charset + "<br>",
  "document.inputEncoding: " + document.inputEncoding
];
encodingReport.forEach(function(line) {
  document.write(line);
});
</script>
</body>
</html> Browsers output |
"Get an output encoding" is only for encoding, not decoding. There's some cleanup to do around the HTML parser and how it sniffs encodings. But generally sniffing should happen before decode is invoked. (And optimizations that the user agent can make should end up not being part of the standard model, imo.) |
Hmm, but right now it's used in "prescan a byte stream to determine its encoding".
I wanted to fix it, but at the moment I cannot determine the correct behavior from the specification. Maybe we should wait until the specification is corrected. |
I see, can you file an issue on that, that does indeed seem wrong. |
whatwg/html#1077 and find regression. |
@annevk I fixed this code, please use it:
<!doctype html>
<title>document.characterSet (inputEncoding and charset as aliases) normalization tests</title>
<link rel=author title="Aryeh Gregor" href=mailto:ayg@aryeh.name>
<meta name=timeout content=long>
<div id=log></div>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<style>iframe { display: none }</style>
<script>
"use strict";
// Map from encoding name to the list of labels that resolve to that name.
// Names and labels are taken straight from https://encoding.spec.whatwg.org/
// Each key is the exact string the tests below expect document.characterSet
// (and its inputEncoding / charset aliases) to report for a document loaded
// with one of the labels in the corresponding list.
var encodingMap = {
"UTF-8": [
"unicode-1-1-utf-8",
"utf-8",
"utf8",
// As we use <meta>, utf-16 will map to utf-8 per
// https://html.spec.whatwg.org/multipage/#documentEncoding
"utf-16",
"utf-16le",
"utf-16be",
],
"IBM866": [
"866",
"cp866",
"csibm866",
"ibm866",
],
"ISO-8859-2": [
"csisolatin2",
"iso-8859-2",
"iso-ir-101",
"iso8859-2",
"iso88592",
"iso_8859-2",
"iso_8859-2:1987",
"l2",
"latin2",
],
"ISO-8859-3": [
"csisolatin3",
"iso-8859-3",
"iso-ir-109",
"iso8859-3",
"iso88593",
"iso_8859-3",
"iso_8859-3:1988",
"l3",
"latin3",
],
"ISO-8859-4": [
"csisolatin4",
"iso-8859-4",
"iso-ir-110",
"iso8859-4",
"iso88594",
"iso_8859-4",
"iso_8859-4:1988",
"l4",
"latin4",
],
"ISO-8859-5": [
"csisolatincyrillic",
"cyrillic",
"iso-8859-5",
"iso-ir-144",
"iso8859-5",
"iso88595",
"iso_8859-5",
"iso_8859-5:1988",
],
"ISO-8859-6": [
"arabic",
"asmo-708",
"csiso88596e",
"csiso88596i",
"csisolatinarabic",
"ecma-114",
"iso-8859-6",
"iso-8859-6-e",
"iso-8859-6-i",
"iso-ir-127",
"iso8859-6",
"iso88596",
"iso_8859-6",
"iso_8859-6:1987",
],
"ISO-8859-7": [
"csisolatingreek",
"ecma-118",
"elot_928",
"greek",
"greek8",
"iso-8859-7",
"iso-ir-126",
"iso8859-7",
"iso88597",
"iso_8859-7",
"iso_8859-7:1987",
"sun_eu_greek",
],
"ISO-8859-8": [
"csiso88598e",
"csisolatinhebrew",
"hebrew",
"iso-8859-8",
"iso-8859-8-e",
"iso-ir-138",
"iso8859-8",
"iso88598",
"iso_8859-8",
"iso_8859-8:1988",
"visual",
],
"ISO-8859-8-I": [
"csiso88598i",
"iso-8859-8-i",
"logical",
],
"ISO-8859-10": [
"csisolatin6",
"iso-8859-10",
"iso-ir-157",
"iso8859-10",
"iso885910",
"l6",
"latin6",
],
"ISO-8859-13": [
"iso-8859-13",
"iso8859-13",
"iso885913",
],
"ISO-8859-14": [
"iso-8859-14",
"iso8859-14",
"iso885914",
],
"ISO-8859-15": [
"csisolatin9",
"iso-8859-15",
"iso8859-15",
"iso885915",
"iso_8859-15",
"l9",
],
"ISO-8859-16": [
"iso-8859-16",
],
"KOI8-R": [
"cskoi8r",
"koi",
"koi8",
"koi8-r",
"koi8_r",
],
"KOI8-U": [
"koi8-ru",
"koi8-u",
],
"macintosh": [
"csmacintosh",
"mac",
"macintosh",
"x-mac-roman",
],
"windows-874": [
"dos-874",
"iso-8859-11",
"iso8859-11",
"iso885911",
"tis-620",
"windows-874",
],
"windows-1250": [
"cp1250",
"windows-1250",
"x-cp1250",
],
"windows-1251": [
"cp1251",
"windows-1251",
"x-cp1251",
],
"windows-1252": [
"ansi_x3.4-1968",
"ascii",
"cp1252",
"cp819",
"csisolatin1",
"ibm819",
"iso-8859-1",
"iso-ir-100",
"iso8859-1",
"iso88591",
"iso_8859-1",
"iso_8859-1:1987",
"l1",
"latin1",
"us-ascii",
"windows-1252",
"x-cp1252",
// As we use <meta>, x-user-defined will map to windows-1252 per
// https://html.spec.whatwg.org/multipage/#documentEncoding
"x-user-defined"
],
"windows-1253": [
"cp1253",
"windows-1253",
"x-cp1253",
],
"windows-1254": [
"cp1254",
"csisolatin5",
"iso-8859-9",
"iso-ir-148",
"iso8859-9",
"iso88599",
"iso_8859-9",
"iso_8859-9:1989",
"l5",
"latin5",
"windows-1254",
"x-cp1254",
],
"windows-1255": [
"cp1255",
"windows-1255",
"x-cp1255",
],
"windows-1256": [
"cp1256",
"windows-1256",
"x-cp1256",
],
"windows-1257": [
"cp1257",
"windows-1257",
"x-cp1257",
],
"windows-1258": [
"cp1258",
"windows-1258",
"x-cp1258",
],
"x-mac-cyrillic": [
"x-mac-cyrillic",
"x-mac-ukrainian",
],
"GBK": [
"chinese",
"csgb2312",
"csiso58gb231280",
"gb2312",
"gb_2312",
"gb_2312-80",
"gbk",
"iso-ir-58",
"x-gbk",
],
"gb18030": [
"gb18030",
],
"Big5": [
"big5",
"big5-hkscs",
"cn-big5",
"csbig5",
"x-x-big5",
],
"EUC-JP": [
"cseucpkdfmtjapanese",
"euc-jp",
"x-euc-jp",
],
"ISO-2022-JP": [
"csiso2022jp",
"iso-2022-jp",
],
"Shift_JIS": [
"csshiftjis",
"ms932",
"ms_kanji",
"shift-jis",
"shift_jis",
"sjis",
"windows-31j",
"x-sjis",
],
"EUC-KR": [
"cseuckr",
"csksc56011987",
"euc-kr",
"iso-ir-149",
"korean",
"ks_c_5601-1987",
"ks_c_5601-1989",
"ksc5601",
"ksc_5601",
"windows-949",
],
"replacement": [
"csiso2022kr",
"hz-gb-2312",
"iso-2022-cn",
"iso-2022-cn-ext",
"iso-2022-kr",
],
};
// Prepare the label lists that the tests are generated from.
//
// Only the labels exactly as listed in encodingMap are tested. Upper-cased,
// alternating-case (characters at even indices upper-cased) and
// whitespace-padded (" \t\n\f\r" on both sides) variants of each label are
// currently NOT added to the lists, so they are not computed here either.
Object.keys(encodingMap).forEach(function(name) {
  // Replace each list with a fresh array holding the same labels in the
  // same order.
  encodingMap[name] = encodingMap[name].slice();
});
/*
function expected_case(encoding_label) {
if (encoding_label === 'big5') {
return 'Big5';
}
if (encoding_label === 'shift_jis') {
return 'Shift_JIS';
}
return encoding_label.toUpperCase();
}
*/
// For every (name, label) pair, load "encoding.py?label=<label>" in a hidden
// iframe and register three async tests: the loaded document's characterSet,
// inputEncoding and charset attributes must each equal the encoding name.
Object.keys(encodingMap).forEach(function(name) {
  encodingMap[name].forEach(function(label) {
    // Builds the test title shared by the three checks; only the suffix
    // naming the attribute differs.
    var describe = function(suffix) {
      return "Name " + format_value(name) +
             " has label " + format_value(label) + suffix;
    };

    var frame = document.createElement("iframe");
    var checks = [
      { attribute: "characterSet",
        test: async_test(describe(" (characterSet)")) },
      { attribute: "inputEncoding",
        test: async_test(describe(" (inputEncoding)")) },
      { attribute: "charset",
        test: async_test(describe(" (charset)")) }
    ];

    // The document comes from the encoding.py resource rather than from a
    // data: URL or a Blob URL.
    frame.src = "encoding.py?label=" + label;

    frame.onload = function() {
      // Run every assertion inside its own test step so that one failing
      // attribute does not prevent the others from being checked.
      checks.forEach(function(check) {
        check.test.step(function() {
          assert_equals(frame.contentDocument[check.attribute], name);
        });
      });
      // Drop the iframe before completing the tests.
      document.body.removeChild(frame);
      checks.forEach(function(check) {
        check.test.done();
      });
    };

    document.body.appendChild(frame);
  });
});
</script>
<!-- vim: set expandtab tabstop=2 shiftwidth=2: --> |
@ArkadiuszMichalski could you create a PR for this change or do you find that too difficult? https://help.github.com/articles/creating-a-pull-request/ has some pointers. |
@annevk done. I don't have much practice with GitHub (yet), so any subsequent corrections, synchronization, etc. may be a problem, but that may not be necessary here. |
I see, if GitHub offers a client for your OS that might be a simple way to get started. I suspect there must be tutorials somewhere, but I don't know of any. |
E.g., it expects "
WINDOWS-1252
" which https://dom.spec.whatwg.org/#dom-document-characterset definitely does not define. The text was updated successfully, but these errors were encountered: