Skip to content

Commit

Permalink
util: improve unicode support
Browse files Browse the repository at this point in the history
The array grouping function relies on the width of the characters.
It was not calculated correct so far, since it used the string
length instead.
This improves the unicode output by calculating the mono-spaced
font width (other fonts might differ).

PR-URL: nodejs#31319
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Steven R Loomis <srloomis@us.ibm.com>
Reviewed-By: Rich Trott <rtrott@gmail.com>
Reviewed-By: Minwoo Jung <nodecorelab@gmail.com>
  • Loading branch information
BridgeAR committed Jan 22, 2020
1 parent 2606e1e commit 8fb5fe2
Show file tree
Hide file tree
Showing 11 changed files with 211 additions and 192 deletions.
2 changes: 1 addition & 1 deletion lib/internal/cli_table.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ const {
ObjectPrototypeHasOwnProperty,
} = primordials;

const { getStringWidth } = require('internal/readline/utils');
const { getStringWidth } = require('internal/util/inspect');

// The use of Unicode characters below is the only non-comment use of non-ASCII
// Unicode characters in Node.js built-in modules. If they are ever removed or
Expand Down
117 changes: 0 additions & 117 deletions lib/internal/readline/utils.js
Original file line number Diff line number Diff line change
@@ -1,25 +1,13 @@
'use strict';

const {
RegExp,
Symbol,
} = primordials;

// Regex used for ansi escape code splitting
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
// Matches all ansi escape code sequences in a string
const ansiPattern = '[\\u001B\\u009B][[\\]()#;?]*' +
'(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)' +
'|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))';
const ansi = new RegExp(ansiPattern, 'g');

const kUTF16SurrogateThreshold = 0x10000; // 2 ** 16
const kEscape = '\x1b';
const kSubstringSearch = Symbol('kSubstringSearch');

let getStringWidth;

function CSI(strings, ...args) {
let ret = `${kEscape}[`;
for (let n = 0; n < strings.length; n++) {
Expand Down Expand Up @@ -59,109 +47,6 @@ function charLengthAt(str, i) {
return str.codePointAt(i) >= kUTF16SurrogateThreshold ? 2 : 1;
}

if (internalBinding('config').hasIntl) {
const icu = internalBinding('icu');
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
// TODO(BridgeAR): Expose the options to the user. That is probably the
// best thing possible at the moment, since it's difficult to know what
// the receiving end supports.
getStringWidth = function getStringWidth(str) {
let width = 0;
str = stripVTControlCharacters(str);
for (let i = 0; i < str.length; i++) {
// Try to avoid calling into C++ by first handling the ASCII portion of
// the string. If it is fully ASCII, we skip the C++ part.
const code = str.charCodeAt(i);
if (code >= 127) {
width += icu.getStringWidth(str.slice(i));
break;
}
width += code >= 32 ? 1 : 0;
}
return width;
};
} else {
/**
* Returns the number of columns required to display the given string.
*/
getStringWidth = function getStringWidth(str) {
let width = 0;

str = stripVTControlCharacters(str);

for (const char of str) {
const code = char.codePointAt(0);
if (isFullWidthCodePoint(code)) {
width += 2;
} else if (!isZeroWidthCodePoint(code)) {
width++;
}
}

return width;
};

/**
* Returns true if the character represented by a given
* Unicode code point is full-width. Otherwise returns false.
*/
const isFullWidthCodePoint = (code) => {
// Code points are partially derived from:
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
return code >= 0x1100 && (
code <= 0x115f || // Hangul Jamo
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
(code >= 0x3250 && code <= 0x4dbf) ||
// CJK Unified Ideographs .. Yi Radicals
(code >= 0x4e00 && code <= 0xa4c6) ||
// Hangul Jamo Extended-A
(code >= 0xa960 && code <= 0xa97c) ||
// Hangul Syllables
(code >= 0xac00 && code <= 0xd7a3) ||
// CJK Compatibility Ideographs
(code >= 0xf900 && code <= 0xfaff) ||
// Vertical Forms
(code >= 0xfe10 && code <= 0xfe19) ||
// CJK Compatibility Forms .. Small Form Variants
(code >= 0xfe30 && code <= 0xfe6b) ||
// Halfwidth and Fullwidth Forms
(code >= 0xff01 && code <= 0xff60) ||
(code >= 0xffe0 && code <= 0xffe6) ||
// Kana Supplement
(code >= 0x1b000 && code <= 0x1b001) ||
// Enclosed Ideographic Supplement
(code >= 0x1f200 && code <= 0x1f251) ||
// Miscellaneous Symbols and Pictographs .. Emoticons
(code >= 0x1f300 && code <= 0x1f64f) ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
(code >= 0x20000 && code <= 0x3fffd)
);
};

const isZeroWidthCodePoint = (code) => {
return code <= 0x1F || // C0 control codes
(code > 0x7F && code <= 0x9F) || // C1 control codes
(code >= 0x0300 && code <= 0x036F) || // Combining Diacritical Marks
(code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
(code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
(code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
(code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors
};
}

/**
* Tries to remove all VT control characters. Use to estimate displayed
* string width. May be buggy due to not running a real state machine
*/
function stripVTControlCharacters(str) {
return str.replace(ansi, '');
}

/*
Some patterns seen in terminal key escape codes, derived from combos seen
at http://www.midnight-commander.org/browser/lib/tty/key.c
Expand Down Expand Up @@ -477,8 +362,6 @@ module.exports = {
charLengthLeft,
commonPrefix,
emitKeys,
getStringWidth,
kSubstringSearch,
stripVTControlCharacters,
CSI
};
6 changes: 4 additions & 2 deletions lib/internal/repl/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,13 @@ const {

const {
commonPrefix,
getStringWidth,
kSubstringSearch,
} = require('internal/readline/utils');

const { inspect } = require('util');
const {
getStringWidth,
inspect,
} = require('internal/util/inspect');

const debug = require('internal/util/debuglog').debuglog('repl');

Expand Down
131 changes: 122 additions & 9 deletions lib/internal/util/inspect.js
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,17 @@ const meta = [
'\\x98', '\\x99', '\\x9A', '\\x9B', '\\x9C', '\\x9D', '\\x9E', '\\x9F', // x9F
];

// Regex used for ansi escape code splitting
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
// Matches all ansi escape code sequences in a string
const ansiPattern = '[\\u001B\\u009B][[\\]()#;?]*' +
'(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)' +
'|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))';
const ansi = new RegExp(ansiPattern, 'g');

let getStringWidth;

function getUserOptions(ctx) {
return {
stylize: ctx.stylize,
Expand Down Expand Up @@ -1154,7 +1165,7 @@ function groupArrayElements(ctx, output, value) {
// entries length of all output entries. We have to remove colors first,
// otherwise the length would not be calculated properly.
for (; i < outputLength; i++) {
const len = ctx.colors ? removeColors(output[i]).length : output[i].length;
const len = getStringWidth(output[i], ctx.colors);
dataLen[i] = len;
totalLength += len + separatorSpace;
if (maxLength < len)
Expand Down Expand Up @@ -1197,8 +1208,6 @@ function groupArrayElements(ctx, output, value) {
if (columns <= 1) {
return output;
}
// TODO(BridgeAR): Add unicode support. Use the readline getStringWidth
// function.
const tmp = [];
const maxLineLength = [];
for (let i = 0; i < columns; i++) {
Expand Down Expand Up @@ -1565,11 +1574,8 @@ function formatProperty(ctx, value, recurseTimes, key, type, desc) {
const diff = (ctx.compact !== true || type !== kObjectType) ? 2 : 3;
ctx.indentationLvl += diff;
str = formatValue(ctx, desc.value, recurseTimes);
if (diff === 3) {
const len = ctx.colors ? removeColors(str).length : str.length;
if (ctx.breakLength < len) {
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
}
if (diff === 3 && ctx.breakLength < getStringWidth(str, ctx.colors)) {
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
}
ctx.indentationLvl -= diff;
} else if (desc.get !== undefined) {
Expand Down Expand Up @@ -1889,9 +1895,116 @@ function formatWithOptionsInternal(inspectOptions, ...args) {
return str;
}

if (internalBinding('config').hasIntl) {
const icu = internalBinding('icu');
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
// TODO(BridgeAR): Expose the options to the user. That is probably the
// best thing possible at the moment, since it's difficult to know what
// the receiving end supports.
getStringWidth = function getStringWidth(str, removeControlChars = true) {
let width = 0;
if (removeControlChars)
str = stripVTControlCharacters(str);
for (let i = 0; i < str.length; i++) {
// Try to avoid calling into C++ by first handling the ASCII portion of
// the string. If it is fully ASCII, we skip the C++ part.
const code = str.charCodeAt(i);
if (code >= 127) {
width += icu.getStringWidth(str.slice(i));
break;
}
width += code >= 32 ? 1 : 0;
}
return width;
};
} else {
/**
* Returns the number of columns required to display the given string.
*/
getStringWidth = function getStringWidth(str, removeControlChars = true) {
let width = 0;

if (removeControlChars)
str = stripVTControlCharacters(str);

for (const char of str) {
const code = char.codePointAt(0);
if (isFullWidthCodePoint(code)) {
width += 2;
} else if (!isZeroWidthCodePoint(code)) {
width++;
}
}

return width;
};

/**
* Returns true if the character represented by a given
* Unicode code point is full-width. Otherwise returns false.
*/
const isFullWidthCodePoint = (code) => {
// Code points are partially derived from:
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
return code >= 0x1100 && (
code <= 0x115f || // Hangul Jamo
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
(code >= 0x3250 && code <= 0x4dbf) ||
// CJK Unified Ideographs .. Yi Radicals
(code >= 0x4e00 && code <= 0xa4c6) ||
// Hangul Jamo Extended-A
(code >= 0xa960 && code <= 0xa97c) ||
// Hangul Syllables
(code >= 0xac00 && code <= 0xd7a3) ||
// CJK Compatibility Ideographs
(code >= 0xf900 && code <= 0xfaff) ||
// Vertical Forms
(code >= 0xfe10 && code <= 0xfe19) ||
// CJK Compatibility Forms .. Small Form Variants
(code >= 0xfe30 && code <= 0xfe6b) ||
// Halfwidth and Fullwidth Forms
(code >= 0xff01 && code <= 0xff60) ||
(code >= 0xffe0 && code <= 0xffe6) ||
// Kana Supplement
(code >= 0x1b000 && code <= 0x1b001) ||
// Enclosed Ideographic Supplement
(code >= 0x1f200 && code <= 0x1f251) ||
// Miscellaneous Symbols and Pictographs 0x1f300 - 0x1f5ff
// Emoticons 0x1f600 - 0x1f64f
(code >= 0x1f300 && code <= 0x1f64f) ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
(code >= 0x20000 && code <= 0x3fffd)
);
};

const isZeroWidthCodePoint = (code) => {
return code <= 0x1F || // C0 control codes
(code > 0x7F && code <= 0x9F) || // C1 control codes
(code >= 0x300 && code <= 0x36F) || // Combining Diacritical Marks
(code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
(code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
(code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
(code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors
};
}

/**
* Remove all VT control characters. Use to estimate displayed string width.
*/
function stripVTControlCharacters(str) {
return str.replace(ansi, '');
}

module.exports = {
inspect,
format,
formatWithOptions,
inspectDefaultOptions
getStringWidth,
inspectDefaultOptions,
stripVTControlCharacters
};
8 changes: 5 additions & 3 deletions lib/readline.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,19 @@ const {
ERR_INVALID_OPT_VALUE
} = require('internal/errors').codes;
const { validateString } = require('internal/validators');
const { inspect } = require('internal/util/inspect');
const {
inspect,
getStringWidth,
stripVTControlCharacters,
} = require('internal/util/inspect');
const EventEmitter = require('events');
const {
charLengthAt,
charLengthLeft,
commonPrefix,
CSI,
emitKeys,
getStringWidth,
kSubstringSearch,
stripVTControlCharacters
} = require('internal/readline/utils');

const { clearTimeout, setTimeout } = require('timers');
Expand Down
Loading

0 comments on commit 8fb5fe2

Please sign in to comment.