From 57cb40a434adce391216bbdde31e780b1b9d6f71 Mon Sep 17 00:00:00 2001 From: Koki Kato Date: Thu, 18 Apr 2019 01:38:29 +0900 Subject: [PATCH] Fix word separation (#3667) * Fix word separation * Add tests * Add Latin-1 punctuations in printable char table * Fix terminology * Add more tests * Classify keyword characters as punctuations * Add more test * Rename `table` to `symbolTable` --- src/common/motion/position.ts | 78 ++++++++++++++--------------------- test/motion.test.ts | 36 +++++++++++++++- 2 files changed, 65 insertions(+), 49 deletions(-) diff --git a/src/common/motion/position.ts b/src/common/motion/position.ts index 6aa2d7e80c5..65e3f9fbc6a 100644 --- a/src/common/motion/position.ts +++ b/src/common/motion/position.ts @@ -895,45 +895,7 @@ export class Position extends vscode.Position { return result; } - private static makeUnicodeWordRegex(characterSet: string): RegExp { - const segments = [ - // ASCII word characters (in many cases 0-9A-Za-z_) - // and non-word characters - ...Position.makeAsciiWordSegments(characterSet), - - // Unicode characters (punctuations, ideographs, ...) - ...Position.makeUnicodeWordSegments(), - - // Other spelling characters (Greek, ...) 
- '\\S+', - - '$^', - ]; - const result = new RegExp(segments.join('|'), 'ug'); - return result; - } - - private static makeAsciiWordSegments(nonWordChars: string): string[] { - const nonWordCodes = nonWordChars - .split('') - .sort() - .map(c => c.codePointAt(0)!); - nonWordCodes.push(0x7f); // guard - const wordChars: string[] = []; - let wordCode = 0x21; - for (let nonWordCode of nonWordCodes) { - for (; wordCode < nonWordCode; wordCode++) { - wordChars.push(String.fromCharCode(wordCode)); - } - wordCode = nonWordCode + 1; - } - - const wordSegment = `([${wordChars.join('')}]+)`; - const nonWordSegment = `[${_.escapeRegExp(nonWordChars).replace(/-/g, '\\-')}]+`; - return [wordSegment, nonWordSegment]; - } - - private static makeUnicodeWordSegments(): string[] { + private static makeUnicodeWordRegex(keywordChars: string): RegExp { // Distinct categories of characters enum CharKind { Punctuation, @@ -946,10 +908,11 @@ export class Position extends vscode.Position { Hangul, } + // List of printable characters (code point intervals) and their character kinds. + // Latin alphabets (e.g., ASCII alphabets and numbers, Latin-1 Supplement, European Latin) are excluded. // Imported from utf_class_buf in src/mbyte.c of Vim. - // Spelling alphabets are not listed here since they are covered as non-white letters. 
- // TODO(ajalab): add Emoji - const codePointRanges: [[number, number], CharKind][] = [ + const symbolTable: [[number, number], CharKind][] = [ + [[0x00a1, 0x00bf], CharKind.Punctuation], // Latin-1 punctuation [[0x037e, 0x037e], CharKind.Punctuation], // Greek question mark [[0x0387, 0x0387], CharKind.Punctuation], // Greek ano teleia [[0x055a, 0x055f], CharKind.Punctuation], // Armenian punctuation @@ -1013,23 +976,42 @@ export class Position extends vscode.Position { [[0x2f800, 0x2fa1f], CharKind.Ideograph], // CJK Ideographs ]; - const fragments: string[][] = []; + const codePointRangePatterns: string[][] = []; for (let kind in CharKind) { if (!isNaN(Number(kind))) { - fragments[kind] = []; + codePointRangePatterns[kind] = []; } } - for (let [[first, last], kind] of codePointRanges) { + for (let [[first, last], kind] of symbolTable) { if (first === last) { // '\u{hhhh}' - fragments[kind].push(`\\u{${first.toString(16)}}`); + codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}`); } else { // '\u{hhhh}-\u{hhhh}' - fragments[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`); + codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`); } } - return fragments.map(patterns => `([${patterns.join('')}]+)`); + + // Symbols in vim.iskeyword or editor.wordSeparators + // are treated as CharKind.Punctuation + const escapedKeywordChars = _.escapeRegExp(keywordChars).replace(/-/g, '\\-'); + codePointRangePatterns[Number(CharKind.Punctuation)].push(escapedKeywordChars); + + const codePointRanges = codePointRangePatterns.map(patterns => patterns.join('')); + const symbolSegments = codePointRanges.map(range => `([${range}]+)`); + + // wordSegment matches word characters. 
+ // A word character is a symbol which is neither + // - space + // - a symbol listed in the table + // - a keyword (vim.iskeyword) + const wordSegment = `([^\\s${codePointRanges.join('')}]+)`; + + // https://regex101.com/r/X1agK6/2 + const segments = symbolSegments.concat(wordSegment, '$^'); + const regexp = new RegExp(segments.join('|'), 'ug'); + return regexp; } private getAllPositions(line: string, regex: RegExp): number[] { diff --git a/test/motion.test.ts b/test/motion.test.ts index 12d6c16173e..188bfb5b7fb 100644 --- a/test/motion.test.ts +++ b/test/motion.test.ts @@ -353,7 +353,13 @@ suite('word motion', () => { }); suite('unicode word motion', () => { - let text: Array<string> = ['漢字ひらがなカタカナalphabets、いろいろな文字。', 'Καλημέρα κόσμε']; + let text: Array<string> = [ + '漢字ひらがなカタカナalphabets、いろいろな文字。', + 'Καλημέρα κόσμε', + 'Die früh sich einst dem trüben Blick gezeigt.', + 'Được tiếp đãi ân cần', + '100£and100$and100¥#♯x', + ]; suiteSetup(() => { return setupWorkspace().then(() => { @@ -387,6 +393,28 @@ suite('unicode word motion', () => { assert.equal(motion.line, 1); assert.equal(motion.character, 9); }); + + test('move cursor word right recognizes a latin string which has diacritics as a single word', () => { + let motion = new Position(2, 4).getWordRight(); + assert.equal(motion.line, 2); + assert.equal(motion.character, 9); + }); + + test('move cursor word right recognizes a latin-1 symbol as punctuation', () => { + let motion = new Position(4, 3).getWordRight(); + assert.equal(motion.line, 4); + assert.equal(motion.character, 4); + + motion = motion.getWordRight(); // issue #3680 + assert.equal(motion.line, 4); + assert.equal(motion.character, 10); + }); + + test('move cursor word right recognizes a sequence of latin-1 symbols and other symbols as a word', () => { + let motion = new Position(4, 17).getWordRight(); + assert.equal(motion.line, 4); + assert.equal(motion.character, 20); + }); }); suite('word left', () => { @@ -413,6 +441,12 @@ suite('unicode word motion', () => 
{ assert.equal(motion.line, 1); assert.equal(motion.character, 9); }); + + test('move cursor word left recognizes a latin string which has diacritics as a single word', () => { + let motion = new Position(3, 10).getWordLeft(); + assert.equal(motion.line, 3); + assert.equal(motion.character, 5); + }); }); });