From 57cb40a434adce391216bbdde31e780b1b9d6f71 Mon Sep 17 00:00:00 2001
From: Koki Kato <koki.kato1994@gmail.com>
Date: Thu, 18 Apr 2019 01:38:29 +0900
Subject: [PATCH] Fix word separation (#3667)

* Fix word separation

* Add tests

* Add Latin-1 punctuation to printable char table

* Fix terminology

* Add more tests

* Classify keyword characters as punctuations

* Add more tests

* Rename `table` to `symbolTable`
---
 src/common/motion/position.ts | 78 ++++++++++++++---------------------
 test/motion.test.ts           | 36 +++++++++++++++-
 2 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/src/common/motion/position.ts b/src/common/motion/position.ts
index 6aa2d7e80c5..65e3f9fbc6a 100644
--- a/src/common/motion/position.ts
+++ b/src/common/motion/position.ts
@@ -895,45 +895,7 @@ export class Position extends vscode.Position {
     return result;
   }
 
-  private static makeUnicodeWordRegex(characterSet: string): RegExp {
-    const segments = [
-      // ASCII word characters (in many cases 0-9A-Za-z_)
-      // and non-word characters
-      ...Position.makeAsciiWordSegments(characterSet),
-
-      // Unicode characters (punctuations, ideographs, ...)
-      ...Position.makeUnicodeWordSegments(),
-
-      // Other spelling characters (Greek, ...)
-      '\\S+',
-
-      '$^',
-    ];
-    const result = new RegExp(segments.join('|'), 'ug');
-    return result;
-  }
-
-  private static makeAsciiWordSegments(nonWordChars: string): string[] {
-    const nonWordCodes = nonWordChars
-      .split('')
-      .sort()
-      .map(c => c.codePointAt(0)!);
-    nonWordCodes.push(0x7f); // guard
-    const wordChars: string[] = [];
-    let wordCode = 0x21;
-    for (let nonWordCode of nonWordCodes) {
-      for (; wordCode < nonWordCode; wordCode++) {
-        wordChars.push(String.fromCharCode(wordCode));
-      }
-      wordCode = nonWordCode + 1;
-    }
-
-    const wordSegment = `([${wordChars.join('')}]+)`;
-    const nonWordSegment = `[${_.escapeRegExp(nonWordChars).replace(/-/g, '\\-')}]+`;
-    return [wordSegment, nonWordSegment];
-  }
-
-  private static makeUnicodeWordSegments(): string[] {
+  private static makeUnicodeWordRegex(keywordChars: string): RegExp {
     // Distinct categories of characters
     enum CharKind {
       Punctuation,
@@ -946,10 +908,11 @@ export class Position extends vscode.Position {
       Hangul,
     }
 
+    // List of printable characters (code point intervals) and their character kinds.
+    // Latin letters (e.g., ASCII letters and digits, Latin-1 Supplement letters, European Latin) are excluded.
     // Imported from utf_class_buf in src/mbyte.c of Vim.
-    // Spelling alphabets are not listed here since they are covered as non-white letters.
-    // TODO(ajalab): add Emoji
-    const codePointRanges: [[number, number], CharKind][] = [
+    const symbolTable: [[number, number], CharKind][] = [
+      [[0x00a1, 0x00bf], CharKind.Punctuation], // Latin-1 punctuation
       [[0x037e, 0x037e], CharKind.Punctuation], // Greek question mark
       [[0x0387, 0x0387], CharKind.Punctuation], // Greek ano teleia
       [[0x055a, 0x055f], CharKind.Punctuation], // Armenian punctuation
@@ -1013,23 +976,42 @@ export class Position extends vscode.Position {
       [[0x2f800, 0x2fa1f], CharKind.Ideograph], // CJK Ideographs
     ];
 
-    const fragments: string[][] = [];
+    const codePointRangePatterns: string[][] = [];
     for (let kind in CharKind) {
       if (!isNaN(Number(kind))) {
-        fragments[kind] = [];
+        codePointRangePatterns[kind] = [];
       }
     }
 
-    for (let [[first, last], kind] of codePointRanges) {
+    for (let [[first, last], kind] of symbolTable) {
       if (first === last) {
         // '\u{hhhh}'
-        fragments[kind].push(`\\u{${first.toString(16)}}`);
+        codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}`);
       } else {
         // '\u{hhhh}-\u{hhhh}'
-        fragments[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
+        codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
       }
     }
-    return fragments.map(patterns => `([${patterns.join('')}]+)`);
+
+    // Symbols in vim.iskeyword or editor.wordSeparators
+    // are treated as CharKind.Punctuation
+    const escapedKeywordChars = _.escapeRegExp(keywordChars).replace(/-/g, '\\-');
+    codePointRangePatterns[Number(CharKind.Punctuation)].push(escapedKeywordChars);
+
+    const codePointRanges = codePointRangePatterns.map(patterns => patterns.join(''));
+    const symbolSegments = codePointRanges.map(range => `([${range}]+)`);
+
+    // wordSegment matches a run of word characters.
+    // A word character is any character that is none of:
+    // - whitespace
+    // - a symbol listed in the table
+    // - a character in keywordChars (vim.iskeyword)
+    const wordSegment = `([^\\s${codePointRanges.join('')}]+)`;
+
+    // https://regex101.com/r/X1agK6/2
+    const segments = symbolSegments.concat(wordSegment, '$^');
+    const regexp = new RegExp(segments.join('|'), 'ug');
+    return regexp;
   }
 
   private getAllPositions(line: string, regex: RegExp): number[] {
diff --git a/test/motion.test.ts b/test/motion.test.ts
index 12d6c16173e..188bfb5b7fb 100644
--- a/test/motion.test.ts
+++ b/test/motion.test.ts
@@ -353,7 +353,13 @@ suite('word motion', () => {
 });
 
 suite('unicode word motion', () => {
-  let text: Array<string> = ['漢字ひらがなカタカナalphabets、いろいろな文字。', 'Καλημέρα κόσμε'];
+  let text: Array<string> = [
+    '漢字ひらがなカタカナalphabets、いろいろな文字。',
+    'Καλημέρα κόσμε',
+    'Die früh sich einst dem trüben Blick gezeigt.',
+    'Được tiếp đãi ân cần',
+    '100£and100$and100¥#♯x',
+  ];
 
   suiteSetup(() => {
     return setupWorkspace().then(() => {
@@ -387,6 +393,28 @@ suite('unicode word motion', () => {
       assert.equal(motion.line, 1);
       assert.equal(motion.character, 9);
     });
+
+    test('move cursor word right recognizes a latin string which has diacritics as a single word', () => {
+      let motion = new Position(2, 4).getWordRight();
+      assert.equal(motion.line, 2);
+      assert.equal(motion.character, 9);
+    });
+
+    test('move cursor word right recognizes a latin-1 symbol as punctuation', () => {
+      let motion = new Position(4, 3).getWordRight();
+      assert.equal(motion.line, 4);
+      assert.equal(motion.character, 4);
+
+      motion = motion.getWordRight(); // issue #3680
+      assert.equal(motion.line, 4);
+      assert.equal(motion.character, 10);
+    });
+
+    test('move cursor word right recognizes a sequence of latin-1 symbols and other symbols as a word', () => {
+      let motion = new Position(4, 17).getWordRight();
+      assert.equal(motion.line, 4);
+      assert.equal(motion.character, 20);
+    });
   });
 
   suite('word left', () => {
@@ -413,6 +441,12 @@ suite('unicode word motion', () => {
       assert.equal(motion.line, 1);
       assert.equal(motion.character, 9);
     });
+
+    test('move cursor word left recognizes a latin string which has diacritics as a single word', () => {
+      let motion = new Position(3, 10).getWordLeft();
+      assert.equal(motion.line, 3);
+      assert.equal(motion.character, 5);
+    });
   });
 });