Skip to content

Commit

Permalink
Fix word separation (#3667)
Browse files Browse the repository at this point in the history
* Fix word separation

* Add tests

* Add Latin-1 punctuation to the printable char table

* Fix terminology

* Add more tests

* Classify keyword characters as punctuations

* Add more tests

* Rename `table` to `symbolTable`
  • Loading branch information
ajalab authored and jpoon committed Apr 17, 2019
1 parent 6307efb commit 57cb40a
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 49 deletions.
78 changes: 30 additions & 48 deletions src/common/motion/position.ts
Original file line number Diff line number Diff line change
Expand Up @@ -895,45 +895,7 @@ export class Position extends vscode.Position {
return result;
}

/**
 * Compiles the global, Unicode-aware regex whose matches are the individual
 * "words" of a line.
 *
 * The alternatives are tried in this order:
 *  1. the ASCII word / non-word segments derived from `characterSet`,
 *  2. the Unicode segments (punctuation, ideographs, ...),
 *  3. `\S+` for any other run of non-whitespace (e.g. Greek letters),
 *  4. `$^`, which can only match an empty input.
 *
 * @param characterSet ASCII characters to treat as non-word characters.
 * @returns A RegExp built with the `u` and `g` flags.
 */
private static makeUnicodeWordRegex(characterSet: string): RegExp {
  const asciiAlternatives = Position.makeAsciiWordSegments(characterSet);
  const unicodeAlternatives = Position.makeUnicodeWordSegments();

  // Earlier alternatives take precedence, so the catch-all patterns go last.
  const pattern = asciiAlternatives.concat(unicodeAlternatives, '\\S+', '$^').join('|');
  return new RegExp(pattern, 'ug');
}

/**
 * Builds the two regex alternatives that split printable ASCII into runs of
 * word characters and runs of non-word characters.
 *
 * A word character is any code point in 0x21..0x7e that does not occur in
 * `nonWordChars`. Entries of `nonWordChars` outside that range (whitespace,
 * control or non-ASCII characters) never affect the word class; they are
 * only kept in the non-word class.
 *
 * @param nonWordChars Characters to be matched as non-word characters.
 * @returns `[wordSegment, nonWordSegment]`: a capturing group matching one or
 *   more word characters, and a character class matching one or more of the
 *   given non-word characters.
 */
private static makeAsciiWordSegments(nonWordChars: string): string[] {
  const firstPrintable = 0x21;
  const lastPrintable = 0x7e;

  // Walk the printable range directly instead of sweeping between sorted
  // non-word codes: out-of-range entries can then neither rewind the cursor
  // nor push it past the end of ASCII.
  const wordChars: string[] = [];
  for (let code = firstPrintable; code <= lastPrintable; code++) {
    const ch = String.fromCharCode(code);
    if (nonWordChars.indexOf(ch) === -1) {
      wordChars.push(ch);
    }
  }

  // Inside a character class `\`, `]`, `[`, `^` and `-` are significant; left
  // unescaped they can close the class early, form accidental ranges, or
  // produce invalid escapes when the pattern is compiled with the `u` flag.
  const escapedWordChars = wordChars.join('').replace(/[\\\]\[^-]/g, '\\$&');

  const wordSegment = `([${escapedWordChars}]+)`;
  const nonWordSegment = `[${_.escapeRegExp(nonWordChars).replace(/-/g, '\\-')}]+`;
  return [wordSegment, nonWordSegment];
}

private static makeUnicodeWordSegments(): string[] {
private static makeUnicodeWordRegex(keywordChars: string): RegExp {
// Distinct categories of characters
enum CharKind {
Punctuation,
Expand All @@ -946,10 +908,11 @@ export class Position extends vscode.Position {
Hangul,
}

// List of printable characters (code point intervals) and their character kinds.
// Latin alphabets (e.g., ASCII alphabets and numbers, Latin-1 Supplement, European Latin) are excluded.
// Imported from utf_class_buf in src/mbyte.c of Vim.
// Spelling alphabets are not listed here since they are covered as non-white letters.
// TODO(ajalab): add Emoji
const codePointRanges: [[number, number], CharKind][] = [
const symbolTable: [[number, number], CharKind][] = [
[[0x00a1, 0x00bf], CharKind.Punctuation], // Latin-1 punctuation
[[0x037e, 0x037e], CharKind.Punctuation], // Greek question mark
[[0x0387, 0x0387], CharKind.Punctuation], // Greek ano teleia
[[0x055a, 0x055f], CharKind.Punctuation], // Armenian punctuation
Expand Down Expand Up @@ -1013,23 +976,42 @@ export class Position extends vscode.Position {
[[0x2f800, 0x2fa1f], CharKind.Ideograph], // CJK Ideographs
];

const fragments: string[][] = [];
const codePointRangePatterns: string[][] = [];
for (let kind in CharKind) {
if (!isNaN(Number(kind))) {
fragments[kind] = [];
codePointRangePatterns[kind] = [];
}
}

for (let [[first, last], kind] of codePointRanges) {
for (let [[first, last], kind] of symbolTable) {
if (first === last) {
// '\u{hhhh}'
fragments[kind].push(`\\u{${first.toString(16)}}`);
codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}`);
} else {
// '\u{hhhh}-\u{hhhh}'
fragments[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
}
}
return fragments.map(patterns => `([${patterns.join('')}]+)`);

// Symbols in vim.iskeyword or editor.wordSeparators
// are treated as CharKind.Punctuation
const escapedKeywordChars = _.escapeRegExp(keywordChars).replace(/-/g, '\\-');
codePointRangePatterns[Number(CharKind.Punctuation)].push(escapedKeywordChars);

const codePointRanges = codePointRangePatterns.map(patterns => patterns.join(''));
const symbolSegments = codePointRanges.map(range => `([${range}]+)`);

// wordSegment matches word characters.
// A word character is a symbol which is neither
// - space
// - a symbol listed in the table
// - a keyword (vim.iskeyword)
const wordSegment = `([^\\s${codePointRanges.join('')}]+)`;

// https://regex101.com/r/X1agK6/2
const segments = symbolSegments.concat(wordSegment, '$^');
const regexp = new RegExp(segments.join('|'), 'ug');
return regexp;
}

private getAllPositions(line: string, regex: RegExp): number[] {
Expand Down
36 changes: 35 additions & 1 deletion test/motion.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,13 @@ suite('word motion', () => {
});

suite('unicode word motion', () => {
let text: Array<string> = ['漢字ひらがなカタカナalphabets、いろいろな文字。', 'Καλημέρα κόσμε'];
let text: Array<string> = [
'漢字ひらがなカタカナalphabets、いろいろな文字。',
'Καλημέρα κόσμε',
'Die früh sich einst dem trüben Blick gezeigt.',
'Được tiếp đãi ân cần',
'100£and100$and100¥#♯x',
];

suiteSetup(() => {
return setupWorkspace().then(() => {
Expand Down Expand Up @@ -387,6 +393,28 @@ suite('unicode word motion', () => {
assert.equal(motion.line, 1);
assert.equal(motion.character, 9);
});

test('move cursor word right recognizes a latin string which has diacritics as a single word', () => {
let motion = new Position(2, 4).getWordRight();
assert.equal(motion.line, 2);
assert.equal(motion.character, 9);
});

test('move cursor word right recognizes a latin-1 symbol as punctuation', () => {
let motion = new Position(4, 3).getWordRight();
assert.equal(motion.line, 4);
assert.equal(motion.character, 4);

motion = motion.getWordRight(); // issue #3680
assert.equal(motion.line, 4);
assert.equal(motion.character, 10);
});

test('move cursor word right recognizes a sequence of latin-1 symbols and other symbols as a word', () => {
let motion = new Position(4, 17).getWordRight();
assert.equal(motion.line, 4);
assert.equal(motion.character, 20);
});
});

suite('word left', () => {
Expand All @@ -413,6 +441,12 @@ suite('unicode word motion', () => {
assert.equal(motion.line, 1);
assert.equal(motion.character, 9);
});

test('move cursor word left recognizes a latin string which has diacritics as a single word', () => {
let motion = new Position(3, 10).getWordLeft();
assert.equal(motion.line, 3);
assert.equal(motion.character, 5);
});
});
});

Expand Down

0 comments on commit 57cb40a

Please sign in to comment.