Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: cspell-tools - add minCompoundLength setting #6449

Merged
merged 3 commits into from
Nov 2, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions packages/cspell-tools/cspell-tools.config.schema.json
Original file line number Diff line number Diff line change
@@ -53,6 +53,11 @@
"description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
"type": "number"
},
"minCompoundLength": {
"default": 4,
"description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
"type": "number"
},
"split": {
"anyOf": [
{
@@ -68,7 +73,7 @@
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
}
},
@@ -110,6 +115,11 @@
"description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
"type": "number"
},
"minCompoundLength": {
"default": 4,
"description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
"type": "number"
},
"split": {
"anyOf": [
{
@@ -125,7 +135,7 @@
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
}
},
@@ -266,6 +276,11 @@
"description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
"type": "number"
},
"minCompoundLength": {
"default": 4,
"description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
"type": "number"
},
"removeDuplicates": {
"default": false,
"description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.",
@@ -295,7 +310,7 @@
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
},
"targets": {
18 changes: 15 additions & 3 deletions packages/cspell-tools/src/compiler/SourceReader.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { RequireFields } from '../types.js';
import { createReader } from './Reader.js';
import type { Reader } from './readers/ReaderOptions.js';
import { parseFileLines } from './wordListParser.js';
import { parseFileLines, ParseFileOptions } from './wordListParser.js';
import type { AllowedSplitWordsCollection } from './WordsCollection.js';

export interface SourceReaderOptions {
@@ -24,6 +25,8 @@ export interface SourceReaderOptions {
allowedSplitWords: AllowedSplitWordsCollection;

storeSplitWordsAsCompounds: boolean | undefined;

minCompoundLength?: number | undefined;
}

export type AnnotatedWord = string;
@@ -64,8 +67,17 @@ function splitLines(lines: Iterable<string>, options: SourceReaderOptions): Iter
}

async function textFileReader(reader: Reader, options: SourceReaderOptions): Promise<SourceReader> {
const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds } = options;
const words = [...parseFileLines(reader.lines, { legacy, split, allowedSplitWords, storeSplitWordsAsCompounds })];
const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds, minCompoundLength } = options;
const parseOptions = {
legacy,
split,
splitKeepBoth: undefined,
keepCase: undefined,
allowedSplitWords,
storeSplitWordsAsCompounds,
minCompoundLength,
} as const satisfies RequireFields<ParseFileOptions>;
const words = [...parseFileLines(reader.lines, parseOptions)];

return {
size: words.length,
2 changes: 2 additions & 0 deletions packages/cspell-tools/src/compiler/compile.ts
Original file line number Diff line number Diff line change
@@ -275,6 +275,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
split = sourceOptions.split || false,
maxDepth,
storeSplitWordsAsCompounds,
minCompoundLength,
} = fileSource;

const legacy = split === 'legacy';
@@ -293,6 +294,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
keepCase: keepRawCase,
allowedSplitWords,
storeSplitWordsAsCompounds,
minCompoundLength,
};

logWithTimestamp(`Reading ${path.basename(filename)}`);
7 changes: 6 additions & 1 deletion packages/cspell-tools/src/compiler/legacyLineToWords.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import { opConcatMap, opFilter, opMap, pipe } from '@cspell/cspell-pipe/sync';

import { defaultCompileSourceOptions } from '../config/configDefaults.js';
import { regExpSpaceOrDash, splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js';
import type { AllowedSplitWordsCollection } from './WordsCollection.js';

const regNonWord = /[^\p{L}\p{M}' _\d]+/giu;
const regExpRepeatChars = /(.)\1{5}/i;

const minCompoundLength = defaultCompileSourceOptions.minCompoundLength;

export function legacyLineToWords(
line: string,
keepCase: boolean,
@@ -15,10 +18,12 @@ export function legacyLineToWords(
const filteredLine = line.replaceAll(regNonWord, '|');
const wordGroups = filteredLine.split('|');

const _minCompoundLength = minCompoundLength;

const words = pipe(
wordGroups,
opConcatMap((a) => a.split(regExpSpaceOrDash)),
opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '')),
opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '', _minCompoundLength)),
opMap((a) => a.trim()),
opFilter((a) => !!a),
opFilter((s) => !regExpRepeatChars.test(s)),
40 changes: 22 additions & 18 deletions packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts
Original file line number Diff line number Diff line change
@@ -20,25 +20,29 @@ describe('splitCamelCaseIfAllowed', () => {
${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection', 'state', 'msg', 'closed']}
`('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => {
allowed = createAllowedSplitWords(allowed);
expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '')).toEqual(expected);
expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '', 4)).toEqual(expected);
});

test.each`
text | keepCase | allowed | expected
${''} | ${false} | ${undefined} | ${[]}
${'hello'} | ${false} | ${undefined} | ${['hello']}
${'helloThere'} | ${false} | ${['hello', 'there']} | ${['hello+', '+there']}
${'helloThere'} | ${false} | ${['hello', 'There']} | ${['hello+', '+There']}
${'helloThere'} | ${true} | ${['hello', 'There']} | ${['hello+', '+There']}
${'ERRORCode'} | ${false} | ${['error', 'code']} | ${['error+', '+code']}
${'ERRORCode'} | ${true} | ${['error', 'code']} | ${['error+', '+code']}
${'ERRORCode'} | ${true} | ${['code']} | ${['ERRORCode']}
${'ERRORCode'} | ${false} | ${['code']} | ${['ERRORCode']}
${'ErrorCode'} | ${true} | ${['error', 'code']} | ${['error+', '+code']}
${'xmlUCSIsCatZ'} | ${true} | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']}
${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection+', '+state+', '+msg', 'closed']}
`('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => {
allowed = createAllowedSplitWords(allowed);
expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+')).toEqual(expected);
});
text | keepCase | allowed | min | expected
${''} | ${false} | ${undefined} | ${4} | ${[]}
${'hello'} | ${false} | ${undefined} | ${4} | ${['hello']}
${'helloThere'} | ${false} | ${['hello', 'there']} | ${4} | ${['hello+', '+there']}
${'helloThere'} | ${false} | ${['hello', 'There']} | ${4} | ${['hello+', '+There']}
${'helloThere'} | ${true} | ${['hello', 'There']} | ${4} | ${['hello+', '+There']}
${'ERRORCode'} | ${false} | ${['error', 'code']} | ${4} | ${['error+', '+code']}
${'ERRORCode'} | ${true} | ${['error', 'code']} | ${4} | ${['error+', '+code']}
${'ERRORCode'} | ${true} | ${['code']} | ${4} | ${['ERRORCode']}
${'ERRORCode'} | ${false} | ${['code']} | ${4} | ${['ERRORCode']}
${'ErrorCode'} | ${true} | ${['error', 'code']} | ${4} | ${['error+', '+code']}
${'xmlUCSIsCatZ'} | ${true} | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${3} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']}
${'xmlUCSIsCats'} | ${true} | ${['xml', 'UCS', 'is', 'cats']} | ${4} | ${['xml', 'UCS', 'is', '+cats']}
${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${4} | ${['ADP', 'connection+', '+state+', 'msg', 'closed']}
`(
'splitCamelCaseIfAllowed $text $keepCase $allowed',
({ text, keepCase, allowed, expected, min: minCompoundLength }) => {
allowed = createAllowedSplitWords(allowed);
expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+', minCompoundLength)).toEqual(expected);
},
);
});
Original file line number Diff line number Diff line change
@@ -9,6 +9,7 @@ export function splitCamelCaseIfAllowed(
allowedWords: AllowedSplitWordsCollection,
keepCase: boolean,
compoundPrefix: string,
minCompoundLength: number,
): string[] {
const split = [...splitCamelCase(word)];
if (split.length == 1) return adjustCases(split, allowedWords, keepCase);
@@ -20,7 +21,7 @@ export function splitCamelCaseIfAllowed(
? adjusted
: adjusted.map((w, i) => {
const { px, sx } = wordIndexes[i];
const canCompound = w.length > 2;
const canCompound = w.length >= minCompoundLength;
const lc = w.toLowerCase();
const p = canCompound && isSingleLetter(px) ? compoundPrefix : '';
const s = canCompound && isSingleLetter(sx) ? compoundPrefix : '';
1 change: 1 addition & 0 deletions packages/cspell-tools/src/compiler/wordListParser.test.ts
Original file line number Diff line number Diff line change
@@ -76,6 +76,7 @@ function pf(...opts: Partial<ParseFileOptions>[]): ParseFileOptions {
const opt: ParseFileOptions = {
allowedSplitWords: defaultAllowedSplitWords,
storeSplitWordsAsCompounds: undefined,
minCompoundLength: undefined,
};
for (const op of opts) {
Object.assign(opt, op);
21 changes: 16 additions & 5 deletions packages/cspell-tools/src/compiler/wordListParser.ts
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@ import { opCombine, opCombine as opPipe, type Operator, opFilter, opMap } from '
import { createDictionaryLineParser } from 'cspell-trie-lib';
import { uniqueFilter } from 'hunspell-reader';

import { defaultCompileSourceOptions } from '../config/configDefaults.js';
import type { CompileOptions } from './CompileOptions.js';
import { legacyLineToWords } from './legacyLineToWords.js';
import { splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js';
@@ -85,21 +86,30 @@ export interface ParseFileOptions {
* @default undefined
*/
storeSplitWordsAsCompounds: boolean | undefined;

/**
* Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`.
* The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words.
* If the length is too low, then the dictionary will consider many misspelled words as correct.
* @default 4
*/
minCompoundLength: number | undefined;
}

type ParseFileOptionsRequired = Required<ParseFileOptions>;

const commentCharacter = '#';

const _defaultOptions: ParseFileOptionsRequired = {
const _defaultOptions = {
keepCase: true,
legacy: false,
split: false,
splitKeepBoth: false,
// splitSeparator: regExpSplit,
allowedSplitWords: { has: () => true, size: 0 },
storeSplitWordsAsCompounds: undefined,
};
storeSplitWordsAsCompounds: defaultCompileSourceOptions.storeSplitWordsAsCompounds,
minCompoundLength: defaultCompileSourceOptions.minCompoundLength,
} as const satisfies ParseFileOptionsRequired;

export const defaultParseDictionaryOptions: ParseFileOptionsRequired = Object.freeze(_defaultOptions);

@@ -119,6 +129,7 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
splitKeepBoth = _defaultOptions.splitKeepBoth,
allowedSplitWords = _defaultOptions.allowedSplitWords,
storeSplitWordsAsCompounds,
minCompoundLength = _defaultOptions.minCompoundLength,
} = _options;

let { legacy = _defaultOptions.legacy } = _options;
@@ -207,7 +218,7 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
}

function splitWordIntoWords(word: string): string[] {
return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix);
return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix, minCompoundLength);
}

function* splitWords(lines: Iterable<string>): Iterable<string> {
@@ -260,6 +271,6 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
* @param _options - defines prefixes used when parsing lines.
* @returns words that have been normalized.
*/
export function parseFileLines(lines: Iterable<string> | string, options: Partial<ParseFileOptions>): Iterable<string> {
export function parseFileLines(lines: Iterable<string> | string, options: ParseFileOptions): Iterable<string> {
return createParseFileLineMapper(options)(typeof lines === 'string' ? [lines] : lines);
}
10 changes: 9 additions & 1 deletion packages/cspell-tools/src/config/config.ts
Original file line number Diff line number Diff line change
@@ -186,11 +186,19 @@ export interface CompileSourceOptions {
allowedSplitWords?: FilePath | FilePath[] | undefined;

/**
* Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words.
* Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words.
* These words are prefixed / suffixed with `*`.
* @default false
*/
storeSplitWordsAsCompounds?: boolean | undefined;

/**
* Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`.
* The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words.
* If the length is too low, then the dictionary will consider many misspelled words as correct.
* @default 4
*/
minCompoundLength?: number | undefined;
}

export const configFileSchemaURL =
11 changes: 11 additions & 0 deletions packages/cspell-tools/src/config/configDefaults.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import type { RequireFields } from '../types.js';
import type { CompileSourceOptions } from './config.js';

/**
 * Default values for the per-source compile options declared by `CompileSourceOptions`.
 *
 * The object is checked with `satisfies RequireFields<CompileSourceOptions>`, so every
 * option key has to be listed here explicitly; options without a default are written as
 * `undefined`. `as const` keeps the literal types of the values (for example
 * `minCompoundLength` is typed as `4`, not `number`).
 */
export const defaultCompileSourceOptions = {
maxDepth: undefined,
split: false,
keepRawCase: false,
allowedSplitWords: undefined,
storeSplitWordsAsCompounds: false,
minCompoundLength: 4,
} as const satisfies RequireFields<CompileSourceOptions>;
6 changes: 6 additions & 0 deletions packages/cspell-tools/src/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/**
 * Make every key of `T` mandatory while keeping each property's declared value type.
 *
 * The keys are taken from `Required<T>`, so no property of the result is optional, but
 * the value type is read from the original `T`. A property declared as
 * `foo?: number | undefined` therefore becomes `foo: number | undefined`: the key must
 * be present, yet `undefined` is still an accepted value.
 *
 * This differs from the built-in `Required<T>`, which also removes `undefined` from
 * the value type of properties that were optional.
 */
export type RequireFields<T> = {
[P in keyof Required<T>]: T[P];
};