From b8d7ef6e19fa9324944f3ff7ab2e689f891cfa9d Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Sat, 2 Nov 2024 08:25:51 +0100 Subject: [PATCH 1/3] fix: cspell-tools - add minCompoundLength setting --- .../cspell-tools.config.schema.json | 21 ++++++++-- .../cspell-tools/src/compiler/SourceReader.ts | 2 + packages/cspell-tools/src/compiler/compile.ts | 2 + .../src/compiler/legacyLineToWords.ts | 7 +++- .../compiler/splitCamelCaseIfAllowed.test.ts | 40 ++++++++++--------- .../src/compiler/splitCamelCaseIfAllowed.ts | 3 +- .../src/compiler/wordListParser.test.ts | 1 + .../src/compiler/wordListParser.ts | 19 +++++++-- packages/cspell-tools/src/config/config.ts | 10 ++++- .../cspell-tools/src/config/configDefaults.ts | 17 ++++++++ 10 files changed, 94 insertions(+), 28 deletions(-) create mode 100644 packages/cspell-tools/src/config/configDefaults.ts diff --git a/packages/cspell-tools/cspell-tools.config.schema.json b/packages/cspell-tools/cspell-tools.config.schema.json index 3e34f628d69..a98384c90c2 100644 --- a/packages/cspell-tools/cspell-tools.config.schema.json +++ b/packages/cspell-tools/cspell-tools.config.schema.json @@ -53,6 +53,11 @@ "description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.", "type": "number" }, + "minCompoundLength": { + "default": 4, + "description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.", + "type": "number" + }, "split": { "anyOf": [ { @@ -68,7 +73,7 @@ }, "storeSplitWordsAsCompounds": { "default": false, - "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", + "description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", "type": "boolean" } }, @@ -110,6 +115,11 @@ "description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.", "type": "number" }, + "minCompoundLength": { + "default": 4, + "description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.", + "type": "number" + }, "split": { "anyOf": [ { @@ -125,7 +135,7 @@ }, "storeSplitWordsAsCompounds": { "default": false, - "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", + "description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", "type": "boolean" } }, @@ -266,6 +276,11 @@ "description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.", "type": "number" }, + "minCompoundLength": { + "default": 4, + "description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.", + "type": "number" + }, "removeDuplicates": { "default": false, "description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.", @@ -295,7 +310,7 @@ }, "storeSplitWordsAsCompounds": { "default": false, - "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", + "description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.", "type": "boolean" }, "targets": { diff --git a/packages/cspell-tools/src/compiler/SourceReader.ts b/packages/cspell-tools/src/compiler/SourceReader.ts index 882cb15f457..24ec6033c16 100644 --- a/packages/cspell-tools/src/compiler/SourceReader.ts +++ b/packages/cspell-tools/src/compiler/SourceReader.ts @@ -24,6 +24,8 @@ export interface SourceReaderOptions { allowedSplitWords: AllowedSplitWordsCollection; storeSplitWordsAsCompounds: boolean | undefined; + + minCompoundLength?: number | undefined; } export type AnnotatedWord = string; diff --git a/packages/cspell-tools/src/compiler/compile.ts b/packages/cspell-tools/src/compiler/compile.ts index 2cf5a621135..8281acf677a 100644 --- a/packages/cspell-tools/src/compiler/compile.ts +++ b/packages/cspell-tools/src/compiler/compile.ts @@ -275,6 +275,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour split = sourceOptions.split || false, maxDepth, storeSplitWordsAsCompounds, + minCompoundLength, } = fileSource; const legacy = split === 'legacy'; @@ -293,6 +294,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour keepCase: keepRawCase, allowedSplitWords, storeSplitWordsAsCompounds, + minCompoundLength, }; logWithTimestamp(`Reading ${path.basename(filename)}`); diff --git a/packages/cspell-tools/src/compiler/legacyLineToWords.ts b/packages/cspell-tools/src/compiler/legacyLineToWords.ts index 94fd646dc9a..7fc4249703f 100644 --- a/packages/cspell-tools/src/compiler/legacyLineToWords.ts +++ b/packages/cspell-tools/src/compiler/legacyLineToWords.ts @@ -1,11 +1,14 @@ import { opConcatMap, opFilter, opMap, pipe } from '@cspell/cspell-pipe/sync'; +import { defaultCompileSourceOptions } from '../config/configDefaults.js'; import { regExpSpaceOrDash, splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js'; import type { AllowedSplitWordsCollection } from './WordsCollection.js'; const regNonWord = /[^\p{L}\p{M}' _\d]+/giu; const regExpRepeatChars = /(.)\1{5}/i; +const minCompoundLength = defaultCompileSourceOptions.minCompoundLength; + export function legacyLineToWords( line: string, keepCase: boolean, @@ -15,10 +18,12 @@ export function legacyLineToWords( const filteredLine = line.replaceAll(regNonWord, '|'); const wordGroups = filteredLine.split('|'); + const _minCompoundLength = minCompoundLength; + const words = pipe( wordGroups, opConcatMap((a) => a.split(regExpSpaceOrDash)), - opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '')), + opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '', _minCompoundLength)), opMap((a) => a.trim()), opFilter((a) => !!a), opFilter((s) => !regExpRepeatChars.test(s)), diff --git a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts index 76bb0aed6a0..b3a67dd3c1e 100644 --- a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts +++ b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts @@ -20,25 +20,29 @@ describe('splitCamelCaseIfAllowed', () => { ${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection', 'state', 'msg', 'closed']} `('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => { allowed = createAllowedSplitWords(allowed); - expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '')).toEqual(expected); + expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '', 4)).toEqual(expected); }); test.each` - text | keepCase | allowed | expected - ${''} | ${false} | ${undefined} | ${[]} - ${'hello'} | ${false} | ${undefined} | ${['hello']} - ${'helloThere'} | ${false} | ${['hello', 'there']} | ${['hello+', '+there']} - ${'helloThere'} | ${false} | ${['hello', 'There']} | ${['hello+', '+There']} - ${'helloThere'} | ${true} | ${['hello', 'There']} | ${['hello+', '+There']} - ${'ERRORCode'} | ${false} | ${['error', 'code']} | ${['error+', '+code']} - ${'ERRORCode'} | ${true} | ${['error', 'code']} | ${['error+', '+code']} - ${'ERRORCode'} | ${true} | ${['code']} | ${['ERRORCode']} - ${'ERRORCode'} | ${false} | ${['code']} | ${['ERRORCode']} - ${'ErrorCode'} | ${true} | ${['error', 'code']} | ${['error+', '+code']} - ${'xmlUCSIsCatZ'} | ${true} | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']} - ${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection+', '+state+', '+msg', 'closed']} - `('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => { - allowed = createAllowedSplitWords(allowed); - expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+')).toEqual(expected); - }); + text | keepCase | allowed | min | expected + ${''} | ${false} | ${undefined} | ${4} | ${[]} + ${'hello'} | ${false} | ${undefined} | ${4} | ${['hello']} + ${'helloThere'} | ${false} | ${['hello', 'there']} | ${4} | ${['hello+', '+there']} + ${'helloThere'} | ${false} | ${['hello', 'There']} | ${4} | ${['hello+', '+There']} + ${'helloThere'} | ${true} | ${['hello', 'There']} | ${4} | ${['hello+', '+There']} + ${'ERRORCode'} | ${false} | ${['error', 'code']} | ${4} | ${['error+', '+code']} + ${'ERRORCode'} | ${true} | ${['error', 'code']} | ${4} | ${['error+', '+code']} + ${'ERRORCode'} | ${true} | ${['code']} | ${4} | ${['ERRORCode']} + ${'ERRORCode'} | ${false} | ${['code']} | ${4} | ${['ERRORCode']} + ${'ErrorCode'} | ${true} | ${['error', 'code']} | ${4} | ${['error+', '+code']} + ${'xmlUCSIsCatZ'} | ${true} | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${3} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']} + ${'xmlUCSIsCats'} | ${true} | ${['xml', 'UCS', 'is', 'cats']} | ${4} | ${['xml', 'UCS', 'is', '+cats']} + ${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${4} | ${['ADP', 'connection+', '+state+', 'msg', 'closed']} + `( + 'splitCamelCaseIfAllowed $text $keepCase $allowed', + ({ text, keepCase, allowed, expected, min: minCompoundLength }) => { + allowed = createAllowedSplitWords(allowed); + expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+', minCompoundLength)).toEqual(expected); + }, + ); }); diff --git a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts index a51e2fc7b58..d9d16d845cf 100644 --- a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts +++ b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts @@ -9,6 +9,7 @@ export function splitCamelCaseIfAllowed( allowedWords: AllowedSplitWordsCollection, keepCase: boolean, compoundPrefix: string, + minCompoundLength: number, ): string[] { const split = [...splitCamelCase(word)]; if (split.length == 1) return adjustCases(split, allowedWords, keepCase); @@ -20,7 +21,7 @@ export function splitCamelCaseIfAllowed( ? adjusted : adjusted.map((w, i) => { const { px, sx } = wordIndexes[i]; - const canCompound = w.length > 2; + const canCompound = w.length >= minCompoundLength; const lc = w.toLowerCase(); const p = canCompound && isSingleLetter(px) ? compoundPrefix : ''; const s = canCompound && isSingleLetter(sx) ? compoundPrefix : ''; diff --git a/packages/cspell-tools/src/compiler/wordListParser.test.ts b/packages/cspell-tools/src/compiler/wordListParser.test.ts index c7540d704a5..dd3418d1295 100644 --- a/packages/cspell-tools/src/compiler/wordListParser.test.ts +++ b/packages/cspell-tools/src/compiler/wordListParser.test.ts @@ -76,6 +76,7 @@ function pf(...opts: Partial[]): ParseFileOptions { const opt: ParseFileOptions = { allowedSplitWords: defaultAllowedSplitWords, storeSplitWordsAsCompounds: undefined, + minCompoundLength: undefined, }; for (const op of opts) { Object.assign(opt, op); diff --git a/packages/cspell-tools/src/compiler/wordListParser.ts b/packages/cspell-tools/src/compiler/wordListParser.ts index 9900dd99362..7a8399cf9d6 100644 --- a/packages/cspell-tools/src/compiler/wordListParser.ts +++ b/packages/cspell-tools/src/compiler/wordListParser.ts @@ -2,6 +2,7 @@ import { opCombine, opCombine as opPipe, type Operator, opFilter, opMap } from ' import { createDictionaryLineParser } from 'cspell-trie-lib'; import { uniqueFilter } from 'hunspell-reader'; +import { defaultCompileSourceOptions } from '../config/configDefaults.js'; import type { CompileOptions } from './CompileOptions.js'; import { legacyLineToWords } from './legacyLineToWords.js'; import { splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js'; @@ -85,21 +86,30 @@ export interface ParseFileOptions { * @default undefined */ storeSplitWordsAsCompounds: boolean | undefined; + + /** + * Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. + * The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. + * If the length is too low, then the dictionary will consider many misspelled words as correct. + * @default 4 + */ + minCompoundLength: number | undefined; } type ParseFileOptionsRequired = Required; const commentCharacter = '#'; -const _defaultOptions: ParseFileOptionsRequired = { +const _defaultOptions = { keepCase: true, legacy: false, split: false, splitKeepBoth: false, // splitSeparator: regExpSplit, allowedSplitWords: { has: () => true, size: 0 }, - storeSplitWordsAsCompounds: undefined, -}; + storeSplitWordsAsCompounds: defaultCompileSourceOptions.storeSplitWordsAsCompounds, + minCompoundLength: defaultCompileSourceOptions.minCompoundLength, +} as const satisfies ParseFileOptionsRequired; export const defaultParseDictionaryOptions: ParseFileOptionsRequired = Object.freeze(_defaultOptions); @@ -119,6 +129,7 @@ export function createParseFileLineMapper(options?: Partial): splitKeepBoth = _defaultOptions.splitKeepBoth, allowedSplitWords = _defaultOptions.allowedSplitWords, storeSplitWordsAsCompounds, + minCompoundLength = _defaultOptions.minCompoundLength, } = _options; let { legacy = _defaultOptions.legacy } = _options; @@ -207,7 +218,7 @@ export function createParseFileLineMapper(options?: Partial): } function splitWordIntoWords(word: string): string[] { - return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix); + return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix, minCompoundLength); } function* splitWords(lines: Iterable): Iterable { diff --git a/packages/cspell-tools/src/config/config.ts b/packages/cspell-tools/src/config/config.ts index 22363381ea6..c30338762d8 100644 --- a/packages/cspell-tools/src/config/config.ts +++ b/packages/cspell-tools/src/config/config.ts @@ -186,11 +186,19 @@ export interface CompileSourceOptions { allowedSplitWords?: FilePath | FilePath[] | undefined; /** - * Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. + * Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. * These words are prefixed / suffixed with `*`. * @default false */ storeSplitWordsAsCompounds?: boolean | undefined; + + /** + * Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. + * The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. + * If the length is too low, then the dictionary will consider many misspelled words as correct. + * @default 4 + */ + minCompoundLength?: number | undefined; } export const configFileSchemaURL = diff --git a/packages/cspell-tools/src/config/configDefaults.ts b/packages/cspell-tools/src/config/configDefaults.ts new file mode 100644 index 00000000000..432e555972c --- /dev/null +++ b/packages/cspell-tools/src/config/configDefaults.ts @@ -0,0 +1,17 @@ +import type { CompileSourceOptions } from './config.js'; + +/** + * Make all properties in T required + */ +type RequireAllFields = { + [P in keyof Required]: T[P]; +}; + +export const defaultCompileSourceOptions = { + maxDepth: undefined, + split: false, + keepRawCase: false, + allowedSplitWords: undefined, + storeSplitWordsAsCompounds: false, + minCompoundLength: 4, +} as const satisfies RequireAllFields; From 3665c843e5b81cd057f3258c5aee5497240474e5 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Sat, 2 Nov 2024 08:55:24 +0100 Subject: [PATCH 2/3] Update SourceReader.ts --- packages/cspell-tools/src/compiler/SourceReader.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/packages/cspell-tools/src/compiler/SourceReader.ts b/packages/cspell-tools/src/compiler/SourceReader.ts index 24ec6033c16..e2a93741c95 100644 --- a/packages/cspell-tools/src/compiler/SourceReader.ts +++ b/packages/cspell-tools/src/compiler/SourceReader.ts @@ -66,8 +66,16 @@ function splitLines(lines: Iterable, options: SourceReaderOptions): Iter } async function textFileReader(reader: Reader, options: SourceReaderOptions): Promise { - const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds } = options; - const words = [...parseFileLines(reader.lines, { legacy, split, allowedSplitWords, storeSplitWordsAsCompounds })]; + const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds, minCompoundLength } = options; + const words = [ + ...parseFileLines(reader.lines, { + legacy, + split, + allowedSplitWords, + storeSplitWordsAsCompounds, + minCompoundLength, + }), + ]; return { size: words.length, From 49c732de5747bb72fbabc0cd40432c1b8dc04c6a Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Sat, 2 Nov 2024 09:08:20 +0100 Subject: [PATCH 3/3] Make it easier to add options correctly. --- .../cspell-tools/src/compiler/SourceReader.ts | 22 ++++++++++--------- .../src/compiler/wordListParser.ts | 2 +- .../cspell-tools/src/config/configDefaults.ts | 10 ++------- packages/cspell-tools/src/types.ts | 6 +++++ 4 files changed, 21 insertions(+), 19 deletions(-) create mode 100644 packages/cspell-tools/src/types.ts diff --git a/packages/cspell-tools/src/compiler/SourceReader.ts b/packages/cspell-tools/src/compiler/SourceReader.ts index e2a93741c95..ec43483c6e7 100644 --- a/packages/cspell-tools/src/compiler/SourceReader.ts +++ b/packages/cspell-tools/src/compiler/SourceReader.ts @@ -1,6 +1,7 @@ +import { RequireFields } from '../types.js'; import { createReader } from './Reader.js'; import type { Reader } from './readers/ReaderOptions.js'; -import { parseFileLines } from './wordListParser.js'; +import { parseFileLines, ParseFileOptions } from './wordListParser.js'; import type { AllowedSplitWordsCollection } from './WordsCollection.js'; export interface SourceReaderOptions { @@ -67,15 +68,16 @@ function splitLines(lines: Iterable, options: SourceReaderOptions): Iter async function textFileReader(reader: Reader, options: SourceReaderOptions): Promise { const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds, minCompoundLength } = options; - const words = [ - ...parseFileLines(reader.lines, { - legacy, - split, - allowedSplitWords, - storeSplitWordsAsCompounds, - minCompoundLength, - }), - ]; + const parseOptions = { + legacy, + split, + splitKeepBoth: undefined, + keepCase: undefined, + allowedSplitWords, + storeSplitWordsAsCompounds, + minCompoundLength, + } as const satisfies RequireFields; + const words = [...parseFileLines(reader.lines, parseOptions)]; return { size: words.length, diff --git a/packages/cspell-tools/src/compiler/wordListParser.ts b/packages/cspell-tools/src/compiler/wordListParser.ts index 7a8399cf9d6..269b63fa96e 100644 --- a/packages/cspell-tools/src/compiler/wordListParser.ts +++ b/packages/cspell-tools/src/compiler/wordListParser.ts @@ -271,6 +271,6 @@ export function createParseFileLineMapper(options?: Partial): * @param _options - defines prefixes used when parsing lines. * @returns words that have been normalized. */ -export function parseFileLines(lines: Iterable | string, options: Partial): Iterable { +export function parseFileLines(lines: Iterable | string, options: ParseFileOptions): Iterable { return createParseFileLineMapper(options)(typeof lines === 'string' ? [lines] : lines); } diff --git a/packages/cspell-tools/src/config/configDefaults.ts b/packages/cspell-tools/src/config/configDefaults.ts index 432e555972c..49228fb7dee 100644 --- a/packages/cspell-tools/src/config/configDefaults.ts +++ b/packages/cspell-tools/src/config/configDefaults.ts @@ -1,12 +1,6 @@ +import type { RequireFields } from '../types.js'; import type { CompileSourceOptions } from './config.js'; -/** - * Make all properties in T required - */ -type RequireAllFields = { - [P in keyof Required]: T[P]; -}; - export const defaultCompileSourceOptions = { maxDepth: undefined, split: false, @@ -14,4 +8,4 @@ export const defaultCompileSourceOptions = { allowedSplitWords: undefined, storeSplitWordsAsCompounds: false, minCompoundLength: 4, -} as const satisfies RequireAllFields; +} as const satisfies RequireFields; diff --git a/packages/cspell-tools/src/types.ts b/packages/cspell-tools/src/types.ts new file mode 100644 index 00000000000..e405266d8f1 --- /dev/null +++ b/packages/cspell-tools/src/types.ts @@ -0,0 +1,6 @@ +/** + * Make all properties in T required, but keep the original optionality of the properties. + */ +export type RequireFields = { + [P in keyof Required]: T[P]; +};