streetsidesoftware · Jason3S · Nov 2, 2024 · Nov 2, 2024 · Nov 2, 2024 · Nov 2, 2024
diff --git a/packages/cspell-tools/cspell-tools.config.schema.json b/packages/cspell-tools/cspell-tools.config.schema.json
@@ -53,6 +53,11 @@
           "description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
           "type": "number"
         },
+        "minCompoundLength": {
+          "default": 4,
+          "description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
+          "type": "number"
+        },
         "split": {
           "anyOf": [
             {
@@ -68,7 +73,7 @@
         },
         "storeSplitWordsAsCompounds": {
           "default": false,
-          "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
+          "description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
           "type": "boolean"
         }
       },
@@ -110,6 +115,11 @@
           "description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
           "type": "number"
         },
+        "minCompoundLength": {
+          "default": 4,
+          "description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
+          "type": "number"
+        },
         "split": {
           "anyOf": [
             {
@@ -125,7 +135,7 @@
         },
         "storeSplitWordsAsCompounds": {
           "default": false,
-          "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
+          "description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
           "type": "boolean"
         }
       },
@@ -266,6 +276,11 @@
       "description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
       "type": "number"
     },
+    "minCompoundLength": {
+      "default": 4,
+      "description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
+      "type": "number"
+    },
     "removeDuplicates": {
       "default": false,
       "description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.",
@@ -295,7 +310,7 @@
     },
     "storeSplitWordsAsCompounds": {
       "default": false,
-      "description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
+      "description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
       "type": "boolean"
     },
     "targets": {

diff --git a/packages/cspell-tools/src/compiler/SourceReader.ts b/packages/cspell-tools/src/compiler/SourceReader.ts
@@ -1,6 +1,7 @@
+import { RequireFields } from '../types.js';
 import { createReader } from './Reader.js';
 import type { Reader } from './readers/ReaderOptions.js';
-import { parseFileLines } from './wordListParser.js';
+import { parseFileLines, ParseFileOptions } from './wordListParser.js';
 import type { AllowedSplitWordsCollection } from './WordsCollection.js';
 
 export interface SourceReaderOptions {
@@ -24,6 +25,8 @@ export interface SourceReaderOptions {
     allowedSplitWords: AllowedSplitWordsCollection;
 
     storeSplitWordsAsCompounds: boolean | undefined;
+
+    minCompoundLength?: number | undefined;
 }
 
 export type AnnotatedWord = string;
@@ -64,8 +67,17 @@ function splitLines(lines: Iterable<string>, options: SourceReaderOptions): Iter
 }
 
 async function textFileReader(reader: Reader, options: SourceReaderOptions): Promise<SourceReader> {
-    const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds } = options;
-    const words = [...parseFileLines(reader.lines, { legacy, split, allowedSplitWords, storeSplitWordsAsCompounds })];
+    const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds, minCompoundLength } = options;
+    const parseOptions = {
+        legacy,
+        split,
+        splitKeepBoth: undefined,
+        keepCase: undefined,
+        allowedSplitWords,
+        storeSplitWordsAsCompounds,
+        minCompoundLength,
+    } as const satisfies RequireFields<ParseFileOptions>;
+    const words = [...parseFileLines(reader.lines, parseOptions)];
 
     return {
         size: words.length,

diff --git a/packages/cspell-tools/src/compiler/compile.ts b/packages/cspell-tools/src/compiler/compile.ts
@@ -275,6 +275,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
         split = sourceOptions.split || false,
         maxDepth,
         storeSplitWordsAsCompounds,
+        minCompoundLength,
     } = fileSource;
 
     const legacy = split === 'legacy';
@@ -293,6 +294,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
         keepCase: keepRawCase,
         allowedSplitWords,
         storeSplitWordsAsCompounds,
+        minCompoundLength,
     };
 
     logWithTimestamp(`Reading ${path.basename(filename)}`);

diff --git a/packages/cspell-tools/src/compiler/legacyLineToWords.ts b/packages/cspell-tools/src/compiler/legacyLineToWords.ts
@@ -1,11 +1,14 @@
 import { opConcatMap, opFilter, opMap, pipe } from '@cspell/cspell-pipe/sync';
 
+import { defaultCompileSourceOptions } from '../config/configDefaults.js';
 import { regExpSpaceOrDash, splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js';
 import type { AllowedSplitWordsCollection } from './WordsCollection.js';
 
 const regNonWord = /[^\p{L}\p{M}' _\d]+/giu;
 const regExpRepeatChars = /(.)\1{5}/i;
 
+const minCompoundLength = defaultCompileSourceOptions.minCompoundLength;
+
 export function legacyLineToWords(
     line: string,
     keepCase: boolean,
@@ -15,10 +18,12 @@ export function legacyLineToWords(
     const filteredLine = line.replaceAll(regNonWord, '|');
     const wordGroups = filteredLine.split('|');
 
+    const _minCompoundLength = minCompoundLength;
+
     const words = pipe(
         wordGroups,
         opConcatMap((a) => a.split(regExpSpaceOrDash)),
-        opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '')),
+        opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '', _minCompoundLength)),
         opMap((a) => a.trim()),
         opFilter((a) => !!a),
         opFilter((s) => !regExpRepeatChars.test(s)),

diff --git a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts
@@ -20,25 +20,29 @@ describe('splitCamelCaseIfAllowed', () => {
         ${'ADP_ConnectionStateMsg_Closed'} | ${true}  | ${undefined}                        | ${['ADP', 'connection', 'state', 'msg', 'closed']}
     `('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => {
         allowed = createAllowedSplitWords(allowed);
-        expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '')).toEqual(expected);
+        expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '', 4)).toEqual(expected);
     });
 
     test.each`
-        text                               | keepCase | allowed                             | expected
-        ${''}                              | ${false} | ${undefined}                        | ${[]}
-        ${'hello'}                         | ${false} | ${undefined}                        | ${['hello']}
-        ${'helloThere'}                    | ${false} | ${['hello', 'there']}               | ${['hello+', '+there']}
-        ${'helloThere'}                    | ${false} | ${['hello', 'There']}               | ${['hello+', '+There']}
-        ${'helloThere'}                    | ${true}  | ${['hello', 'There']}               | ${['hello+', '+There']}
-        ${'ERRORCode'}                     | ${false} | ${['error', 'code']}                | ${['error+', '+code']}
-        ${'ERRORCode'}                     | ${true}  | ${['error', 'code']}                | ${['error+', '+code']}
-        ${'ERRORCode'}                     | ${true}  | ${['code']}                         | ${['ERRORCode']}
-        ${'ERRORCode'}                     | ${false} | ${['code']}                         | ${['ERRORCode']}
-        ${'ErrorCode'}                     | ${true}  | ${['error', 'code']}                | ${['error+', '+code']}
-        ${'xmlUCSIsCatZ'}                  | ${true}  | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']}
-        ${'ADP_ConnectionStateMsg_Closed'} | ${true}  | ${undefined}                        | ${['ADP', 'connection+', '+state+', '+msg', 'closed']}
-    `('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => {
-        allowed = createAllowedSplitWords(allowed);
-        expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+')).toEqual(expected);
-    });
+        text                               | keepCase | allowed                             | min  | expected
+        ${''}                              | ${false} | ${undefined}                        | ${4} | ${[]}
+        ${'hello'}                         | ${false} | ${undefined}                        | ${4} | ${['hello']}
+        ${'helloThere'}                    | ${false} | ${['hello', 'there']}               | ${4} | ${['hello+', '+there']}
+        ${'helloThere'}                    | ${false} | ${['hello', 'There']}               | ${4} | ${['hello+', '+There']}
+        ${'helloThere'}                    | ${true}  | ${['hello', 'There']}               | ${4} | ${['hello+', '+There']}
+        ${'ERRORCode'}                     | ${false} | ${['error', 'code']}                | ${4} | ${['error+', '+code']}
+        ${'ERRORCode'}                     | ${true}  | ${['error', 'code']}                | ${4} | ${['error+', '+code']}
+        ${'ERRORCode'}                     | ${true}  | ${['code']}                         | ${4} | ${['ERRORCode']}
+        ${'ERRORCode'}                     | ${false} | ${['code']}                         | ${4} | ${['ERRORCode']}
+        ${'ErrorCode'}                     | ${true}  | ${['error', 'code']}                | ${4} | ${['error+', '+code']}
+        ${'xmlUCSIsCatZ'}                  | ${true}  | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${3} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']}
+        ${'xmlUCSIsCats'}                  | ${true}  | ${['xml', 'UCS', 'is', 'cats']}     | ${4} | ${['xml', 'UCS', 'is', '+cats']}
+        ${'ADP_ConnectionStateMsg_Closed'} | ${true}  | ${undefined}                        | ${4} | ${['ADP', 'connection+', '+state+', 'msg', 'closed']}
+    `(
+        'splitCamelCaseIfAllowed $text $keepCase $allowed',
+        ({ text, keepCase, allowed, expected, min: minCompoundLength }) => {
+            allowed = createAllowedSplitWords(allowed);
+            expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+', minCompoundLength)).toEqual(expected);
+        },
+    );
 });
diff --git a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts
@@ -9,6 +9,7 @@ export function splitCamelCaseIfAllowed(
     allowedWords: AllowedSplitWordsCollection,
     keepCase: boolean,
     compoundPrefix: string,
+    minCompoundLength: number,
 ): string[] {
     const split = [...splitCamelCase(word)];
     if (split.length == 1) return adjustCases(split, allowedWords, keepCase);
@@ -20,7 +21,7 @@ export function splitCamelCaseIfAllowed(
         ? adjusted
         : adjusted.map((w, i) => {
               const { px, sx } = wordIndexes[i];
-              const canCompound = w.length > 2;
+              const canCompound = w.length >= minCompoundLength;
               const lc = w.toLowerCase();
               const p = canCompound && isSingleLetter(px) ? compoundPrefix : '';
               const s = canCompound && isSingleLetter(sx) ? compoundPrefix : '';

diff --git a/packages/cspell-tools/src/compiler/wordListParser.test.ts b/packages/cspell-tools/src/compiler/wordListParser.test.ts
@@ -76,6 +76,7 @@ function pf(...opts: Partial<ParseFileOptions>[]): ParseFileOptions {
     const opt: ParseFileOptions = {
         allowedSplitWords: defaultAllowedSplitWords,
         storeSplitWordsAsCompounds: undefined,
+        minCompoundLength: undefined,
     };
     for (const op of opts) {
         Object.assign(opt, op);

diff --git a/packages/cspell-tools/src/compiler/wordListParser.ts b/packages/cspell-tools/src/compiler/wordListParser.ts
@@ -2,6 +2,7 @@ import { opCombine, opCombine as opPipe, type Operator, opFilter, opMap } from '
 import { createDictionaryLineParser } from 'cspell-trie-lib';
 import { uniqueFilter } from 'hunspell-reader';
 
+import { defaultCompileSourceOptions } from '../config/configDefaults.js';
 import type { CompileOptions } from './CompileOptions.js';
 import { legacyLineToWords } from './legacyLineToWords.js';
 import { splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js';
@@ -85,21 +86,30 @@ export interface ParseFileOptions {
      * @default undefined
      */
     storeSplitWordsAsCompounds: boolean | undefined;
+
+    /**
+     * Controls the minimum length a split word segment must have to be stored as a compound word when using `storeSplitWordsAsCompounds`.
+     * The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words.
+     * If the length is too low, then the dictionary will consider many misspelled words as correct.
+     * @default 4
+     */
+    minCompoundLength: number | undefined;
 }
 
 type ParseFileOptionsRequired = Required<ParseFileOptions>;
 
 const commentCharacter = '#';
 
-const _defaultOptions: ParseFileOptionsRequired = {
+const _defaultOptions = {
     keepCase: true,
     legacy: false,
     split: false,
     splitKeepBoth: false,
     // splitSeparator: regExpSplit,
     allowedSplitWords: { has: () => true, size: 0 },
-    storeSplitWordsAsCompounds: undefined,
-};
+    storeSplitWordsAsCompounds: defaultCompileSourceOptions.storeSplitWordsAsCompounds,
+    minCompoundLength: defaultCompileSourceOptions.minCompoundLength,
+} as const satisfies ParseFileOptionsRequired;
 
 export const defaultParseDictionaryOptions: ParseFileOptionsRequired = Object.freeze(_defaultOptions);
 
@@ -119,6 +129,7 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
         splitKeepBoth = _defaultOptions.splitKeepBoth,
         allowedSplitWords = _defaultOptions.allowedSplitWords,
         storeSplitWordsAsCompounds,
+        minCompoundLength = _defaultOptions.minCompoundLength,
     } = _options;
 
     let { legacy = _defaultOptions.legacy } = _options;
@@ -207,7 +218,7 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
     }
 
     function splitWordIntoWords(word: string): string[] {
-        return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix);
+        return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix, minCompoundLength);
     }
 
     function* splitWords(lines: Iterable<string>): Iterable<string> {
@@ -260,6 +271,6 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
  * @param _options - defines prefixes used when parsing lines.
  * @returns words that have been normalized.
  */
-export function parseFileLines(lines: Iterable<string> | string, options: Partial<ParseFileOptions>): Iterable<string> {
+export function parseFileLines(lines: Iterable<string> | string, options: ParseFileOptions): Iterable<string> {
     return createParseFileLineMapper(options)(typeof lines === 'string' ? [lines] : lines);
 }
diff --git a/packages/cspell-tools/src/config/config.ts b/packages/cspell-tools/src/config/config.ts
@@ -186,11 +186,19 @@ export interface CompileSourceOptions {
     allowedSplitWords?: FilePath | FilePath[] | undefined;
 
     /**
-     * Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words.
+     * Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words.
      * These words are prefixed / suffixed with `*`.
      * @default false
      */
     storeSplitWordsAsCompounds?: boolean | undefined;
+
+    /**
+     * Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`.
+     * The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words.
+     * If the length is too low, then the dictionary will consider many misspelled words as correct.
+     * @default 4
+     */
+    minCompoundLength?: number | undefined;
 }
 
 export const configFileSchemaURL =

diff --git a/packages/cspell-tools/src/config/configDefaults.ts b/packages/cspell-tools/src/config/configDefaults.ts
@@ -0,0 +1,11 @@
+import type { RequireFields } from '../types.js';
+import type { CompileSourceOptions } from './config.js';
+
+export const defaultCompileSourceOptions = {
+    maxDepth: undefined,
+    split: false,
+    keepRawCase: false,
+    allowedSplitWords: undefined,
+    storeSplitWordsAsCompounds: false,
+    minCompoundLength: 4,
+} as const satisfies RequireFields<CompileSourceOptions>;
diff --git a/packages/cspell-tools/src/types.ts b/packages/cspell-tools/src/types.ts
@@ -0,0 +1,6 @@
+/**
+ * Make every key of T required (none may be omitted) while keeping each property's declared value type, including `| undefined`.
+ */
+export type RequireFields<T> = {
+    [P in keyof Required<T>]: T[P];
+};