Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: cspell-tools - add minCompoundLength setting #6449

Merged
merged 3 commits into from
Nov 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions packages/cspell-tools/cspell-tools.config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@
"description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
"type": "number"
},
"minCompoundLength": {
"default": 4,
"description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
"type": "number"
},
"split": {
"anyOf": [
{
Expand All @@ -68,7 +73,7 @@
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
}
},
Expand Down Expand Up @@ -110,6 +115,11 @@
"description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
"type": "number"
},
"minCompoundLength": {
"default": 4,
"description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
"type": "number"
},
"split": {
"anyOf": [
{
Expand All @@ -125,7 +135,7 @@
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
}
},
Expand Down Expand Up @@ -266,6 +276,11 @@
"description": "Maximum number of nested Hunspell Rules to apply. This is needed for recursive dictionaries like Hebrew.",
"type": "number"
},
"minCompoundLength": {
"default": 4,
"description": "Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`. The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words. If the length is too low, then the dictionary will consider many misspelled words as correct.",
"type": "number"
},
"removeDuplicates": {
"default": false,
"description": "Remove duplicate words, favor lower case words over mixed case words. Combine compound prefixes where possible.",
Expand Down Expand Up @@ -295,7 +310,7 @@
},
"storeSplitWordsAsCompounds": {
"default": false,
"description": "Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"description": "Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words. These words are prefixed / suffixed with `*`.",
"type": "boolean"
},
"targets": {
Expand Down
18 changes: 15 additions & 3 deletions packages/cspell-tools/src/compiler/SourceReader.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { RequireFields } from '../types.js';
import { createReader } from './Reader.js';
import type { Reader } from './readers/ReaderOptions.js';
import { parseFileLines } from './wordListParser.js';
import { parseFileLines, ParseFileOptions } from './wordListParser.js';
import type { AllowedSplitWordsCollection } from './WordsCollection.js';

export interface SourceReaderOptions {
Expand All @@ -24,6 +25,8 @@ export interface SourceReaderOptions {
allowedSplitWords: AllowedSplitWordsCollection;

storeSplitWordsAsCompounds: boolean | undefined;

minCompoundLength?: number | undefined;
}

export type AnnotatedWord = string;
Expand Down Expand Up @@ -64,8 +67,17 @@ function splitLines(lines: Iterable<string>, options: SourceReaderOptions): Iter
}

async function textFileReader(reader: Reader, options: SourceReaderOptions): Promise<SourceReader> {
const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds } = options;
const words = [...parseFileLines(reader.lines, { legacy, split, allowedSplitWords, storeSplitWordsAsCompounds })];
const { legacy, splitWords: split, allowedSplitWords, storeSplitWordsAsCompounds, minCompoundLength } = options;
const parseOptions = {
legacy,
split,
splitKeepBoth: undefined,
keepCase: undefined,
allowedSplitWords,
storeSplitWordsAsCompounds,
minCompoundLength,
} as const satisfies RequireFields<ParseFileOptions>;
const words = [...parseFileLines(reader.lines, parseOptions)];

return {
size: words.length,
Expand Down
2 changes: 2 additions & 0 deletions packages/cspell-tools/src/compiler/compile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
split = sourceOptions.split || false,
maxDepth,
storeSplitWordsAsCompounds,
minCompoundLength,
} = fileSource;

const legacy = split === 'legacy';
Expand All @@ -293,6 +294,7 @@ async function readFileSource(fileSource: FileSource, sourceOptions: CompileSour
keepCase: keepRawCase,
allowedSplitWords,
storeSplitWordsAsCompounds,
minCompoundLength,
};

logWithTimestamp(`Reading ${path.basename(filename)}`);
Expand Down
7 changes: 6 additions & 1 deletion packages/cspell-tools/src/compiler/legacyLineToWords.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import { opConcatMap, opFilter, opMap, pipe } from '@cspell/cspell-pipe/sync';

import { defaultCompileSourceOptions } from '../config/configDefaults.js';
import { regExpSpaceOrDash, splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js';
import type { AllowedSplitWordsCollection } from './WordsCollection.js';

const regNonWord = /[^\p{L}\p{M}' _\d]+/giu;
const regExpRepeatChars = /(.)\1{5}/i;

const minCompoundLength = defaultCompileSourceOptions.minCompoundLength;

export function legacyLineToWords(
line: string,
keepCase: boolean,
Expand All @@ -15,10 +18,12 @@ export function legacyLineToWords(
const filteredLine = line.replaceAll(regNonWord, '|');
const wordGroups = filteredLine.split('|');

const _minCompoundLength = minCompoundLength;

const words = pipe(
wordGroups,
opConcatMap((a) => a.split(regExpSpaceOrDash)),
opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '')),
opConcatMap((a) => splitCamelCaseIfAllowed(a, allowedSplitWords, keepCase, '', _minCompoundLength)),
opMap((a) => a.trim()),
opFilter((a) => !!a),
opFilter((s) => !regExpRepeatChars.test(s)),
Expand Down
40 changes: 22 additions & 18 deletions packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,29 @@ describe('splitCamelCaseIfAllowed', () => {
${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection', 'state', 'msg', 'closed']}
`('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => {
allowed = createAllowedSplitWords(allowed);
expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '')).toEqual(expected);
expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '', 4)).toEqual(expected);
});

test.each`
text | keepCase | allowed | expected
${''} | ${false} | ${undefined} | ${[]}
${'hello'} | ${false} | ${undefined} | ${['hello']}
${'helloThere'} | ${false} | ${['hello', 'there']} | ${['hello+', '+there']}
${'helloThere'} | ${false} | ${['hello', 'There']} | ${['hello+', '+There']}
${'helloThere'} | ${true} | ${['hello', 'There']} | ${['hello+', '+There']}
${'ERRORCode'} | ${false} | ${['error', 'code']} | ${['error+', '+code']}
${'ERRORCode'} | ${true} | ${['error', 'code']} | ${['error+', '+code']}
${'ERRORCode'} | ${true} | ${['code']} | ${['ERRORCode']}
${'ERRORCode'} | ${false} | ${['code']} | ${['ERRORCode']}
${'ErrorCode'} | ${true} | ${['error', 'code']} | ${['error+', '+code']}
${'xmlUCSIsCatZ'} | ${true} | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']}
${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection+', '+state+', '+msg', 'closed']}
`('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => {
allowed = createAllowedSplitWords(allowed);
expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+')).toEqual(expected);
});
text | keepCase | allowed | min | expected
${''} | ${false} | ${undefined} | ${4} | ${[]}
${'hello'} | ${false} | ${undefined} | ${4} | ${['hello']}
${'helloThere'} | ${false} | ${['hello', 'there']} | ${4} | ${['hello+', '+there']}
${'helloThere'} | ${false} | ${['hello', 'There']} | ${4} | ${['hello+', '+There']}
${'helloThere'} | ${true} | ${['hello', 'There']} | ${4} | ${['hello+', '+There']}
${'ERRORCode'} | ${false} | ${['error', 'code']} | ${4} | ${['error+', '+code']}
${'ERRORCode'} | ${true} | ${['error', 'code']} | ${4} | ${['error+', '+code']}
${'ERRORCode'} | ${true} | ${['code']} | ${4} | ${['ERRORCode']}
${'ERRORCode'} | ${false} | ${['code']} | ${4} | ${['ERRORCode']}
${'ErrorCode'} | ${true} | ${['error', 'code']} | ${4} | ${['error+', '+code']}
${'xmlUCSIsCatZ'} | ${true} | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${3} | ${['xml+', '+UCS+', 'is', '+cat+', 'z']}
${'xmlUCSIsCats'} | ${true} | ${['xml', 'UCS', 'is', 'cats']} | ${4} | ${['xml', 'UCS', 'is', '+cats']}
${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${4} | ${['ADP', 'connection+', '+state+', 'msg', 'closed']}
`(
'splitCamelCaseIfAllowed $text $keepCase $allowed',
({ text, keepCase, allowed, expected, min: minCompoundLength }) => {
allowed = createAllowedSplitWords(allowed);
expect(splitCamelCaseIfAllowed(text, allowed, keepCase, '+', minCompoundLength)).toEqual(expected);
},
);
});
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ export function splitCamelCaseIfAllowed(
allowedWords: AllowedSplitWordsCollection,
keepCase: boolean,
compoundPrefix: string,
minCompoundLength: number,
): string[] {
const split = [...splitCamelCase(word)];
if (split.length == 1) return adjustCases(split, allowedWords, keepCase);
Expand All @@ -20,7 +21,7 @@ export function splitCamelCaseIfAllowed(
? adjusted
: adjusted.map((w, i) => {
const { px, sx } = wordIndexes[i];
const canCompound = w.length > 2;
const canCompound = w.length >= minCompoundLength;
const lc = w.toLowerCase();
const p = canCompound && isSingleLetter(px) ? compoundPrefix : '';
const s = canCompound && isSingleLetter(sx) ? compoundPrefix : '';
Expand Down
1 change: 1 addition & 0 deletions packages/cspell-tools/src/compiler/wordListParser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ function pf(...opts: Partial<ParseFileOptions>[]): ParseFileOptions {
const opt: ParseFileOptions = {
allowedSplitWords: defaultAllowedSplitWords,
storeSplitWordsAsCompounds: undefined,
minCompoundLength: undefined,
};
for (const op of opts) {
Object.assign(opt, op);
Expand Down
21 changes: 16 additions & 5 deletions packages/cspell-tools/src/compiler/wordListParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { opCombine, opCombine as opPipe, type Operator, opFilter, opMap } from '
import { createDictionaryLineParser } from 'cspell-trie-lib';
import { uniqueFilter } from 'hunspell-reader';

import { defaultCompileSourceOptions } from '../config/configDefaults.js';
import type { CompileOptions } from './CompileOptions.js';
import { legacyLineToWords } from './legacyLineToWords.js';
import { splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js';
Expand Down Expand Up @@ -85,21 +86,30 @@ export interface ParseFileOptions {
* @default undefined
*/
storeSplitWordsAsCompounds: boolean | undefined;

/**
* Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`.
* The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words.
* If the length is too low, then the dictionary will consider many misspelled words as correct.
* @default 4
*/
minCompoundLength: number | undefined;
}

type ParseFileOptionsRequired = Required<ParseFileOptions>;

const commentCharacter = '#';

const _defaultOptions: ParseFileOptionsRequired = {
const _defaultOptions = {
keepCase: true,
legacy: false,
split: false,
splitKeepBoth: false,
// splitSeparator: regExpSplit,
allowedSplitWords: { has: () => true, size: 0 },
storeSplitWordsAsCompounds: undefined,
};
storeSplitWordsAsCompounds: defaultCompileSourceOptions.storeSplitWordsAsCompounds,
minCompoundLength: defaultCompileSourceOptions.minCompoundLength,
} as const satisfies ParseFileOptionsRequired;

export const defaultParseDictionaryOptions: ParseFileOptionsRequired = Object.freeze(_defaultOptions);

Expand All @@ -119,6 +129,7 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
splitKeepBoth = _defaultOptions.splitKeepBoth,
allowedSplitWords = _defaultOptions.allowedSplitWords,
storeSplitWordsAsCompounds,
minCompoundLength = _defaultOptions.minCompoundLength,
} = _options;

let { legacy = _defaultOptions.legacy } = _options;
Expand Down Expand Up @@ -207,7 +218,7 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
}

function splitWordIntoWords(word: string): string[] {
return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix);
return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix, minCompoundLength);
}

function* splitWords(lines: Iterable<string>): Iterable<string> {
Expand Down Expand Up @@ -260,6 +271,6 @@ export function createParseFileLineMapper(options?: Partial<ParseFileOptions>):
* @param options - defines prefixes used when parsing lines.
* @returns words that have been normalized.
*/
export function parseFileLines(lines: Iterable<string> | string, options: Partial<ParseFileOptions>): Iterable<string> {
export function parseFileLines(lines: Iterable<string> | string, options: ParseFileOptions): Iterable<string> {
return createParseFileLineMapper(options)(typeof lines === 'string' ? [lines] : lines);
}
10 changes: 9 additions & 1 deletion packages/cspell-tools/src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,19 @@ export interface CompileSourceOptions {
allowedSplitWords?: FilePath | FilePath[] | undefined;

/**
* Words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words.
* Camel case words that have been split using the `allowedSplitWords` are added to the dictionary as compoundable words.
* These words are prefixed / suffixed with `*`.
* @default false
*/
storeSplitWordsAsCompounds?: boolean | undefined;

/**
* Controls the minimum length of a compound word when storing words using `storeSplitWordsAsCompounds`.
* The compound words are prefixed / suffixed with `*`, to allow them to be combined with other compound words.
* If the length is too low, then the dictionary will consider many misspelled words as correct.
* @default 4
*/
minCompoundLength?: number | undefined;
}

export const configFileSchemaURL =
Expand Down
11 changes: 11 additions & 0 deletions packages/cspell-tools/src/config/configDefaults.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import type { RequireFields } from '../types.js';
import type { CompileSourceOptions } from './config.js';

/**
 * Default value for each `CompileSourceOptions` field listed below.
 *
 * `as const` keeps the literal types of the values (e.g. `4`, `false`), and
 * `satisfies RequireFields<CompileSourceOptions>` checks the literal against
 * `RequireFields<CompileSourceOptions>` without widening it. Fields that
 * have no default are spelled out explicitly as `undefined`.
 */
export const defaultCompileSourceOptions = {
maxDepth: undefined,
split: false,
keepRawCase: false,
allowedSplitWords: undefined,
storeSplitWordsAsCompounds: false,
minCompoundLength: 4,
} as const satisfies RequireFields<CompileSourceOptions>;
6 changes: 6 additions & 0 deletions packages/cspell-tools/src/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/**
 * Make every key of `T` required while keeping each property's declared value type.
 *
 * The keys are taken from `Required<T>`, and each value type is `T[P]`.
 * In this codebase it is used with `satisfies` on option objects that list
 * fields without a value explicitly as `undefined`.
 *
 * NOTE(review): the apparent intent is that a property declared as
 * `foo?: X | undefined` becomes `foo: X | undefined`, so an object literal
 * has to list every field - confirm against the project's compiler options.
 */
export type RequireFields<T> = {
[P in keyof Required<T>]: T[P];
};