fix: Add option to not auto stem during split (#4310)

streetsidesoftware · Mar 13, 2023 · 23059da
1 parent fae4975
commit 23059da
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 33 deletions.
diff --git a/packages/cspell-tools/src/compiler/legacyLineToWords.test.ts b/packages/cspell-tools/src/compiler/legacyLineToWords.test.ts
@@ -14,23 +14,23 @@ describe('Validate legacyLineToWords', () => {
  });
 
  test.each`
- line  | expectedResult
- ${'hello'}  | ${['hello']}
- ${'AppendIterator::getArrayIterator'}  | ${['append', 'iterator', 'get', 'array']}
- ${'Austin Martin'}  | ${['austin', 'martin']}
- ${'JPEGsBLOBs'}  | ${['jpegs', 'blobs']}
- ${'CURLs CURLing' /* Sadly we cannot do this one correctly */} | ${['curls', 'curling']}
- ${'DNSTable Lookup'}  | ${['dns', 'table', 'lookup']}
- ${'OUTRing'}  | ${['outring']}
- ${'OUTRings'}  | ${['outrings']}
- ${'DIRs'}  | ${['dirs']}
- ${'AVGAspect'}  | ${['avg', 'aspect']}
- ${'New York'}  | ${['new', 'york']}
- ${'Namespace DNSLookup'}  | ${['namespace', 'dns', 'lookup']}
- ${'well-educated'}  | ${['well', 'educated']}
- ${'CURLcode'}  | ${['cur', 'lcode']}
- ${'kDNSServiceErr_BadSig'}  | ${['k', 'dns', 'service', 'err', 'bad', 'sig']}
- ${'apd_get_active_symbols'}  | ${['apd', 'get', 'active', 'symbols']}
+ line | expectedResult
+ ${'hello'} | ${['hello']}
+ ${'AppendIterator::getArrayIterator'} | ${['append', 'iterator', 'get', 'array']}
+ ${'Austin Martin'} | ${['austin', 'martin']}
+ ${'JPEGSBlobs'} | ${['jpegs', 'blobs']}
+ ${'CURLS Curling'}  | ${['curls', 'curling']}
+ ${'DNSTable Lookup'} | ${['dns', 'table', 'lookup']}
+ ${'OUTRing'} | ${['out', 'ring']}
+ ${'OUTRings'} | ${['out', 'rings']}
+ ${'DIRs'} | ${['di', 'rs']}
+ ${'AVGAspect'} | ${['avg', 'aspect']}
+ ${'New York'} | ${['new', 'york']}
+ ${'Namespace DNSLookup'} | ${['namespace', 'dns', 'lookup']}
+ ${'well-educated'} | ${['well', 'educated']}
+ ${'CURLcode'} | ${['cur', 'lcode']}
+ ${'kDNSServiceErr_BadSig'} | ${['k', 'dns', 'service', 'err', 'bad', 'sig']}
+ ${'apd_get_active_symbols'} | ${['apd', 'get', 'active', 'symbols']}
  `('legacy splitting lines $line', ({ line, expectedResult }: { line: string; expectedResult: string[] }) => {
  expect([...pipe(legacyLineToWords(line, false, allowed), opFilter(distinct()))]).toEqual(expectedResult);
  });

diff --git a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.test.ts
@@ -3,17 +3,19 @@ import { splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed';
 
 describe('splitCamelCaseIfAllowed', () => {
  test.each`
- text | keepCase | allowed | expected
- ${''} | ${false} | ${undefined} | ${[]}
- ${'hello'} | ${false} | ${undefined} | ${['hello']}
- ${'helloThere'} | ${false} | ${['hello', 'there']} | ${['hello', 'there']}
- ${'helloThere'} | ${false} | ${['hello', 'There']} | ${['hello', 'There']}
- ${'helloThere'} | ${true} | ${['hello', 'There']} | ${['hello', 'There']}
- ${'ERRORCode'} | ${false} | ${['error', 'code']} | ${['error', 'code']}
- ${'ERRORCode'} | ${true} | ${['error', 'code']} | ${['ERROR', 'code']}
- ${'ERRORCode'} | ${true} | ${['code']} | ${['ERRORCode']}
- ${'ERRORCode'} | ${false} | ${['code']} | ${['ERRORCode']}
- ${'ErrorCode'} | ${true} | ${['error', 'code']} | ${['error', 'code']}
+ text | keepCase | allowed | expected
+ ${''} | ${false} | ${undefined} | ${[]}
+ ${'hello'} | ${false} | ${undefined} | ${['hello']}
+ ${'helloThere'} | ${false} | ${['hello', 'there']} | ${['hello', 'there']}
+ ${'helloThere'} | ${false} | ${['hello', 'There']} | ${['hello', 'There']}
+ ${'helloThere'} | ${true} | ${['hello', 'There']} | ${['hello', 'There']}
+ ${'ERRORCode'} | ${false} | ${['error', 'code']} | ${['error', 'code']}
+ ${'ERRORCode'} | ${true} | ${['error', 'code']} | ${['ERROR', 'code']}
+ ${'ERRORCode'} | ${true} | ${['code']} | ${['ERRORCode']}
+ ${'ERRORCode'} | ${false} | ${['code']} | ${['ERRORCode']}
+ ${'ErrorCode'} | ${true} | ${['error', 'code']} | ${['error', 'code']}
+ ${'xmlUCSIsCatZ'} | ${true} | ${['xml', 'UCS', 'is', 'cat', 'z']} | ${['xml', 'UCS', 'is', 'cat', 'z']}
+ ${'ADP_ConnectionStateMsg_Closed'} | ${true} | ${undefined} | ${['ADP', 'connection', 'state', 'msg', 'closed']}
  `('splitCamelCaseIfAllowed $text $keepCase $allowed', ({ text, keepCase, allowed, expected }) => {
  allowed = createAllowedSplitWords(allowed);
  expect(splitCamelCaseIfAllowed(text, allowed, keepCase)).toEqual(expected);

diff --git a/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts b/packages/cspell-tools/src/compiler/splitCamelCaseIfAllowed.ts
@@ -34,7 +34,7 @@ function isUnknown(word: string, allowedWords: AllowedSplitWordsCollection): boo
 }
 
 function splitCamelCase(word: string): Iterable<string> {
- const splitWords = Text.splitCamelCaseWord(word).filter((word) => !regExpIsNumber.test(word));
+ const splitWords = Text.splitCamelCaseWord(word, false).filter((word) => !regExpIsNumber.test(word));
  // We only want to preserve this: "New York" and not "Namespace DNSLookup"
  if (splitWords.length > 1 && regExpSpaceOrDash.test(word)) {
  return splitWords.flatMap((w) => w.split(regExpSpaceOrDash));

diff --git a/packages/cspell-tools/src/compiler/text.ts b/packages/cspell-tools/src/compiler/text.ts
@@ -7,8 +7,8 @@ const regExSplitWords2 = /(\p{Lu})(\p{Lu}\p{Ll})/gu;
 /**
  * Split camelCase words into an array of strings.
  */
-export function splitCamelCaseWord(word: string): string[] {
- const wPrime = word.replace(regExUpperSOrIng, (s) => s[0] + s.slice(1).toLowerCase());
+export function splitCamelCaseWord(word: string, autoStem = true): string[] {
+ const wPrime = autoStem ? word.replace(regExUpperSOrIng, (s) => s[0] + s.slice(1).toLowerCase()) : word;
  const pass1 = wPrime.replace(regExSplitWords, '$1|$2');
  const pass2 = pass1.replace(regExSplitWords2, '$1|$2');
  const pass3 = pass2.replace(/[\d_]+/g, '|');

diff --git a/packages/cspell-tools/src/compiler/wordListCompiler.ts b/packages/cspell-tools/src/compiler/wordListCompiler.ts
@@ -23,13 +23,23 @@ export async function compileWordList(
  destFilename: string,
  options: CompileOptions
 ): Promise<void> {
- const filter = normalizeTargetWords(options);
+ const finalLines = normalize(lines, options);
 
- const finalSeq = pipe(wordListHeaderLines, opAppend(pipe(lines, filter)));
+ const finalSeq = pipe(wordListHeaderLines, opAppend(finalLines));
 
  return createWordListTarget(destFilename)(finalSeq);
 }
 
+function normalize(lines: Iterable<string>, options: CompileOptions): Iterable<string> { // Runs `lines` through the word filter built from `options`; de-duplicates and sorts only when `options.sort` is truthy.
+ const filter = normalizeTargetWords(options); // Operator derived from the compile options; it is applied to `lines` via `pipe` below.
+
+ const iter = pipe(lines, filter);
+ if (!options.sort) return iter; // No sort requested: hand back the piped iterable as-is (duplicates are not removed on this path).
+
+ const result = new Set(iter); // A Set keeps one copy of each distinct word.
+ return [...result].sort(); // Default Array.prototype.sort: ascending by UTF-16 code units, not locale-aware.
+}
+
 function createWordListTarget(destFilename: string): (seq: Iterable<string>) => Promise<void> {
  const target = createTarget(destFilename);
  return (seq: Iterable<string>) =>