Merge pull request #26 from sillsdev/langtag_parsing

fix: tag parsing for reopening (#26)
sillsdev · Oct 28, 2024 · 3ebd00a · 3ebd00a
2 parents cdf7297 + bc3a60a
commit 3ebd00a
Show file tree

Hide file tree

Showing 22 changed files with 498 additions and 116 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -12,6 +12,8 @@
         "exonym",
         "langtag",
         "langtags",
-        "macrolanguage"
+        "macrolanguage",
+        "Subtag",
+        "subtags"
     ]
 }
diff --git a/components/language-chooser/common/find-language/README.md b/components/language-chooser/common/find-language/README.md
@@ -126,7 +126,7 @@ See the main [README](../../../../README.md).
 
 ### Language data processing pipeline
 
-If you modify [langtagProcessing.ts](./langtagProcessing.ts), run `npm run find-language/common/langtag-processing` to update [languageData.json](language-data/languageData.json) and [shortestTagLookups.json](language-data/shortestTagLookups.json).
+If you modify [langtagProcessing.ts](./langtagProcessing.ts), run `npm run find-language/common/langtag-processing` to update [languageData.json](language-data/languageData.json) and [equivalentTags.json](language-data/equivalentTags.json).
 
 #### ISO-639-3 language consolidation
 
@@ -136,7 +136,7 @@ find-language searches languages included in the ISO-639-3 standard; every resul
 
 The [createTag](./languageTagUtils.ts) function in this package will return the shortest (and thus preferred) tag for a given language/script/region/dialect combination. For example, given language code "emm" (Mamulique), script code "Latn" (Latin) and region code "MX" (Mexico), `createTag` will return "emm" because it is the preferred equivalent tag for emm-Latn-MX.
 
-[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags. langtagProcessing.ts reformats it into [shortestTagLookups.json](language-data/shortestTagLookups.json) which we use for mapping language tags to their shortest equivalent.
+[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags. langtagProcessing.ts reformats it into [equivalentTags.json](language-data/equivalentTags.json) which we use for mapping language tags to their shortest and maximal equivalents.
 
 ### Unit tests
 

diff --git a/components/language-chooser/common/find-language/getShortestSufficientLangtag.spec.ts b/components/language-chooser/common/find-language/getShortestSufficientLangtag.spec.ts
diff --git a/components/language-chooser/common/find-language/getShortestSufficientLangtag.ts b/components/language-chooser/common/find-language/getShortestSufficientLangtag.ts
diff --git a/components/language-chooser/common/find-language/index.ts b/components/language-chooser/common/find-language/index.ts
@@ -1,5 +1,4 @@
 export * from "./findLanguageInterfaces";
-export * from "./getShortestSufficientLangtag";
 export * from "./languageTagUtils";
 export * from "./matchingSubstringDemarcation";
 export * from "./searchForLanguage";

diff --git a/components/language-chooser/common/find-language/langtagProcessing.ts b/components/language-chooser/common/find-language/langtagProcessing.ts
@@ -236,19 +236,29 @@ function parseLangtagsJson() {
 }
 
 function parseLangTagsTxt() {
+  /*
+  Per https://github.com/silnrsi/langtags/blob/master/doc/langtags.md:
+  langtags.txt contains a sequence of equivalence sets, one per line. Each set
+  is a list of language tags separated by " = ". The first tag on the line is
+  the canonical tag and the last tag on the line is the maximal tag. A tag is
+  prefixed with "*" if the SLDR has an entry for it; we strip those markers. */
   const langTagsTxtRaw = fs.readFileSync("language-data/langtags.txt", "utf8");
   const langTagsTxt = langTagsTxtRaw.replaceAll("*", "");
   const lines = langTagsTxt.split("\n");
   const tagLookups = [];
   for (const line of lines) {
+    if (line.length === 0) {
+      continue;
+    }
     const tags = line.split(" = ");
     tagLookups.push({
       shortest: tags[0],
+      maximal: tags[tags.length - 1],
       allTags: tags,
     });
   }
   fs.writeFileSync(
-    "language-data/shortestTagLookups.json",
+    "language-data/equivalentTags.json",
     JSON.stringify(tagLookups)
   );
 }

diff --git a/components/language-chooser/common/find-language/language-data/equivalentTags.json b/components/language-chooser/common/find-language/language-data/equivalentTags.json
diff --git a/components/language-chooser/common/find-language/language-data/shortestTagLookups.json b/components/language-chooser/common/find-language/language-data/shortestTagLookups.json
diff --git a/components/language-chooser/common/find-language/languageTagUtils.spec.ts b/components/language-chooser/common/find-language/languageTagUtils.spec.ts
@@ -1,7 +1,11 @@
 import { expect, it, describe } from "vitest";
-import { createTag } from "./languageTagUtils";
+import {
+  createTag,
+  getMaximalLangtag,
+  getShortestSufficientLangtag,
+} from "./languageTagUtils";
 
-describe("Language tag utils", () => {
+describe("Tag creation", () => {
   it("should create the correct language tag for a language", () => {
     expect(createTag({ languageCode: "eng" })).toEqual("eng");
     expect(
@@ -11,10 +15,10 @@ describe("Language tag utils", () => {
         languageCode: "eng",
         scriptCode: "Latn",
       })
-    ).toEqual("eng-Latn-US-foobar");
+    ).toEqual("eng-Latn-US-x-foobar");
   });
   expect(createTag({ languageCode: "eng", dialectCode: "foobar" })).toEqual(
-    "eng-foobar"
+    "eng-x-foobar"
   );
   expect(createTag({ languageCode: "eng", regionCode: "IN" })).toEqual(
     "eng-IN"
@@ -28,3 +32,68 @@ describe("Language tag utils", () => {
     ).toEqual("qaa-Latn-US-x-foobar");
   });
 });
+
+describe("get shortest equivalent version of langtag", () => {
+  it("should return the shortest tag if it exists", () => {
+    expect(getShortestSufficientLangtag("en")).toEqual("en");
+    // Each of these tags is expected to shorten to plain "frm".
+    const tagsEquivalentToFrm = [
+      createTag({ languageCode: "frm" }),
+      createTag({ languageCode: "frm", scriptCode: "Latn", regionCode: "FR" }),
+      createTag({ languageCode: "frm", regionCode: "FR" }),
+      createTag({ languageCode: "frm", scriptCode: "Latn" }),
+    ];
+    for (const tag of tagsEquivalentToFrm) {
+      expect(getShortestSufficientLangtag(tag)).toEqual("frm");
+    }
+  });
+  it("should be case insensitive", () => {
+    for (const mixedCaseTag of ["fRm", "FRM-LaTn"]) {
+      expect(getShortestSufficientLangtag(mixedCaseTag)).toEqual("frm");
+    }
+  });
+  it("should return undefined if tag is not found in the equivalence list (langtags.txt)", () => {
+    for (const unknownTag of ["zzz", "", "frm-Cyrl"]) {
+      expect(getShortestSufficientLangtag(unknownTag)).toBeUndefined();
+    }
+  });
+});
+
+describe("get maximal equivalent version of langtag", () => {
+  it("should return the maximal tag if it exists", () => {
+    // Each of these tags is expected to expand to "dtp-Latn-MY".
+    const tagsEquivalentToDtp = [
+      "dtp-Latn-MY",
+      "dtp",
+      "ktr",
+      "kzt-MY",
+      createTag({ languageCode: "dtp", regionCode: "MY", scriptCode: "Latn" }),
+      createTag({ languageCode: "dtp", scriptCode: "Latn" }),
+      createTag({ languageCode: "dtp", regionCode: "MY" }),
+      createTag({ languageCode: "dtp" }),
+    ];
+    for (const tag of tagsEquivalentToDtp) {
+      expect(getMaximalLangtag(tag)).toEqual("dtp-Latn-MY");
+    }
+  });
+  it("should be case insensitive", () => {
+    for (const mixedCaseTag of ["DTP-Latn-My", "DtP"]) {
+      expect(getMaximalLangtag(mixedCaseTag)).toEqual("dtp-Latn-MY");
+    }
+  });
+  it("should return undefined if tag is not found in the equivalence list (langtags.txt)", () => {
+    for (const unknownTag of ["zzz", "", "frm-Cyrl"]) {
+      expect(getMaximalLangtag(unknownTag)).toBeUndefined();
+    }
+  });
+});
diff --git a/components/language-chooser/common/find-language/languageTagUtils.ts b/components/language-chooser/common/find-language/languageTagUtils.ts
@@ -1,4 +1,26 @@
-import { getShortestSufficientLangtag } from "./getShortestSufficientLangtag";
+import equivalentTags from "./language-data/equivalentTags.json" assert { type: "json" };
+
+// Both maps are keyed by the lower-cased tag so lookups can ignore case.
+const shortPreferredTagLookup = new Map<string, string>();
+const maximalTagLookup = new Map<string, string>();
+equivalentTags.forEach(({ allTags, shortest, maximal }) => {
+  allTags.forEach((equivalentTag) => {
+    const key = equivalentTag.toLowerCase();
+    shortPreferredTagLookup.set(key, shortest);
+    maximalTagLookup.set(key, maximal);
+  });
+});
+
+/**
+ * Look up the shortest (preferred) tag equivalent to `langtag`.
+ * The lookup is case insensitive. Returns undefined if langtag is not in
+ * langtags.txt, in which case its equivalents cannot be looked up.
+ */
+export function getShortestSufficientLangtag(
+  langtag: string
+): string | undefined {
+  const lookupKey = langtag.toLowerCase();
+  return shortPreferredTagLookup.get(lookupKey);
+}
+
+/**
+ * Look up the maximal tag equivalent to `langtag`.
+ * The lookup is case insensitive. Returns undefined if langtag is not in
+ * langtags.txt, in which case its equivalents cannot be looked up.
+ */
+export function getMaximalLangtag(langtag: string): string | undefined {
+  const lookupKey = langtag.toLowerCase();
+  return maximalTagLookup.get(lookupKey);
+}
 
 export function createTag({
   languageCode,
@@ -24,11 +46,13 @@ export function createTag({
   if (regionCode) {
     tag += `-${regionCode}`;
   }
-  if (!languageCode) {
+  // TODO future work: If we ever make the language chooser aware of registered variants, some should not be preceded by the "-x-"
+  // For example, compare aai-x-suboro and be-tarask in langtags.txt and langtags.json
+  if (!languageCode || dialectCode) {
     tag += "-x";
   }
   if (dialectCode) {
     tag += `-${dialectCode}`;
   }
-  return getShortestSufficientLangtag(tag);
+  return getShortestSufficientLangtag(tag) || tag;
 }
diff --git a/components/language-chooser/common/find-language/regionsAndScripts.ts b/components/language-chooser/common/find-language/regionsAndScripts.ts
@@ -12,6 +12,19 @@ export function getAllRegions(): IRegion[] {
   });
 }
 
+/**
+ * Find the region whose `alpha2` code in the iso31661 list equals `code`,
+ * ignoring case. Returns undefined when no entry matches.
+ */
+export function getRegionBySubtag(code: string): IRegion | undefined {
+  const wantedCode = code.toLowerCase();
+  const match = iso31661.find(
+    (region) => region.alpha2.toLowerCase() === wantedCode
+  );
+  if (!match) {
+    return undefined;
+  }
+  return { name: match.name, code: match.alpha2 } as IRegion;
+}
+
 // ISO-15924 is a script code to script name lookup
 export function getAllScripts(): IScript[] {
   return iso15924.map((script) => {
@@ -21,3 +34,16 @@ export function getAllScripts(): IScript[] {
     } as IScript;
   });
 }
+
+/**
+ * Find the script whose `code` in the iso15924 list equals `code`,
+ * ignoring case. Returns undefined when no entry matches.
+ */
+export function getScriptBySubtag(code: string): IScript | undefined {
+  const wantedCode = code.toLowerCase();
+  const match = iso15924.find(
+    (script) => script.code.toLowerCase() === wantedCode
+  );
+  if (!match) {
+    return undefined;
+  }
+  return { name: match.name, code: match.code } as IScript;
+}
diff --git a/components/language-chooser/common/find-language/searchForLanguage.spec.ts b/components/language-chooser/common/find-language/searchForLanguage.spec.ts
@@ -1,4 +1,4 @@
-import { searchForLanguage } from "./searchForLanguage";
+import { getLanguageBySubtag, searchForLanguage } from "./searchForLanguage";
 import { ILanguage } from "./findLanguageInterfaces";
 import { describe, expect, it } from "vitest";
 import { expectTypeOf } from "vitest";
@@ -113,3 +113,10 @@ function indexOfLanguageInSearchResults(
   );
   return index;
 }
+
+describe("getLanguageBySubtag", () => {
+  it("should find languages by valid languageSubtag field", () => {
+    // Pairs of [subtag to look up, exonym of the language we expect back].
+    const expectedExonyms: [string, string][] = [
+      ["aaa", "Ghotuo"],
+      ["ab", "Abkhaz"],
+    ];
+    for (const [subtag, exonym] of expectedExonyms) {
+      expect(getLanguageBySubtag(subtag)?.exonym).toEqual(exonym);
+    }
+  });
+});
diff --git a/components/language-chooser/common/find-language/searchForLanguage.ts b/components/language-chooser/common/find-language/searchForLanguage.ts
@@ -61,3 +61,17 @@ export function searchForLanguage(
   );
   return [...prefixOnlyResults, ...nonPrefixResults];
 }
+
+/**
+ * Get the language (not macrolanguage) whose subtag exactly matches `code`.
+ *
+ * `code` is matched against each entry's `languageSubtag` and `iso639_3_code`
+ * fields using Fuse's default (case-insensitive) comparison. Entries flagged
+ * `isMacrolanguage` are skipped.
+ *
+ * @param code The language subtag or ISO 639-3 code to look up.
+ * @returns The first matching non-macrolanguage entry, or undefined if `code`
+ * is blank or no entry matches exactly.
+ */
+export function getLanguageBySubtag(code: string): ILanguage | undefined {
+  // A blank pattern cannot identify a language; bail out before searching.
+  if (code.trim().length === 0) {
+    return undefined;
+  }
+  const fuse = new Fuse(languages as ILanguage[], {
+    keys: ["languageSubtag", "iso639_3_code"],
+    // threshold: 0 on its own still accepts fields that merely start with the
+    // pattern (e.g. "ab" would also match "aba"), so use extended search,
+    // whose "=" operator requires the whole field to equal the pattern.
+    useExtendedSearch: true,
+  });
+  const exactMatches = fuse.search(`=${code}`);
+  // More than one entry can match exactly, in case one is a macrolanguage.
+  return exactMatches.find((result) => !result.item.isMacrolanguage)?.item;
+}
diff --git a/components/language-chooser/react/common/language-chooser-react-hook/index.ts b/components/language-chooser/react/common/language-chooser-react-hook/index.ts
@@ -1 +1,11 @@
 export * from "./useLanguageChooser";
+export {
+  isUnlistedLanguage,
+  createTagFromOrthography,
+  // We deliberately do not export parseLangtagForLangChooser because it is not a comprehensive langtag parser.
+  // It was built only to handle the langtags output by the language chooser and by the libPalasso language picker that was in BloomDesktop.
+} from "./languageTagHandling";
+export type {
+  IOrthography,
+  ICustomizableLanguageDetails,
+} from "./languageTagHandling";