Skip to content

Commit

Permalink
Merge pull request #26 from sillsdev/langtag_parsing
Browse files Browse the repository at this point in the history
fix: tag parsing for reopening (#26)
  • Loading branch information
andrew-polk authored Oct 28, 2024
2 parents cdf7297 + bc3a60a commit 3ebd00a
Show file tree
Hide file tree
Showing 22 changed files with 498 additions and 116 deletions.
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
"exonym",
"langtag",
"langtags",
"macrolanguage"
"macrolanguage",
"Subtag",
"subtags"
]
}
4 changes: 2 additions & 2 deletions components/language-chooser/common/find-language/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ See the main [README](../../../../README.md).

### Language data processing pipeline

If you modify [langtagProcessing.ts](./langtagProcessing.ts), run `npm run find-language/common/langtag-processing` to update [languageData.json](language-data/languageData.json) and [shortestTagLookups.json](language-data/shortestTagLookups.json).
If you modify [langtagProcessing.ts](./langtagProcessing.ts), run `npm run find-language/common/langtag-processing` to update [languageData.json](language-data/languageData.json) and [equivalentTags.json](language-data/equivalentTags.json).

#### ISO-639-3 language consolidation

Expand All @@ -136,7 +136,7 @@ find-language searches languages included in the ISO-639-3 standard; every resul

The [createTag](./languageTagUtils.ts) function in this package will return the shortest (and thus preferred) tag for a given language/script/region/dialect combination. For example, given language code "emm" (Mamulique), script code "Latn" (Latin) and region code "MX" (Mexico), `createTag` will return "emm" because it is the preferred equivalent tag for emm-Latn-MX.

[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags. langtagProcessing.ts reformats it into [shortestTagLookups.json](language-data/shortestTagLookups.json) which we use for mapping language tags to their shortest equivalent.
[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags. langtagProcessing.ts reformats it into [equivalentTags.json](language-data/equivalentTags.json) which we use for mapping language tags to their shortest and maximal equivalents.

### Unit tests

Expand Down

This file was deleted.

This file was deleted.

1 change: 0 additions & 1 deletion components/language-chooser/common/find-language/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
export * from "./findLanguageInterfaces";
export * from "./getShortestSufficientLangtag";
export * from "./languageTagUtils";
export * from "./matchingSubstringDemarcation";
export * from "./searchForLanguage";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,19 +236,29 @@ function parseLangtagsJson() {
}

/**
 * Reads language-data/langtags.txt and writes language-data/equivalentTags.json.
 *
 * Each non-empty input line is one equivalence set; it becomes an object holding
 * the first tag on the line (`shortest`), the last tag on the line (`maximal`),
 * and every tag on the line (`allTags`).
 */
function parseLangTagsTxt() {
  /*
  From https://github.com/silnrsi/langtags/blob/master/doc/langtags.md
  Langtags.txt contains a sequence of equivalence sets. Each set consists of a
  list of language tags separated by =. The first tag on the line is the canonical
  tag and the last tag on the line is the maximal tag. In addition, a tag is
  prefixed with * if there is an entry in the SLDR for that particular tag. */
  const langTagsTxtRaw = fs.readFileSync("language-data/langtags.txt", "utf8");
  // The * prefix is not part of the tag itself, so strip it before splitting.
  const langTagsTxt = langTagsTxtRaw.replaceAll("*", "");
  const tagLookups = [];
  // Accept both LF and CRLF line endings; otherwise a CRLF file would leave a
  // trailing "\r" on the last (maximal) tag of every line.
  for (const rawLine of langTagsTxt.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length === 0) {
      continue;
    }
    const tags = line.split(" = ");
    tagLookups.push({
      shortest: tags[0],
      maximal: tags[tags.length - 1],
      allTags: tags,
    });
  }
  // Single output file: the one languageTagUtils imports for its lookups.
  fs.writeFileSync(
    "language-data/equivalentTags.json",
    JSON.stringify(tagLookups)
  );
}
Expand Down

Large diffs are not rendered by default.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import { expect, it, describe } from "vitest";
import { createTag } from "./languageTagUtils";
import {
createTag,
getMaximalLangtag,
getShortestSufficientLangtag,
} from "./languageTagUtils";

describe("Language tag utils", () => {
describe("Tag creation", () => {
it("should create the correct language tag for a language", () => {
expect(createTag({ languageCode: "eng" })).toEqual("eng");
expect(
Expand All @@ -11,10 +15,10 @@ describe("Language tag utils", () => {
languageCode: "eng",
scriptCode: "Latn",
})
).toEqual("eng-Latn-US-foobar");
).toEqual("eng-Latn-US-x-foobar");
});
expect(createTag({ languageCode: "eng", dialectCode: "foobar" })).toEqual(
"eng-foobar"
"eng-x-foobar"
);
expect(createTag({ languageCode: "eng", regionCode: "IN" })).toEqual(
"eng-IN"
Expand All @@ -28,3 +32,68 @@ describe("Language tag utils", () => {
).toEqual("qaa-Latn-US-x-foobar");
});
});

describe("get shortest equivalent version of langtag", () => {
  it("should return the shortest tag if it exists", () => {
    expect(getShortestSufficientLangtag("en")).toEqual("en");

    // Each of these subtag combinations is expected to shorten to plain "frm".
    const frmVariants = [
      { languageCode: "frm" },
      { languageCode: "frm", scriptCode: "Latn", regionCode: "FR" },
      { languageCode: "frm", regionCode: "FR" },
      { languageCode: "frm", scriptCode: "Latn" },
    ];
    for (const variant of frmVariants) {
      const fullTag = createTag(variant);
      expect(getShortestSufficientLangtag(fullTag)).toEqual("frm");
    }
  });

  it("should be case insensitive", () => {
    for (const oddlyCasedTag of ["fRm", "FRM-LaTn"]) {
      expect(getShortestSufficientLangtag(oddlyCasedTag)).toEqual("frm");
    }
  });

  it("should return undefined if tag is not found in the equivalence list (langtags.txt)", () => {
    for (const unknownTag of ["zzz", "", "frm-Cyrl"]) {
      expect(getShortestSufficientLangtag(unknownTag)).toBeUndefined();
    }
  });
});

describe("get maximal equivalent version of langtag", () => {
  // Every tag exercised by the positive tests belongs to this equivalence set.
  const expectedMaximal = "dtp-Latn-MY";

  it("should return the maximal tag if it exists", () => {
    for (const rawTag of ["dtp-Latn-MY", "dtp", "ktr", "kzt-MY"]) {
      expect(getMaximalLangtag(rawTag)).toEqual(expectedMaximal);
    }

    // Tags assembled through createTag should resolve the same way.
    const dtpVariants = [
      { languageCode: "dtp", regionCode: "MY", scriptCode: "Latn" },
      { languageCode: "dtp", scriptCode: "Latn" },
      { languageCode: "dtp", regionCode: "MY" },
      { languageCode: "dtp" },
    ];
    for (const variant of dtpVariants) {
      const builtTag = createTag(variant);
      expect(getMaximalLangtag(builtTag)).toEqual(expectedMaximal);
    }
  });

  it("should be case insensitive", () => {
    for (const oddlyCasedTag of ["DTP-Latn-My", "DtP"]) {
      expect(getMaximalLangtag(oddlyCasedTag)).toEqual(expectedMaximal);
    }
  });

  it("should return undefined if tag is not found in the equivalence list (langtags.txt)", () => {
    for (const unknownTag of ["zzz", "", "frm-Cyrl"]) {
      expect(getMaximalLangtag(unknownTag)).toBeUndefined();
    }
  });
});
Original file line number Diff line number Diff line change
@@ -1,4 +1,26 @@
import { getShortestSufficientLangtag } from "./getShortestSufficientLangtag";
import equivalentTags from "./language-data/equivalentTags.json" assert { type: "json" };

// Both maps are keyed by the lower-cased form of every tag in an equivalence set,
// so any member of a set resolves to that set's shortest / maximal tag.
const shortPreferredTagLookup = new Map<string, string>();
const maximalTagLookup = new Map<string, string>();
equivalentTags.forEach(({ shortest, maximal, allTags }) => {
  allTags.forEach((equivalentTag) => {
    const key = equivalentTag.toLowerCase();
    shortPreferredTagLookup.set(key, shortest);
    maximalTagLookup.set(key, maximal);
  });
});

/**
 * Looks up the shortest tag equivalent to `langtag`. The lookup is case insensitive.
 *
 * @returns the shortest equivalent tag, or undefined if langtag is not in
 * langtags.txt and so equivalents cannot be looked up
 */
export function getShortestSufficientLangtag(
  langtag: string
): string | undefined {
  const lookupKey = langtag.toLowerCase();
  return shortPreferredTagLookup.get(lookupKey);
}

/**
 * Looks up the maximal tag equivalent to `langtag`. The lookup is case insensitive.
 *
 * @returns the maximal equivalent tag, or undefined if langtag is not in
 * langtags.txt and so equivalents cannot be looked up
 */
export function getMaximalLangtag(langtag: string): string | undefined {
  const lookupKey = langtag.toLowerCase();
  return maximalTagLookup.get(lookupKey);
}

export function createTag({
languageCode,
Expand All @@ -24,11 +46,13 @@ export function createTag({
if (regionCode) {
tag += `-${regionCode}`;
}
if (!languageCode) {
// TODO future work: If we ever make the language chooser aware of registered variants, some should not be preceded by the "-x-"
// For example, compare aai-x-suboro and be-tarask in langtags.txt and langtags.json
if (!languageCode || dialectCode) {
tag += "-x";
}
if (dialectCode) {
tag += `-${dialectCode}`;
}
return getShortestSufficientLangtag(tag);
return getShortestSufficientLangtag(tag) || tag;
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@ export function getAllRegions(): IRegion[] {
});
}

/**
 * Finds the region whose alpha-2 code equals `code`, ignoring case.
 *
 * @returns the region's name and alpha-2 code, or undefined when nothing matches
 */
export function getRegionBySubtag(code: string): IRegion | undefined {
  const match = iso31661.find(
    (candidate) => candidate.alpha2.toLowerCase() === code.toLowerCase()
  );
  return match
    ? ({ name: match.name, code: match.alpha2 } as IRegion)
    : undefined;
}

// ISO-15924 is a script code to script name lookup
export function getAllScripts(): IScript[] {
return iso15924.map((script) => {
Expand All @@ -21,3 +34,16 @@ export function getAllScripts(): IScript[] {
} as IScript;
});
}

/**
 * Finds the script whose code equals `code`, ignoring case.
 *
 * @returns the script's name and code, or undefined when nothing matches
 */
export function getScriptBySubtag(code: string): IScript | undefined {
  const match = iso15924.find(
    (candidate) => candidate.code.toLowerCase() === code.toLowerCase()
  );
  return match
    ? ({ name: match.name, code: match.code } as IScript)
    : undefined;
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { searchForLanguage } from "./searchForLanguage";
import { getLanguageBySubtag, searchForLanguage } from "./searchForLanguage";
import { ILanguage } from "./findLanguageInterfaces";
import { describe, expect, it } from "vitest";
import { expectTypeOf } from "vitest";
Expand Down Expand Up @@ -113,3 +113,10 @@ function indexOfLanguageInSearchResults(
);
return index;
}

describe("getLanguageBySubtag", () => {
  it("should find languages by valid languageSubtag field", () => {
    // One three-letter and one two-letter subtag.
    const threeLetterResult = getLanguageBySubtag("aaa");
    expect(threeLetterResult?.exonym).toEqual("Ghotuo");

    const twoLetterResult = getLanguageBySubtag("ab");
    expect(twoLetterResult?.exonym).toEqual("Abkhaz");
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,17 @@ export function searchForLanguage(
);
return [...prefixOnlyResults, ...nonPrefixResults];
}

/**
 * Gets the language (not macrolanguage) whose `languageSubtag` or `iso639_3_code`
 * is exactly `code`, compared case-insensitively.
 *
 * This is a plain equality scan rather than a fuzzy search: it guarantees that only
 * a whole-field match is returned, and it avoids constructing a new search index
 * over the full language list on every call.
 *
 * @param code - language subtag or ISO 639-3 code to look up
 * @returns the first matching non-macrolanguage entry, or undefined if there is none
 */
export function getLanguageBySubtag(code: string): ILanguage | undefined {
  const wanted = code.toLowerCase();
  if (wanted.length === 0) {
    // An empty code can never identify a language.
    return undefined;
  }
  return (languages as ILanguage[]).find(
    (language) =>
      !language.isMacrolanguage &&
      (language.languageSubtag?.toLowerCase() === wanted ||
        language.iso639_3_code?.toLowerCase() === wanted)
  );
}
Original file line number Diff line number Diff line change
@@ -1 +1,11 @@
export * from "./useLanguageChooser";
export {
isUnlistedLanguage,
createTagFromOrthography,
// We don't want to export parseLangtagForLangChooser because it is not a comprehensive langtag parser;
// it was built only to handle the langtags output by the language chooser and by the libPalasso language picker that was in BloomDesktop.
} from "./languageTagHandling";
export type {
IOrthography,
ICustomizableLanguageDetails,
} from "./languageTagHandling";
Loading

0 comments on commit 3ebd00a

Please sign in to comment.