From 37794a3ff4e8d62ef9da2cabb84d369e191d6b2e Mon Sep 17 00:00:00 2001 From: ROpdebee <15186467+ROpdebee@users.noreply.github.com> Date: Sun, 19 Jun 2022 15:20:01 +0200 Subject: [PATCH 1/2] feat(guess lang): script to guess language and script from tracklist --- src/lib/MB/types.ts | 17 +++ src/lib/util/format.ts | 4 + src/mb_guess_language/index.tsx | 143 ++++++++++++++++++++++++ src/mb_guess_language/libretranslate.ts | 92 +++++++++++++++ src/mb_guess_language/meta.ts | 16 +++ src/mb_guess_language/script.ts | 59 ++++++++++ src/mb_guess_language/tsconfig.json | 10 ++ tsconfig.json | 1 + 8 files changed, 342 insertions(+) create mode 100644 src/mb_guess_language/index.tsx create mode 100644 src/mb_guess_language/libretranslate.ts create mode 100644 src/mb_guess_language/meta.ts create mode 100644 src/mb_guess_language/script.ts create mode 100644 src/mb_guess_language/tsconfig.json diff --git a/src/lib/MB/types.ts b/src/lib/MB/types.ts index 7119a4988..06419d5dc 100644 --- a/src/lib/MB/types.ts +++ b/src/lib/MB/types.ts @@ -18,10 +18,27 @@ export interface ExternalLinks { }; } +export interface ReleaseEditorMedium { + loaded(): boolean; + loading(): boolean; + loadTracks(): void; + tracks(): Array<{ + name(): string; + }>; +} + +export interface ReleaseEditorFields { + release(): { + name(): string; + mediums(): ReleaseEditorMedium[]; + }; +} + export interface ReleaseEditor { externalLinks: { current: ExternalLinks; }; + rootField: ReleaseEditorFields; } declare global { diff --git a/src/lib/util/format.ts b/src/lib/util/format.ts index 57091f1d0..5720ecbff 100644 --- a/src/lib/util/format.ts +++ b/src/lib/util/format.ts @@ -11,3 +11,7 @@ export function formatFileSize(size: number): string { const truncatedSize = Number((size / Math.pow(1024, order)).toFixed(2)); return `${truncatedSize} ${suffixes[order]}`; } + +export function formatPercentage(perc: number): string { + return `${(perc * 100).toFixed(2)}%`; +} diff --git a/src/mb_guess_language/index.tsx b/src/mb_guess_language/index.tsx new file mode 100644 index 000000000..afa9b7d91 --- /dev/null +++ b/src/mb_guess_language/index.tsx @@ -0,0 +1,143 @@ +import pThrottle from 'p-throttle'; + +import type { ReleaseEditorMedium } from '@lib/MB/types'; +import { ConsoleSink } from '@lib/logging/consoleSink'; +import { LogLevel } from '@lib/logging/levels'; +import { LOGGER } from '@lib/logging/logger'; +import { assertDefined } from '@lib/util/assert'; +import { logFailure, retryTimes } from '@lib/util/async'; +import { qs } from '@lib/util/dom'; + +import { detectLanguage } from './libretranslate'; +import { detectScript } from './script'; + +import DEBUG_MODE from 'consts:debug-mode'; +import USERSCRIPT_ID from 'consts:userscript-id'; + +async function expandMedium(medium: ReleaseEditorMedium): Promise { + // Already loaded. + if (medium.loaded()) { + return; + } + + // Not yet loading: Release with > 3 mediums, expand them. + if (!medium.loading()) { + medium.loadTracks(); + } + + // Wait until medium has finished loading. Need to poll. Continuously poll + // every 250ms, time out after 5s. + return retryTimes((): void => { + if (!medium.loaded()) throw new Error('Medium did not load'); + }, 20, 250); +} + +async function _getTrackTitlesFromMedium(medium: ReleaseEditorMedium): Promise { + await expandMedium(medium); + return medium.tracks().map((track) => track.name()); +} + +// Load at most 4 mediums each second. +const getTrackTitlesFromMedium = pThrottle({ + limit: 4, + interval: 1000, +})(_getTrackTitlesFromMedium); + +async function getTrackTitles(): Promise { + const mediums = window.MB.releaseEditor?.rootField.release().mediums() ?? []; + + const trackTitlesPerMedium = await Promise.all(mediums.map((medium) => getTrackTitlesFromMedium(medium))); + const trackTitles = trackTitlesPerMedium.flat(); + + if (trackTitles.length === 0) { + throw new Error('No tracklist to guess from'); + } + + return trackTitles; +} + +async function getTitles(): Promise { + const trackTitles = await getTrackTitles(); + + const releaseTitle = window.MB.releaseEditor?.rootField.release().name(); + assertDefined(releaseTitle, 'Release title is undefined?'); + return [ + releaseTitle, + ...trackTitles, + ]; +} + +async function doGuess(): Promise { + const titles = await getTitles(); + + try { + await guessLanguage(titles); + } catch (err) { + LOGGER.error('Failed to guess language', err); + } + + guessScript(titles); +} + +function selectOption(element: HTMLSelectElement, label: string): void { + const idx = [...element.options] + .findIndex((option) => option.text.trim() === label); + if (idx < 0) { + throw new Error(`Label ${label} not found in selection dropdown list`); + } + + element.selectedIndex = idx; + element.dispatchEvent(new Event('change')); +} + +async function guessLanguage(titles: string[]): Promise { + const text = titles.join('. '); + const language = await detectLanguage(text); + selectOption(qs('select#language'), language); +} + +function guessScript(titles: string[]): void { + // Remove spaces, they're just filler and lead to poorer matches. + const text = titles.join('').replaceAll(/\s+/g, ''); + const script = detectScript(text); + if (!script) { + LOGGER.error('Could not determine script'); + return; + } + + selectOption(qs('select#script'), script === 'Han' ? 'Han (Hanzi, Kanji, Hanja)' : script); +} + +function addButton(): void { + const btn = as HTMLButtonElement; + const loadingSpan = ; + + qs('table.row-form > tbody').append( + + + {btn} + {loadingSpan} + + ); +} + + +LOGGER.configure({ + logLevel: DEBUG_MODE ? LogLevel.DEBUG : LogLevel.INFO, +}); +LOGGER.addSink(new ConsoleSink(USERSCRIPT_ID)); +addButton(); diff --git a/src/mb_guess_language/libretranslate.ts b/src/mb_guess_language/libretranslate.ts new file mode 100644 index 000000000..f7866650b --- /dev/null +++ b/src/mb_guess_language/libretranslate.ts @@ -0,0 +1,92 @@ +import { LOGGER } from '@lib/logging/logger'; + +interface SuccessResult { + language: LanguageCode; + confidence: number; +} + +interface ErrorResult { + error: string; +} + +type Result = SuccessResult[] | ErrorResult; + +const LANGUAGE_MAPPINGS = { + en: 'English', + ar: 'Arabic', + az: 'Azerbaijani', + zh: 'Chinese', + cs: 'Czech', + da: 'Danish', + nl: 'Dutch', + eo: 'Esperanto', + fi: 'Finnish', + fr: 'French', + de: 'German', + el: 'Greek', + he: 'Hebrew', + hi: 'Hindi', + hu: 'Hungarian', + id: 'Indonesian', + ga: 'Irish', + it: 'Italian', + ja: 'Japanese', + ko: 'Korean', + fa: 'Persian', + pl: 'Polish', + pt: 'Portuguese', + ru: 'Russian', + sk: 'Slovak', + es: 'Spanish', + sv: 'Swedish', + tr: 'Turkish', + uk: 'Ukrainian', + vi: 'Vietnamese', +}; + +type LanguageCode = keyof typeof LANGUAGE_MAPPINGS; + +// In the order in which we'll try them. +// Generally: The ones with the least rate limited is tried first. It just so +// happens that those also support the least languages, so we try the ones with +// more rate limiting in case of failure. +const API_BASES = [ + 'https://translate.argosopentech.com', // Seemingly unlimited, but limited language support + // 'https://libretranslate.com', // 30 per minute, larger language support, but we need an API key to access it. + 'https://libretranslate.de', // 15 per minute, largest language support of all tested. +]; + +export async function detectLanguage(text: string, confidenceThreshold = 0.75): Promise { + for (const apiBase of API_BASES) { + try { + const result = await doRequest(apiBase, text); + const reliableResult = result.find((res) => (res.confidence / 100) >= confidenceThreshold); + if (reliableResult) { + LOGGER.info(`Identified as ${reliableResult.language} with confidence ${reliableResult.confidence}%`); + return LANGUAGE_MAPPINGS[reliableResult.language]; + } + LOGGER.debug(JSON.stringify(result)); + } catch (err) { + LOGGER.error(`Failed to detect language of text using ${apiBase}`, err); + } + } + + throw new Error('Could not detect language reliably'); +} + +async function doRequest(apiBase: string, text: string): Promise { + const resp = await fetch(`${apiBase}/detect`, { + method: 'post', + headers: { + accept: 'application/json', + }, + body: new URLSearchParams({ q: text }), + }); + + const respContent = await resp.json() as Result; + if ('error' in respContent) { + throw new Error(respContent.error); + } + + return respContent; +} diff --git a/src/mb_guess_language/meta.ts b/src/mb_guess_language/meta.ts new file mode 100644 index 000000000..ebcabe357 --- /dev/null +++ b/src/mb_guess_language/meta.ts @@ -0,0 +1,16 @@ +import type { UserscriptMetadata } from '@lib/util/metadata'; +import { transformMBMatchURL } from '@lib/util/metadata'; + +const metadata: UserscriptMetadata = { + name: 'MB: Guess language and script', + description: 'Guess language and script from release tracklist', + 'run-at': 'document-end', + match: [ + 'release/*/add', + 'release/*/add?*', + 'release/*/edit', + 'release/*/edit?*', + ].map((path) => transformMBMatchURL(path)), +}; + +export default metadata; diff --git a/src/mb_guess_language/script.ts b/src/mb_guess_language/script.ts new file mode 100644 index 000000000..08fce4bb6 --- /dev/null +++ b/src/mb_guess_language/script.ts @@ -0,0 +1,59 @@ +import { LOGGER } from '@lib/logging/logger'; +import { formatPercentage } from '@lib/util/format'; + +// Only the ones that are "Frequently used" according to MB. +const REGEXES = { + Arabic: /\p{Script=Arabic}/u, + Cyrillic: /\p{Script=Cyrillic}/u, + Greek: /\p{Script=Greek}/u, + // We cannot distinguish between simplified and traditional. There are + // implementations out there, but they list each of the traditional/simplified + // characters, which is very bloated. + Han: /\p{Script=Han}/u, + Hebrew: /\p{Script=Hebrew}/u, + // There's a separate script in MB for Katakana, but it's not always applicable. + Japanese: /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u, + Korean: /[\p{Script=Han}\p{Script=Hangul}]/u, + Thai: /\p{Script=Thai}/u, + Latin: /\p{Script=Latin}/u, +}; + +type ScriptName = keyof typeof REGEXES; + +function countMatchingCharacters(text: string, regexp: RegExp): number { + return text.match(new RegExp(regexp, 'g'))?.length ?? 0; +} + +function selectBestMatch(scriptToCount: Map): [ScriptName, number] { + const counts = [...scriptToCount.entries()].sort(([, c1], [, c2]) => c2 - c1); + return counts[0]; +} + +export function detectScript(text: string, confidenceThreshold = 0.75): ScriptName | undefined { + const scriptToCount = new Map( + (Object.entries(REGEXES) as Array<[ScriptName, RegExp]>) + .map(([script, regex]): [ScriptName, number] => [script, countMatchingCharacters(text, regex)])); + + // Save and remove Latin from the results, to prefer non-Latin over Latin + // in mixed tracklists. + const latinCount = scriptToCount.get('Latin')!; + const latinConfidence = latinCount / text.length; + scriptToCount.delete('Latin'); + + // Prefer non-Latin if it makes up at least 15% of the text (arbitrary threshold) + // and together with Latin leads to a good enough match. + // See https://musicbrainz.org/doc/Style/Release#Language_and_script + const bestMatch = selectBestMatch(scriptToCount); + const bestMatchConfidence = bestMatch[1] / text.length; + if (bestMatchConfidence >= 0.15 && bestMatchConfidence + latinConfidence >= confidenceThreshold) { + LOGGER.info(`Identified as ${bestMatch[0]} with confidence ${formatPercentage(bestMatchConfidence + latinConfidence)}, of which ${formatPercentage(latinConfidence)} Latin`); + return bestMatch[0]; + } + + if (latinConfidence > 0.75) { + LOGGER.info(`Identified as Latin with confidence ${formatPercentage(latinConfidence)}`); + return 'Latin'; + } + + return undefined; +} diff --git a/src/mb_guess_language/tsconfig.json b/src/mb_guess_language/tsconfig.json new file mode 100644 index 000000000..366aa6e4c --- /dev/null +++ b/src/mb_guess_language/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "../../configs/tsconfig.base-web.json", + "include": ["**/*"], + "references": [ + { "path": "../lib/" } + ], + "compilerOptions": { + "types": ["nativejsx/types/jsx"] + } +} diff --git a/tsconfig.json b/tsconfig.json index f9f09528b..c6d955626 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -31,6 +31,7 @@ { "path": "./src/lib" }, { "path": "./src/mb_caa_dimensions" }, { "path": "./src/mb_enhanced_cover_art_uploads" }, + { "path": "./src/mb_guess_language" }, { "path": "./src/mb_multi_external_links" }, { "path": "./tests/unit/build" }, { "path": "./tests/unit/lib" }, From 96acc45d2060241986c336faa227c09beb1a3d61 Mon Sep 17 00:00:00 2001 From: ROpdebee <15186467+ROpdebee@users.noreply.github.com> Date: Mon, 20 Jun 2022 20:48:36 +0200 Subject: [PATCH 2/2] fix(guess lang): fix match patterns for release/add --- src/mb_guess_language/meta.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mb_guess_language/meta.ts b/src/mb_guess_language/meta.ts index ebcabe357..7bc7add5c 100644 --- a/src/mb_guess_language/meta.ts +++ b/src/mb_guess_language/meta.ts @@ -6,8 +6,8 @@ const metadata: UserscriptMetadata = { description: 'Guess language and script from release tracklist', 'run-at': 'document-end', match: [ - 'release/*/add', - 'release/*/add?*', + 'release/add', + 'release/add?*', 'release/*/edit', 'release/*/edit?*', ].map((path) => transformMBMatchURL(path)),