Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/lib/MB/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,27 @@ export interface ExternalLinks {
};
}

/**
 * Subset of a release editor medium that this project relies on.
 *
 * All members are accessor-style functions rather than plain properties.
 * NOTE(review): presumably these mirror Knockout observables on the
 * MusicBrainz release editor — confirm against the MB sources.
 */
export interface ReleaseEditorMedium {
/** Presumably `true` once the medium's tracks are available. */
loaded(): boolean;
/** Presumably `true` while a track load is in progress. */
loading(): boolean;
/** Requests the medium's tracks; returns nothing, so completion must be observed via `loaded()`. */
loadTracks(): void;
/** The medium's tracks; each track exposes its title through `name()`. */
tracks(): Array<{
name(): string;
}>;
}

/**
 * Subset of the release editor's field tree that this project relies on.
 */
export interface ReleaseEditorFields {
/** Accessor for the release being edited. */
release(): {
/** Title of the release. */
name(): string;
/** Mediums of the release. */
mediums(): ReleaseEditorMedium[];
};
}

/**
 * Subset of the release editor object that this project relies on.
 */
export interface ReleaseEditor {
/** External links editor state; `current` holds the `ExternalLinks` instance. */
externalLinks: {
current: ExternalLinks;
};
/** Entry point to the release's fields (title, mediums). */
rootField: ReleaseEditorFields;
}

declare global {
Expand Down
4 changes: 4 additions & 0 deletions src/lib/util/format.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,7 @@ export function formatFileSize(size: number): string {
const truncatedSize = Number((size / Math.pow(1024, order)).toFixed(2));
return `${truncatedSize} ${suffixes[order]}`;
}

/**
 * Formats a fraction as a percentage with exactly two decimal places.
 *
 * @param perc Fraction to format, where `1` corresponds to 100%.
 * @returns The scaled value followed by a `%` sign, e.g. `0.5` gives `50.00%`.
 */
export function formatPercentage(perc: number): string {
    const scaled = perc * 100;
    return scaled.toFixed(2) + '%';
}
143 changes: 143 additions & 0 deletions src/mb_guess_language/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import pThrottle from 'p-throttle';

import type { ReleaseEditorMedium } from '@lib/MB/types';
import { ConsoleSink } from '@lib/logging/consoleSink';
import { LogLevel } from '@lib/logging/levels';
import { LOGGER } from '@lib/logging/logger';
import { assertDefined } from '@lib/util/assert';
import { logFailure, retryTimes } from '@lib/util/async';
import { qs } from '@lib/util/dom';

import { detectLanguage } from './libretranslate';
import { detectScript } from './script';

import DEBUG_MODE from 'consts:debug-mode';
import USERSCRIPT_ID from 'consts:userscript-id';

/**
 * Ensures the tracks of a medium are loaded, triggering a load if necessary.
 *
 * @param medium Medium whose tracks should become available.
 * @returns Whatever `retryTimes` resolves or rejects with; the polled check
 *     throws `Medium did not load` for as long as the medium is not loaded.
 */
async function expandMedium(medium: ReleaseEditorMedium): Promise<void> {
    // Nothing to do when the tracks are already there.
    if (medium.loaded()) return;

    // Kick off loading unless a load is already in flight. The original note
    // says this happens for releases with more than 3 mediums.
    if (!medium.loading()) medium.loadTracks();

    // There is no completion callback, so we have to poll: 20 attempts with
    // a 250ms wait, i.e. roughly a 5s timeout.
    const assertLoaded = (): void => {
        if (!medium.loaded()) throw new Error('Medium did not load');
    };
    return retryTimes(assertLoaded, 20, 250);
}

/**
 * Loads a medium if needed and collects the titles of its tracks.
 *
 * Unthrottled variant; see `getTrackTitlesFromMedium` for the rate-limited one.
 *
 * @param medium Medium to read the track titles from.
 * @returns The track titles in tracklist order.
 */
async function _getTrackTitlesFromMedium(medium: ReleaseEditorMedium): Promise<string[]> {
    await expandMedium(medium);

    const titles: string[] = [];
    for (const track of medium.tracks()) {
        titles.push(track.name());
    }
    return titles;
}

// Rate limiter for medium loading: at most 4 mediums per 1000ms.
const throttleMediumLoads = pThrottle({ limit: 4, interval: 1000 });

// Throttled variant of `_getTrackTitlesFromMedium`.
const getTrackTitlesFromMedium = throttleMediumLoads(_getTrackTitlesFromMedium);

/**
 * Collects the track titles of every medium of the release being edited.
 *
 * @returns All track titles, medium by medium.
 * @throws Error when no track titles could be collected, including when the
 *     release editor is unavailable.
 */
async function getTrackTitles(): Promise<string[]> {
    // A missing release editor is treated like a release without mediums.
    const mediums = window.MB.releaseEditor?.rootField.release().mediums() ?? [];

    const pendingTitles = mediums.map((medium) => getTrackTitlesFromMedium(medium));
    const allTitles = (await Promise.all(pendingTitles)).flat();

    if (allTitles.length > 0) {
        return allTitles;
    }

    throw new Error('No tracklist to guess from');
}

/**
 * Collects all titles to base the guesses on.
 *
 * @returns The release title followed by all track titles.
 * @throws When there is no tracklist, or when the release title is undefined.
 */
async function getTitles(): Promise<string[]> {
    // Tracks are collected first, so a missing tracklist is reported before
    // a missing release title.
    const tracklistTitles = await getTrackTitles();

    const releaseTitle = window.MB.releaseEditor?.rootField.release().name();
    assertDefined(releaseTitle, 'Release title is undefined?');

    return [releaseTitle].concat(tracklistTitles);
}

/**
 * Guesses and selects both the language and the script of the release.
 *
 * A failure to guess the language is logged and does not prevent the script
 * from being guessed. Failures to collect the titles or to select the script
 * propagate to the caller.
 */
async function doGuess(): Promise<void> {
    const titles = await getTitles();

    // Log-and-continue: the script guess is independent of the language guess.
    await guessLanguage(titles).catch((err: unknown) => {
        LOGGER.error('Failed to guess language', err);
    });

    guessScript(titles);
}

/**
 * Selects the option of a dropdown whose trimmed text equals `label`, then
 * dispatches a `change` event on the dropdown.
 *
 * @param element Dropdown to select an option in.
 * @param label Exact option text to look for, compared after trimming the
 *     option's text.
 * @throws Error when no option carries the given label.
 */
function selectOption(element: HTMLSelectElement, label: string): void {
    const optionLabels = Array.from(element.options, (option) => option.text.trim());
    const idx = optionLabels.indexOf(label);
    if (idx === -1) {
        throw new Error(`Label ${label} not found in selection dropdown list`);
    }

    element.selectedIndex = idx;
    // Setting selectedIndex programmatically fires no event, so notify
    // listeners explicitly.
    element.dispatchEvent(new Event('change'));
}

/**
 * Detects the language of the given titles and selects it in the
 * `select#language` dropdown.
 *
 * @param titles Titles to base the guess on.
 * @throws When detection fails or the detected language is not an option.
 */
async function guessLanguage(titles: string[]): Promise<void> {
    // Titles are joined into one period-separated text for a single detection call.
    const languageLabel = await detectLanguage(titles.join('. '));
    const dropdown = qs<HTMLSelectElement>('select#language');
    selectOption(dropdown, languageLabel);
}

/**
 * Detects the script of the given titles and selects it in the
 * `select#script` dropdown. Logs an error and leaves the dropdown untouched
 * when no script could be determined.
 *
 * @param titles Titles to base the guess on.
 * @throws When the detected script is not an option of the dropdown.
 */
function guessScript(titles: string[]): void {
    // Whitespace is just filler and leads to poorer matches, so strip it.
    const condensed = titles.join('').replace(/\s+/g, '');
    const script = detectScript(condensed);
    if (!script) {
        LOGGER.error('Could not determine script');
        return;
    }

    // The dropdown labels Han with its regional names; other scripts use the
    // detected name as-is.
    let optionLabel: string = script;
    if (script === 'Han') {
        optionLabel = 'Han (Hanzi, Kanji, Hanja)';
    }

    selectOption(qs<HTMLSelectElement>('select#script'), optionLabel);
}

/**
 * Appends a row with a "Guess language and script" button and a loading
 * indicator to the body of `table.row-form`.
 *
 * While a guess is running the button is disabled and the indicator is shown;
 * both are reset once the guess settles, whether it succeeded or failed.
 */
function addButton(): void {
    const loadingSpan = <span className='loading-message' style={{ display: 'none', marginLeft: '10px' }}/>;

    // Toggles the indicator and the button together. `btn` is declared below,
    // which is fine since this only runs from the click handler.
    const setBusy = (busy: boolean): void => {
        loadingSpan.style.display = busy ? '' : 'none';
        btn.disabled = busy;
    };

    const btn = <button
        type='button'
        onClick={(evt): void => {
            evt.preventDefault();
            setBusy(true);

            logFailure(
                doGuess()
                    .finally(() => {
                        setBusy(false);
                    }));
        }}
    >Guess language and script</button> as HTMLButtonElement;

    const row = <tr>
        <td />
        <td colSpan={2}>
            {btn}
            {loadingSpan}
        </td>
    </tr>;

    qs('table.row-form > tbody').append(row);
}


// Script entry point. The logger is set up first so that anything logged
// while adding the button or handling clicks reaches the console sink.
LOGGER.configure({
// Debug builds log at DEBUG level, all other builds at INFO level.
logLevel: DEBUG_MODE ? LogLevel.DEBUG : LogLevel.INFO,
});
LOGGER.addSink(new ConsoleSink(USERSCRIPT_ID));
addButton();
92 changes: 92 additions & 0 deletions src/mb_guess_language/libretranslate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import { LOGGER } from '@lib/logging/logger';

/** One candidate language in a successful detection response. */
interface SuccessResult {
/** Code of the candidate language. */
language: LanguageCode;
/** Confidence of the candidate; consumers divide it by 100 to obtain a fraction. */
confidence: number;
}

/** Response body describing a failed detection request. */
interface ErrorResult {
/** Error message reported by the API. */
error: string;
}

/** Response body of a detection request: either a list of candidates or an error. */
type Result = SuccessResult[] | ErrorResult;

// Maps the language codes used in detection results to language names.
// `detectLanguage` returns the name, so names must match whatever the caller
// looks up with them.
// NOTE(review): presumably the names are the labels of the MusicBrainz
// language dropdown — confirm when adding entries.
const LANGUAGE_MAPPINGS = {
en: 'English',
ar: 'Arabic',
az: 'Azerbaijani',
zh: 'Chinese',
cs: 'Czech',
da: 'Danish',
nl: 'Dutch',
eo: 'Esperanto',
fi: 'Finnish',
fr: 'French',
de: 'German',
el: 'Greek',
he: 'Hebrew',
hi: 'Hindi',
hu: 'Hungarian',
id: 'Indonesian',
ga: 'Irish',
it: 'Italian',
ja: 'Japanese',
ko: 'Korean',
fa: 'Persian',
pl: 'Polish',
pt: 'Portuguese',
ru: 'Russian',
sk: 'Slovak',
es: 'Spanish',
sv: 'Swedish',
tr: 'Turkish',
uk: 'Ukrainian',
vi: 'Vietnamese',
};

// Language codes for which a name is known.
type LanguageCode = keyof typeof LANGUAGE_MAPPINGS;

// Base URLs of the detection APIs, in the order in which we'll try them.
// Generally, the least rate-limited instance is tried first. It just so
// happens that those instances also support the fewest languages, so the more
// heavily rate-limited instances are only tried when an earlier one fails.
const API_BASES = [
'https://translate.argosopentech.com', // Seemingly unlimited, but limited language support
// 'https://libretranslate.com', // 30 per minute, larger language support, but we need an API key to access it.
'https://libretranslate.de', // 15 per minute, largest language support of all tested.
];

/**
 * Detects the language of a text by querying each API in `API_BASES` in turn
 * until one of them yields a usable answer.
 *
 * An answer is usable when it contains a candidate whose confidence reaches
 * the threshold and whose language code has a name in `LANGUAGE_MAPPINGS`.
 * Request failures, unreliable results, and unmapped language codes are
 * logged, after which the next API is tried.
 *
 * @param text Text to detect the language of.
 * @param confidenceThreshold Minimum confidence as a fraction between 0 and 1.
 * @returns The name of the detected language, as listed in `LANGUAGE_MAPPINGS`.
 * @throws Error when none of the APIs produced a usable answer.
 */
export async function detectLanguage(text: string, confidenceThreshold = 0.75): Promise<string> {
    for (const apiBase of API_BASES) {
        try {
            const result = await doRequest(apiBase, text);
            // Reported confidences are divided by 100 to compare them to the
            // fractional threshold.
            const reliableResult = result.find((res) => (res.confidence / 100) >= confidenceThreshold);
            if (!reliableResult) {
                LOGGER.debug(JSON.stringify(result));
                continue;
            }

            // The response is not validated, so an API may report a code we
            // have no name for. Returning `undefined` here would only fail
            // later with a confusing message, so try the next API instead.
            if (!Object.prototype.hasOwnProperty.call(LANGUAGE_MAPPINGS, reliableResult.language)) {
                LOGGER.error(`${apiBase} reported language code ${reliableResult.language}, for which no language name is known`);
                continue;
            }

            LOGGER.info(`Identified as ${reliableResult.language} with confidence ${reliableResult.confidence}%`);
            return LANGUAGE_MAPPINGS[reliableResult.language];
        } catch (err) {
            LOGGER.error(`Failed to detect language of text using ${apiBase}`, err);
        }
    }

    throw new Error('Could not detect language reliably');
}

/**
 * Sends a detection request for `text` to the `/detect` endpoint of an API.
 *
 * @param apiBase Base URL of the API, without trailing slash.
 * @param text Text to detect the language of; sent as the `q` parameter.
 * @returns The candidates reported by the API.
 * @throws Error carrying the API's message when the response is an error
 *     object. Network and JSON parsing failures propagate as-is.
 */
async function doRequest(apiBase: string, text: string): Promise<SuccessResult[]> {
    const endpoint = `${apiBase}/detect`;
    const formData = new URLSearchParams({ q: text });

    const resp = await fetch(endpoint, {
        method: 'post',
        headers: {
            accept: 'application/json',
        },
        body: formData,
    });

    // The body is trusted to match `Result`; it is not validated further.
    const payload = await resp.json() as Result;
    if (!('error' in payload)) {
        return payload;
    }

    throw new Error(payload.error);
}
16 changes: 16 additions & 0 deletions src/mb_guess_language/meta.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import type { UserscriptMetadata } from '@lib/util/metadata';
import { transformMBMatchURL } from '@lib/util/metadata';

// Userscript metadata for the "guess language and script" script.
const metadata: UserscriptMetadata = {
name: 'MB: Guess language and script',
description: 'Guess language and script from release tracklist',
'run-at': 'document-end',
// Release add and release edit pages, each with and without a query string.
// Each path is passed through `transformMBMatchURL` to obtain the match pattern.
match: [
'release/add',
'release/add?*',
'release/*/edit',
'release/*/edit?*',
].map((path) => transformMBMatchURL(path)),
};

export default metadata;
59 changes: 59 additions & 0 deletions src/mb_guess_language/script.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import { LOGGER } from '@lib/logging/logger';
import { formatPercentage } from '@lib/util/format';

// Single-character matchers per script name. Only the scripts that are
// "Frequently used" according to MB are listed.
// Entry order is significant: `detectScript` ranks the entries with a sort,
// so on equal counts the entry listed first wins (e.g. Han over Japanese and
// Korean for text consisting of Han characters only).
const REGEXES = {
Arabic: /\p{Script=Arabic}/u,
Cyrillic: /\p{Script=Cyrillic}/u,
Greek: /\p{Script=Greek}/u,
// We cannot distinguish between simplified and traditional. There are
// implementations out there, but they list each of the traditional/simplified
// characters, which is very bloated.
Han: /\p{Script=Han}/u,
Hebrew: /\p{Script=Hebrew}/u,
// There's a separate script in MB for Katakana, but it's not always applicable.
// Japanese counts Han, Hiragana, and Katakana characters.
Japanese: /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u,
// Korean counts Han and Hangul characters.
Korean: /[\p{Script=Han}\p{Script=Hangul}]/u,
Thai: /\p{Script=Thai}/u,
Latin: /\p{Script=Latin}/u,
};

// Names of the scripts that can be detected.
type ScriptName = keyof typeof REGEXES;

/**
 * Counts the non-overlapping matches of a regular expression in a text.
 *
 * @param text Text to search.
 * @param regexp Expression to count matches of. It is not modified; a global
 *     copy is used for counting.
 * @returns The number of matches, `0` when there are none.
 */
function countMatchingCharacters(text: string, regexp: RegExp): number {
    // Keep the original flags and only add `g`. Passing just 'g' to the
    // RegExp constructor would replace the flags, dropping `u`, without which
    // `\p{Script=...}` no longer is a Unicode property escape and nothing
    // would ever match.
    const flags = regexp.flags.includes('g') ? regexp.flags : `${regexp.flags}g`;
    return text.match(new RegExp(regexp.source, flags))?.length ?? 0;
}

/**
 * Picks the script with the highest count.
 *
 * @param scriptToCount Number of matching characters per script.
 * @returns The `[script, count]` pair with the highest count. On ties, the
 *     pair that comes first in the map's iteration order is returned, since
 *     the sort is stable.
 */
function selectBestMatch(scriptToCount: Map<ScriptName, number>): [ScriptName, number] {
    const byDescendingCount = (
        left: [ScriptName, number],
        right: [ScriptName, number],
    ): number => right[1] - left[1];

    const ranked = Array.from(scriptToCount.entries()).sort(byDescendingCount);
    return ranked[0];
}

/**
 * Detects the predominant script of a text.
 *
 * A non-Latin script is preferred over Latin in mixed texts: it is selected
 * when it covers at least 15% of the text and, together with Latin, reaches
 * the confidence threshold. Otherwise Latin is selected when it exceeds the
 * threshold by itself.
 *
 * @param text Text to detect the script of. Every character counts towards
 *     the total, so callers may want to strip filler such as whitespace.
 * @param confidenceThreshold Minimum fraction of the text, between 0 and 1,
 *     that must be covered for a script to be selected.
 * @returns The detected script, or `undefined` when no script is predominant
 *     enough or the text is empty.
 */
export function detectScript(text: string, confidenceThreshold = 0.75): ScriptName | undefined {
    // The regexes match whole code points, so the total has to be counted in
    // code points as well. `text.length` counts UTF-16 code units and would
    // count characters outside the BMP twice.
    const totalCharacters = [...text].length;
    if (totalCharacters === 0) {
        // Nothing to base a guess on; also avoids dividing by zero below.
        return undefined;
    }

    const scriptToCount = new Map(
        (Object.entries(REGEXES) as Array<[ScriptName, RegExp]>)
            .map(([script, regex]): [ScriptName, number] => [script, countMatchingCharacters(text, regex)]));

    // Save and remove Latin from the results, to prefer non-Latin over Latin
    // in mixed tracklists.
    const latinCount = scriptToCount.get('Latin') ?? 0;
    const latinConfidence = latinCount / totalCharacters;
    scriptToCount.delete('Latin');

    // Prefer non-Latin if it makes up at least 15% of the text (arbitrary threshold)
    // and together with Latin leads to a good enough match.
    // See https://musicbrainz.org/doc/Style/Release#Language_and_script
    const bestMatch = selectBestMatch(scriptToCount);
    const bestMatchConfidence = bestMatch[1] / totalCharacters;
    if (bestMatchConfidence >= 0.15 && bestMatchConfidence + latinConfidence >= confidenceThreshold) {
        LOGGER.info(`Identified as ${bestMatch[0]} with confidence ${formatPercentage(bestMatchConfidence + latinConfidence)}, of which ${formatPercentage(latinConfidence)} Latin`);
        return bestMatch[0];
    }

    // Honour the caller's threshold rather than a hard-coded 0.75. The
    // comparison stays strict, so results are unchanged for the default.
    if (latinConfidence > confidenceThreshold) {
        LOGGER.info(`Identified as Latin with confidence ${formatPercentage(latinConfidence)}`);
        return 'Latin';
    }

    return undefined;
}
10 changes: 10 additions & 0 deletions src/mb_guess_language/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"extends": "../../configs/tsconfig.base-web.json",
"include": ["**/*"],
"references": [
{ "path": "../lib/" }
],
"compilerOptions": {
"types": ["nativejsx/types/jsx"]
}
}
1 change: 1 addition & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
{ "path": "./src/lib" },
{ "path": "./src/mb_caa_dimensions" },
{ "path": "./src/mb_enhanced_cover_art_uploads" },
{ "path": "./src/mb_guess_language" },
{ "path": "./src/mb_multi_external_links" },
{ "path": "./tests/unit/build" },
{ "path": "./tests/unit/lib" },
Expand Down