From 64c56b1bc4d8da8c60f260d3657e65a7f351310b Mon Sep 17 00:00:00 2001 From: Borewit Date: Thu, 15 Aug 2024 15:20:33 +0200 Subject: [PATCH] Parse EBML (Matroska, webm) using async notification --- lib/ebml/EbmlIterator.ts | 96 +++++++++++------- lib/ebml/types.ts | 1 - lib/matroska/MatroskaDtd.ts | 12 +-- lib/matroska/MatroskaParser.ts | 176 +++++++++++++++++---------------- lib/matroska/types.ts | 14 +-- test/test-file-matroska.ts | 33 ++++--- 6 files changed, 185 insertions(+), 147 deletions(-) diff --git a/lib/ebml/EbmlIterator.ts b/lib/ebml/EbmlIterator.ts index 8b1fa368d..811279aea 100644 --- a/lib/ebml/EbmlIterator.ts +++ b/lib/ebml/EbmlIterator.ts @@ -14,10 +14,21 @@ export interface ILinkedElementType extends IElementType { readonly container?: { [id: number]: ILinkedElementType; }; } +export enum ParseAction { + ReadNext = 0, // Continue reading the next elements + IgnoreElement = 2, // Ignore (do not read) this element + SkipSiblings = 3, // Skip all remaining elements at the same level + TerminateParsing = 4, // Terminate the parsing process + SkipElement = 5 // Consider the element has read, assume position is at the next element +} + /** * @return true, to quit the parser */ -export type ElementListener = (dtdElement: ILinkedElementType, value: ValueType) => Promise; +export type IElementListener = { + startNext: (dtdElement: ILinkedElementType) => ParseAction, + elementValue: (dtdElement: ILinkedElementType, value: ValueType, offset: number) => Promise +} /** * Extensible Binary Meta Language (EBML) iterator @@ -34,7 +45,6 @@ export class EbmlIterator { private ebmlMaxIDLength = 4; private ebmlMaxSizeLength = 8; - private cancel = false; /** * @param {ITokenizer} tokenizer Input @@ -49,17 +59,17 @@ export class EbmlIterator { this.parserMap.set(DataType.float, e => this.readFloat(e)); } - public async iterate(dtdElement: IElementType, posDone: number, listener: ElementListener): Promise { - this.cancel = false; + public async iterate(dtdElement: IElementType, posDone: number, listener: IElementListener): Promise { return this.parseContainer(linkParents(dtdElement), posDone, listener); } - private async parseContainer(dtdElement: ILinkedElementType, posDone: number, listener: ElementListener): Promise { + private async parseContainer(dtdElement: ILinkedElementType, posDone: number, listener: IElementListener): Promise { const tree: ITree = {}; - while (this.tokenizer.position < posDone && !this.cancel) { + while (this.tokenizer.position < posDone) { let element: IHeader; + const elementPosition= this.tokenizer.position; try { - element = await this.readElement(); + element = await this.readElement(); } catch (error) { if (error instanceof EndOfStreamError) { break; @@ -68,35 +78,53 @@ export class EbmlIterator { } const child = (dtdElement.container as { [id: number]: ILinkedElementType; })[element.id]; if (child) { - if (child.ignore) { - debug(`Ignore element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container}`); - await this.tokenizer.ignore(element.len); - } else { - if (element.id === 0x1F43B675) { - // Hack to ignore remaining segment, when cluster element received - // await this.tokenizer.ignore(posDone - this.tokenizer.position); - // break; - } - debug(`Reading element: name=${getElementPath(child)}{id=0x${element.id}, container=${!!child.container}}`); - if (child.container) { - const res = await this.parseContainer(child, element.len >= 0 ? this.tokenizer.position + element.len : -1, listener); - if (child.multiple) { - if (!tree[child.name]) { - tree[child.name] = []; + const action = listener.startNext(child); + switch (action) { + case ParseAction.ReadNext: { + if (element.id === 0x1F43B675) { + // Hack to ignore remaining segment, when cluster element received + // await this.tokenizer.ignore(posDone - this.tokenizer.position); + // break; + } + debug(`Read element: name=${getElementPath(child)}{id=0x${element.id.toString(16)}, container=${!!child.container}} at position=${elementPosition}`); + if (child.container) { + const res = await this.parseContainer(child, element.len >= 0 ? this.tokenizer.position + element.len : -1, listener); + if (child.multiple) { + if (!tree[child.name]) { + tree[child.name] = []; + } + (tree[child.name] as ITree[]).push(res); + } else { + tree[child.name] = res; } - (tree[child.name] as ITree[]).push(res); + await listener.elementValue(child, res, elementPosition); } else { - tree[child.name] = res; - } - this.cancel = await listener(child, res); - } else { - const parser = this.parserMap.get(child.value as DataType); - if (typeof parser === 'function') { - const value = await parser(element); - tree[child.name] = value; - this.cancel = await listener(child, value); + const parser = this.parserMap.get(child.value as DataType); + if (typeof parser === 'function') { + const value = await parser(element); + tree[child.name] = value; + await listener.elementValue(child, value, elementPosition); + } } - } + } break; + + case ParseAction.SkipElement: + debug(`Go to next element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container} at position=${elementPosition}`); + break; + + case ParseAction.IgnoreElement: + debug(`Ignore element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container} at position=${elementPosition}`); + await this.tokenizer.ignore(element.len); + break; + + case ParseAction.SkipSiblings: + debug(`Ignore remaining container, at: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container} at position=${elementPosition}`); + await this.tokenizer.ignore(posDone - this.tokenizer.position); + break; + + case ParseAction.TerminateParsing: + debug(`Terminate parsing at element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container} at position=${elementPosition}`); + return tree; } } else { switch (element.id) { @@ -105,7 +133,7 @@ export class EbmlIterator { await this.tokenizer.ignore(element.len); break; default: - debug(`parseEbml: parent=${getElementPath(dtdElement)}, unknown child: id=${element.id.toString(16)}`); + debug(`parseEbml: parent=${getElementPath(dtdElement)}, unknown child: id=${element.id.toString(16)} at position=${elementPosition}`); this.padding += element.len; await this.tokenizer.ignore(element.len); } diff --git a/lib/ebml/types.ts b/lib/ebml/types.ts index a97113f69..810fc26e6 100644 --- a/lib/ebml/types.ts +++ b/lib/ebml/types.ts @@ -24,7 +24,6 @@ export interface IElementType { readonly value?: DataType; readonly container?: { [id: number]: IElementType; }; readonly multiple?: boolean; - readonly ignore?: boolean; } export interface IEbmlDoc { diff --git a/lib/matroska/MatroskaDtd.ts b/lib/matroska/MatroskaDtd.ts index e9048c99d..3bd32625e 100644 --- a/lib/matroska/MatroskaDtd.ts +++ b/lib/matroska/MatroskaDtd.ts @@ -27,15 +27,16 @@ export const matroskaDtd: IElementType = { name: 'segment', container: { - // Meta Seek Information + // Meta Seek Information (also known as MetaSeek) 0x114d9b74: { name: 'seekHead', container: { 0x4dbb: { name: 'seek', + multiple: true, container: { - 0x53ab: {name: 'seekId', value: DataType.binary}, - 0x53ac: {name: 'seekPosition', value: DataType.uint} + 0x53ab: {name: 'id', value: DataType.binary}, + 0x53ac: {name: 'position', value: DataType.uint} } } } @@ -69,8 +70,8 @@ export const matroskaDtd: IElementType = { 0x58d7: {name: 'silentTracks ', multiple: true}, 0xa7: {name: 'position', value: DataType.uid}, 0xab: {name: 'prevSize', value: DataType.uid}, - 0xa0: {name: 'blockGroup', ignore: true}, - 0xa3: {name: 'simpleBlock', ignore: true} + 0xa0: {name: 'blockGroup'}, + 0xa3: {name: 'simpleBlock'} } }, @@ -174,7 +175,6 @@ export const matroskaDtd: IElementType = { // Cueing Data 0x1c53bb6b: { name: 'cues', - ignore: true, container: { 0xbb: { name: 'cuePoint', diff --git a/lib/matroska/MatroskaParser.ts b/lib/matroska/MatroskaParser.ts index 8b304d3fa..6a2d30907 100644 --- a/lib/matroska/MatroskaParser.ts +++ b/lib/matroska/MatroskaParser.ts @@ -4,11 +4,11 @@ import type { ITokenizer } from 'strtok3'; import type { INativeMetadataCollector } from '../common/MetadataCollector.js'; import { BasicParser } from '../common/BasicParser.js'; import { matroskaDtd } from './MatroskaDtd.js'; -import { IAttachments, type IMatroskaDoc, IMatroskaSegment, ISegmentInformation, ITags, ITrackElement, type ITrackEntry, TargetType, TrackType } from './types.js'; +import { type IAttachments, type ISeekHead, type ISegmentInformation, type ITags, type ITrackElement, type ITrackEntry, TargetType, TrackType } from './types.js'; import type { AnyTagValue, IOptions, ITrackInfo } from '../type.js'; import type { ITokenParser } from '../ParserFactory.js'; -import { EbmlIterator } from '../ebml/EbmlIterator.js'; +import { EbmlIterator, ParseAction } from '../ebml/EbmlIterator.js'; const debug = initDebug('music-metadata:parser:matroska'); @@ -38,95 +38,103 @@ export class MatroskaParser extends BasicParser { const matroskaIterator = new EbmlIterator(this.tokenizer); debug('Initializing DTD end MatroskaIterator'); - const matroska = await matroskaIterator.iterate(matroskaDtd, containerSize, async (element, value) => { - debug(`Received: name=${element.name}, value=${value}`); - switch (element.id) { - case 0x4282: // docType - this.metadata.setFormat('container', `EBML/${value}`); - break; - - case 0x1549a966: {// info - const info = value as ISegmentInformation; - const timecodeScale = info.timecodeScale ? info.timecodeScale : 1000000; - if (typeof info.duration === 'number') { - const duration = info.duration * timecodeScale / 1000000000; - await this.addTag('segment:title', info.title); - this.metadata.setFormat('duration', Number(duration)); - } + await matroskaIterator.iterate(matroskaDtd, containerSize, { + startNext: (element) => { + switch (element.id) { + // case 0x1f43b675: // cluster + case 0x1c53bb6b: // Cueing Data + debug(`Skip element: name=${element.name}, id=0x${element.id.toString(16)}`); + return ParseAction.IgnoreElement; + case 0x1f43b675: // cluster + return ParseAction.IgnoreElement; + default: + return ParseAction.ReadNext; } - break; - - case 0x1654ae6b: { // tracks - const audioTracks = value as ITrackElement; - if (audioTracks?.entries) { - audioTracks.entries.forEach(entry => { - const stream: ITrackInfo = { - codecName: entry.codecID.replace('A_', '').replace('V_', ''), - codecSettings: entry.codecSettings, - flagDefault: entry.flagDefault, - flagLacing: entry.flagLacing, - flagEnabled: entry.flagEnabled, - language: entry.language, - name: entry.name, - type: entry.trackType, - audio: entry.audio, - video: entry.video - }; - this.metadata.addStreamInfo(stream); - }); - - const audioTrack = audioTracks.entries - .filter(entry => entry.trackType === TrackType.audio) - .reduce((acc: ITrackEntry | null, cur: ITrackEntry): ITrackEntry => { - if (!acc) return cur; - if (cur.flagDefault && !acc.flagDefault) return cur; - if (cur.trackNumber < acc.trackNumber) return cur; - return acc; - }, null); - - if (audioTrack) { - this.metadata.setFormat('codec', audioTrack.codecID.replace('A_', '')); - this.metadata.setFormat('sampleRate', audioTrack.audio.samplingFrequency); - this.metadata.setFormat('numberOfChannels', audioTrack.audio.channels); + }, + elementValue: async (element, value, offset) => { + debug(`Received: name=${element.name}, value=${value}`); + switch (element.id) { + case 0x4282: // docType + this.metadata.setFormat('container', `EBML/${value}`); + break; + + case 0x1549a966: { // Info (Segment Information) + const info = value as ISegmentInformation; + const timecodeScale = info.timecodeScale ? info.timecodeScale : 1000000; + if (typeof info.duration === 'number') { + const duration = info.duration * timecodeScale / 1000000000; + await this.addTag('segment:title', info.title); + this.metadata.setFormat('duration', Number(duration)); } } - } - break; - - case 0x1254c367: { // 'tags' - const tags = value as unknown as ITags; - await Promise.all(tags.tag.map(async tag => { - const target = tag.target; - const targetType = target?.targetTypeValue ? TargetType[target.targetTypeValue] : (target?.targetType ? target.targetType : 'track'); - await Promise.all(tag.simpleTags.map(async simpleTag => { - const value = simpleTag.string ? simpleTag.string : simpleTag.binary; - await this.addTag(`${targetType}:${simpleTag.name}`, value); + break; + + case 0x1654ae6b: { // tracks + const audioTracks = value as ITrackElement; + if (audioTracks?.entries) { + audioTracks.entries.forEach(entry => { + const stream: ITrackInfo = { + codecName: entry.codecID.replace('A_', '').replace('V_', ''), + codecSettings: entry.codecSettings, + flagDefault: entry.flagDefault, + flagLacing: entry.flagLacing, + flagEnabled: entry.flagEnabled, + language: entry.language, + name: entry.name, + type: entry.trackType, + audio: entry.audio, + video: entry.video + }; + this.metadata.addStreamInfo(stream); + }); + + const audioTrack = audioTracks.entries + .filter(entry => entry.trackType === TrackType.audio) + .reduce((acc: ITrackEntry | null, cur: ITrackEntry): ITrackEntry => { + if (!acc) return cur; + if (cur.flagDefault && !acc.flagDefault) return cur; + if (cur.trackNumber < acc.trackNumber) return cur; + return acc; + }, null); + + if (audioTrack) { + this.metadata.setFormat('codec', audioTrack.codecID.replace('A_', '')); + this.metadata.setFormat('sampleRate', audioTrack.audio.samplingFrequency); + this.metadata.setFormat('numberOfChannels', audioTrack.audio.channels); + } + } + } + break; + + case 0x1254c367: { // tags + const tags = value as unknown as ITags; + await Promise.all(tags.tag.map(async tag => { + const target = tag.target; + const targetType = target?.targetTypeValue ? TargetType[target.targetTypeValue] : (target?.targetType ? target.targetType : 'track'); + await Promise.all(tag.simpleTags.map(async simpleTag => { + const value = simpleTag.string ? simpleTag.string : simpleTag.binary; + await this.addTag(`${targetType}:${simpleTag.name}`, value); + })); })); - })); - } - break; - - case 0x1941a469: { //attachments - const attachments = value as unknown as IAttachments; - await Promise.all(attachments.attachedFiles - .filter(file => file.mimeType.startsWith('image/')) - .map(file => this.addTag('picture', { - data: file.data, - format: file.mimeType, - description: file.description, - name: file.name - }))); + } + break; + + case 0x1941a469: { // attachments + const attachments = value as unknown as IAttachments; + await Promise.all(attachments.attachedFiles + .filter(file => file.mimeType.startsWith('image/')) + .map(file => this.addTag('picture', { + data: file.data, + format: file.mimeType, + description: file.description, + name: file.name + }))); + } + break; } - break; - - //case 'cluster': - case 0x18538067: // segment - debug(`Cancel EBML parser after element ${element.name}`); - return true; // Quite EBML parser, we got the metadata we need } - return false; - }) as unknown as IMatroskaDoc; + }); } private async addTag(tagId: string, value: AnyTagValue): Promise { diff --git a/lib/matroska/types.ts b/lib/matroska/types.ts index c59ac39dc..19f32a17b 100644 --- a/lib/matroska/types.ts +++ b/lib/matroska/types.ts @@ -1,12 +1,12 @@ import type { IEbmlDoc } from '../ebml/types.js'; -export interface ISeekHead { - id?: Uint8Array; - position?: number; +export interface ISeek { + id: Uint8Array; + position: number; } -export interface IMetaSeekInformation { - seekHeads: ISeekHead[]; +export interface ISeekHead { + seek: ISeek[]; } export interface ISegmentInformation { @@ -147,8 +147,8 @@ export interface IAttachments { } export interface IMatroskaSegment { - metaSeekInfo?: IMetaSeekInformation; - seekHeads?: ISeekHead[] + metaSeekInfo?: ISeekHead; + seekHeads?: ISeek[] info?: ISegmentInformation; tracks?: ITrackElement; tags?: ITags; diff --git a/test/test-file-matroska.ts b/test/test-file-matroska.ts index 7c5e73d5f..dcba63de9 100644 --- a/test/test-file-matroska.ts +++ b/test/test-file-matroska.ts @@ -102,7 +102,7 @@ describe('Matroska formats', () => { const filePath = path.join(matroskaSamplePath, 'My Baby Boy.webm'); - const {format, common} = await mm.parseFile(filePath, {duration: true}); + const {format, common} = await mm.parseFile(filePath); assert.strictEqual(format.container, 'EBML/webm', 'format.container'); assert.strictEqual(format.codec, 'OPUS', 'format.codec'); @@ -127,11 +127,10 @@ describe('Matroska formats', () => { // https://github.com/Borewit/music-metadata/issues/384 describe('Multiple audio tracks', () => { - it('parse: "matroska-test-w1-test5-short.mkv"', async () => { - + async function parse(options?: mm.IOptions) { const mkvPath = path.join(matroskaSamplePath, 'matroska-test-w1-test5-short.mkv'); - const {format, common} = await mm.parseFile(mkvPath); + const {format, common} = await mm.parseFile(mkvPath,options); assert.deepEqual(format.container, 'EBML/matroska', 'format.container'); assert.deepEqual(format.tagTypes, [ 'matroska' ], 'format.tagTypes'); @@ -143,6 +142,14 @@ describe('Matroska formats', () => { assert.deepEqual(common.title, 'Elephant Dreams', 'common.title'); assert.deepEqual(common.album, 'Matroska Test Files - Wave 1', 'common.album'); + } + + it('parse: "matroska-test-w1-test5-short.mkv"', () => { + return parse(); + }); + + it('parse: "matroska-test-w1-test5-short.mkv `mkvUseIndex` flag', () => { + return parse({mkvUseIndex: true}); }); }); @@ -160,6 +167,13 @@ describe('Matroska formats', () => { assert.strictEqual(format.numberOfChannels, 1, 'format.numberOfChannels'); }); + it('Parse stream with `mkvUseIndex` flag', async () => { + const {format} = await mm.parseFile(mkvPath, {mkvUseIndex: true}); + assert.strictEqual(format.container, 'EBML/webm', 'format.container'); + assert.strictEqual(format.codec, 'OPUS', 'format.codec'); + assert.strictEqual(format.numberOfChannels, 1, 'format.numberOfChannels'); + }); + }); describe('Handle corrupt Matroska file', () => { @@ -183,15 +197,4 @@ describe('Matroska formats', () => { assert.isUndefined(format.duration, 'format.duration'); }); - it('parse: 1 GB', async () => { - - const mkvPath = 'C:\\Users\\Maarten Gerbrands\\Downloads\\lg-uhd-secret-garden.mkv'; - const metadata = await mm.parseFile(mkvPath); - assert.isDefined(metadata, 'determine file-type'); - assert.strictEqual(metadata.format.container, 'EBML/matroska', 'fileType.mime'); - assert.strictEqual(metadata.format.codec, 'AC3', 'format.codec'); - assert.strictEqual(metadata.format.sampleRate, 48000, 'metadata.format.sampleRate'); - assert.approximately(metadata.format.duration, 184.69, 0.01, 'metadata.format.duration'); - }); - });