diff --git a/lib/ebml/EbmlIterator.ts b/lib/ebml/EbmlIterator.ts index 921131839..811279aea 100644 --- a/lib/ebml/EbmlIterator.ts +++ b/lib/ebml/EbmlIterator.ts @@ -8,11 +8,28 @@ import * as Token from 'token-types'; const debug = initDebug('music-metadata:parser:ebml'); -interface ILinkedElementType extends IElementType { +export interface ILinkedElementType extends IElementType { + id: number; parent: ILinkedElementType | undefined; readonly container?: { [id: number]: ILinkedElementType; }; } +export enum ParseAction { + ReadNext = 0, // Continue reading the next elements + IgnoreElement = 2, // Ignore (do not read) this element + SkipSiblings = 3, // Skip all remaining elements at the same level + TerminateParsing = 4, // Terminate the parsing process + SkipElement = 5 // Consider the element as already read; assume the position is at the next element +} + +/** + * Listener callbacks: startNext decides how each known element is handled, elementValue receives each parsed value and its start offset + */ +export type IElementListener = { + startNext: (dtdElement: ILinkedElementType) => ParseAction, + elementValue: (dtdElement: ILinkedElementType, value: ValueType, offset: number) => Promise<void> +} + /** * Extensible Binary Meta Language (EBML) iterator * https://en.wikipedia.org/wiki/Extensible_Binary_Meta_Language @@ -42,16 +59,17 @@ export class EbmlIterator { this.parserMap.set(DataType.float, e => this.readFloat(e)); } - public async iterate(dtdElement: IElementType, posDone: number): Promise<ITree> { - return this.parseContainer(linkParents(dtdElement), posDone); + public async iterate(dtdElement: IElementType, posDone: number, listener: IElementListener): Promise<ITree> { + return this.parseContainer(linkParents(dtdElement), posDone, listener); } - private async parseContainer(dtdElement: ILinkedElementType, posDone: number): Promise<ITree> { + private async parseContainer(dtdElement: ILinkedElementType, posDone: number, listener: IElementListener): Promise<ITree> { const tree: ITree = {}; while (this.tokenizer.position < posDone) { let element: IHeader; + const elementPosition = 
this.tokenizer.position; try { - element = await this.readElement(); + element = await this.readElement(); } catch (error) { if (error instanceof EndOfStreamError) { break; @@ -60,32 +78,53 @@ export class EbmlIterator { } const child = (dtdElement.container as { [id: number]: ILinkedElementType; })[element.id]; if (child) { - if (child.ignore) { - debug(`Ignore element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container}`); - await this.tokenizer.ignore(element.len); - } else { - if (element.id === 0x1F43B675) { - // Hack to ignore remaining segment, when cluster element received - // await this.tokenizer.ignore(posDone - this.tokenizer.position); - // break; - } - debug(`Reading element: name=${getElementPath(child)}{id=0x${element.id}, container=${!!child.container}}`); - if (child.container) { - const res = await this.parseContainer(child, element.len >= 0 ? this.tokenizer.position + element.len : -1); - if (child.multiple) { - if (!tree[child.name]) { - tree[child.name] = []; + const action = listener.startNext(child); + switch (action) { + case ParseAction.ReadNext: { + if (element.id === 0x1F43B675) { + // Hack to ignore remaining segment, when cluster element received + // await this.tokenizer.ignore(posDone - this.tokenizer.position); + // break; + } + debug(`Read element: name=${getElementPath(child)}{id=0x${element.id.toString(16)}, container=${!!child.container}} at position=${elementPosition}`); + if (child.container) { + const res = await this.parseContainer(child, element.len >= 0 ? 
this.tokenizer.position + element.len : -1, listener); + if (child.multiple) { + if (!tree[child.name]) { + tree[child.name] = []; + } + (tree[child.name] as ITree[]).push(res); + } else { + tree[child.name] = res; } - (tree[child.name] as ITree[]).push(res); + await listener.elementValue(child, res, elementPosition); } else { - tree[child.name] = res; - } - } else { - const parser = this.parserMap.get(child.value as DataType); - if (typeof parser === 'function') { - tree[child.name] = await parser(element); + const parser = this.parserMap.get(child.value as DataType); + if (typeof parser === 'function') { + const value = await parser(element); + tree[child.name] = value; + await listener.elementValue(child, value, elementPosition); + } } - } + } break; + + case ParseAction.SkipElement: + debug(`Go to next element: name=${getElementPath(child)}, element.id=0x${element.id.toString(16)}, container=${!!child.container} at position=${elementPosition}`); + break; + + case ParseAction.IgnoreElement: + debug(`Ignore element: name=${getElementPath(child)}, element.id=0x${element.id.toString(16)}, container=${!!child.container} at position=${elementPosition}`); + await this.tokenizer.ignore(element.len); + break; + + case ParseAction.SkipSiblings: + debug(`Ignore remaining container, at: name=${getElementPath(child)}, element.id=0x${element.id.toString(16)}, container=${!!child.container} at position=${elementPosition}`); + await this.tokenizer.ignore(posDone - this.tokenizer.position); + break; + + case ParseAction.TerminateParsing: + debug(`Terminate parsing at element: name=${getElementPath(child)}, element.id=0x${element.id.toString(16)}, container=${!!child.container} at position=${elementPosition}`); + return tree; } } else { switch (element.id) { @@ -94,7 +133,7 @@ export class EbmlIterator { await this.tokenizer.ignore(element.len); break; default: - debug(`parseEbml: parent=${getElementPath(dtdElement)}, unknown child: id=${element.id.toString(16)}`); + debug(`parseEbml: parent=${getElementPath(dtdElement)}, unknown 
child: id=${element.id.toString(16)} at position=${elementPosition}`); this.padding += element.len; await this.tokenizer.ignore(element.len); } @@ -191,8 +230,11 @@ function readUIntBeAsBigInt(buf: Uint8Array, len: number): bigint { function linkParents(element: IElementType): ILinkedElementType { if (element.container) { Object.keys(element.container) - .map(id => (element.container as { [id: string]: ILinkedElementType; })[id]) - .forEach(child => { + .map(id => { + const child = (element.container as { [id: string]: ILinkedElementType; })[id]; + child.id = Number.parseInt(id, 10); + return child; + }).forEach(child => { child.parent = element as ILinkedElementType; linkParents(child); }); diff --git a/lib/ebml/types.ts b/lib/ebml/types.ts index a97113f69..810fc26e6 100644 --- a/lib/ebml/types.ts +++ b/lib/ebml/types.ts @@ -24,7 +24,6 @@ export interface IElementType { readonly value?: DataType; readonly container?: { [id: number]: IElementType; }; readonly multiple?: boolean; - readonly ignore?: boolean; } export interface IEbmlDoc { diff --git a/lib/matroska/MatroskaDtd.ts b/lib/matroska/MatroskaDtd.ts index 92c382cd9..3bd32625e 100644 --- a/lib/matroska/MatroskaDtd.ts +++ b/lib/matroska/MatroskaDtd.ts @@ -27,15 +27,16 @@ export const matroskaDtd: IElementType = { name: 'segment', container: { - // Meta Seek Information + // Meta Seek Information (also known as MetaSeek) 0x114d9b74: { name: 'seekHead', container: { 0x4dbb: { name: 'seek', + multiple: true, container: { - 0x53ab: {name: 'seekId', value: DataType.binary}, - 0x53ac: {name: 'seekPosition', value: DataType.uint} + 0x53ab: {name: 'id', value: DataType.binary}, + 0x53ac: {name: 'position', value: DataType.uint} } } } @@ -69,8 +70,8 @@ export const matroskaDtd: IElementType = { 0x58d7: {name: 'silentTracks ', multiple: true}, 0xa7: {name: 'position', value: DataType.uid}, 0xab: {name: 'prevSize', value: DataType.uid}, - 0xa0: {name: 'blockGroup', ignore: true}, - 0xa3: {name: 'simpleBlock', ignore: true} + 
0xa0: {name: 'blockGroup'}, + 0xa3: {name: 'simpleBlock'} } }, @@ -289,7 +290,6 @@ export const matroskaDtd: IElementType = { } } } - } } } diff --git a/lib/matroska/MatroskaParser.ts b/lib/matroska/MatroskaParser.ts index e7a911cfa..7a67e7274 100644 --- a/lib/matroska/MatroskaParser.ts +++ b/lib/matroska/MatroskaParser.ts @@ -4,11 +4,11 @@ import type { ITokenizer } from 'strtok3'; import type { INativeMetadataCollector } from '../common/MetadataCollector.js'; import { BasicParser } from '../common/BasicParser.js'; import { matroskaDtd } from './MatroskaDtd.js'; -import { type IMatroskaDoc, type ITrackEntry, TargetType, TrackType } from './types.js'; +import { type IAttachments, type ISegmentInformation, type ITags, type ITrackElement, type ITrackEntry, TargetType, TrackType } from './types.js'; import type { AnyTagValue, IOptions, ITrackInfo } from '../type.js'; import type { ITokenParser } from '../ParserFactory.js'; -import { EbmlIterator } from '../ebml/EbmlIterator.js'; +import { EbmlIterator, ParseAction } from '../ebml/EbmlIterator.js'; const debug = initDebug('music-metadata:parser:matroska'); @@ -38,78 +38,103 @@ export class MatroskaParser extends BasicParser { const matroskaIterator = new EbmlIterator(this.tokenizer); debug('Initializing DTD end MatroskaIterator'); - const matroska = await matroskaIterator.iterate(matroskaDtd, containerSize) as unknown as IMatroskaDoc; - - this.metadata.setFormat('container', `EBML/${matroska.ebml.docType}`); - if (matroska.segment) { - - const info = matroska.segment.info; - if (info) { - const timecodeScale = info.timecodeScale ? 
info.timecodeScale :1000000; - if (typeof info.duration === 'number') { - const duration = info.duration * timecodeScale / 1000000000; - await this.addTag('segment:title', info.title); - this.metadata.setFormat('duration', Number(duration)); + await matroskaIterator.iterate(matroskaDtd, containerSize, { + startNext: (element) => { + switch (element.id) { + // case 0x1f43b675: // cluster + case 0x1c53bb6b: // Cueing Data + debug(`Skip element: name=${element.name}, id=0x${element.id.toString(16)}`); + return ParseAction.IgnoreElement; + case 0x1f43b675: // cluster + return ParseAction.IgnoreElement; + default: + return ParseAction.ReadNext; } - } - - const audioTracks = matroska.segment.tracks; - if (audioTracks?.entries) { - - audioTracks.entries.forEach(entry => { - const stream: ITrackInfo = { - codecName: entry.codecID.replace('A_', '').replace('V_', ''), - codecSettings: entry.codecSettings, - flagDefault: entry.flagDefault, - flagLacing: entry.flagLacing, - flagEnabled: entry.flagEnabled, - language: entry.language, - name: entry.name, - type: entry.trackType, - audio: entry.audio, - video: entry.video - }; - this.metadata.addStreamInfo(stream); - }); - - const audioTrack = audioTracks.entries - .filter(entry => entry.trackType === TrackType.audio) - .reduce((acc: ITrackEntry | null, cur: ITrackEntry): ITrackEntry => { - if (!acc) return cur; - if (cur.flagDefault && !acc.flagDefault) return cur; - if (cur.trackNumber < acc.trackNumber) return cur; - return acc; - }, null); - - if (audioTrack) { - this.metadata.setFormat('codec', audioTrack.codecID.replace('A_', '')); - this.metadata.setFormat('sampleRate', audioTrack.audio.samplingFrequency); - this.metadata.setFormat('numberOfChannels', audioTrack.audio.channels); - } - - if (matroska.segment.tags) { - await Promise.all(matroska.segment.tags.tag.map(async tag => { - const target = tag.target; - const targetType = target?.targetTypeValue ? TargetType[target.targetTypeValue] : (target?.targetType ? 
target.targetType : 'track'); - await Promise.all(tag.simpleTags.map(async simpleTag => { - const value = simpleTag.string ? simpleTag.string : simpleTag.binary; - await this.addTag(`${targetType}:${simpleTag.name}`, value); + }, + elementValue: async (element, value) => { + debug(`Received: name=${element.name}, value=${value}`); + switch (element.id) { + case 0x4282: // docType + this.metadata.setFormat('container', `EBML/${value}`); + break; + + case 0x1549a966: { // Info (Segment Information) + const info = value as ISegmentInformation; + const timecodeScale = info.timecodeScale ? info.timecodeScale : 1000000; + if (typeof info.duration === 'number') { + const duration = info.duration * timecodeScale / 1000000000; + await this.addTag('segment:title', info.title); + this.metadata.setFormat('duration', Number(duration)); + } + } + break; + + case 0x1654ae6b: { // tracks + const audioTracks = value as ITrackElement; + if (audioTracks?.entries) { + audioTracks.entries.forEach(entry => { + const stream: ITrackInfo = { + codecName: entry.codecID.replace('A_', '').replace('V_', ''), + codecSettings: entry.codecSettings, + flagDefault: entry.flagDefault, + flagLacing: entry.flagLacing, + flagEnabled: entry.flagEnabled, + language: entry.language, + name: entry.name, + type: entry.trackType, + audio: entry.audio, + video: entry.video + }; + this.metadata.addStreamInfo(stream); + }); + + const audioTrack = audioTracks.entries + .filter(entry => entry.trackType === TrackType.audio) + .reduce((acc: ITrackEntry | null, cur: ITrackEntry): ITrackEntry => { + if (!acc) return cur; + if (cur.flagDefault && !acc.flagDefault) return cur; + if (cur.trackNumber < acc.trackNumber) return cur; + return acc; + }, null); + + if (audioTrack) { + this.metadata.setFormat('codec', audioTrack.codecID.replace('A_', '')); + this.metadata.setFormat('sampleRate', audioTrack.audio.samplingFrequency); + this.metadata.setFormat('numberOfChannels', audioTrack.audio.channels); + } + } + } + break; + 
+ case 0x1254c367: { // tags + const tags = value as unknown as ITags; + await Promise.all(tags.tag.map(async tag => { + const target = tag.target; + const targetType = target?.targetTypeValue ? TargetType[target.targetTypeValue] : (target?.targetType ? target.targetType : 'track'); + await Promise.all(tag.simpleTags.map(async simpleTag => { + const tagValue = simpleTag.string ? simpleTag.string : simpleTag.binary; + await this.addTag(`${targetType}:${simpleTag.name}`, tagValue); + })); })); - })); - } - - if (matroska.segment.attachments) { - await Promise.all(matroska.segment.attachments.attachedFiles - .filter(file => file.mimeType.startsWith('image/')) - .map(file => this.addTag('picture', { - data: file.data, - format: file.mimeType, - description: file.description, - name: file.name - }))); + } + break; + + case 0x1941a469: { // attachments + const attachments = value as unknown as IAttachments; + await Promise.all(attachments.attachedFiles + .filter(file => file.mimeType.startsWith('image/')) + .map(file => this.addTag('picture', { + data: file.data, + format: file.mimeType, + description: file.description, + name: file.name + }))); + + } + break; } } - } + }); } private async addTag(tagId: string, value: AnyTagValue): Promise<void> { diff --git a/lib/matroska/types.ts b/lib/matroska/types.ts index c59ac39dc..19f32a17b 100644 --- a/lib/matroska/types.ts +++ b/lib/matroska/types.ts @@ -1,12 +1,12 @@ import type { IEbmlDoc } from '../ebml/types.js'; -export interface ISeekHead { - id?: Uint8Array; - position?: number; +export interface ISeek { + id: Uint8Array; + position: number; } -export interface IMetaSeekInformation { - seekHeads: ISeekHead[]; +export interface ISeekHead { + seek: ISeek[]; } export interface ISegmentInformation { @@ -147,8 +147,8 @@ export interface IAttachments { } export interface IMatroskaSegment { - metaSeekInfo?: IMetaSeekInformation; - seekHeads?: ISeekHead[] + metaSeekInfo?: ISeekHead; + seekHeads?: ISeek[] info?: ISegmentInformation; tracks?: 
ITrackElement; tags?: ITags; diff --git a/test/test-file-matroska.ts b/test/test-file-matroska.ts index 99cedb8cd..e2dda7d40 100644 --- a/test/test-file-matroska.ts +++ b/test/test-file-matroska.ts @@ -102,7 +102,7 @@ describe('Matroska formats', () => { const filePath = path.join(matroskaSamplePath, 'My Baby Boy.webm'); - const {format, common} = await mm.parseFile(filePath, {duration: true}); + const {format, common} = await mm.parseFile(filePath); assert.strictEqual(format.container, 'EBML/webm', 'format.container'); assert.strictEqual(format.codec, 'OPUS', 'format.codec');