Skip to content

Commit 5a68da2

Browse files
committed
feat: Add parser and writer for TTML format
1 parent 73c5eca commit 5a68da2

13 files changed

+562
-3
lines changed

Diff for: README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pysubs2
1414
pysubs2 is a Python library for editing subtitle files.
1515
It’s based on *SubStation Alpha*, the native format of
1616
[Aegisub](http://www.aegisub.org/); it also supports *SubRip (SRT)*,
17-
*MicroDVD*, *MPL2*, *TMP*, *WebVTT* and *SAMI* formats and *OpenAI Whisper* captions.
17+
*MicroDVD*, *MPL2*, *TMP*, *WebVTT*, *TTML* and *SAMI* formats and *OpenAI Whisper* captions.
1818

1919
There is a small CLI tool for batch conversion and retiming.
2020

Diff for: docs/api-reference.rst

+4
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,10 @@ Here you can find specific details regarding support of the individual subtitle
131131
:members:
132132
:show-inheritance:
133133

134+
.. autoclass:: pysubs2.formats.ttml.TTMLFormat
135+
:members:
136+
:show-inheritance:
137+
134138
.. autoclass:: pysubs2.formats.sami.SAMIFormat
135139
:members:
136140
:show-inheritance:

Diff for: docs/index.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ pysubs2
33

44
pysubs2 is a Python library for editing subtitle files. It’s based on *SubStation Alpha*,
55
the native format of `Aegisub <http://www.aegisub.org/>`_; it also supports *SubRip (SRT)*,
6-
*MicroDVD*, *MPL2*, *TMP*, *WebVTT* and *SAMI* formats. There is a small CLI tool for batch conversion and retiming.
6+
*MicroDVD*, *MPL2*, *TMP*, *WebVTT*, *TTML* and *SAMI* formats. There is a small CLI tool for batch conversion and retiming.
77

88
.. code-block:: bash
99

Diff for: docs/supported-formats.rst

+21
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,19 @@ Implemented in :class:`pysubs2.webvtt.WebVTTFormat`.
8686
.. versionchanged:: 1.0.0
8787
Added basic support for WebVTT subtitle format.
8888

89+
TTML
90+
~~~~
91+
92+
A complex XML-based format from W3C, format identifier is ``"ttml"``. This format is capable of advanced styling,
93+
which is currently not supported at all by the parser, but there is some support in the writer.
94+
95+
`Link to TTML specification <https://www.w3.org/TR/ttml1>`_, official name is
96+
"Timed Text Markup Language 1".
97+
Implemented in :class:`pysubs2.formats.ttml.TTMLFormat`.
98+
99+
.. versionchanged:: 1.8.0
100+
Added basic support for TTML subtitle format.
101+
89102
SAMI
90103
~~~~
91104

@@ -97,6 +110,9 @@ official name is "Synchronized Accessible Media Interchange".
97110

98111
Implemented in :class:`pysubs2.sami.SAMIFormat`.
99112

113+
.. versionchanged:: 1.8.0
114+
Added basic support for SAMI subtitle format.
115+
100116
OpenAI Whisper
101117
~~~~~~~~~~~~~~
102118

@@ -105,6 +121,11 @@ format identifier is ``"whisper_jax"``. Only parser is implemented.
105121

106122
Implemented in :class:`pysubs2.whisper.WhisperJAXFormat`.
107123

124+
.. versionchanged:: 1.8.0
125+
Added support for parsing text representation of Whisper JAX output (previously,
126+
only loading Whisper output directly from its Python API was supported).
127+
128+
108129
Frame-based Formats
109130
-------------------
110131

Diff for: pysubs2/common.py

+74-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from dataclasses import dataclass
2-
from typing import Tuple, Union
2+
from typing import Tuple, Union, Optional, Dict, Iterable, Iterator
33
from enum import IntEnum
4+
import xml.etree.ElementTree as ET
5+
from contextlib import contextmanager
46

57

68
@dataclass(init=False)
@@ -62,3 +64,74 @@ def to_ssa_alignment(self) -> int:
6264

6365

6466
IntOrFloat = Union[int, float]
67+
68+
69+
def etree_iter_child_nodes(elem: ET.Element) -> Iterator[Union[ET.Element, str]]:
    """
    Iterate over direct children of an XML element, text nodes included

    ``xml.etree.ElementTree`` has no text node objects: text preceding
    the first subelement is stored in ``elem.text`` and text following
    a subelement is stored in that subelement's ``tail``. This generator
    flattens both into one sequence in document order, yielding ``str``
    for text and ``ET.Element`` for subelements. Missing or empty text
    is not yielded.

    See also:
        `etree_append_child_nodes()`

    """
    leading_text = elem.text
    if leading_text:
        yield leading_text

    for subelement in elem:
        yield subelement
        # The tail is read only after the subelement itself was handed out.
        trailing_text = subelement.tail
        if trailing_text:
            yield trailing_text
86+
87+
def etree_append_child_nodes(elem: ET.Element, nodes: Iterable[Union[ET.Element, str]]) -> None:
    """
    Append text nodes and subelements to the end of an XML element

    Each ``ET.Element`` in ``nodes`` is appended as a subelement. Each
    ``str`` is concatenated to wherever trailing text belongs at that
    point: ``elem.text`` while the element has no subelements yet,
    otherwise the ``tail`` of the most recent subelement.

    See also:
        `etree_iter_child_nodes()`

    """
    # Most recent subelement; None means text still goes to ``elem.text``.
    previous = None if len(elem) == 0 else elem[-1]

    for item in nodes:
        if not isinstance(item, str):
            elem.append(item)
            previous = item
            continue

        if previous is None:
            elem.text = item if elem.text is None else elem.text + item
        else:
            previous.tail = item if previous.tail is None else previous.tail + item
112+
113+
@contextmanager
def etree_register_namespace_override() -> Iterator[None]:
    """
    Context manager that reverts global changes from ``xml.etree.ElementTree.register_namespace()``

    On entry, the content of the module-global namespace map (reached through
    the ``_namespace_map`` attribute of ``ET.register_namespace``) is saved.
    On exit the map is restored to the saved content, also when the body
    of the ``with`` statement raises an exception.

    This relies on a private attribute, so it is best-effort: if the map
    cannot be obtained or copied, the context manager does nothing.

    Workaround for poor namespace handling in ``xml.etree.ElementTree``.

    """
    namespace_map: Optional[Dict[str, str]] = None
    namespace_map_original_content: Dict[str, str] = {}
    try:
        namespace_map = getattr(ET.register_namespace, "_namespace_map", None)
        if namespace_map is not None:
            namespace_map_original_content = namespace_map.copy()
    except Exception:
        # Without a reliable snapshot we must not touch the global map on
        # exit, otherwise "restoring" would wipe it.
        namespace_map = None

    try:
        yield
    finally:
        # Runs on normal exit and when the with-body raised, so registrations
        # made inside the block never leak into global state.
        if namespace_map is not None:
            try:
                namespace_map.clear()
                namespace_map.update(namespace_map_original_content)
            except Exception:
                # Deliberately best-effort; never mask an exception
                # propagating from the with-body.
                pass

Diff for: pysubs2/formats/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .substation import SubstationFormat
99
from .mpl2 import MPL2Format
1010
from .tmp import TmpFormat
11+
from .ttml import TTMLFormat
1112
from .webvtt import WebVTTFormat
1213
from .whisper import WhisperJAXFormat
1314
from ..exceptions import UnknownFormatIdentifierError, UnknownFileExtensionError, FormatAutodetectionError
@@ -23,6 +24,7 @@
2324
".vtt": "vtt",
2425
".sami": "sami",
2526
".smi": "sami",
27+
".ttml": "ttml",
2628
}
2729

2830
#: Dict mapping format identifiers to implementations (FormatBase subclasses).
@@ -37,6 +39,7 @@
3739
"vtt": WebVTTFormat,
3840
"sami": SAMIFormat,
3941
"whisper_jax": WhisperJAXFormat,
42+
"ttml": TTMLFormat,
4043
}
4144

4245
FORMAT_IDENTIFIERS = list(FORMAT_IDENTIFIER_TO_FORMAT_CLASS.keys())

Diff for: pysubs2/formats/subrip.py

+4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ def guess_format(cls, text: str) -> Optional[str]:
4444
# disambiguation vs. WebVTT
4545
return None
4646

47+
if "http://www.w3.org/ns/ttml" in text:
48+
# disambiguation vs. TTML
49+
return None
50+
4751
for line in text.splitlines():
4852
if len(cls.TIMESTAMP.findall(line)) == 2:
4953
return "srt"

0 commit comments

Comments
 (0)