rST Documentation fixes #184

Merged: 3 commits, Jun 30, 2024

4 changes: 2 additions & 2 deletions benchmarks/tokenizer_training/benchmark_training.py
@@ -124,7 +124,7 @@ def seq_len_splits(datasets_params: list[tuple[str, dict, str]]) -> None:

Measures the average token sequence length (in base tokens) after splitting the
token sequence of whole files into bars or beats.
These measures can be used to chose good `max_input_chars_per_word` values for
These measures can be used to chose good ``max_input_chars_per_word`` values for
WordPiece.

:param datasets_params: sets of data and tokenizers params.
@@ -339,7 +339,7 @@ def wordpiece_max_chars(
"""
Measure the training, encoding and decoding times of WordPiece.

Measures are made with different `max_input_chars_per_word` values, datasets and
Measures are made with different ``max_input_chars_per_word`` values, datasets and
sequence split to see their impact on training, encoding and decoding times.
It also measures the ratio of "unknown" tokens resulting of ids encoding,
measuring the proportion of data covered by these sets of parameters / data.
3 changes: 2 additions & 1 deletion docs/examples.rst
@@ -122,10 +122,11 @@ Creates a Dataset and a collator to be used with a PyTorch DataLoader to train a
from miditok.pytorch_data import DatasetMIDI, DataCollator
from torch.utils.data import DataLoader

tokenizer = REMI() # using defaults parameters (constants.py)
midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid"))
dataset = DatasetMIDI(
    files_paths=midi_paths,
    tokenizer,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer["BOS_None"],
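
The example above is cut off by the diff view. A hedged sketch of how such an example typically continues with the ``DataCollator`` imported alongside ``DatasetMIDI`` (the batch size and the ``input_ids`` key are illustrative assumptions, not taken from this diff):

collator = DataCollator(pad_token_id=tokenizer.pad_token_id)
data_loader = DataLoader(dataset, batch_size=64, collate_fn=collator)

for batch in data_loader:
    # Assumed key name; the collator pads each sequence of the batch with pad_token_id.
    print(batch["input_ids"].shape)
    break
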
4 changes: 2 additions & 2 deletions docs/train.rst
@@ -28,7 +28,7 @@ In the case of music, newly learned tokens can represent whole notes (i.e. succe
Reduced sequence lengths
~~~~~~~~~~~~~~~~~~~~~~~~

Serializing music files in single "basic" attribute tokens naturally induces fairly long token sequences. As a note is made of at least three tokens (`Pitch`, `Velocity`, `Duration`/`NoteOff`, optionally `Program`), the resulting token sequence will have a number of tokens at least three times the number of notes.
Serializing music files in single "basic" attribute tokens naturally induces fairly long token sequences. As a note is made of at least three tokens (``Pitch``, ``Velocity``, ``Duration``/``NoteOff``, optionally ``Program``), the resulting token sequence will have a number of tokens at least three times the number of notes.

This is problematic as the time and space complexity of Transformer models grow quadratically with the input sequence length. Thus, the longer the sequence is, the more computations will be made and memory will be used.

Expand Down Expand Up @@ -63,7 +63,7 @@ Byte Pair Encoding (BPE)

`BPE <https://en.wikipedia.org/wiki/Byte_pair_encoding>`_ is a compression algorithm that replaces the most recurrent token successions of a corpus, by newly created ones. It starts from a vocabulary containing tokens representing the initial alphabet of the modality of the data at hand, and iteratively counts the occurrences of each token successions, or bigrams, in the data, and merges the most recurrent one with a new token representing both of them, until the vocabulary reaches the desired size.

For instance, in the character sequence ``aabaabaacaa``, the sub-sequence ``aa`` occurs three times and is the most recurrent one. Learning BPE on this sequence would replace ``aa`` with a new symbol, e.g., `d`, resulting in a compressed sequence ``dbdbdcd``. The latter can be reduced again by replacing the ``db`` subsequence, giving ``eedcd``. The vocabulary, which initially contained three characters (``a``, ``b`` and ``c``) now also contains ``d`` and ``e``. In practice BPE is learned on a corpus until the vocabulary reaches a target size.
For instance, in the character sequence ``aabaabaacaa``, the sub-sequence ``aa`` occurs three times and is the most recurrent one. Learning BPE on this sequence would replace ``aa`` with a new symbol, e.g., ``d``, resulting in a compressed sequence ``dbdbdcd``. The latter can be reduced again by replacing the ``db`` subsequence, giving ``eedcd``. The vocabulary, which initially contained three characters (``a``, ``b`` and ``c``) now also contains ``d`` and ``e``. In practice BPE is learned on a corpus until the vocabulary reaches a target size.
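
As an aside, a minimal standalone Python sketch of the merge step described above, purely for illustration (this is not how miditok trains its tokenizers):

from collections import Counter

def bpe_merge_once(seq: list[str], new_symbol: str) -> list[str]:
    """Replace every occurrence of the most frequent bigram with ``new_symbol``."""
    pair, _ = Counter(zip(seq, seq[1:])).most_common(1)[0]
    merged, i = [], 0
    while i < len(seq):
        if i + 1 < len(seq) and (seq[i], seq[i + 1]) == pair:
            merged.append(new_symbol)
            i += 2
        else:
            merged.append(seq[i])
            i += 1
    return merged

seq = list("aabaabaacaa")
seq = bpe_merge_once(seq, "d")  # -> "dbdbdcd"
seq = bpe_merge_once(seq, "e")  # -> "eedcd"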

Today in the NLP field, BPE is used with many tokenizers to build their vocabulary, as `it allows to encode rare words and segmenting unknown or composed words as sequences of sub-word units <https://aclanthology.org/P16-1162/>`_. The base initial vocabulary is the set of all the unique characters present in the data, which compose the words that are automatically learned as tokens by the BPE algorithm.

10 changes: 5 additions & 5 deletions miditok/classes.py
@@ -396,7 +396,7 @@ class TokenizerConfig:
events. In multitrack setting, The value of each ``Pedal`` token will be equal
to the program of the track. (default: ``False``)
:param use_pitch_bends: will use ``PitchBend`` tokens. In multitrack setting, a
``Program`` token will be added before each ``PitchBend` token.
``Program`` token will be added before each ``PitchBend`` token.
(default: ``False``)
:param use_pitch_intervals: if given True, will represent the pitch of the notes
with pitch intervals tokens. This way, successive and simultaneous notes will
@@ -408,7 +408,7 @@
:param use_programs: will use ``Program`` tokens to specify the instrument/MIDI
program of the notes, if the tokenizer is compatible (:ref:`TSD`, :ref:`REMI`,
:ref:`MIDI-Like`, :ref:`Structured` and :ref:`CPWord`). Use this parameter with
the ``programs``, ``one_token_stream_for_programs`` and `program_changes`
the ``programs``, ``one_token_stream_for_programs`` and ``program_changes``
arguments. By default, it will prepend a ``Program`` tokens before each
``Pitch``/``NoteOn`` token to indicate its associated instrument, and will
treat all the tracks of a file as a single sequence of tokens. :ref:`CPWord`,
@@ -472,8 +472,8 @@ class TokenizerConfig:
durations you expect. (default: ``False``)
:param pitch_bend_range: range of the pitch bend to consider, to be given as a
tuple with the form ``(lowest_value, highest_value, num_of_values)``. There
will be ``num_of_values`` tokens equally spaced between ``lowest_value` and
`highest_value``. (default: ``(-8192, 8191, 32)``)
will be ``num_of_values`` tokens equally spaced between ``lowest_value`` and
``highest_value``. (default: ``(-8192, 8191, 32)``)
:param delete_equal_successive_time_sig_changes: setting this option True will
delete identical successive time signature changes when preprocessing a music
file after loading it. For examples, if a file has two time signature changes
@@ -880,7 +880,7 @@ def __eq__(self, other: object) -> bool:
Check two configs are equal.

:param other: other config object to compare.
:return: `True` if all attributes are equal, `False` otherwise.
:return: ``True`` if all attributes are equal, ``False`` otherwise.
"""
# We don't use the == operator as it yields False when comparing lists and
# tuples containing the same elements. This method is not recursive and only
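
For context, a minimal sketch combining the parameters documented above into a config (the values are illustrative, not defaults):

from miditok import REMI, TokenizerConfig

config = TokenizerConfig(
    use_pitch_bends=True,
    pitch_bend_range=(-8192, 8191, 32),  # (lowest_value, highest_value, num_of_values)
    use_programs=True,
)
tokenizer = REMI(config)
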
10 changes: 5 additions & 5 deletions miditok/midi_tokenizer.py
@@ -333,9 +333,9 @@
@property
def pad_token_id(self) -> int:
"""
Return the id of the padding token (`PAD_None`). It is usually 0.
Return the id of the padding token (``PAD_None``). It is usually 0.

:return: id of the padding token (`PAD_None`).
:return: id of the padding token (``PAD_None``).
"""
return (
self._vocab_base["PAD_None"]
@@ -1109,7 +1109,7 @@

The workflow of this method is as follows: the global events (*Tempo*,
*TimeSignature*...) and track events (*Pitch*, *Velocity*, *Pedal*...) are
gathered into a list, then the time events are added. If `one_token_stream` is
gathered into a list, then the time events are added. If ``one_token_stream`` is
``True``, all events of all tracks are treated all at once, otherwise the
events of each track are treated independently.

@@ -1524,7 +1524,7 @@

def _create_global_events(self, score: Score) -> list[Event]:
r"""
Create the *global* music tokens: `Tempo` and `TimeSignature`.
Create the *global* music tokens: ``Tempo`` and ``TimeSignature``.

:param score: ``symusic.Score`` to extract the events from.
:return: list of ``miditok.classes.Event``.
@@ -3048,7 +3048,7 @@
The resulting json files will have an ``ids`` entry containing the token ids.
The format of the ids will correspond to the format of the tokenizer
(``tokenizer.io_format``). Note that the file tree of the source files, up to
the deepest common root directory if `files_paths` is given as a list of paths,
the deepest common root directory if ``files_paths`` is given as a list of paths,
will be reproducing in ``out_dir``. The config of the tokenizer will be saved
as a file named ``tokenizer_config_file_name`` (default: ``tokenizer.json``)
in the ``out_dir`` directory.

Check failure on line 3051 in miditok/midi_tokenizer.py (GitHub Actions / lint, Ruff E501): Line too long (89 > 88)
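
As a quick illustration of the ``pad_token_id`` property documented above, a minimal sketch with a tokenizer built from default settings:

from miditok import REMI

tokenizer = REMI()
print(tokenizer.pad_token_id)  # id of the "PAD_None" token, usually 0
print(tokenizer["PAD_None"])   # equivalent lookup by token name
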
4 changes: 2 additions & 2 deletions miditok/tokenizations/__init__.py
@@ -1,8 +1,8 @@
"""
Tokenizer module.

This module implement tokenizer classes, which inherit from `MusicTokenizer` and
override specific methods such as `_add_time_events` or `_tokens_to_score` with
This module implement tokenizer classes, which inherit from ``MusicTokenizer`` and
override specific methods such as ``_add_time_events`` or ``_tokens_to_score`` with
their specific behaviors/representations.
"""

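To make the module docstring concrete, a heavily simplified skeleton of such a subclass (the method signatures are assumptions sketched from the docstring, not copied from the codebase):

from miditok import MusicTokenizer

class MyTokenizer(MusicTokenizer):
    """Sketch of a custom tokenization; only the overridden hooks are shown."""

    def _add_time_events(self, events, time_division):
        # Interleave time tokens (e.g. TimeShift) with the note and global events.
        raise NotImplementedError

    def _tokens_to_score(self, tokens, programs=None):
        # Rebuild a symusic.Score from the decoded token sequence(s).
        raise NotImplementedError
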
10 changes: 5 additions & 5 deletions miditok/tokenizations/midi_like.py
@@ -25,18 +25,18 @@
(``config.additional_params["max_duration"]``) to be given as a tuple of three
integers following ``(num_beats, num_frames, res_frames)``, the resolutions being
in the frames per beat.
If you specify `use_programs` as `True` in the config file, the tokenizer will add
``Program`` tokens before each `Pitch` tokens to specify its instrument, and will
If you specify ``use_programs`` as ``True`` in the config file, the tokenizer will add
``Program`` tokens before each ``Pitch`` tokens to specify its instrument, and will
treat all tracks as a single stream of tokens.

Check failure on line 28 in miditok/tokenizations/midi_like.py (GitHub Actions / lint, Ruff E501): Line too long (90 > 88)

**Note:** as `MIDILike` uses *TimeShifts* events to move the time from note to
**Note:** as ``MIDILike`` uses *TimeShifts* events to move the time from note to
note, it could be unsuited for tracks with long pauses. In such case, the
maximum *TimeShift* value will be used. Also, the `MIDILike` tokenizer might alter
maximum *TimeShift* value will be used. Also, the ``MIDILike`` tokenizer might alter
the durations of overlapping notes. If two notes of the same instrument with the
same pitch are overlapping, i.e. a first one is still being played when a second
one is also played, the offset time of the first will be set to the onset time of
the second. This is done to prevent unwanted duration alterations that could happen
in such case, as the `NoteOff` token associated to the first note will also end the
in such case, as the ``NoteOff`` token associated to the first note will also end the
second one.

Check failure on line 39 in miditok/tokenizations/midi_like.py (GitHub Actions / lint, Ruff E501): Line too long (89 > 88)

**Note:** When decoding multiple token sequences (of multiple tracks), i.e. when
``config.use_programs`` is False, only the tempos and time signatures of the first
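
A minimal sketch tying together the options mentioned in this docstring (the ``max_duration`` value is illustrative; the parameter names come from the text above):

from miditok import MIDILike, TokenizerConfig

config = TokenizerConfig(use_programs=True)
config.additional_params["max_duration"] = (4, 0, 8)  # (num_beats, num_frames, res_frames)
tokenizer = MIDILike(config)
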
2 changes: 1 addition & 1 deletion miditok/tokenizations/mumidi.py
@@ -103,7 +103,7 @@ def _score_to_tokens(
r"""
Convert a **preprocessed** ``symusic.Score`` object to a sequence of tokens.

MuMIDI has its own implementation and doesn't use `_add_time_events`.
MuMIDI has its own implementation and doesn't use ``_add_time_events``.

:param score: the :class:`symusic.Score` object to convert.
:return: a :class:`miditok.TokSequence` if ``tokenizer.one_token_stream`` is
4 changes: 2 additions & 2 deletions miditok/tokenizations/structured.py
@@ -25,7 +25,7 @@ class Structured(MusicTokenizer):
Token types always follow the same pattern: *Pitch* -> *Velocity* -> *Duration* ->
*TimeShift*. The latter is set to 0 for simultaneous notes. To keep this property,
no additional token can be inserted in MidiTok's implementation, except *Program*
that can optionally be added preceding `Pitch` tokens. If you specify
that can optionally be added preceding ``Pitch`` tokens. If you specify
``use_programs`` as ``True`` in the config file, the tokenizer will add *Program*
tokens before each *Pitch* tokens to specify its instrument, and will treat all
tracks as a single stream of tokens.
@@ -210,7 +210,7 @@ def _score_to_tokens(
Convert a **preprocessed** ``symusic.Score`` object to a sequence of tokens.

We override the parent method to handle the "non-program" case where
*TimeShift* events have already been added by `_notes_to_events`.
*TimeShift* events have already been added by ``_notes_to_events``.

The workflow of this method is as follows: the global events (*Tempo*,
*TimeSignature*...) and track events (*Pitch*, *Velocity*, *Pedal*...) are
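
To illustrate the fixed token pattern described above, a minimal usage sketch (the file path is a placeholder):

from pathlib import Path
from miditok import Structured, TokenizerConfig

tokenizer = Structured(TokenizerConfig(use_programs=True))
tokens = tokenizer(Path("path", "to", "file.mid"))
# With use_programs=True, tracks are merged into one token stream following
# Program -> Pitch -> Velocity -> Duration -> TimeShift for every note.
print(tokens.tokens[:10])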