Skip to content

Commit

Permalink
add use_sec option in alignment
Browse files Browse the repository at this point in the history
  • Loading branch information
Patchethium committed Aug 13, 2023
1 parent 6eaf892 commit 68e31b1
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 10 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ I'll cover this part if it's needed by anyone. Please let me know by creating an
- Rust crate
- multi-language
- Storing `pau` index in binary model
- Option to convert frame number into milliseconds
- Record and warn the user when score is too low

## Licence
Expand All @@ -49,7 +48,7 @@ I'll cover this part if it's needed by anyone. Please let me know by creating an

The file `snfa/stft.py` contains code adapted from `librosa` which obeys `ISC Licence` with different copyright claim. A copy of `librosa`'s licence can be found in [librosa's repo](https://github.com/librosa/librosa/blob/main/LICENSE.md).

The file `snfa/backtrack.py` contains code adapted from `torchaudio` which obeys `BSD 2-Clause "Simplified" License`. A copy of `torchaudio`'s licence can be found in [torchaudio's repo](https://github.com/pytorch/audio/blob/main/LICENSE).
The file `snfa/viterbi.py` contains code adapted from `torchaudio` which obeys `BSD 2-Clause "Simplified" License`. A copy of `torchaudio`'s licence can be found in [torchaudio's repo](https://github.com/pytorch/audio/blob/main/LICENSE).

## Credit

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "snfa"
version = "0.0.5"
version = "0.0.1"
authors = [{ name = "Patchethium" }]
description = "a simple neural forced aligner for phoneme to audio alignment"
readme = "README.md"
Expand Down
19 changes: 12 additions & 7 deletions src/snfa/aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ def softmax(x, axis=-1):
e_x = np.exp(x - np.max(x, axis, keepdims=True))
return e_x / np.sum(e_x, axis, keepdims=True)


def l1_normalize(arr, axis=None):
    """Shift ``arr`` to a zero global minimum and scale it by its L1 norm.

    The minimum is taken over the whole array (not per ``axis``), so every
    shifted value is non-negative. The L1 norm is then summed along ``axis``
    with ``keepdims=True`` so it broadcasts back against the shifted array.

    Args:
        arr: Array-like of numbers.
        axis: Axis (or axes) along which the L1 norm is summed; ``None``
            sums over all elements.

    Returns:
        The shifted array divided by its L1 norm along ``axis``.

    Note:
        No guard is applied for a zero norm (e.g. a constant input); the
        division is left to NumPy's own divide-by-zero handling.
    """
    shifted = arr - np.min(arr)
    magnitudes = np.abs(shifted)
    l1_norm = np.sum(magnitudes, axis=axis, keepdims=True)
    return shifted / l1_norm


def log_softmax(x, axis=-1):
    """Return the logarithm of the softmax of ``x`` along ``axis``.

    Computed as ``shifted - log(sum(exp(shifted)))`` where ``shifted`` is
    ``x`` minus its maximum along ``axis``. Taking ``log`` of an already
    normalised softmax would return ``-inf`` for entries whose ``exp``
    underflows to zero; this form keeps such entries finite.

    Args:
        x: Array-like of logits.
        axis: Axis along which the softmax is normalised.

    Returns:
        Array of log-probabilities with the same shape as ``x``.
    """
    # Subtracting the max keeps exp() from overflowing and guarantees at
    # least one term equals exp(0) == 1, so the log argument is never zero.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
    return shifted - log_norm

Expand Down Expand Up @@ -79,12 +81,10 @@ def __call__(self, x: np.ndarray) -> np.ndarray:
class Aligner:
def __init__(self, filename: str = "model.bin"):
f = open(filename, "rb")

# Read metadata first, 8 is the amount of metadata entries
# each entry is one int32 (4 bytes)
meta_data: np.ndarray = np.frombuffer(
f.read(8 * 4), np.int32, count=8
)
meta_data: np.ndarray = np.frombuffer(f.read(8 * 4), np.int32, count=8)
# the entry list
[
self.n_fft,
Expand Down Expand Up @@ -159,7 +159,7 @@ def get_indices(self, ph):
raise Exception("phoneme not in model's phoneme set")
return tokens

def align(self, x, ph):
def align(self, x, ph, use_sec=False):
mel = self.mel(x)
indices = self.get_indices(ph)

Expand All @@ -171,10 +171,15 @@ def align(self, x, ph):
path = viterbi.backtrack(trellis)

segments = viterbi.merge_repeats(path, indices)
if use_sec:
for seg in segments:
seg.start = seg.start * self.hop_size / self.sr
seg.end = seg.end * self.hop_size / self.sr
return segments, path, trellis, emission, labels

def __call__(self, x: np.ndarray, ph: List[str]):
return self.align(x, ph)
def __call__(self, x: np.ndarray, ph: List[str], use_sec=False):
return self.align(x, ph, use_sec)


if __name__ == "__main__":
alinger = Aligner("cv_jp.bin")
Expand Down

0 comments on commit 68e31b1

Please sign in to comment.