Encoding Long Audio Clips #71

aviaefrat · 2023-09-12T22:48:24Z

I need the EnCodec tokens of long audio clips (hours long).
Inputting such files as-is results in a CUDA OOM error.
I've seen you "do not try to be smart about long files".
Does chunking the long audio files naively (and concatenating the EnCodec tokens post-hoc) produce the same results as inputting the entire file to the model?
If not, how should I chunk my audio files?

julien-blanchon · 2024-05-12T13:54:56Z

Hey @aviaefrat, did you find an elegant solution to this?

foreverhell · 2024-07-24T03:20:25Z

I have tried to split the long audio files naively and concatenate the EnCodec tokens, but the resulting tokens are not consistent with those of the full file, except for the first clip. I do not know how to keep them the same.

MiniXC · 2024-11-22T11:00:54Z

Here is an (admittedly not too clean) snippet that I successfully used for encoding/decoding long files.

from pathlib import Path
from argparse import ArgumentParser
import io
import gzip
import tarfile
import shutil

from encodec import EncodecModel
from encodec.compress import compress, decompress
from encodec.utils import convert_audio
import torchaudio
import torch
import numpy as np
from tqdm.auto import tqdm

# Shared EnCodec model (24 kHz variant), built once at import time. The
# __main__ block sets its target bandwidth and moves it to the chosen device.
model = EncodecModel.encodec_model_24khz()

# Chunk length in seconds: inputs longer than this are split into pieces of
# `model.sample_rate * SECOND_CHUNKS` samples that are encoded one at a time.
SECOND_CHUNKS = 1

# Compute device used by the helpers below. It stays None until the __main__
# block replaces it with `torch.device(args.device)`.
device = None

def compress_encodec(src_path, tgt_path):
    """Compress an audio file with EnCodec, chunking long inputs.

    The audio is loaded and converted to the model's sample rate and channel
    count. Inputs longer than ``SECOND_CHUNKS`` seconds are split into
    consecutive chunks of that length; every chunk is encoded on its own and
    stored as ``<name>.<start_sample>.ecdc`` inside ``<name>.tar.gz`` next to
    ``tgt_path``. Shorter inputs are encoded in a single pass and written
    gzip-compressed to ``tgt_path`` with its suffix removed.

    Because chunks are encoded independently of each other, the result is not
    guaranteed to match what encoding the whole file at once would produce.

    Args:
        src_path: Path of the audio file to read.
        tgt_path: Output path. Everything from the first "." of the file name
            onwards is dropped before the output name is built.
    """
    wav, sr = torchaudio.load(src_path)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)

    # Archive members are named "<name>.<offset>.ecdc" and the decoder parses
    # the offset by splitting on ".", so the base name must be dot-free.
    tgt_path = Path(tgt_path)
    if "." in tgt_path.name:
        tgt_path = tgt_path.parent / tgt_path.name.split(".")[0]

    # Feed the model on whatever device its weights live on. This honours an
    # explicit device index (e.g. "cuda:1") and also works when the module is
    # imported and the global `device` was never set.
    model_device = next(model.parameters()).device
    chunk_len = model.sample_rate * SECOND_CHUNKS

    if wav.shape[1] > chunk_len:
        # Encode every chunk before touching the output so that a failure
        # half-way through does not leave a truncated archive behind.
        encoded = []
        for start in tqdm(range(0, wav.shape[1], chunk_len)):
            wav_part = wav[..., start:start + chunk_len].to(model_device)
            encoded.append((start, compress(model, wav_part)))

        # Write the payloads straight into the archive from memory; no
        # intermediate files in a shared temp directory are needed.
        with tarfile.open(tgt_path.with_suffix(".tar.gz"), "w:gz") as tar:
            for start, payload in encoded:
                member = tarfile.TarInfo(name=f"{tgt_path.name}.{start}.ecdc")
                member.size = len(payload)
                tar.addfile(member, io.BytesIO(payload))
    else:
        payload = compress(model, wav.to(model_device))
        with gzip.open(tgt_path, "wb") as f:
            f.write(payload)

def decompress_encodec(codes_path, tgt_path):
    """Decode EnCodec output written by ``compress_encodec`` back to audio.

    Two input layouts are handled:

    * a ``.tar.gz`` archive whose members are named
      ``<name>.<start_sample>.ecdc``; the members are decoded in ascending
      ``start_sample`` order and concatenated along the time axis;
    * a single file holding one EnCodec stream, either raw or gzip-compressed
      (the latter is what ``compress_encodec`` writes for short inputs).

    The decoded waveform is saved to ``tgt_path`` at ``model.sample_rate``.

    Args:
        codes_path: Path of the archive or single encoded file.
        tgt_path: Path of the audio file to write.

    Raises:
        ValueError: If the archive contains no regular files.
    """
    # `device` is only set by the __main__ block; fall back to the CPU when
    # this function is called from an importing module.
    target_device = device if device is not None else "cpu"

    if str(codes_path).endswith(".tar.gz"):
        wav_parts = []
        with tarfile.open(codes_path, "r:gz") as tar:
            # Read members directly from the archive instead of extracting
            # them to a shared temp directory: nothing is left behind on disk
            # and member paths can never escape the extraction directory.
            members = [m for m in tar.getmembers() if m.isfile()]
            members.sort(key=lambda m: int(Path(m.name).name.split(".")[1]))
            for member in tqdm(members):
                codes = tar.extractfile(member).read()
                # decompress() returns (waveform, sample_rate).
                part, _ = decompress(codes, target_device)
                # Collect on the CPU so long recordings do not pile up in
                # accelerator memory before concatenation.
                wav_parts.append(part.cpu())
        if not wav_parts:
            raise ValueError(f"no encoded chunks found in {codes_path}")
        wav = torch.cat(wav_parts, dim=1)
    else:
        with open(codes_path, "rb") as f:
            codes = f.read()
        # Short inputs are written through gzip.open(); detect the gzip magic
        # number so both gzip-wrapped and raw streams can be decoded.
        if codes[:2] == b"\x1f\x8b":
            codes = gzip.decompress(codes)
        wav, _ = decompress(codes, target_device)
        wav = wav.cpu()

    torchaudio.save(tgt_path, wav, model.sample_rate)

if __name__ == "__main__":
    # Command-line entry point: compress by default, decode with --decompress.
    cli = ArgumentParser()
    for positional in ("src_path", "tgt_path"):
        cli.add_argument(positional, type=Path)
    cli.add_argument("--bandwidth", type=float, default=6.0)
    cli.add_argument("--decompress", action="store_true")
    cli.add_argument("--device", type=str, default="cuda")
    options = cli.parse_args()

    # Configure the shared module-level model and device that the helper
    # functions read.
    model.set_target_bandwidth(options.bandwidth)
    device = torch.device(options.device)
    model = model.to(device)

    # Pick the requested direction and run it on the given paths.
    run = decompress_encodec if options.decompress else compress_encodec
    run(options.src_path, options.tgt_path)

aviaefrat added the question Further information is requested label Sep 12, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Encoding Long Audio Clips #71

Encoding Long Audio Clips #71

aviaefrat commented Sep 12, 2023

julien-blanchon commented May 12, 2024

foreverhell commented Jul 24, 2024

MiniXC commented Nov 22, 2024 •

edited

Loading

Encoding Long Audio Clips #71

Encoding Long Audio Clips #71

Comments

aviaefrat commented Sep 12, 2023

julien-blanchon commented May 12, 2024

foreverhell commented Jul 24, 2024

MiniXC commented Nov 22, 2024 • edited Loading

MiniXC commented Nov 22, 2024 •

edited

Loading