add a basic trainer and dataset #1

Merged
merged 17 commits on Jul 14, 2024
1 change: 1 addition & 0 deletions e2_tts_pytorch/assets/vocab.json
@@ -0,0 +1 @@
{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s 
u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
27 changes: 27 additions & 0 deletions e2_tts_pytorch/dataset/e2_collate.py
@@ -0,0 +1,27 @@
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Collect mel spectrograms and record their unpadded frame counts.
    mel_specs = [item['mel_spec'].squeeze(0) for item in batch]
    mel_lengths = torch.LongTensor([spec.shape[-1] for spec in mel_specs])

    # Right-pad every spectrogram to the longest one in the batch.
    max_mel_length = mel_lengths.max().item()
    padded_mel_specs = []
    for spec in mel_specs:
        padding = (0, max_mel_length - spec.size(-1))
        padded_spec = torch.nn.functional.pad(spec, padding, mode='constant', value=0)
        padded_mel_specs.append(padded_spec)

    mel_specs = torch.stack(padded_mel_specs)

    # Pad the token sequences to a common length as well.
    text = [item['text'] for item in batch]
    text_lengths = torch.LongTensor([len(item) for item in text])
    text = pad_sequence([torch.LongTensor(item) for item in text], batch_first=True)

    batch_dict = {
        'mel': mel_specs,
        'mel_lengths': mel_lengths,
        'text': text,
        'text_lengths': text_lengths,
    }
    return batch_dict
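As a quick sanity check (not part of the PR), a minimal sketch of what collate_fn produces for a synthetic two-item batch; the shapes and token lists are illustrative only:

import torch
from e2_tts_pytorch.dataset.e2_collate import collate_fn

batch = [
    {'mel_spec': torch.randn(80, 120), 'text': [1, 5, 9]},
    {'mel_spec': torch.randn(80, 200), 'text': [1, 7]},
]
out = collate_fn(batch)
print(out['mel'].shape)    # torch.Size([2, 80, 200])
print(out['mel_lengths'])  # tensor([120, 200])
print(out['text'])         # tensor([[1, 5, 9], [1, 7, 0]]): zero-padded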
70 changes: 70 additions & 0 deletions e2_tts_pytorch/dataset/e2_dataset.py
@@ -0,0 +1,70 @@
import torch
from torch.utils.data import Dataset
import torchaudio
from e2_tts_pytorch.utils.compute_mel import TorchMelSpectrogram
from datasets import load_dataset
import logging
from einops import rearrange

logger = logging.getLogger(__name__)

class E2EDataset(Dataset):
    def __init__(self, hf_dataset):
        self.data = load_dataset(hf_dataset, split='train')
        self.target_sample_rate = 22050
        self.hop_length = 256
        self.mel_spectrogram = TorchMelSpectrogram(sampling_rate=self.target_sample_rate)

        # Character-level vocabulary, plus <unk>/<sos>/<eos> specials.
        self.char_set = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.,!?-:;'\"()[] ")
        self.char_to_id = {char: i for i, char in enumerate(sorted(self.char_set))}
        self.char_to_id['<unk>'] = len(self.char_to_id)  # Unknown token
        self.char_to_id['<sos>'] = len(self.char_to_id)  # Start of sequence token
        self.char_to_id['<eos>'] = len(self.char_to_id)  # End of sequence token
        self.id_to_char = {i: char for char, i in self.char_to_id.items()}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data[index]
        audio = row['audio']['array']
        logger.info(f"Audio shape: {audio.shape}")
        sample_rate = row['audio']['sampling_rate']
        duration = audio.shape[-1] / sample_rate

        # Skip clips outside the 0.3 s to 20 s range and fall through to the next sample.
        if duration > 20 or duration < 0.3:
            logger.warning(f"Skipping due to duration out of bound: {duration}")
            return self.__getitem__((index + 1) % len(self.data))

        audio_tensor = torch.from_numpy(audio).float()

        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            audio_tensor = resampler(audio_tensor)

        audio_tensor = rearrange(audio_tensor, 't -> 1 t')

        mel_spec = self.mel_spectrogram(audio_tensor)

        mel_spec = rearrange(mel_spec, '1 d t -> d t')

        # Spaces are rewritten to the literal string "[SPACE]", which is then
        # encoded character by character (all of its characters are in char_set).
        text = row['transcript']
        text = text.replace(" ", "[SPACE]")
        text_tokens = self.encode(text)

        return {
            'mel_spec': mel_spec,
            'text': text_tokens,
        }

    def encode(self, text):
        tokens = [self.char_to_id['<sos>']]
        for char in text:
            if char in self.char_to_id:
                tokens.append(self.char_to_id[char])
            else:
                tokens.append(self.char_to_id['<unk>'])
        tokens.append(self.char_to_id['<eos>'])
        return torch.tensor(tokens, dtype=torch.long)

    def decode(self, token_ids):
        return ''.join([self.id_to_char[id.item()] for id in token_ids])
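Because encoding is character-level, the "[SPACE]" marker round-trips literally through encode/decode. A minimal sketch of that round trip (not in the PR; note that constructing the class downloads the GLOBE dataset on first use):

from e2_tts_pytorch.dataset.e2_dataset import E2EDataset

ds = E2EDataset("MushanW/GLOBE")  # downloads the dataset on first use
tokens = ds.encode("a b".replace(" ", "[SPACE]"))
print(ds.decode(tokens))          # <sos>a[SPACE]b<eos>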
101 changes: 101 additions & 0 deletions e2_tts_pytorch/trainer/e2_trainer.py
@@ -0,0 +1,101 @@
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.nn import functional as F
from accelerate import Accelerator
from e2_tts_pytorch.dataset.e2_collate import collate_fn
import os
import logging
from e2_tts_pytorch.utils.compute_mel import TorchMelSpectrogram
from einops import rearrange
from torch.utils.tensorboard import SummaryWriter

class E2Trainer:
    def __init__(self, model, optimizer, duration_predictor=None,
                 checkpoint_path=None, log_file="logs.txt",
                 max_grad_norm=1.0,
                 sample_rate=22050,
                 tensorboard_log_dir='runs/e2_tts_experiment'):
        self.target_sample_rate = sample_rate
        self.accelerator = Accelerator(log_with="all")
        self.model = model
        self.duration_predictor = duration_predictor
        self.optimizer = optimizer
        self.checkpoint_path = checkpoint_path
        self.mel_spectrogram = TorchMelSpectrogram(sampling_rate=self.target_sample_rate)
        self.model, self.optimizer = self.accelerator.prepare(
            self.model, self.optimizer
        )
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        handler = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.max_grad_norm = max_grad_norm

        self.writer = SummaryWriter(log_dir=tensorboard_log_dir)

    def save_checkpoint(self, step, finetune=False):
        if self.checkpoint_path is None:
            self.checkpoint_path = "model.pth"
        checkpoint = {
            'model_state_dict': self.accelerator.unwrap_model(self.model).state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'step': step
        }
        torch.save(checkpoint, self.checkpoint_path)

    def load_checkpoint(self):
        if self.checkpoint_path is not None and os.path.exists(self.checkpoint_path):
            # Load to CPU first so checkpoints restore regardless of the saving device.
            checkpoint = torch.load(self.checkpoint_path, map_location='cpu')
            self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            return checkpoint['step']
        return 0

    def train(self, train_dataset, epochs, batch_size, grad_accumulation_steps=1, num_workers=12, save_step=1000):
        # Note: grad_accumulation_steps is accepted but not yet applied in this
        # version of the loop; see the sketch after this class.
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True, num_workers=num_workers, pin_memory=True)
        train_dataloader = self.accelerator.prepare(train_dataloader)
        start_step = self.load_checkpoint()
        global_step = start_step
        for epoch in range(epochs):
            self.model.train()
            progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}", unit="step", disable=not self.accelerator.is_local_main_process)
            epoch_loss = 0.0
            for batch in progress_bar:
                text_inputs = batch['text']
                text_lengths = batch['text_lengths']
                mel_spec = rearrange(batch['mel'], 'b d n -> b n d')
                mel_lengths = batch["mel_lengths"]

                if self.duration_predictor is not None:
                    # Logged only; the duration loss is not yet added to the training objective.
                    dur_loss = self.duration_predictor(mel_spec, target_duration=batch.get('durations'))
                    self.writer.add_scalar('Duration Loss', dur_loss.item(), global_step)

                loss = self.model(mel_spec, text=text_inputs, lens=mel_lengths)
                self.accelerator.backward(loss)

                if self.max_grad_norm > 0:
                    self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                self.optimizer.step()
                self.optimizer.zero_grad()

                if self.accelerator.is_local_main_process:
                    self.logger.info(f"Step {global_step+1}: E2E Loss = {loss.item():.4f}")
                    self.writer.add_scalar('E2E Loss', loss.item(), global_step)

                global_step += 1
                epoch_loss += loss.item()
                progress_bar.set_postfix(loss=loss.item())

                if global_step % save_step == 0:
                    self.save_checkpoint(global_step)

            epoch_loss /= len(train_dataloader)
            if self.accelerator.is_local_main_process:
                self.logger.info(f"Epoch {epoch+1}/{epochs} - Average E2E Loss = {epoch_loss:.4f}")
                self.writer.add_scalar('Epoch Average Loss', epoch_loss, epoch)

        self.writer.close()
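Since grad_accumulation_steps is not yet consumed by the loop above, here is one way it could be wired in; a minimal sketch of the inner loop using Accelerate's accumulate() helper, not what this PR implements:

# Sketch only: assumes the trainer builds its Accelerator as
#   Accelerator(log_with="all", gradient_accumulation_steps=grad_accumulation_steps)
for batch in progress_bar:
    with self.accelerator.accumulate(self.model):
        mel_spec = rearrange(batch['mel'], 'b d n -> b n d')
        loss = self.model(mel_spec, text=batch['text'], lens=batch['mel_lengths'])
        self.accelerator.backward(loss)
        # Clip only on iterations where gradients are actually synchronized.
        if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
            self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        # The prepared optimizer skips the real step during accumulation iterations.
        self.optimizer.step()
        self.optimizer.zero_grad()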
43 changes: 43 additions & 0 deletions e2_tts_pytorch/utils/compute_mel.py
@@ -0,0 +1,43 @@
import torch
from torch import nn
import torchaudio
class TorchMelSpectrogram(nn.Module):
    def __init__(
        self,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=80,
        mel_fmin=0,
        mel_fmax=8000,
        sampling_rate=22050,
        normalize=False,
    ):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_mel_channels = n_mel_channels
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.sampling_rate = sampling_rate
        self.mel_stft = torchaudio.transforms.MelSpectrogram(
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            power=2,
            normalized=normalize,
            sample_rate=self.sampling_rate,
            f_min=self.mel_fmin,
            f_max=self.mel_fmax,
            n_mels=self.n_mel_channels,
            norm="slaney",
        )

    def forward(self, inp):
        # Accept (b, 1, t) or (b, t) waveforms.
        if len(inp.shape) == 3:
            inp = inp.squeeze(1)
        assert len(inp.shape) == 2
        self.mel_stft = self.mel_stft.to(inp.device)
        mel = self.mel_stft(inp)
        # Log-compress with a floor to avoid log(0).
        mel = torch.log(torch.clamp(mel, min=1e-5))
        return mel
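For reference (not part of the PR), a minimal sketch of the expected output shape under the default 22050 Hz / hop 256 settings, using one second of a synthetic 440 Hz tone:

import torch
from e2_tts_pytorch.utils.compute_mel import TorchMelSpectrogram

mel_fn = TorchMelSpectrogram(sampling_rate=22050)
wav = torch.sin(2 * torch.pi * 440.0 * torch.arange(22050) / 22050)  # 1 s of 440 Hz
mel = mel_fn(wav.unsqueeze(0))  # add a batch dimension: (1, 22050)
print(mel.shape)  # torch.Size([1, 80, 87]): 80 mel bins, T // hop_length + 1 frames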
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -29,7 +29,11 @@ dependencies = [
    'jaxtyping',
    'torch>=2.0',
    'torchdiffeq',
    'x-transformers>=1.31.12',
    'accelerate>=0.32.1',
    'datasets>=2.20.0',
    'tqdm>=4.65.0',
    'torchaudio>=2.3.1'
]

[project.urls]
44 changes: 44 additions & 0 deletions train_e2.py
@@ -0,0 +1,44 @@
import torch
from torch.optim import Adam
from e2_tts_pytorch.dataset.e2_dataset import E2EDataset
from e2_tts_pytorch.e2_tts import E2TTS, DurationPredictor
from e2_tts_pytorch.trainer.e2_trainer import E2Trainer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset = E2EDataset("MushanW/GLOBE")

duration_predictor = DurationPredictor(
    transformer = dict(
        dim = 512,
        depth = 2,
    )
).to(device)

e2tts = E2TTS(
    duration_predictor = duration_predictor,
    transformer = dict(
        dim = 80,
        depth = 4,
        skip_connect_type = 'concat'
    ),
).to(device)


optimizer = Adam(e2tts.parameters(), lr=1e-4)

checkpoint_path = 'e2e.pt'
log_file = 'e2e.txt'

trainer = E2Trainer(
    e2tts,
    optimizer,
    checkpoint_path=checkpoint_path,
    log_file=log_file
)

epochs = 10
batch_size = 8
grad_accumulation_steps = 1

trainer.train(train_dataset, epochs, batch_size, grad_accumulation_steps, save_step=1000)
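Since E2Trainer builds its own Accelerator, this script should also work for multi-GPU runs when started with the Accelerate CLI (for example, accelerate launch train_e2.py); a plain python train_e2.py runs it single-process as written.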