Lengyue/tts gpt decoder diff #121

Merged 39 commits on Oct 11, 2023.

Commits
6456be7
Add encoder and feature extractor for GradTTS
leng-yue Sep 18, 2023
070151a
Update datasets for GradTTS
leng-yue Sep 18, 2023
3883030
Added GradTTS
leng-yue Sep 18, 2023
bd59fd0
Dataset should be larger
leng-yue Sep 18, 2023
5ec16ee
Fix typo
leng-yue Sep 18, 2023
1ded66d
Optimize bert tokenizer
leng-yue Sep 18, 2023
520ca70
Fix typo
leng-yue Sep 18, 2023
25d19e9
Support checking for existing npy files
leng-yue Sep 18, 2023
492b0c7
Update some utils
leng-yue Sep 21, 2023
7cfa480
Add sample dataset utils & optimize tool chain
leng-yue Sep 21, 2023
844760d
Add cross attn to convnext
leng-yue Sep 21, 2023
a963f6d
Update dataset utils
leng-yue Sep 21, 2023
1f29c69
Finished implementation
leng-yue Sep 21, 2023
729891f
Fix some alignment bugs
leng-yue Sep 21, 2023
37f95e0
Optimize DDP training
leng-yue Sep 21, 2023
b79c02f
Tune bert params & scaled pe
leng-yue Sep 22, 2023
6aff8c6
Double BS for stable training
leng-yue Sep 22, 2023
bcdfd58
Set lr to 2e-5 for bert
leng-yue Sep 22, 2023
398f6c0
Bump deps & fix ci
leng-yue Sep 22, 2023
3915984
Always build docs
leng-yue Sep 22, 2023
2a38283
Update page ci
leng-yue Sep 22, 2023
5164d02
Update CI
leng-yue Sep 22, 2023
c164418
Add pure transformer decoder
leng-yue Sep 22, 2023
4347d6a
Integrate new transformer denoiser
leng-yue Sep 22, 2023
82b3fe5
Better Log & Optimizer
leng-yue Sep 25, 2023
d203276
Optimize export
leng-yue Sep 24, 2023
54fd175
add diffusers
leng-yue Sep 25, 2023
90ebe9e
Update lockfile
leng-yue Sep 25, 2023
264da15
Add gpt decoder only diff
leng-yue Sep 27, 2023
ca7a9a3
Add encodec decoder
leng-yue Sep 28, 2023
af556cd
Add GPT finetune utils
leng-yue Sep 28, 2023
77b7524
remove nccl control
leng-yue Sep 28, 2023
4848267
Update gitignore
leng-yue Sep 28, 2023
f46c967
fix license in ci
leng-yue Sep 28, 2023
30ece92
fix LICENSE
leng-yue Sep 28, 2023
5185408
Support sampling without pitches
leng-yue Sep 28, 2023
98b1749
Update batch processing tools
leng-yue Oct 9, 2023
ea0c93f
Add to flac converter
leng-yue Oct 9, 2023
aa15c67
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2023
3 changes: 3 additions & 0 deletions .dockerignore
@@ -0,0 +1,3 @@
/results
/checkpoints
/dataset
12 changes: 1 addition & 11 deletions .github/workflows/ci.yml
@@ -14,20 +14,10 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: "3.10"
-      - name: Install Dependencies
-        run: |
-          pip3 install --upgrade pip && \
-          pip3 install pdm && \
-          pdm sync
-      - name: Lint with black and isort
-        run: |
-          pdm run black . --check
-          pdm run isort . --check
+      - uses: pre-commit/action@v3.0.0

   docs:
     runs-on: ubuntu-latest
-    # Only run this job if the lint job is successful and the current branch is main
     needs: lint
-    if: ${{ github.ref == 'refs/heads/main' }}
     steps:
       - uses: actions/checkout@v3
1 change: 1 addition & 0 deletions .gitignore
@@ -155,3 +155,4 @@ exp_*.sh
 exported
 pitches_editor
 .pdm-python
+.pgx.*
7 changes: 5 additions & 2 deletions configs/_base_/trainers/base.py
@@ -13,9 +13,10 @@
     log_every_n_steps=10,
     val_check_interval=5000,
     check_val_every_n_epoch=None,
-    max_steps=1_000_000,
+    max_steps=2_000_000,
     # Warning: If you are training the model with fs2 (and see nan), you should either use bf16 or fp32
-    precision="16-mixed",
+    precision="bf16-mixed",
+    accumulate_grad_batches=1,
     callbacks=[
         ModelCheckpoint(
             filename="{epoch}-{step}-{valid_loss:.4f}",
@@ -34,5 +35,7 @@
 trainer["strategy"] = DDPStrategy(
     process_group_backend=process_group_backend,
     gradient_as_bucket_view=True,
+    find_unused_parameters=True,
+    static_graph=True,
     ddp_comm_hook=default.fp16_compress_hook,
 )
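
Note: the DDP options above use standard Lightning / PyTorch APIs. A minimal standalone sketch (not code from this PR; the import paths are the stock ones, and find_unused_parameters / static_graph are forwarded to torch.nn.parallel.DistributedDataParallel via **kwargs):

from pytorch_lightning.strategies import DDPStrategy
from torch.distributed.algorithms.ddp_comm_hooks import default

strategy = DDPStrategy(
    process_group_backend="nccl",  # assumption: NCCL backend for multi-GPU training
    gradient_as_bucket_view=True,  # gradients become views into buckets, saving a copy
    find_unused_parameters=True,   # tolerate parameters that receive no gradient
    static_graph=True,             # training graph is identical across iterations
    ddp_comm_hook=default.fp16_compress_hook,  # all-reduce gradients compressed to fp16
)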
53 changes: 53 additions & 0 deletions configs/encodec_decoder.py
@@ -0,0 +1,53 @@
_base_ = [
"./_base_/archs/diff_svc_v2.py",
"./_base_/trainers/base.py",
"./_base_/schedulers/warmup_cosine.py",
"./_base_/datasets/audio_folder.py",
]

speaker_mapping = {
"default": 0,
}

dataset = dict(
train=dict(
type="NaiveDenoiserDataset",
path="dataset/tts",
speaker_id=0,
),
valid=dict(
type="NaiveDenoiserDataset",
path="dataset/tts/valid",
speaker_id=0,
),
)

model = dict(
text_encoder=dict(
type="NaiveProjectionEncoder",
input_size=128,
output_size=256,
),
speaker_encoder=dict(
_delete_=True,
),
pitch_encoder=dict(
_delete_=True,
),
vocoder=dict(
_delete_=True,
type="ADaMoSHiFiGANV1",
use_natural_log=False,
checkpoint_path="checkpoints/adamos/convnext_hifigan_more_supervised_001560000.ckpt",
),
)

preprocessing = dict(
text_features_extractor=dict(
type="Encodec",
model="facebook/encodec_24khz",
first_codebook_only=True,
),
pitch_extractor=None,
augmentations=[],
)
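
Note: the "Encodec" extractor above names facebook/encodec_24khz with first_codebook_only=True. A hedged sketch of what such an extractor can look like with Hugging Face transformers (an illustration of the idea, not this repo's implementation):

import torch
from transformers import AutoProcessor, EncodecModel

processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")
model = EncodecModel.from_pretrained("facebook/encodec_24khz")

def extract_first_codebook(waveform: torch.Tensor) -> torch.Tensor:
    """Return first-codebook token ids for a mono 24 kHz waveform."""
    inputs = processor(raw_audio=waveform.numpy(), sampling_rate=24_000, return_tensors="pt")
    with torch.no_grad():
        encoded = model.encode(inputs["input_values"], inputs["padding_mask"])
    # audio_codes: (chunks, batch, num_quantizers, frames); keep codebook 0 only
    return encoded.audio_codes[0, :, 0, :]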
174 changes: 174 additions & 0 deletions configs/tts_baseline.py
@@ -0,0 +1,174 @@
# Warning: This config is under active development and subject to change.

from pathlib import Path

from fish_diffusion.datasets.naive import NaiveTTSDataset
from fish_diffusion.schedulers.warmup_cosine_scheduler import (
LambdaWarmUpCosineScheduler,
)

_base_ = [
"./_base_/trainers/base.py",
"./_base_/schedulers/warmup_cosine.py",
"./_base_/datasets/naive_svc.py",
]

speakers = []

# Process SVC mixin datasets
mixin_datasets = [
("LibriTTS-100", "dataset/LibriTTS/train-clean-100"),
("LibriTTS-360", "dataset/LibriTTS/train-clean-360"),
("LibriTTS-500", "dataset/LibriTTS/train-other-500"),
]
train_datasets = []

for name, path in mixin_datasets:
for speaker_path in sorted(Path(path).iterdir()):
if not any(speaker_path.rglob("*.npy")):
continue

speaker_name = f"{name}-{speaker_path.name}"
if speaker_name not in speakers:
speakers.append(speaker_name)

train_datasets.append(
dict(
type="NaiveTTSDataset",
path=str(speaker_path),
speaker_id=speaker_name,
)
)

# Sort speakers
speakers.sort()
speaker_mapping = {speaker: i for i, speaker in enumerate(speakers)}

for dataset in train_datasets:
dataset["speaker_id"] = speaker_mapping[dataset["speaker_id"]]

# Config model
sampling_rate = 44100
mel_channels = 128
# bert_dim = 768
gradient_checkpointing = True

model = dict(
type="GradTTS",
gradient_checkpointing=gradient_checkpointing,
diffusion=dict(
type="GaussianDiffusion",
mel_channels=mel_channels,
noise_schedule="linear",
timesteps=1000,
max_beta=0.01,
s=0.008,
noise_loss="l1",
denoiser=dict(
type="LlamaDenoiser",
bos_token_id=1,
eos_token_id=2,
hidden_act="silu",
hidden_size=768,
initializer_range=0.02,
intermediate_size=768 * 4,
max_position_embeddings=4096,
model_type="llama",
num_attention_heads=16,
num_hidden_layers=24,
num_key_value_heads=16,
rms_norm_eps=1e-05,
rope_scaling=None,
tie_word_embeddings=False,
vocab_size=32000,
),
sampler_interval=10,
spec_min=[-5],
spec_max=[0],
),
# speaker_encoder=dict(
# type="NaiveProjectionEncoder",
# input_size=10000, # len(speaker_mapping),
# output_size=bert_dim,
# use_embedding=True,
# ),
# text_encoder=dict(
# type="BertEncoder",
# model_name="bert-base-cased",
# pretrained=True,
# ),
# duration_predictor=dict(
# type="NaiveProjectionEncoder",
# input_size=bert_dim,
# output_size=1,
# ),
vocoder=dict(
type="ADaMoSHiFiGANV1",
use_natural_log=False,
checkpoint_path="checkpoints/adamos/convnext_hifigan_more_supervised_001560000.ckpt",
),
)

dataset = dict(
_delete_=True,
train=dict(
type="ConcatDataset",
datasets=train_datasets,
collate_fn=NaiveTTSDataset.collate_fn,
),
valid=dict(
type="SampleDataset",
num_samples=8,
dataset=dict(
type="ConcatDataset",
datasets=train_datasets,
collate_fn=NaiveTTSDataset.collate_fn,
),
collate_fn=NaiveTTSDataset.collate_fn,
),
)

dataloader = dict(
train=dict(
batch_size=4,
),
valid=dict(
batch_size=8,
),
)

trainer = dict(
accumulate_grad_batches=4,
# strategy="ddp"
)

preprocessing = dict(
text_features_extractor=dict(
type="LlamaTokenizer",
model_name="meta-llama/Llama-2-7b-hf",
label_suffix=".normalized.txt",
),
)

lambda_func = LambdaWarmUpCosineScheduler(
warm_up_steps=10000,
val_final=1e-5,
val_base=1e-4,
val_start=0,
max_decay_steps=300000,
)

optimizer = dict(
_delete_=True,
type="AdamW",
lr=1.0,
weight_decay=1e-2,
betas=(0.9, 0.999),
eps=1e-6,
)

scheduler = dict(
_delete_=True,
type="LambdaLR",
lr_lambda=lambda_func,
)
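
Note: optimizer lr=1.0 here is deliberate. LambdaLR computes lr = base_lr * lr_lambda(step), so with base_lr = 1.0 the value returned by lambda_func is the learning rate itself: warm up from val_start=0 to val_base=1e-4 over 10,000 steps, then cosine-decay toward val_final=1e-5. A minimal sketch of the equivalent plain-PyTorch setup (assuming LambdaWarmUpCosineScheduler is callable on the global step, as its use as lr_lambda implies):

import torch
from torch.optim.lr_scheduler import LambdaLR

params = [torch.nn.Parameter(torch.zeros(1))]  # placeholder parameters
optimizer = torch.optim.AdamW(
    params, lr=1.0, weight_decay=1e-2, betas=(0.9, 0.999), eps=1e-6
)
scheduler = LambdaLR(optimizer, lr_lambda=lambda_func)  # lambda_func from the config above

for step in range(3):
    optimizer.step()
    scheduler.step()  # lr is now lambda_func's value at the current step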
3 changes: 2 additions & 1 deletion fish_diffusion/archs/diffsinger/__init__.py
@@ -1,3 +1,4 @@
 from .diffsinger import DiffSinger, DiffSingerLightning
+from .grad_tts import GradTTS

-__all__ = ["DiffSingerLightning", "DiffSinger"]
+__all__ = ["DiffSingerLightning", "DiffSinger", "GradTTS"]