Skip to content

Commit

Permalink
【PPMix No.35】add diffsinger (#860)
Browse files Browse the repository at this point in the history
Co-authored-by: luyao-cv <1367355728@qq.com>
  • Loading branch information
swagger-coder and luyao-cv authored Dec 5, 2024
1 parent de8f291 commit f170c93
Show file tree
Hide file tree
Showing 82 changed files with 11,386 additions and 0 deletions.
135 changes: 135 additions & 0 deletions paddlemix/config/diffsinger/acoustic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Config(s) this file builds on — presumably loaded first and then overridden
# by the keys below; confirm against the config loader.
base_config:
  - configs/base.yaml

# Task class and speaker setup: one speaker entry, no explicit speaker ids.
task_cls: training.acoustic_task.AcousticTask
num_spk: 1
speakers:
  - opencpop
spk_ids: []
# Prefixes selecting the test items — presumably matched against recording
# names; confirm against the binarizer.
test_prefixes:
  - '2044'
  - '2086'
  - '2092'
  - '2093'
  - '2100'

# Vocoder selection and audio / mel-spectrogram analysis parameters.
vocoder: NsfHifiGAN
# Kept relative, like the other checkpoint paths in these configs, so the file
# works on any machine; place the vocoder checkpoint under this directory.
vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02_pd/model.ckpt
audio_sample_rate: 44100
audio_num_mel_bins: 128
hop_size: 512  # Hop size.
fft_size: 2048  # FFT size.
win_size: 2048  # Window size.
fmin: 40
fmax: 16000

# Binarization options and data-augmentation switches.
# Each augmentation has its own nested mapping; the nesting is required so the
# repeated `enabled` / `range` / `scale` keys do not collide at top level.
# All three augmentations are disabled in this config.
binarization_args:
  shuffle: true
  num_workers: 0
augmentation_args:
  random_pitch_shifting:
    enabled: false
    range: [-5., 5.]
    scale: 0.75
  fixed_pitch_shifting:
    enabled: false
    targets: [-5., 5.]
    scale: 0.5
  random_time_stretching:
    enabled: false
    range: [0.5, 2.]
    scale: 0.75

# Data locations, binarizer class and dictionary file for this dataset.
raw_data_dir: 'data/opencpop/raw'
binary_data_dir: 'data/opencpop/binary'
binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer
dictionary: dictionaries/opencpop-extension.txt
# Mel value bounds. NOTE(review): spec_min/spec_max are one-element lists while
# mel_vmin/mel_vmax are scalars — presumably normalization vs. plotting ranges;
# confirm against the consumer.
spec_min: [-12]
spec_max: [0]
mel_vmin: -14.
mel_vmax: 4.
# NOTE(review): 'e' presumably selects the natural-log base for mel values —
# confirm.
mel_base: 'e'
# Smoothing widths for the four variance curves; all share one value here.
# Units are not stated in this file — presumably seconds; TODO confirm.
energy_smooth_width: 0.12
breathiness_smooth_width: 0.12
voicing_smooth_width: 0.12
tension_smooth_width: 0.12

# Optional embedding switches (speaker id, variance parameters, key shift,
# speed). Every one of them is off in this config.
use_spk_id: false
use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
use_key_shift_embed: false
use_speed_embed: false

# Diffusion model, sampler and backbone settings.
diffusion_type: reflow
time_scale_factor: 1000
timesteps: 1000
max_beta: 0.02
rel_pos: true
sampling_algorithm: euler
sampling_steps: 20
diff_accelerator: ddim
diff_speedup: 10
hidden_size: 256
backbone_type: 'lynxnet'
# Backbone hyperparameters must stay nested: the same key names are reused by
# the auxiliary decoder's own argument mapping elsewhere in this file.
backbone_args:
  num_channels: 1024
  num_layers: 6
  kernel_size: 31
  dropout_rate: 0.0
main_loss_type: l2
main_loss_log_norm: false
schedule_type: 'linear'

# shallow diffusion
use_shallow_diffusion: true
T_start: 0.4
T_start_infer: 0.4
K_step: 400
K_step_infer: 400

# Options for the auxiliary decoder used with shallow diffusion. The decoder's
# own hyperparameters live one level deeper, under aux_decoder_args.
shallow_diffusion_args:
  train_aux_decoder: true
  train_diffusion: true
  val_gt_start: false
  aux_decoder_arch: convnext
  aux_decoder_args:
    num_channels: 512
    num_layers: 6
    kernel_size: 7
    dropout_rate: 0.1
  aux_decoder_grad: 0.1

lambda_aux_mel_loss: 0.2

# train and eval
num_sanity_val_steps: 1
# Only the listed optimizer / scheduler fields are set here; they must be
# nested so the parent keys are mappings rather than null.
optimizer_args:
  lr: 0.0006
lr_scheduler_args:
  step_size: 10000
  gamma: 0.75
max_batch_frames: 50000
max_batch_size: 64
dataset_size_key: 'lengths'
val_with_vocoder: true
val_check_interval: 2000
num_valid_plots: 10
max_updates: 160000
num_ckpt_keep: 5
permanent_ckpt_start: 80000
permanent_ckpt_interval: 20000

# Fine-tuning: switched off, no checkpoint path given.
finetune_enabled: false
finetune_ckpt_path: null

# Parameter names listed for exclusion when fine-tuning — presumably prefixes
# matched against checkpoint keys; confirm against the checkpoint loader.
finetune_ignored_params:
  - model.fs2.encoder.embed_tokens
  - model.fs2.txt_embed
  - model.fs2.spk_embed
finetune_strict_shapes: true

# Parameter freezing: switched off, nothing listed.
freezing_enabled: false
frozen_params: []
94 changes: 94 additions & 0 deletions paddlemix/config/diffsinger/base.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# task
# Left null here; the acoustic config in this directory sets a concrete class.
task_cls: null

#############
# dataset
#############
# Paths and the binarizer class are null placeholders in this base file.
sort_by_len: true
raw_data_dir: null
binary_data_dir: null
binarizer_cls: null
binarization_args:
  shuffle: false
  num_workers: 0

audio_sample_rate: 44100
hop_size: 512
win_size: 2048
fft_size: 2048  # Extra window size is filled with 0 paddings to match this parameter
sampler_frame_count_grid: 6
ds_workers: 4
dataloader_prefetch_factor: 2

#########
# model
#########
# Shared model hyperparameters. NOTE(review): enc_layers, num_heads and
# enc_ffn_kernel_size presumably configure the encoder — confirm against the
# model code.
hidden_size: 256
dropout: 0.1
use_pos_embed: true
enc_layers: 4
num_heads: 2
enc_ffn_kernel_size: 9
ffn_act: gelu
use_spk_id: false

###########
# optimization
###########
# Optimizer and LR-scheduler settings; each parent key is a mapping holding
# the class path and its arguments.
# NOTE(review): the class paths name torch.optim although this config lives in
# a Paddle project — confirm the loader maps them to the intended classes.
optimizer_args:
  optimizer_cls: torch.optim.AdamW
  lr: 0.0004
  beta1: 0.9
  beta2: 0.98
  weight_decay: 0
lr_scheduler_args:
  scheduler_cls: torch.optim.lr_scheduler.StepLR
  step_size: 50000
  gamma: 0.5
clip_grad_norm: 1

###########
# train and eval
###########
# Checkpointing, logging, validation cadence and batch-size limits.
num_ckpt_keep: 5
accumulate_grad_batches: 1
log_interval: 100
num_sanity_val_steps: 1 # steps of validation at the beginning
val_check_interval: 2000
max_updates: 120000
max_batch_frames: 32000
max_batch_size: 100000
max_val_batch_frames: 60000
max_val_batch_size: 1
# NOTE(review): `pe` presumably selects the pitch extractor and `hnsep` the
# harmonic-noise separator, each with its checkpoint path — confirm.
pe: rmvpe # alternative: parselmouth
pe_ckpt: 'checkpoints/rmvpe/model.pt'
hnsep: vr
hnsep_ckpt: 'checkpoints/vr/model.pt'
f0_min: 65
f0_max: 1100
num_valid_plots: 10

###########
# pytorch lightning
# Read https://lightning.ai/docs/pytorch/stable/common/trainer.html#trainer-class-api for possible values
###########
pl_trainer_accelerator: 'auto'
# 'auto' keeps this base config machine-independent. To pin specific GPUs,
# override with an index list (e.g. [4, 5, 6, 7]) in a local config instead of
# hard-coding it here.
pl_trainer_devices: 'auto'
pl_trainer_precision: '16-mixed'
pl_trainer_num_nodes: 1
pl_trainer_strategy:
  name: auto
  process_group_backend: nccl
  find_unused_parameters: false
# NOTE(review): nccl_p2p is restored as a top-level key rather than a strategy
# option — confirm against the code that reads it.
nccl_p2p: true

###########
# finetune
###########
# Base defaults: fine-tuning is off, with no checkpoint and no ignored params.
finetune_enabled: false
finetune_ckpt_path: null
finetune_ignored_params: []
finetune_strict_shapes: true

# Base defaults: parameter freezing is off and the frozen list is empty.
freezing_enabled: false
frozen_params: []
102 changes: 102 additions & 0 deletions paddlemix/config/diffsinger/templates/config_acoustic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Template for a user acoustic config: replace the placeholder paths, speaker
# names and test prefixes below with your own.
base_config: configs/acoustic.yaml

raw_data_dir:
  - data/xxx1/raw
  - data/xxx2/raw
speakers:
  - speaker1
  - speaker2
spk_ids: []
test_prefixes:
  - wav1
  - wav2
  - wav3
  - wav4
  - wav5
dictionary: dictionaries/opencpop-extension.txt
binary_data_dir: data/xxx/binary
binarization_args:
  num_workers: 0
pe: parselmouth
pe_ckpt: 'checkpoints/rmvpe/model.pt'
hnsep: vr
hnsep_ckpt: 'checkpoints/vr/model.pt'
vocoder: NsfHifiGAN
vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt

# NOTE(review): this template lists two speakers but leaves use_spk_id off
# with num_spk: 1 — presumably both must be changed for a multi-speaker
# dataset; confirm.
use_spk_id: false
num_spk: 1

# NOTICE: before enabling variance embeddings, please read the docs at
# https://github.com/openvpi/DiffSinger/tree/main/docs/BestPractices.md#choosing-variance-parameters
use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false

# Key-shift and speed embeddings are on in this template, alongside the random
# pitch-shifting and time-stretching augmentations it also enables.
use_key_shift_embed: true
use_speed_embed: true

# Data augmentation: random pitch shifting and random time stretching are
# enabled in this template; fixed pitch shifting is not. Each augmentation
# keeps its options in its own nested mapping.
augmentation_args:
  random_pitch_shifting:
    enabled: true
    range: [-5., 5.]
    scale: 0.75
  fixed_pitch_shifting:
    enabled: false
    targets: [-5., 5.]
    scale: 0.5
  random_time_stretching:
    enabled: true
    range: [0.5, 2.]
    scale: 0.75

# diffusion and shallow diffusion
diffusion_type: reflow
use_shallow_diffusion: true
T_start: 0.4
T_start_infer: 0.4
K_step: 300
K_step_infer: 300
backbone_type: 'lynxnet'
backbone_args:
  num_channels: 1024
  num_layers: 6
  kernel_size: 31
  dropout_rate: 0.0
# Alternative backbone: to use it, uncomment the block below and comment out
# the lynxnet backbone_type / backbone_args above.
#backbone_type: 'wavenet'
#backbone_args:
#  num_channels: 512
#  num_layers: 20
#  dilation_cycle_length: 4
# Auxiliary decoder options; its hyperparameters are nested one level deeper.
shallow_diffusion_args:
  train_aux_decoder: true
  train_diffusion: true
  val_gt_start: false
  aux_decoder_arch: convnext
  aux_decoder_args:
    num_channels: 512
    num_layers: 6
    kernel_size: 7
    dropout_rate: 0.1
  aux_decoder_grad: 0.1
lambda_aux_mel_loss: 0.2

# Optimizer, LR schedule and batch limits. The optimizer and scheduler fields
# are nested so their parent keys are mappings rather than null.
optimizer_args:
  lr: 0.0006
lr_scheduler_args:
  scheduler_cls: torch.optim.lr_scheduler.StepLR
  step_size: 10000
  gamma: 0.75
max_batch_frames: 50000
max_batch_size: 64
max_updates: 160000

# Validation, checkpoint retention and trainer device / precision settings.
num_valid_plots: 10
val_with_vocoder: true
val_check_interval: 2000
num_ckpt_keep: 5
permanent_ckpt_start: 120000
permanent_ckpt_interval: 20000
pl_trainer_devices: 'auto'
pl_trainer_precision: '16-mixed'
Loading

0 comments on commit f170c93

Please sign in to comment.