diff --git a/paddlemix/examples/audioldm2/README.md b/paddlemix/examples/audioldm2/README.md
new file mode 100644
index 000000000..428b11e4f
--- /dev/null
+++ b/paddlemix/examples/audioldm2/README.md
@@ -0,0 +1,27 @@
+# AudioLDM2
+
+## 1. Model Introduction
+
+This model is a Paddle implementation of [AudioLDM2](https://arxiv.org/abs/2308.05734).
+
+
+## 2. Demo
+
+### 2.1 Install Dependencies
+
+- Make sure ppdiffusers is installed ([installation guide](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/README.md?plain=1#L62))
+
+- Install the remaining dependencies:
+
+```bash
+cd /paddlemix/models/audioldm2
+pip install -r requirement.txt
+```
+
+### 2.2 Dynamic Graph Inference
+```bash
+python run_predict.py \
+--text "Musical constellations twinkling in the night sky, forming a cosmic melody." \
+--model_name_or_path "/my_model_path" \
+--seed 1001
+```
diff --git a/paddlemix/examples/audioldm2/run_predict.py b/paddlemix/examples/audioldm2/run_predict.py
new file mode 100644
index 000000000..5319b0a5c
--- /dev/null
+++ b/paddlemix/examples/audioldm2/run_predict.py
@@ -0,0 +1,302 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+import paddle
+from paddlenlp.trainer import PdArgumentParser
+import os
+import time
+import soundfile as sf
+from paddlemix.models.audioldm2.modeling import AudioLDM2Model
+from paddlemix.models.audioldm2.encoders.phoneme_encoder import text as text
+import random
+import numpy as np
+import re
+
+def seed_everything(seed):
+    os.environ["PYTHONHASHSEED"] = str(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    paddle.seed(seed)
+
+def text2phoneme(data):
+    return text._clean_text(re.sub(r'<.*?>', '', data), ["english_cleaners2"])
+
+def text_to_filename(text):
+    return text.replace(" ", "_").replace("'", "_").replace('"', "_")
+
+CACHE = {
+    "get_vits_phoneme_ids":{
+        "PAD_LENGTH": 310,
+        "_pad": '_',
+        "_punctuation": ';:,.!?¡¿—…"«»“” ',
+        "_letters": 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
+        "_letters_ipa": "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ",
+        "_special": "♪☎☒☝⚠"
+    }
+}
+
+CACHE["get_vits_phoneme_ids"]["symbols"] = [CACHE["get_vits_phoneme_ids"]["_pad"]] + list(CACHE["get_vits_phoneme_ids"]["_punctuation"]) + list(CACHE["get_vits_phoneme_ids"]["_letters"]) + list(CACHE["get_vits_phoneme_ids"]["_letters_ipa"]) + list(CACHE["get_vits_phoneme_ids"]["_special"])
+CACHE["get_vits_phoneme_ids"]["_symbol_to_id"] = {s: i for i, s in enumerate(CACHE["get_vits_phoneme_ids"]["symbols"])}
+
+def get_vits_phoneme_ids_no_padding(phonemes):
+    pad_token_id = 0
+    pad_length = CACHE["get_vits_phoneme_ids"]["PAD_LENGTH"]
+    _symbol_to_id = CACHE["get_vits_phoneme_ids"]["_symbol_to_id"]
+    batchsize = len(phonemes)
+
+    clean_text = phonemes[0] + "⚠"
+    sequence = []
+
+    for symbol in clean_text:
+        if(symbol not in _symbol_to_id.keys()):
+            print("%s is not in the vocabulary. 
%s" % (symbol, clean_text)) + symbol = "_" + symbol_id = _symbol_to_id[symbol] + sequence += [symbol_id] + + def _pad_phonemes(phonemes_list): + return phonemes_list + [pad_token_id] * (pad_length-len(phonemes_list)) + + sequence = sequence[:pad_length] + + return {"phoneme_idx": paddle.to_tensor(_pad_phonemes(sequence), dtype="int64").unsqueeze(0).expand([batchsize, -1])} + + +def make_batch_for_text_to_audio(text, transcription="", waveform=None, fbank=None, batchsize=1): + text = [text] * batchsize + if(transcription): + transcription = text2phoneme(transcription) + transcription = [transcription] * batchsize + + if batchsize < 1: + print("Warning: Batchsize must be at least 1. Batchsize is set to .") + + if fbank is None: + fbank = paddle.zeros( + (batchsize, 1024, 64) + ) # Not used, here to keep the code format + else: + fbank = paddle.to_tensor(fbank, dtype="float32") + fbank = fbank.expand([batchsize, 1024, 64]) + assert fbank.shape[0] == batchsize + + stft = paddle.zeros((batchsize, 1024, 512)) # Not used + phonemes = get_vits_phoneme_ids_no_padding(transcription) + + waveform = paddle.zeros((batchsize, 160000)) # Not used + ta_kaldi_fbank = paddle.zeros((batchsize, 1024, 128)) + + batch = { + "text": text, # list + "fname": [text_to_filename(t) for t in text], # list + "waveform": waveform, + "stft": stft, + "log_mel_spec": fbank, + "ta_kaldi_fbank": ta_kaldi_fbank, + } + batch.update(phonemes) + return batch + +def get_time(): + t = time.localtime() + return time.strftime("%d_%m_%Y_%H_%M_%S", t) + +def save_wave(waveform, savepath, name="outwav", samplerate=16000): + if type(name) is not list: + name = [name] * waveform.shape[0] + + for i in range(waveform.shape[0]): + if waveform.shape[0] > 1: + fname = "%s_%s.wav" % ( + os.path.basename(name[i]) + if (not ".wav" in name[i]) + else os.path.basename(name[i]).split(".")[0], + i, + ) + else: + fname = "%s.wav" % os.path.basename(name[i]) if (not ".wav" in name[i]) else os.path.basename(name[i]).split(".")[0] + # Avoid the file name too long to be saved + if len(fname) > 255: + fname = f"{hex(hash(fname))}.wav" + + path = os.path.join( + savepath, fname + ) + print("Save audio to %s" % path) + sf.write(path, waveform[i, 0], samplerate=samplerate) + +def read_list(fname): + result = [] + with open(fname, "r", encoding="utf-8") as f: + for each in f.readlines(): + each = each.strip('\n') + result.append(each) + return result + +def text_to_audio( + model, + text, + transcription="", + seed=42, + ddim_steps=200, + duration=10, + batchsize=1, + guidance_scale=3.5, + n_candidate_gen_per_text=3, + latent_t_per_second=25.6, + ): + + seed_everything(int(seed)) + waveform = None + + batch = make_batch_for_text_to_audio(text, transcription=transcription, waveform=waveform, batchsize=batchsize) + + model.latent_t_size = int(duration * latent_t_per_second) + + waveform = model( + batch, + unconditional_guidance_scale=guidance_scale, + ddim_steps=ddim_steps, + n_gen=n_candidate_gen_per_text, + duration=duration, + ) + + return waveform + + +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. 
+ """ + + text: str = field(default="", metadata={"help": "Text prompt to the model for audio generation."}) + transcription: str = field(default="", metadata={"help": "Transcription for Text-to-Speech."}) + text_list: str = field(default="", metadata={"help": "A file (utf-8 encoded) that contains text prompt to the model for audio generation."}) + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="audioldm2-full", + metadata={"help": "Path to pretrained model or model identifier"}, + ) + save_path: str = field( + default="./output", + metadata={"help": "The path to save model output."}, + ) + device: str = field( + default="gpu", + metadata={"help": "The device for computation. If not specified, the script will automatically choose gpu."}, + ) + batchsize: int = field( + default=1, + metadata={"help": "Generate how many samples at the same time."}, + ) + ddim_steps: int = field( + default=200, + metadata={"help": "The sampling step for DDIM."}, + ) + guidance_scale: float = field( + default=3.5, + metadata={"help": "Guidance scale (Large => better quality and relavancy to text; Small => better diversity)."}, + ) + duration: float = field( + default=10.0, + metadata={"help": "The duration of the samples."}, + ) + n_candidate_gen_per_text: int = field( + default=3, + metadata={"help": "Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation."}, + ) + seed: int = field( + default=42, + metadata={"help": "Change this value (any integer number) will lead to a different generation result."}, + ) + +def main(): + parser = PdArgumentParser((ModelArguments, DataArguments)) + model_args, data_args = parser.parse_args_into_dataclasses() + + # process args + text = data_args.text + transcription = data_args.transcription + text_list = data_args.text_list + + save_path = os.path.join(model_args.save_path, get_time()) + random_seed = model_args.seed + duration = model_args.duration + sample_rate = 16000 + latent_t_per_second = 25.6 + + print("Warning: For AudioLDM2 we currently only support 10s of generation. Please use audioldm_48k or audioldm_16k_crossattn_t5 if you want a different duration.") + duration = 10 + + guidance_scale = model_args.guidance_scale + n_candidate_gen_per_text = model_args.n_candidate_gen_per_text + + if transcription: + if "speech" not in model_args.model_name_or_path: + print("Warning: You choose to perform Text-to-Speech by providing the transcription. However you do not choose the correct model name (audioldm2-speech-gigaspeech or audioldm2-speech-ljspeech).") + print("Warning: We will use audioldm2-speech-gigaspeech by default") + model_args.model_name_or_path = "audioldm2-speech-gigaspeech" + if not text: + print("Warning: You should provide text as a input to describe the speaker. 
Use default (A male reporter is speaking).") + text = "A female reporter is speaking full of emotion" + + if text_list: + print("Generate audio based on the text prompts in %s" % text_list) + prompt_todo = read_list(text_list) + else: + prompt_todo = [text] + + # build audioldm2 model + paddle.set_device(model_args.device) + audioldm2 = AudioLDM2Model.from_pretrained(model_args.model_name_or_path) + + # predict + os.makedirs(save_path, exist_ok=True) + for text in prompt_todo: + if "|" in text: + text, name = text.split("|") + else: + name = text[:128] + + if transcription: + name += "-TTS-%s" % transcription + + waveform = text_to_audio( + audioldm2, + text, + transcription=transcription, # To avoid the model to ignore the last vocab + seed=random_seed, + duration=duration, + guidance_scale=guidance_scale, + ddim_steps=model_args.ddim_steps, + n_candidate_gen_per_text=n_candidate_gen_per_text, + batchsize=model_args.batchsize, + latent_t_per_second=latent_t_per_second + ) + + save_wave(waveform, save_path, name=name, samplerate=sample_rate) + +if __name__ == "__main__": + main() diff --git a/paddlemix/models/__init__.py b/paddlemix/models/__init__.py index 9bc505502..9688b7e53 100644 --- a/paddlemix/models/__init__.py +++ b/paddlemix/models/__init__.py @@ -21,3 +21,5 @@ from .qwen_vl import * from .visualglm.configuration import * from .visualglm.modeling import * +from .audioldm2.modeling import * +from .audioldm2.configuration import * diff --git a/paddlemix/models/audioldm2/audiomae/__init__.py b/paddlemix/models/audioldm2/audiomae/__init__.py new file mode 100644 index 000000000..fd05a9208 --- /dev/null +++ b/paddlemix/models/audioldm2/audiomae/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlemix/models/audioldm2/audiomae/mae.py b/paddlemix/models/audioldm2/audiomae/mae.py new file mode 100644 index 000000000..a00305ee5 --- /dev/null +++ b/paddlemix/models/audioldm2/audiomae/mae.py @@ -0,0 +1,367 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import partial + +import paddle +import paddle.nn as nn +from ..utils import to_2tuple, DropPath, Mlp +from ..clap_module.htsat_model import SwinTransformerBlock + +class Attention(nn.Layer): + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0., + proj_drop: float = 0., + norm_layer: nn.Layer = nn.LayerNorm, + ) -> None: + super().__init__() + assert dim % num_heads == 0, 'dim should be divisible by num_heads' + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim ** -0.5 + self.fused_attn = False + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape([B, N, 3, self.num_heads, self.head_dim]).transpose([2, 0, 3, 1, 4]) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + + if self.fused_attn: + x = nn.functional.scaled_dot_product_attention( + q, k, v, + dropout=self.attn_drop.p if self.training else 0., + )[0] + else: + q = q * self.scale + k_perm = list(range(k.dim())) + new_perm = k_perm + new_perm[-2],new_perm[-1] = k_perm[-1],k_perm[-2] + attn = q @ k.transpose(new_perm) + attn = nn.functional.softmax(attn,axis=-1) + attn = self.attn_drop(attn) + x = attn @ v + + x_perm = list(range(x.dim())) + new_perm = x_perm + new_perm[1],new_perm[2] = x_perm[2],x_perm[1] + x = x.transpose(new_perm).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class LayerScale(nn.Layer): + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + tmp = init_values * paddle.ones(dim) + self.gamma = paddle.create_parameter(shape=tmp.shape, + dtype=tmp.dtype, + default_initializer=nn.initializer.Assign(tmp)) + self.gamma.stop_gradient = False + + def forward(self, x): + if self.inplace: + x = paddle.multiply(x, self.gamma) + return x + else: + return x * self.gamma + # return paddle.multiply(x, self.gamma) if self.inplace else x * self.gamma + +class Block(nn.Layer): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + init_values=None, + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, x): + x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + +class PatchEmbed_org(nn.Layer): + """Image to Patch Embedding""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_hw = (img_size[1] // patch_size[1], img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + # assert H == self.img_size[0] and W == self.img_size[1], \ + # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + y = x.flatten(2).transpose([0, 2, 1]) + return y + +class MaskedAutoencoderViT(nn.Layer): + """Masked Autoencoder with VisionTransformer backbone""" + + def __init__( + self, + img_size=224, + patch_size=16, + stride=10, + in_chans=3, + embed_dim=1024, + depth=24, + num_heads=16, + decoder_embed_dim=512, + decoder_depth=8, + decoder_num_heads=16, + mlp_ratio=4.0, + norm_layer=nn.LayerNorm, + norm_pix_loss=False, + audio_exp=False, + alpha=0.0, + temperature=0.2, + mode=0, + contextual_depth=8, + split_pos=False, + pos_trainable=False, + use_nce=False, + beta=4.0, + decoder_mode=0, + mask_t_prob=0.6, + mask_f_prob=0.5, + mask_2d=False, + epoch=0, + no_shift=False, + use_custom_patch=False, + ): + super().__init__() + + self.audio_exp = audio_exp + self.embed_dim = embed_dim + self.decoder_embed_dim = decoder_embed_dim + # -------------------------------------------------------------------------- + # MAE encoder specifics + self.patch_embed = PatchEmbed_org(img_size, patch_size, in_chans, embed_dim) + self.use_custom_patch = use_custom_patch + + num_patches = self.patch_embed.num_patches + tmp = paddle.zeros([1, 1, embed_dim]) + self.cls_token = paddle.create_parameter(shape=tmp.shape, + dtype=tmp.dtype, + default_initializer=nn.initializer.Assign(tmp)) + self.cls_token.stop_gradient = False + + # self.split_pos = split_pos # not useful + tmp = paddle.zeros([1, num_patches + 1, embed_dim]) + self.pos_embed = paddle.create_parameter(shape=tmp.shape, + dtype=tmp.dtype, + default_initializer=nn.initializer.Assign(tmp)) # fixed sin-cos embedding + self.pos_embed.stop_gradient = not pos_trainable + + self.encoder_depth = depth + self.contextual_depth = contextual_depth + self.blocks = nn.LayerList( + [ + Block( + embed_dim, + num_heads, + mlp_ratio, + qkv_bias=True, + norm_layer=norm_layer, + ) # qk_scale=None + for i in range(depth) + ] + ) + self.norm = norm_layer(embed_dim) + + # -------------------------------------------------------------------------- + # MAE decoder specifics + self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias_attr=True) + + tmp = paddle.zeros([1, 1, decoder_embed_dim]) + self.mask_token = paddle.create_parameter(shape=tmp.shape, + dtype=tmp.dtype, + default_initializer=nn.initializer.Assign(tmp)) + self.mask_token.stop_gradient = False + + tmp = paddle.zeros([1, num_patches + 1, decoder_embed_dim]) + self.decoder_pos_embed = paddle.create_parameter(shape=tmp.shape, + dtype=tmp.dtype, + default_initializer=nn.initializer.Assign(tmp)) # fixed 
sin-cos embedding + self.decoder_pos_embed.stop_gradient = not pos_trainable + + self.no_shift = no_shift + + self.decoder_mode = decoder_mode + if ( + self.use_custom_patch + ): # overlapped patches as in AST. Similar performance yet compute heavy + window_size = (6, 6) + feat_size = (102, 12) + else: + window_size = (4, 4) + feat_size = (64, 8) + if self.decoder_mode == 1: + decoder_modules = [] + for index in range(16): + if self.no_shift: + shift_size = (0, 0) + else: + if (index % 2) == 0: + shift_size = (0, 0) + else: + shift_size = (2, 0) + decoder_modules.append( + SwinTransformerBlock( + dim=decoder_embed_dim, + num_heads=16, + input_resolution=feat_size, + window_size=window_size, + shift_size=shift_size, + mlp_ratio=mlp_ratio, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + # extra_norm=False, + # sequential_attn=False, + norm_layer=norm_layer, # nn.LayerNorm, + ) + ) + self.decoder_blocks = nn.LayerList(decoder_modules) + else: + # Transfomer + self.decoder_blocks = nn.LayerList( + [ + Block( + decoder_embed_dim, + decoder_num_heads, + mlp_ratio, + qkv_bias=True, + norm_layer=norm_layer, + ) # qk_scale=None, + for i in range(decoder_depth) + ] + ) + + self.decoder_norm = norm_layer(decoder_embed_dim) + self.decoder_pred = nn.Linear( + decoder_embed_dim, patch_size**2 * in_chans, bias_attr=True + ) # decoder to patch + + # -------------------------------------------------------------------------- + + self.norm_pix_loss = norm_pix_loss + + self.patch_size = patch_size + self.stride = stride + + # audio exps + self.alpha = alpha + self.T = temperature + self.mode = mode + self.use_nce = use_nce + self.beta = beta + + self.log_softmax = nn.LogSoftmax(axis=-1) + + self.mask_t_prob = mask_t_prob + self.mask_f_prob = mask_f_prob + self.mask_2d = mask_2d + + self.epoch = epoch + + # self.initialize_weights() + + def forward_encoder_no_mask(self, x): + # embed patches + x = self.patch_embed(x) + + # add pos embed w/o cls token + x = x + self.pos_embed[:, 1:, :] + + # masking: length -> length * mask_ratio + # x, mask, ids_restore = self.random_masking(x, mask_ratio) + # append cls token + cls_token = self.cls_token + self.pos_embed[:, :1, :] + cls_tokens = cls_token.expand([x.shape[0], -1, -1]) + x = paddle.concat((cls_tokens, x), axis=1) + + # apply Transformer blocks + contextual_embs = [] + for n, blk in enumerate(self.blocks): + x = blk(x) + if n > self.contextual_depth: + contextual_embs.append(self.norm(x)) + contextual_emb = paddle.stack(contextual_embs, axis=0).mean(axis=0) + + return contextual_emb + + +def mae_vit_base_patch16_dec512d8b(**kwargs): + model = MaskedAutoencoderViT( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + decoder_embed_dim=512, + decoder_num_heads=16, + mlp_ratio=4, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + **kwargs, + ) + return model + + +# set recommended archs +mae_vit_base_patch16 = mae_vit_base_patch16_dec512d8b # decoder: 512 dim, 8 blocks diff --git a/paddlemix/models/audioldm2/clap_module/clap.py b/paddlemix/models/audioldm2/clap_module/clap.py new file mode 100644 index 000000000..c6bccbbb7 --- /dev/null +++ b/paddlemix/models/audioldm2/clap_module/clap.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .model import CLAP, CLAPAudioCfg, CLAPTextCfg +import dataclasses +from dataclasses import dataclass + +@dataclass +class CLAPConfig: + embed_dim: int = 1024 + audio_cfg: CLAPAudioCfg = CLAPAudioCfg() + text_cfg: CLAPTextCfg = CLAPTextCfg() + +def create_clap_model( + amodel_name: str, + tmodel_name: str, + pretrained: str = "", + precision: str = "fp32", + force_quick_gelu: bool = False, + enable_fusion: bool = False, + fusion_type: str = "None" +): + pretrained = pretrained.lower() + + model_cfg = CLAPConfig() + model_cfg = dataclasses.asdict(model_cfg) + if force_quick_gelu: + # override for use of QuickGELU on non-OpenAI transformer models + model_cfg["quick_gelu"] = True + + model_cfg["text_cfg"]["model_type"] = tmodel_name + model_cfg["enable_fusion"] = enable_fusion + model_cfg["fusion_type"] = fusion_type + model = CLAP(**model_cfg) + + return model, model_cfg diff --git a/paddlemix/models/audioldm2/clap_module/feature_fusion.py b/paddlemix/models/audioldm2/clap_module/feature_fusion.py new file mode 100644 index 000000000..4a2fc987d --- /dev/null +++ b/paddlemix/models/audioldm2/clap_module/feature_fusion.py @@ -0,0 +1,200 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn + + +class DAF(nn.Layer): + """ + 直接相加 DirectAddFuse + """ + + def __init__(self): + super(DAF, self).__init__() + + def forward(self, x, residual): + return x + residual + + +class iAFF(nn.Layer): + """ + 多特征融合 iAFF + """ + + def __init__(self, channels=64, r=4, type="2D"): + super(iAFF, self).__init__() + inter_channels = int(channels // r) + + if type == "1D": + # 本地注意力 + self.local_att = nn.Sequential( + nn.Conv1D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(inter_channels), + nn.ReLU(inplace=True), + nn.Conv1D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(channels), + ) + + # 全局注意力 + self.global_att = nn.Sequential( + nn.AdaptiveAvgPool1D(1), + nn.Conv1D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(inter_channels), + nn.ReLU(), + nn.Conv1D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(channels), + ) + + # 第二次本地注意力 + self.local_att2 = nn.Sequential( + nn.Conv1D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(inter_channels), + nn.ReLU(), + nn.Conv1D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(channels), + ) + # 第二次全局注意力 + self.global_att2 = nn.Sequential( + nn.AdaptiveAvgPool1D(1), + nn.Conv1D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(inter_channels), + nn.ReLU(), + nn.Conv1D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(channels), + ) + elif type == "2D": + # 本地注意力 + self.local_att = nn.Sequential( + nn.Conv2D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(inter_channels), + nn.ReLU(), + nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(channels), + ) + + # 全局注意力 + self.global_att = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + nn.Conv2D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(inter_channels), + nn.ReLU(), + nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(channels), + ) + + # 第二次本地注意力 + self.local_att2 = nn.Sequential( + nn.Conv2D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(inter_channels), + nn.ReLU(), + nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(channels), + ) + # 第二次全局注意力 + self.global_att2 = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + nn.Conv2D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(inter_channels), + nn.ReLU(), + nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(channels), + ) + else: + raise f"the type is not supported" + + self.sigmoid = nn.Sigmoid() + + def forward(self, x, residual): + flag = False + xa = x + residual + if xa.size(0) == 1: + xa = paddle.concat([xa, xa], axis=0) + flag = True + xl = self.local_att(xa) + xg = self.global_att(xa) + xlg = xl + xg + wei = self.sigmoid(xlg) + xi = x * wei + residual * (1 - wei) + + xl2 = self.local_att2(xi) + xg2 = self.global_att(xi) + xlg2 = xl2 + xg2 + wei2 = self.sigmoid(xlg2) + xo = x * wei2 + residual * (1 - wei2) + if flag: + xo = xo[0].unsqueeze(0) + return xo + + +class AFF(nn.Layer): + """ + 多特征融合 AFF + """ + + def __init__(self, channels=64, r=4, type="2D"): + super(AFF, self).__init__() + inter_channels = int(channels // r) + + if type == "1D": + self.local_att = nn.Sequential( + 
nn.Conv1D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(inter_channels), + nn.ReLU(), + nn.Conv1D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(channels), + ) + self.global_att = nn.Sequential( + nn.AdaptiveAvgPool1D(1), + nn.Conv1D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(inter_channels), + nn.ReLU(), + nn.Conv1D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm1D(channels), + ) + elif type == "2D": + self.local_att = nn.Sequential( + nn.Conv2D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(inter_channels), + nn.ReLU(), + nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(channels), + ) + self.global_att = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + nn.Conv2D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(inter_channels), + nn.ReLU(), + nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(channels), + ) + else: + raise f"the type is not supported." + + self.sigmoid = nn.Sigmoid() + + def forward(self, x, residual): + flag = False + xa = x + residual + if xa.size(0) == 1: + xa = paddle.concat([xa, xa], axis=0) + flag = True + xl = self.local_att(xa) + xg = self.global_att(xa) + xlg = xl + xg + wei = self.sigmoid(xlg) + xo = 2 * x * wei + 2 * residual * (1 - wei) + if flag: + xo = xo[0].unsqueeze(0) + return xo diff --git a/paddlemix/models/audioldm2/clap_module/htsat_model.py b/paddlemix/models/audioldm2/clap_module/htsat_model.py new file mode 100644 index 000000000..c6588a929 --- /dev/null +++ b/paddlemix/models/audioldm2/clap_module/htsat_model.py @@ -0,0 +1,1105 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn + +import math +import warnings +import random + +from .utils import do_mixup, interpolate, Spectrogram, LogmelFilterBank, SpecAugmentation +from ..utils import to_2tuple, DropPath, Mlp +from .feature_fusion import iAFF, AFF, DAF + +class PatchEmbed(nn.Layer): + """2D Image to Patch Embedding""" + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + patch_stride=16, + enable_fusion=False, + fusion_type="None", + ): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patch_stride = to_2tuple(patch_stride) + self.img_size = img_size + self.patch_size = patch_size + self.patch_stride = patch_stride + self.grid_size = ( + img_size[0] // patch_stride[0], + img_size[1] // patch_stride[1], + ) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.enable_fusion = enable_fusion + self.fusion_type = fusion_type + + padding = ( + (patch_size[0] - patch_stride[0]) // 2, + (patch_size[1] - patch_stride[1]) // 2, + ) + + if (self.enable_fusion) and (self.fusion_type == "channel_map"): + self.proj = nn.Conv2D( + in_chans * 4, + embed_dim, + kernel_size=patch_size, + stride=patch_stride, + padding=padding, + ) + else: + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_stride, + padding=padding, + ) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + if (self.enable_fusion) and ( + self.fusion_type in ["daf_2d", "aff_2d", "iaff_2d"] + ): + self.mel_conv2d = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=(patch_size[0], patch_size[1] * 3), + stride=(patch_stride[0], patch_stride[1] * 3), + padding=padding, + ) + if self.fusion_type == "daf_2d": + self.fusion_model = DAF() + elif self.fusion_type == "aff_2d": + self.fusion_model = AFF(channels=embed_dim, type="2D") + elif self.fusion_type == "iaff_2d": + self.fusion_model = iAFF(channels=embed_dim, type="2D") + + def forward(self, x, longer_idx=None): + if (self.enable_fusion) and ( + self.fusion_type in ["daf_2d", "aff_2d", "iaff_2d"] + ): + global_x = x[:, 0:1, :, :] + + # global processing + B, C, H, W = global_x.shape + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + global_x = self.proj(global_x) + TW = global_x.shape[-1] + if len(longer_idx) > 0: + # local processing + local_x = x[longer_idx, 1:, :, :] + B, C, H, W = local_x.shape + local_x = local_x.reshape([B * C, 1, H, W]) + local_x = self.mel_conv2d(local_x) + local_x = local_x.reshape( + [B, C, local_x.shape[1], local_x.shape[2], local_x.shape[3]] + ) + local_x = local_x.transpose([0, 2, 3, 1, 4]).flatten(3) + TB, TC, TH, _ = local_x.shape + if local_x.shape[-1] < TW: + local_x = paddle.concat( + [ + local_x, + paddle.zeros( + [TB, TC, TH, TW - local_x.shape[-1]] + ), + ], + axis=-1, + ) + else: + local_x = local_x[:, :, :, :TW] + + global_x[longer_idx] = self.fusion_model(global_x[longer_idx], local_x) + x = global_x + else: + B, C, H, W = x.shape + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ x = self.proj(x) + + if self.flatten: + x = x.flatten(2).transpose([0, 2, 1]) # BCHW -> BNC + x = self.norm(x) + return x + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor = paddle.multiply(tensor, paddle.to_tensor(std) * math.sqrt(2.0)) + # tensor.mul_(std * math.sqrt(2.0)) + tensor = paddle.add(tensor, paddle.to_tensor(mean)) + # tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clip_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape([B, H // window_size, window_size, W // window_size, window_size, C]) + windows = ( + x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, C]) + ) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape( + [B, H // window_size, W // window_size, window_size, window_size, -1] + ) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1]) + return x + + +class WindowAttention(nn.Layer): + r"""Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + relative_position_bias_table = paddle.zeros([(2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads]) + self.relative_position_bias_table = paddle.create_parameter( + shape=relative_position_bias_table.shape, + dtype=str(relative_position_bias_table.numpy().dtype), + default_initializer=nn.initializer.Assign(relative_position_bias_table) + ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + [1, 2, 0] + ) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape([B_, N, 3, self.num_heads, C // self.num_heads]) + .transpose([2, 0, 3, 1, 4]) + ) + q, k, v = ( + qkv[0], + qkv[1], + qkv[2], + ) + + q = q * self.scale + k_perm_shape = list(range(k.dim())) + k_new_perm_shape = k_perm_shape + k_new_perm_shape[-1], k_new_perm_shape[-2] = k_perm_shape[-2], k_perm_shape[-1] + attn = q @ k.transpose(k_new_perm_shape) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1]) + ].reshape( + [self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1] + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1] + ) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N]) + mask.unsqueeze( + 1 + ).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + tmp = attn @ v + tmp_perm_shape = list(range(tmp.dim())) + new_tmp_perm_shape = tmp_perm_shape + new_tmp_perm_shape[1], new_tmp_perm_shape[2] = tmp_perm_shape[2], tmp_perm_shape[1] + x = tmp.transpose(new_tmp_perm_shape).reshape([B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x, attn + + def extra_repr(self): + return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}" + + 
+# We use the model based on Swintransformer Block, therefore we can use the swin-transformer pretrained model +class SwinTransformerBlock(nn.Layer): + r"""Swin Transformer Block. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + input_resolution, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + norm_before_mlp="ln", + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.norm_before_mlp = norm_before_mlp + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert ( + 0 <= self.shift_size < self.window_size + ), "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + if self.norm_before_mlp == "ln": + self.norm2 = nn.LayerNorm(dim) + elif self.norm_before_mlp == "bn": + self.bn2 = nn.BatchNorm1D(dim) + def norm2_fun(x): + perm_shape = list(range(x.dim())) + new_perm_shape = perm_shape + new_perm_shape[1], new_perm_shape[2] = perm_shape[2], perm_shape[1] + return self.bn2(x.transpose(new_perm_shape)).transpose(new_perm_shape) + + self.norm2 = norm2_fun + else: + raise NotImplementedError + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = paddle.zeros([1, H, W, 1]) # 1 H W 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape([-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = paddle.where(attn_mask != 0, paddle.ones_like(attn_mask)*float(-100.0), attn_mask) + attn_mask = 
paddle.where(attn_mask == 0, paddle.ones_like(attn_mask)*float(0.0), attn_mask) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + + shortcut = x + x = self.norm1(x) + x = x.reshape([B, H, W, C]) + + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll( + x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2) + ) + else: + shifted_x = x + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [-1, self.window_size * self.window_size, C] + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows, attn = self.attn( + x_windows, mask=self.attn_mask + ) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape([-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll( + shifted_x, shifts=(self.shift_size, self.shift_size), axis=(1, 2) + ) + else: + x = shifted_x + x = x.reshape([B, H * W, C]) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x, attn + + def extra_repr(self): + return ( + f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + ) + + +class PatchMerging(nn.Layer): + r"""Patch Merging Layer. + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." + + x = x.reshape([B, H, W, C]) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.reshape([B, -1, 4 * C]) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self): + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + +class BasicLayer(nn.Layer): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. 
Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + norm_before_mlp="ln", + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList( + [ + SwinTransformerBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) + else drop_path, + norm_layer=norm_layer, + norm_before_mlp=norm_before_mlp, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, norm_layer=norm_layer + ) + else: + self.downsample = None + + def forward(self, x): + attns = [] + for blk in self.blocks: + x, attn = blk(x) + if not self.training: + attns.append(attn.unsqueeze(0)) + if self.downsample is not None: + x = self.downsample(x) + if not self.training: + attn = paddle.concat(attns, axis=0) + attn = paddle.mean(attn, axis=0) + return x, attn + + def extra_repr(self): + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + +# The Core of HTSAT +class HTSAT_Swin_Transformer(nn.Layer): + r"""HTSAT based on the Swin Transformer + Args: + spec_size (int | tuple(int)): Input Spectrogram size. Default 256 + patch_size (int | tuple(int)): Patch size. Default: 4 + path_stride (iot | tuple(int)): Patch Stride for Frequency and Time Axis. Default: 4 + in_chans (int): Number of input image channels. Default: 1 (mono) + num_classes (int): Number of classes for classification head. Default: 527 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each HTSAT-Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 8 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. 
Default: False + config (module): The configuration Module from config.py + """ + + def __init__( + self, + spec_size=256, + patch_size=4, + patch_stride=(4, 4), + in_chans=1, + num_classes=527, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[4, 8, 16, 32], + window_size=8, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + use_checkpoint=False, + norm_before_mlp="ln", + config=None, + enable_fusion=False, + fusion_type="None", + **kwargs, + ): + super(HTSAT_Swin_Transformer, self).__init__() + + self.config = config + self.spec_size = spec_size + self.patch_stride = patch_stride + self.patch_size = patch_size + self.window_size = window_size + self.embed_dim = embed_dim + self.depths = depths + self.ape = ape + self.in_chans = in_chans + self.num_classes = num_classes + self.num_heads = num_heads + self.num_layers = len(self.depths) + self.num_features = int(self.embed_dim * 2 ** (self.num_layers - 1)) + + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + + self.qkv_bias = qkv_bias + self.qk_scale = None + + self.patch_norm = patch_norm + self.norm_layer = norm_layer if self.patch_norm else None + self.norm_before_mlp = norm_before_mlp + self.mlp_ratio = mlp_ratio + + self.use_checkpoint = use_checkpoint + + self.enable_fusion = enable_fusion + self.fusion_type = fusion_type + + # process mel-spec ; used only once + self.freq_ratio = self.spec_size // self.config.mel_bins + window = "hann" + center = True + pad_mode = "reflect" + ref = 1.0 + amin = 1e-10 + top_db = None + self.interpolate_ratio = 32 # Downsampled ratio + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram( + n_fft=config.window_size, + hop_length=config.hop_size, + win_length=config.window_size, + window=window, + center=center, + pad_mode=pad_mode, + freeze_parameters=True, + ) + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank( + sr=config.sample_rate, + n_fft=config.window_size, + n_mels=config.mel_bins, + fmin=config.fmin, + fmax=config.fmax, + ref=ref, + amin=amin, + top_db=top_db, + freeze_parameters=True, + ) + # Spec augmenter + self.spec_augmenter = SpecAugmentation( + time_drop_width=64, + time_stripes_num=2, + freq_drop_width=8, + freq_stripes_num=2, + ) # 2 2 + self.bn0 = nn.BatchNorm2D(self.config.mel_bins) + + # split spctrogram into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=self.spec_size, + patch_size=self.patch_size, + in_chans=self.in_chans, + embed_dim=self.embed_dim, + norm_layer=self.norm_layer, + patch_stride=patch_stride, + enable_fusion=self.enable_fusion, + fusion_type=self.fusion_type, + ) + + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.grid_size + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + absolute_pos_embed = paddle.zeros([1, num_patches, self.embed_dim]) + self.absolute_pos_embed = paddle.create_parameter( + shape=absolute_pos_embed.shape, + dtype=str(absolute_pos_embed.numpy().dtype), + default_initializer=nn.initializer.Assign(absolute_pos_embed) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=self.drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in paddle.linspace(0, self.drop_path_rate, sum(self.depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in 
range(self.num_layers): + layer = BasicLayer( + dim=int(self.embed_dim * 2**i_layer), + input_resolution=( + patches_resolution[0] // (2**i_layer), + patches_resolution[1] // (2**i_layer), + ), + depth=self.depths[i_layer], + num_heads=self.num_heads[i_layer], + window_size=self.window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + drop=self.drop_rate, + attn_drop=self.attn_drop_rate, + drop_path=dpr[ + sum(self.depths[:i_layer]) : sum(self.depths[: i_layer + 1]) + ], + norm_layer=self.norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + norm_before_mlp=self.norm_before_mlp, + ) + self.layers.append(layer) + + self.norm = self.norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1D(1) + self.maxpool = nn.AdaptiveMaxPool1D(1) + + SF = ( + self.spec_size + // (2 ** (len(self.depths) - 1)) + // self.patch_stride[0] + // self.freq_ratio + ) + self.tscam_conv = nn.Conv2D( + in_channels=self.num_features, + out_channels=self.num_classes, + kernel_size=(SF, 3), + padding=(0, 1), + ) + self.head = nn.Linear(num_classes, num_classes) + + if (self.enable_fusion) and ( + self.fusion_type in ["daf_1d", "aff_1d", "iaff_1d"] + ): + self.mel_conv1d = nn.Sequential( + nn.Conv1D(64, 64, kernel_size=5, stride=3, padding=2), + nn.BatchNorm1D(64), + ) + if self.fusion_type == "daf_1d": + self.fusion_model = DAF() + elif self.fusion_type == "aff_1d": + self.fusion_model = AFF(channels=64, type="1D") + elif self.fusion_type == "iaff_1d": + self.fusion_model = iAFF(channels=64, type="1D") + + @paddle.jit.not_to_static + def no_weight_decay(self): + return {"absolute_pos_embed"} + + @paddle.jit.not_to_static + def no_weight_decay_keywords(self): + return {"relative_position_bias_table"} + + def forward_features(self, x, longer_idx=None): + # A deprecated optimization for using a hierarchical output from different blocks + + frames_num = x.shape[2] + x = self.patch_embed(x, longer_idx=longer_idx) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + for i, layer in enumerate(self.layers): + x, attn = layer(x) + # for x + x = self.norm(x) + B, N, C = x.shape + SF = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] + ST = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] + x = x.transpose([0, 2, 1]).reshape([B, C, SF, ST]) + B, C, F, T = x.shape + # group 2D CNN + c_freq_bin = F // self.freq_ratio + x = x.reshape([B, C, F // c_freq_bin, c_freq_bin, T]) + x = x.transpose([0, 1, 3, 2, 4]).reshape([B, C, c_freq_bin, -1]) + # get latent_output + fine_grained_latent_output = paddle.mean(x, axis=2) + fine_grained_latent_output = interpolate( + fine_grained_latent_output.transpose([0, 2, 1]), + 8 * self.patch_stride[1], + ) + + latent_output = self.avgpool(paddle.flatten(x, 2)) + latent_output = paddle.flatten(latent_output, 1) + + # display the attention map, if needed + + x = self.tscam_conv(x) + x = paddle.flatten(x, 2) # B, C, T + + fpx = interpolate( + nn.functional.sigmoid(x).transpose([0, 2, 1]), 8 * self.patch_stride[1] + ) + + x = self.avgpool(x) + x = paddle.flatten(x, 1) + + output_dict = { + "framewise_output": fpx, # already sigmoided + "clipwise_output": nn.functional.sigmoid(x), + "fine_grained_embedding": fine_grained_latent_output, + "embedding": latent_output, + } + + return output_dict + + def crop_wav(self, x, crop_size, spe_pos=None): + time_steps = x.shape[2] + tx = paddle.zeros([x.shape[0], x.shape[1], crop_size, x.shape[3]]) + 
for i in range(len(x)): + if spe_pos is None: + crop_pos = random.randint(0, time_steps - crop_size - 1) + else: + crop_pos = spe_pos + tx[i][0] = x[i, 0, crop_pos : crop_pos + crop_size, :] + return tx + + # Reshape the wavform to a img size, if you want to use the pretrained swin transformer model + def reshape_wav2img(self, x): + B, C, T, F = x.shape + target_T = int(self.spec_size * self.freq_ratio) + target_F = self.spec_size // self.freq_ratio + assert ( + T <= target_T and F <= target_F + ), "the wav size should less than or equal to the swin input size" + # to avoid bicubic zero error + if T < target_T: + x = nn.functional.interpolate( + x, (target_T, x.shape[3]), mode="bicubic", align_corners=True + ) + if F < target_F: + x = nn.functional.interpolate( + x, (x.shape[2], target_F), mode="bicubic", align_corners=True + ) + x = x.transpose([0, 1, 3, 2]) + x = x.reshape( + [x.shape[0], + x.shape[1], + x.shape[2], + self.freq_ratio, + x.shape[3] // self.freq_ratio] + ) + # print(x.shape) + x = x.transpose([0, 1, 3, 2, 4]) + x = x.reshape([x.shape[0], x.shape[1], x.shape[2] * x.shape[3], x.shape[4]]) + return x + + # Repeat the wavform to a img size, if you want to use the pretrained swin transformer model + def repeat_wat2img(self, x, cur_pos): + B, C, T, F = x.shape + target_T = int(self.spec_size * self.freq_ratio) + target_F = self.spec_size // self.freq_ratio + assert ( + T <= target_T and F <= target_F + ), "the wav size should less than or equal to the swin input size" + # to avoid bicubic zero error + if T < target_T: + x = nn.functional.interpolate( + x, (target_T, x.shape[3]), mode="bicubic", align_corners=True + ) + if F < target_F: + x = nn.functional.interpolate( + x, (x.shape[2], target_F), mode="bicubic", align_corners=True + ) + x = x.transpose([0, 1, 3, 2]) # B C F T + x = x[:, :, :, cur_pos : cur_pos + self.spec_size] + # x = x.repeat_interleave(repeats=(1, 1, 4, 1)) + x = x.repeat_interleave(repeats=4, axis=2) + return x + + def forward( + self, x: paddle.Tensor, mixup_lambda=None, infer_mode=False, device=None + ): # out_feat_keys: List[str] = None): + if self.enable_fusion and x["longer"].sum() == 0: + # if no audio is longer than 10s, then randomly select one audio to be longer + x["longer"][paddle.randint(0, x["longer"].shape[0], (1,))] = True + + if not self.enable_fusion: + x = x["waveform"] + x = self.spectrogram_extractor(x) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + x = x.transpose([0, 3, 2, 1]) + x = self.bn0(x) + x = x.transpose([0, 3, 2, 1]) + if self.training: + x = self.spec_augmenter(x) + + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + + x = self.reshape_wav2img(x) + output_dict = self.forward_features(x) + else: + longer_list = x["longer"] + x = x["mel_fusion"] + x = x.transpose([0, 3, 2, 1]) + x = self.bn0(x) + x = x.transpose([0, 3, 2, 1]) + longer_list_idx = paddle.where(longer_list)[0].squeeze() + if self.fusion_type in ["daf_1d", "aff_1d", "iaff_1d"]: + new_x = x[:, 0:1, :, :].clone() + if len(longer_list_idx) > 0: + # local processing + fusion_x_local = x[longer_list_idx, 1:, :, :].clone() + FB, FC, FT, FF = fusion_x_local.shape + fusion_x_local = fusion_x_local.reshape([FB * FC, FT, FF]) + fusion_x_local = paddle.transpose( + fusion_x_local, (0, 2, 1) + ) + fusion_x_local = self.mel_conv1d(fusion_x_local) + fusion_x_local = fusion_x_local.reshape( + FB, FC, FF, fusion_x_local.shape[-1] + ) + fusion_x_local = ( + 
paddle.transpose(fusion_x_local, (0, 2, 1, 3)) + .flatten(2) + ) + if fusion_x_local.shape[-1] < FT: + fusion_x_local = paddle.concat( + [ + fusion_x_local, + paddle.zeros( + (FB, FF, FT - fusion_x_local.size(-1)) + ), + ], + axis=-1, + ) + else: + fusion_x_local = fusion_x_local[:, :, :FT] + # 1D fusion + new_x = new_x.squeeze(1).transpose((0, 2, 1)) + new_x[longer_list_idx] = self.fusion_model( + new_x[longer_list_idx], fusion_x_local + ) + x = new_x.transpose((0, 2, 1))[:, None, :, :] + else: + x = new_x + + elif self.fusion_type in ["daf_2d", "aff_2d", "iaff_2d", "channel_map"]: + x = x # no change + + if self.training: + x = self.spec_augmenter(x) + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + + x = self.reshape_wav2img(x) + output_dict = self.forward_features(x, longer_idx=longer_list_idx) + + return output_dict + + +def create_htsat_model(audio_cfg, enable_fusion=False, fusion_type="None"): + try: + assert audio_cfg.model_name in [ + "base", + ], "model name for HTS-AT is wrong!" + if audio_cfg.model_name == "base": + model = HTSAT_Swin_Transformer( + spec_size=256, + patch_size=4, + patch_stride=(4, 4), + num_classes=audio_cfg.class_num, + embed_dim=128, + depths=[2, 2, 12, 2], + num_heads=[4, 8, 16, 32], + window_size=8, + config=audio_cfg, + enable_fusion=enable_fusion, + fusion_type=fusion_type, + ) + + return model + except: + raise RuntimeError( + f"Import Model for {audio_cfg.model_name} not found, or the audio cfg parameters are not enough." + ) diff --git a/paddlemix/models/audioldm2/clap_module/model.py b/paddlemix/models/audioldm2/clap_module/model.py new file mode 100644 index 000000000..c1c213d2e --- /dev/null +++ b/paddlemix/models/audioldm2/clap_module/model.py @@ -0,0 +1,403 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
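+
+# This module defines the CLAP dual-tower model used by AudioLDM2: an HTSAT
+# audio branch (built via create_htsat_model) and a text branch (RoBERTa by
+# default), each followed by an MLP projection into a shared
+# `joint_embed_shape`-dimensional space and L2-normalized.
+#
+# A rough usage sketch (illustrative only; `audio_batch` is the audio feature
+# dict expected by `get_audio_embedding`, `text_batch` the tokenized text dict
+# expected by `get_text_embedding`, and `embed_dim` must match the audio
+# branch's "embedding" size):
+#
+#     model = CLAP(embed_dim=embed_dim, audio_cfg=CLAPAudioCfg(), text_cfg=CLAPTextCfg())
+#     audio_embed = model.get_audio_embedding(audio_batch)  # (N, 512), L2-normalized
+#     text_embed = model.get_text_embedding(text_batch)     # (M, 512), L2-normalized
+#     logit_scale_a, _ = model.get_logit_scale()
+#     logits = logit_scale_a * paddle.matmul(audio_embed, text_embed, transpose_y=True)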
+ +from typing import Callable, Optional + +import numpy as np +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from dataclasses import dataclass + +import logging + +from .htsat_model import create_htsat_model +from paddlenlp.transformers import RobertaModel, BertModel, BartModel + + +class MLPLayers(nn.Layer): + def __init__(self, units=[512, 512, 512], nonlin=nn.ReLU(), dropout=0.1): + super(MLPLayers, self).__init__() + self.nonlin = nonlin + self.dropout = dropout + + sequence = [] + for u0, u1 in zip(units[:-1], units[1:]): + sequence.append(nn.Linear(u0, u1)) + sequence.append(self.nonlin) + sequence.append(nn.Dropout(self.dropout)) + sequence = sequence[:-2] + + self.sequential = nn.Sequential(*sequence) + + def forward(self, X): + X = self.sequential(X) + return X + +class ResidualAttentionBlock(nn.Layer): + def __init__(self, d_model: int, n_head: int, act_layer: Callable = nn.GELU): + super().__init__() + + self.attn = nn.MultiHeadAttention(d_model, n_head) + self.ln_1 = nn.LayerNorm(d_model) + self.mlp = nn.Sequential( + ("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", act_layer()), + ("c_proj", nn.Linear(d_model * 4, d_model)), + ) + self.ln_2 = nn.LayerNorm(d_model) + + def attention(self, x: paddle.Tensor, attn_mask: Optional[paddle.Tensor] = None): + return self.attn(x, x, x, attn_mask=attn_mask)[0] + + def forward(self, x: paddle.Tensor, attn_mask: Optional[paddle.Tensor] = None): + x = x + self.attention(self.ln_1(x), attn_mask=attn_mask) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Layer): + def __init__( + self, width: int, layers: int, heads: int, act_layer: Callable = nn.GELU + ): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.LayerList( + [ + ResidualAttentionBlock(width, heads, act_layer=act_layer) + for _ in range(layers) + ] + ) + + def forward(self, x: paddle.Tensor, attn_mask: Optional[paddle.Tensor] = None): + for r in self.resblocks: + x = r(x, attn_mask=attn_mask) + return x + +# Audio Config Class +@dataclass +class CLAPAudioCfg: + model_type: str = "HTSAT" + model_name: str = "base" + sample_rate: int = 48000 + audio_length: int = 1024 + window_size: int = 1024 + hop_size: int = 480 + fmin: int = 50 + fmax: int = 14000 + class_num: int = 527 + mel_bins: int = 64 + clip_samples: int = 480000 + +@dataclass +class CLAPTextCfg: + context_length: int = 77 + vocab_size: int = 49408 + width: int = 512 + heads: int = 8 + layers: int = 12 + model_type: str = "roberta" + +class CLAP(nn.Layer): + def __init__( + self, + embed_dim: int, + audio_cfg: CLAPAudioCfg, + text_cfg: CLAPTextCfg, + quick_gelu: bool = False, + enable_fusion: bool = False, + fusion_type: str = "None", + joint_embed_shape: int = 512, + mlp_act: str = "relu", + ): + super().__init__() + if isinstance(audio_cfg, dict): + audio_cfg = CLAPAudioCfg(**audio_cfg) + if isinstance(text_cfg, dict): + text_cfg = CLAPTextCfg(**text_cfg) + + self.audio_cfg = audio_cfg + self.text_cfg = text_cfg + self.enable_fusion = enable_fusion + self.fusion_type = fusion_type + self.joint_embed_shape = joint_embed_shape + self.mlp_act = mlp_act + + self.context_length = text_cfg.context_length + + act_layer = nn.GELU + + if mlp_act == "relu": + mlp_act_layer = nn.ReLU() + elif mlp_act == "gelu": + mlp_act_layer = nn.GELU() + else: + raise NotImplementedError + + # audio branch + # audio branch parameters + if audio_cfg.model_type == "PANN": + raise ValueError("PANN has not been implemented.") + elif audio_cfg.model_type == 
"HTSAT": + self.audio_branch = create_htsat_model( + audio_cfg, enable_fusion, fusion_type + ) + else: + logging.error(f"Model config for {audio_cfg.model_type} not found") + raise RuntimeError(f"Model config for {audio_cfg.model_type} not found.") + + # text branch + # text branch parameters + if text_cfg.model_type == "transformer": + self.text_branch = Transformer( + width=text_cfg.width, + layers=text_cfg.layers, + heads=text_cfg.heads, + act_layer=act_layer, + ) + self.vocab_size = text_cfg.vocab_size + self.token_embedding = nn.Embedding(text_cfg.vocab_size, text_cfg.width) + positional_embedding = paddle.empty([self.context_length, text_cfg.width]) + self.positional_embedding = paddle.create_parameter( + shape=positional_embedding.shape, + dtype=str(positional_embedding.numpy().dtype), + default_initializer=nn.initializer.Assign(positional_embedding) + ) + # self.ln_final = LayerNorm(text_cfg.width) + self.ln_final = nn.LayerNorm(text_cfg.width) + self.text_transform = MLPLayers( + units=[ + self.joint_embed_shape, + self.joint_embed_shape, + self.joint_embed_shape, + ], + dropout=0.1, + ) + self.text_projection = nn.Sequential( + nn.Linear(text_cfg.width, self.joint_embed_shape), + mlp_act_layer, + nn.Linear(self.joint_embed_shape, self.joint_embed_shape), + ) + elif text_cfg.model_type == "bert": + self.text_branch = BertModel.from_pretrained("bert-base-uncased") + self.text_transform = MLPLayers( + units=[ + self.joint_embed_shape, + self.joint_embed_shape, + self.joint_embed_shape, + ], + dropout=0.1, + ) + self.text_projection = nn.Sequential( + nn.Linear(768, self.joint_embed_shape), + mlp_act_layer, + nn.Linear(self.joint_embed_shape, self.joint_embed_shape), + ) + elif text_cfg.model_type == "roberta": + self.text_branch = RobertaModel.from_pretrained("roberta-base") + self.text_transform = MLPLayers( + units=[ + self.joint_embed_shape, + self.joint_embed_shape, + self.joint_embed_shape, + ], + dropout=0.1, + ) + self.text_projection = nn.Sequential( + nn.Linear(768, self.joint_embed_shape), + mlp_act_layer, + nn.Linear(self.joint_embed_shape, self.joint_embed_shape), + ) + elif text_cfg.model_type == "bart": + self.text_branch = BartModel.from_pretrained("bart-base") + self.text_transform = MLPLayers( + units=[ + self.joint_embed_shape, + self.joint_embed_shape, + self.joint_embed_shape, + ], + dropout=0.1, + ) + self.text_projection = nn.Sequential( + nn.Linear(768, self.joint_embed_shape), + mlp_act_layer, + nn.Linear(self.joint_embed_shape, self.joint_embed_shape), + ) + else: + logging.error(f"Model config for {text_cfg.model_type} not found") + raise RuntimeError(f"Model config for {text_cfg.model_type} not found.") + self.text_branch_type = text_cfg.model_type + # text branch parameters + + # audio branch parameters + self.audio_transform = MLPLayers( + units=[ + self.joint_embed_shape, + self.joint_embed_shape, + self.joint_embed_shape, + ], + dropout=0.1, + ) + + # below here is text branch parameters + + self.audio_projection = nn.Sequential( + nn.Linear(embed_dim, self.joint_embed_shape), + mlp_act_layer, + nn.Linear(self.joint_embed_shape, self.joint_embed_shape), + ) + + self.logit_scale_a = paddle.create_parameter([],"float32",default_initializer=nn.initializer.Assign(paddle.ones([])*np.log(1 / 0.07))) + self.logit_scale_t = paddle.create_parameter([],"float32",default_initializer=nn.initializer.Assign(paddle.ones([])*np.log(1 / 0.07))) + self.register_buffer("attn_mask", self.build_attention_mask(), persistable=False) + + def build_attention_mask(self): + + 
+        mask = paddle.full([self.context_length, self.context_length], float("-inf"))
+        mask = paddle.triu(mask, 1)  # keep -inf above the diagonal, zero out the diagonal and below
+        return mask
+
+    def encode_audio(self, audio):
+        return self.audio_branch(
+            audio, mixup_lambda=None
+        )  # mixup lambda still needs to be added
+
+    def encode_text(self, text):
+        if self.text_branch_type == "transformer":
+            x = self.token_embedding(text)  # [batch_size, n_ctx, d_model]
+
+            x = x + self.positional_embedding
+            x = x.transpose([1, 0, 2])  # NLD -> LND
+            x = self.text_branch(x, attn_mask=self.attn_mask)
+            x = x.transpose([1, 0, 2])  # LND -> NLD
+            x = self.ln_final(x)
+
+            # take features from the eot embedding (eot_token is the highest number in each sequence)
+            x = self.text_projection(x[paddle.arange(x.shape[0]), text.argmax(axis=-1)])
+        elif self.text_branch_type == "bert":
+            x = self.text_branch(
+                input_ids=text["input_ids"],
+                attention_mask=text["attention_mask"],
+                token_type_ids=text["token_type_ids"],
+                return_dict=True,
+            )["pooler_output"]
+            x = self.text_projection(x)
+        elif self.text_branch_type == "roberta":
+            x = self.text_branch(
+                input_ids=text["input_ids"],
+                attention_mask=text["attention_mask"],
+                return_dict=True,
+            )["pooler_output"]
+            x = self.text_projection(x)
+        elif self.text_branch_type == "bart":
+            x = paddle.mean(
+                self.text_branch(
+                    input_ids=text["input_ids"],
+                    attention_mask=text["attention_mask"],
+                    return_dict=True,
+                )["encoder_last_hidden_state"],
+                axis=1,
+            )
+            x = self.text_projection(x)
+        else:
+            logging.error(f"Model type {self.text_branch_type} not found")
+            raise RuntimeError(f"Model type {self.text_branch_type} not found.")
+        return x
+
+    def forward(self, audio, text):
+        """Forward audio and text into the CLAP
+
+        Parameters
+        ----------
+        audio: paddle.Tensor (batch_size, audio_length)
+            the time-domain audio input / the batch of mel_spec and longer list.
+ text: paddle.Tensor () // need to add + the text token input + """ + + if audio is None and text is None: + # a hack to get the logit scale + return self.logit_scale_a.exp(), self.logit_scale_t.exp() + elif audio is None: + return self.encode_text(text) + elif text is None: + return self.audio_projection( + self.encode_audio(audio)["embedding"] + ) + audio_features = self.audio_projection( + self.encode_audio(audio)["embedding"] + ) + audio_features = F.normalize(audio_features, axis=-1) + + text_features = self.encode_text(text) + text_features = F.normalize(text_features, axis=-1) + + audio_features_mlp = self.audio_transform(audio_features) + text_features_mlp = self.text_transform(text_features) + # Four outputs: audio features (basic & MLP), text features (basic & MLP) + return ( + audio_features, + text_features, + audio_features_mlp, + text_features_mlp, + self.logit_scale_a.exp(), + self.logit_scale_t.exp(), + ) + + def get_logit_scale(self): + return self.logit_scale_a.exp(), self.logit_scale_t.exp() + + def get_text_embedding(self, data): + """Get the text embedding from the model + + Parameters + ---------- + data: paddle.Tensor + a tensor of text embedding + + Returns + ---------- + text_embed: paddle.Tensor + a tensor of text_embeds (N, D) + + """ + text_embeds = self.encode_text(data) + text_embeds = F.normalize(text_embeds, axis=-1) + + return text_embeds + + def get_audio_embedding(self, data): + """Get the audio embedding from the model + + Parameters + ---------- + data: a list of dict + the audio input dict list from 'get_audio_feature' method + + Returns + ---------- + audio_embed: paddle.Tensor + a tensor of audio_embeds (N, D) + + """ + audio_embeds = self.audio_projection( + self.encode_audio(data)["embedding"] + ) + audio_embeds = F.normalize(audio_embeds, axis=-1) + + return audio_embeds diff --git a/paddlemix/models/audioldm2/clap_module/utils.py b/paddlemix/models/audioldm2/clap_module/utils.py new file mode 100644 index 000000000..51c6e76d7 --- /dev/null +++ b/paddlemix/models/audioldm2/clap_module/utils.py @@ -0,0 +1,344 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import numpy as np +import librosa + +def interpolate(x, ratio): + """Interpolate data in time domain. This is used to compensate the + resolution reduction in downsampling of a CNN. + + Args: + x: (batch_size, time_steps, classes_num) + ratio: int, ratio to interpolate + Returns: + upsampled: (batch_size, time_steps * ratio, classes_num) + """ + (batch_size, time_steps, classes_num) = x.shape + upsampled = x[:, :, None, :].repeat_interleave(ratio, 2) + upsampled = upsampled.reshape([batch_size, time_steps * ratio, classes_num]) + return upsampled + +def do_mixup(x, mixup_lambda): + """ + Args: + x: (batch_size , ...) + mixup_lambda: (batch_size,) + Returns: + out: (batch_size, ...) 
+ """ + perm_shape = list(range(x.dim())) + new_perm_shape = perm_shape + new_perm_shape[0], new_perm_shape[-1] = perm_shape[-1], perm_shape[0] + out = ( + x.transpose(new_perm_shape) * mixup_lambda + + paddle.flip(x, axis=[0]).transpose(new_perm_shape) * (1 - mixup_lambda) + ).transpose(new_perm_shape) + return out + + +class DFTBase(nn.Layer): + def __init__(self): + r"""Base class for DFT and IDFT matrix. + """ + super(DFTBase, self).__init__() + + def dft_matrix(self, n): + (x, y) = np.meshgrid(np.arange(n), np.arange(n)) + omega = np.exp(-2 * np.pi * 1j / n) + W = np.power(omega, x * y) # shape: (n, n) + return W + + def idft_matrix(self, n): + (x, y) = np.meshgrid(np.arange(n), np.arange(n)) + omega = np.exp(2 * np.pi * 1j / n) + W = np.power(omega, x * y) # shape: (n, n) + return W + + +class STFT(DFTBase): + def __init__(self, n_fft=2048, hop_length=None, win_length=None, + window='hann', center=True, pad_mode='reflect', freeze_parameters=True): + r"""Paddle implementation of STFT with Conv1d. The function has the + same output as librosa.stft. + + Args: + n_fft: int, fft window size, e.g., 2048 + hop_length: int, hop length samples, e.g., 441 + win_length: int, window length e.g., 2048 + window: str, window function name, e.g., 'hann' + center: bool + pad_mode: str, e.g., 'reflect' + freeze_parameters: bool, set to True to freeze all parameters. Set + to False to finetune all parameters. + """ + super(STFT, self).__init__() + + assert pad_mode in ['constant', 'reflect'] + + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.window = window + self.center = center + self.pad_mode = pad_mode + + # By default, use the entire frame. + if self.win_length is None: + self.win_length = n_fft + + # Set the default hop, if it's not already specified. + if self.hop_length is None: + self.hop_length = int(self.win_length // 4) + + fft_window = librosa.filters.get_window(window, self.win_length, fftbins=True) + + # Pad the window out to n_fft size. + fft_window = librosa.util.pad_center(data=fft_window, size=n_fft) + + # DFT & IDFT matrix. + self.W = self.dft_matrix(n_fft) + + out_channels = n_fft // 2 + 1 + + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Assign( + paddle.to_tensor( + np.real(self.W[:, 0 : out_channels] * fft_window[:, None]).T)[:, None, :] + )) + self.conv_real = nn.Conv1D(in_channels=1, out_channels=out_channels, + kernel_size=n_fft, stride=self.hop_length, padding=0, dilation=1, + groups=1, weight_attr=weight_attr, bias_attr=False) + + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Assign( + paddle.to_tensor( + np.imag(self.W[:, 0 : out_channels] * fft_window[:, None]).T)[:, None, :] + )) + self.conv_imag = nn.Conv1D(in_channels=1, out_channels=out_channels, + kernel_size=n_fft, stride=self.hop_length, padding=0, dilation=1, + groups=1, weight_attr=weight_attr, bias_attr=False) + + if freeze_parameters: + for param in self.parameters(): + param.stop_gradient = True + + def forward(self, input): + r"""Calculate STFT of batch of signals. + + Args: + input: (batch_size, data_length), input signals. 
+ + Returns: + real: (batch_size, 1, time_steps, n_fft // 2 + 1) + imag: (batch_size, 1, time_steps, n_fft // 2 + 1) + """ + + x = input[:, None, :] # (batch_size, channels_num, data_length) + + if self.center: + x = nn.functional.pad(x, pad=(self.n_fft // 2, self.n_fft // 2), mode=self.pad_mode, data_format="NCL") + + real = self.conv_real(x) + imag = self.conv_imag(x) + # (batch_size, n_fft // 2 + 1, time_steps) + + real = real[:, None, :, :].transpose([0, 1, 3, 2]) + imag = imag[:, None, :, :].transpose([0, 1, 3, 2]) + # (batch_size, 1, time_steps, n_fft // 2 + 1) + + return real, imag + + +class Spectrogram(nn.Layer): + def __init__(self, n_fft=2048, hop_length=None, win_length=None, + window='hann', center=True, pad_mode='reflect', power=2.0, + freeze_parameters=True): + r"""Calculate spectrogram using paddle. The STFT is implemented with + Conv1d. The function has the same output of librosa.stft + """ + super(Spectrogram, self).__init__() + + self.power = power + + self.stft = STFT(n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=window, center=center, + pad_mode=pad_mode, freeze_parameters=True) + + def forward(self, input): + r"""Calculate spectrogram of input signals. + Args: + input: (batch_size, data_length) + + Returns: + spectrogram: (batch_size, 1, time_steps, n_fft // 2 + 1) + """ + + (real, imag) = self.stft.forward(input) + # (batch_size, n_fft // 2 + 1, time_steps) + + spectrogram = real ** 2 + imag ** 2 + + if self.power == 2.0: + pass + else: + spectrogram = spectrogram ** (self.power / 2.0) + + return spectrogram + + +class LogmelFilterBank(nn.Layer): + def __init__(self, sr=22050, n_fft=2048, n_mels=64, fmin=0.0, fmax=None, + is_log=True, ref=1.0, amin=1e-10, top_db=80.0, freeze_parameters=True): + r"""Calculate logmel spectrogram using paddle. The mel filter bank is + the paddle implementation of as librosa.filters.mel + """ + super(LogmelFilterBank, self).__init__() + + self.is_log = is_log + self.ref = paddle.to_tensor(ref, dtype="float32") + self.amin = paddle.to_tensor(amin, dtype="float32") + self.top_db = top_db + if fmax == None: + fmax = sr//2 + + self.melW = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, + fmin=fmin, fmax=fmax).T + # (n_fft // 2 + 1, mel_bins) + + self.melW = paddle.to_tensor(self.melW) + self.melW = paddle.create_parameter( + self.melW.shape, + str(self.melW.numpy().dtype), + default_initializer=nn.initializer.Assign(self.melW) + ) + + if freeze_parameters: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, input): + r"""Calculate (log) mel spectrogram from spectrogram. 
+ + Args: + input: (*, n_fft), spectrogram + + Returns: + output: (*, mel_bins), (log) mel spectrogram + """ + + # Mel spectrogram + mel_spectrogram = paddle.matmul(input, self.melW) + # (*, mel_bins) + + # Logmel spectrogram + if self.is_log: + output = self.power_to_db(mel_spectrogram) + else: + output = mel_spectrogram + + return output + + + def power_to_db(self, input): + r"""Power to db, this function is the paddle implementation of + librosa.power_to_lb + """ + ref_value = self.ref + log_spec = 10.0 * paddle.log10(paddle.clip(input, min=self.amin, max=None)) + log_spec -= 10.0 * paddle.log10(paddle.maximum(self.amin, ref_value)) + + if self.top_db is not None: + if self.top_db < 0: + raise librosa.util.exceptions.ParameterError('top_db must be non-negative') + log_spec = paddle.clip(log_spec, min=log_spec.max().item() - self.top_db, max=None) + + return log_spec + +class DropStripes(nn.Layer): + def __init__(self, dim, drop_width, stripes_num): + """Drop stripes. + + Args: + dim: int, dimension along which to drop + drop_width: int, maximum width of stripes to drop + stripes_num: int, how many stripes to drop + """ + super(DropStripes, self).__init__() + + assert dim in [2, 3] # dim 2: time; dim 3: frequency + + self.dim = dim + self.drop_width = drop_width + self.stripes_num = stripes_num + + def forward(self, input): + """input: (batch_size, channels, time_steps, freq_bins)""" + + assert input.ndim == 4 + + if self.training is False: + return input + + else: + batch_size = input.shape[0] + total_width = input.shape[self.dim] + + for n in range(batch_size): + self.transform_slice(input[n], total_width) + + return input + + def transform_slice(self, e, total_width): + """e: (channels, time_steps, freq_bins)""" + + for _ in range(self.stripes_num): + distance = paddle.randint(low=0, high=self.drop_width, shape=(1,))[0] + bgn = paddle.randint(low=0, high=total_width - distance, shape=(1,))[0] + + if self.dim == 2: + e[:, bgn : bgn + distance, :] = 0 + elif self.dim == 3: + e[:, :, bgn : bgn + distance] = 0 + + +class SpecAugmentation(nn.Layer): + def __init__(self, time_drop_width, time_stripes_num, freq_drop_width, + freq_stripes_num): + """Spec augmetation. + [ref] Park, D.S., Chan, W., Zhang, Y., Chiu, C.C., Zoph, B., Cubuk, E.D. + and Le, Q.V., 2019. Specaugment: A simple data augmentation method + for automatic speech recognition. arXiv preprint arXiv:1904.08779. + + Args: + time_drop_width: int + time_stripes_num: int + freq_drop_width: int + freq_stripes_num: int + """ + + super(SpecAugmentation, self).__init__() + + self.time_dropper = DropStripes(dim=2, drop_width=time_drop_width, + stripes_num=time_stripes_num) + + self.freq_dropper = DropStripes(dim=3, drop_width=freq_drop_width, + stripes_num=freq_stripes_num) + + def forward(self, input): + x = self.time_dropper(input) + x = self.freq_dropper(x) + return x diff --git a/paddlemix/models/audioldm2/configuration.py b/paddlemix/models/audioldm2/configuration.py new file mode 100644 index 000000000..7590ed74e --- /dev/null +++ b/paddlemix/models/audioldm2/configuration.py @@ -0,0 +1,204 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Union +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlemix.utils.log import logger + +__all__ = ["AudioLDM2Config"] + +class AudioLDM2Config(PretrainedConfig): + + model_type = "audioldm2" + + def __init__( + self, + model_name: str = "audioldm2-full", + first_stage_key: str = "fbank", + sampling_rate: int = 16000, + parameterization: str = "eps", + log_every_t: int = 200, + latent_t_size: int = 256, + latent_f_size: int = 16, + channels: int = 8, + timesteps: int = 1000, + num_timesteps_cond: int = 1, + linear_start: float = 0.0015, + linear_end: float = 0.0195, + unconditional_prob_cfg: float = 0.1, + device: str = "gpu", + unet_image_size: int = 64, + unet_context_dim: list = [768, 1024], + unet_in_channels: int = 8, + unet_out_channels: int = 8, + unet_model_channels: int = 128, + unet_attention_resolutions: list = [8, 4, 2], + unet_num_res_blocks: int = 2, + unet_channel_mult: list = [1, 2, 3, 5], + unet_num_head_channels: int = 32, + unet_use_spatial_transformer: bool = True, + unet_transformer_depth: int = 1, + autoencoder_sampling_rate: int = 16000, + autoencoder_batchsize: int = 4, + autoencoder_image_key: str = "fbank", + autoencoder_subband: int = 1, + autoencoder_embed_dim: int = 8, + autoencoder_time_shuffle: int = 1, + ddconfig_double_z: bool = True, + ddconfig_mel_bins: int = 64, + ddconfig_z_channels: int = 8, + ddconfig_resolution: int = 256, + ddconfig_downsample_time: bool = False, + ddconfig_in_channels: int = 1, + ddconfig_out_ch: int = 1, + ddconfig_ch: int = 128, + ddconfig_ch_mult: list = [1, 2, 4], + ddconfig_num_res_blocks: int = 2, + ddconfig_attn_resolutions: list = [], + ddconfig_dropout: float = 0.0, + sequence2audiomae_always_output_audiomae_gt: bool = False, + sequence2audiomae_learnable: bool = True, + sequence2audiomae_use_gt_mae_output: bool = True, + sequence2audiomae_use_gt_mae_prob: float = 0.0, + sequence2audiomae_base_learning_rate: float = 0.0002, + sequence2audiomae_sequence_gen_length: int = 8, + sequence2audiomae_use_warmup: bool = True, + sequence2audiomae_sequence_input_key: list = ['film_clap_cond1', 'crossattn_flan_t5'], + sequence2audiomae_sequence_input_embed_dim: list = [512, 1024], + sequence2audiomae_batchsize: int = 16, + sequence2audiomae_cond_stage_configs: dict = None, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + self.first_stage_key = first_stage_key + self.sampling_rate = sampling_rate + self.parameterization = parameterization + self.log_every_t = log_every_t + self.latent_t_size = latent_t_size + self.latent_f_size = latent_f_size + self.channels = channels + self.timesteps = timesteps + self.num_timesteps_cond = num_timesteps_cond + self.linear_start = linear_start + self.linear_end = linear_end + self.unconditional_prob_cfg = unconditional_prob_cfg + self.device = device + + self.unet_config = {} + self.unet_config["target"] = ".unet.openaimodel.UNetModel" + self.unet_config["params"] = {} + self.unet_config["params"]["image_size"] = unet_image_size + self.unet_config["params"]["context_dim"] 
= unet_context_dim + self.unet_config["params"]["in_channels"] = unet_in_channels + self.unet_config["params"]["out_channels"] = unet_out_channels + self.unet_config["params"]["model_channels"] = unet_model_channels + self.unet_config["params"]["attention_resolutions"] = unet_attention_resolutions + self.unet_config["params"]["num_res_blocks"] = unet_num_res_blocks + self.unet_config["params"]["channel_mult"] = unet_channel_mult + self.unet_config["params"]["num_head_channels"] = unet_num_head_channels + self.unet_config["params"]["use_spatial_transformer"] = unet_use_spatial_transformer + self.unet_config["params"]["transformer_depth"] = unet_transformer_depth + + self.first_stage_config = {} + self.first_stage_config["target"] = ".latent_encoder.autoencoder.AudioLDMAutoencoderKL" + self.first_stage_config["params"] = {} + self.first_stage_config["params"]["sampling_rate"] = autoencoder_sampling_rate + self.first_stage_config["params"]["batchsize"] = autoencoder_batchsize + self.first_stage_config["params"]["image_key"] = autoencoder_image_key + self.first_stage_config["params"]["subband"] = autoencoder_subband + self.first_stage_config["params"]["embed_dim"] = autoencoder_embed_dim + self.first_stage_config["params"]["time_shuffle"] = autoencoder_time_shuffle + + self.first_stage_config["params"]["ddconfig"] = {} + self.first_stage_config["params"]["ddconfig"]["double_z"] = ddconfig_double_z + self.first_stage_config["params"]["ddconfig"]["mel_bins"] = ddconfig_mel_bins + self.first_stage_config["params"]["ddconfig"]["z_channels"] = ddconfig_z_channels + self.first_stage_config["params"]["ddconfig"]["resolution"] = ddconfig_resolution + self.first_stage_config["params"]["ddconfig"]["downsample_time"] = ddconfig_downsample_time + self.first_stage_config["params"]["ddconfig"]["in_channels"] = ddconfig_in_channels + self.first_stage_config["params"]["ddconfig"]["out_ch"] = ddconfig_out_ch + self.first_stage_config["params"]["ddconfig"]["ch"] = ddconfig_ch + self.first_stage_config["params"]["ddconfig"]["ch_mult"] = ddconfig_ch_mult + self.first_stage_config["params"]["ddconfig"]["num_res_blocks"] = ddconfig_num_res_blocks + self.first_stage_config["params"]["ddconfig"]["attn_resolutions"] = ddconfig_attn_resolutions + self.first_stage_config["params"]["ddconfig"]["dropout"] = ddconfig_dropout + + self.cond_stage_config = {} + self.cond_stage_config["crossattn_audiomae_generated"] = {} + self.cond_stage_config["crossattn_audiomae_generated"]["cond_stage_key"] = "all" + self.cond_stage_config["crossattn_audiomae_generated"]["conditioning_key"] = "crossattn" + self.cond_stage_config["crossattn_audiomae_generated"]["target"] = ".encoders.sequence2audiomae_encoder.SequenceGenAudioMAECond" # gpt2 + self.cond_stage_config["crossattn_audiomae_generated"]["params"] = {} + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["always_output_audiomae_gt"] = sequence2audiomae_always_output_audiomae_gt + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["learnable"] = sequence2audiomae_learnable + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["use_gt_mae_output"] = sequence2audiomae_use_gt_mae_output + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["use_gt_mae_prob"] = sequence2audiomae_use_gt_mae_prob + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["base_learning_rate"] = sequence2audiomae_base_learning_rate + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["sequence_gen_length"] = 
sequence2audiomae_sequence_gen_length + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["use_warmup"] = sequence2audiomae_use_warmup + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["sequence_input_key"] = sequence2audiomae_sequence_input_key + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["sequence_input_embed_dim"] = sequence2audiomae_sequence_input_embed_dim + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["batchsize"] = sequence2audiomae_batchsize + + if "speech" not in model_name: + self.cond_stage_config["crossattn_flan_t5"] = {} + self.cond_stage_config["crossattn_flan_t5"]["cond_stage_key"] = "text" + self.cond_stage_config["crossattn_flan_t5"]["conditioning_key"] = "crossattn" + self.cond_stage_config["crossattn_flan_t5"]["target"] = ".encoders.flant5_encoder.FlanT5HiddenState" + + if sequence2audiomae_cond_stage_configs is None: + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"] = {} + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["film_clap_cond1"] = {} + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["film_clap_cond1"]["cond_stage_key"] = "text" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["film_clap_cond1"]["conditioning_key"] = "film" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["film_clap_cond1"]["target"] = ".encoders.clap_encoder.CLAPAudioEmbeddingClassifierFreev2" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["film_clap_cond1"]["params"] = {} + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["film_clap_cond1"]["params"]["sampling_rate"] = 48000 + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["film_clap_cond1"]["params"]["embed_mode"] = "text" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["film_clap_cond1"]["params"]["amodel"] = "HTSAT-base" + + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_flan_t5"] = {} + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_flan_t5"]["cond_stage_key"] = "text" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_flan_t5"]["conditioning_key"] = "crossattn" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_flan_t5"]["target"] = ".encoders.flant5_encoder.FlanT5HiddenState" + + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"] = {} + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["cond_stage_key"] = "ta_kaldi_fbank" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["conditioning_key"] = "crossattn" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["target"] = ".encoders.audiomae_encoder.AudioMAEConditionCTPoolRand" + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["params"] = {} + 
self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["params"]["regularization"] = False + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["params"]["no_audiomae_mask"] = True + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["params"]["time_pooling_factors"] = [8] + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["params"]["freq_pooling_factors"] = [8] + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["params"]["eval_time_pooling"] = 8 + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["params"]["eval_freq_pooling"] = 8 + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"]["crossattn_audiomae_pooled"]["params"]["mask_ratio"] = 0 + else: + self.cond_stage_config["crossattn_audiomae_generated"]["params"]["cond_stage_config"] = sequence2audiomae_cond_stage_configs + + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) diff --git a/paddlemix/models/audioldm2/diffusionwrapper.py b/paddlemix/models/audioldm2/diffusionwrapper.py new file mode 100644 index 000000000..37b391b33 --- /dev/null +++ b/paddlemix/models/audioldm2/diffusionwrapper.py @@ -0,0 +1,166 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +from inspect import isfunction +import importlib +import numpy as np + +class DiffusionWrapper(nn.Layer): + def __init__(self, diff_model_config, conditioning_key): + super().__init__() + self.diffusion_model = instantiate_from_config(diff_model_config) + + self.conditioning_key = conditioning_key + + for key in self.conditioning_key: + if ( + "concat" in key + or "crossattn" in key + or "hybrid" in key + or "film" in key + or "noncond" in key + ): + continue + else: + raise ValueError("The conditioning key %s is illegal" % key) + + self.being_verbosed_once = False + + def forward(self, x, t, cond_dict: dict = {}): + # x with condition (or maybe not) + xc = x + + y = None + context_list, attn_mask_list = [], [] + + conditional_keys = cond_dict.keys() + + for key in conditional_keys: + if "concat" in key: + xc = paddle.concat([x, cond_dict[key].unsqueeze(1)], axis=1) + elif "film" in key: + if y is None: + y = cond_dict[key].squeeze(1) + else: + y = paddle.concat([y, cond_dict[key].squeeze(1)], axis=-1) + elif "crossattn" in key: + # assert context is None, "You can only have one context matrix, got %s" % (cond_dict.keys()) + if isinstance(cond_dict[key], dict): + for k in cond_dict[key].keys(): + if "crossattn" in k: + context, attn_mask = cond_dict[key][ + k + ] # crossattn_audiomae_pooled: paddle.Size([12, 128, 768]) + else: + assert len(cond_dict[key]) == 2, ( + "The context condition for %s you returned should have two element, one context one mask" + % (key) + ) + context, attn_mask = cond_dict[key] + + # The input to the UNet model is a list of context matrix + context_list.append(context) + attn_mask_list.append(attn_mask) + + elif ( + "noncond" in key + ): # If you use loss function in the conditional module, include the keyword "noncond" in the return dictionary + continue + else: + raise NotImplementedError() + + out = self.diffusion_model( + xc, t, context_list=context_list, y=y, context_attn_mask_list=attn_mask_list + ) + + return out + +def instantiate_from_config(config): + if not "target" in config: + if config == "__is_first_stage__": + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + return get_obj_from_str(config["target"])(**config.get("params", dict())) + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package="paddlemix.models.audioldm2"), cls) + +def count_params(model, verbose=False): + total_params = sum(p.numel() for p in model.parameters()) + if verbose: + tmp = float(total_params * 1.e-6) + print(f"{model.__class__.__name__} has {tmp:.2f} M params.") + return total_params + +def make_beta_schedule( + schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3 +): + if schedule == "linear": + betas = ( + paddle.linspace( + linear_start**0.5, linear_end**0.5, n_timestep, dtype="float64" + ) + ** 2 + ) + + elif schedule == "cosine": + timesteps = ( + paddle.arange(n_timestep + 1, dtype="float64") / n_timestep + cosine_s + ) + alphas = timesteps / (1 + cosine_s) * np.pi / 2 + alphas = paddle.cos(alphas).pow(2) + alphas = alphas / alphas[0] + betas = 1 - alphas[1:] / alphas[:-1] + betas = np.clip(betas, a_min=0, a_max=0.999) + + elif schedule == "sqrt_linear": + betas = paddle.linspace( + linear_start, linear_end, n_timestep, dtype="float64" + ) + elif schedule == 
"sqrt": + betas = ( + paddle.linspace(linear_start, linear_end, n_timestep, dtype="float64") + ** 0.5 + ) + else: + raise ValueError(f"schedule '{schedule}' unknown.") + return betas.numpy() + +def extract_into_tensor(a, t, x_shape): + b, *_ = t.shape + out = a.gather(t, -1) + return out.reshape((b,) + ((1,) * (len(x_shape) - 1))) + +def noise_like(shape, repeat=False): + repeat_noise = lambda: paddle.randn((1, *shape[1:])).repeat_interleave(repeats=shape[0], axis=0) + noise = lambda: paddle.randn(shape) + return repeat_noise() if repeat else noise() + +def default(val, d): + if val is not None: + return val + return d() if isfunction(d) else d + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self diff --git a/paddlemix/models/audioldm2/encoders/audiomae_encoder.py b/paddlemix/models/audioldm2/encoders/audiomae_encoder.py new file mode 100644 index 000000000..e06e2e687 --- /dev/null +++ b/paddlemix/models/audioldm2/encoders/audiomae_encoder.py @@ -0,0 +1,155 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import numpy as np +from ..audiomae import mae as models_mae + +class Vanilla_AudioMAE(nn.Layer): + """Audio Masked Autoencoder (MAE) pre-trained on AudioSet (for AudioLDM2)""" + + def __init__( + self, + ): + super().__init__() + model = models_mae.__dict__["mae_vit_base_patch16"]( + in_chans=1, audio_exp=True, img_size=(1024, 128) + ) + + self.model = model.eval() + + def forward(self, x, mask_ratio=0.0, no_mask=False, no_average=False): + """ + x: mel fbank [Batch, 1, 1024 (T), 128 (F)] + mask_ratio: 'masking ratio (percentage of removed patches).' 
+ """ + with paddle.no_grad(): + # embed: [B, 513, 768] for mask_ratio=0.0 + if no_mask: + if no_average: + raise RuntimeError("This function is deprecated") + else: + embed = self.model.forward_encoder_no_mask(x) # mask_ratio + else: + raise RuntimeError("This function is deprecated") + return embed + +class AudioMAEConditionCTPoolRand(nn.Layer): + def __init__( + self, + time_pooling_factors=[1, 2, 4, 8], + freq_pooling_factors=[1, 2, 4, 8], + eval_time_pooling=None, + eval_freq_pooling=None, + mask_ratio=0.0, + regularization=False, + no_audiomae_mask=True, + no_audiomae_average=False, + ): + super().__init__() + self.device = None + self.time_pooling_factors = time_pooling_factors + self.freq_pooling_factors = freq_pooling_factors + self.no_audiomae_mask = no_audiomae_mask + self.no_audiomae_average = no_audiomae_average + + self.eval_freq_pooling = eval_freq_pooling + self.eval_time_pooling = eval_time_pooling + self.mask_ratio = mask_ratio + self.use_reg = regularization + + self.audiomae = Vanilla_AudioMAE() + self.audiomae.eval() + for p in self.audiomae.parameters(): + p.stop_gradient = True + + # Required + def get_unconditional_condition(self, batchsize): + param = self.audiomae.parameters()[0] + assert param.stop_gradient == True + + time_pool, freq_pool = min(self.eval_time_pooling, 64), min( + self.eval_freq_pooling, 8 + ) + + token_num = int(512 / (time_pool * freq_pool)) + return [ + paddle.zeros((batchsize, token_num, 768), dtype="float32"), + paddle.ones((batchsize, token_num), dtype="float32"), + ] + + def pool(self, representation, time_pool=None, freq_pool=None): + assert representation.shape[-1] == 768 + representation = representation[:, 1:, :] + perm = list(range(representation.dim())) + new_perm = perm + new_perm[1], new_perm[2] = perm[2], perm[1] + representation = representation.transpose(new_perm) + bs, embedding_dim, token_num = representation.shape + representation = representation.reshape([bs, embedding_dim, 64, 8]) + + if self.training: + if time_pool is None and freq_pool is None: + time_pool = min( + 64, + self.time_pooling_factors[ + np.random.choice(list(range(len(self.time_pooling_factors)))) + ], + ) + freq_pool = min(8, time_pool) # TODO here I make some modification. 
+ else: + time_pool, freq_pool = min(self.eval_time_pooling, 64), min( + self.eval_freq_pooling, 8 + ) + + self.avgpooling = nn.AvgPool2D( + kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool) + ) + self.maxpooling = nn.MaxPool2D( + kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool) + ) + + pooled = ( + self.avgpooling(representation) + self.maxpooling(representation) + ) / 2 # [bs, embedding_dim, time_token_num, freq_token_num] + pooled = pooled.flatten(2).transpose([0, 2, 1]) + return pooled # [bs, token_num, embedding_dim] + + def regularization(self, x): + assert x.shape[-1] == 768 + x = nn.functional.normalize(x, p=2, axis=-1) + return x + + # Required + def forward(self, batch, time_pool=None, freq_pool=None): + assert batch.shape[-2] == 1024 and batch.shape[-1] == 128 + + batch = batch.unsqueeze(1) + with paddle.no_grad(): + representation = self.audiomae( + batch, + mask_ratio=self.mask_ratio, + no_mask=self.no_audiomae_mask, + no_average=self.no_audiomae_average, + ) + + representation = self.pool(representation, time_pool, freq_pool) + if self.use_reg: + representation = self.regularization(representation) + return [ + representation, + paddle.ones((representation.shape[0], representation.shape[1]), dtype="float32"), + ] + \ No newline at end of file diff --git a/paddlemix/models/audioldm2/encoders/clap_encoder.py b/paddlemix/models/audioldm2/encoders/clap_encoder.py new file mode 100644 index 000000000..fe0f35660 --- /dev/null +++ b/paddlemix/models/audioldm2/encoders/clap_encoder.py @@ -0,0 +1,395 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import warnings +from paddle.audio.features import MelSpectrogram +from ..clap_module.clap import create_clap_model +from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer +from typing import Optional + +def get_audio_features( + audio_data, mel, max_len, data_truncating, data_filling, audio_cfg +): + """ + Calculate and add audio features to sample. + Sample: a dict containing all the data of current sample. + audio_data: a tensor of shape (T) containing audio data. + max_len: the maximum length of audio data. + data_truncating: the method of truncating data. + data_filling: the method of filling data. + audio_cfg: a dict containing audio configuration. Comes from model_cfg['audio_cfg']. 
+ """ + sample = {} + + # assert audio_data.size(-1) <= max_len, str(audio_data.size()) + + # split to three parts + chunk_frames = ( + max_len // audio_cfg["hop_size"] + 1 + ) # the +1 related to how the spectrogram is computed + mel = mel[:chunk_frames] + + audio_data = audio_data[..., :max_len] + sample["mel_fusion"] = mel + longer = paddle.to_tensor([True], dtype="bool") + + sample["longer"] = longer + sample["waveform"] = audio_data + + return sample + +def _get_sinc_resample_kernel( + orig_freq: int, + new_freq: int, + gcd: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interp_hann", + beta: Optional[float] = None, + dtype: Optional[paddle.dtype] = None, +): + if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): + raise Exception( + "Frequencies must be of integer type to ensure quality resampling computation. " + ) + + if resampling_method in ["sinc_interpolation", "kaiser_window"]: + method_map = { + "sinc_interpolation": "sinc_interp_hann", + "kaiser_window": "sinc_interp_kaiser", + } + warnings.warn( + f'"{resampling_method}" resampling method name is being deprecated and replaced by ' + f'"{method_map[resampling_method]}" in the next release. ' + "The default behavior remains unchanged.", + stacklevel=3, + ) + elif resampling_method not in ["sinc_interp_hann", "sinc_interp_kaiser"]: + raise ValueError("Invalid resampling method: {}".format(resampling_method)) + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + if lowpass_filter_width <= 0: + raise ValueError("Low pass filter width should be positive.") + base_freq = min(orig_freq, new_freq) + # This will perform antialiasing filtering by removing the highest frequencies. + base_freq *= rolloff + + width = math.ceil(lowpass_filter_width * orig_freq / base_freq) + # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e., + # they will have a lot of almost zero values to the left or to the right... + # There is probably a way to evaluate those filters more efficiently, but this is kept for + # future work. 
+ idx_dtype = dtype if dtype is not None else paddle.float64 + + idx = paddle.arange(-width, width + orig_freq, dtype=idx_dtype)[None, None] / orig_freq + + t = paddle.arange(0, -new_freq, -1, dtype=dtype)[:, None, None] / new_freq + idx + t *= base_freq + t = t.clip_(-lowpass_filter_width, lowpass_filter_width) + + if resampling_method == "sinc_interp_hann": + window = paddle.cos(t * math.pi / lowpass_filter_width / 2) ** 2 + else: + # sinc_interp_kaiser + if beta is None: + beta = 14.769656459379492 + beta_tensor = paddle.to_tensor(float(beta)) + window = paddle.i0(beta_tensor * paddle.sqrt(1 - (t / lowpass_filter_width) ** 2)) / paddle.i0(beta_tensor) + + t *= math.pi + + scale = base_freq / orig_freq + kernels = paddle.where(t == 0, paddle.to_tensor(1.0, dtype=t.dtype), t.sin() / t) + kernels *= window * scale + + if dtype is None: + kernels = paddle.cast(kernels, dtype=paddle.float32) + + return kernels, width + +def _apply_sinc_resample_kernel( + waveform: paddle.Tensor, + orig_freq: int, + new_freq: int, + gcd: int, + kernel: paddle.Tensor, + width: int, +): + if not "float" in str(waveform.dtype): + raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + # pack batch + shape = waveform.shape + waveform = waveform.reshape([-1, shape[-1]]) + + num_wavs, length = waveform.shape + waveform = nn.functional.pad(waveform.unsqueeze(0), (width, width + orig_freq), data_format='NCL').squeeze(0) + resampled = nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq) + perm_shape = list(range(resampled.dim())) + new_perm_shape = perm_shape + new_perm_shape[1], new_perm_shape[2] = perm_shape[2], perm_shape[1] + resampled = resampled.transpose(new_perm_shape).reshape([num_wavs, -1]) + target_length = paddle.cast(paddle.ceil(paddle.to_tensor(new_freq * length / orig_freq)), dtype="int64") + resampled = resampled[..., :target_length] + + # unpack batch + resampled = resampled.reshape(shape[:-1] + resampled.shape[-1:]) + return resampled + + +def resample( + waveform: paddle.Tensor, + orig_freq: int, + new_freq: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interp_hann", + beta: Optional[float] = None, +) -> paddle.Tensor: + r"""Resamples the waveform at the new frequency using bandlimited interpolation. :cite:`RESAMPLE`. + + Note: + ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in + more efficient computation if resampling multiple waveforms with the same resampling parameters. + + Args: + waveform (Tensor): The input signal of dimension `(..., time)` + orig_freq (int): The original frequency of the signal + new_freq (int): The desired frequency + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + resampling_method (str, optional): The resampling method to use. + Options: [``"sinc_interp_hann"``, ``"sinc_interp_kaiser"``] (Default: ``"sinc_interp_hann"``) + beta (float or None, optional): The shape parameter used for kaiser window. 
+ + Returns: + Tensor: The waveform at the new frequency of dimension `(..., time).` + """ + + if orig_freq <= 0.0 or new_freq <= 0.0: + raise ValueError("Original frequency and desired frequecy should be positive") + + if orig_freq == new_freq: + return waveform + + gcd = math.gcd(int(orig_freq), int(new_freq)) + + kernel, width = _get_sinc_resample_kernel( + orig_freq, + new_freq, + gcd, + lowpass_filter_width, + rolloff, + resampling_method, + beta, + waveform.dtype, + ) + resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) + return resampled + +class CLAPAudioEmbeddingClassifierFreev2(nn.Layer): + def __init__( + self, + pretrained_path="", + enable_cuda=False, + sampling_rate=16000, + embed_mode="audio", + amodel="HTSAT-base", + unconditional_prob=0.1, + random_mute=False, + max_random_mute_portion=0.5, + training_mode=True, + ): + super().__init__() + self.device = "cpu" # The model itself is on cpu + self.cuda = enable_cuda + self.precision = "fp32" + self.amodel = amodel # or 'PANN-14' + self.tmodel = "roberta" # the best text encoder in our training + self.enable_fusion = False # False if you do not want to use the fusion model + self.fusion_type = "aff_2d" + self.pretrained = pretrained_path + self.embed_mode = embed_mode + self.embed_mode_orig = embed_mode + self.sampling_rate = sampling_rate + self.unconditional_prob = unconditional_prob + self.random_mute = random_mute + self.tokenize = RobertaTokenizer.from_pretrained("roberta-base") + self.max_random_mute_portion = max_random_mute_portion + self.training_mode = training_mode + self.model, self.model_cfg = create_clap_model( + self.amodel, + self.tmodel, + self.pretrained, + precision=self.precision, + enable_fusion=self.enable_fusion, + fusion_type=self.fusion_type, + ) + self.model = self.model.to(self.device) + audio_cfg = self.model_cfg["audio_cfg"] + self.mel_transform = MelSpectrogram( + sr=audio_cfg["sample_rate"], + n_fft=audio_cfg["window_size"], + hop_length=audio_cfg["hop_size"], + win_length=audio_cfg["window_size"], + power=2.0, + center=True, + pad_mode="reflect", + # onesided=True, + n_mels=64, + f_min=audio_cfg["fmin"], + f_max=audio_cfg["fmax"], + norm=None, + ) + for p in self.model.parameters(): + # p.requires_grad = False + p.stop_gradient = True + self.unconditional_token = None + self.model.eval() + + def get_unconditional_condition(self, batchsize): + self.unconditional_token = self.model.get_text_embedding( + self.tokenizer(["", ""]) + )[0:1] + return paddle.concat([self.unconditional_token.unsqueeze(0)] * batchsize, axis=0) + + def batch_to_list(self, batch): + ret = [] + for i in range(batch.size(0)): + ret.append(batch[i]) + return ret + + def make_decision(self, probability): + if float(paddle.rand([])) < probability: + return True + else: + return False + + def random_uniform(self, start, end): + val = paddle.rand([]).item() + return start + (end - start) * val + + def _random_mute(self, waveform): + # waveform: [bs, t-steps] + t_steps = waveform.shape[-1] + for i in range(waveform.shape[0]): + mute_size = int( + self.random_uniform(0, end=int(t_steps * self.max_random_mute_portion)) + ) + mute_start = int(self.random_uniform(0, t_steps - mute_size)) + waveform[i, mute_start : mute_start + mute_size] = 0 + return waveform + + def cos_similarity(self, waveform, text): + # waveform: [bs, t_steps] + original_embed_mode = self.embed_mode + with paddle.no_grad(): + self.embed_mode = "audio" + audio_emb = self(waveform) + self.embed_mode = "text" + text_emb = 
self(text) + similarity = F.cosine_similarity(audio_emb, text_emb, axis=2) + self.embed_mode = original_embed_mode + return similarity.squeeze() + + def build_unconditional_emb(self): + self.unconditional_token = self.model.get_text_embedding( + self.tokenizer(["", ""]) + )[0:1] + + def forward(self, batch): + # If you want this conditioner to be unconditional, set self.unconditional_prob = 1.0 + # If you want this conditioner to be fully conditional, set self.unconditional_prob = 0.0 + if self.model.training == True and not self.training_mode: + print( + "The pretrained CLAP model should always be in eval mode. Reloading model just in case you change the parameters." + ) + self.model, self.model_cfg = create_clap_model( + self.amodel, + self.tmodel, + self.pretrained, + precision=self.precision, + device="cuda" if self.cuda else "cpu", + enable_fusion=self.enable_fusion, + fusion_type=self.fusion_type, + ) + for p in self.model.parameters(): + # p.requires_grad = False + p.stop_gradient = True + self.model.eval() + + if self.unconditional_token is None: + self.build_unconditional_emb() + + # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode + if self.embed_mode == "audio": + if not self.training: + print("INFO: clap model calculate the audio embedding as condition") + with paddle.no_grad(): + if self.sampling_rate != 48000: + batch = resample( + batch, orig_freq=self.sampling_rate, new_freq=48000 + ) + audio_data = batch.squeeze(1) + mel = self.mel_transform(audio_data) + audio_dict = get_audio_features( + audio_data, + mel, + 480000, + data_truncating="fusion", + data_filling="repeatpad", + audio_cfg=self.model_cfg["audio_cfg"], + ) + # [bs, 512] + embed = self.model.get_audio_embedding(audio_dict) + elif self.embed_mode == "text": + with paddle.no_grad(): + # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode + text_data = self.tokenizer(batch) + + if isinstance(batch, str) or ( + isinstance(batch, list) and len(batch) == 1 + ): + for key in text_data.keys(): + text_data[key] = text_data[key].unsqueeze(0) + + embed = self.model.get_text_embedding(text_data) + + embed = embed.unsqueeze(1) + for i in range(embed.shape[0]): + if self.make_decision(self.unconditional_prob): + embed[i] = self.unconditional_token + return embed.detach() + + def tokenizer(self, text): + result = self.tokenize( + text, + padding="max_length", + truncation=True, + max_length=512, + return_tensors="pd", + return_attention_mask=True, + ) + return {k: v.squeeze(0) for k, v in result.items()} diff --git a/paddlemix/models/audioldm2/encoders/flant5_encoder.py b/paddlemix/models/audioldm2/encoders/flant5_encoder.py new file mode 100644 index 000000000..97b77c111 --- /dev/null +++ b/paddlemix/models/audioldm2/encoders/flant5_encoder.py @@ -0,0 +1,96 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
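
For reference, a minimal usage sketch of the bandlimited `resample` helper defined above. The tone, sample rates, and shapes are illustrative only, and the snippet assumes `resample` from the encoder module above is in scope:

```python
import math
import paddle

# Illustrative only: downsample a 1-second 440 Hz tone from 48 kHz to the 16 kHz
# rate used elsewhere in AudioLDM2, via the `resample` helper defined above
# (assumed importable from the CLAP encoder module).
t = paddle.arange(0, 48000, dtype="float32") / 48000.0
wave_48k = paddle.sin(2.0 * math.pi * 440.0 * t).unsqueeze(0)  # shape [1, 48000]

wave_16k = resample(wave_48k, orig_freq=48000, new_freq=16000)
print(wave_16k.shape)  # expected: [1, 16000]
```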
+ +import logging +import paddle +import paddle.nn as nn +from paddlenlp.transformers import AutoTokenizer, T5EncoderModel, T5Config + +class FlanT5HiddenState(nn.Layer): + """ + llama = FlanT5HiddenState() + data = ["","this is not an empty sentence"] + encoder_hidden_states = llama(data) + import ipdb;ipdb.set_trace() + """ + + def __init__( + self, text_encoder_name="t5-v1_1-large", freeze_text_encoder=True # t5-v1_1-large -> google/flan-t5-large + ): + super().__init__() + self.freeze_text_encoder = freeze_text_encoder + self.tokenizer = AutoTokenizer.from_pretrained(text_encoder_name) + self.model = T5EncoderModel(T5Config.from_pretrained(text_encoder_name)) + if freeze_text_encoder: + self.model.eval() + for p in self.model.parameters(): + p.stop_gradient = True + else: + print("=> The text encoder is learnable") + + self.empty_hidden_state_cfg = None + self.device = None + + # Required + def get_unconditional_condition(self, batchsize): + param = self.model.parameters()[0] + if self.freeze_text_encoder: + assert param.stop_gradient == True + + # device = param.device + if self.empty_hidden_state_cfg is None: + self.empty_hidden_state_cfg, _ = self([""]) + + hidden_state = paddle.cast(paddle.concat([self.empty_hidden_state_cfg] * batchsize), dtype="float32") + attention_mask = ( + paddle.ones((batchsize, hidden_state.shape[1]), dtype="float32") + ) + return [hidden_state, attention_mask] # Need to return float type + + def forward(self, batch): + param = self.model.parameters()[0] + if self.freeze_text_encoder: + assert param.stop_gradient == True + + try: + return self.encode_text(batch) + except Exception as e: + print(e, batch) + logging.exception("An error occurred: %s", str(e)) + + def encode_text(self, prompt): + # device = self.model.device + batch = self.tokenizer( + prompt, + max_length=128, # self.tokenizer.model_max_length + padding=True, + truncation=True, + return_tensors="pd", + ) + input_ids, attention_mask = batch.input_ids, batch.attention_mask + # Get text encoding + if self.freeze_text_encoder: + with paddle.no_grad(): + encoder_hidden_states = self.model( + input_ids=input_ids, attention_mask=attention_mask + )[0] + else: + encoder_hidden_states = self.model( + input_ids=input_ids, attention_mask=attention_mask + )[0] + return [ + encoder_hidden_states.detach(), + paddle.cast(attention_mask, dtype="float32"), + ] # Attention mask == 1 means usable token + \ No newline at end of file diff --git a/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py b/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py new file mode 100644 index 000000000..fd05a9208 --- /dev/null +++ b/paddlemix/models/audioldm2/encoders/phoneme_encoder/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
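
A hedged sketch of how the `FlanT5HiddenState` conditioner above can be exercised. Note that, as written, the constructor only fetches the `t5-v1_1-large` config and tokenizer; the encoder weights themselves are expected to arrive with the AudioLDM2 checkpoint loaded elsewhere, so the values below are only shape-correct:

```python
from paddlemix.models.audioldm2.encoders.flant5_encoder import FlanT5HiddenState

# Sketch only: encode a prompt and inspect the hidden states / attention mask that
# the latent diffusion model later consumes as a cross-attention condition.
encoder = FlanT5HiddenState(text_encoder_name="t5-v1_1-large", freeze_text_encoder=True)
hidden_states, attention_mask = encoder(["a dog barking in the distance"])
print(hidden_states.shape)   # [1, seq_len, hidden_dim]
print(attention_mask.shape)  # [1, seq_len]

# Unconditional branch used for classifier-free guidance:
uncond_hidden, uncond_mask = encoder.get_unconditional_condition(batchsize=2)
```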
diff --git a/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py b/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py new file mode 100644 index 000000000..5433a9e8b --- /dev/null +++ b/paddlemix/models/audioldm2/encoders/phoneme_encoder/cleaners.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" from https://github.com/keithito/tacotron """ + +import re +from unidecode import unidecode +from phonemizer import phonemize + +__all__ = [ + "basic_cleaners", + "transliteration_cleaners", + "english_cleaners", + "english_cleaners2" +] + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, including abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_abbreviations(text) + phonemes = phonemize(text, language='en-us', backend='espeak', strip=True) + phonemes = collapse_whitespace(phonemes) + return phonemes + + +def english_cleaners2(text): + '''Pipeline for English text, including abbreviation expansion. 
+ punctuation + stress''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_abbreviations(text) + phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True) + phonemes = collapse_whitespace(phonemes) + return phonemes \ No newline at end of file diff --git a/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py b/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py new file mode 100644 index 000000000..fd5abd88c --- /dev/null +++ b/paddlemix/models/audioldm2/encoders/phoneme_encoder/symbols.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +''' +Defines the set of symbols used in text input to the model. +''' +_pad = '_' +_punctuation = ';:,.!?¡¿—…"«»“” ' +_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' +_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" + + +# Export all symbols: +symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) + +# Special symbol ids +SPACE_ID = symbols.index(" ") diff --git a/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py b/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py new file mode 100644 index 000000000..efbd661b5 --- /dev/null +++ b/paddlemix/models/audioldm2/encoders/phoneme_encoder/text.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" from https://github.com/keithito/tacotron """ + +from .cleaners import * +from .symbols import symbols + +# Mappings from symbol to numeric ID and vice versa: +_symbol_to_id = {s: i for i, s in enumerate(symbols)} +_id_to_symbol = {i: s for i, s in enumerate(symbols)} + +cleaner = english_cleaners2 + +def text_to_sequence(text, cleaner_names): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
+ Args: + text: string to convert to a sequence + cleaner_names: names of the cleaner functions to run the text through + Returns: + List of integers corresponding to the symbols in the text + ''' + sequence = [] + + clean_text = _clean_text(text, cleaner_names) + for symbol in clean_text: + symbol_id = _symbol_to_id[symbol] + sequence += [symbol_id] + return sequence + +def cleaned_text_to_sequence(cleaned_text): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + Args: + text: string to convert to a sequence + Returns: + List of integers corresponding to the symbols in the text + ''' + sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] + return sequence + +def sequence_to_text(sequence): + '''Converts a sequence of IDs back to a string''' + result = '' + for symbol_id in sequence: + s = _id_to_symbol[symbol_id] + result += s + return result + +def _clean_text(text, cleaner_names): + text = cleaner(text) + return text diff --git a/paddlemix/models/audioldm2/encoders/sequence2audiomae_encoder.py b/paddlemix/models/audioldm2/encoders/sequence2audiomae_encoder.py new file mode 100644 index 000000000..4122bbbff --- /dev/null +++ b/paddlemix/models/audioldm2/encoders/sequence2audiomae_encoder.py @@ -0,0 +1,487 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
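
Before the Sequence2AudioMAE module below, a small sketch of the phoneme pipeline from `cleaners.py` and `text.py` above. It assumes the `phonemizer` package and an espeak backend are installed (as `english_cleaners2` requires); the prompt is illustrative:

```python
from paddlemix.models.audioldm2.encoders.phoneme_encoder import text as phoneme_text

# Clean the prompt (ASCII conversion, abbreviation expansion, espeak phonemization)
# and map each phoneme symbol to its integer id, then map the ids back again.
ids = phoneme_text.text_to_sequence("Dr. Smith is speaking.", ["english_cleaners2"])
print(ids[:10])
print(phoneme_text.sequence_to_text(ids))  # IPA phoneme string with stress marks
```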
+ +import paddle +import paddle.nn as nn +from paddlenlp.transformers import GPTModel +import importlib + +class Sequence2AudioMAE(nn.Layer): + def __init__( + self, + base_learning_rate, + sequence_gen_length, + sequence_input_key, + sequence_input_embed_dim, + cond_stage_config, + optimizer_type="AdamW", + use_warmup=True, + use_ar_gen_loss=False, + use_audiomae_linear=False, + target_tokens_mask_ratio=0.0, + random_mask_ratio=False, + **kwargs + ): + super().__init__() + assert use_audiomae_linear == False + self.random_mask_ratio = random_mask_ratio + self.learning_rate = base_learning_rate + self.cond_stage_config = cond_stage_config + self.use_audiomae_linear = use_audiomae_linear + self.optimizer_type = optimizer_type + self.use_warmup = use_warmup + self.use_ar_gen_loss = use_ar_gen_loss + # Even though the LDM can be conditioned on mutliple pooling rate + # Our model always predict the higest pooling rate + + self.mae_token_num = sequence_gen_length + self.sequence_input_key = sequence_input_key + self.sequence_input_embed_dim = sequence_input_embed_dim + self.target_tokens_mask_ratio = target_tokens_mask_ratio + + self.start_of_sequence_tokens = nn.Embedding(32, 768) + self.end_of_sequence_tokens = nn.Embedding(32, 768) + + self.input_sequence_embed_linear = nn.LayerList([]) + self.initial_learning_rate = None + + for dim in self.sequence_input_embed_dim: + self.input_sequence_embed_linear.append(nn.Linear(dim, 768)) + + self.cond_stage_models = nn.LayerList([]) + self.instantiate_cond_stage(cond_stage_config) + self.initialize_param_check_toolkit() + + self.model = GPTModel.from_pretrained("gpt2") + + self.loss_fn = nn.L1Loss() + + self.logger_save_dir = None + self.logger_exp_name = None + self.logger_exp_group_name = None + self.logger_version = None + + def set_log_dir(self, save_dir, exp_group_name, exp_name): + self.logger_save_dir = save_dir + self.logger_exp_group_name = exp_group_name + self.logger_exp_name = exp_name + + def cfg_uncond(self, batch_size): + unconditional_conditioning = {} + for key in self.cond_stage_model_metadata: + model_idx = self.cond_stage_model_metadata[key]["model_idx"] + unconditional_conditioning[key] = self.cond_stage_models[ + model_idx + ].get_unconditional_condition(batch_size) + assert ( + "crossattn_audiomae_pooled" in unconditional_conditioning.keys() + ), "The module is not initialized with AudioMAE" + unconditional_conditioning[ + "crossattn_clap_to_audiomae_feature" + ] = unconditional_conditioning["crossattn_audiomae_pooled"] + return unconditional_conditioning + + def add_sos_eos_tokens(self, _id, sequence, attn_mask): + batchsize = sequence.shape[0] + + new_attn_mask_step = paddle.ones((batchsize, 1)) + key_id = paddle.to_tensor([_id]) + + # Add two more steps to attn mask + new_attn_mask = paddle.concat( + [new_attn_mask_step, attn_mask, new_attn_mask_step], axis=1 + ) + + # Add two more tokens in the sequence + sos_token = self.start_of_sequence_tokens(key_id).expand([batchsize, 1, -1]) + eos_token = self.end_of_sequence_tokens(key_id).expand([batchsize, 1, -1]) + new_sequence = paddle.concat([sos_token, sequence, eos_token], axis=1) + return new_sequence, new_attn_mask + + def truncate_sequence_and_mask(self, sequence, mask, max_len=512): + if sequence.shape[1] > max_len: + print( + "The input sequence length to GPT-2 model is too long:", + sequence.shape[1], + ) + return sequence[:, :max_len], mask[:, :max_len] + else: + return sequence, mask + + def get_input_sequence_and_mask(self, cond_dict): + input_embeds = None + 
input_embeds_attn_mask = None + for _id, sequence_key in enumerate(self.sequence_input_key): + assert sequence_key in cond_dict.keys(), ( + "Invalid sequence key %s" % sequence_key + ) + cond_embed = cond_dict[sequence_key] + if isinstance(cond_embed, list): + assert ( + len(cond_embed) == 2 + ), "The crossattn returned list should have length 2, including embed and attn_mask" + item_input_embeds, item_attn_mask = cond_embed + + item_input_embeds = self.input_sequence_embed_linear[_id]( + item_input_embeds + ) + + item_input_embeds, item_attn_mask = self.add_sos_eos_tokens( + _id, item_input_embeds, item_attn_mask + ) + + if input_embeds is None and input_embeds_attn_mask is None: + input_embeds, input_embeds_attn_mask = ( + item_input_embeds, + item_attn_mask, + ) + else: + input_embeds = paddle.concat( + [input_embeds, item_input_embeds], axis=1 + ) # The 1-st dimension is time steps + input_embeds_attn_mask = paddle.concat( + [input_embeds_attn_mask, item_attn_mask], axis=1 + ) # The 1-st dimension is time steps + else: + assert isinstance(cond_embed, paddle.Tensor) + cond_embed = self.input_sequence_embed_linear[_id](cond_embed) + attn_mask = paddle.ones((cond_embed.shape[0], cond_embed.shape[1])) + + item_input_embeds, item_attn_mask = self.add_sos_eos_tokens( + _id, cond_embed, attn_mask + ) + + if input_embeds is None and input_embeds_attn_mask is None: + input_embeds, input_embeds_attn_mask = ( + item_input_embeds, + item_attn_mask, + ) + else: + input_embeds, input_embeds_attn_mask = paddle.concat( + [input_embeds, item_input_embeds], axis=1 + ), paddle.concat([input_embeds_attn_mask, item_attn_mask], axis=1) + + assert input_embeds is not None and input_embeds_attn_mask is not None + + input_embeds, input_embeds_attn_mask = self.truncate_sequence_and_mask( + input_embeds, input_embeds_attn_mask, int(1024 - self.mae_token_num) + ) + cond_sequence_end_time_idx = input_embeds.shape[ + 1 + ] # The index that we start to collect the output embeds + + return input_embeds, input_embeds_attn_mask, cond_sequence_end_time_idx + + def mask_target_sequence(self, target_embeds, target_embeds_attn_mask): + time_seq_mask = None + if self.target_tokens_mask_ratio > 1e-4: + batchsize, time_seq_len, embed_dim = target_embeds.shape + _, time_seq_len = target_embeds_attn_mask.shape + # Generate random mask + if self.random_mask_ratio: + mask_ratio = paddle.rand((1,)).item() * self.target_tokens_mask_ratio + else: + mask_ratio = self.target_tokens_mask_ratio + + time_seq_mask = (paddle.rand((batchsize, time_seq_len)) > mask_ratio) + + # Mask the target embedding + target_embeds = target_embeds * time_seq_mask.unsqueeze(-1) + target_embeds_attn_mask = target_embeds_attn_mask * time_seq_mask + return target_embeds, target_embeds_attn_mask, time_seq_mask + + def generate_partial(self, batch, cond_dict=None, no_grad=False): + if cond_dict is None: + cond_dict = self.get_input(batch) + + print("Generate partially prompted audio with in-context learning") + + target_embeds, target_embeds_attn_mask = ( + cond_dict["crossattn_audiomae_pooled"][0], + cond_dict["crossattn_audiomae_pooled"][1], + ) + + target_time_steps = target_embeds.shape[1] + + ( + input_embeds, + input_embeds_attn_mask, + cond_sequence_end_time_idx, + ) = self.get_input_sequence_and_mask(cond_dict) + + model_input = paddle.concat( + [input_embeds, target_embeds[:, : target_time_steps // 4, :]], axis=1 + ) + model_input_mask = paddle.concat( + [ + input_embeds_attn_mask, + target_embeds_attn_mask[:, : target_time_steps // 4], + ], + axis=1, 
+ ) + + steps = self.mae_token_num + + for _ in range(3 * steps // 4): + output = self.model( + inputs_embeds=model_input, attention_mask=model_input_mask, return_dict=True + )["last_hidden_state"] + # Update the model input + model_input = paddle.concat([model_input, output[:, -1:, :]], axis=1) + # Update the attention mask + attention_mask_new_step = paddle.ones((model_input_mask.shape[0], 1)) + model_input_mask = paddle.concat( + [model_input_mask, attention_mask_new_step], axis=1 + ) + + output = model_input[:, cond_sequence_end_time_idx:] + + return output, cond_dict + + def generate(self, batch, cond_dict=None, no_grad=False): + if cond_dict is None: + cond_dict = self.get_input(batch) + + ( + input_embeds, + input_embeds_attn_mask, + cond_sequence_end_time_idx, + ) = self.get_input_sequence_and_mask(cond_dict) + model_input = input_embeds + model_input_mask = input_embeds_attn_mask + + steps = self.mae_token_num + + for _ in range(steps): + output = self.model( + inputs_embeds=model_input, attention_mask=model_input_mask, return_dict=True + )["last_hidden_state"] + # Update the model input + model_input = paddle.concat([model_input, output[:, -1:, :]], axis=1) + # Update the attention mask + attention_mask_new_step = paddle.ones((model_input_mask.shape[0], 1)) + model_input_mask = paddle.concat( + [model_input_mask, attention_mask_new_step], axis=1 + ) + + return model_input[:, cond_sequence_end_time_idx:], cond_dict + + def get_input_item(self, batch, k): + fname, text, waveform, stft, fbank = ( + batch["fname"], + batch["text"], + batch["waveform"], + batch["stft"], + batch["log_mel_spec"], + ) + ret = {} + + ret["fbank"] = ( + paddle.cast(fbank.unsqueeze(1), dtype="float32") + ) + ret["stft"] = paddle.cast(stft, dtype="float32") + ret["waveform"] = paddle.cast(waveform, dtype="float32") + ret["text"] = list(text) + ret["fname"] = fname + + for key in batch.keys(): + if key not in ret.keys(): + ret[key] = batch[key] + + return ret[k] + + def get_input(self, batch): + cond_dict = {} + if len(self.cond_stage_model_metadata.keys()) > 0: + unconditional_cfg = False + + for cond_model_key in self.cond_stage_model_metadata.keys(): + cond_stage_key = self.cond_stage_model_metadata[cond_model_key][ + "cond_stage_key" + ] + + # The original data for conditioning + xc = self.get_input_item(batch, cond_stage_key) + if type(xc) == paddle.Tensor: + xc = xc + + c = self.get_learned_conditioning( + xc, key=cond_model_key, unconditional_cfg=unconditional_cfg + ) + cond_dict[cond_model_key] = c + + return cond_dict + + def instantiate_cond_stage(self, config): + self.cond_stage_model_metadata = {} + + for i, cond_model_key in enumerate(config.keys()): + model = instantiate_from_config(config[cond_model_key]) + self.cond_stage_models.append(model) + self.cond_stage_model_metadata[cond_model_key] = { + "model_idx": i, + "cond_stage_key": config[cond_model_key]["cond_stage_key"], + "conditioning_key": config[cond_model_key]["conditioning_key"], + } + + def get_learned_conditioning(self, c, key, unconditional_cfg): + assert key in self.cond_stage_model_metadata.keys() + + # Classifier-free guidance + if not unconditional_cfg: + c = self.cond_stage_models[ + self.cond_stage_model_metadata[key]["model_idx"] + ](c) + else: + if isinstance(c, paddle.Tensor): + batchsize = c.shape[0] + elif isinstance(c, list): + batchsize = len(c) + else: + raise NotImplementedError() + c = self.cond_stage_models[ + self.cond_stage_model_metadata[key]["model_idx"] + ].get_unconditional_condition(batchsize) + + return c + 
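
The autoregressive loop in `generate` and `generate_partial` above can be summarized with a toy stand-in: at every step the model's last hidden state is appended to the input sequence and the attention mask grows by one. `TinyLM` below is hypothetical and only mimics the shapes; the real module uses the pretrained GPT-2 backbone:

```python
import paddle
import paddle.nn as nn

class TinyLM(nn.Layer):
    """Hypothetical stand-in for the GPT-2 backbone; returns one hidden state per input step."""

    def __init__(self, dim=768):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, inputs_embeds, attention_mask):
        # Plays the role of `last_hidden_state` in the real model.
        return self.proj(inputs_embeds)

model = TinyLM()
seq = paddle.randn([2, 5, 768])   # conditioning prefix: batch of 2, 5 embedded steps
mask = paddle.ones([2, 5])
prefix_len = seq.shape[1]

for _ in range(8):                # 8 generated steps instead of self.mae_token_num
    out = model(seq, mask)
    seq = paddle.concat([seq, out[:, -1:, :]], axis=1)         # append newest prediction
    mask = paddle.concat([mask, paddle.ones([2, 1])], axis=1)  # grow the attention mask

generated = seq[:, prefix_len:]   # [2, 8, 768], analogous to the slice after cond_sequence_end_time_idx
```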
+ def initialize_param_check_toolkit(self): + self.tracked_steps = 0 + self.param_dict = {} + + def statistic_require_grad_tensor_number(self, module, name=None): + requires_grad_num = 0 + total_num = 0 + require_grad_tensor = None + for p in module.parameters(): + if not p.stop_gradient: + requires_grad_num += 1 + if require_grad_tensor is None: + require_grad_tensor = p + total_num += 1 + print( + "Module: [%s] have %s trainable parameters out of %s total parameters (%.2f)" + % (name, requires_grad_num, total_num, requires_grad_num / total_num) + ) + return require_grad_tensor + + +class SequenceGenAudioMAECond(Sequence2AudioMAE): + def __init__( + self, + cond_stage_config, + base_learning_rate, + sequence_gen_length, + sequence_input_key, + sequence_input_embed_dim, + batchsize, + always_output_audiomae_gt=False, + pretrained_path=None, + force_reload_pretrain_avoid_overwrite=False, + learnable=True, + use_warmup=True, + use_gt_mae_output=True, # False: does not use AudioMAE GT, True: Use AudioMAE GT + use_gt_mae_prob=0.0, + ): # The prob of using AudioMAE GT + if use_warmup: + use_warmup = False + + super().__init__( + base_learning_rate=base_learning_rate, + cond_stage_config=cond_stage_config, + sequence_gen_length=sequence_gen_length, + sequence_input_key=sequence_input_key, + use_warmup=use_warmup, + sequence_input_embed_dim=sequence_input_embed_dim, + batchsize=batchsize, + ) + + assert use_gt_mae_output is not None and use_gt_mae_prob is not None + self.always_output_audiomae_gt = always_output_audiomae_gt + self.force_reload_pretrain_avoid_overwrite = ( + force_reload_pretrain_avoid_overwrite + ) + self.pretrained_path = pretrained_path + if self.force_reload_pretrain_avoid_overwrite: + self.is_reload = False + else: + self.is_reload = True + + self.load_pretrain_model() + + self.use_gt_mae_output = use_gt_mae_output + self.use_gt_mae_prob = use_gt_mae_prob + self.learnable = learnable + + if not learnable: + # Only optimize the GPT2 model + for p in self.model.parameters(): + p.stop_gradient = True + self.eval() + + def load_pretrain_model(self): + if self.pretrained_path is not None: + print("Reload SequenceGenAudioMAECond from %s" % self.pretrained_path) + state_dict = paddle.load(self.pretrained_path)["state_dict"] + self.load_dict(state_dict) + + # Required + def get_unconditional_condition(self, batchsize): + return_dict = self.cfg_uncond(batchsize) + return_dict["crossattn_audiomae_generated"] = [ + return_dict["crossattn_audiomae_pooled"][0], + paddle.ones_like(return_dict["crossattn_audiomae_pooled"][1], dtype="float32"), + ] + return return_dict + + def forward(self, batch): + # The conditional module can return both tensor or dictionaries + # The returned tensor will be corresponding to the cond_stage_key + # The returned dict will have keys that correspond to the cond_stage_key + ret_dict = {} + + if self.force_reload_pretrain_avoid_overwrite and not self.is_reload: + self.load_pretrain_model() + self.is_reload = True + + input_embeds, cond_dict = self.generate(batch) + input_embeds_mask = ( + paddle.ones((input_embeds.shape[0], input_embeds.shape[1]), dtype="float32") + ) + ret_dict["crossattn_audiomae_generated"] = [ + input_embeds, + input_embeds_mask, + ] # Input sequence and mask + + # If the following two keys are not in cond_stage_key, then they will not be used as condition + for key in cond_dict.keys(): + ret_dict[key] = cond_dict[key] + + return ret_dict + +def instantiate_from_config(config): + if not "target" in config: + if config == 
"__is_first_stage__": + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + return get_obj_from_str(config["target"])(**config.get("params", dict())) + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package="paddlemix.models.audioldm2"), cls) diff --git a/paddlemix/models/audioldm2/hifigan/model.py b/paddlemix/models/audioldm2/hifigan/model.py new file mode 100644 index 000000000..d0df98101 --- /dev/null +++ b/paddlemix/models/audioldm2/hifigan/model.py @@ -0,0 +1,333 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn.utils import weight_norm, remove_weight_norm +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv1D, Conv1DTranspose + +LRELU_SLOPE = 0.1 + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def get_vocoder_config(): + return { + "resblock": "1", + "num_gpus": 6, + "batch_size": 16, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + "upsample_rates": [5, 4, 2, 2, 2], + "upsample_kernel_sizes": [16, 16, 8, 4, 4], + "upsample_initial_channel": 1024, + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "segment_size": 8192, + "num_mels": 64, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 160, + "win_size": 1024, + "sampling_rate": 16000, + "fmin": 0, + "fmax": 8000, + "fmax_for_loss": None, + "num_workers": 4, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1, + }, + } + + +def get_vocoder_config_48k(): + return { + "resblock": "1", + "num_gpus": 8, + "batch_size": 128, + "learning_rate": 0.0001, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [6,5,4,2,2], + "upsample_kernel_sizes": [12,10,8,4,4], + "upsample_initial_channel": 1536, + "resblock_kernel_sizes": [3,7,11,15], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5], [1,3,5]], + + "segment_size": 15360, + "num_mels": 256, + "n_fft": 2048, + "hop_size": 480, + "win_size": 2048, + + "sampling_rate": 48000, + + "fmin": 20, + "fmax": 24000, + "fmax_for_loss": None, + + "num_workers": 8, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:18273", + "world_size": 1 + } + } + + +class ResBlock(nn.Layer): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock, self).__init__() + self.h = h + weight_attr1 = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=0.01) + ) + weight_attr2 = 
paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=0.01) + ) + weight_attr3 = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=0.01) + ) + self.convs1 = nn.LayerList( + [ + weight_norm( + Conv1D( + channels, + channels, + kernel_size, + 1, + padding=get_padding(kernel_size, dilation[0]), + dilation=dilation[0], + weight_attr=weight_attr1, + ) + ), + weight_norm( + Conv1D( + channels, + channels, + kernel_size, + 1, + padding=get_padding(kernel_size, dilation[1]), + dilation=dilation[1], + weight_attr=weight_attr2, + ) + ), + weight_norm( + Conv1D( + channels, + channels, + kernel_size, + 1, + padding=get_padding(kernel_size, dilation[2]), + dilation=dilation[2], + weight_attr=weight_attr3, + ) + ), + ] + ) + + weight_attr4 = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=0.01) + ) + weight_attr5 = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=0.01) + ) + weight_attr6 = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=0.01) + ) + self.convs2 = nn.LayerList( + [ + weight_norm( + Conv1D( + channels, + channels, + kernel_size, + 1, + padding=get_padding(kernel_size, 1), + dilation=1, + weight_attr=weight_attr4, + ) + ), + weight_norm( + Conv1D( + channels, + channels, + kernel_size, + 1, + padding=get_padding(kernel_size, 1), + dilation=1, + weight_attr=weight_attr5, + ) + ), + weight_norm( + Conv1D( + channels, + channels, + kernel_size, + 1, + padding=get_padding(kernel_size, 1), + dilation=1, + weight_attr=weight_attr6, + ) + ), + ] + ) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class Generator(nn.Layer): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm( + Conv1D(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3) + ) + resblock = ResBlock + + self.ups = nn.LayerList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + weight_attr_tmp = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=0.01) + ) + self.ups.append( + weight_norm( + Conv1DTranspose( + h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + weight_attr=weight_attr_tmp, + ) + ) + ) + + self.resblocks = nn.LayerList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) + ): + self.resblocks.append(resblock(h, ch, k, d)) + + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=0.01) + ) + self.conv_post = weight_norm(Conv1D(ch, 1, 7, 1, padding=3, weight_attr=weight_attr)) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = paddle.tanh(x) + + return x + + def remove_weight_norm(self): + # 
print("Removing weight norm...") + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +def get_vocoder(config, mel_bins): + if(mel_bins == 64): + config = get_vocoder_config() + config = AttrDict(config) + vocoder = Generator(config) + vocoder.eval() + vocoder.remove_weight_norm() + else: + config = get_vocoder_config_48k() + config = AttrDict(config) + vocoder = Generator(config) + vocoder.eval() + vocoder.remove_weight_norm() + + return vocoder + + +def vocoder_infer(mels, vocoder, lengths=None): + with paddle.no_grad(): + wavs = vocoder(mels).squeeze(1) + + wavs = (wavs.numpy() * 32768).astype("int16") + + if lengths is not None: + wavs = wavs[:, :lengths] + + return wavs + + +def synth_one_sample(mel_input, mel_prediction, labels, vocoder): + if vocoder is not None: + + wav_reconstruction = vocoder_infer( + mel_input.transpose([0, 2, 1]), + vocoder, + ) + wav_prediction = vocoder_infer( + mel_prediction.transpose([0, 2, 1]), + vocoder, + ) + else: + wav_reconstruction = wav_prediction = None + + return wav_reconstruction, wav_prediction diff --git a/paddlemix/models/audioldm2/latent_encoder/autoencoder.py b/paddlemix/models/audioldm2/latent_encoder/autoencoder.py new file mode 100644 index 000000000..31aac6f2d --- /dev/null +++ b/paddlemix/models/audioldm2/latent_encoder/autoencoder.py @@ -0,0 +1,140 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import numpy as np + +from ppdiffusers import AutoencoderKL +from ..hifigan.model import get_vocoder + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = paddle.chunk(parameters, 2, axis=1) + self.logvar = paddle.clip(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = paddle.exp(0.5 * self.logvar) + self.var = paddle.exp(self.logvar) + if self.deterministic: + self.var = self.std = paddle.zeros_like(self.mean) + + def sample(self): + x = self.mean + self.std * paddle.randn(self.mean.shape) + return x + + def kl(self, other=None): + if self.deterministic: + return paddle.to_tensor([0.0]) + else: + if other is None: + return 0.5 * paddle.mean( + paddle.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3], + ) + else: + return 0.5 * paddle.mean( + paddle.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + dim=[1, 2, 3], + ) + + def nll(self, sample, dims=[1, 2, 3]): + if self.deterministic: + return paddle.to_tensor([0.0]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * paddle.sum( + logtwopi + self.logvar + paddle.pow(sample - self.mean, 2) / self.var, + dim=dims, + ) + + def mode(self): + return self.mean + + +class AudioLDMAutoencoderKL(AutoencoderKL): + def __init__( + self, + ddconfig=None, + lossconfig=None, + batchsize=None, + embed_dim=None, + time_shuffle=1, + subband=1, + sampling_rate=16000, + reload_from_ckpt=None, + ignore_keys=[], + image_key="fbank", + colorize_nlabels=None, + monitor=None, + base_learning_rate=1e-5, + ): + super().__init__( + in_channels = ddconfig["in_channels"], + out_channels = ddconfig["out_ch"], + down_block_types = ("DownEncoderBlock2D",) * len(ddconfig["ch_mult"]), + up_block_types = ("UpDecoderBlock2D",) * len(ddconfig["ch_mult"]), + block_out_channels = tuple([ddconfig["ch"]*i for i in ddconfig["ch_mult"]]), + layers_per_block = ddconfig["num_res_blocks"], + latent_channels = ddconfig["z_channels"], + ) + self.automatic_optimization = False + assert ( + "mel_bins" in ddconfig.keys() + ), "mel_bins is not specified in the Autoencoder config" + num_mel = ddconfig["mel_bins"] + self.image_key = image_key + self.sampling_rate = sampling_rate + + self.loss = None + self.subband = int(subband) + + if self.subband > 1: + print("Use subband decomposition %s" % self.subband) + + if self.image_key == "fbank": + self.vocoder = get_vocoder(None, num_mel) + self.embed_dim = embed_dim + if colorize_nlabels is not None: + assert type(colorize_nlabels) == int + self.register_buffer("colorize", paddle.randn([3, colorize_nlabels, 1, 1])) + if monitor is not None: + self.monitor = monitor + self.learning_rate = float(base_learning_rate) + # print("Initial learning rate %s" % self.learning_rate) + + self.time_shuffle = time_shuffle + self.reload_from_ckpt = reload_from_ckpt + self.reloaded = False + self.mean, self.std = None, None + + self.feature_cache = None + self.flag_first_run = True + self.train_step = 0 + + self.logger_save_dir = None + self.logger_exp_name = None + + def encode(self, x): + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z): + z = self.post_quant_conv(z) + dec = self.decoder(z) + return dec diff --git a/paddlemix/models/audioldm2/latentdiffusion_samplers.py b/paddlemix/models/audioldm2/latentdiffusion_samplers.py new file mode 100644 
index 000000000..561e3dcab --- /dev/null +++ b/paddlemix/models/audioldm2/latentdiffusion_samplers.py @@ -0,0 +1,870 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import numpy as np +from tqdm import tqdm + +def extract_into_tensor(a, t, x_shape): + b, *_ = t.shape + out = a.gather(t, -1) + # return out.reshape(b, *((1,) * (len(x_shape) - 1))) + return out.reshape((b,) + ((1,) * (len(x_shape) - 1))) + +def noise_like(shape, repeat=False): + repeat_noise = lambda: paddle.randn((1, *shape[1:])).repeat_interleave(repeats=shape[0], axis=0) + noise = lambda: paddle.randn(shape) + return repeat_noise() if repeat else noise() + +def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True): + # select alphas for computing the variance schedule + alphas = alphacums[ddim_timesteps] + alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()) + + # according the the formula provided in https://arxiv.org/abs/2010.02502 + sigmas = eta * np.sqrt( + (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev) + ) + if verbose: + print( + f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}" + ) + print( + f"For the chosen value of eta, which is {eta}, " + f"this results in the following sigma_t schedule for ddim sampler {sigmas}" + ) + return sigmas, alphas, alphas_prev + +def make_ddim_timesteps( + ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True +): + if ddim_discr_method == "uniform": + c = num_ddpm_timesteps // num_ddim_timesteps + ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) + elif ddim_discr_method == "quad": + ddim_timesteps = ( + (np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps)) ** 2 + ).astype(int) + else: + raise NotImplementedError( + f'There is no ddim discretization method called "{ddim_discr_method}"' + ) + + # assert ddim_timesteps.shape[0] == num_ddim_timesteps + # add one to get the final alpha values right (the ones from first scale to data during sampling) + steps_out = ddim_timesteps + 1 + if verbose: + print(f"Selected timesteps for ddim sampler: {steps_out}") + return steps_out + +class DDIMSampler(object): + def __init__(self, model, schedule="linear", device="cpu", **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + self.device = device + + def register_buffer(self, name, attr): + setattr(self, name, attr) + + def make_schedule( + self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True + ): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose, + ) + alphas_cumprod = self.model.alphas_cumprod + assert ( + alphas_cumprod.shape[0] == self.ddpm_num_timesteps + ), "alphas have to be defined for each timestep" + to_paddle = 
lambda x: paddle.cast(x.clone().detach(), dtype="float32") if isinstance(x, paddle.Tensor) else paddle.to_tensor(x, dtype="float32") + + self.register_buffer("betas", to_paddle(self.model.betas)) + self.register_buffer("alphas_cumprod", to_paddle(alphas_cumprod)) + self.register_buffer( + "alphas_cumprod_prev", to_paddle(self.model.alphas_cumprod_prev) + ) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer( + "sqrt_alphas_cumprod", to_paddle(np.sqrt(alphas_cumprod.numpy())) + ) + self.register_buffer( + "sqrt_one_minus_alphas_cumprod", + to_paddle(np.sqrt(1.0 - alphas_cumprod.numpy())), + ) + self.register_buffer( + "log_one_minus_alphas_cumprod", to_paddle(np.log(1.0 - alphas_cumprod.numpy())) + ) + self.register_buffer( + "sqrt_recip_alphas_cumprod", to_paddle(np.sqrt(1.0 / alphas_cumprod.numpy())) + ) + self.register_buffer( + "sqrt_recipm1_alphas_cumprod", + to_paddle(np.sqrt(1.0 / alphas_cumprod.numpy() - 1)), + ) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.numpy(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose, + ) + self.register_buffer("ddim_sigmas", ddim_sigmas) + self.register_buffer("ddim_alphas", ddim_alphas) + self.register_buffer("ddim_alphas_prev", ddim_alphas_prev) + self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas)) + sigmas_for_original_sampling_steps = ddim_eta * paddle.sqrt( + (1 - self.alphas_cumprod_prev) + / (1 - self.alphas_cumprod) + * (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + ) + self.register_buffer( + "ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps + ) + + @paddle.no_grad() + def sample( + self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0.0, + mask=None, + x0=None, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
+ dynamic_threshold=None, + ucg_schedule=None, + **kwargs, + ): + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + # print(f'Data shape for DDIM sampling is {size}, eta {eta}') + + samples, intermediates = self.ddim_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ucg_schedule=ucg_schedule, + ) + return samples, intermediates + + @paddle.no_grad() + def ddim_sampling( + self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + dynamic_threshold=None, + ucg_schedule=None, + ): + + b = shape[0] + if x_T is None: + img = paddle.randn(shape) + else: + img = x_T + + if timesteps is None: + timesteps = ( + self.ddpm_num_timesteps + if ddim_use_original_steps + else self.ddim_timesteps + ) + elif timesteps is not None and not ddim_use_original_steps: + subset_end = ( + int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0] + ) + - 1 + ) + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {"x_inter": [img], "pred_x0": [img]} + time_range = ( + reversed(range(0, timesteps)) + if ddim_use_original_steps + else np.flip(timesteps) + ) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] + print(f"Running DDIM Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps) + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = paddle.full((b,), step, dtype="int64") + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts + ) # TODO: deterministic forward pass? 
+ img = img_orig * mask + (1.0 - mask) * img + + if ucg_schedule is not None: + assert len(ucg_schedule) == len(time_range) + unconditional_guidance_scale = ucg_schedule[i] + + outs = self.p_sample_ddim( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ) + img, pred_x0 = outs + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates["x_inter"].append(img) + intermediates["pred_x0"].append(pred_x0) + + return img, intermediates + + @paddle.no_grad() + def p_sample_ddim( + self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + dynamic_threshold=None, + ): + b, *_ = x.shape + + if unconditional_conditioning is None or unconditional_guidance_scale == 1.0: + model_output = self.model.apply_model(x, t, c) + else: + x_in = x + t_in = t + + assert isinstance(c, dict) + assert isinstance(unconditional_conditioning, dict) + + model_uncond = self.model.apply_model( + x_in, t_in, unconditional_conditioning + ) + model_t = self.model.apply_model(x_in, t_in, c) + + model_output = model_uncond + unconditional_guidance_scale * ( + model_t - model_uncond + ) + + if self.model.parameterization == "v": + e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) + else: + e_t = model_output + + if score_corrector is not None: + assert self.model.parameterization == "eps", "not implemented" + e_t = score_corrector.modify_score( + self.model, e_t, x, t, c, **corrector_kwargs + ) + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = ( + self.model.alphas_cumprod_prev + if use_original_steps + else self.ddim_alphas_prev + ) + sqrt_one_minus_alphas = ( + self.model.sqrt_one_minus_alphas_cumprod + if use_original_steps + else self.ddim_sqrt_one_minus_alphas + ) + sigmas = ( + self.model.ddim_sigmas_for_original_num_steps + if use_original_steps + else self.ddim_sigmas + ) + # select parameters corresponding to the currently considered timestep + a_t = paddle.full((b, 1, 1, 1), alphas[index]) + a_prev = paddle.full((b, 1, 1, 1), alphas_prev[index]) + sigma_t = paddle.full((b, 1, 1, 1), sigmas[index]) + sqrt_one_minus_at = paddle.full( + (b, 1, 1, 1), sqrt_one_minus_alphas[index] + ) + + # current prediction for x_0 + if self.model.parameterization != "v": + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + else: + pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) + + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + + if dynamic_threshold is not None: + raise NotImplementedError() + + # direction pointing to x_t + dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, repeat_noise) * temperature + if noise_dropout > 0.0: + noise = nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + @paddle.no_grad() + def encode( + self, + x0, + c, + t_enc, + 
use_original_steps=False, + return_intermediates=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + callback=None, + ): + num_reference_steps = ( + self.ddpm_num_timesteps + if use_original_steps + else self.ddim_timesteps.shape[0] + ) + + assert t_enc <= num_reference_steps + num_steps = t_enc + + if use_original_steps: + alphas_next = self.alphas_cumprod[:num_steps] + alphas = self.alphas_cumprod_prev[:num_steps] + else: + alphas_next = self.ddim_alphas[:num_steps] + alphas = paddle.to_tensor(self.ddim_alphas_prev[:num_steps]) + + x_next = x0 + intermediates = [] + inter_steps = [] + for i in tqdm(range(num_steps), desc="Encoding Image"): + t = paddle.full( + (x0.shape[0],), i, dtype="int64" + ) + if unconditional_guidance_scale == 1.0: + noise_pred = self.model.apply_model(x_next, t, c) + else: + assert unconditional_conditioning is not None + e_t_uncond, noise_pred = paddle.chunk( + self.model.apply_model( + paddle.concat((x_next, x_next)), + paddle.concat((t, t)), + paddle.concat((unconditional_conditioning, c)), + ), + 2, + ) + noise_pred = e_t_uncond + unconditional_guidance_scale * ( + noise_pred - e_t_uncond + ) + + xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next + weighted_noise_pred = ( + alphas_next[i].sqrt() + * ((1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) + * noise_pred + ) + x_next = xt_weighted + weighted_noise_pred + if ( + return_intermediates + and i % (num_steps // return_intermediates) == 0 + and i < num_steps - 1 + ): + intermediates.append(x_next) + inter_steps.append(i) + elif return_intermediates and i >= num_steps - 2: + intermediates.append(x_next) + inter_steps.append(i) + if callback: + callback(i) + + out = {"x_encoded": x_next, "intermediate_steps": inter_steps} + if return_intermediates: + out.update({"intermediates": intermediates}) + return x_next, out + + @paddle.no_grad() + def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): + # fast, but does not allow for exact reconstruction + # t serves as an index to gather the correct alphas + if use_original_steps: + sqrt_alphas_cumprod = self.sqrt_alphas_cumprod + sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod + else: + sqrt_alphas_cumprod = paddle.sqrt(self.ddim_alphas) + sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas + + if noise is None: + noise = paddle.randn(x0.shape) + return ( + extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise + ) + + @paddle.no_grad() + def decode( + self, + x_latent, + cond, + t_start, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + use_original_steps=False, + callback=None, + ): + timesteps = ( + np.arange(self.ddpm_num_timesteps) + if use_original_steps + else self.ddim_timesteps + ) + timesteps = timesteps[:t_start] + + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + print(f"Running DDIM Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc="Decoding image", total=total_steps) + x_dec = x_latent + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = paddle.full( + (x_latent.shape[0],), step, dtype="int64" + ) + x_dec, _ = self.p_sample_ddim( + x_dec, + cond, + ts, + index=index, + use_original_steps=use_original_steps, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + ) + if callback: + callback(i) + return x_dec + + +class 
PLMSSampler(object): + def __init__(self, model, schedule="linear", **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + setattr(self, name, attr) + + def make_schedule( + self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True + ): + if ddim_eta != 0: + ddim_eta = 0 + # raise ValueError('ddim_eta must be 0 for PLMS') + + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose, + ) + alphas_cumprod = self.model.alphas_cumprod + assert ( + alphas_cumprod.shape[0] == self.ddpm_num_timesteps + ), "alphas have to be defined for each timestep" + to_paddle = lambda x: paddle.cast(x.clone().detach(), dtype="float32") + + self.register_buffer("betas", to_paddle(self.model.betas)) + self.register_buffer("alphas_cumprod", to_paddle(alphas_cumprod)) + self.register_buffer( + "alphas_cumprod_prev", to_paddle(self.model.alphas_cumprod_prev) + ) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer( + "sqrt_alphas_cumprod", to_paddle(np.sqrt(alphas_cumprod.numpy())) + ) + self.register_buffer( + "sqrt_one_minus_alphas_cumprod", + to_paddle(np.sqrt(1.0 - alphas_cumprod.numpy())), + ) + self.register_buffer( + "log_one_minus_alphas_cumprod", to_paddle(np.log(1.0 - alphas_cumprod.numpy())) + ) + self.register_buffer( + "sqrt_recip_alphas_cumprod", to_paddle(np.sqrt(1.0 / alphas_cumprod.numpy())) + ) + self.register_buffer( + "sqrt_recipm1_alphas_cumprod", + to_paddle(np.sqrt(1.0 / alphas_cumprod.numpy() - 1)), + ) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.numpy(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose, + ) + self.register_buffer("ddim_sigmas", ddim_sigmas) + self.register_buffer("ddim_alphas", ddim_alphas) + self.register_buffer("ddim_alphas_prev", ddim_alphas_prev) + self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas)) + sigmas_for_original_sampling_steps = ddim_eta * paddle.sqrt( + (1 - self.alphas_cumprod_prev) + / (1 - self.alphas_cumprod) + * (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + ) + self.register_buffer( + "ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps + ) + + @paddle.no_grad() + def sample( + self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0.0, + mask=None, + x0=None, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
+ **kwargs, + ): + if conditioning is not None: + if isinstance(conditioning, dict): + cbs = conditioning[list(conditioning.keys())[0]].shape[0] + if cbs != batch_size: + print( + f"Warning: Got {cbs} conditionings but batch-size is {batch_size}" + ) + else: + if conditioning.shape[0] != batch_size: + print( + f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}" + ) + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f"Data shape for PLMS sampling is {size}") + + samples, intermediates = self.plms_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + ) + return samples, intermediates + + @paddle.no_grad() + def plms_sampling( + self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + ): + + b = shape[0] + if x_T is None: + img = paddle.randn(shape) + else: + img = x_T + + if timesteps is None: + timesteps = ( + self.ddpm_num_timesteps + if ddim_use_original_steps + else self.ddim_timesteps + ) + elif timesteps is not None and not ddim_use_original_steps: + subset_end = ( + int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0] + ) + - 1 + ) + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {"x_inter": [img], "pred_x0": [img]} + time_range = ( + list(reversed(range(0, timesteps))) + if ddim_use_original_steps + else np.flip(timesteps) + ) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] + print(f"Running PLMS Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc="PLMS Sampler", total=total_steps) + old_eps = [] + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = paddle.full((b,), step, dtype="int64") + ts_next = paddle.full( + (b,), + time_range[min(i + 1, len(time_range) - 1)], + dtype="int64", + ) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts + ) # TODO: deterministic forward pass? 
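+ # keep the noised reference x0 where mask == 1 and the freshly denoised sample where mask == 0 (inpainting-style conditioning)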
+ img = img_orig * mask + (1.0 - mask) * img + + outs = self.p_sample_plms( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + old_eps=old_eps, + t_next=ts_next, + ) + img, pred_x0, e_t = outs + old_eps.append(e_t) + if len(old_eps) >= 4: + old_eps.pop(0) + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates["x_inter"].append(img) + intermediates["pred_x0"].append(pred_x0) + + return img, intermediates + + @paddle.no_grad() + def p_sample_plms( + self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + old_eps=None, + t_next=None, + ): + b, *_ = x.shape + + def get_model_output(x, t): + if ( + unconditional_conditioning is None + or unconditional_guidance_scale == 1.0 + ): + e_t = self.model.apply_model(x, t, c) + else: + x_in = paddle.concat([x] * 2) + t_in = paddle.concat([t] * 2) + c_in = paddle.concat([unconditional_conditioning, c]) + e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2) + e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond) + + if score_corrector is not None: + assert self.model.parameterization == "eps" + e_t = score_corrector.modify_score( + self.model, e_t, x, t, c, **corrector_kwargs + ) + + return e_t + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = ( + self.model.alphas_cumprod_prev + if use_original_steps + else self.ddim_alphas_prev + ) + sqrt_one_minus_alphas = ( + self.model.sqrt_one_minus_alphas_cumprod + if use_original_steps + else self.ddim_sqrt_one_minus_alphas + ) + sigmas = ( + self.model.ddim_sigmas_for_original_num_steps + if use_original_steps + else self.ddim_sigmas + ) + + def get_x_prev_and_pred_x0(e_t, index): + # select parameters corresponding to the currently considered timestep + a_t = paddle.full((b, 1, 1, 1), alphas[index]) + a_prev = paddle.full((b, 1, 1, 1), alphas_prev[index]) + sigma_t = paddle.full((b, 1, 1, 1), sigmas[index]) + sqrt_one_minus_at = paddle.full( + (b, 1, 1, 1), sqrt_one_minus_alphas[index] + ) + + # current prediction for x_0 + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + # direction pointing to x_t + dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, repeat_noise) * temperature + if noise_dropout > 0.0: + noise = nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + e_t = get_model_output(x, t) + if len(old_eps) == 0: + # Pseudo Improved Euler (2nd order) + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index) + e_t_next = get_model_output(x_prev, t_next) + e_t_prime = (e_t + e_t_next) / 2 + elif len(old_eps) == 1: + # 2nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (3 * e_t - old_eps[-1]) / 2 + elif len(old_eps) == 2: + # 3nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (23 * 
e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12 + elif len(old_eps) >= 3: + # 4nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = ( + 55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3] + ) / 24 + + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index) + + return x_prev, pred_x0, e_t diff --git a/paddlemix/models/audioldm2/modeling.py b/paddlemix/models/audioldm2/modeling.py new file mode 100644 index 000000000..4e52ac8d6 --- /dev/null +++ b/paddlemix/models/audioldm2/modeling.py @@ -0,0 +1,898 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import os +import numpy as np +from paddlemix.models.model_utils import MixPretrainedModel +# from ppdiffusers.models import LitEma +import soundfile as sf +import tqdm +from .encoders.clap_encoder import CLAPAudioEmbeddingClassifierFreev2 +from .latentdiffusion_samplers import DDIMSampler, PLMSSampler +from .latent_encoder.autoencoder import DiagonalGaussianDistribution +from .diffusionwrapper import ( + DiffusionWrapper, + make_beta_schedule, + extract_into_tensor, + noise_like, + default, + instantiate_from_config, + disabled_train +) +from .configuration import AudioLDM2Config + +__all__ = [ + "AudioLDM2Model", + "AudioLDM2PretrainedModel", +] + +class AudioLDM2PretrainedModel(MixPretrainedModel): + """ + The class for pretrained model of AudioLDM2. 
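+ It registers the AudioLDM2Config class and the "model_state.pdparams" resource file used when loading pretrained weights.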
+ """ + + model_config_file = "config.json" + config_class = AudioLDM2Config + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "audioldm2" + +class AudioLDM2Model(AudioLDM2PretrainedModel): + """ + Args: + config (:class:`AudioLDM2Config`): + """ + + def __init__(self, config: AudioLDM2Config): + super(AudioLDM2Model, self).__init__(config) + assert config.parameterization in [ + "eps", + "x0", + "v", + ], 'currently only supporting "eps" and "x0" and "v"' + self.parameterization = config.parameterization + self.device_name = config.device + self.clip_denoised = False + self.log_every_t = config.log_every_t + self.first_stage_key = config.first_stage_key + self.sampling_rate = config.sampling_rate + # self.use_ema = True + # if self.use_ema: + # self.model_ema = LitEma(self.model) + + self.clap = CLAPAudioEmbeddingClassifierFreev2( + pretrained_path="", + enable_cuda=self.device_name=="gpu", + sampling_rate=self.sampling_rate, + embed_mode="audio", + amodel="HTSAT-base", + ) + self.latent_t_size = config.latent_t_size + self.latent_f_size = config.latent_f_size + self.channels = config.channels + self.use_positional_encodings = False + self.conditioning_key = list(config.cond_stage_config.keys()) + self.model = DiffusionWrapper(config.unet_config, self.conditioning_key) + + self.v_posterior = 0.0 # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta + + self.num_timesteps_cond = default(config.num_timesteps_cond, 1) + assert self.num_timesteps_cond <= config.timesteps + self.register_schedule( + beta_schedule="linear", + timesteps=config.timesteps, + linear_start=config.linear_start, + linear_end=config.linear_end, + cosine_s=8e-3, + ) + logvar_init = 0.0 + self.logvar = paddle.full(shape=(self.num_timesteps,), fill_value=logvar_init) + self.logvar = paddle.create_parameter( + shape=self.logvar.shape, + dtype=str(self.logvar.numpy().dtype), + default_initializer=nn.initializer.Assign(self.logvar) + ) + self.logvar.stop_gradient = True + + self.register_buffer("scale_factor", paddle.to_tensor(1.0)) + self.instantiate_first_stage(config.first_stage_config) + self.unconditional_prob_cfg = config.unconditional_prob_cfg + self.cond_stage_models = nn.LayerList([]) + self.instantiate_cond_stage(config.cond_stage_config) + self.conditional_dry_run_finished = False + + def instantiate_first_stage(self, config): + model = instantiate_from_config(config) + self.first_stage_model = model.eval() + self.first_stage_model.train = disabled_train + for param in self.first_stage_model.parameters(): + param.stop_gradient = True + + def instantiate_cond_stage(self, config): + self.cond_stage_model_metadata = {} + for i, cond_model_key in enumerate(config.keys()): + if "params" in config[cond_model_key] and "device" in config[cond_model_key]["params"]: + config[cond_model_key]["params"]["device"] = self.device_name + model = instantiate_from_config(config[cond_model_key]) + model = model.to(self.device_name) + self.cond_stage_models.append(model) + self.cond_stage_model_metadata[cond_model_key] = { + "model_idx": i, + "cond_stage_key": config[cond_model_key]["cond_stage_key"], + "conditioning_key": config[cond_model_key]["conditioning_key"], + } + + def make_cond_schedule( + self, + ): + self.cond_ids = paddle.full( + size=(self.num_timesteps,), + fill_value=self.num_timesteps - 1, + dtype="int64", + ) + ids = paddle.cast( + paddle.round( + paddle.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond) + ), + dtype="int64" + ) + 
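+ # the first num_timesteps_cond positions follow the shortened schedule; all later positions stay clamped to the final timestep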
self.cond_ids[: self.num_timesteps_cond] = ids + + + def register_schedule( + self, + beta_schedule="linear", + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + ): + betas = make_beta_schedule( + beta_schedule, + timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s, + ) + alphas = 1.0 - betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1]) + + (timesteps,) = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert ( + alphas_cumprod.shape[0] == self.num_timesteps + ), "alphas have to be defined for each timestep" + + self.register_buffer("betas", paddle.to_tensor(betas, dtype="float32")) + self.register_buffer("alphas_cumprod", paddle.to_tensor(alphas_cumprod, dtype="float32")) + self.register_buffer("alphas_cumprod_prev", paddle.to_tensor(alphas_cumprod_prev, dtype="float32")) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer("sqrt_alphas_cumprod", paddle.to_tensor(np.sqrt(alphas_cumprod), dtype="float32")) + self.register_buffer( + "sqrt_one_minus_alphas_cumprod", paddle.to_tensor(np.sqrt(1.0 - alphas_cumprod), dtype="float32") + ) + self.register_buffer( + "log_one_minus_alphas_cumprod", paddle.to_tensor(np.log(1.0 - alphas_cumprod), dtype="float32") + ) + self.register_buffer( + "sqrt_recip_alphas_cumprod", paddle.to_tensor(np.sqrt(1.0 / alphas_cumprod), dtype="float32") + ) + self.register_buffer( + "sqrt_recipm1_alphas_cumprod", paddle.to_tensor(np.sqrt(1.0 / alphas_cumprod - 1), dtype="float32") + ) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + posterior_variance = (1 - self.v_posterior) * betas * ( + 1.0 - alphas_cumprod_prev + ) / (1.0 - alphas_cumprod) + self.v_posterior * betas + # above: equal to 1. / (1. / (1. 
- alpha_cumprod_tm1) + alpha_t / beta_t) + self.register_buffer("posterior_variance", paddle.to_tensor(posterior_variance, dtype="float32")) + # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain + self.register_buffer( + "posterior_log_variance_clipped", + paddle.to_tensor(np.log(np.maximum(posterior_variance, 1e-20)), dtype="float32"), + ) + self.register_buffer( + "posterior_mean_coef1", + paddle.to_tensor(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod), dtype="float32"), + ) + self.register_buffer( + "posterior_mean_coef2", + paddle.to_tensor( + (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod), + dtype="float32" + ), + ) + + if self.parameterization == "eps": + lvlb_weights = self.betas**2 / ( + 2 + * self.posterior_variance + * paddle.to_tensor(alphas, dtype="float32") + * (1 - self.alphas_cumprod) + ) + elif self.parameterization == "x0": + lvlb_weights = ( + 0.5 + * np.sqrt(paddle.to_tensor(alphas_cumprod, dtype="float32")) + / (2.0 * 1 - paddle.to_tensor(alphas_cumprod, dtype="float32")) + ) + elif self.parameterization == "v": + lvlb_weights = paddle.ones_like( + self.betas**2 + / ( + 2 + * self.posterior_variance + * paddle.to_tensor(alphas, dtype="float32") + * (1 - self.alphas_cumprod) + ) + ) + else: + raise NotImplementedError("mu not supported") + # TODO how to choose this term + lvlb_weights[0] = lvlb_weights[1] + self.register_buffer("lvlb_weights", lvlb_weights, persistable=False) + assert not paddle.isnan(self.lvlb_weights).all() + + self.shorten_cond_schedule = self.num_timesteps_cond > 1 + if self.shorten_cond_schedule: + self.make_cond_schedule() + + def make_decision(self, probability): + if float(paddle.rand([])) < probability: + return True + else: + return False + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. 
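+ In closed form: q(x_t | x_0) = N(sqrt(alpha_bar_t) * x_0, (1 - alpha_bar_t) * I).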
+ """ + mean = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = extract_into_tensor( + self.log_one_minus_alphas_cumprod, t, x_start.shape + ) + return mean, variance, log_variance + + def predict_start_from_noise(self, x_t, t, noise): + return ( + extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t + - extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + * noise + ) + + def q_posterior(self, x_start, x_t, t): + posterior_mean = ( + extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + + extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape + ) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance( + self, + x, + c, + t, + clip_denoised: bool, + return_codebook_ids=False, + quantize_denoised=False, + return_x0=False, + score_corrector=None, + corrector_kwargs=None, + ): + t_in = t + model_out = self.apply_model(x, t_in, c, return_ids=return_codebook_ids) + + if score_corrector is not None: + assert self.parameterization == "eps" + model_out = score_corrector.modify_score( + self, model_out, x, t, c, **corrector_kwargs + ) + + if return_codebook_ids: + model_out, logits = model_out + + if self.parameterization == "eps": + x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) + elif self.parameterization == "x0": + x_recon = model_out + else: + raise NotImplementedError() + + if clip_denoised: + x_recon.clip_(-1.0, 1.0) + if quantize_denoised: + x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon) + model_mean, posterior_variance, posterior_log_variance = self.q_posterior( + x_start=x_recon, x_t=x, t=t + ) + if return_codebook_ids: + return model_mean, posterior_variance, posterior_log_variance, logits + elif return_x0: + return model_mean, posterior_variance, posterior_log_variance, x_recon + else: + return model_mean, posterior_variance, posterior_log_variance + + @paddle.no_grad() + def p_sample( + self, + x, + c, + t, + clip_denoised=False, + repeat_noise=False, + return_codebook_ids=False, + quantize_denoised=False, + return_x0=False, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + ): + b, *_ = x.shape + outputs = self.p_mean_variance( + x=x, + c=c, + t=t, + clip_denoised=clip_denoised, + return_codebook_ids=return_codebook_ids, + quantize_denoised=quantize_denoised, + return_x0=return_x0, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + ) + if return_codebook_ids: + raise DeprecationWarning("Support dropped.") + elif return_x0: + model_mean, _, model_log_variance, x0 = outputs + else: + model_mean, _, model_log_variance = outputs + + noise = noise_like(x.shape, repeat_noise) * temperature + if noise_dropout > 0.0: + noise = nn.functional.dropout(noise, p=noise_dropout) + # no noise when t == 0 + nonzero_mask = ( + (1 - paddle.cast(t == 0, "float32")).reshape((b, *((1,) * (len(x.shape) - 1)))) + ) + + if return_x0: + return ( + model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, + x0, + ) + else: + return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise + + @paddle.no_grad() + def p_sample_loop( + self, + cond, + shape, + 
return_intermediates=False, + x_T=None, + verbose=True, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + start_T=None, + log_every_t=None, + ): + if not log_every_t: + log_every_t = self.log_every_t + b = shape[0] + if x_T is None: + img = paddle.randn(shape) + else: + img = x_T + + intermediates = [img] + if timesteps is None: + timesteps = self.num_timesteps + + if start_T is not None: + timesteps = min(timesteps, start_T) + iterator = ( + tqdm(reversed(range(0, timesteps)), desc="Sampling t", total=timesteps) + if verbose + else reversed(range(0, timesteps)) + ) + + if mask is not None: + assert x0 is not None + assert x0.shape[2:3] == mask.shape[2:3] # spatial size has to match + + for i in iterator: + ts = paddle.full((b,), i, dtype="int64") + + if self.shorten_cond_schedule: + assert self.model.conditioning_key != "hybrid" + tc = self.cond_ids[ts] + cond = self.q_sample(x_start=cond, t=tc, noise=paddle.randn(cond.shapes)) + + img = self.p_sample( + img, + cond, + ts, + clip_denoised=self.clip_denoised, + quantize_denoised=quantize_denoised, + ) + + if mask is not None: + img_orig = self.q_sample(x0, ts) + img = img_orig * mask + (1.0 - mask) * img + + if i % log_every_t == 0 or i == timesteps - 1: + intermediates.append(img) + if callback: + callback(i) + if img_callback: + img_callback(img, i) + + if return_intermediates: + return img, intermediates + return img + + @paddle.no_grad() + def sample( + self, + cond, + batch_size=16, + return_intermediates=False, + x_T=None, + verbose=True, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + shape=None, + **kwargs, + ): + if shape is None: + shape = (batch_size, self.channels, self.latent_t_size, self.latent_f_size) + if cond is not None: + if isinstance(cond, dict): + cond = { + key: cond[key][:batch_size] + if not isinstance(cond[key], list) + else list(map(lambda x: x[:batch_size], cond[key])) + for key in cond + } + else: + cond = ( + [c[:batch_size] for c in cond] + if isinstance(cond, list) + else cond[:batch_size] + ) + return self.p_sample_loop( + cond, + shape, + return_intermediates=return_intermediates, + x_T=x_T, + verbose=verbose, + timesteps=timesteps, + quantize_denoised=quantize_denoised, + mask=mask, + x0=x0, + **kwargs, + ) + + @paddle.no_grad() + def sample_log( + self, + cond, + batch_size, + ddim, + ddim_steps, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + use_plms=False, + mask=None, + **kwargs, + ): + if mask is not None: + shape = (self.channels, mask.shape[-2], mask.shape[-1]) + else: + shape = (self.channels, self.latent_t_size, self.latent_f_size) + + intermediate = None + if ddim and not use_plms: + ddim_sampler = DDIMSampler(self, device=self.device) + samples, intermediates = ddim_sampler.sample( + ddim_steps, + batch_size, + shape, + cond, + verbose=False, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + mask=mask, + **kwargs, + ) + elif use_plms: + plms_sampler = PLMSSampler(self) + samples, intermediates = plms_sampler.sample( + ddim_steps, + batch_size, + shape, + cond, + verbose=False, + unconditional_guidance_scale=unconditional_guidance_scale, + mask=mask, + unconditional_conditioning=unconditional_conditioning, + **kwargs, + ) + + else: + samples, intermediates = self.sample( + cond=cond, + batch_size=batch_size, + return_intermediates=True, + unconditional_guidance_scale=unconditional_guidance_scale, + mask=mask, + 
unconditional_conditioning=unconditional_conditioning, + **kwargs, + ) + + return samples, intermediate + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: paddle.randn(x_start.shape)) + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) + * noise + ) + + def predict_start_from_z_and_v(self, x_t, t, v): + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v + ) + + def predict_eps_from_z_and_v(self, x_t, t, v): + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) + * x_t + ) + + def get_v(self, x, noise, t): + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x + ) + + def _get_input(self, batch, k): + fname, text, waveform, stft, fbank, phoneme_idx = ( + batch["fname"], + batch["text"], + batch["waveform"], + batch["stft"], + batch["log_mel_spec"], + batch["phoneme_idx"] + ) + ret = {} + + ret["fbank"] = ( + paddle.cast(fbank.unsqueeze(1), dtype="float32") + ) + ret["stft"] = paddle.cast(stft, dtype="float32") + ret["waveform"] = paddle.cast(waveform, dtype="float32") + ret["phoneme_idx"] = paddle.cast(phoneme_idx, dtype="int64") + ret["text"] = list(text) + ret["fname"] = fname + + for key in batch.keys(): + if key not in ret.keys(): + ret[key] = batch[key] + + return ret[k] + + def get_first_stage_encoding(self, encoder_posterior): + z = encoder_posterior.sample() + if isinstance(encoder_posterior, DiagonalGaussianDistribution): + z = encoder_posterior.sample() + elif isinstance(encoder_posterior, paddle.Tensor): + z = encoder_posterior + else: + raise NotImplementedError( + f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented" + ) + return self.scale_factor * z + + def get_learned_conditioning(self, c, key, unconditional_cfg): + assert key in self.cond_stage_model_metadata.keys() + + # Classifier-free guidance + if not unconditional_cfg: + c = self.cond_stage_models[ + self.cond_stage_model_metadata[key]["model_idx"] + ](c) + else: + # when the cond_stage_key is "all", pick one random element out + if isinstance(c, dict): + c = c[list(c.keys())[0]] + + if isinstance(c, paddle.Tensor): + batchsize = c.shape[0] + elif isinstance(c, list): + batchsize = len(c) + else: + raise NotImplementedError() + + c = self.cond_stage_models[ + self.cond_stage_model_metadata[key]["model_idx"] + ].get_unconditional_condition(batchsize) + + return c + + def get_input( + self, + batch, + k, + return_first_stage_encode=True, + return_decoding_output=False, + return_encoder_input=False, + return_encoder_output=False, + unconditional_prob_cfg=0.1, + ): + x = self._get_input(batch, k) + + if return_first_stage_encode: + encoder_posterior = self.encode_first_stage(x) + z = self.get_first_stage_encoding(encoder_posterior).detach() + else: + z = None + cond_dict = {} + if len(self.cond_stage_model_metadata.keys()) > 0: + unconditional_cfg = False + if self.conditional_dry_run_finished and self.make_decision( + unconditional_prob_cfg + ): + unconditional_cfg = True + for cond_model_key in self.cond_stage_model_metadata.keys(): + cond_stage_key = self.cond_stage_model_metadata[cond_model_key][ + "cond_stage_key" + ] + + if cond_model_key in cond_dict.keys(): 
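+ # this signal was already produced by an earlier conditioning model that returned several outputs at once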
+ continue + + # The original data for conditioning + # If cond_model_key is "all", that means the conditional model need all the information from a batch + if cond_stage_key != "all": + xc = self._get_input(batch, cond_stage_key) + else: + xc = batch + # if cond_stage_key is "all", xc will be a dictionary containing all keys + # Otherwise xc will be an entry of the dictionary + c = self.get_learned_conditioning( + xc, key=cond_model_key, unconditional_cfg=unconditional_cfg + ) + # cond_dict will be used to condition the diffusion model + # If one conditional model return multiple conditioning signal + if isinstance(c, dict): + for k in c.keys(): + cond_dict[k] = c[k] + else: + cond_dict[cond_model_key] = c + + out = [z, cond_dict] + + if return_decoding_output: + xrec = self.decode_first_stage(z) + out += [xrec] + + if return_encoder_input: + out += [x] + + if return_encoder_output: + out += [encoder_posterior] + + if not self.conditional_dry_run_finished: + self.conditional_dry_run_finished = True + + # Output is a dictionary, where the value could only be tensor or tuple + return out + + def encode_first_stage(self, x): + with paddle.no_grad(): + return self.first_stage_model.encode(x) + + def decode_first_stage(self, z): + with paddle.no_grad(): + z = 1.0 / self.scale_factor * z + decoding = self.first_stage_model.decode(z) + return decoding + + def mel_spectrogram_to_waveform( + self, mel, savepath=".", bs=None, name="outwav", save=True + ): + # Mel: [bs, 1, t-steps, fbins] + if len(mel.shape) == 4: + mel = mel.squeeze(1) + mel = mel.transpose([0, 2, 1]) + waveform = self.first_stage_model.vocoder(mel) + waveform = waveform.cpu().detach().numpy() + if save: + self.save_waveform(waveform, savepath, name) + return waveform + + def save_waveform(self, waveform, savepath, name="outwav"): + for i in range(waveform.shape[0]): + if type(name) is str: + path = os.path.join( + savepath, "%s_%s_%s.wav" % (self.global_step, i, name) + ) + elif type(name) is list: + path = os.path.join( + savepath, + "%s.wav" + % ( + os.path.basename(name[i]) + if (not ".wav" in name[i]) + else os.path.basename(name[i]).split(".")[0] + ), + ) + else: + raise NotImplementedError + todo_waveform = waveform[i, 0] + todo_waveform = ( + todo_waveform / np.max(np.abs(todo_waveform)) + ) * 0.8 # Normalize the energy of the generation output + sf.write(path, todo_waveform, samplerate=self.sampling_rate) + + def filter_useful_cond_dict(self, cond_dict): + new_cond_dict = {} + for key in cond_dict.keys(): + if key in self.cond_stage_model_metadata.keys(): + new_cond_dict[key] = cond_dict[key] + + # All the conditional key in the metadata should be used + for key in self.cond_stage_model_metadata.keys(): + assert key in new_cond_dict.keys(), "%s, %s" % ( + key, + str(new_cond_dict.keys()), + ) + + return new_cond_dict + + def reorder_cond_dict(self, cond_dict): + # To make sure the order is correct + new_cond_dict = {} + for key in self.conditioning_key: + new_cond_dict[key] = cond_dict[key] + return new_cond_dict + + def apply_model(self, x_noisy, t, cond, return_ids=False): + cond = self.reorder_cond_dict(cond) + + x_recon = self.model(x_noisy, t, cond_dict=cond) + + if isinstance(x_recon, tuple) and not return_ids: + return x_recon[0] + else: + return x_recon + + def forward( + self, + batch, + ddim_steps=200, + ddim_eta=1.0, + x_T=None, + n_gen=1, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + use_plms=False, + **kwargs, + ): + # Generate n_gen times and select the best + # Batch: audio, text, 
fnames + assert x_T is None + + if use_plms: + assert ddim_steps is not None + + use_ddim = ddim_steps is not None + + # with self.ema_scope("Plotting"): + for i in range(1): + z, c = self.get_input( + batch, + self.first_stage_key, + unconditional_prob_cfg=0.0, # Do not output unconditional information in the c + ) + + c = self.filter_useful_cond_dict(c) + + text = self._get_input(batch, "text") + + # Generate multiple samples + batch_size = z.shape[0] * n_gen + + # Generate multiple samples at a time and filter out the best + # The condition to the diffusion wrapper can have many format + for cond_key in c.keys(): + if isinstance(c[cond_key], list): + for i in range(len(c[cond_key])): + c[cond_key][i] = paddle.concat([c[cond_key][i]] * n_gen, axis=0) + elif isinstance(c[cond_key], dict): + for k in c[cond_key].keys(): + c[cond_key][k] = paddle.concat([c[cond_key][k]] * n_gen, axis=0) + else: + c[cond_key] = paddle.concat([c[cond_key]] * n_gen, axis=0) + + text = text * n_gen + + if unconditional_guidance_scale != 1.0: + unconditional_conditioning = {} + for key in self.cond_stage_model_metadata: + model_idx = self.cond_stage_model_metadata[key]["model_idx"] + unconditional_conditioning[key] = self.cond_stage_models[ + model_idx + ].get_unconditional_condition(batch_size) + + fnames = list(self._get_input(batch, "fname")) + samples, _ = self.sample_log( + cond=c, + batch_size=batch_size, + x_T=x_T, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + use_plms=use_plms, + ) + + mel = self.decode_first_stage(samples) + + waveform = self.mel_spectrogram_to_waveform( + mel, savepath="", bs=None, name=fnames, save=False + ) + + if n_gen > 1: + best_index = [] + similarity = self.clap.cos_similarity( + paddle.to_tensor(waveform, dtype="float32").squeeze(1), text + ) + for i in range(z.shape[0]): + candidates = similarity[i :: z.shape[0]] + max_index = paddle.argmax(candidates).item() + best_index.append(i + max_index * z.shape[0]) + + waveform = waveform[best_index] + + print("Similarity between generated audio and text:") + print(' '.join('{:.2f}'.format(num) for num in similarity.detach().numpy().tolist())) + print("Choose the following indexes as the output:", best_index) + + return waveform diff --git a/paddlemix/models/audioldm2/requirement.txt b/paddlemix/models/audioldm2/requirement.txt new file mode 100644 index 000000000..4ee0937a9 --- /dev/null +++ b/paddlemix/models/audioldm2/requirement.txt @@ -0,0 +1,4 @@ +librosa +unidecode +phonemizer +espeak \ No newline at end of file diff --git a/paddlemix/models/audioldm2/unet/attention.py b/paddlemix/models/audioldm2/unet/attention.py new file mode 100644 index 000000000..5a6aaa3eb --- /dev/null +++ b/paddlemix/models/audioldm2/unet/attention.py @@ -0,0 +1,199 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
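+ +# Attention layers used by the AudioLDM2 UNet: FeedForward, CrossAttention, LinearAttention, BasicTransformerBlock and SpatialTransformer.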
+ +import paddle +from paddle import nn +from ppdiffusers.models.attention import GEGLU +from einops import rearrange, repeat +from ..diffusionwrapper import default + +def Normalize(in_channels): + return nn.GroupNorm( + num_groups=32, num_channels=in_channels, epsilon=1e-6 + ) + +class FeedForward(nn.Layer): + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = ( + nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU()) + if not glu + else GEGLU(dim, inner_dim) + ) + + self.net = nn.Sequential( + project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out) + ) + + def forward(self, x): + return self.net(x) + + +class CrossAttention(nn.Layer): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0): + super().__init__() + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.scale = dim_head**-0.5 + self.heads = heads + + self.to_q = nn.Linear(query_dim, inner_dim, bias_attr=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias_attr=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias_attr=False) + + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout) + ) + + def forward(self, x, context=None, mask=None): + h = self.heads + + q = self.to_q(x) + context = default(context, x) + + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v)) + + sim = paddle.einsum("b i d, b j d -> b i j", q, k) * self.scale + + if mask is not None: + mask = rearrange(mask, "b ... -> b (...)") + max_neg_value = -paddle.finfo(sim.dtype).max + mask = repeat(mask, "b j -> (b h) () j", h=h) + tmp = paddle.full(sim.shape, max_neg_value, sim.dtype) + sim = paddle.where(~(mask == 1), tmp, sim) + + # attention, what we cannot get enough of + attn = nn.functional.softmax(sim, axis=-1) + out = paddle.einsum("b i j, b j d -> b i d", attn, v) + out = rearrange(out, "(b h) n d -> b n (h d)", h=h) + return self.to_out(out) + + +class LinearAttention(nn.Layer): + def __init__(self, dim, heads=4, dim_head=32): + super().__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = nn.Conv2D(dim, hidden_dim * 3, 1, bias_attr=False) + self.to_out = nn.Conv2D(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange( + qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3 + ) + k = nn.functional.softmax(k, axis=-1) + context = paddle.einsum("bhdn,bhen->bhde", k, v) + out = paddle.einsum("bhde,bhdn->bhen", context, q) + out = rearrange( + out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w + ) + return self.to_out(out) + +class BasicTransformerBlock(nn.Layer): + def __init__( + self, + dim, + n_heads, + d_head, + dropout=0.0, + context_dim=None, + gated_ff=True, + checkpoint=True, + ): + super().__init__() + self.attn1 = CrossAttention( + query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout + ) # is a self-attention + self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) + self.attn2 = CrossAttention( + query_dim=dim, + context_dim=context_dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout, + ) # is self-attn if context is none + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + self.norm3 = nn.LayerNorm(dim) + self.checkpoint = checkpoint + + def forward(self, x, context=None, mask=None): + 
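+ # pre-norm residual ordering: self-attention, then cross-attention over the conditioning context, then the feed-forward block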
x = self.attn1(self.norm1(x)) + x + x = self.attn2(self.norm2(x), context=context, mask=mask) + x + x = self.ff(self.norm3(x)) + x + return x + +class SpatialTransformer(nn.Layer): + """ + Transformer block for image-like data. + First, project the input (aka embedding) + and reshape to b, t, d. + Then apply standard transformer action. + Finally, reshape to image + """ + + def __init__( + self, + in_channels, + n_heads, + d_head, + depth=1, + dropout=0.0, + context_dim=None, + ): + super().__init__() + + context_dim = context_dim + + self.in_channels = in_channels + inner_dim = n_heads * d_head + self.norm = Normalize(in_channels) + + self.proj_in = nn.Conv2D( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0 + ) + + self.transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim + ) + for d in range(depth) + ] + ) + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(value=0.0) + ) + self.proj_out = nn.Conv2D(inner_dim, in_channels, kernel_size=1, stride=1, padding=0, weight_attr=weight_attr) + + def forward(self, x, context=None, mask=None): + # note: if no context is given, cross-attention defaults to self-attention + b, c, h, w = x.shape + x_in = x + x = self.norm(x) + x = self.proj_in(x) + x = rearrange(x, "b c h w -> b (h w) c") + for block in self.transformer_blocks: + x = block(x, context=context, mask=mask) + x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w) + x = self.proj_out(x) + return x + x_in diff --git a/paddlemix/models/audioldm2/unet/openaimodel.py b/paddlemix/models/audioldm2/unet/openaimodel.py new file mode 100644 index 000000000..e40d5a6c0 --- /dev/null +++ b/paddlemix/models/audioldm2/unet/openaimodel.py @@ -0,0 +1,868 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import math +import numpy as np +from abc import abstractmethod +from .attention import SpatialTransformer +from einops import repeat + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1D(*args, **kwargs) + elif dims == 2: + return nn.Conv2D(*args, **kwargs) + elif dims == 3: + return nn.Conv3D(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1D(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2D(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3D(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def normalization(channels): + """ + Make a standard normalization layer. + :param channels: number of input channels. + :return: an nn.Module for normalization. 
+ """ + return GroupNorm32(32, channels) + + +def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + if not repeat_only: + half = dim // 2 + freqs = paddle.exp( + -math.log(max_period) + * paddle.arange(start=0, end=half, dtype="float32") + / half + ) + args = paddle.cast(timesteps[:, None], dtype="float32") * freqs[None] + embedding = paddle.concat([paddle.cos(args), paddle.sin(args)], axis=-1) + if dim % 2: + embedding = paddle.concat( + [embedding, paddle.zeros_like(embedding[:, :1])], axis=-1 + ) + else: + embedding = repeat(timesteps, "b -> b d", d=dim) + return embedding + + +class GroupNorm32(nn.GroupNorm): + def forward(self, x): + return paddle.cast(super().forward(paddle.cast(x, dtype="float32")), dtype = x.dtype) + + +class TimestepBlock(nn.Layer): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + +class Upsample(nn.Layer): + """ + An upsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd( + dims, self.channels, self.out_channels, 3, padding=padding + ) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest" + ) + else: + x = F.interpolate(x, scale_factor=2, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Layer): + """ + A downsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, + self.channels, + self.out_channels, + 3, + stride=stride, + padding=padding, + ) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. 
+ :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. + """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.Silu(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.Silu(), + nn.Linear( + emb_channels, + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(value=0.0) + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.Silu(), + nn.Dropout(p=dropout), + conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1, weight_attr=weight_attr) + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, channels, self.out_channels, 3, padding=1 + ) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) + + def forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = paddle.cast(self.emb_layers(emb), dtype = h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = paddle.chunk(emb_out, 2, axis=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class QKVAttention(nn.Layer): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
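+ Here H is the number of heads and C the channels per head; q, k and v are obtained by splitting qkv into three equal chunks along the channel axis.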
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, axis=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = paddle.einsum( + "bct,bcs->bts", + (q * scale).reshape([bs * self.n_heads, ch, length]), + (k * scale).reshape([bs * self.n_heads, ch, length]), + ) # More stable with f16 than dividing afterwards + weight = paddle.cast(F.softmax(paddle.cast(weight, dtype="float32"), axis=-1), dtype=weight.dtype) + a = paddle.einsum( + "bts,bcs->bct", + weight, + v.reshape([bs * self.n_heads, ch, length]), + ) + return a.reshape([bs, -1, length]) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttentionLegacy(nn.Layer): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = ( + qkv.reshape([bs * self.n_heads, ch * 3, length]).split(ch, axis=1) + ) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = paddle.einsum( + "bct,bcs->bts", q * scale, k * scale + ) # More stable with f16 than dividing afterwards + weight = paddle.cast(F.softmax(paddle.cast(weight, dtype="float32"), axis=-1), dtype=weight.dtype) + a = paddle.einsum("bts,bcs->bct", weight, v) + return a.reshape([bs, -1, length]) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class AttentionBlock(nn.Layer): + """ + An attention block that allows spatial positions to attend to each other. + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. + """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(value=0.0) + ) + self.proj_out = conv_nd(1, channels, channels, 1, weight_attr=weight_attr) + + def forward(self, x): + b, c, *spatial = x.shape + x = x.reshape([b, c, -1]) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape([b, c, *spatial]) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation.
+ Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial**2) * c + model.total_ops += paddle.to_tensor([matmul_ops], dtype="float64") + + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + + def forward(self, x, emb, context_list=None, mask_list=None): + # The first spatial transformer block does not have context + spatial_transformer_id = 0 + context_list = [None] + context_list + mask_list = [None] + mask_list + + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + elif isinstance(layer, SpatialTransformer): + if spatial_transformer_id >= len(context_list): + context, mask = None, None + else: + context, mask = ( + context_list[spatial_transformer_id], + mask_list[spatial_transformer_id], + ) + if mask is not None: + mask = paddle.cast(mask, dtype="bool") + x = layer(x, context, mask=mask) + spatial_transformer_id += 1 + else: + x = layer(x) + return x + + +class UNetModel(nn.Layer): + """ + The full UNet model with attention and timestep embedding. + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. 
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + extra_sa_layer=True, + num_classes=None, + extra_film_condition_dim=None, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=True, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + ): + super().__init__() + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert ( + num_head_channels != -1 + ), "Either num_heads or num_head_channels has to be set" + + if num_head_channels == -1: + assert ( + num_heads != -1 + ), "Either num_heads or num_head_channels has to be set" + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.extra_film_condition_dim = extra_film_condition_dim + self.use_checkpoint = use_checkpoint + self._dtype = "float16" if use_fp16 else "float32" + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + nn.Linear(model_channels, time_embed_dim), + nn.Silu(), + nn.Linear(time_embed_dim, time_embed_dim), + ) + + # assert not ( + # self.num_classes is not None and self.extra_film_condition_dim is not None + # ), "As for the condition of theh UNet model, you can only set using class label or an extra embedding vector (such as from CLAP). You cannot set both num_classes and extra_film_condition_dim." + + if self.num_classes is not None: + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + + self.use_extra_film_by_concat = self.extra_film_condition_dim is not None + + if self.extra_film_condition_dim is not None: + self.film_emb = nn.Linear(self.extra_film_condition_dim, time_embed_dim) + print( + "+ Use extra condition on UNet channel using Film. Extra condition dimension is %s. " + % self.extra_film_condition_dim + ) + + if context_dim is not None and not use_spatial_transformer: + assert ( + use_spatial_transformer + ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..." 
+ + if context_dim is not None and not isinstance(context_dim, list): + context_dim = [context_dim] + elif context_dim is None: + context_dim = [None] # At least use one spatial transformer + + self.input_blocks = nn.LayerList( + [ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1) + ) + ] + ) + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim + if (not self.use_extra_film_by_concat) + else time_embed_dim * 2, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + dim_head = ( + ch // num_heads + if use_spatial_transformer + else num_head_channels + ) + if extra_sa_layer: + layers.append( + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=None, + ) + ) + for context_dim_id in range(len(context_dim)): + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) + if not use_spatial_transformer + else SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim[context_dim_id], + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim + if (not self.use_extra_film_by_concat) + else time_embed_dim * 2, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch + ) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + middle_layers = [ + ResBlock( + ch, + time_embed_dim + if (not self.use_extra_film_by_concat) + else time_embed_dim * 2, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + if extra_sa_layer: + middle_layers.append( + SpatialTransformer( + ch, num_heads, dim_head, depth=transformer_depth, context_dim=None + ) + ) + for context_dim_id in range(len(context_dim)): + middle_layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) + if not use_spatial_transformer + else SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim[context_dim_id], + ) + ) + middle_layers.append( + ResBlock( + ch, + time_embed_dim + if (not self.use_extra_film_by_concat) + else time_embed_dim * 2, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + 
) + self.middle_block = TimestepEmbedSequential(*middle_layers) + + self._feature_size += ch + + self.output_blocks = nn.LayerList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(num_res_blocks + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim + if (not self.use_extra_film_by_concat) + else time_embed_dim * 2, + dropout, + out_channels=model_channels * mult, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = model_channels * mult + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ( + ch // num_heads + if use_spatial_transformer + else num_head_channels + ) + if extra_sa_layer: + layers.append( + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=None, + ) + ) + for context_dim_id in range(len(context_dim)): + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) + if not use_spatial_transformer + else SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim[context_dim_id], + ) + ) + if level and i == num_res_blocks: + out_ch = ch + layers.append( + ResBlock( + ch, + time_embed_dim + if (not self.use_extra_film_by_concat) + else time_embed_dim * 2, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) + if resblock_updown + else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch) + ) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(value=0.0) + ) + self.out = nn.Sequential( + normalization(ch), + nn.Silu(), + conv_nd(dims, model_channels, out_channels, 3, padding=1, weight_attr=weight_attr), + ) + if self.predict_codebook_ids: + self.id_predictor = nn.Sequential( + normalization(ch), + conv_nd(dims, model_channels, n_embed, 1), + # nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits + ) + + self.shape_reported = False + + def forward( + self, + x, + timesteps=None, + y=None, + context_list=None, + context_attn_mask_list=None, + **kwargs, + ): + """ + Apply the model to an input batch. + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param context: conditioning plugged in via crossattn + :param y: an [N] Tensor of labels, if class-conditional. an [N, extra_film_condition_dim] Tensor if film-embed conditional + :return: an [N x C x ...] Tensor of outputs. 
+ """ + if not self.shape_reported: + # print("The shape of UNet input is", x.size()) + self.shape_reported = True + + assert (y is not None) == ( + self.num_classes is not None or self.extra_film_condition_dim is not None + ), "must specify y if and only if the model is class-conditional or film embedding conditional" + hs = [] + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) + + if self.use_extra_film_by_concat: + emb = paddle.concat([emb, self.film_emb(y)], axis=-1) + + h = paddle.cast(x, dtype="float32") + for module in self.input_blocks: + h = module(h, emb, context_list, context_attn_mask_list) + hs.append(h) + h = self.middle_block(h, emb, context_list, context_attn_mask_list) + for module in self.output_blocks: + concate_tensor = hs.pop() + h = paddle.concat([h, concate_tensor], axis=1) + h = module(h, emb, context_list, context_attn_mask_list) + h = paddle.cast(h, dtype=x.dtype) + if self.predict_codebook_ids: + return self.id_predictor(h) + else: + return self.out(h) diff --git a/paddlemix/models/audioldm2/utils.py b/paddlemix/models/audioldm2/utils.py new file mode 100644 index 000000000..7adb63e30 --- /dev/null +++ b/paddlemix/models/audioldm2/utils.py @@ -0,0 +1,86 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import paddle
+import paddle.nn as nn
+from itertools import repeat
+import collections.abc
+from functools import partial
+
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable):
+            return x
+        return tuple(repeat(x, n))
+
+    return parse
+
+to_2tuple = _ntuple(2)
+
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    """Per-sample stochastic depth: randomly zero whole samples of the batch and rescale the survivors."""
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
+    random_tensor.floor_()  # binarize
+    # keep_prob is a Python float, so divide with the / operator instead of Tensor.divide.
+    output = (x / keep_prob) * random_tensor
+    return output
+
+
+class DropPath(nn.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+
+class Mlp(nn.Layer):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks."""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        bias=True,
+        drop=0.0,
+        use_conv=False,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        bias = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+        # A 1x1 convolution acts as a per-position Linear layer when use_conv is set.
+        linear_layer = partial(nn.Conv2D, kernel_size=1) if use_conv else nn.Linear
+
+        self.fc1 = linear_layer(in_features, hidden_features, bias_attr=bias[0])
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.fc2 = linear_layer(hidden_features, out_features, bias_attr=bias[1])
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x
+ 
\ No newline at end of file