update code

PaddlePaddle · Jan 18, 2024 · 9ff9536 · 9ff9536
1 parent 63b667c
commit 9ff9536
Show file tree

Hide file tree

Showing 6 changed files with 10 additions and 54 deletions.
diff --git a/paddlemix/examples/audioldm2/README.md b/paddlemix/examples/audioldm2/README.md
@@ -7,16 +7,21 @@
 
 ## 2. Demo
 
-## 2.1 依赖安装
+### 2.1 依赖安装
+
+- 请确保已安装 ppdiffusers ([参考方法](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/README.md?plain=1#L62))
+
+- 其余依赖安装：
+
 ```bash
 cd /paddlemix/models/audioldm2
 pip install -r requirement.txt
-
 ```
-## 2.2 动态图推理
+
+### 2.2 动态图推理
 ```bash
 python run_predict.py \
 --text "Musical constellations twinkling in the night sky, forming a cosmic melody." \
---model_name_or_path "/home/aistudio/data/data252967" \
+--model_name_or_path "/my_model_path" \
 --seed 1001 \
 ```
diff --git a/paddlemix/examples/audioldm2/__init__.py b/paddlemix/examples/audioldm2/__init__.py
diff --git a/paddlemix/examples/audioldm2/run_predict.py b/paddlemix/examples/audioldm2/run_predict.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlemix/models/audioldm2/encoders/clap_encoder.py b/paddlemix/models/audioldm2/encoders/clap_encoder.py
@@ -89,28 +89,8 @@ def _get_sinc_resample_kernel(
         raise ValueError("Low pass filter width should be positive.")
     base_freq = min(orig_freq, new_freq)
     # This will perform antialiasing filtering by removing the highest frequencies.
-    # At first I thought I only needed this when downsampling, but when upsampling
-    # you will get edge artifacts without this, as the edge is equivalent to zero padding,
-    # which will add high freq artifacts.
     base_freq *= rolloff
 
-    # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor)
-    # using the sinc interpolation formula:
-    #   x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t))
-    # We can then sample the function x(t) with a different sample rate:
-    #    y[j] = x(j / new_freq)
-    # or,
-    #    y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq))
-
-    # We see here that y[j] is the convolution of x[i] with a specific filter, for which
-    # we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing.
-    # But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq].
-    # Indeed:
-    # y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq))
-    #                 = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq))
-    #                 = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq))
-    # so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`.
-    # This will explain the F.conv1d after, with a stride of orig_freq.
     width = math.ceil(lowpass_filter_width * orig_freq / base_freq)
     # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e.,
     # they will have a lot of almost zero values to the left or to the right...
@@ -124,8 +104,6 @@ def _get_sinc_resample_kernel(
     t *= base_freq
     t = t.clip_(-lowpass_filter_width, lowpass_filter_width)
 
-    # we do not use built in torch windows here as we need to evaluate the window
-    # at specific positions, not over a regular grid.
     if resampling_method == "sinc_interp_hann":
         window = paddle.cos(t * math.pi / lowpass_filter_width / 2) ** 2
     else:

diff --git a/paddlemix/models/audioldm2/encoders/sequence2audiomae_encoder.py b/paddlemix/models/audioldm2/encoders/sequence2audiomae_encoder.py
@@ -45,10 +45,6 @@ def __init__(
         # Even though the LDM can be conditioned on multiple pooling rates,
         # our model always predicts the highest pooling rate.
 
-        # self.time_pool = max(self.cond_stage_config["crossattn_audiomae_pooled"]["params"]["time_pooling_factors"])
-        # self.freq_pool = max(self.cond_stage_config["crossattn_audiomae_pooled"]["params"]["freq_pooling_factors"])
-        # self.mae_token_num = int(512/(self.time_pool*self.freq_pool))
-
         self.mae_token_num = sequence_gen_length
         self.sequence_input_key = sequence_input_key
         self.sequence_input_embed_dim = sequence_input_embed_dim
@@ -210,8 +206,6 @@ def generate_partial(self, batch, cond_dict=None, no_grad=False):
             cond_dict = self.get_input(batch)
 
         print("Generate partially prompted audio with in-context learning")
-        # self.model.train()
-        # assert self.model.training==True
 
         target_embeds, target_embeds_attn_mask = (
             cond_dict["crossattn_audiomae_pooled"][0],
@@ -259,9 +253,6 @@ def generate(self, batch, cond_dict=None, no_grad=False):
         if cond_dict is None:
             cond_dict = self.get_input(batch)
 
-        # self.model.train()
-        # print("!!!!!!!!!!!!!train")
-
         (
             input_embeds,
             input_embeds_attn_mask,
@@ -320,10 +311,6 @@ def get_input(self, batch):
                     "cond_stage_key"
                 ]
 
-                # if(not self.training):
-                #     if(isinstance(self.cond_stage_models[self.cond_stage_model_metadata[cond_model_key]["model_idx"]], CLAPAudioEmbeddingClassifierFreev2)):
-                #         assert cond_stage_key == "text" # CLAP model should use text for evaluation
-
                 # The original data for conditioning
                 xc = self.get_input_item(batch, cond_stage_key)
                 if type(xc) == paddle.Tensor:

diff --git a/paddlemix/models/audioldm2/requirement.txt b/paddlemix/models/audioldm2/requirement.txt
@@ -1,5 +1,4 @@
 librosa
-ppdiffusers
 unidecode
 phonemizer
 espeak