Skip to content

Commit

Permalink
update code
Browse files Browse the repository at this point in the history
  • Loading branch information
NKNaN committed Jan 18, 2024
1 parent 63b667c commit 9ff9536
Show file tree
Hide file tree
Showing 6 changed files with 10 additions and 54 deletions.
13 changes: 9 additions & 4 deletions paddlemix/examples/audioldm2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,21 @@

## 2. Demo

## 2.1 依赖安装
### 2.1 依赖安装

- 请确保已安装 ppdiffusers ([参考方法](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/README.md?plain=1#L62))

- 其余依赖安装:

```bash
cd /paddlemix/models/audioldm2
pip install -r requirement.txt

```
## 2.2 动态图推理

### 2.2 动态图推理
```bash
python run_predict.py \
--text "Musical constellations twinkling in the night sky, forming a cosmic melody." \
--model_name_or_path "/home/aistudio/data/data252967" \
--model_name_or_path "/my_model_path" \
--seed 1001
```
13 changes: 0 additions & 13 deletions paddlemix/examples/audioldm2/__init__.py

This file was deleted.

2 changes: 1 addition & 1 deletion paddlemix/examples/audioldm2/run_predict.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
22 changes: 0 additions & 22 deletions paddlemix/models/audioldm2/encoders/clap_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,28 +89,8 @@ def _get_sinc_resample_kernel(
raise ValueError("Low pass filter width should be positive.")
base_freq = min(orig_freq, new_freq)
# This will perform antialiasing filtering by removing the highest frequencies.
# At first I thought I only needed this when downsampling, but when upsampling
# you will get edge artifacts without this, as the edge is equivalent to zero padding,
# which will add high freq artifacts.
base_freq *= rolloff

# The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor)
# using the sinc interpolation formula:
# x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t))
# We can then sample the function x(t) with a different sample rate:
# y[j] = x(j / new_freq)
# or,
# y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq))

# We see here that y[j] is the convolution of x[i] with a specific filter, for which
# we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing.
# But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq].
# Indeed:
# y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq))
# = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq))
# = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq))
# so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`.
# This will explain the F.conv1d after, with a stride of orig_freq.
width = math.ceil(lowpass_filter_width * orig_freq / base_freq)
# If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e.,
# they will have a lot of almost zero values to the left or to the right...
Expand All @@ -124,8 +104,6 @@ def _get_sinc_resample_kernel(
t *= base_freq
t = t.clip_(-lowpass_filter_width, lowpass_filter_width)

# we do not use built in torch windows here as we need to evaluate the window
# at specific positions, not over a regular grid.
if resampling_method == "sinc_interp_hann":
window = paddle.cos(t * math.pi / lowpass_filter_width / 2) ** 2
else:
Expand Down
13 changes: 0 additions & 13 deletions paddlemix/models/audioldm2/encoders/sequence2audiomae_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,6 @@ def __init__(
# Even though the LDM can be conditioned on multiple pooling rates,
# our model always predicts the highest pooling rate.

# self.time_pool = max(self.cond_stage_config["crossattn_audiomae_pooled"]["params"]["time_pooling_factors"])
# self.freq_pool = max(self.cond_stage_config["crossattn_audiomae_pooled"]["params"]["freq_pooling_factors"])
# self.mae_token_num = int(512/(self.time_pool*self.freq_pool))

self.mae_token_num = sequence_gen_length
self.sequence_input_key = sequence_input_key
self.sequence_input_embed_dim = sequence_input_embed_dim
Expand Down Expand Up @@ -210,8 +206,6 @@ def generate_partial(self, batch, cond_dict=None, no_grad=False):
cond_dict = self.get_input(batch)

print("Generate partially prompted audio with in-context learning")
# self.model.train()
# assert self.model.training==True

target_embeds, target_embeds_attn_mask = (
cond_dict["crossattn_audiomae_pooled"][0],
Expand Down Expand Up @@ -259,9 +253,6 @@ def generate(self, batch, cond_dict=None, no_grad=False):
if cond_dict is None:
cond_dict = self.get_input(batch)

# self.model.train()
# print("!!!!!!!!!!!!!train")

(
input_embeds,
input_embeds_attn_mask,
Expand Down Expand Up @@ -320,10 +311,6 @@ def get_input(self, batch):
"cond_stage_key"
]

# if(not self.training):
# if(isinstance(self.cond_stage_models[self.cond_stage_model_metadata[cond_model_key]["model_idx"]], CLAPAudioEmbeddingClassifierFreev2)):
# assert cond_stage_key == "text" # CLAP model should use text for evaluation

# The original data for conditioning
xc = self.get_input_item(batch, cond_stage_key)
if type(xc) == paddle.Tensor:
Expand Down
1 change: 0 additions & 1 deletion paddlemix/models/audioldm2/requirement.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
librosa
ppdiffusers
unidecode
phonemizer
espeak

0 comments on commit 9ff9536

Please sign in to comment.