From fcb169022962e8a33cd408eae566ab318696f5a7 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jul 2024 09:53:42 +0200 Subject: [PATCH 01/71] created auto task mappings --- optimum/onnxruntime/modeling_diffusion.py | 43 +++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index f4e5475211..3e5aed3fb0 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -18,6 +18,7 @@ import shutil import warnings from abc import abstractmethod +from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Dict, Optional, Union @@ -26,6 +27,7 @@ import torch from diffusers import ( DDIMScheduler, + DiffusionPipeline, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, @@ -69,8 +71,8 @@ logger = logging.getLogger(__name__) -class ORTStableDiffusionPipelineBase(ORTModel): - auto_model_class = StableDiffusionPipeline +class ORTDiffusionPipeline(ORTModel): + auto_model_class = DiffusionPipeline main_input_name = "input_ids" base_model_prefix = "onnx_model" config_name = "model_index.json" @@ -350,9 +352,9 @@ def _from_pretrained( text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=new_model_save_dir - / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER - / text_encoder_2_file_name, + text_encoder_2_path=( + new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name + ), provider=provider, session_options=session_options, provider_options=provider_options, @@ -561,7 +563,7 @@ def forward(self, sample: np.ndarray): @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ @@ -570,7 +572,7 @@ class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusion @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). 
""" @@ -579,7 +581,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDi @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ @@ -588,7 +590,7 @@ class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDi @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentConsistencyPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ @@ -596,7 +598,7 @@ class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentCo __call__ = LatentConsistencyPipelineMixin.__call__ -class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase): +class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline): auto_model_class = StableDiffusionXLImg2ImgPipeline def __init__( @@ -661,3 +663,24 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab """ __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + + +AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), + ] +) + +AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ] +) + +AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ] +) From 1cbb5448845036104648c6c20267a041a4568250 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 18 Jul 2024 16:50:32 +0200 Subject: [PATCH 02/71] added correct auto classes --- optimum/modeling_base.py | 9 ++++++--- optimum/onnxruntime/modeling_diffusion.py | 24 +++++++++++++---------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 5bab0622de..3da2d9d0d2 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -85,7 +85,6 @@ class PreTrainedModel(ABC): # noqa: F811 class OptimizedModel(PreTrainedModel): config_class = AutoConfig - load_tf_weights = None base_model_prefix = "optimized_model" config_name = CONFIG_NAME @@ -378,10 +377,14 @@ def from_pretrained( ) model_id, revision = model_id.split("@") - library_name = TasksManager.infer_library_from_model(model_id, subfolder, revision, cache_dir, token=token) + library_name = TasksManager.infer_library_from_model( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if library_name == "timm": - config = PretrainedConfig.from_pretrained(model_id, subfolder, revision) + config = PretrainedConfig.from_pretrained( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if config is None: if os.path.isdir(os.path.join(model_id, 
subfolder)) and cls.config_name == CONFIG_NAME: diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 3e5aed3fb0..59732e63ea 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -28,10 +28,14 @@ from diffusers import ( DDIMScheduler, DiffusionPipeline, + LatentConsistencyModelPipeline, LMSDiscreteScheduler, PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLPipeline, ) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available @@ -73,11 +77,13 @@ class ORTDiffusionPipeline(ORTModel): auto_model_class = DiffusionPipeline - main_input_name = "input_ids" + main_input_name = "prompt" base_model_prefix = "onnx_model" config_name = "model_index.json" sub_component_config_name = "config.json" + # TODO: instead of having a bloated init, we should probably have an init per pipeline, + # so that we can easily add new pipelines without having to modify the base class def __init__( self, vae_decoder_session: ort.InferenceSession, @@ -401,7 +407,7 @@ def _from_transformers( provider_options: Optional[Dict[str, Any]] = None, use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTStableDiffusionPipeline": + ) -> "ORTDiffusionPipeline": if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", @@ -568,7 +574,7 @@ class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMi ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ - __call__ = StableDiffusionPipelineMixin.__call__ + auto_model_class = StableDiffusionPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -577,7 +583,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ - __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ + auto_model_class = StableDiffusionImg2ImgPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -586,7 +592,7 @@ class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInp ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ - __call__ = StableDiffusionInpaintPipelineMixin.__call__ + auto_model_class = StableDiffusionInpaintPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -595,12 +601,10 @@ class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyP ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). 
""" - __call__ = LatentConsistencyPipelineMixin.__call__ + auto_model_class = LatentConsistencyModelPipeline class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline): - auto_model_class = StableDiffusionXLImg2ImgPipeline - def __init__( self, vae_decoder_session: ort.InferenceSession, @@ -653,7 +657,7 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ - __call__ = StableDiffusionXLPipelineMixin.__call__ + auto_model_class = StableDiffusionXLPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -662,7 +666,7 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). """ - __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + auto_model_class = StableDiffusionXLImg2ImgPipeline AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( From cdba70ea788938f2c632132606f64a95e476b761 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jul 2024 09:53:42 +0200 Subject: [PATCH 03/71] created auto task mappings --- optimum/onnxruntime/modeling_diffusion.py | 43 +++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index f4e5475211..3e5aed3fb0 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -18,6 +18,7 @@ import shutil import warnings from abc import abstractmethod +from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Dict, Optional, Union @@ -26,6 +27,7 @@ import torch from diffusers import ( DDIMScheduler, + DiffusionPipeline, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, @@ -69,8 +71,8 @@ logger = logging.getLogger(__name__) -class ORTStableDiffusionPipelineBase(ORTModel): - auto_model_class = StableDiffusionPipeline +class ORTDiffusionPipeline(ORTModel): + auto_model_class = DiffusionPipeline main_input_name = "input_ids" base_model_prefix = "onnx_model" config_name = "model_index.json" @@ -350,9 +352,9 @@ def _from_pretrained( text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=new_model_save_dir - / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER - / text_encoder_2_file_name, + text_encoder_2_path=( + new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name + ), provider=provider, session_options=session_options, provider_options=provider_options, @@ -561,7 +563,7 @@ def forward(self, sample: np.ndarray): @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to 
[diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ @@ -570,7 +572,7 @@ class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusion @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ @@ -579,7 +581,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDi @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ @@ -588,7 +590,7 @@ class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDi @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentConsistencyPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). 
""" @@ -596,7 +598,7 @@ class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentCo __call__ = LatentConsistencyPipelineMixin.__call__ -class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase): +class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline): auto_model_class = StableDiffusionXLImg2ImgPipeline def __init__( @@ -661,3 +663,24 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab """ __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + + +AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), + ] +) + +AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ] +) + +AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ] +) From 5bebbd52040c9d057bbbf36c6c9d78fec2f785a0 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 18 Jul 2024 16:50:32 +0200 Subject: [PATCH 04/71] added correct auto classes --- optimum/modeling_base.py | 9 ++++++--- optimum/onnxruntime/modeling_diffusion.py | 24 +++++++++++++---------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 5bab0622de..3da2d9d0d2 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -85,7 +85,6 @@ class PreTrainedModel(ABC): # noqa: F811 class OptimizedModel(PreTrainedModel): config_class = AutoConfig - load_tf_weights = None base_model_prefix = "optimized_model" config_name = CONFIG_NAME @@ -378,10 +377,14 @@ def from_pretrained( ) model_id, revision = model_id.split("@") - library_name = TasksManager.infer_library_from_model(model_id, subfolder, revision, cache_dir, token=token) + library_name = TasksManager.infer_library_from_model( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if library_name == "timm": - config = PretrainedConfig.from_pretrained(model_id, subfolder, revision) + config = PretrainedConfig.from_pretrained( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if config is None: if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 3e5aed3fb0..59732e63ea 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -28,10 +28,14 @@ from diffusers import ( DDIMScheduler, DiffusionPipeline, + LatentConsistencyModelPipeline, LMSDiscreteScheduler, PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLPipeline, ) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available @@ -73,11 +77,13 @@ class ORTDiffusionPipeline(ORTModel): auto_model_class = DiffusionPipeline - main_input_name = "input_ids" + main_input_name = "prompt" base_model_prefix = "onnx_model" config_name = "model_index.json" sub_component_config_name = "config.json" + # TODO: instead of having a bloated init, we should probably have an init per pipeline, + # so that we can easily add new pipelines without having to modify the base class def __init__( self, 
vae_decoder_session: ort.InferenceSession, @@ -401,7 +407,7 @@ def _from_transformers( provider_options: Optional[Dict[str, Any]] = None, use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTStableDiffusionPipeline": + ) -> "ORTDiffusionPipeline": if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", @@ -568,7 +574,7 @@ class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMi ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ - __call__ = StableDiffusionPipelineMixin.__call__ + auto_model_class = StableDiffusionPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -577,7 +583,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ - __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ + auto_model_class = StableDiffusionImg2ImgPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -586,7 +592,7 @@ class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInp ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ - __call__ = StableDiffusionInpaintPipelineMixin.__call__ + auto_model_class = StableDiffusionInpaintPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -595,12 +601,10 @@ class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyP ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ - __call__ = LatentConsistencyPipelineMixin.__call__ + auto_model_class = LatentConsistencyModelPipeline class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline): - auto_model_class = StableDiffusionXLImg2ImgPipeline - def __init__( self, vae_decoder_session: ort.InferenceSession, @@ -653,7 +657,7 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ - __call__ = StableDiffusionXLPipelineMixin.__call__ + auto_model_class = StableDiffusionXLPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -662,7 +666,7 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
""" - __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + auto_model_class = StableDiffusionXLImg2ImgPipeline AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( From 40b2ac0ab619725aed28c3def0df3987857be6b5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 19 Jul 2024 09:22:03 +0200 Subject: [PATCH 05/71] added ort/auto diffusion classes --- optimum/onnxruntime/modeling_diffusion.py | 104 +++++++++++++++++++++- 1 file changed, 101 insertions(+), 3 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 59732e63ea..a5fcdc0ae5 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -26,6 +26,10 @@ import numpy as np import torch from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + ConfigMixin, DDIMScheduler, DiffusionPipeline, LatentConsistencyModelPipeline, @@ -37,10 +41,16 @@ StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline, ) +from diffusers.pipelines.auto_pipeline import ( + AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, + AUTO_INPAINT_PIPELINES_MAPPING, + AUTO_TEXT2IMAGE_PIPELINES_MAPPING, +) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available from huggingface_hub import snapshot_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import validate_hf_hub_args from transformers import CLIPFeatureExtractor, CLIPTokenizer from transformers.file_utils import add_end_docstrings @@ -576,6 +586,8 @@ class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMi auto_model_class = StableDiffusionPipeline + __call__ = StableDiffusionPipelineMixin.__call__ + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipelineMixin): @@ -585,6 +597,8 @@ class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg auto_model_class = StableDiffusionImg2ImgPipeline + __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipelineMixin): @@ -594,6 +608,8 @@ class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInp auto_model_class = StableDiffusionInpaintPipeline + __call__ = StableDiffusionInpaintPipelineMixin.__call__ + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyPipelineMixin): @@ -603,6 +619,8 @@ class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyP auto_model_class = LatentConsistencyModelPipeline + __call__ = LatentConsistencyPipelineMixin.__call__ + class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline): def __init__( @@ -659,6 +677,8 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu auto_model_class = StableDiffusionXLPipeline + __call__ = StableDiffusionXLPipelineMixin.__call__ + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin): @@ -668,23 +688,101 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab auto_model_class = StableDiffusionXLImg2ImgPipeline + __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + 
-AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( +ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ + ("lcm", ORTLatentConsistencyModelPipeline), ("stable-diffusion", ORTStableDiffusionPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), ] ) -AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( +ORT_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), ] ) -AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict( +ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionInpaintPipeline), ] ) + + +def _get_task_class(ort_mapping, pipeline_class_name, throw_error_if_not_exist: bool = True): + for model_type, ort_pipeline_class in ort_mapping.items(): + if pipeline_class_name == ort_pipeline_class.auto_model_class.__name__: + return ort_pipeline_class + + if throw_error_if_not_exist: + raise ValueError(f"ORTPipeline can't find a pipeline linked to {pipeline_class_name}") + + +class ORTPipelineBase(ConfigMixin): + config_name = "model_index.json" + + ort_pipeline_mapping = None + auto_pipeline_mapping = None + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_pipe(pipeline)` methods." + ) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + + original_class_name = config["_class_name"] + + pipeline_cls = _get_task_class( + cls.ort_pipeline_mapping, + cls.auto_pipeline_mapping, + original_class_name, + ) + + return pipeline_cls.from_pretrained(pretrained_model_or_path, **kwargs) + + @classmethod + def from_pipe(cls, **kwargs): + raise NotImplementedError( + f"from_pipe is not yet implemented for {cls.__name__}. Please use from_pretrained instead." 
+ ) + + +class ORTPipelineForText2Image(ORTPipelineBase): + auto_model_class = AutoPipelineForText2Image + + ort_pipeline_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING + auto_pipeline_mapping = AUTO_TEXT2IMAGE_PIPELINES_MAPPING + + +class ORTPipelineForImage2Image(ORTPipelineBase): + auto_model_class = AutoPipelineForImage2Image + + ort_pipeline_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING + auto_pipeline_mapping = AUTO_IMAGE2IMAGE_PIPELINES_MAPPING + + +class ORTPipelineForInpainting(ORTPipelineBase): + auto_model_class = AutoPipelineForInpainting + + ort_pipeline_mapping = ORT_INPAINT_PIPELINES_MAPPING + auto_pipeline_mapping = AUTO_INPAINT_PIPELINES_MAPPING From 29bfe57c01ff7c74503e094f01430168c3763b53 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 31 Jul 2024 15:07:53 +0200 Subject: [PATCH 06/71] fix ORTPipeline detection --- optimum/onnxruntime/__init__.py | 6 ++++ optimum/onnxruntime/modeling_diffusion.py | 42 +++++++++++++++-------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index f1d4f63a9f..35cbf14587 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -88,6 +88,9 @@ "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", + "ORTPipelineForText2Image", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", ] @@ -147,6 +150,9 @@ else: from .modeling_diffusion import ( ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index a5fcdc0ae5..982dd12334 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -712,14 +712,32 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ] ) - -def _get_task_class(ort_mapping, pipeline_class_name, throw_error_if_not_exist: bool = True): - for model_type, ort_pipeline_class in ort_mapping.items(): - if pipeline_class_name == ort_pipeline_class.auto_model_class.__name__: - return ort_pipeline_class +SUPPORTED_TASKS_MAPPINGS = [ + ORT_TEXT2IMAGE_PIPELINES_MAPPING, + ORT_IMAGE2IMAGE_PIPELINES_MAPPING, + ORT_INPAINT_PIPELINES_MAPPING, +] + + +def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True): + def get_model(pipeline_class_name): + for task_mapping in SUPPORTED_TASKS_MAPPINGS: + for model_name, pipeline in task_mapping.items(): + if ( + pipeline.__name__ == pipeline_class_name + or pipeline.auto_model_class.__name__ == pipeline_class_name + ): + return model_name + + model_name = get_model(pipeline_class_name) + + if model_name is not None: + task_class = mapping.get(model_name, None) + if task_class is not None: + return task_class if throw_error_if_not_exist: - raise ValueError(f"ORTPipeline can't find a pipeline linked to {pipeline_class_name}") + raise ValueError(f"AutoPipeline can't find a pipeline linked to {pipeline_class_name} for {model_name}") class ORTPipelineBase(ConfigMixin): @@ -749,16 +767,12 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): } config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] - original_class_name = config["_class_name"] - - 
pipeline_cls = _get_task_class( - cls.ort_pipeline_mapping, - cls.auto_pipeline_mapping, - original_class_name, - ) + ort_pipeline_cls = _get_task_class(cls.ort_pipeline_mapping, class_name) - return pipeline_cls.from_pretrained(pretrained_model_or_path, **kwargs) + return ort_pipeline_cls.from_pretrained(pretrained_model_or_path, **kwargs) @classmethod def from_pipe(cls, **kwargs): From f6df38ccca773e3de0cc55e567b0593fca4ece12 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 31 Jul 2024 15:08:13 +0200 Subject: [PATCH 07/71] start test refactoring --- optimum/utils/testing_utils.py | 11 + tests/onnxruntime/test_diffusion.py | 730 ++++++++++++++++++ .../test_stable_diffusion_pipeline.py | 562 -------------- tests/onnxruntime/utils_onnxruntime_tests.py | 15 +- 4 files changed, 752 insertions(+), 566 deletions(-) create mode 100644 tests/onnxruntime/test_diffusion.py delete mode 100644 tests/onnxruntime/test_stable_diffusion_pipeline.py diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index 76fe9a05b1..6579e230dc 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -84,6 +84,17 @@ def require_ort_rocm(test_case): ) +def require_ort_cuda(test_case): + """Decorator marking a test that requires CUDAExecutionProvider for ONNX Runtime.""" + import onnxruntime as ort + + providers = ort.get_available_providers() + + return unittest.skipUnless("CUDAExecutionProvider" == providers[0], "test requires CUDAExecutionProvider")( + test_case + ) + + def require_hf_token(test_case): """ Decorator marking a test that requires huggingface hub token. diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py new file mode 100644 index 0000000000..2d5ab7a7f8 --- /dev/null +++ b/tests/onnxruntime/test_diffusion.py @@ -0,0 +1,730 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import random +import unittest +from typing import Dict + +import numpy as np +import PIL +import pytest +import torch +from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + StableDiffusionPipeline, + StableDiffusionXLPipeline, +) +from diffusers.utils import load_image +from diffusers.utils.testing_utils import floats_tensor +from packaging.version import Version, parse +from parameterized import parameterized +from transformers.testing_utils import require_torch_gpu +from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin + +from optimum.onnxruntime import ( + ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, + ORTStableDiffusionImg2ImgPipeline, + ORTStableDiffusionInpaintPipeline, + ORTStableDiffusionPipeline, + ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLPipeline, +) +from optimum.onnxruntime.modeling_diffusion import ( + ORTModelTextEncoder, + ORTModelUnet, + ORTModelVaeDecoder, + ORTModelVaeEncoder, +) +from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor +from optimum.utils.import_utils import _diffusers_version +from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm + + +if parse(_diffusers_version) > Version("0.21.4"): + from diffusers import LatentConsistencyModelPipeline + + +def _generate_inputs(batch_size=1): + inputs = { + "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + +def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"): + if input_type == "pil": + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo.png" + ).resize((width, height)) + elif input_type == "np": + image = np.random.rand(height, width, channel) + elif input_type == "pt": + image = torch.rand((channel, height, width)) + + return [image] * batch_size + + +def to_np(image): + if isinstance(image[0], PIL.Image.Image): + return np.stack([np.array(i) for i in image], axis=0) + elif isinstance(image, torch.Tensor): + return image.cpu().numpy().transpose(0, 2, 3, 1) + return image + + +class ORTPipelineForText2ImageTest(ORTModelTestMixin): + ARCHITECTURE_TO_ORTMODEL_CLASS = { + "stable-diffusion": ORTStableDiffusionPipeline, + "stable-diffusion-xl": ORTStableDiffusionXLPipeline, + "lcm": ORTLatentConsistencyModelPipeline, + } + + AUTOMODEL_CLASS = AutoPipelineForText2Image + ORTMODEL_CLASS = ORTPipelineForText2Image + + TASK = "text-to-image" + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_ort_pipeline_class_dispatch(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertIsInstance(pipeline, self.ARCHITECTURE_TO_ORTMODEL_CLASS[model_arch]) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + 
@require_diffusers + def test_num_images_per_prompt(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertEqual(pipeline.vae_scale_factor, 2) + self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) + self.assertEqual(pipeline.unet.config["in_channels"], 4) + + batch_size, height = 1, 32 + for width in [64, 32]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + for num_images in [1, 3]: + outputs = pipeline(**inputs, num_images_per_prompt=num_images).images + self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + + @parameterized.expand( + grid_parameters( + {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]} + ) + ) + @require_torch_gpu + @pytest.mark.cuda_ep_test + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand( + grid_parameters( + {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]} + ) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + @require_diffusers + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_callback(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: + callback_fn.has_been_called = True + callback_fn.number_of_steps += 1 + + callback_fn.has_been_called = False + callback_fn.number_of_steps = 0 + + inputs = self.generate_inputs(height=64, width=64) + pipeline(**inputs, callback=callback_fn, callback_steps=1) + self.assertTrue(callback_fn.has_been_called) + self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"]) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_shape(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + height, width, 
batch_size = 128, 64, 1 + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + for output_type in ["np", "pil", "latent"]: + inputs["output_type"] = output_type + outputs = pipeline(**inputs).images + if output_type == "pil": + self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) + elif output_type == "np": + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + else: + self.assertEqual( + outputs.shape, + (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ) + + @parameterized.expand(["stable-diffusion", "stable-diffusion-xl"]) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + inputs = _generate_inputs() + height, width = 64, 64 + np.random.seed(0) + ort_outputs_1 = pipeline(**inputs, height=height, width=width) + np.random.seed(0) + ort_outputs_2 = pipeline(**inputs, height=height, width=width) + ort_outputs_3 = pipeline(**inputs, height=height, width=width) + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + @parameterized.expand(["stable-diffusion"]) + def test_negative_prompt(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + negative_prompt = ["This is a negative prompt"] + np.random.seed(0) + image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1] + prompt = inputs.pop("prompt") + embeds = [] + for p in [prompt, negative_prompt]: + text_inputs = pipeline.tokenizer( + p, + padding="max_length", + max_length=pipeline.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32)) + embeds.append(pipeline.text_encoder(text_inputs)[0]) + + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds + np.random.seed(0) + image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1] + self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4)) + + def generate_inputs(self, height=128, width=128, batch_size=1): + inputs = _generate_inputs(batch_size=batch_size) + inputs["height"] = height + inputs["width"] = width + return inputs + + +class ORTPipelineForImage2ImageTest(ORTModelTestMixin): + ARCHITECTURE_TO_ORTMODEL_CLASS = { + "stable-diffusion": ORTStableDiffusionImg2ImgPipeline, + "stable-diffusion-xl": ORTStableDiffusionXLImg2ImgPipeline, + } + AUTOMODEL_CLASS = AutoPipelineForImage2Image + ORTMODEL_CLASS = ORTPipelineForImage2Image + + TASK = "image-to-image" + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + 
@parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_num_images_per_prompt(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertEqual(pipeline.vae_scale_factor, 2) + self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) + self.assertEqual(pipeline.unet.config["in_channels"], 4) + + batch_size, height = 1, 32 + for width in [64, 32]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + for num_images in [1, 3]: + outputs = pipeline(**inputs, num_images_per_prompt=num_images).images + self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + + @parameterized.expand( + grid_parameters( + {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]} + ) + ) + @require_torch_gpu + @pytest.mark.cuda_ep_test + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand( + grid_parameters( + {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]} + ) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + @require_diffusers + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_callback(self, model_arch: str): + def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: + callback_fn.has_been_called = True + callback_fn.number_of_steps += 1 + + pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + callback_fn.has_been_called = False + callback_fn.number_of_steps = 0 + inputs = self.generate_inputs(height=64, width=64) + pipe(**inputs, callback=callback_fn, callback_steps=1) + self.assertTrue(callback_fn.has_been_called) + self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"]) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_shape(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + height, width, batch_size = 128, 64, 1 + pipeline = 
self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + for input_type in ["np", "pil", "pt"]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) + + for output_type in ["np", "pil", "latent"]: + inputs["output_type"] = output_type + outputs = pipeline(**inputs).images + if output_type == "pil": + self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) + elif output_type == "np": + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + else: + self.assertEqual( + outputs.shape, + (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ) + + def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): + inputs = _generate_inputs(batch_size=batch_size) + inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) + inputs["strength"] = 0.75 + return inputs + + # @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + # @require_diffusers + # def test_shape(self, model_arch: str): + # model_args = {"test_name": model_arch, "model_arch": model_arch} + # self._setup(model_args) + # height, width, batch_size = 128, 64, 1 + # pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + # if self.TASK == "image-to-image": + # input_types = ["np", "pil", "pt"] + # elif self.TASK == "text-to-image": + # input_types = ["np"] + # else: + # input_types = ["pil"] + + # for input_type in input_types: + # if self.TASK == "image-to-image": + # inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) + # else: + # inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + # for output_type in ["np", "pil", "latent"]: + # inputs["output_type"] = output_type + # outputs = pipeline(**inputs).images + # if output_type == "pil": + # self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) + # elif output_type == "np": + # self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + # else: + # self.assertEqual( + # outputs.shape, + # (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + # ) + + +# class ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase): +# SUPPORTED_ARCHITECTURES = ["stable-diffusion"] +# ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline +# TASK = "image-to-image" + +# @parameterized.expand(SUPPORTED_ARCHITECTURES) +# @require_diffusers +# def test_compare_diffusers_pipeline(self, model_arch: str): +# model_args = {"test_name": model_arch, "model_arch": model_arch} +# self._setup(model_args) +# height, width = 128, 128 + +# inputs = self.generate_inputs(height=height, width=width) +# inputs["prompt"] = "A painting of a squirrel eating a burger" +# inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED)) + +# ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) +# ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images + +# diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) +# diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images + +# self.assertTrue(np.allclose(ort_output, 
diffusers_onnx_output, atol=1e-1)) + + +# class ORTStableDiffusionPipelineTest(unittest.TestCase): +# SUPPORTED_ARCHITECTURES = [ +# "stable-diffusion", +# ] +# ORTMODEL_CLASS = ORTStableDiffusionPipeline +# TASK = "text-to-image" + + +# @parameterized.expand(SUPPORTED_ARCHITECTURES) +# @require_diffusers +# def test_image_reproducibility(self, model_arch: str): +# pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) +# inputs = _generate_inputs() +# height, width = 64, 32 +# np.random.seed(0) +# ort_outputs_1 = pipeline(**inputs, height=height, width=width) +# np.random.seed(0) +# ort_outputs_2 = pipeline(**inputs, height=height, width=width) +# ort_outputs_3 = pipeline(**inputs, height=height, width=width) +# # Compare model outputs +# self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) +# self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + +# @parameterized.expand(SUPPORTED_ARCHITECTURES) +# def test_negative_prompt(self, model_arch: str): +# pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) +# inputs = _generate_inputs() +# inputs["height"], inputs["width"] = 64, 32 +# negative_prompt = ["This is a negative prompt"] +# np.random.seed(0) +# image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1] +# prompt = inputs.pop("prompt") +# embeds = [] +# for p in [prompt, negative_prompt]: +# text_inputs = pipeline.tokenizer( +# p, +# padding="max_length", +# max_length=pipeline.tokenizer.model_max_length, +# truncation=True, +# return_tensors="np", +# ) +# text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32)) +# embeds.append(pipeline.text_encoder(text_inputs)[0]) + +# inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds +# np.random.seed(0) +# image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1] +# self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4)) + + +# class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin): +# SUPPORTED_ARCHITECTURES = [ +# "stable-diffusion-xl", +# ] +# ORTMODEL_CLASS = ORTStableDiffusionXLPipeline +# TASK = "text-to-image" + +# @parameterized.expand(SUPPORTED_ARCHITECTURES) +# @require_diffusers +# def test_compare_to_diffusers(self, model_arch: str): +# ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) +# self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) +# self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder) +# self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) +# self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) +# self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) +# self.assertIsInstance(ort_pipeline.config, Dict) + +# pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch]) +# batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 +# latents = ort_pipeline.prepare_latents( +# batch_size * num_images_per_prompt, +# ort_pipeline.unet.config["in_channels"], +# height, +# width, +# dtype=np.float32, +# generator=np.random.RandomState(0), +# ) + +# kwargs = { +# "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, +# "num_inference_steps": 1, +# "num_images_per_prompt": num_images_per_prompt, +# "height": height, +# "width": width, +# "guidance_rescale": 0.1, +# } + +# for output_type in ["latent", "np"]: +# ort_outputs = ort_pipeline(latents=latents, 
output_type=output_type, **kwargs).images +# self.assertIsInstance(ort_outputs, np.ndarray) +# with torch.no_grad(): +# outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images + +# # Compare model outputs +# self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) +# # Compare model devices +# self.assertEqual(pipeline.device, ort_pipeline.device) + +# @parameterized.expand(SUPPORTED_ARCHITECTURES) +# @require_diffusers +# def test_image_reproducibility(self, model_arch: str): +# pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) +# inputs = _generate_inputs() +# height, width = 64, 32 +# np.random.seed(0) +# ort_outputs_1 = pipeline(**inputs, height=height, width=width) +# np.random.seed(0) +# ort_outputs_2 = pipeline(**inputs, height=height, width=width) +# ort_outputs_3 = pipeline(**inputs, height=height, width=width) +# self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) +# self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + +# class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase): +# SUPPORTED_ARCHITECTURES = [ +# "stable-diffusion", +# ] +# ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline +# TASK = "inpainting" + +# @parameterized.expand(SUPPORTED_ARCHITECTURES) +# @require_diffusers +# def test_compare_diffusers_pipeline(self, model_arch: str): +# model_args = {"test_name": model_arch, "model_arch": model_arch} +# self._setup(model_args) +# ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) +# diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch]) +# height, width = 64, 64 +# latents_shape = ( +# 1, +# ort_pipeline.vae_decoder.config["latent_channels"], +# height // ort_pipeline.vae_scale_factor, +# width // ort_pipeline.vae_scale_factor, +# ) +# inputs = self.generate_inputs(height=height, width=width) + +# np_latents = np.random.rand(*latents_shape).astype(np.float32) +# torch_latents = torch.from_numpy(np_latents) + +# ort_outputs = ort_pipeline(**inputs, latents=np_latents).images +# self.assertEqual(ort_outputs.shape, (1, height, width, 3)) + +# diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images +# self.assertEqual(diffusers_outputs.shape, (1, height, width, 3)) + +# self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4)) + +# def generate_inputs(self, height=128, width=128, batch_size=1): +# inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width) +# inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] +# inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] +# return inputs + + +# class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin): +# SUPPORTED_ARCHITECTURES = [ +# "stable-diffusion-xl", +# ] +# ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline +# TASK = "image-to-image" + +# @parameterized.expand(SUPPORTED_ARCHITECTURES) +# @require_diffusers +# def test_inference(self, model_arch: str): +# model_args = {"test_name": model_arch, "model_arch": model_arch} +# self._setup(model_args) +# pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + +# height, width = 128, 128 +# inputs = self.generate_inputs(height=height, width=width) +# inputs["image"] = load_image( +# 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" +# "/in_paint/overture-creations-5sI6fQgYIuo.png" +# ).resize((width, height)) +# output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] +# expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080]) + +# self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1)) + +# def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): +# inputs = _generate_inputs(batch_size=batch_size) +# inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) +# inputs["strength"] = 0.75 +# return inputs + + +# class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin): +# SUPPORTED_ARCHITECTURES = [ +# "latent-consistency", +# ] +# ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline +# TASK = "text-to-image" + +# @parameterized.expand(SUPPORTED_ARCHITECTURES) +# @require_diffusers +# @unittest.skipIf( +# parse(_diffusers_version) <= Version("0.21.4"), +# "not supported with this diffusers version, needs diffusers>=v0.22.0", +# ) +# def test_compare_to_diffusers(self, model_arch: str): +# ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) +# self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) +# self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) +# self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) +# self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) +# self.assertIsInstance(ort_pipeline.config, Dict) + +# pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch]) +# batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 +# latents = ort_pipeline.prepare_latents( +# batch_size * num_images_per_prompt, +# ort_pipeline.unet.config["in_channels"], +# height, +# width, +# dtype=np.float32, +# generator=np.random.RandomState(0), +# ) + +# kwargs = { +# "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, +# "num_inference_steps": 1, +# "num_images_per_prompt": num_images_per_prompt, +# "height": height, +# "width": width, +# "guidance_scale": 8.5, +# } + +# for output_type in ["latent", "np"]: +# ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images +# self.assertIsInstance(ort_outputs, np.ndarray) +# with torch.no_grad(): +# outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images + +# # Compare model outputs +# self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) +# # Compare model devices +# self.assertEqual(pipeline.device, ort_pipeline.device) + + +class ImageProcessorTest(unittest.TestCase): + def test_vae_image_processor_pt(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt")) + input_np = to_np(input_pt) + + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) + out_np = to_np(out) + in_np = (input_np * 255).round() if output_type == "pil" else input_np + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + + def test_vae_image_processor_np(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_np = np.stack(_create_image(height=8, width=8, input_type="np")) + for output_type in ["np", "pil"]: + out = 
image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) + out_np = to_np(out) + in_np = (input_np * 255).round() if output_type == "pil" else input_np + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + + def test_vae_image_processor_pil(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil") + + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) + for i, o in zip(input_pil, out): + in_np = np.array(i) + out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) diff --git a/tests/onnxruntime/test_stable_diffusion_pipeline.py b/tests/onnxruntime/test_stable_diffusion_pipeline.py deleted file mode 100644 index 44cd22ffec..0000000000 --- a/tests/onnxruntime/test_stable_diffusion_pipeline.py +++ /dev/null @@ -1,562 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import random -import unittest -from typing import Dict - -import numpy as np -import PIL -import pytest -import torch -from diffusers import ( - OnnxStableDiffusionImg2ImgPipeline, - StableDiffusionPipeline, - StableDiffusionXLPipeline, -) -from diffusers.utils import load_image -from diffusers.utils.testing_utils import floats_tensor -from packaging.version import Version, parse -from parameterized import parameterized -from transformers.testing_utils import require_torch_gpu -from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin - -from optimum.onnxruntime import ( - ORTLatentConsistencyModelPipeline, - ORTStableDiffusionImg2ImgPipeline, - ORTStableDiffusionInpaintPipeline, - ORTStableDiffusionPipeline, - ORTStableDiffusionXLImg2ImgPipeline, - ORTStableDiffusionXLPipeline, -) -from optimum.onnxruntime.modeling_diffusion import ( - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, -) -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor -from optimum.utils.import_utils import _diffusers_version -from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm - - -if parse(_diffusers_version) > Version("0.21.4"): - from diffusers import LatentConsistencyModelPipeline - - -def _generate_inputs(batch_size=1): - inputs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - -def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"): - if input_type == "pil": - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - elif input_type == "np": - image = np.random.rand(height, width, channel) - 
elif input_type == "pt": - image = torch.rand((channel, height, width)) - - return [image] * batch_size - - -def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - -class ORTStableDiffusionPipelineBase(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionPipeline - TASK = "text-to-image" - - @require_diffusers - def test_load_vanilla_model_which_is_not_supported(self): - with self.assertRaises(Exception) as context: - _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) - - self.assertIn( - f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_num_images_per_prompt(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_callback(self, model_arch: str): - def callback_fn(step: int, timestep: int, latents: 
np.ndarray) -> None: - callback_fn.has_been_called = True - callback_fn.number_of_steps += 1 - - pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - callback_fn.has_been_called = False - callback_fn.number_of_steps = 0 - inputs = self.generate_inputs(height=64, width=64) - pipe(**inputs, callback=callback_fn, callback_steps=1) - self.assertTrue(callback_fn.has_been_called) - self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"]) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_shape(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - height, width, batch_size = 128, 64, 1 - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - - if self.TASK == "image-to-image": - input_types = ["np", "pil", "pt"] - elif self.TASK == "text-to-image": - input_types = ["np"] - else: - input_types = ["pil"] - - for input_type in input_types: - if self.TASK == "image-to-image": - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - else: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: - inputs["output_type"] = output_type - outputs = pipeline(**inputs).images - if output_type == "pil": - self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) - elif output_type == "np": - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - else: - self.assertEqual( - outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) - - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = _generate_inputs(batch_size=batch_size) - inputs["height"] = height - inputs["width"] = width - return inputs - - -class ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline - TASK = "image-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - height, width = 128, 128 - - inputs = self.generate_inputs(height=height, width=width) - inputs["prompt"] = "A painting of a squirrel eating a burger" - inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED)) - - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - - diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - - self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1)) - - def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size=batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs - - -class ORTStableDiffusionPipelineTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionPipeline - TASK = "text-to-image" - 
- @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - pipeline.safety_checker = None - batch_size, num_images_per_prompt, height, width = 1, 2, 64, 32 - - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": "sailing ship in storm by Leonardo da Vinci", - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - self.assertIsInstance(ort_outputs, np.ndarray) - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_image_reproducibility(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - height, width = 64, 32 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - # Compare model outputs - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_negative_prompt(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - inputs["height"], inputs["width"] = 64, 32 - negative_prompt = ["This is a negative prompt"] - np.random.seed(0) - image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1] - prompt = inputs.pop("prompt") - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = pipeline.tokenizer( - p, - padding="max_length", - max_length=pipeline.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32)) - embeds.append(pipeline.text_encoder(text_inputs)[0]) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - np.random.seed(0) - image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1] - self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4)) - - -class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion-xl", - ] - ORTMODEL_CLASS = ORTStableDiffusionXLPipeline - 
TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ort_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_image_reproducibility(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - height, width = 64, 32 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - -class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline - TASK = "inpainting" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch]) - height, width = 64, 64 - latents_shape = ( - 1, - ort_pipeline.vae_decoder.config["latent_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, - ) - inputs = self.generate_inputs(height=height, width=width) - - np_latents = np.random.rand(*latents_shape).astype(np.float32) - torch_latents = torch.from_numpy(np_latents) - - ort_outputs = ort_pipeline(**inputs, latents=np_latents).images - self.assertEqual(ort_outputs.shape, (1, height, width, 3)) - - diffusers_outputs = diffusers_pipeline(**inputs, 
latents=torch_latents).images - self.assertEqual(diffusers_outputs.shape, (1, height, width, 3)) - - self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4)) - - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width) - inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - return inputs - - -class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion-xl", - ] - ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline - TASK = "image-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_inference(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - - height, width = 128, 128 - inputs = self.generate_inputs(height=height, width=width) - inputs["image"] = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] - expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080]) - - self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1)) - - def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size=batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs - - -class ImageProcessorTest(unittest.TestCase): - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt")) - input_np = to_np(input_pt) - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_create_image(height=8, width=8, input_type="np")) - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil") - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - -class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin): - 
SUPPORTED_ARCHITECTURES = [ - "latent-consistency", - ] - ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - @unittest.skipIf( - parse(_diffusers_version) <= Version("0.21.4"), - "not supported with this diffusers version, needs diffusers>=v0.22.0", - ) - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_scale": 8.5, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ort_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index bb6935461d..e77b9b7c20 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -108,7 +108,7 @@ "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", "levit": "hf-internal-testing/tiny-random-LevitModel", - "latent-consistency": "echarlaix/tiny-random-latent-consistency", + "lcm": "echarlaix/tiny-random-latent-consistency", "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", "longt5": "hf-internal-testing/tiny-random-LongT5Model", @@ -213,9 +213,16 @@ def _setup(self, model_args: Dict): continue set_seed(SEED) - onnx_model = self.ORTMODEL_CLASS.from_pretrained( - model_id, **model_args, use_io_binding=False, export=True - ) + if hasattr(self, "ORTMODEL_CLASS"): + onnx_model = self.ORTMODEL_CLASS.from_pretrained( + model_id, **model_args, use_io_binding=False, export=True + ) + elif hasattr(self, "ORTPIPELINE_CLASS"): + onnx_model = self.ORTPIPELINE_CLASS.from_pretrained( + model_id, **model_args, use_io_binding=False, export=True + ) + else: + raise ValueError("ORTMODEL_CLASS or ORTPIPELINE_CLASS must be defined") model_dir = tempfile.mkdtemp( prefix=f"{model_arch_and_params}_{self.TASK}_{model_id.replace('/', '_')}" From 3123ea5fa6c201c86cb023c6301aa00afede3e15 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 27 Aug 2024 17:46:50 +0200 Subject: [PATCH 08/71] dynamic dtype --- optimum/onnxruntime/modeling_diffusion.py | 98 
++++++++++++++--------- 1 file changed, 61 insertions(+), 37 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 982dd12334..7445f1c6ef 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -53,6 +53,7 @@ from huggingface_hub.utils import validate_hf_hub_args from transformers import CLIPFeatureExtractor, CLIPTokenizer from transformers.file_utils import add_end_docstrings +from transformers.modeling_outputs import ModelOutput import onnxruntime as ort @@ -72,9 +73,9 @@ DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) +from .io_binding import TypeHelper from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel from .utils import ( - _ORT_TO_NP_TYPE, ONNX_WEIGHTS_NAME, get_provider_for_device, parse_device, @@ -501,14 +502,23 @@ class _ORTDiffusionModelPart: CONFIG_NAME = "config.json" + _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs + _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs + def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): self.session = session self.parent_model = parent_model - self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} config_path = Path(session._model_path).parent / self.CONFIG_NAME self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - self.input_dtype = {inputs.name: _ORT_TO_NP_TYPE[inputs.type] for inputs in self.session.get_inputs()} + self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} + self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} + self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()} + + @property + def input_dtype(self): + # for backward compatibility + return {key: TypeHelper.ort_type_to_numpy_type(value) for key, value in self.input_dtypes.items()} @property def device(self): @@ -523,12 +533,16 @@ def __call__(self, *args, **kwargs): class ORTModelTextEncoder(_ORTDiffusionModelPart): - def forward(self, input_ids: np.ndarray): - onnx_inputs = { - "input_ids": input_ids, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(input_ids, torch.Tensor) + + model_inputs = {"input_ids": input_ids} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) class ORTModelUnet(_ORTDiffusionModelPart): @@ -537,45 +551,55 @@ def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): def forward( self, - sample: np.ndarray, - timestep: np.ndarray, - encoder_hidden_states: np.ndarray, - text_embeds: Optional[np.ndarray] = None, - time_ids: Optional[np.ndarray] = None, - timestep_cond: Optional[np.ndarray] = None, + sample: Union[np.ndarray, torch.Tensor], + timestep: Union[np.ndarray, torch.Tensor], + encoder_hidden_states: Union[np.ndarray, torch.Tensor], + text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, + time_ids: Optional[Union[np.ndarray, 
torch.Tensor]] = None, + timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, ): - onnx_inputs = { + use_torch = isinstance(sample, torch.Tensor) + + model_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, + "text_embeds": text_embeds, + "time_ids": time_ids, + "timestep_cond": timestep_cond, } - if text_embeds is not None: - onnx_inputs["text_embeds"] = text_embeds - if time_ids is not None: - onnx_inputs["time_ids"] = time_ids - if timestep_cond is not None: - onnx_inputs["timestep_cond"] = timestep_cond - outputs = self.session.run(None, onnx_inputs) - return outputs + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) class ORTModelVaeDecoder(_ORTDiffusionModelPart): - def forward(self, latent_sample: np.ndarray): - onnx_inputs = { - "latent_sample": latent_sample, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(latent_sample, torch.Tensor) + + model_inputs = {"latent_sample": latent_sample} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) class ORTModelVaeEncoder(_ORTDiffusionModelPart): - def forward(self, sample: np.ndarray): - onnx_inputs = { - "sample": sample, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + def forward(self, sample: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(sample, torch.Tensor) + + model_inputs = {"sample": sample} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) From 7803ef311e6efedcebf2220d8290d8216652d022 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 27 Aug 2024 17:50:36 +0200 Subject: [PATCH 09/71] support torch random numbers generator --- .../diffusers/pipeline_latent_consistency.py | 6 +- .../diffusers/pipeline_stable_diffusion.py | 16 ++++-- .../pipeline_stable_diffusion_img2img.py | 56 +++++++++++++++---- .../pipeline_stable_diffusion_inpaint.py | 22 +++++--- .../diffusers/pipeline_stable_diffusion_xl.py | 21 +++++-- .../pipeline_stable_diffusion_xl_img2img.py | 28 +++++++--- optimum/pipelines/diffusers/pipeline_utils.py | 8 +-- 7 files changed, 115 insertions(+), 42 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py index 41c85b5b6a..630d463de7 100644 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py @@ -36,7 +36,7 @@ def __call__( original_inference_steps: int = None, guidance_scale: float = 8.5, num_images_per_prompt: int = 1, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, output_type: str = "pil", @@ -66,7 +66,7 @@ def __call__( usually at the expense of lower image quality. 
num_images_per_prompt (`int`, defaults to 1): The number of images to generate per prompt. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -121,7 +121,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() prompt_embeds = self._encode_prompt( prompt, diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 98bff0de44..6cc47fab1b 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -189,7 +189,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - latents = generator.randn(*shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*shape).astype(dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) elif latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") @@ -209,7 +217,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -248,7 +256,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -303,7 +311,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index 81a6ffa1e0..f7f0586ac9 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -16,7 +16,7 @@ from typing import Callable, List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import deprecate @@ -72,6 +72,43 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + else: + init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = np.concatenate([init_latents], axis=0) + + # add noise to latents using the timesteps + if isinstance(generator, np.random.RandomState): + noise = generator.randn(*init_latents.shape).astype(dtype) + elif isinstance(generator, torch.Generator): + noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) + + init_latents = self.scheduler.add_noise( + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) + ).numpy() + + return init_latents + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__ def __call__( self, @@ -83,7 +120,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, output_type: str = "pil", @@ -125,7 +162,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not @@ -168,7 +205,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -225,12 +262,8 @@ def __call__( timesteps = self.scheduler.timesteps.numpy()[-init_timestep] timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) - # add noise to latents using the timesteps - noise = generator.randn(*init_latents.shape).astype(latents_dtype) - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ) - init_latents = init_latents.numpy() + # 5. Prepare latent variables + latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. @@ -276,7 +309,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py index 19de793ccd..cb3c7db96e 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py @@ -16,7 +16,7 @@ from typing import Callable, List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import PIL_INTERPOLATION @@ -108,7 +108,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -200,7 +200,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -229,11 +229,19 @@ def __call__( width // self.vae_scale_factor, ) latents_dtype = prompt_embeds.dtype + if latents is None: - latents = generator.randn(*latents_shape).astype(latents_dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*latents_shape).astype(latents_dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." 
+ ) + elif latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # prepare mask and masked_image mask, masked_image = prepare_mask_and_masked_image( diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 2a5e7bf78b..3c210862ac 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -235,7 +235,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - latents = generator.randn(*shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*shape).astype(dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) elif latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") @@ -270,7 +278,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -315,7 +323,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -383,7 +391,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -410,6 +418,7 @@ def __call__( # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps + print("timesteps", timesteps) # 5. Prepare latent variables latents = self.prepare_latents( @@ -440,6 +449,7 @@ def __call__( timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance @@ -475,7 +485,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py index a07903a735..19988599b6 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py @@ -17,7 +17,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput @@ -222,7 +222,7 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): + def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): batch_size = batch_size * num_images_per_prompt if image.shape[1] == 4: @@ -242,11 +242,22 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt init_latents = np.concatenate([init_latents], axis=0) # add noise to latents using the timesteps - noise = generator.randn(*init_latents.shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + noise = generator.randn(*init_latents.shape).astype(dtype) + elif isinstance(generator, torch.Generator): + noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) + init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timestep) + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) ) - return init_latents.numpy() + init_latents = init_latents.numpy() + + return init_latents def _get_add_time_ids( self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype @@ -274,7 +285,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -375,7 +386,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` @@ -482,7 +493,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py index 869b91ffe5..e9d5986b61 100644 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ b/optimum/pipelines/diffusers/pipeline_utils.py @@ -17,7 +17,7 @@ from typing import List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers import ConfigMixin from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor @@ -206,7 +206,7 @@ def postprocess( def get_height_width( self, - image: [PIL.Image.Image, np.ndarray], + image: Union[PIL.Image.Image, np.ndarray], height: Optional[int] = None, width: Optional[int] = None, ): @@ -264,10 +264,10 @@ def reshape(images: np.ndarray) -> np.ndarray: # TODO : remove after diffusers v0.21.0 release def resize( self, - image: [PIL.Image.Image, np.ndarray, torch.Tensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], height: Optional[int] = None, width: Optional[int] = None, - ) -> [PIL.Image.Image, np.ndarray, torch.Tensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Resize image. """ From aa41f422cec94979f7ec8e330a6076640d331edf Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 27 Aug 2024 17:50:57 +0200 Subject: [PATCH 10/71] compact diffusion testing suite --- tests/onnxruntime/test_diffusion.py | 818 ++++++++++--------- tests/onnxruntime/utils_onnxruntime_tests.py | 13 +- 2 files changed, 434 insertions(+), 397 deletions(-) diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 2d5ab7a7f8..1840725299 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -12,9 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
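A minimal usage sketch of the generator handling introduced in PATCH 09 above, assuming a stable-diffusion checkpoint already exported to ONNX under the placeholder path "./stable_diffusion_onnx" (the path and variable names are illustrative, not part of the diffs): the ONNX Runtime pipelines now accept either a NumPy RandomState or a torch.Generator, and re-seeding either one reproduces the same image.

import numpy as np
import torch

from optimum.onnxruntime import ORTStableDiffusionPipeline

# placeholder path to an already exported ONNX stable-diffusion model
pipeline = ORTStableDiffusionPipeline.from_pretrained("./stable_diffusion_onnx")

prompt = "sailing ship in storm by Leonardo da Vinci"

# the previously supported NumPy generator
image_np = pipeline(prompt=prompt, num_inference_steps=3, generator=np.random.RandomState(0)).images[0]

# torch.Generator is accepted after PATCH 09; seeding it the same way again yields the same image
image_pt = pipeline(prompt=prompt, num_inference_steps=3, generator=torch.Generator().manual_seed(0)).images[0]
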
-import random import unittest -from typing import Dict import numpy as np import PIL @@ -24,12 +22,8 @@ AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, - StableDiffusionPipeline, - StableDiffusionXLPipeline, ) from diffusers.utils import load_image -from diffusers.utils.testing_utils import floats_tensor -from packaging.version import Version, parse from parameterized import parameterized from transformers.testing_utils import require_torch_gpu from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin @@ -45,22 +39,20 @@ ORTStableDiffusionXLImg2ImgPipeline, ORTStableDiffusionXLPipeline, ) -from optimum.onnxruntime.modeling_diffusion import ( - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, -) from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor -from optimum.utils.import_utils import _diffusers_version from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm -if parse(_diffusers_version) > Version("0.21.4"): - from diffusers import LatentConsistencyModelPipeline +def get_generator(generator_framework, seed): + if generator_framework == "np": + return np.random.RandomState(seed) + elif generator_framework == "pt": + return torch.Generator().manual_seed(seed) + else: + raise ValueError(f"Unknown generator_framework: {generator_framework}") -def _generate_inputs(batch_size=1): +def _generate_prompts(batch_size=1): inputs = { "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, "num_inference_steps": 3, @@ -70,7 +62,7 @@ def _generate_inputs(batch_size=1): return inputs -def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"): +def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type="pil"): if input_type == "pil": image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" @@ -94,16 +86,24 @@ def to_np(image): class ORTPipelineForText2ImageTest(ORTModelTestMixin): ARCHITECTURE_TO_ORTMODEL_CLASS = { + "lcm": ORTLatentConsistencyModelPipeline, "stable-diffusion": ORTStableDiffusionPipeline, "stable-diffusion-xl": ORTStableDiffusionXLPipeline, - "lcm": ORTLatentConsistencyModelPipeline, } - AUTOMODEL_CLASS = AutoPipelineForText2Image ORTMODEL_CLASS = ORTPipelineForText2Image + AUTOMODEL_CLASS = AutoPipelineForText2Image TASK = "text-to-image" + def generate_inputs(self, height=128, width=128, batch_size=1): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["height"] = height + inputs["width"] = width + + return inputs + @require_diffusers def test_load_vanilla_model_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -131,12 +131,41 @@ def test_num_images_per_prompt(self, model_arch: str): self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) self.assertEqual(pipeline.unet.config["in_channels"], 4) - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + for num_images in [1, 3]: + outputs = pipeline(**inputs, num_images_per_prompt=num_images).images + self.assertEqual(outputs.shape, (batch_size * num_images, 
height, width, 3)) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_compare_to_diffusers_pipeline(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 128, 128, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + if model_arch == "lcm": + # LCM doesn't support deterministic outputs beyond the first inference step + # TODO: Investigate why this is the case + inputs["num_inference_steps"] = 1 + + for output_type in ["latent", "np"]: + inputs["output_type"] = output_type + + ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + + self.assertTrue( + np.allclose(ort_output, diffusers_output, atol=1e-4), + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), + ) + self.assertEqual(ort_pipeline.device, diffusers_pipeline.device) @parameterized.expand( grid_parameters( @@ -172,7 +201,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 + height, width, batch_size = 64, 32, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs).images # Verify model devices @@ -186,19 +215,32 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: - callback_fn.has_been_called = True - callback_fn.number_of_steps += 1 + height, width, batch_size = 64, 128, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + class Callback: + def __init__(self): + self.has_been_called = False + self.number_of_steps = 0 + + def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + self.has_been_called = True + self.number_of_steps += 1 - callback_fn.has_been_called = False - callback_fn.number_of_steps = 0 + ort_callback = Callback() + auto_callback = Callback() - inputs = self.generate_inputs(height=64, width=64) - pipeline(**inputs, callback=callback_fn, callback_steps=1) - self.assertTrue(callback_fn.has_been_called) - self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"]) + ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + # callback_steps=1 to trigger callback every step + ort_pipe(**inputs, callback=ort_callback, callback_steps=1) + auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + self.assertTrue(ort_callback.has_been_called) + self.assertTrue(auto_callback.has_been_called) + self.assertEqual(auto_callback.number_of_steps, ort_callback.number_of_steps) 
@parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) @require_diffusers @@ -222,55 +264,74 @@ def test_shape(self, model_arch: str): (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), ) - @parameterized.expand(["stable-diffusion", "stable-diffusion-xl"]) + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) @require_diffusers def test_image_reproducibility(self, model_arch: str): + if model_arch in ["lcm"]: + pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") + model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - inputs = _generate_inputs() - height, width = 64, 64 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - @parameterized.expand(["stable-diffusion"]) + + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) def test_negative_prompt(self, model_arch: str): + if model_arch in ["lcm"]: + pytest.skip("LCM (Latent Consistency Model) does not support negative prompts") + model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + negative_prompt = ["This is a negative prompt"] - np.random.seed(0) - image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1] + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + image_slice_1 = pipeline( + **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED) + ).images[0, -3:, -3:, -1] prompt = inputs.pop("prompt") - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = pipeline.tokenizer( - p, - padding="max_length", + + if model_arch == "stable-diffusion-xl": + ( + inputs["prompt_embeds"], + inputs["negative_prompt_embeds"], + inputs["pooled_prompt_embeds"], + inputs["negative_pooled_prompt_embeds"], + ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt) + else: + text_ids = pipeline.tokenizer( + prompt, max_length=pipeline.tokenizer.model_max_length, + padding="max_length", + return_tensors="np", truncation=True, + ).input_ids + negative_text_ids = pipeline.tokenizer( + negative_prompt, + max_length=pipeline.tokenizer.model_max_length, + padding="max_length", return_tensors="np", - ) - text_inputs = 
text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32)) - embeds.append(pipeline.text_encoder(text_inputs)[0]) + truncation=True, + ).input_ids + inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] + inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - np.random.seed(0) - image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1] - self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4)) + image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = _generate_inputs(batch_size=batch_size) - inputs["height"] = height - inputs["width"] = width - return inputs + self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) class ORTPipelineForImage2ImageTest(ORTModelTestMixin): @@ -283,6 +344,19 @@ class ORTPipelineForImage2ImageTest(ORTModelTestMixin): TASK = "image-to-image" + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="np"): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["image"] = _generate_images( + height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type + ) + + inputs["strength"] = 0.75 + inputs["height"] = height + inputs["width"] = width + + return inputs + @require_diffusers def test_load_vanilla_model_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -297,6 +371,7 @@ def test_load_vanilla_model_which_is_not_supported(self): def test_num_images_per_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) self.assertEqual(pipeline.vae_scale_factor, 2) self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) @@ -320,9 +395,11 @@ def test_num_images_per_prompt(self, model_arch: str): def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 32, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) outputs = pipeline(**inputs).images # Verify model devices self.assertEqual(pipeline.device.type.lower(), "cuda") @@ -342,9 +419,11 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 32, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) outputs = pipeline(**inputs).images # Verify model devices self.assertEqual(pipeline.device.type.lower(), "cuda") @@ -355,26 +434,47 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st 
     @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     @require_diffusers
     def test_callback(self, model_arch: str):
-        def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
-            callback_fn.has_been_called = True
-            callback_fn.number_of_steps += 1
-
-        pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        callback_fn.has_been_called = False
-        callback_fn.number_of_steps = 0
-        inputs = self.generate_inputs(height=64, width=64)
-        pipe(**inputs, callback=callback_fn, callback_steps=1)
-        self.assertTrue(callback_fn.has_been_called)
-        self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"])
+        if model_arch in ["stable-diffusion"]:
+            pytest.skip(
+                "Stable Diffusion for Img2Img doesn't behave as expected with callbacks (the callback is not called every step with callback_steps=1)"
+            )
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["num_inference_steps"] = 3
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
 
     @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     @require_diffusers
     def test_shape(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
-        height, width, batch_size = 128, 64, 1
+
         pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        height, width, batch_size = 32, 64, 1
 
         for input_type in ["np", "pil", "pt"]:
             inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
@@ -392,315 +492,259 @@ def test_shape(self, model_arch: str):
                 (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
             )
 
-    def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
-        inputs = _generate_inputs(batch_size=batch_size)
-        inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
-        inputs["strength"] = 0.75
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        pytest.skip("Img2Img models do not support output reproducibility for some reason")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+        self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2))
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        pytest.skip("Img2Img models do not support output reproducibility for some reason")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        for generator_framework in ["np", "pt"]:
+            ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
+
+            self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+            self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
+
+
+class ORTPipelineForInpaintingTest(ORTModelTestMixin):
+    ARCHITECTURE_TO_ORTMODEL_CLASS = {
+        "stable-diffusion": ORTStableDiffusionInpaintPipeline,
+    }
+
+    AUTOMODEL_CLASS = AutoPipelineForInpainting
+    ORTMODEL_CLASS = ORTPipelineForInpainting
+
+    TASK = "inpainting"
+
+    def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+        assert batch_size == 1, "Inpainting models only support batch_size=1"
+        assert input_type == "pil", "Inpainting models only support input_type='pil'"
+
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["image"] = _generate_images(
+            height=height, width=width, batch_size=1, channel=channel, input_type="pil"
+        )[0]
+        inputs["mask_image"] = _generate_images(
+            height=height, width=width, batch_size=1, channel=channel, input_type="pil"
+        )[0]
+
+        inputs["height"] = height
+        inputs["width"] = width
+
         return inputs
 
-    # @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
-    # @require_diffusers
-    # def test_shape(self, model_arch: str):
-    #     model_args = {"test_name": model_arch, "model_arch": model_arch}
-    #     self._setup(model_args)
-    #     height, width, batch_size = 128, 64, 1
-    #     pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-
-    #     if self.TASK == "image-to-image":
-    #         input_types = ["np", "pil", "pt"]
-    #     elif self.TASK == "text-to-image":
-    #         input_types = ["np"]
-    #     else:
-    #         input_types = ["pil"]
-
-    #     for input_type in input_types:
-    #         if self.TASK == "image-to-image":
-    #             inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
-    #         else:
-    #             inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-    #         for output_type in ["np", "pil", "latent"]:
-    #             inputs["output_type"] = output_type
-    #             outputs = pipeline(**inputs).images
-    #             if output_type == "pil":
-    #                 self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
-    #             elif output_type == "np":
-    #                 self.assertEqual(outputs.shape, (batch_size, height, width, 3))
-    #             else:
-    #                 self.assertEqual(
-    #                     outputs.shape,
-    #                     (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
-    #                 )
-
-
-# class 
ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase): -# SUPPORTED_ARCHITECTURES = ["stable-diffusion"] -# ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline -# TASK = "image-to-image" - -# @parameterized.expand(SUPPORTED_ARCHITECTURES) -# @require_diffusers -# def test_compare_diffusers_pipeline(self, model_arch: str): -# model_args = {"test_name": model_arch, "model_arch": model_arch} -# self._setup(model_args) -# height, width = 128, 128 - -# inputs = self.generate_inputs(height=height, width=width) -# inputs["prompt"] = "A painting of a squirrel eating a burger" -# inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED)) - -# ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) -# ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - -# diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) -# diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - -# self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1)) - - -# class ORTStableDiffusionPipelineTest(unittest.TestCase): -# SUPPORTED_ARCHITECTURES = [ -# "stable-diffusion", -# ] -# ORTMODEL_CLASS = ORTStableDiffusionPipeline -# TASK = "text-to-image" - - -# @parameterized.expand(SUPPORTED_ARCHITECTURES) -# @require_diffusers -# def test_image_reproducibility(self, model_arch: str): -# pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) -# inputs = _generate_inputs() -# height, width = 64, 32 -# np.random.seed(0) -# ort_outputs_1 = pipeline(**inputs, height=height, width=width) -# np.random.seed(0) -# ort_outputs_2 = pipeline(**inputs, height=height, width=width) -# ort_outputs_3 = pipeline(**inputs, height=height, width=width) -# # Compare model outputs -# self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) -# self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - -# @parameterized.expand(SUPPORTED_ARCHITECTURES) -# def test_negative_prompt(self, model_arch: str): -# pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) -# inputs = _generate_inputs() -# inputs["height"], inputs["width"] = 64, 32 -# negative_prompt = ["This is a negative prompt"] -# np.random.seed(0) -# image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1] -# prompt = inputs.pop("prompt") -# embeds = [] -# for p in [prompt, negative_prompt]: -# text_inputs = pipeline.tokenizer( -# p, -# padding="max_length", -# max_length=pipeline.tokenizer.model_max_length, -# truncation=True, -# return_tensors="np", -# ) -# text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32)) -# embeds.append(pipeline.text_encoder(text_inputs)[0]) - -# inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds -# np.random.seed(0) -# image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1] -# self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4)) - - -# class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin): -# SUPPORTED_ARCHITECTURES = [ -# "stable-diffusion-xl", -# ] -# ORTMODEL_CLASS = ORTStableDiffusionXLPipeline -# TASK = "text-to-image" - -# @parameterized.expand(SUPPORTED_ARCHITECTURES) -# @require_diffusers -# def test_compare_to_diffusers(self, model_arch: str): -# ort_pipeline = 
self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) -# self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) -# self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder) -# self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) -# self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) -# self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) -# self.assertIsInstance(ort_pipeline.config, Dict) - -# pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch]) -# batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 -# latents = ort_pipeline.prepare_latents( -# batch_size * num_images_per_prompt, -# ort_pipeline.unet.config["in_channels"], -# height, -# width, -# dtype=np.float32, -# generator=np.random.RandomState(0), -# ) - -# kwargs = { -# "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, -# "num_inference_steps": 1, -# "num_images_per_prompt": num_images_per_prompt, -# "height": height, -# "width": width, -# "guidance_rescale": 0.1, -# } - -# for output_type in ["latent", "np"]: -# ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images -# self.assertIsInstance(ort_outputs, np.ndarray) -# with torch.no_grad(): -# outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - -# # Compare model outputs -# self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) -# # Compare model devices -# self.assertEqual(pipeline.device, ort_pipeline.device) - -# @parameterized.expand(SUPPORTED_ARCHITECTURES) -# @require_diffusers -# def test_image_reproducibility(self, model_arch: str): -# pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) -# inputs = _generate_inputs() -# height, width = 64, 32 -# np.random.seed(0) -# ort_outputs_1 = pipeline(**inputs, height=height, width=width) -# np.random.seed(0) -# ort_outputs_2 = pipeline(**inputs, height=height, width=width) -# ort_outputs_3 = pipeline(**inputs, height=height, width=width) -# self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) -# self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - -# class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase): -# SUPPORTED_ARCHITECTURES = [ -# "stable-diffusion", -# ] -# ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline -# TASK = "inpainting" - -# @parameterized.expand(SUPPORTED_ARCHITECTURES) -# @require_diffusers -# def test_compare_diffusers_pipeline(self, model_arch: str): -# model_args = {"test_name": model_arch, "model_arch": model_arch} -# self._setup(model_args) -# ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) -# diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch]) -# height, width = 64, 64 -# latents_shape = ( -# 1, -# ort_pipeline.vae_decoder.config["latent_channels"], -# height // ort_pipeline.vae_scale_factor, -# width // ort_pipeline.vae_scale_factor, -# ) -# inputs = self.generate_inputs(height=height, width=width) - -# np_latents = np.random.rand(*latents_shape).astype(np.float32) -# torch_latents = torch.from_numpy(np_latents) - -# ort_outputs = ort_pipeline(**inputs, latents=np_latents).images -# self.assertEqual(ort_outputs.shape, (1, height, width, 3)) - -# diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images -# self.assertEqual(diffusers_outputs.shape, (1, height, width, 3)) - 
-# self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4)) - -# def generate_inputs(self, height=128, width=128, batch_size=1): -# inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width) -# inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] -# inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] -# return inputs - - -# class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin): -# SUPPORTED_ARCHITECTURES = [ -# "stable-diffusion-xl", -# ] -# ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline -# TASK = "image-to-image" - -# @parameterized.expand(SUPPORTED_ARCHITECTURES) -# @require_diffusers -# def test_inference(self, model_arch: str): -# model_args = {"test_name": model_arch, "model_arch": model_arch} -# self._setup(model_args) -# pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - -# height, width = 128, 128 -# inputs = self.generate_inputs(height=height, width=width) -# inputs["image"] = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" -# "/in_paint/overture-creations-5sI6fQgYIuo.png" -# ).resize((width, height)) -# output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] -# expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080]) - -# self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1)) - -# def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): -# inputs = _generate_inputs(batch_size=batch_size) -# inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) -# inputs["strength"] = 0.75 -# return inputs - - -# class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin): -# SUPPORTED_ARCHITECTURES = [ -# "latent-consistency", -# ] -# ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline -# TASK = "text-to-image" - -# @parameterized.expand(SUPPORTED_ARCHITECTURES) -# @require_diffusers -# @unittest.skipIf( -# parse(_diffusers_version) <= Version("0.21.4"), -# "not supported with this diffusers version, needs diffusers>=v0.22.0", -# ) -# def test_compare_to_diffusers(self, model_arch: str): -# ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) -# self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) -# self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) -# self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) -# self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) -# self.assertIsInstance(ort_pipeline.config, Dict) - -# pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch]) -# batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 -# latents = ort_pipeline.prepare_latents( -# batch_size * num_images_per_prompt, -# ort_pipeline.unet.config["in_channels"], -# height, -# width, -# dtype=np.float32, -# generator=np.random.RandomState(0), -# ) - -# kwargs = { -# "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, -# "num_inference_steps": 1, -# "num_images_per_prompt": num_images_per_prompt, -# "height": height, -# "width": width, -# "guidance_scale": 8.5, -# } - -# for output_type in ["latent", "np"]: -# ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images -# self.assertIsInstance(ort_outputs, np.ndarray) -# with torch.no_grad(): 
-#                 outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-
-#         # Compare model outputs
-#         self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4))
-#         # Compare model devices
-#         self.assertEqual(pipeline.device, ort_pipeline.device)
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
+        with self.assertRaises(Exception) as context:
+            _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(
+            f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
+        )
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        batch_size, height = 1, 32
+        for width in [64, 32]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+            for num_images in [1, 3]:
+                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters(
+            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]}
+        )
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters(
+            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]}
+        )
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, 
batch_size=batch_size) + inputs["num_inference_steps"] = 3 + + class Callback: + def __init__(self): + self.has_been_called = False + self.number_of_steps = 0 + + def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + self.has_been_called = True + self.number_of_steps += 1 + + ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + ort_callback = Callback() + auto_callback = Callback() + # callback_steps=1 to trigger callback every step + ort_pipe(**inputs, callback=ort_callback, callback_steps=1) + auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + self.assertTrue(ort_callback.has_been_called) + self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_shape(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + height, width, batch_size = 32, 64, 1 + + for input_type in ["pil"]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) + + for output_type in ["np", "pil", "latent"]: + inputs["output_type"] = output_type + outputs = pipeline(**inputs).images + if output_type == "pil": + self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) + elif output_type == "np": + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + else: + self.assertEqual( + outputs.shape, + (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_compare_to_diffusers_pipeline(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + latents_shape = ( + batch_size, + ort_pipeline.vae_decoder.config["latent_channels"], + height // ort_pipeline.vae_scale_factor, + width // ort_pipeline.vae_scale_factor, + ) + + np_latents = np.random.rand(*latents_shape).astype(np.float32) + torch_latents = torch.from_numpy(np_latents) + + ort_output = ort_pipeline(**inputs, latents=np_latents).images + diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images + + self.assertTrue( + np.allclose(ort_output, diffusers_output, atol=1e-4), + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), + ) + + @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = 
pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) class ImageProcessorTest(unittest.TestCase): def test_vae_image_processor_pt(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt")) + input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt")) input_np = to_np(input_pt) for output_type in ["np", "pil"]: @@ -711,7 +755,7 @@ def test_vae_image_processor_pt(self): def test_vae_image_processor_np(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_create_image(height=8, width=8, input_type="np")) + input_np = np.stack(_generate_images(height=8, width=8, input_type="np")) for output_type in ["np", "pil"]: out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) out_np = to_np(out) @@ -720,7 +764,7 @@ def test_vae_image_processor_np(self): def test_vae_image_processor_pil(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil") + input_pil = _generate_images(height=8, width=8, batch_size=1, input_type="pil") for output_type in ["np", "pil"]: out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index e77b9b7c20..aa06476498 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -213,16 +213,9 @@ def _setup(self, model_args: Dict): continue set_seed(SEED) - if hasattr(self, "ORTMODEL_CLASS"): - onnx_model = self.ORTMODEL_CLASS.from_pretrained( - model_id, **model_args, use_io_binding=False, export=True - ) - elif hasattr(self, "ORTPIPELINE_CLASS"): - onnx_model = self.ORTPIPELINE_CLASS.from_pretrained( - model_id, **model_args, use_io_binding=False, export=True - ) - else: - raise ValueError("ORTMODEL_CLASS or ORTPIPELINE_CLASS must be defined") + onnx_model = self.ORTMODEL_CLASS.from_pretrained( + model_id, **model_args, use_io_binding=False, export=True + ) model_dir = tempfile.mkdtemp( prefix=f"{model_arch_and_params}_{self.TASK}_{model_id.replace('/', '_')}" From 4837828102b2cbd876af9c9aef6f44a8d0651d5b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 27 Aug 2024 19:02:00 +0200 Subject: [PATCH 11/71] fix --- tests/onnxruntime/test_diffusion.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 1840725299..a8b82dd7c4 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -352,8 +352,6 @@ def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_ ) inputs["strength"] = 0.75 - inputs["height"] = height - inputs["width"] = width return inputs @@ -694,6 +692,11 @@ def test_shape(self, model_arch: str): @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): + if model_arch in ["stable-diffusion"]: + pytest.skip( + "Stable 
Diffusion For Inpainting fails, it was used to be compared to StableDiffusionPipeline for some reason which is the text-to-image variant" + ) + model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) From 80532b3bad2e6b82b2f057672ec339cc18ab35ac Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sat, 7 Sep 2024 17:06:56 +0200 Subject: [PATCH 12/71] test --- optimum/onnxruntime/base.py | 12 +- optimum/onnxruntime/modeling_diffusion.py | 214 +++++++++++----------- 2 files changed, 107 insertions(+), 119 deletions(-) diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index d9877670ba..5206edfc08 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -22,7 +22,6 @@ from onnxruntime import InferenceSession -from ..utils import NormalizedConfigManager from ..utils.logging import warn_once from .io_binding import TypeHelper from .modeling_ort import ORTModel @@ -41,17 +40,10 @@ class ORTModelPart: _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs - def __init__( - self, - session: InferenceSession, - parent_model: "ORTModel", - ): + def __init__(self, session: InferenceSession, parent_model: "ORTModel"): self.session = session self.parent_model = parent_model - self.normalized_config = NormalizedConfigManager.get_normalized_config_class( - self.parent_model.config.model_type - )(self.parent_model.config) - self.main_input_name = self.parent_model.main_input_name + self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 7e998b4a89..606919ea7f 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -17,7 +17,6 @@ import os import shutil import warnings -from abc import abstractmethod from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory @@ -41,11 +40,6 @@ StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline, ) -from diffusers.pipelines.auto_pipeline import ( - AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, - AUTO_INPAINT_PIPELINES_MAPPING, - AUTO_TEXT2IMAGE_PIPELINES_MAPPING, -) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available from huggingface_hub import snapshot_download @@ -73,6 +67,7 @@ DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) +from .base import ORTModelPart from .io_binding import TypeHelper from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel from .utils import ( @@ -86,25 +81,25 @@ logger = logging.getLogger(__name__) -class ORTDiffusionPipeline(ORTModel): - auto_model_class = DiffusionPipeline - main_input_name = "prompt" - base_model_prefix = "onnx_model" +class ORTPipeline(ORTModel): + auto_model_class = None + model_type = "onnx_pipeline" + config_name = "model_index.json" sub_component_config_name = "config.json" - # TODO: instead of having a bloated init, we should probably have an init per pipeline, - # so that we can easily add new pipelines without having to modify the base class + main_input_name = "prompt" + def __init__( self, vae_decoder_session: ort.InferenceSession, - 
text_encoder_session: ort.InferenceSession, unet_session: ort.InferenceSession, - config: Dict[str, Any], tokenizer: CLIPTokenizer, + config: Dict[str, Any], scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], feature_extractor: Optional[CLIPFeatureExtractor] = None, vae_encoder_session: Optional[ort.InferenceSession] = None, + text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, tokenizer_2: Optional[CLIPTokenizer] = None, use_io_binding: Optional[bool] = None, @@ -113,23 +108,28 @@ def __init__( """ Args: vae_decoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the VAE decoder. - text_encoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the text encoder. + The ONNX Runtime inference session associated to the VAE decoder unet_session (`ort.InferenceSession`): The ONNX Runtime inference session associated to the U-NET. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + for the text encoder. config (`Dict[str, Any]`): A config dictionary from which the model components will be instantiated. Make sure to only load configuration files of compatible classes. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`): A scheduler to be used in combination with the U-NET component to denoise the encoded image latents. feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`): A model extracting features from generated images to be used as inputs for the `safety_checker` vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): The ONNX Runtime inference session associated to the VAE encoder. + text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): + The ONNX Runtime inference session associated to the text encoder. + tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + for the second text encoder. use_io_binding (`Optional[bool]`, defaults to `None`): Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to `True` if the device is CUDA, otherwise defaults to `False`. @@ -137,7 +137,7 @@ def __init__( The directory under which the model exported to ONNX was saved. """ self.shared_attributes_init( - vae_decoder_session, + model=vae_decoder_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, ) @@ -418,7 +418,7 @@ def _from_transformers( provider_options: Optional[Dict[str, Any]] = None, use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTDiffusionPipeline": + ) -> "ORTPipeline": if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", @@ -499,46 +499,27 @@ def _save_config(self, save_directory): self.save_config(save_directory) -# TODO : Use ORTModelPart once IOBinding support is added -class _ORTDiffusionModelPart: - """ - For multi-file ONNX models, represents a part of the model. 
- It has its own `onnxruntime.InferenceSession`, and can perform a forward pass. - """ - +class ORTPipelinePart(ORTModelPart): CONFIG_NAME = "config.json" - _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs - _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs - - def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): - self.session = session - self.parent_model = parent_model + def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): config_path = Path(session._model_path).parent / self.CONFIG_NAME - self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} - self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} - self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()} - @property - def input_dtype(self): - # for backward compatibility - return {key: TypeHelper.ort_type_to_numpy_type(value) for key, value in self.input_dtypes.items()} - - @property - def device(self): - return self.parent_model.device + if config_path.is_file(): + # TODO: use FrozenDict + self.config = parent_model._dict_from_json_file(config_path) + else: + self.config = {} - @abstractmethod - def forward(self, *args, **kwargs): - pass + super().__init__(session, parent_model) - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) + @property + def input_dtype(self): + # for backward compatibility and diffusion mixins (will be standardized in the future) + return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} -class ORTModelTextEncoder(_ORTDiffusionModelPart): +class ORTModelTextEncoder(ORTPipelinePart): def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): use_torch = isinstance(input_ids, torch.Tensor) @@ -551,10 +532,7 @@ def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): return ModelOutput(**model_outputs) -class ORTModelUnet(_ORTDiffusionModelPart): - def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): - super().__init__(session, parent_model) - +class ORTModelUnet(ORTPipelinePart): def forward( self, sample: Union[np.ndarray, torch.Tensor], @@ -582,7 +560,7 @@ def forward( return ModelOutput(**model_outputs) -class ORTModelVaeDecoder(_ORTDiffusionModelPart): +class ORTModelVaeDecoder(ORTPipelinePart): def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): use_torch = isinstance(latent_sample, torch.Tensor) @@ -595,7 +573,7 @@ def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): return ModelOutput(**model_outputs) -class ORTModelVaeEncoder(_ORTDiffusionModelPart): +class ORTModelVaeEncoder(ORTPipelinePart): def forward(self, sample: Union[np.ndarray, torch.Tensor]): use_torch = isinstance(sample, torch.Tensor) @@ -609,7 +587,7 @@ def forward(self, sample: Union[np.ndarray, torch.Tensor]): @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to 
[diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ @@ -620,7 +598,7 @@ class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMi @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ @@ -631,7 +609,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ @@ -642,7 +620,7 @@ class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInp @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). 
""" @@ -652,7 +630,7 @@ class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyP __call__ = LatentConsistencyPipelineMixin.__call__ -class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline): +class ORTStableDiffusionXLPipelineBase(ORTPipeline): def __init__( self, vae_decoder_session: ort.InferenceSession, @@ -721,6 +699,48 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ +SUPPORTED_ORT_PIPELINES = [ + ORTStableDiffusionPipeline, + ORTStableDiffusionImg2ImgPipeline, + ORTStableDiffusionInpaintPipeline, + ORTLatentConsistencyModelPipeline, + ORTStableDiffusionXLPipeline, + ORTStableDiffusionXLImg2ImgPipeline, +] + + +def _get_pipeline_class(class_name: str, throw_error_if_not_exist: bool = True): + for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: + if ort_pipeline_class.auto_model_class.__name__ == class_name: + return ort_pipeline_class + + if throw_error_if_not_exist: + raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {class_name}") + + +class ORTDiffusionPipeline(ConfigMixin): + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] + + ort_pipeline_class = _get_pipeline_class(class_name) + + return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) + + ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ ("lcm", ORTLatentConsistencyModelPipeline), @@ -742,49 +762,38 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ] ) -SUPPORTED_TASKS_MAPPINGS = [ +SUPPORTED_ORT_PIPELINES_MAPPINGS = [ ORT_TEXT2IMAGE_PIPELINES_MAPPING, ORT_IMAGE2IMAGE_PIPELINES_MAPPING, ORT_INPAINT_PIPELINES_MAPPING, ] -def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True): - def get_model(pipeline_class_name): - for task_mapping in SUPPORTED_TASKS_MAPPINGS: - for model_name, pipeline in task_mapping.items(): +def _get_task_class(mapping, pipeline_class_name): + def _get_model_name(pipeline_class_name): + for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS: + for model_name, ort_pipeline in ort_pipelines_mapping.items(): if ( - pipeline.__name__ == pipeline_class_name - or pipeline.auto_model_class.__name__ == pipeline_class_name + ort_pipeline.__name__ == pipeline_class_name + or ort_pipeline.auto_model_class.__name__ == pipeline_class_name ): return model_name - model_name = get_model(pipeline_class_name) + model_name = _get_model_name(pipeline_class_name) if model_name is not None: task_class = mapping.get(model_name, None) if task_class is not None: return task_class - if throw_error_if_not_exist: - raise ValueError(f"AutoPipeline can't find a pipeline linked to {pipeline_class_name} for {model_name}") + raise ValueError(f"ORTPipelineForTask can't find a pipeline linked to {pipeline_class_name} for {model_name}") -class ORTPipelineBase(ConfigMixin): - config_name = "model_index.json" - - 
ort_pipeline_mapping = None - auto_pipeline_mapping = None - - def __init__(self, *args, **kwargs): - raise EnvironmentError( - f"{self.__class__.__name__} is designed to be instantiated " - f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " - f"`{self.__class__.__name__}.from_pipe(pipeline)` methods." - ) +class ORTPipelineForTask(ConfigMixin): + auto_model_class = None + ort_pipelines_mapping = None @classmethod - @validate_hf_hub_args def from_pretrained(cls, pretrained_model_or_path, **kwargs): load_config_kwargs = { "force_download": kwargs.get("force_download", False), @@ -795,38 +804,25 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): "proxies": kwargs.get("proxies", None), "token": kwargs.get("token", None), } - config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) config = config[0] if isinstance(config, tuple) else config class_name = config["_class_name"] - ort_pipeline_cls = _get_task_class(cls.ort_pipeline_mapping, class_name) + ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name) - return ort_pipeline_cls.from_pretrained(pretrained_model_or_path, **kwargs) - - @classmethod - def from_pipe(cls, **kwargs): - raise NotImplementedError( - f"from_pipe is not yet implemented for {cls.__name__}. Please use from_pretrained instead." - ) + return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) -class ORTPipelineForText2Image(ORTPipelineBase): +class ORTPipelineForText2Image(ORTPipelineForTask): auto_model_class = AutoPipelineForText2Image - - ort_pipeline_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING - auto_pipeline_mapping = AUTO_TEXT2IMAGE_PIPELINES_MAPPING + ort_pipelines_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING -class ORTPipelineForImage2Image(ORTPipelineBase): +class ORTPipelineForImage2Image(ORTPipelineForTask): auto_model_class = AutoPipelineForImage2Image + ort_pipelines_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING - ort_pipeline_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING - auto_pipeline_mapping = AUTO_IMAGE2IMAGE_PIPELINES_MAPPING - -class ORTPipelineForInpainting(ORTPipelineBase): +class ORTPipelineForInpainting(ORTPipelineForTask): auto_model_class = AutoPipelineForInpainting - - ort_pipeline_mapping = ORT_INPAINT_PIPELINES_MAPPING - auto_pipeline_mapping = AUTO_INPAINT_PIPELINES_MAPPING + ort_pipelines_mapping = ORT_INPAINT_PIPELINES_MAPPING From f99a058f7ea75578770808e116256348bada63ac Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sat, 7 Sep 2024 17:29:14 +0200 Subject: [PATCH 13/71] test --- optimum/onnxruntime/base.py | 1 + optimum/onnxruntime/modeling_diffusion.py | 14 +++++++++----- optimum/onnxruntime/modeling_seq2seq.py | 10 ---------- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index 5206edfc08..ccfd646ea0 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -43,6 +43,7 @@ class ORTModelPart: def __init__(self, session: InferenceSession, parent_model: "ORTModel"): self.session = session self.parent_model = parent_model + self.main_input_name = self.parent_model.main_input_name self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 606919ea7f..0d3fa2bcc5 100644 --- 
a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -30,7 +30,6 @@ AutoPipelineForText2Image, ConfigMixin, DDIMScheduler, - DiffusionPipeline, LatentConsistencyModelPipeline, LMSDiscreteScheduler, PNDMScheduler, @@ -88,8 +87,6 @@ class ORTPipeline(ORTModel): config_name = "model_index.json" sub_component_config_name = "config.json" - main_input_name = "prompt" - def __init__( self, vae_decoder_session: ort.InferenceSession, @@ -592,6 +589,7 @@ class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ + main_input_name = "prompt" auto_model_class = StableDiffusionPipeline __call__ = StableDiffusionPipelineMixin.__call__ @@ -603,6 +601,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipel ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ + main_input_name = "prompt" auto_model_class = StableDiffusionImg2ImgPipeline __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ @@ -614,6 +613,7 @@ class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipel ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ + main_input_name = "prompt" auto_model_class = StableDiffusionInpaintPipeline __call__ = StableDiffusionInpaintPipelineMixin.__call__ @@ -625,6 +625,7 @@ class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMi ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ + main_input_name = "prompt" auto_model_class = LatentConsistencyModelPipeline __call__ = LatentConsistencyPipelineMixin.__call__ @@ -683,6 +684,7 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ + main_input_name = "prompt" auto_model_class = StableDiffusionXLPipeline __call__ = StableDiffusionXLPipelineMixin.__call__ @@ -694,6 +696,7 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
""" + main_input_name = "prompt" auto_model_class = StableDiffusionXLImg2ImgPipeline __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ @@ -719,6 +722,8 @@ def _get_pipeline_class(class_name: str, throw_error_if_not_exist: bool = True): class ORTDiffusionPipeline(ConfigMixin): + config_name = "model_index.json" + @classmethod @validate_hf_hub_args def from_pretrained(cls, pretrained_model_or_path, **kwargs): @@ -790,8 +795,7 @@ def _get_model_name(pipeline_class_name): class ORTPipelineForTask(ConfigMixin): - auto_model_class = None - ort_pipelines_mapping = None + config_name = "model_index.json" @classmethod def from_pretrained(cls, pretrained_model_or_path, **kwargs): diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 4ce3e4707e..fc185500d8 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -72,16 +72,6 @@ from transformers.generation_utils import GenerationMixin -# if check_if_transformers_greater("4.37.0"): -# # starting from transformers v4.37.0, the whisper generation loop is implemented in the `WhisperGenerationMixin` -# # and it implements many new features including short and long form generation, and starts with 2 init tokens -# from transformers.models.whisper.generation_whisper import WhisperGenerationMixin -# else: - -# class WhisperGenerationMixin(WhisperForConditionalGeneration, GenerationMixin): -# pass - - if check_if_transformers_greater("4.43.0"): from transformers.cache_utils import EncoderDecoderCache else: From 781ede7d6a530d023bb78283336564c107e129ca Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sat, 7 Sep 2024 20:13:54 +0200 Subject: [PATCH 14/71] test --- optimum/onnxruntime/base.py | 41 +++++++++-------- optimum/onnxruntime/modeling_seq2seq.py | 58 ------------------------- 2 files changed, 22 insertions(+), 77 deletions(-) diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index ccfd646ea0..b59c59ede7 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -22,6 +22,7 @@ from onnxruntime import InferenceSession +from ..utils import NormalizedConfigManager from ..utils.logging import warn_once from .io_binding import TypeHelper from .modeling_ort import ORTModel @@ -83,12 +84,18 @@ class ORTEncoder(ORTModelPart): Encoder part of the encoder-decoder model for ONNX Runtime inference. """ - def forward( - self, - input_ids: torch.LongTensor, - attention_mask: torch.LongTensor, - **kwargs, - ) -> BaseModelOutput: + def __init__(self, session: InferenceSession, parent_model: "ORTModel"): + super().__init__(session, parent_model) + + config = ( + self.parent_model.config.encoder + if hasattr(self.parent_model.config, "encoder") + else self.parent_model.config + ) + + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + + def forward(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, **kwargs) -> BaseModelOutput: use_torch = isinstance(input_ids, torch.Tensor) self.parent_model.raise_on_numpy_input_io_binding(use_torch) @@ -131,6 +138,14 @@ def __init__( ): super().__init__(session, parent_model) + config = ( + self.parent_model.config.encoder + if hasattr(self.parent_model.config, "encoder") + else self.parent_model.config + ) + + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + # TODO: make this less hacky. 
self.key_value_input_names = [key for key in self.input_names if (".key" in key) or (".value" in key)] self.key_value_output_names = [key for key in self.output_names if (".key" in key) or (".value" in key)] @@ -146,11 +161,7 @@ def __init__( self.use_past_in_outputs = len(self.key_value_output_names) > 0 self.use_past_in_inputs = len(self.key_value_input_names) > 0 - self.use_fp16 = False - for inp in session.get_inputs(): - if "past_key_values" in inp.name and inp.type == "tensor(float16)": - self.use_fp16 = True - break + self.use_fp16 = self.dtype == torch.float16 # We may use ORTDecoderForSeq2Seq for vision-encoder-decoder models, where models as gpt2 # can be used but do not support KV caching for the cross-attention key/values, see: @@ -454,11 +465,3 @@ def prepare_inputs_for_merged( cache_position = cache_position.to(self.device) return use_cache_branch_tensor, past_key_values, cache_position - - -class ORTDecoder(ORTDecoderForSeq2Seq): - def __init__(self, *args, **kwargs): - logger.warning( - "The class `ORTDecoder` is deprecated and will be removed in optimum v1.15.0, please use `ORTDecoderForSeq2Seq` instead." - ) - super().__init__(*args, **kwargs) diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index fc185500d8..3cecadafe3 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -46,7 +46,6 @@ from ..onnx.utils import _get_external_data_paths from ..utils import check_if_transformers_greater from ..utils.file_utils import validate_file_exists -from ..utils.normalized_config import NormalizedConfigManager from ..utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors from .base import ORTDecoderForSeq2Seq, ORTEncoder from .constants import ( @@ -1155,49 +1154,6 @@ class ORTModelForSeq2SeqLM(ORTModelForConditionalGeneration, GenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" - def __init__( - self, - encoder_session: ort.InferenceSession, - decoder_session: ort.InferenceSession, - config: "PretrainedConfig", - onnx_paths: List[str], - decoder_with_past_session: Optional[ort.InferenceSession] = None, - use_cache: bool = True, - use_io_binding: Optional[bool] = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - preprocessors: Optional[List] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ): - super().__init__( - encoder_session, - decoder_session, - config, - onnx_paths, - decoder_with_past_session, - use_cache, - use_io_binding, - model_save_dir, - preprocessors, - generation_config, - **kwargs, - ) - - # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized. 
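A minimal sketch of the config resolution that the hunks above fold into the ORT model parts themselves (which is why the per-model overrides removed below become redundant), assuming only the NormalizedConfigManager API already used in this patch; the helper name is hypothetical:

from optimum.utils import NormalizedConfigManager

def resolve_normalized_config(top_level_config, part: str):
    # Composite architectures (e.g. encoder-decoder) expose an `encoder`/`decoder`
    # sub-config; single-config models fall back to the top-level config.
    config = getattr(top_level_config, part) if hasattr(top_level_config, part) else top_level_config
    return NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)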
- if config.model_type == "encoder-decoder": - self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.encoder.model_type - )(config.encoder) - - self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - - if self.decoder_with_past is not None: - self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoder(session, self) @@ -1511,20 +1467,6 @@ def __init__( **kwargs, ) - # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized. - self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.encoder.model_type - )(config.encoder) - - self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - - if self.decoder_with_past is not None: - self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoderForVisionEncoderDecoder(session, self) From f0e3f2be5ccfcdb4da6bdfae32a1a5262292b699 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sat, 7 Sep 2024 20:23:21 +0200 Subject: [PATCH 15/71] use latent-consistency architecture name instead of lcm --- optimum/exporters/tasks.py | 2 +- optimum/onnxruntime/__init__.py | 2 ++ optimum/onnxruntime/modeling_diffusion.py | 2 +- tests/exporters/exporters_utils.py | 2 +- tests/onnxruntime/test_diffusion.py | 12 ++++++------ tests/onnxruntime/utils_onnxruntime_tests.py | 2 +- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 9705304087..a489f34fb0 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -308,9 +308,9 @@ class TasksManager: "image-feature-extraction": "feature-extraction", # for backward compatibility and testing (where # model task and model type are still the same) - "lcm": "text-to-image", "stable-diffusion": "text-to-image", "stable-diffusion-xl": "text-to-image", + "latent-consistency": "text-to-image", } _CUSTOM_CLASSES = { diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 35cbf14587..78ef2896d0 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -91,6 +91,7 @@ "ORTPipelineForText2Image", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", + "ORTDiffusionPipeline", ] @@ -149,6 +150,7 @@ ) else: from .modeling_diffusion import ( + ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 0d3fa2bcc5..32c64f38ef 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -748,9 +748,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ - ("lcm", ORTLatentConsistencyModelPipeline), ("stable-diffusion", ORTStableDiffusionPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), + ("latent-consistency", ORTLatentConsistencyModelPipeline), ] ) diff --git 
a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index a55c7a124d..c8a33b0be3 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -298,7 +298,7 @@ PYTORCH_DIFFUSION_MODEL = { "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", - "lcm": "echarlaix/tiny-random-latent-consistency", + "latent-consistency": "echarlaix/tiny-random-latent-consistency", } PYTORCH_TIMM_MODEL = { diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index a8b82dd7c4..a7360ab386 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -86,7 +86,7 @@ def to_np(image): class ORTPipelineForText2ImageTest(ORTModelTestMixin): ARCHITECTURE_TO_ORTMODEL_CLASS = { - "lcm": ORTLatentConsistencyModelPipeline, + "latent-consistency": ORTLatentConsistencyModelPipeline, "stable-diffusion": ORTStableDiffusionPipeline, "stable-diffusion-xl": ORTStableDiffusionXLPipeline, } @@ -150,8 +150,8 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - if model_arch == "lcm": - # LCM doesn't support deterministic outputs beyond the first inference step + if model_arch == "latent-consistency": + # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step # TODO: Investigate why this is the case inputs["num_inference_steps"] = 1 @@ -267,7 +267,7 @@ def test_shape(self, model_arch: str): @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) @require_diffusers def test_image_reproducibility(self, model_arch: str): - if model_arch in ["lcm"]: + if model_arch in ["latent-consistency"]: pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -288,8 +288,8 @@ def test_image_reproducibility(self, model_arch: str): @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) def test_negative_prompt(self, model_arch: str): - if model_arch in ["lcm"]: - pytest.skip("LCM (Latent Consistency Model) does not support negative prompts") + if model_arch in ["latent-consistency"]: + pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index aa06476498..bb6935461d 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -108,7 +108,7 @@ "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", "levit": "hf-internal-testing/tiny-random-LevitModel", - "lcm": "echarlaix/tiny-random-latent-consistency", + "latent-consistency": "echarlaix/tiny-random-latent-consistency", "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", "longt5": "hf-internal-testing/tiny-random-LongT5Model", From 80c63d087c2c7fb537a8d9740627f9042660e9a2 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sat, 7 Sep 2024 21:32:02 +0200 Subject: [PATCH 16/71] fix --- optimum/onnxruntime/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index b59c59ede7..0e54bafed7 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -139,8 +139,8 @@ def __init__( super().__init__(session, parent_model) config = ( - self.parent_model.config.encoder - if hasattr(self.parent_model.config, "encoder") + self.parent_model.config.decoder + if hasattr(self.parent_model.config, "decoder") else self.parent_model.config ) From a4518f23ede32ebebcf9a2b0a4beb3e4d7ac86b4 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sun, 8 Sep 2024 10:59:01 +0200 Subject: [PATCH 17/71] add ort diffusion pipeline tests --- optimum/onnxruntime/modeling_diffusion.py | 15 +- .../diffusers/pipeline_stable_diffusion_xl.py | 1 - tests/onnxruntime/test_diffusion.py | 134 ++++++++++-------- 3 files changed, 84 insertions(+), 66 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 32c64f38ef..18cd38c5f2 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -712,13 +712,16 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ] -def _get_pipeline_class(class_name: str, throw_error_if_not_exist: bool = True): +def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: - if ort_pipeline_class.auto_model_class.__name__ == class_name: + if ( + ort_pipeline_class.__name__ == pipeline_class_name + or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): return ort_pipeline_class if throw_error_if_not_exist: - raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {class_name}") + raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") class ORTDiffusionPipeline(ConfigMixin): @@ -777,10 +780,10 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): def _get_task_class(mapping, pipeline_class_name): def _get_model_name(pipeline_class_name): for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS: - for model_name, ort_pipeline in ort_pipelines_mapping.items(): + for model_name, ort_pipeline_class in ort_pipelines_mapping.items(): if ( - ort_pipeline.__name__ == pipeline_class_name - or ort_pipeline.auto_model_class.__name__ == pipeline_class_name + ort_pipeline_class.__name__ == pipeline_class_name + or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name ): return model_name diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 3c210862ac..0407c16a77 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -418,7 +418,6 @@ def __call__( # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - print("timesteps", timesteps) # 5. 
Prepare latent variables latents = self.prepare_latents( diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index a7360ab386..9f480b2d1a 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -22,6 +22,7 @@ AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, + DiffusionPipeline, ) from diffusers.utils import load_image from parameterized import parameterized @@ -29,27 +30,22 @@ from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin from optimum.onnxruntime import ( - ORTLatentConsistencyModelPipeline, + ORTDiffusionPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, - ORTStableDiffusionImg2ImgPipeline, - ORTStableDiffusionInpaintPipeline, - ORTStableDiffusionPipeline, - ORTStableDiffusionXLImg2ImgPipeline, - ORTStableDiffusionXLPipeline, ) from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm -def get_generator(generator_framework, seed): - if generator_framework == "np": +def get_generator(framework, seed): + if framework == "np": return np.random.RandomState(seed) - elif generator_framework == "pt": + elif framework == "pt": return torch.Generator().manual_seed(seed) else: - raise ValueError(f"Unknown generator_framework: {generator_framework}") + raise ValueError(f"Unknown framework: {framework}") def _generate_prompts(batch_size=1): @@ -85,11 +81,7 @@ def to_np(image): class ORTPipelineForText2ImageTest(ORTModelTestMixin): - ARCHITECTURE_TO_ORTMODEL_CLASS = { - "latent-consistency": ORTLatentConsistencyModelPipeline, - "stable-diffusion": ORTStableDiffusionPipeline, - "stable-diffusion-xl": ORTStableDiffusionXLPipeline, - } + SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] ORTMODEL_CLASS = ORTPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -113,15 +105,23 @@ def test_load_vanilla_model_which_is_not_supported(self): f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) ) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_ort_pipeline_class_dispatch(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(pipeline, self.ARCHITECTURE_TO_ORTMODEL_CLASS[model_arch]) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) + + self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -138,7 +138,7 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images).images 
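A minimal usage sketch of the auto classes these tests exercise, assuming a local ONNX export of a Stable Diffusion checkpoint; the path below is a placeholder. Both entry points read `_class_name` from model_index.json and dispatch to the matching ORT pipeline class:

from optimum.onnxruntime import ORTDiffusionPipeline, ORTPipelineForText2Image

# For a plain Stable Diffusion export, both calls resolve to ORTStableDiffusionPipeline.
pipeline = ORTPipelineForText2Image.from_pretrained("path/to/onnx-stable-diffusion")
pipeline = ORTDiffusionPipeline.from_pretrained("path/to/onnx-stable-diffusion")

image = pipeline(prompt="a sailing ship on a calm sea", num_inference_steps=2).images[0]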
self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -168,9 +168,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): self.assertEqual(ort_pipeline.device, diffusers_pipeline.device) @parameterized.expand( - grid_parameters( - {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]} - ) + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) ) @require_torch_gpu @pytest.mark.cuda_ep_test @@ -189,9 +187,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): self.assertEqual(outputs.shape, (batch_size, height, width, 3)) @parameterized.expand( - grid_parameters( - {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]} - ) + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) ) @require_torch_gpu @require_ort_rocm @@ -210,7 +206,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -242,7 +238,7 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: self.assertTrue(auto_callback.has_been_called) self.assertEqual(auto_callback.number_of_steps, ort_callback.number_of_steps) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_shape(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -264,7 +260,7 @@ def test_shape(self, model_arch: str): (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), ) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): if model_arch in ["latent-consistency"]: @@ -286,7 +282,7 @@ def test_image_reproducibility(self, model_arch: str): self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): if model_arch in ["latent-consistency"]: pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") @@ -335,10 +331,8 @@ def test_negative_prompt(self, model_arch: str): class ORTPipelineForImage2ImageTest(ORTModelTestMixin): - ARCHITECTURE_TO_ORTMODEL_CLASS = { - "stable-diffusion": ORTStableDiffusionImg2ImgPipeline, - "stable-diffusion-xl": ORTStableDiffusionXLImg2ImgPipeline, - } + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + AUTOMODEL_CLASS = AutoPipelineForImage2Image ORTMODEL_CLASS = ORTPipelineForImage2Image @@ -364,7 +358,23 @@ def 
test_load_vanilla_model_which_is_not_supported(self): f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) ) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(list(SUPPORTED_ARCHITECTURES)) + @require_diffusers + def test_ort_pipeline_class_dispatch(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) + + # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -383,9 +393,7 @@ def test_num_images_per_prompt(self, model_arch: str): self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) @parameterized.expand( - grid_parameters( - {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]} - ) + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) ) @require_torch_gpu @pytest.mark.cuda_ep_test @@ -406,9 +414,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): self.assertEqual(outputs.shape, (batch_size, height, width, 3)) @parameterized.expand( - grid_parameters( - {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]} - ) + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) ) @require_torch_gpu @require_ort_rocm @@ -429,7 +435,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): if model_arch in ["stable-diffusion"]: @@ -465,7 +471,7 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: self.assertTrue(ort_callback.has_been_called) self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_shape(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -490,7 +496,7 @@ def test_shape(self, model_arch: str): (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), ) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): pytest.skip("Img2Img models do not support support output reproducibility for some reason") @@ -509,7 +515,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2)) - 
@parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): pytest.skip("Img2Img models do not support support output reproducibility for some reason") @@ -532,9 +538,7 @@ def test_image_reproducibility(self, model_arch: str): class ORTPipelineForInpaintingTest(ORTModelTestMixin): - ARCHITECTURE_TO_ORTMODEL_CLASS = { - "stable-diffusion": ORTStableDiffusionInpaintPipeline, - } + SUPPORTED_ARCHITECTURES = ["stable-diffusion"] AUTOMODEL_CLASS = AutoPipelineForInpainting ORTMODEL_CLASS = ORTPipelineForInpainting @@ -568,7 +572,23 @@ def test_load_vanilla_model_which_is_not_supported(self): f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) ) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_ort_pipeline_class_dispatch(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) + + # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -587,9 +607,7 @@ def test_num_images_per_prompt(self, model_arch: str): self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) @parameterized.expand( - grid_parameters( - {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]} - ) + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) ) @require_torch_gpu @pytest.mark.cuda_ep_test @@ -610,9 +628,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): self.assertEqual(outputs.shape, (batch_size, height, width, 3)) @parameterized.expand( - grid_parameters( - {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]} - ) + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) ) @require_torch_gpu @require_ort_rocm @@ -633,7 +649,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -664,7 +680,7 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: self.assertTrue(ort_callback.has_been_called) self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_shape(self, model_arch: str): model_args 
= {"test_name": model_arch, "model_arch": model_arch} @@ -689,7 +705,7 @@ def test_shape(self, model_arch: str): (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), ) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): if model_arch in ["stable-diffusion"]: @@ -724,7 +740,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), ) - @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys())) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} From 9f0c7b632388274f6c451d2ee597935761198b1f Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 10 Sep 2024 11:03:44 +0200 Subject: [PATCH 18/71] added dummy objects --- optimum/onnxruntime/__init__.py | 10 +++++- optimum/utils/dummy_diffusers_objects.py | 44 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 78ef2896d0..09a48ec955 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -79,6 +79,10 @@ "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + "ORTPipelineForText2Image", + "ORTDiffusionPipeline", ] else: _import_structure["modeling_diffusion"] = [ @@ -88,9 +92,9 @@ "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", - "ORTPipelineForText2Image", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", + "ORTPipelineForText2Image", "ORTDiffusionPipeline", ] @@ -141,7 +145,11 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( + ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index f6914bbcd3..35d1ffe9fc 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -79,3 +79,47 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) + + +class ORTDiffusionPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForText2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForImage2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class 
ORTPipelineForInpainting(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) From 56d06d467e049c7838b1b6036e2b8c65eb5d7500 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 10 Sep 2024 11:19:49 +0200 Subject: [PATCH 19/71] remove duplicate code --- .../pipeline_stable_diffusion_img2img.py | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index f7f0586ac9..a66035a789 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -19,7 +19,6 @@ import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import deprecate from .pipeline_stable_diffusion import StableDiffusionPipelineMixin @@ -228,31 +227,7 @@ def __call__( latents_dtype = prompt_embeds.dtype image = image.astype(latents_dtype) - # encode the init image into latents and scale the latents - init_latents = self.vae_encoder(sample=image)[0] - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - init_latents = scaling_factor * init_latents - - if isinstance(prompt, str): - prompt = [prompt] - if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = len(prompt) // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) - elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." 
- ) - else: - init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) # get the original timestep using init_timestep offset = self.scheduler.config.get("steps_offset", 0) @@ -274,8 +249,6 @@ def __call__( if accepts_eta: extra_step_kwargs["eta"] = eta - latents = init_latents - t_start = max(num_inference_steps - init_timestep + offset, 0) timesteps = self.scheduler.timesteps[t_start:].numpy() From 7bfe4a5091133f5fb63562c27a6511f6dceddc7c Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 10 Sep 2024 15:32:15 +0200 Subject: [PATCH 20/71] update stable diffusion mixin --- .../diffusers/pipeline_stable_diffusion.py | 644 ++++++++++++------ optimum/pipelines/diffusers/pipeline_utils.py | 362 ++++------ tests/onnxruntime/test_diffusion.py | 66 +- 3 files changed, 580 insertions(+), 492 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 6cc47fab1b..9eddbd1462 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -14,70 +14,87 @@ import inspect import logging -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg, retrieve_timesteps -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg +from .pipeline_utils import DiffusionPipelineMixin, randn_tensor logger = logging.getLogger(__name__) class StableDiffusionPipelineMixin(DiffusionPipelineMixin): - # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L114 - def _encode_prompt( + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def encode_prompt( self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, ): r""" Encodes the prompt into text encoder hidden states. Args: - prompt (`Union[str, List[str]]`): + prompt (`str` or `List[str]`, *optional*): prompt to be encoded + device: (`torch.device`): + torch device num_images_per_prompt (`int`): number of images that should be generated per prompt do_classifier_free_guidance (`bool`): whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. 
If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. """ - if isinstance(prompt, str): + + if prompt is not None and isinstance(prompt, str): batch_size = 1 - elif isinstance(prompt, list): + elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] if prompt_embeds is None: - # get prompt text embeddings text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="np", + return_tensors="pt", ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - if not np.array_equal(text_input_ids, untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] ) @@ -86,22 +103,59 @@ def _encode_prompt( f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) - prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] + # if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + # attention_mask = text_inputs.attention_mask.to(device) + # else: + # attention_mask = None - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + if clip_skip is None: + prompt_embeds = self.text_encoder( + text_input_ids.to(device) + # attention_mask=attention_mask + ) + prompt_embeds = next(iter(prompt_embeds.values())) + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + # attention_mask=attention_mask, + # output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] if negative_prompt is None: uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}." ) elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] * batch_size + uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" @@ -117,41 +171,119 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="np", + return_tensors="pt", + ) + + # if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + # attention_mask = uncond_input.attention_mask.to(device) + # else: + # attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + # attention_mask=attention_mask, ) - negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + negative_prompt_embeds = next(iter(negative_prompt_embeds.values())) if do_classifier_free_guidance: - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds, negative_prompt_embeds + + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = 
self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds]) + return image_embeds, uncond_image_embeds - return prompt_embeds + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def decode_latents(self, latents): + latents = 1 / self.vae_decoder.config.scaling_factor * latents + image = self.vae_decoder(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs - # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217 def check_inputs( self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}." 
) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) if prompt is not None and prompt_embeds is not None: raise ValueError( @@ -179,9 +311,28 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = ( + batch_size, + num_channels_latents, + int(height) // self.vae_scale_factor, + int(width) // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -189,211 +340,316 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*shape).astype(dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) - elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - + latents = latents * self.scheduler.init_noise_sigma return latents - # Adapted from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L264 + @torch.no_grad() def __call__( self, prompt: Optional[Union[str, List[str]]] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, + num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" - Function invoked when calling the pipeline for generation. + The call function to the pipeline for generation. Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): + num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. 
Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. 
`callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. """ - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor + device = self.device + + latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents + prompt_embeds = ( + self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + ) + negative_prompt_embeds = ( + self.np_to_pt(negative_prompt_embeds, device) + if isinstance(negative_prompt_embeds, np.ndarray) + else negative_prompt_embeds + ) + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + logger.warning( + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + logger.warning( + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - # check inputs. Raise error if not correct + # 0. Default height and width to unet + height = height or self.unet.config.get("sample_size") * self.vae_scale_factor + width = width or self.unet.config.get("sample_size") * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, ) - # define call parameters - if isinstance(prompt, str): + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): batch_size = 1 - elif isinstance(prompt, list): + elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] - if generator is None: - generator = np.random.RandomState() - + lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 + do_classifier_free_guidance = guidance_scale > 1 and self.unet.config.get("time_cond_proj_dim", None) is None - prompt_embeds = self._encode_prompt( + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, + device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=clip_skip, ) - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + do_classifier_free_guidance, + ) + else: + image_embeds = None + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.get("in_channels") latents = self.prepare_latents( batch_size * num_images_per_prompt, - self.unet.config.get("in_channels", 4), + num_channels_latents, height, width, prompt_embeds.dtype, + device, generator, latents, ) - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta + # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) + else None + ) + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.get("time_cond_proj_dim", None) is not None: + guidance_scale_tensor = torch.tensor(guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.get("time_cond_proj_dim", None) + ).to(device=device, dtype=latents.dtype) + # 7. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + sample=latent_model_input, + timestep=t.unsqueeze(0), + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), + # cross_attention_kwargs=cross_attention_kwargs, + # added_cond_kwargs=added_cond_kwargs, + # return_dict=False, + ) + noise_pred = next(iter(noise_pred.values())) + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if do_classifier_free_guidance and guidance_rescale > 0.0: # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae_decoder( + latents / self.vae_decoder.config.get("scaling_factor"), + # return_dict=False, + # generator=generator, + # TODO: in some models, it might be mandatory to pass generator here for reproducibility ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": + image = next(iter(image.values())) + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: image = latents has_nsfw_concept = None - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) if has_nsfw_concept is None: do_denormalize = [True] * image.shape[0] @@ -402,26 +658,10 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + # # Offload all models + # self.maybe_free_model_hooks() + if not return_dict: return (image, has_nsfw_concept) return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def run_safety_checker(self, image: np.ndarray): - if self.safety_checker is None: - has_nsfw_concept = None - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="np" - ).pixel_values.astype(image.dtype) - images, has_nsfw_concept = [], [] - for i in range(image.shape[0]): - image_i, has_nsfw_concept_i = self.safety_checker( - clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] - ) - images.append(image_i) - has_nsfw_concept.append(has_nsfw_concept_i[0]) - image = np.concatenate(images) - - return image, has_nsfw_concept diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py index e9d5986b61..71ae650ed1 100644 --- 
a/optimum/pipelines/diffusers/pipeline_utils.py +++ b/optimum/pipelines/diffusers/pipeline_utils.py @@ -13,36 +13,23 @@ # limitations under the License. -import warnings -from typing import List, Optional, Union +import logging +from typing import List, Optional, Tuple, Union import numpy as np -import PIL.Image import torch from diffusers import ConfigMixin -from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor -from diffusers.utils.pil_utils import PIL_INTERPOLATION -from PIL import Image from tqdm.auto import tqdm +from transformers.modeling_outputs import ModelOutput + + +logger = logging.getLogger(__name__) class DiffusionPipelineMixin(ConfigMixin): - # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L812 @staticmethod - def numpy_to_pil(images): - """ - Converts a numpy image or a batch of images to a PIL image. - """ - if images.ndim == 3: - images = images[None, ...] - images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images + def np_to_pt(tensor: np.ndarray, device: str) -> "torch.Tensor": + return torch.from_numpy(tensor).to(device) # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L827 def progress_bar(self, iterable=None, total=None): @@ -61,222 +48,125 @@ def progress_bar(self, iterable=None, total=None): raise ValueError("Either `total` or `iterable` has to be defined.") -# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L58 -def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): +def np_randn_tensor( + shape: Union[Tuple, List], + generator: Optional[Union[List["np.random.RandomState"], "np.random.RandomState"]] = None, + device: Optional["torch.device"] = None, + dtype: Optional["torch.dtype"] = None, + layout: Optional["torch.layout"] = None, +): + """A helper function to create random tensors on the desired `device` with the desired `dtype`. When + passing a list of generators, you can seed each batch size individually. If CPU generators are passed, the tensor + is always created on the CPU. """ - Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + batch_size = shape[0] + + # make sure generator list of length 1 is treated like a non-list + if isinstance(generator, list) and len(generator) == 1: + generator = generator[0] + + if isinstance(generator, list): + shape = (1,) + shape[1:] + latents = [generator[i].randn(*shape) for i in range(batch_size)] + latents = np.stack(latents, axis=0) + elif generator is not None: + latents = generator.randn(*shape) + else: + latents = np.random.randn(*shape) + + return latents + + +def pt_randn_tensor( + shape: Union[Tuple, List], + generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None, + device: Optional["torch.device"] = None, + dtype: Optional["torch.dtype"] = None, + layout: Optional["torch.layout"] = None, +): + """A helper function to create random tensors on the desired `device` with the desired `dtype`. 
When + passing a list of generators, you can seed each batch size individually. If CPU generators are passed, the tensor + is always created on the CPU. """ - std_text = np.std(noise_pred_text, axis=tuple(range(1, noise_pred_text.ndim)), keepdims=True) - std_cfg = np.std(noise_cfg, axis=tuple(range(1, noise_cfg.ndim)), keepdims=True) - # rescale the results from guidance (fixes overexposure) - noise_pred_rescaled = noise_cfg * (std_text / std_cfg) - # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images - noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg - return noise_cfg - - -class VaeImageProcessor(DiffusersVaeImageProcessor): - # Adapted from diffusers.VaeImageProcessor.denormalize - @staticmethod - def denormalize(images: np.ndarray): - """ - Denormalize an image array to [0,1]. - """ - return np.clip(images / 2 + 0.5, 0, 1) - - # Adapted from diffusers.VaeImageProcessor.preprocess - def preprocess( - self, - image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, - ) -> np.ndarray: - """ - Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors. - """ - supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - - do_convert_grayscale = getattr(self.config, "do_convert_grayscale", False) - # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image - if do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: - if isinstance(image, torch.Tensor): - # if image is a pytorch tensor could have 2 possible shapes: - # 1. batch x height x width: we should insert the channel dimension at position 1 - # 2. channnel x height x width: we should insert batch dimension at position 0, - # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 - # for simplicity, we insert a dimension of size 1 at position 1 for both cases - image = image.unsqueeze(1) - else: - # if it is a numpy array, it could have 2 possible shapes: - # 1. batch x height x width: insert channel dimension on last position - # 2. height x width x channel: insert batch dimension on first position - if image.shape[-1] == 1: - image = np.expand_dims(image, axis=0) - else: - image = np.expand_dims(image, axis=-1) - - if isinstance(image, supported_formats): - image = [image] - elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): - raise ValueError( - f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support {', '.join(supported_formats)}" - ) - - if isinstance(image[0], PIL.Image.Image): - if self.config.do_convert_rgb: - image = [self.convert_to_rgb(i) for i in image] - elif do_convert_grayscale: - image = [self.convert_to_grayscale(i) for i in image] - if self.config.do_resize: - height, width = self.get_height_width(image[0], height, width) - image = [self.resize(i, height, width) for i in image] - image = self.reshape(self.pil_to_numpy(image)) - else: - if isinstance(image[0], torch.Tensor): - image = [self.pt_to_numpy(elem) for elem in image] - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - else: - image = self.reshape(np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)) - - if do_convert_grayscale and image.ndim == 3: - image = np.expand_dims(image, 1) - - # don't need any preprocess if the image is latents - if image.shape[1] == 4: - return image - - if self.config.do_resize: - height, width = self.get_height_width(image, height, width) - image = self.resize(image, height, width) - - # expected range [0,1], normalize to [-1,1] - do_normalize = self.config.do_normalize - if image.min() < 0 and do_normalize: - warnings.warn( - "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " - f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", - FutureWarning, - ) - do_normalize = False - - if do_normalize: - image = self.normalize(image) - - if getattr(self.config, "do_binarize", False): - image = self.binarize(image) - - return image - - # Adapted from diffusers.VaeImageProcessor.postprocess - def postprocess( - self, - image: np.ndarray, - output_type: str = "pil", - do_denormalize: Optional[List[bool]] = None, + # device on which tensor is created defaults to device + rand_device = device + batch_size = shape[0] + + layout = layout or torch.strided + device = device or torch.device("cpu") + + if generator is not None: + gen_device_type = generator.device.type if not isinstance(generator, list) else generator[0].device.type + if gen_device_type != device.type and gen_device_type == "cpu": + rand_device = "cpu" + if device != "mps": + logger.info( + f"The passed generator was created on 'cpu' even though a tensor on {device} was expected." + f" Tensors will be created on 'cpu' and then moved to {device}. Note that one can probably" + f" slighly speed up this function by passing a generator that was created on the {device} device." 
+ ) + elif gen_device_type != device.type and gen_device_type == "cuda": + raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.") + + # make sure generator list of length 1 is treated like a non-list + if isinstance(generator, list) and len(generator) == 1: + generator = generator[0] + + if isinstance(generator, list): + shape = (1,) + shape[1:] + latents = [ + torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype, layout=layout) + for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0).to(device) + else: + latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device) + + return latents + + +def randn_tensor( + shape: Union[Tuple, List], + generator: Optional[ + Union[List[Union["torch.Generator", "np.random.RandomState"]], "torch.Generator", "np.random.RandomState"] + ] = None, + device: Optional["torch.device"] = None, + dtype: Optional["torch.dtype"] = None, + layout: Optional["torch.layout"] = None, +) -> "torch.Tensor": + if (isinstance(generator, list) and isinstance(generator[0], torch.Generator)) or isinstance( + generator, torch.Generator ): - if not isinstance(image, np.ndarray): - raise ValueError( - f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array" - ) - if output_type not in ["latent", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - warnings.warn(deprecation_message, FutureWarning) - output_type = "np" - - if output_type == "latent": - return image - - if do_denormalize is None: - do_denormalize = [self.config.do_normalize] * image.shape[0] - - image = np.stack( - [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0 - ) - - image = image.transpose((0, 2, 3, 1)) - - if output_type == "pil": - image = self.numpy_to_pil(image) - - return image - - def get_height_width( - self, - image: Union[PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, + return pt_randn_tensor(shape, generator, device, dtype, layout) + elif (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) or isinstance( + generator, np.random.RandomState ): - """ - This function return the height and width that are downscaled to the next integer multiple of - `vae_scale_factor`. - - Args: - image(`PIL.Image.Image`, `np.ndarray`): - The image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have - shape `[batch, height, width]` or `[batch, height, width, channel]` if it is a pytorch tensor, should - have shape `[batch, channel, height, width]`. - height (`int`, *optional*, defaults to `None`): - The height in preprocessed image. If `None`, will use the height of `image` input. - width (`int`, *optional*`, defaults to `None`): - The width in preprocessed. If `None`, will use the width of the `image` input. 
- """ - height = height or (image.height if isinstance(image, PIL.Image.Image) else image.shape[-2]) - width = width or (image.width if isinstance(image, PIL.Image.Image) else image.shape[-1]) - # resize to integer multiple of vae_scale_factor - width, height = (x - x % self.config.vae_scale_factor for x in (width, height)) - return height, width - - # Adapted from diffusers.VaeImageProcessor.numpy_to_pt - @staticmethod - def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: - """ - Convert a NumPy image to a PyTorch tensor. - """ - if images.ndim == 3: - images = images[..., None] - - images = torch.from_numpy(images) - return images - - # Adapted from diffusers.VaeImageProcessor.pt_to_numpy - @staticmethod - def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: - """ - Convert a PyTorch tensor to a NumPy image. - """ - images = images.cpu().float().numpy() - return images - - @staticmethod - def reshape(images: np.ndarray) -> np.ndarray: - """ - Reshape inputs to expected shape. - """ - if images.ndim == 3: - images = images[..., None] - - return images.transpose(0, 3, 1, 2) - - # TODO : remove after diffusers v0.21.0 release - def resize( - self, - image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], - height: Optional[int] = None, - width: Optional[int] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: - """ - Resize image. - """ - if isinstance(image, PIL.Image.Image): - image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) - elif isinstance(image, torch.Tensor): - image = torch.nn.functional.interpolate(image, size=(height, width)) - elif isinstance(image, np.ndarray): - image = self.numpy_to_pt(image) - image = torch.nn.functional.interpolate(image, size=(height, width)) - image = self.pt_to_numpy(image) - return image + return torch.from_numpy(np_randn_tensor(shape, generator, device, dtype, layout)).to(device) + else: + return pt_randn_tensor(shape, generator, device, dtype, layout) + + +def retrieve_latents( + encoder_output: ModelOutput, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latent_sample"): + return encoder_output.latent_sample + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + +from contextlib import contextmanager + +@contextmanager +def patch_randn_tensor(): + import diffusers.utils.torch_utils + + old_randn_tensor = diffusers.utils.torch_utils.randn_tensor + diffusers.utils.torch_utils.randn_tensor = randn_tensor + yield + diffusers.utils.torch_utils.randn_tensor = old_randn_tensor diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 9f480b2d1a..6e1c521d4f 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import unittest import numpy as np import PIL @@ -35,7 +34,6 @@ ORTPipelineForInpainting, ORTPipelineForText2Image, ) -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm @@ -150,10 +148,10 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - if model_arch == "latent-consistency": - # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step - # TODO: Investigate why this is the case - inputs["num_inference_steps"] = 1 + # if model_arch == "latent-consistency": + # # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step + # # TODO: Investigate why this is the case + # inputs["num_inference_steps"] = 1 for output_type in ["latent", "np"]: inputs["output_type"] = output_type @@ -263,8 +261,8 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - if model_arch in ["latent-consistency"]: - pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") + # if model_arch in ["latent-consistency"]: + # pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -279,13 +277,16 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertTrue( + np.allclose(ort_outputs_1.images[0], ort_outputs_2.images[0]), + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0]), + ) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): - if model_arch in ["latent-consistency"]: - pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") + # if model_arch in ["latent-consistency"]: + # pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -438,11 +439,6 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): - if model_arch in ["stable-diffusion"]: - pytest.skip( - "Stable Diffusion For Img2Img doesn't behave as expected with callbacks (doesn't call it every step with callback_steps=1)" - ) - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -708,11 +704,6 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - if model_arch in ["stable-diffusion"]: - pytest.skip( - "Stable Diffusion For Inpainting fails, it was used to be compared to StableDiffusionPipeline for some reason which is the text-to-image variant" - ) - model_args = {"test_name": model_arch, 
"model_arch": model_arch} self._setup(model_args) @@ -758,36 +749,3 @@ def test_image_reproducibility(self, model_arch: str): self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - -class ImageProcessorTest(unittest.TestCase): - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt")) - input_np = to_np(input_pt) - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_generate_images(height=8, width=8, input_type="np")) - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = _generate_images(height=8, width=8, batch_size=1, input_type="pil") - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) From 27c29a8f3076d85956d38bc890741f814215e986 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 10 Sep 2024 15:32:24 +0200 Subject: [PATCH 21/71] update latent consistency --- .../diffusers/pipeline_latent_consistency.py | 467 +++++++++++++----- 1 file changed, 334 insertions(+), 133 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py index 630d463de7..ef5f537230 100644 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py @@ -13,180 +13,407 @@ # limitations under the License. 
import logging -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_timesteps +from diffusers.utils.deprecation_utils import deprecate from .pipeline_stable_diffusion import StableDiffusionPipelineMixin +from .pipeline_utils import patch_randn_tensor logger = logging.getLogger(__name__) class LatentConsistencyPipelineMixin(StableDiffusionPipelineMixin): - # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264 + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"] + + def get_guidance_scale_embedding( + self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 + ) -> torch.Tensor: + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + w (`torch.Tensor`): + Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings. + embedding_dim (`int`, *optional*, defaults to 512): + Dimension of the embeddings to generate. + dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): + Data type of the generated embeddings. + + Returns: + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + # Currently StableDiffusionPipeline.check_inputs with negative prompt stuff removed + def check_inputs( + self, + prompt: Union[str, List[str]], + height: int, + width: int, + callback_steps: int, + prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def clip_skip(self): + return self._clip_skip + + @property + def do_classifier_free_guidance(self): + return False + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() def __call__( self, - prompt: Optional[Union[str, List[str]]] = None, + prompt: Union[str, List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 4, original_inference_steps: int = None, + timesteps: List[int] = None, guidance_scale: float = 8.5, - num_images_per_prompt: int = 1, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" - Function invoked when calling the pipeline for generation. + The call function to the pipeline for generation. Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. 
- num_inference_steps (`int`, defaults to 50): + num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, defaults to 1): + original_inference_steps (`int`, *optional*): + The original number of inference steps use to generate a linearly-spaced timestep schedule, from which + we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule, + following the Skipping-Step method in the paper (see Section 4.3). If not set this will default to the + scheduler's `original_inference_steps` attribute. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps on the original LCM training/distillation timestep schedule are used. Must be in descending + order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + Note that the original latent consistency models paper uses a different CFG formulation where the + guidance scales are decreased by 1 (so in the paper formulation CFG is enabled when `guidance_scale > + 0`). + num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not + provided, text embeddings are generated from the `prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. 
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. """ - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor + device = self.device - # Don't need to get negative prompts due to LCM guided distillation - negative_prompt = None - negative_prompt_embeds = None + # convert numpy arrays to torch tensors + prompt_embeds = self.np_to_pt(prompt_embeds) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + latents = self.np_to_pt(latents) if isinstance(latents, np.ndarray) else latents - # check inputs. Raise error if not correct + for k, v in kwargs.items(): + if isinstance(v, np.ndarray): + kwargs[k] = self.np_to_pt(v) + elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): + kwargs[k] = [self.np_to_pt(i) for i in v] + elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): + kwargs[k] = {k: self.np_to_pt(v) for k, v in v.items()} + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, + height, + width, + callback_steps, + prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, ) + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs - # define call parameters - if isinstance(prompt, str): + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): batch_size = 1 - elif isinstance(prompt, list): + elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] - if generator is None: - generator = np.random.RandomState() + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) - prompt_embeds = self._encode_prompt( + # NOTE: when a LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided + # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the + # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts. 
+ prompt_embeds, _ = self.encode_prompt( prompt, + device, num_images_per_prompt, - False, - negative_prompt, + self.do_classifier_free_guidance, + negative_prompt=None, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, + negative_prompt_embeds=None, + lora_scale=lora_scale, + clip_skip=self.clip_skip, ) - # set timesteps - self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps) - timesteps = self.scheduler.timesteps + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, original_inference_steps=original_inference_steps + ) + # 5. Prepare latent variable + num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, - self.unet.config["in_channels"], + num_channels_latents, height, width, prompt_embeds.dtype, + device, generator, latents, ) - bs = batch_size * num_images_per_prompt - # get Guidance Scale Embedding - w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype) - w_embedding = self.get_guidance_scale_embedding( - w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype + + # 6. Get Guidance Scale Embedding + # NOTE: We use the Imagen CFG formulation that StableDiffusionPipeline uses rather than the original LCM paper + # CFG formulation, so we need to subtract 1 from the input guidance_scale. + # LCM CFG formulation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond), (cfg_scale > 0.0 using CFG) + w = torch.tensor(self.guidance_scale - 1).repeat(bs) + w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.time_cond_proj_dim).to( + device=device, dtype=latents.dtype ) - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + # 8. 
LCM MultiStep Sampling Loop: num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latents, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - timestep_cond=w_embedding, - )[0] + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + latents = latents.to(prompt_embeds.dtype) - # compute the previous noisy sample x_t -> x_t-1 - latents, denoised = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False - ) - latents, denoised = latents.numpy(), denoised.numpy() + # model prediction (v-prediction, eps, x) + model_pred = self.unet( + latents, + timestep=t.unsqueeze(0), + timestep_cond=w_embedding, + encoder_hidden_states=prompt_embeds, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), + # cross_attention_kwargs=cross_attention_kwargs, + # added_cond_kwargs=added_cond_kwargs, + # return_dict=False, + )[0] - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + # compute the previous noisy sample x_t -> x_t-1 + + latents, denoised = self.scheduler.step(model_pred, t, latents, **extra_step_kwargs, return_dict=False) + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - if output_type == "latent": + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + w_embedding = callback_outputs.pop("w_embedding", w_embedding) + denoised = callback_outputs.pop("denoised", denoised) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + denoised = denoised.to(prompt_embeds.dtype) + if not output_type == "latent": + image = self.vae_decoder( + denoised / self.vae_decoder.config.scaling_factor, + # return_dict=False, + )[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: image = denoised has_nsfw_concept = None - else: - denoised /= self.vae_decoder.config["scaling_factor"] - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) if has_nsfw_concept is None: do_denormalize = [True] * image.shape[0] @@ -195,36 +422,10 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + # Offload all models + # self.maybe_free_model_hooks() + if not return_dict: return (image, has_nsfw_concept) return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264 - def 
get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None): - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - timesteps (`torch.Tensor`): - generate embedding vectors at these timesteps - embedding_dim (`int`, *optional*, defaults to 512): - dimension of the embeddings to generate - dtype: - data type of the generated embeddings - - Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` - """ - w = w * 1000 - half_dim = embedding_dim // 2 - emb = np.log(10000.0) / (half_dim - 1) - emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb) - emb = w[:, None] * emb[None, :] - emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1) - - if embedding_dim % 2 == 1: # zero pad - emb = np.pad(emb, [(0, 0), (0, 1)]) - - assert emb.shape == (w.shape[0], embedding_dim) - return emb From a2e5423182675bb2686083b00e35615b3008dc9f Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 10 Sep 2024 15:32:36 +0200 Subject: [PATCH 22/71] update sd for img2img --- .../pipeline_stable_diffusion_img2img.py | 967 ++++++++++++++---- 1 file changed, 762 insertions(+), 205 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index f7f0586ac9..e80e70115c 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -13,39 +13,396 @@ # limitations under the License. import inspect -from typing import Callable, List, Optional, Union +import logging +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL.Image import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import deprecate +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.models import ImageProjection +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_timesteps +from diffusers.utils.deprecation_utils import deprecate -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin +from .pipeline_utils import DiffusionPipelineMixin, randn_tensor, retrieve_latents -class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin): - # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.check_inputs +logger = logging.getLogger(__name__) + + +class StableDiffusionImg2ImgPipelineMixin(DiffusionPipelineMixin): + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. 
Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + # if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): + # self._lora_scale = lora_scale + + # # dynamically adjust the LoRA scale + # if not USE_PEFT_BACKEND: + # adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + # else: + # scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + # if isinstance(self, TextualInversionLoaderMixin): + # prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + # if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + # attention_mask = text_inputs.attention_mask.to(device) + # else: + # attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + # attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + # attention_mask=attention_mask, + # output_hidden_states=True, + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + # if isinstance(self, TextualInversionLoaderMixin): + # uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + # if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + # attention_mask = uncond_input.attention_mask.to(device) + # else: + # attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + # attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # if self.text_encoder is not None: + # if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: + # # Retrieve the original scale by scaling back the LoRA layers + # unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, 
output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + image_embeds = [] + if do_classifier_free_guidance: + negative_image_embeds = [] + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + + image_embeds.append(single_image_embeds[None, :]) + if do_classifier_free_guidance: + negative_image_embeds.append(single_negative_image_embeds[None, :]) + else: + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + negative_image_embeds.append(single_negative_image_embeds) + image_embeds.append(single_image_embeds) + + ip_adapter_image_embeds = [] + for i, single_image_embeds in enumerate(image_embeds): + single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0) + if do_classifier_free_guidance: + single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0) + + single_image_embeds = single_image_embeds.to(device=device) + ip_adapter_image_embeds.append(single_image_embeds) + + return ip_adapter_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # 
Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae_decoder.config.scaling_factor * latents + image = self.vae_decoder( + latents, + # return_dict=False, + )[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + def check_inputs( self, - prompt: Union[str, List[str]], - strength: float, - callback_steps: int, - negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, ): if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}." ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" @@ -72,131 +429,317 @@ def check_inputs( f" {negative_prompt_embeds.shape}." 
) - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + batch_size = batch_size * num_images_per_prompt if image.shape[1] == 4: init_latents = image + else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + + init_latents = [ + retrieve_latents(self.vae_encoder(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae_encoder(image), generator=generator) + + init_latents = self.vae_decoder.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) else: - init_latents = np.concatenate([init_latents], axis=0) + init_latents = torch.cat([init_latents], dim=0) - # add noise to latents using the timesteps - if isinstance(generator, np.random.RandomState): - noise = generator.randn(*init_latents.shape).astype(dtype) - elif isinstance(generator, torch.Generator): - noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." - ) + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ).numpy() + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding( + self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 + ) -> torch.Tensor: + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - return init_latents + Args: + w (`torch.Tensor`): + Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings. + embedding_dim (`int`, *optional*, defaults to 512): + Dimension of the embeddings to generate. + dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): + Data type of the generated embeddings. - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__ + Returns: + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
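As a quick illustration of the classifier-free guidance combination that the `do_classifier_free_guidance` property (defined just below) gates later in `__call__`, a minimal sketch with made-up tensor values, not taken from this patch:

    import torch

    guidance_scale = 7.5
    noise_pred_uncond = torch.zeros(1, 4, 64, 64)  # prediction for the unconditional ("") prompt
    noise_pred_text = torch.ones(1, 4, 64, 64)     # prediction for the text prompt
    # guidance_scale = 1 reduces this to the conditional prediction alone (no guidance)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)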
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() def __call__( self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, strength: float = 0.8, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, + num_inference_steps: Optional[int] = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + guidance_scale: Optional[float] = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" - Function invoked when calling the pipeline for generation. - + The call function to the pipeline for generation. Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`Union[np.ndarray, PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be upscaled. - strength (`float`, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, defaults to 50): + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. 
If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): + expense of slower inference. This parameter is modulated by `strength`. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): - A np.random.RandomState to make generation deterministic. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. 
with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + Examples: Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. """ + device = self.device + + prompt_embeds = ( + self.np_to_pt(prompt_embeds, device=device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + ) + negative_prompt_embeds = ( + self.np_to_pt(negative_prompt_embeds, device=device) + if isinstance(negative_prompt_embeds, np.ndarray) + else negative_prompt_embeds + ) - # check inputs. Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) - # define call parameters + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + strength, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + self._do_classifier_free_guidance = self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + # 2. 
Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -204,124 +747,135 @@ def __call__( else: batch_size = prompt_embeds.shape[0] - if generator is None: - generator = np.random.RandomState() - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - image = self.image_processor.preprocess(image) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, + device, num_images_per_prompt, - do_classifier_free_guidance, + self._do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, ) - - latents_dtype = prompt_embeds.dtype - image = image.astype(latents_dtype) - # encode the init image into latents and scale the latents - init_latents = self.vae_encoder(sample=image)[0] - - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - init_latents = scaling_factor * init_latents - - if isinstance(prompt, str): - prompt = [prompt] - if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = len(prompt) // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) - elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self._do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self._do_classifier_free_guidance, ) - else: - init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) - - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - - timesteps = self.scheduler.timesteps.numpy()[-init_timestep] - timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) - # 5. Prepare latent variables - latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator) + # 4. Preprocess image + image = self.image_processor.preprocess(image) - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta + # 5. set timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) - latents = init_latents + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:].numpy() + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + # 7.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + # 8. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ - 0 - ] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self._do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + timestep=t.unsqueeze(0), + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), + # cross_attention_kwargs=cross_attention_kwargs, + # added_cond_kwargs=added_cond_kwargs, + # return_dict=False, + )[0] + + # perform guidance + if self._do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae_decoder( + latents / self.vae_decoder.config.scaling_factor, + # return_dict=False, + # generator=generator, + )[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, 
prompt_embeds.dtype) + else: image = latents has_nsfw_concept = None - else: - latents /= scaling_factor - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) if has_nsfw_concept is None: do_denormalize = [True] * image.shape[0] @@ -330,6 +884,9 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + # Offload all models + # self.maybe_free_model_hooks() + if not return_dict: return (image, has_nsfw_concept) From fdac13456df57fd6ad993891498bc39ca8a654ca Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 10 Sep 2024 16:00:22 +0200 Subject: [PATCH 23/71] update latent consistency --- .../diffusers/pipeline_latent_consistency.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py index ef5f537230..d4a9cc03d4 100644 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py @@ -242,16 +242,16 @@ def __call__( device = self.device # convert numpy arrays to torch tensors - prompt_embeds = self.np_to_pt(prompt_embeds) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds - latents = self.np_to_pt(latents) if isinstance(latents, np.ndarray) else latents + prompt_embeds = self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents for k, v in kwargs.items(): if isinstance(v, np.ndarray): - kwargs[k] = self.np_to_pt(v) + kwargs[k] = self.np_to_pt(v, device) elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): - kwargs[k] = [self.np_to_pt(i) for i in v] + kwargs[k] = [self.np_to_pt(i, device) for i in v] elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): - kwargs[k] = {k: self.np_to_pt(v) for k, v in v.items()} + kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} callback = kwargs.pop("callback", None) callback_steps = kwargs.pop("callback_steps", None) @@ -384,8 +384,8 @@ def __call__( )[0] # compute the previous noisy sample x_t -> x_t-1 - latents, denoised = self.scheduler.step(model_pred, t, latents, **extra_step_kwargs, return_dict=False) + if callback_on_step_end is not None: callback_kwargs = {} for k in callback_on_step_end_tensor_inputs: From 5023cac7be2ea42bfa2ec4acd2e576a2598c1ee3 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 10 Sep 2024 16:00:36 +0200 Subject: [PATCH 24/71] update model parts to use frozen dict --- optimum/onnxruntime/modeling_diffusion.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 18cd38c5f2..131625da78 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -39,6 +39,8 @@ StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline, ) +from diffusers.configuration_utils import FrozenDict +from diffusers.image_processor import VaeImageProcessor from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available 
from huggingface_hub import snapshot_download @@ -58,7 +60,6 @@ from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin from ..pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin from ..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin -from ..pipelines.diffusers.pipeline_utils import VaeImageProcessor from ..utils import ( DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, @@ -503,10 +504,9 @@ def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): config_path = Path(session._model_path).parent / self.CONFIG_NAME if config_path.is_file(): - # TODO: use FrozenDict - self.config = parent_model._dict_from_json_file(config_path) + self.config = FrozenDict(parent_model._dict_from_json_file(config_path)) else: - self.config = {} + self.config = FrozenDict({}) super().__init__(session, parent_model) From 2cd616e68d224b1731502b89a1d5bad13f82fd50 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 10 Sep 2024 16:16:38 +0200 Subject: [PATCH 25/71] update tests and utils --- optimum/pipelines/diffusers/pipeline_utils.py | 11 - tests/onnxruntime/test_diffusion.py | 268 +++++++++--------- 2 files changed, 135 insertions(+), 144 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py index 71ae650ed1..19521df5a1 100644 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ b/optimum/pipelines/diffusers/pipeline_utils.py @@ -159,14 +159,3 @@ def retrieve_latents( return encoder_output.latents else: raise AttributeError("Could not access latents of provided encoder_output") - -from contextlib import contextmanager - -@contextmanager -def patch_randn_tensor(): - import diffusers.utils.torch_utils - - old_randn_tensor = diffusers.utils.torch_utils.randn_tensor - diffusers.utils.torch_utils.randn_tensor = randn_tensor - yield - diffusers.utils.torch_utils.randn_tensor = old_randn_tensor diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 6e1c521d4f..d17fbaa0d8 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -165,45 +165,6 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ) self.assertEqual(ort_pipeline.device, diffusers_pipeline.device) - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: 
str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 64, 32, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): @@ -261,9 +222,6 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - # if model_arch in ["latent-consistency"]: - # pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -273,6 +231,9 @@ def test_image_reproducibility(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for generator_framework in ["np", "pt"]: + if model_arch in ["latent-consistency"] and generator_framework == "np": + pytest.skip("Latent Consistency Model (LCM) scheduler doesn't support numpy generator") + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) @@ -285,9 +246,6 @@ def test_image_reproducibility(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): - # if model_arch in ["latent-consistency"]: - # pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -297,7 +255,9 @@ def test_negative_prompt(self, model_arch: str): negative_prompt = ["This is a negative prompt"] pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) image_slice_1 = pipeline( - **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED) + **inputs, + negative_prompt=negative_prompt, + generator=torch.Generator().manual_seed(SEED), ).images[0, -3:, -3:, -1] prompt = inputs.pop("prompt") @@ -326,10 +286,52 @@ def test_negative_prompt(self, model_arch: str): inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] - image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] + image_slice_2 = pipeline( + **inputs, + generator=torch.Generator().manual_seed(SEED), + ).images[0, -3:, -3:, -1] self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + ) + @require_torch_gpu + @pytest.mark.cuda_ep_test + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], 
provider=provider) + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + @require_diffusers + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 64, 32, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + class ORTPipelineForImage2ImageTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] @@ -393,49 +395,6 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images).images self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): @@ -532,6 +491,49 @@ def test_image_reproducibility(self, model_arch: str): 
self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + ) + @require_torch_gpu + @pytest.mark.cuda_ep_test + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + @require_diffusers + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + class ORTPipelineForInpaintingTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = ["stable-diffusion"] @@ -602,49 +604,6 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images).images self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = 
self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): @@ -749,3 +708,46 @@ def test_image_reproducibility(self, model_arch: str): self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + ) + @require_torch_gpu + @pytest.mark.cuda_ep_test + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + @require_diffusers + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) From dceccca57f2dd645de0a1389283652c86fc8f753 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 11 Sep 2024 11:28:56 +0200 Subject: [PATCH 26/71] updated all mixins, enabled all tests ; all are passing except some reproducibility and comparaison tests (7 failed, 35 passed) --- optimum/onnxruntime/modeling_diffusion.py | 20 +- .../diffusers/pipeline_latent_consistency.py | 18 +- .../diffusers/pipeline_stable_diffusion.py | 98 +- .../pipeline_stable_diffusion_img2img.py | 417 +------ .../pipeline_stable_diffusion_inpaint.py | 849 +++++++++---- .../diffusers/pipeline_stable_diffusion_xl.py | 924 ++++++++++---- .../pipeline_stable_diffusion_xl_img2img.py | 1061 ++++++++++++----- optimum/pipelines/diffusers/pipeline_utils.py | 127 +- tests/onnxruntime/test_diffusion.py | 98 +- 9 files changed, 2263 insertions(+), 1349 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 131625da78..ff08a72f3e 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ 
b/optimum/onnxruntime/modeling_diffusion.py @@ -187,12 +187,11 @@ def __init__( ) self._internal_dict.pop("vae", None) - if "block_out_channels" in self.vae_decoder.config: - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) - else: - self.vae_scale_factor = 8 - + self.vae_scale_factor = 2 ** (len(self.vae_decoder.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) @staticmethod def load_model( @@ -526,6 +525,11 @@ def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): onnx_outputs = self.session.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + if any("hidden_states" in model_output for model_output in model_outputs): + model_outputs["hidden_states"] = [] + for i in range(self.config.num_hidden_layers): + model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) + return ModelOutput(**model_outputs) @@ -567,6 +571,9 @@ def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): onnx_outputs = self.session.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + return ModelOutput(**model_outputs) @@ -580,6 +587,9 @@ def forward(self, sample: Union[np.ndarray, torch.Tensor]): onnx_outputs = self.session.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + return ModelOutput(**model_outputs) diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py index d4a9cc03d4..505a8890ff 100644 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py @@ -23,7 +23,6 @@ from diffusers.utils.deprecation_utils import deprecate from .pipeline_stable_diffusion import StableDiffusionPipelineMixin -from .pipeline_utils import patch_randn_tensor logger = logging.getLogger(__name__) @@ -239,11 +238,19 @@ def __call__( second element is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content. 
""" + # must have a compatible torch device device = self.device # convert numpy arrays to torch tensors - prompt_embeds = self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents + prompt_embeds = ( + self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + ) + ip_adapter_image_embeds = ( + [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] + if ip_adapter_image_embeds is not None + else ip_adapter_image_embeds + ) for k, v in kwargs.items(): if isinstance(v, np.ndarray): @@ -253,6 +260,13 @@ def __call__( elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} + generator = ( + self.np_to_pt(generator, device) + if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) + or isinstance(generator, np.random.RandomState) + else generator + ) + callback = kwargs.pop("callback", None) callback_steps = kwargs.pop("callback_steps", None) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 9eddbd1462..464c4f343c 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -22,8 +22,10 @@ from diffusers.image_processor import PipelineImageInput from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg, retrieve_timesteps +from diffusers.utils.deprecation_utils import deprecate +from diffusers.utils.torch_utils import randn_tensor -from .pipeline_utils import DiffusionPipelineMixin, randn_tensor +from .pipeline_utils import DiffusionPipelineMixin logger = logging.getLogger(__name__) @@ -34,11 +36,11 @@ class StableDiffusionPipelineMixin(DiffusionPipelineMixin): def encode_prompt( self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, + prompt: str, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, prompt_embeds: Optional[torch.Tensor] = None, negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, @@ -74,6 +76,8 @@ def encode_prompt( the output of the pre-final layer will be used for computing the prompt embeddings. """ + device = device or self.device + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -234,7 +238,10 @@ def run_safety_checker(self, image, device, dtype): def decode_latents(self, latents): latents = 1 / self.vae_decoder.config.scaling_factor * latents - image = self.vae_decoder(latents, return_dict=False)[0] + image = self.vae_decoder( + latents, + # return_dict=False, + )[0] image = (image / 2 + 0.5).clamp(0, 1) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 image = image.cpu().permute(0, 2, 3, 1).float().numpy() @@ -461,8 +468,10 @@ def __call__( second element is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content. 
""" + # must have a compatible torch device device = self.device + # convert numpy arrays to torch tensors latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents prompt_embeds = ( self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds @@ -472,17 +481,41 @@ def __call__( if isinstance(negative_prompt_embeds, np.ndarray) else negative_prompt_embeds ) + ip_adapter_image_embeds = ( + [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] + if ip_adapter_image_embeds is not None + else ip_adapter_image_embeds + ) + + for k, v in kwargs.items(): + if isinstance(v, np.ndarray): + kwargs[k] = self.np_to_pt(v, device) + elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): + kwargs[k] = [self.np_to_pt(i, device) for i in v] + elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): + kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} + + generator = ( + self.np_to_pt(generator, device) + if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) + or isinstance(generator, np.random.RandomState) + else generator + ) callback = kwargs.pop("callback", None) callback_steps = kwargs.pop("callback_steps", None) if callback is not None: - logger.warning( - "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) if callback_steps is not None: - logger.warning( - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): @@ -665,3 +698,46 @@ def __call__( return (image, has_nsfw_concept) return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding( + self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 + ) -> torch.Tensor: + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + w (`torch.Tensor`): + Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings. + embedding_dim (`int`, *optional*, defaults to 512): + Dimension of the embeddings to generate. 
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): + Data type of the generated embeddings. + + Returns: + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index e80e70115c..e7421be244 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import inspect import logging from typing import Any, Callable, Dict, List, Optional, Union @@ -21,361 +20,21 @@ import torch from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback from diffusers.image_processor import PipelineImageInput -from diffusers.models import ImageProjection from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_timesteps +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents, retrieve_timesteps from diffusers.utils.deprecation_utils import deprecate +from diffusers.utils.torch_utils import randn_tensor -from .pipeline_utils import DiffusionPipelineMixin, randn_tensor, retrieve_latents +from .pipeline_stable_diffusion import StableDiffusionPipelineMixin logger = logging.getLogger(__name__) -class StableDiffusionImg2ImgPipelineMixin(DiffusionPipelineMixin): +class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin): _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - lora_scale: Optional[float] = None, - **kwargs, - ): - deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
- deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) - - prompt_embeds_tuple = self.encode_prompt( - prompt=prompt, - device=device, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, - **kwargs, - ) - - # concatenate for backwards comp - prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) - - return prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt - def encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - lora_scale: Optional[float] = None, - clip_skip: Optional[int] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. 
- """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - # if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): - # self._lora_scale = lora_scale - - # # dynamically adjust the LoRA scale - # if not USE_PEFT_BACKEND: - # adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - # else: - # scale_lora_layers(self.text_encoder, lora_scale) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: process multi-vector tokens if necessary - # if isinstance(self, TextualInversionLoaderMixin): - # prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - # if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - # attention_mask = text_inputs.attention_mask.to(device) - # else: - # attention_mask = None - - if clip_skip is None: - prompt_embeds = self.text_encoder( - text_input_ids.to(device), - # attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - else: - prompt_embeds = self.text_encoder( - text_input_ids.to(device), - # attention_mask=attention_mask, - # output_hidden_states=True, - ) - # Access the `hidden_states` first, that contains a tuple of - # all the hidden states from the encoder layers. Then index into - # the tuple to access the hidden states from the desired layer. - prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] - # We also need to apply the final LayerNorm here to not mess with the - # representations. The `last_hidden_states` that we typically use for - # obtaining the final prompt representations passes through the LayerNorm - # layer. 
- prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) - - if self.text_encoder is not None: - prompt_embeds_dtype = self.text_encoder.dtype - elif self.unet is not None: - prompt_embeds_dtype = self.unet.dtype - else: - prompt_embeds_dtype = prompt_embeds.dtype - - prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: process multi-vector tokens if necessary - # if isinstance(self, TextualInversionLoaderMixin): - # uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - # if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - # attention_mask = uncond_input.attention_mask.to(device) - # else: - # attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), - # attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - # if self.text_encoder is not None: - # if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: - # # Retrieve the original scale by scaling back the LoRA layers - # unscale_lora_layers(self.text_encoder, lora_scale) - - return prompt_embeds, negative_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image - def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): - dtype = next(self.image_encoder.parameters()).dtype - - if not isinstance(image, torch.Tensor): - image = self.feature_extractor(image, return_tensors="pt").pixel_values - - image = image.to(device=device, dtype=dtype) - if output_hidden_states: - image_enc_hidden_states = self.image_encoder(image, 
output_hidden_states=True).hidden_states[-2] - image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) - uncond_image_enc_hidden_states = self.image_encoder( - torch.zeros_like(image), output_hidden_states=True - ).hidden_states[-2] - uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( - num_images_per_prompt, dim=0 - ) - return image_enc_hidden_states, uncond_image_enc_hidden_states - else: - image_embeds = self.image_encoder(image).image_embeds - image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - uncond_image_embeds = torch.zeros_like(image_embeds) - - return image_embeds, uncond_image_embeds - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds - def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance - ): - image_embeds = [] - if do_classifier_free_guidance: - negative_image_embeds = [] - if ip_adapter_image_embeds is None: - if not isinstance(ip_adapter_image, list): - ip_adapter_image = [ip_adapter_image] - - if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): - raise ValueError( - f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." - ) - - for single_ip_adapter_image, image_proj_layer in zip( - ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers - ): - output_hidden_state = not isinstance(image_proj_layer, ImageProjection) - single_image_embeds, single_negative_image_embeds = self.encode_image( - single_ip_adapter_image, device, 1, output_hidden_state - ) - - image_embeds.append(single_image_embeds[None, :]) - if do_classifier_free_guidance: - negative_image_embeds.append(single_negative_image_embeds[None, :]) - else: - for single_image_embeds in ip_adapter_image_embeds: - if do_classifier_free_guidance: - single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - negative_image_embeds.append(single_negative_image_embeds) - image_embeds.append(single_image_embeds) - - ip_adapter_image_embeds = [] - for i, single_image_embeds in enumerate(image_embeds): - single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0) - if do_classifier_free_guidance: - single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0) - single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0) - - single_image_embeds = single_image_embeds.to(device=device) - ip_adapter_image_embeds.append(single_image_embeds) - - return ip_adapter_image_embeds - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - if self.safety_checker is None: - has_nsfw_concept = None - else: - if torch.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - return image, has_nsfw_concept - - # 
Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" - deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) - - latents = 1 / self.vae_decoder.config.scaling_factor * latents - image = self.vae_decoder( - latents, - # return_dict=False, - )[0] - image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - return image - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - def check_inputs( self, prompt, @@ -444,17 +103,6 @@ def check_inputs( f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) - def get_timesteps(self, num_inference_steps, strength, device): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - if hasattr(self.scheduler, "set_begin_index"): - self.scheduler.set_begin_index(t_start * self.scheduler.order) - - return timesteps, num_inference_steps - t_start - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): raise ValueError( @@ -520,37 +168,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding - def get_guidance_scale_embedding( - self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.Tensor: - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - w (`torch.Tensor`): - Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings. - embedding_dim (`int`, *optional*, defaults to 512): - Dimension of the embeddings to generate. - dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): - Data type of the generated embeddings. - - Returns: - `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
- """ - assert len(w.shape) == 1 - w = w * 1000.0 - - half_dim = embedding_dim // 2 - emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) - emb = w.to(dtype)[:, None] * emb[None, :] - emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) - if embedding_dim % 2 == 1: # zero pad - emb = torch.nn.functional.pad(emb, (0, 1)) - assert emb.shape == (w.shape[0], embedding_dim) - return emb - @property def guidance_scale(self): return self._guidance_scale @@ -690,16 +307,38 @@ def __call__( second element is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content. """ + # must have a compatible torch device device = self.device + # convert numpy arrays to torch tensors prompt_embeds = ( - self.np_to_pt(prompt_embeds, device=device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds ) negative_prompt_embeds = ( - self.np_to_pt(negative_prompt_embeds, device=device) + self.np_to_pt(negative_prompt_embeds, device) if isinstance(negative_prompt_embeds, np.ndarray) else negative_prompt_embeds ) + ip_adapter_image_embeds = ( + [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] + if ip_adapter_image_embeds is not None + else ip_adapter_image_embeds + ) + + for k, v in kwargs.items(): + if isinstance(v, np.ndarray): + kwargs[k] = self.np_to_pt(v, device) + elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): + kwargs[k] = [self.np_to_pt(i, device) for i in v] + elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): + kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} + + generator = ( + self.np_to_pt(generator, device) + if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) + or isinstance(generator, np.random.RandomState) + else generator + ) callback = kwargs.pop("callback", None) callback_steps = kwargs.pop("callback_steps", None) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py index cb3c7db96e..2791b37b32 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py @@ -12,63 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import inspect -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL.Image import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import PIL_INTERPOLATION +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents, retrieve_timesteps +from diffusers.utils.deprecation_utils import deprecate +from diffusers.utils.torch_utils import randn_tensor from .pipeline_stable_diffusion import StableDiffusionPipelineMixin -def prepare_mask_and_masked_image(image, mask, latents_shape, vae_scale_factor): - image = np.array( - image.convert("RGB").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor)) - ) - image = image[None].transpose(0, 3, 1, 2) - image = image.astype(np.float32) / 127.5 - 1.0 - - image_mask = np.array( - mask.convert("L").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor)) - ) - masked_image = image * (image_mask < 127.5) - - mask = mask.resize((latents_shape[1], latents_shape[0]), PIL_INTERPOLATION["nearest"]) - mask = np.array(mask.convert("L")) - mask = mask.astype(np.float32) / 255.0 - mask = mask[None, None] - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - - return mask, masked_image - - class StableDiffusionInpaintPipelineMixin(StableDiffusionPipelineMixin): - # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline.check_inputs def check_inputs( self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, + prompt, + image, + mask_image, + height, + width, + strength, + callback_steps, + output_type, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + padding_mask_crop=None, ): - if height % 8 != 0 or width % 8 != 0: + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}." 
) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" @@ -94,104 +90,435 @@ def check_inputs( f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" f" {negative_prompt_embeds.shape}." ) + if padding_mask_crop is not None: + if not isinstance(image, PIL.Image.Image): + raise ValueError( + f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}." + ) + if not isinstance(mask_image, PIL.Image.Image): + raise ValueError( + f"The mask image should be a PIL image when inpainting mask crop, but is of type" + f" {type(mask_image)}." + ) + if output_type != "pil": + raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + ): + shape = ( + batch_size, + num_channels_latents, + int(height) // self.vae_scale_factor, + int(width) // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + + if image.shape[1] == 4: + image_latents = image + else: + image_latents = self._encode_vae_image(image=image, generator=generator) + image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) + + if latents is None: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. 
then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + else: + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs + + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae_encoder(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae_encoder(image), generator=generator) + + image_latents = self.vae_encoder.config.scaling_factor * image_latents + + return image_latents + + def prepare_mask_latents( + self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = torch.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.to(device=device, dtype=dtype) + + masked_image = masked_image.to(device=device, dtype=dtype) + + if masked_image.shape[1] == 4: + masked_image_latents = masked_image + else: + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1) + + mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) + return mask, masked_image_latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt @torch.no_grad() def __call__( self, - prompt: Union[str, List[str]], - image: PIL.Image.Image, - mask_image: PIL.Image.Image, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + masked_image_latents: "torch.Tensor" = None, height: Optional[int] = None, width: Optional[int] = None, + padding_mask_crop: Optional[int] = None, + strength: float = 1.0, num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, + num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional["torch.Tensor"] = None, + prompt_embeds: Optional["torch.Tensor"] = None, + negative_prompt_embeds: Optional["torch.Tensor"] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List["torch.Tensor"]] = None, + output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" - Function invoked when calling the pipeline for generation. + The call function to the pipeline for generation. Args: - prompt (`Union[str, List[str]]`): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch which will be upscaled. - mask_image (`PIL.Image.Image`): - `Image`, or tensor representing a masked image batch which will be upscaled. 
-            height (`Optional[int]`, defaults to None):
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to
+                be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch
+                tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list of tensors, the
+                expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the
+                expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image latents as `image`, but
+                if passing latents directly it is not encoded again.
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
+                H, W)`, `(1, H, W)`, `(H, W)`. And for a numpy array it would be `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
+                1)`, or `(H, W)`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The height in pixels of the generated image.
-            width (`Optional[int]`, defaults to None):
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The width in pixels of the generated image.
-            num_inference_steps (`int`, defaults to 50):
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of the margin in the crop applied to the image and mask. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ratio as the image that contains all of the masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contains information irrelevant for inpainting, such as the background.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates the extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2.
of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): + expense of slower inference. This parameter is modulated by `strength`. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, defaults to `True`):
+            tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list whose length equals the number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                 plain tuple.
-            callback (Optional[Callable], defaults to `None`):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
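[Editor's note, not part of the diff: a minimal sketch of how the new `callback_on_step_end` hook is meant to be used, based on the denoising loop later in this patch. The pipeline instance `pipe` and its inputs are assumed; the callback receives the tensors named in `callback_on_step_end_tensor_inputs` and must return a dict of any tensors it wants the pipeline to pick back up.]

```py
def log_and_keep_latents(pipeline, step, timestep, callback_kwargs):
    # "latents" is available here because it is listed in callback_on_step_end_tensor_inputs.
    latents = callback_kwargs["latents"]
    print(f"step {step}: latents shape {tuple(latents.shape)}")
    # Returning the (possibly modified) tensors lets the pipeline continue with them.
    return {"latents": latents}

# result = pipe(
#     prompt="a red bench in a park",
#     image=init_image,
#     mask_image=mask_image,
#     callback_on_step_end=log_and_keep_latents,
#     callback_on_step_end_tensor_inputs=["latents"],
# )
```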
+ Examples: + + ```py + >>> import PIL + >>> import requests + >>> import torch + >>> from io import BytesIO + + >>> from diffusers import StableDiffusionInpaintPipeline + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" + >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + + >>> init_image = download_image(img_url).resize((512, 512)) + >>> mask_image = download_image(mask_url).resize((512, 512)) + + >>> pipe = StableDiffusionInpaintPipeline.from_pretrained( + ... "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench" + >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] + ``` Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. """ - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor + # must have a compatible torch device + device = self.device - # check inputs. 
Raise error if not correct + # convert numpy arrays to torch tensors + latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents + prompt_embeds = ( + self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + ) + negative_prompt_embeds = ( + self.np_to_pt(negative_prompt_embeds, device) + if isinstance(negative_prompt_embeds, np.ndarray) + else negative_prompt_embeds + ) + ip_adapter_image_embeds = ( + [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] + if ip_adapter_image_embeds is not None + else ip_adapter_image_embeds + ) + + for k, v in kwargs.items(): + if isinstance(v, np.ndarray): + kwargs[k] = self.np_to_pt(v, device) + elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): + kwargs[k] = [self.np_to_pt(i, device) for i in v] + elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): + kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} + + generator = ( + self.np_to_pt(generator, device) + if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) + or isinstance(generator, np.random.RandomState) + else generator + ) + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, + image, + mask_image, + height, + width, + strength, + callback_steps, + output_type, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + padding_mask_crop, ) - # define call parameters + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -199,146 +526,242 @@ def __call__( else: batch_size = prompt_embeds.shape[0] - if generator is None: - generator = np.random.RandomState() - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( + # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, + device, num_images_per_prompt, - do_classifier_free_guidance, + self.do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) - num_channels_latents = self.vae_decoder.config.get("latent_channels", 4) - num_channels_unet = self.unet.config.get("in_channels", 9) - latents_shape = ( - batch_size * num_images_per_prompt, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, + # 4. set timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps=num_inference_steps, strength=strength, device=device ) - latents_dtype = prompt_embeds.dtype + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." + ) + # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise + is_strength_max = strength == 1.0 - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*latents_shape).astype(latents_dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." - ) - elif latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + # 5. 
Preprocess mask and image + + if padding_mask_crop is not None: + crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop) + resize_mode = "fill" + else: + crops_coords = None + resize_mode = "default" - # prepare mask and masked_image - mask, masked_image = prepare_mask_and_masked_image( - image, mask_image, latents_shape[-2:], self.vae_scale_factor + original_image = image + init_image = self.image_processor.preprocess( + image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode ) - mask = mask.astype(latents.dtype) - masked_image = masked_image.astype(latents.dtype) + init_image = init_image.to(dtype=torch.float32) - masked_image_latents = self.vae_encoder(sample=masked_image)[0] + # 6. Prepare latent variables + num_channels_latents = self.vae_decoder.config.latent_channels + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - masked_image_latents = scaling_factor * masked_image_latents + latents_outputs = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + image=init_image, + timestep=latent_timestep, + is_strength_max=is_strength_max, + return_noise=True, + return_image_latents=return_image_latents, + ) - # duplicate mask and masked_image_latents for each generation per prompt - mask = mask.repeat(batch_size * num_images_per_prompt, 0) - masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 0) + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs - mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = ( - np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + # 7. Prepare mask latent variables + mask_condition = self.mask_processor.preprocess( + mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords + ) + + if masked_image_latents is None: + masked_image = init_image * (mask_condition < 0.5) + else: + masked_image = masked_image_latents + + mask, masked_image_latents = self.prepare_mask_latents( + mask_condition, + masked_image, + batch_size * num_images_per_prompt, + height, + width, + prompt_embeds.dtype, + device, + generator, + self.do_classifier_free_guidance, ) - # check that sizes of mask, masked image and latents match + # 8. Check that sizes of mask, masked image and latents match if num_channels_unet == 9: # default case for runwayml/stable-diffusion-inpainting num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet: + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: expects" - f" {num_channels_unet} but received `num_channels_latents`: {num_channels_latents} +" + f"Incorrect configuration settings! 
The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" " `pipeline.unet` or your `mask_image` or `image` input." ) elif num_channels_unet != 4: raise ValueError( - f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {num_channels_unet}." + f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}." ) - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) + # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta + # 9.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + # 9.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + # 10. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - # concat latents, mask, masked_image_latnets in the channel dimension - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - if num_channels_unet == 9: - latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1) - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ - 0 - ] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if num_channels_unet == 9: + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + timestep=t.unsqueeze(0), + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), + # cross_attention_kwargs=cross_attention_kwargs, + # added_cond_kwargs=added_cond_kwargs, + # return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + if num_channels_unet == 4: + init_latents_proper = image_latents + if self.do_classifier_free_guidance: + init_mask, _ = mask.chunk(2) + else: + init_mask = mask + + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = (1 - init_mask) * init_latents_proper + init_mask * latents + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = 
callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + mask = callback_outputs.pop("mask", mask) + masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + condition_kwargs = {} + # if isinstance(self.vae, AsymmetricAutoencoderKL): + # init_image = init_image.to(device=device, dtype=masked_image_latents.dtype) + # init_image_condition = init_image.clone() + # init_image = self._encode_vae_image(init_image, generator=generator) + # mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype) + # condition_kwargs = {"image": init_image_condition, "mask": mask_condition} + image = self.vae_decoder( + latents / self.vae_decoder.config.scaling_factor, + # return_dict=False, + # generator=generator, + **condition_kwargs, + )[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: image = latents has_nsfw_concept = None - else: - latents /= scaling_factor - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) if has_nsfw_concept is None: do_denormalize = [True] * image.shape[0] @@ -347,6 +770,12 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + if padding_mask_crop is not None: + image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image] + + # Offload all models + # self.maybe_free_model_hooks() + if not return_dict: return (image, has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 0407c16a77..4dfd0dd1b5 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -12,64 +12,123 @@ # See the License for the specific language governing permissions and # limitations under the License. 
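[Editor's note, not part of the diff: before the SDXL mixin changes below, a hedged end-to-end sketch of the inpaint pipeline after this refactor. The checkpoint name follows the docstring example above, and `export=True` is optimum's usual on-the-fly ONNX export path; both are assumptions for illustration, not code introduced by this patch.]

```py
import PIL.Image
import torch

from optimum.onnxruntime import ORTStableDiffusionInpaintPipeline

# Assumed export path: convert the PyTorch checkpoint to ONNX on load.
pipe = ORTStableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", export=True
)

# Placeholder inputs; in practice use a real photo and a hand-drawn mask.
init_image = PIL.Image.new("RGB", (512, 512), "gray")
mask_image = PIL.Image.new("L", (512, 512), 255)  # white pixels are repainted

# torch.Generator is accepted directly; numpy RandomState generators are
# converted internally via np_to_pt in the patched __call__.
generator = torch.Generator().manual_seed(0)

image = pipe(
    prompt="Face of a yellow cat, high resolution, sitting on a park bench",
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=30,
    strength=0.99,
    generator=generator,
).images[0]
```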
-import inspect import logging from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput +from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import rescale_noise_cfg, retrieve_timesteps +from diffusers.utils.deprecation_utils import deprecate -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg +from .pipeline_stable_diffusion import StableDiffusionPipelineMixin logger = logging.getLogger(__name__) -class StableDiffusionXLPipelineMixin(DiffusionPipelineMixin): - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt - def _encode_prompt( +class StableDiffusionXLPipelineMixin(StableDiffusionPipelineMixin): + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "negative_add_time_ids", + ] + + def encode_prompt( self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, ): r""" Encodes the prompt into text encoder hidden states. Args: - prompt (`Union[str, List[str]]`): + prompt (`str` or `List[str]`, *optional*): prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device num_images_per_prompt (`int`): number of images that should be generated per prompt do_classifier_free_guidance (`bool`): whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). 
+ negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
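[Editor's note, not part of the diff: a hedged illustration of the mechanics documented above. SDXL's `encode_prompt` concatenates the two text encoders' penultimate hidden states along the last dimension and keeps the pooled output of the second encoder; the widths below are the usual CLIP ViT-L (768) and OpenCLIP ViT-bigG (1280) sizes and are assumptions, not values read from this patch.]

```py
import torch

# Assumed hidden sizes for the two SDXL text encoders.
hidden_l = torch.randn(1, 77, 768)    # penultimate hidden state of text_encoder
hidden_g = torch.randn(1, 77, 1280)   # penultimate hidden state of text_encoder_2
pooled_g = torch.randn(1, 1280)       # pooled output of text_encoder_2

prompt_embeds = torch.concat([hidden_l, hidden_g], dim=-1)
print(prompt_embeds.shape)  # torch.Size([1, 77, 2048]) -> fed to the UNet
print(pooled_g.shape)       # torch.Size([1, 1280])     -> becomes add_text_embeds
```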
""" - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): + device = device or self.device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + # if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + # self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + # if self.text_encoder is not None: + # if not USE_PEFT_BACKEND: + # adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + # else: + # scale_lora_layers(self.text_encoder, lora_scale) + + # if self.text_encoder_2 is not None: + # if not USE_PEFT_BACKEND: + # adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + # else: + # scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] @@ -81,20 +140,28 @@ def _encode_prompt( ) if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - # get prompt text embeddings + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + # if isinstance(self, TextualInversionLoaderMixin): + # prompt = self.maybe_convert_prompt(prompt, tokenizer) + text_inputs = tokenizer( prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, - return_tensors="np", + return_tensors="pt", ) + text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( text_input_ids, untruncated_ids ): removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) @@ -104,29 +171,43 @@ def _encode_prompt( ) prompt_embeds = text_encoder( - input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) + text_input_ids.to(device), + # output_hidden_states=True, ) + + # We are only ALWAYS interested in the pooled output of the final text encoder pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-2] - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + prompt_embeds_list.append(prompt_embeds) - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) # get unconditional embeddings for classifier free guidance zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) elif do_classifier_free_guidance and negative_prompt_embeds is None: negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] if prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}." ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" @@ -134,78 +215,138 @@ def _encode_prompt( " the batch size of `prompt`." ) else: - uncond_tokens = negative_prompt + uncond_tokens = [negative_prompt, negative_prompt_2] negative_prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + # if isinstance(self, TextualInversionLoaderMixin): + # negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + max_length = prompt_embeds.shape[1] uncond_input = tokenizer( - uncond_tokens, + negative_prompt, padding="max_length", max_length=max_length, truncation=True, - return_tensors="np", + return_tensors="pt", ) + negative_prompt_embeds = text_encoder( - input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) + uncond_input.input_ids.to(device), + # output_hidden_states=True, ) + # We are only ALWAYS interested in the pooled output of the final text encoder negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds[-2] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) - negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + # if self.text_encoder is not None: + # if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # # Retrieve the original scale by scaling back the LoRA layers + # unscale_lora_layers(self.text_encoder, lora_scale) + + # if self.text_encoder_2 is not None: + # if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # # Retrieve the original scale by scaling back the LoRA layers + # unscale_lora_layers(self.text_encoder_2, lora_scale) return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs def check_inputs( self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if (callback_steps is None) or ( - callback_steps 
is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}." ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" " only forward one of the two." ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" f" {negative_prompt_embeds}. Please make sure to only forward one of the two." ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: @@ -225,146 +366,343 @@ def check_inputs( "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." ) - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
) - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*shape).astype(dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) - else: + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) - elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - return latents - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - extra_step_kwargs = {} - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_eta: - extra_step_kwargs["eta"] = eta - - return extra_step_kwargs - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + # passed_add_embed_dim = ( + # self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + # ) + # expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + # if expected_add_embed_dim != passed_add_embed_dim: + # raise ValueError( + # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
+ # ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # def upcast_vae(self): + # dtype = self.vae.dtype + # self.vae.to(dtype=torch.float32) + # use_torch_2_0_or_xformers = isinstance( + # self.vae.decoder.mid_block.attentions[0].processor, + # ( + # AttnProcessor2_0, + # XFormersAttnProcessor, + # FusedAttnProcessor2_0, + # ), + # ) + # # if xformers or torch_2_0 is used attention block does not need + # # to be in float32 which can save lots of memory + # if use_torch_2_0_or_xformers: + # self.vae.post_quant_conv.to(dtype) + # self.vae.decoder.conv_in.to(dtype) + # self.vae.decoder.mid_block.to(dtype) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() def __call__( self, - prompt: Optional[Union[str, List[str]]] = None, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, original_size: Optional[Tuple[int, int]] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] 
= None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" Function invoked when calling the pipeline for generation. Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): + prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - guidance_scale (`float`, defaults to 5): + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise as determined by the discrete timesteps selected by the + scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a + "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + guidance_scale (`float`, *optional*, defaults to 5.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): + negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - eta (`float`, defaults to 0.0): + eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - output_type (`str`, defaults to `"pil"`): + pooled_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.7): + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be as same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: Returns: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. 
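+
+        Example (a minimal usage sketch; "a-user/sdxl-base-onnx" is a placeholder model id for an
+        already exported SDXL ONNX checkpoint):
+
+            from optimum.onnxruntime import ORTStableDiffusionXLPipeline
+
+            pipeline = ORTStableDiffusionXLPipeline.from_pretrained("a-user/sdxl-base-onnx")
+            image = pipeline("sailing ship in a storm by Leonardo da Vinci", num_inference_steps=30).images[0]
+            image.save("ship.png")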
""" + # must have a compatible torch device + device = self.device + + # convert numpy arrays to torch tensors + latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents + prompt_embeds = ( + self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + ) + negative_prompt_embeds = ( + self.np_to_pt(negative_prompt_embeds, device) + if isinstance(negative_prompt_embeds, np.ndarray) + else negative_prompt_embeds + ) + pooled_prompt_embeds = ( + self.np_to_pt(pooled_prompt_embeds, device) + if isinstance(pooled_prompt_embeds, np.ndarray) + else pooled_prompt_embeds + ) + negative_pooled_prompt_embeds = ( + self.np_to_pt(negative_pooled_prompt_embeds, device) + if isinstance(negative_pooled_prompt_embeds, np.ndarray) + else negative_pooled_prompt_embeds + ) + ip_adapter_image_embeds = ( + [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] + if ip_adapter_image_embeds is not None + else ip_adapter_image_embeds + ) + + for k, v in kwargs.items(): + if isinstance(v, np.ndarray): + kwargs[k] = self.np_to_pt(v, device) + elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): + kwargs[k] = [self.np_to_pt(i, device) for i in v] + elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): + kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} + + generator = ( + self.np_to_pt(generator, device) + if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) + or isinstance(generator, np.random.RandomState) + else generator + ) + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs # 0. Default height and width to unet - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor original_size = original_size or (height, width) target_size = target_size or (height, width) @@ -372,134 +710,278 @@ def __call__( # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, + prompt_2, height, width, callback_steps, negative_prompt, + negative_prompt_2, prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, ) + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._interrupt = False + # 2. 
Define call parameters - if isinstance(prompt, str): + if prompt is not None and isinstance(prompt, str): batch_size = 1 - elif isinstance(prompt, list): + elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + ( prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, - ) = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds, negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, ) # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, - self.unet.config.get("in_channels", 4), + num_channels_latents, height, width, prompt_embeds.dtype, + device, generator, latents, ) - # 6. Prepare extra step kwargs + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Prepare added time ids & embeddings add_text_embeds = pooled_prompt_embeds - add_time_ids = (original_size + crops_coords_top_left + target_size,) - add_time_ids = np.array(add_time_ids, dtype=prompt_embeds.dtype) - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) - add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) # 8. 
Denoising loop - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latent_model_input, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - text_embeds=add_text_embeds, - time_ids=add_time_ids, + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + # 8.1 Apply denoising_end + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) ) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 9. Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( + latent_model_input, + timestep=t.unsqueeze(0), + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), + # cross_attention_kwargs=cross_attention_kwargs, + # added_cond_kwargs=added_cond_kwargs, + # return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # if XLA_AVAILABLE: + # xm.mark_step() + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + # needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + # if needs_upcasting: + # self.upcast_vae() + # latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + # elif latents.dtype != self.vae.dtype: + # if torch.backends.mps.is_available(): + # # some platforms (eg. 
apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + # self.vae = self.vae.to(latents.dtype) + + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = ( + hasattr(self.vae_decoder.config, "latents_mean") and self.vae_decoder.config.latents_mean is not None ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + has_latents_std = ( + hasattr(self.vae_decoder.config, "latents_std") and self.vae_decoder.config.latents_std is not None ) + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae_decoder.config.latents_mean) + .view(1, 4, 1, 1) + .to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae_decoder.config.latents_std) + .view(1, 4, 1, 1) + .to(latents.device, latents.dtype) + ) + latents = latents * latents_std / self.vae_decoder.config.scaling_factor + latents_mean + else: + latents = latents / self.vae_decoder.config.scaling_factor + + image = self.vae_decoder( + latents, + # return_dict=False, + )[0] + # cast back to fp16 if needed + # if needs_upcasting: + # self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": # apply watermark if available if self.watermark is not None: image = self.watermark.apply_watermark(image) + image = self.image_processor.postprocess(image, output_type=output_type) + # Offload all models + # self.maybe_free_model_hooks() + if not return_dict: return (image,) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py index 19988599b6..66b7c79320 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py @@ -12,198 +12,114 @@ # See the License for the specific language governing permissions and # limitations under the License. 
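+# This mixin mirrors diffusers' StableDiffusionXLImg2ImgPipeline on top of the ONNX Runtime model
+# wrappers: inputs are handled as torch tensors rather than numpy arrays, scheduling helpers
+# (retrieve_timesteps, retrieve_latents, randn_tensor) are reused from diffusers, and the call
+# signature gains prompt_2/negative_prompt_2, IP-Adapter inputs and callback_on_step_end support.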
-import inspect import logging from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import PIL.Image import torch -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput +from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img import ( + rescale_noise_cfg, + retrieve_latents, + retrieve_timesteps, +) +from diffusers.utils.deprecation_utils import deprecate +from diffusers.utils.torch_utils import randn_tensor -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg +from .pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin logger = logging.getLogger(__name__) -class StableDiffusionXLImg2ImgPipelineMixin(DiffusionPipelineMixin): - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. 
- """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - - if prompt_embeds is None: - prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - # get prompt text embeddings - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( - text_input_ids, untruncated_ids - ): - removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {tokenizer.model_max_length} tokens: {removed_text}" - ) +class StableDiffusionXLImg2ImgPipelineMixin(StableDiffusionXLPipelineMixin): + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "add_neg_time_ids", + ] - prompt_embeds = text_encoder( - input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-2] - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - negative_prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - negative_prompt_embeds = text_encoder( - input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds[-2] - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - - pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) - negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.check_inputs def check_inputs( self, - prompt: Union[str, List[str]], - strength: float, - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, + prompt, + prompt_2, + strength, + num_inference_steps, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, ): if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): + if num_inference_steps is None: + raise ValueError("`num_inference_steps` cannot be None.") + elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0: + raise ValueError( + f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" + f" {type(num_inference_steps)}." + ) + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}." ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" " only forward one of the two." 
) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" f" {negative_prompt_embeds}. Please make sure to only forward one of the two." ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: @@ -213,301 +129,806 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) - def get_timesteps(self, num_inference_steps, strength): + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].numpy() + if denoising_start is None: + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start - return timesteps, num_inference_steps - t_start + else: + # Strength is irrelevant if we directly request a timestep to start at; + # that is, strength is determined by the denoising_start instead. 
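+            # Worked example: with scheduler.config.num_train_timesteps == 1000 and
+            # denoising_start == 0.8, the cutoff is round(1000 - 0.8 * 1000) = 200, so only
+            # scheduler timesteps strictly below 200 (the final 20% of the schedule) are kept.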
+ discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_start * self.scheduler.config.num_train_timesteps) + ) + ) + + num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item() + if self.scheduler.order == 2 and num_inference_steps % 2 == 0: + # if the scheduler is a 2nd order scheduler we might have to do +1 + # because `num_inference_steps` might be even given that every timestep + # (except the highest one) is duplicated. If `num_inference_steps` is even it would + # mean that we cut the timesteps in the middle of the denoising step + # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1 + # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler + num_inference_steps = num_inference_steps + 1 + + # because t_n+1 >= t_n, we slice the timesteps starting from the end + t_start = len(self.scheduler.timesteps) - num_inference_steps + timesteps = self.scheduler.timesteps[t_start:] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start) + return timesteps, num_inference_steps + + def prepare_latents( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + latents_mean = latents_std = None + if hasattr(self.vae_decoder.config, "latents_mean") and self.vae_decoder.config.latents_mean is not None: + latents_mean = torch.tensor(self.vae_decoder.config.latents_mean).view(1, 4, 1, 1) + if hasattr(self.vae_decoder.config, "latents_std") and self.vae_decoder.config.latents_std is not None: + latents_std = torch.tensor(self.vae_decoder.config.latents_std).view(1, 4, 1, 1) + + # Offload text encoder if `enable_model_cpu_offload` was enabled + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.text_encoder_2.to("cpu") + torch.cuda.empty_cache() + + image = image.to(device=device, dtype=dtype) - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): batch_size = batch_size * num_images_per_prompt if image.shape[1] == 4: init_latents = image + else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) + # make sure the VAE is in float32 mode, as it overflows in float16 + # if self.vae_decoder.config.force_upcast: + # image = image.float() + # self.vae_decoder.to(dtype=torch.float32) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + + init_latents = [ + retrieve_latents(self.vae_encoder(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae_encoder(image), generator=generator) + + # if self.vae_decoder.config.force_upcast: + # self.vae_decoder.to(dtype) + + init_latents = init_latents.to(dtype) + if latents_mean is not None and latents_std is not None: + latents_mean = latents_mean.to(device=device, dtype=dtype) + latents_std = latents_std.to(device=device, dtype=dtype) + init_latents = (init_latents - latents_mean) * self.vae_decoder.config.scaling_factor / latents_std + else: + init_latents = self.vae_decoder.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) else: - init_latents = np.concatenate([init_latents], axis=0) + init_latents = torch.cat([init_latents], dim=0) - # add noise to latents using the timesteps - if isinstance(generator, np.random.RandomState): - noise = generator.randn(*init_latents.shape).astype(dtype) - elif isinstance(generator, torch.Generator): - noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) + if add_noise: + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ) - init_latents = init_latents.numpy() + latents = init_latents - return init_latents + return latents def _get_add_time_ids( - self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, ): - if self.config.get("requires_aesthetics_score"): - add_time_ids = (original_size + crops_coords_top_left + (aesthetic_score,),) - add_neg_time_ids = (original_size + crops_coords_top_left + (negative_aesthetic_score,),) + if self.config.get("requires_aesthetics_score", False): + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) else: - add_time_ids = (original_size + crops_coords_top_left + target_size,) - add_neg_time_ids = (original_size + crops_coords_top_left + target_size,) - - add_time_ids = np.array(add_time_ids, dtype=dtype) - add_neg_time_ids = np.array(add_neg_time_ids, dtype=dtype) + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + # passed_add_embed_dim = ( + # self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + # ) + # expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + # if ( + # expected_add_embed_dim > passed_add_embed_dim + # and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim + # ): + # raise ValueError( + # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." + # ) + # elif ( + # expected_add_embed_dim < passed_add_embed_dim + # and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim + # ): + # raise ValueError( + # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." + # ) + # elif expected_add_embed_dim != passed_add_embed_dim: + # raise ValueError( + # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
+ # ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) return add_time_ids, add_neg_time_ids - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def denoising_start(self): + return self._denoising_start + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() def __call__( self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, strength: float = 0.3, num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + denoising_start: Optional[float] = None, + denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, + original_size: Tuple[int, int] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, + target_size: Tuple[int, int] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, aesthetic_score: float = 6.0, negative_aesthetic_score: float = 2.5, + clip_skip: Optional[int] = None, + callback_on_step_end: 
Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" Function invoked when calling the pipeline for generation. Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): + prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`Union[np.ndarray, PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be upscaled. - strength (`float`, defaults to 0.8): + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + The image(s) to modify with the pipeline. + strength (`float`, *optional*, defaults to 0.3): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, defaults to 50): + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of + `denoising_start` being declared as an integer, the value of `strength` will be ignored. + num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - guidance_scale (`float`, defaults to 5): + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + denoising_start (`float`, *optional*): + When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be + bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and + it is assumed that the passed `image` is a partly denoised image. Note that when this is specified, + strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline + is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image + Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality). + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise (ca. 
final 20% of timesteps still needed) and should be + denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the + final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline + forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refine Image + Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality). + guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): + negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - eta (`float`, defaults to 0.0): + eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - output_type (`str`, defaults to `"pil"`): + pooled_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
+ If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): + return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.7): + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be the same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + aesthetic_score (`float`, *optional*, defaults to 6.0): + Used to simulate an aesthetic score of the generated image by influencing the positive text condition. + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_aesthetic_score (`float`, *optional*, defaults to 2.5): + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to + simulate an aesthetic score of the generated image by influencing the negative text condition. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during inference, with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
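A minimal usage sketch for the torch-based call path described above; the checkpoint directory, prompt, input image and callback are illustrative assumptions, not part of the pipeline itself:

import torch
from PIL import Image
from optimum.onnxruntime import ORTPipelineForImage2Image

# "path/to/onnx-sdxl-img2img" is a placeholder for a locally exported SDXL img2img ONNX pipeline
pipeline = ORTPipelineForImage2Image.from_pretrained("path/to/onnx-sdxl-img2img")

init_image = Image.new("RGB", (1024, 1024))  # stand-in for a real input image

def on_step_end(pipe, step, timestep, callback_kwargs):
    # receives the tensors requested via callback_on_step_end_tensor_inputs and must return them
    print(f"step {step}: latents {tuple(callback_kwargs['latents'].shape)}")
    return callback_kwargs

image = pipeline(
    prompt="a castle in the alps, golden hour",
    image=init_image,
    strength=0.3,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(0),
    callback_on_step_end=on_step_end,
    callback_on_step_end_tensor_inputs=["latents"],
).images[0]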
+ + Examples: Returns: [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. """ - # 0. Check inputs. Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + # must have a compatible torch device + device = self.device + + # convert numpy arrays to torch tensors + latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents + prompt_embeds = ( + self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds + ) + negative_prompt_embeds = ( + self.np_to_pt(negative_prompt_embeds, device) + if isinstance(negative_prompt_embeds, np.ndarray) + else negative_prompt_embeds + ) + pooled_prompt_embeds = ( + self.np_to_pt(pooled_prompt_embeds, device) + if isinstance(pooled_prompt_embeds, np.ndarray) + else pooled_prompt_embeds + ) + negative_pooled_prompt_embeds = ( + self.np_to_pt(negative_pooled_prompt_embeds, device) + if isinstance(negative_pooled_prompt_embeds, np.ndarray) + else negative_pooled_prompt_embeds + ) + ip_adapter_image_embeds = ( + [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] + if ip_adapter_image_embeds is not None + else ip_adapter_image_embeds + ) + + for k, v in kwargs.items(): + if isinstance(v, np.ndarray): + kwargs[k] = self.np_to_pt(v, device) + elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): + kwargs[k] = [self.np_to_pt(i, device) for i in v] + elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): + kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} + + generator = ( + self.np_to_pt(generator, device) + if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) + or isinstance(generator, np.random.RandomState) + else generator + ) + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + strength, + num_inference_steps, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._denoising_start = denoising_start + self._interrupt = False - # 1. Define call parameters - if isinstance(prompt, str): + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): batch_size = 1 - elif isinstance(prompt, list): + elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 2. Encode input prompt + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) ( prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, - ) = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds, negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, ) - # 3. Preprocess image + # 4. Preprocess image image = self.image_processor.preprocess(image) - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) + # 5. Prepare timesteps + def denoising_value_valid(dnv): + return isinstance(dnv, float) and 0 < dnv < 1 - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = np.repeat(timesteps[:1], batch_size * num_images_per_prompt, axis=0) - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - latents_dtype = prompt_embeds.dtype - image = image.astype(latents_dtype) - - # 5. Prepare latent variables - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, latents_dtype, generator + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas ) - - # 6. 
Prepare extra step kwargs - extra_step_kwargs = {} - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_eta: - extra_step_kwargs["eta"] = eta + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps, + strength, + device, + denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None, + ) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + add_noise = True if self.denoising_start is None else False + + # 6. Prepare latent variables + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + add_noise, + ) + # 7. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) height, width = latents.shape[-2:] height = height * self.vae_scale_factor width = width * self.vae_scale_factor + original_size = original_size or (height, width) target_size = target_size or (height, width) # 8. Prepare added time ids & embeddings + if negative_original_size is None: + negative_original_size = original_size + if negative_target_size is None: + negative_target_size = target_size + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + add_time_ids, add_neg_time_ids = self._get_add_time_ids( original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, ) - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) - add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latent_model_input, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - text_embeds=add_text_embeds, - time_ids=add_time_ids, + add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1) + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1) + add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) - noise_pred = noise_pred[0] - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + # 9. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + # 9.1 Apply denoising_end + if ( + self.denoising_end is not None + and self.denoising_start is not None + and denoising_value_valid(self.denoising_end) + and denoising_value_valid(self.denoising_start) + and self.denoising_start >= self.denoising_end + ): + raise ValueError( + f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: " + + f" {self.denoising_end} when using type float." 
+ ) + elif self.denoising_end is not None and denoising_value_valid(self.denoising_end): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 9.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( + latent_model_input, + timestep=t.unsqueeze(0), + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), + # cross_attention_kwargs=cross_attention_kwargs, + # added_cond_kwargs=added_cond_kwargs, + # return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. 
apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # if XLA_AVAILABLE: + # xm.mark_step() + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + # needs_upcasting = self.vae_decoder.dtype == torch.float16 and self.vae_decoder.config.force_upcast + + # if needs_upcasting: + # self.upcast_vae() + # latents = latents.to(next(iter(self.vae_decoder.post_quant_conv.parameters())).dtype) + # elif latents.dtype != self.vae_decoder.dtype: + # if torch.backends.mps.is_available(): + # # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + # self.vae = self.vae_decoder.to(latents.dtype) + + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = ( + hasattr(self.vae_decoder.config, "latents_mean") and self.vae_decoder.config.latents_mean is not None + ) + has_latents_std = ( + hasattr(self.vae_decoder.config, "latents_std") and self.vae_decoder.config.latents_std is not None ) - latents = scheduler_output.prev_sample.numpy() + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae_decoder.config.latents_mean) + .view(1, 4, 1, 1) + .to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae_decoder.config.latents_std) + .view(1, 4, 1, 1) + .to(latents.device, latents.dtype) + ) + latents = latents * latents_std / self.vae_decoder.config.scaling_factor + latents_mean + else: + latents = latents / self.vae_decoder.config.scaling_factor - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) + image = self.vae_decoder( + latents, + # return_dict=False, + )[0] - if output_type == "latent": - image = latents + # cast back to fp16 if needed + # if needs_upcasting: + # self.vae_decoder.to(dtype=torch.float16) else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - # apply watermark if 
available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) + image = latents + + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + # self.maybe_free_model_hooks() if not return_dict: return (image,) diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py index 19521df5a1..dba41381a4 100644 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ b/optimum/pipelines/diffusers/pipeline_utils.py @@ -14,13 +14,12 @@ import logging -from typing import List, Optional, Tuple, Union +from typing import Union import numpy as np import torch from diffusers import ConfigMixin from tqdm.auto import tqdm -from transformers.modeling_outputs import ModelOutput logger = logging.getLogger(__name__) @@ -28,8 +27,15 @@ class DiffusionPipelineMixin(ConfigMixin): @staticmethod - def np_to_pt(tensor: np.ndarray, device: str) -> "torch.Tensor": - return torch.from_numpy(tensor).to(device) + def np_to_pt( + np_object: Union[np.ndarray, np.random.RandomState], device: str + ) -> Union[torch.Tensor, torch.Generator]: + if isinstance(np_object, np.ndarray): + return torch.from_numpy(np_object).to(device) + elif isinstance(np_object, np.random.RandomState): + return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0])) + else: + raise ValueError(f"Unsupported type {type(np_object)}") # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L827 def progress_bar(self, iterable=None, total=None): @@ -46,116 +52,3 @@ def progress_bar(self, iterable=None, total=None): return tqdm(total=total, **self._progress_bar_config) else: raise ValueError("Either `total` or `iterable` has to be defined.") - - -def np_randn_tensor( - shape: Union[Tuple, List], - generator: Optional[Union[List["np.random.RandomState"], "np.random.RandomState"]] = None, - device: Optional["torch.device"] = None, - dtype: Optional["torch.dtype"] = None, - layout: Optional["torch.layout"] = None, -): - """A helper function to create random tensors on the desired `device` with the desired `dtype`. When - passing a list of generators, you can seed each batch size individually. If CPU generators are passed, the tensor - is always created on the CPU. - """ - batch_size = shape[0] - - # make sure generator list of length 1 is treated like a non-list - if isinstance(generator, list) and len(generator) == 1: - generator = generator[0] - - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [generator[i].randn(*shape) for i in range(batch_size)] - latents = np.stack(latents, axis=0) - elif generator is not None: - latents = generator.randn(*shape) - else: - latents = np.random.randn(*shape) - - return latents - - -def pt_randn_tensor( - shape: Union[Tuple, List], - generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None, - device: Optional["torch.device"] = None, - dtype: Optional["torch.dtype"] = None, - layout: Optional["torch.layout"] = None, -): - """A helper function to create random tensors on the desired `device` with the desired `dtype`. When - passing a list of generators, you can seed each batch size individually. If CPU generators are passed, the tensor - is always created on the CPU. 
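The `np_to_pt` conversion above maps a NumPy `RandomState` to a torch `Generator` by reading the first word of the MT19937 key, which for a scalar-seeded `RandomState` is the seed itself; a standalone sketch of that mapping (the seed value is arbitrary):

import numpy as np
import torch

rng = np.random.RandomState(42)
seed = int(rng.get_state()[1][0])  # MT19937 key[0]; equals the scalar seed the state was created with
generator = torch.Generator(device="cpu").manual_seed(seed)
print(seed)  # 42 for a scalar-seeded RandomState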
- """ - # device on which tensor is created defaults to device - rand_device = device - batch_size = shape[0] - - layout = layout or torch.strided - device = device or torch.device("cpu") - - if generator is not None: - gen_device_type = generator.device.type if not isinstance(generator, list) else generator[0].device.type - if gen_device_type != device.type and gen_device_type == "cpu": - rand_device = "cpu" - if device != "mps": - logger.info( - f"The passed generator was created on 'cpu' even though a tensor on {device} was expected." - f" Tensors will be created on 'cpu' and then moved to {device}. Note that one can probably" - f" slighly speed up this function by passing a generator that was created on the {device} device." - ) - elif gen_device_type != device.type and gen_device_type == "cuda": - raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.") - - # make sure generator list of length 1 is treated like a non-list - if isinstance(generator, list) and len(generator) == 1: - generator = generator[0] - - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype, layout=layout) - for i in range(batch_size) - ] - latents = torch.cat(latents, dim=0).to(device) - else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device) - - return latents - - -def randn_tensor( - shape: Union[Tuple, List], - generator: Optional[ - Union[List[Union["torch.Generator", "np.random.RandomState"]], "torch.Generator", "np.random.RandomState"] - ] = None, - device: Optional["torch.device"] = None, - dtype: Optional["torch.dtype"] = None, - layout: Optional["torch.layout"] = None, -) -> "torch.Tensor": - if (isinstance(generator, list) and isinstance(generator[0], torch.Generator)) or isinstance( - generator, torch.Generator - ): - return pt_randn_tensor(shape, generator, device, dtype, layout) - elif (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) or isinstance( - generator, np.random.RandomState - ): - return torch.from_numpy(np_randn_tensor(shape, generator, device, dtype, layout)).to(device) - else: - return pt_randn_tensor(shape, generator, device, dtype, layout) - - -def retrieve_latents( - encoder_output: ModelOutput, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" -): - if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": - return encoder_output.latent_dist.sample(generator) - elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": - return encoder_output.latent_dist.mode() - elif hasattr(encoder_output, "latent_sample"): - return encoder_output.latent_sample - elif hasattr(encoder_output, "latents"): - return encoder_output.latents - else: - raise AttributeError("Could not access latents of provided encoder_output") diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index d17fbaa0d8..606ad73f66 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -14,7 +14,6 @@ # limitations under the License. 
import numpy as np -import PIL import pytest import torch from diffusers import ( @@ -70,14 +69,6 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= return [image] * batch_size -def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - class ORTPipelineForText2ImageTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] @@ -148,22 +139,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - # if model_arch == "latent-consistency": - # # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step - # # TODO: Investigate why this is the case - # inputs["num_inference_steps"] = 1 - for output_type in ["latent", "np"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - self.assertTrue( - np.allclose(ort_output, diffusers_output, atol=1e-4), - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), - ) - self.assertEqual(ort_pipeline.device, diffusers_pipeline.device) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -179,7 +161,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -231,18 +213,12 @@ def test_image_reproducibility(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for generator_framework in ["np", "pt"]: - if model_arch in ["latent-consistency"] and generator_framework == "np": - pytest.skip("Latent Consistency Model (LCM) scheduler doesn't support numpy generator") - ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue( - np.allclose(ort_outputs_1.images[0], ort_outputs_2.images[0]), - np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0]), - ) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): @@ -254,11 +230,8 @@ def test_negative_prompt(self, model_arch: str): negative_prompt = ["This is a negative prompt"] pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - image_slice_1 = pipeline( - **inputs, - negative_prompt=negative_prompt, - 
generator=torch.Generator().manual_seed(SEED), - ).images[0, -3:, -3:, -1] + + images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images prompt = inputs.pop("prompt") if model_arch == "stable-diffusion-xl": @@ -267,31 +240,15 @@ def test_negative_prompt(self, model_arch: str): inputs["negative_prompt_embeds"], inputs["pooled_prompt_embeds"], inputs["negative_pooled_prompt_embeds"], - ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt) + ) = pipeline.encode_prompt(prompt=prompt, negative_prompt=negative_prompt) else: - text_ids = pipeline.tokenizer( - prompt, - max_length=pipeline.tokenizer.model_max_length, - padding="max_length", - return_tensors="np", - truncation=True, - ).input_ids - negative_text_ids = pipeline.tokenizer( - negative_prompt, - max_length=pipeline.tokenizer.model_max_length, - padding="max_length", - return_tensors="np", - truncation=True, - ).input_ids - inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] - inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] - - image_slice_2 = pipeline( - **inputs, - generator=torch.Generator().manual_seed(SEED), - ).images[0, -3:, -3:, -1] - - self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( + prompt=prompt, negative_prompt=negative_prompt + ) + + images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) @@ -410,7 +367,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -454,27 +411,23 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - pytest.skip("Img2Img models do not support support output reproducibility for some reason") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2)) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - pytest.skip("Img2Img models do not support support output reproducibility for some reason") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ 
-488,8 +441,8 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) @@ -619,7 +572,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -685,10 +638,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ort_output = ort_pipeline(**inputs, latents=np_latents).images diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images - self.assertTrue( - np.allclose(ort_output, diffusers_output, atol=1e-4), - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), - ) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -706,8 +656,8 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) From b4e4f4131c10a0b4650c694a2e7482ab02647861 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 11 Sep 2024 12:17:01 +0200 Subject: [PATCH 27/71] fix sd xl hidden states --- optimum/onnxruntime/modeling_diffusion.py | 7 +++ .../diffusers/pipeline_stable_diffusion.py | 41 +++++++++++--- .../diffusers/pipeline_stable_diffusion_xl.py | 9 ++-- tests/onnxruntime/test_diffusion.py | 53 +++++++++++++++++++ 4 files changed, 99 insertions(+), 11 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index ff08a72f3e..fc67d99dd7 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -525,11 +525,18 @@ def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): onnx_outputs = self.session.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + print("model_outputs", model_outputs.keys()) + if any("hidden_states" in model_output for model_output in model_outputs): model_outputs["hidden_states"] = [] + for i in range(self.config.num_hidden_layers): model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) + # exporter doesnt duplicate last hidden state for some reason + # (only returned once as last_hidden_state and not part of the list of hidden_states) + model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) + return ModelOutput(**model_outputs) diff --git 
a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 464c4f343c..5de9e3b682 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -20,6 +20,7 @@ import torch from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback from diffusers.image_processor import PipelineImageInput +from diffusers.loaders.textual_inversion import TextualInversionLoaderMixin from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg, retrieve_timesteps from diffusers.utils.deprecation_utils import deprecate @@ -31,7 +32,8 @@ logger = logging.getLogger(__name__) -class StableDiffusionPipelineMixin(DiffusionPipelineMixin): +class StableDiffusionPipelineMixin(DiffusionPipelineMixin, TextualInversionLoaderMixin): + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] def encode_prompt( @@ -75,9 +77,19 @@ def encode_prompt( Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. """ - device = device or self.device + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + # if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): + # self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + # if not USE_PEFT_BACKEND: + # adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + # else: + # scale_lora_layers(self.text_encoder, lora_scale) + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -86,6 +98,10 @@ def encode_prompt( batch_size = prompt_embeds.shape[0] if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + text_inputs = self.tokenizer( prompt, padding="max_length", @@ -114,15 +130,15 @@ def encode_prompt( if clip_skip is None: prompt_embeds = self.text_encoder( - text_input_ids.to(device) - # attention_mask=attention_mask + text_input_ids.to(device), + # attention_mask=attention_mask, ) - prompt_embeds = next(iter(prompt_embeds.values())) + prompt_embeds = prompt_embeds[0] else: prompt_embeds = self.text_encoder( text_input_ids.to(device), # attention_mask=attention_mask, - # output_hidden_states=True + # output_hidden_states=True, ) # Access the `hidden_states` first, that contains a tuple of # all the hidden states from the encoder layers. 
Then index into @@ -169,6 +185,10 @@ def encode_prompt( else: uncond_tokens = negative_prompt + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -187,15 +207,22 @@ def encode_prompt( uncond_input.input_ids.to(device), # attention_mask=attention_mask, ) - negative_prompt_embeds = next(iter(negative_prompt_embeds.values())) + negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + # if self.text_encoder is not None: + # if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: + # # Retrieve the original scale by scaling back the LoRA layers + # unscale_lora_layers(self.text_encoder, lora_scale) + return prompt_embeds, negative_prompt_embeds def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 4dfd0dd1b5..5a48592626 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -19,6 +19,7 @@ import torch from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback from diffusers.image_processor import PipelineImageInput +from diffusers.loaders.textual_inversion import TextualInversionLoaderMixin from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import rescale_noise_cfg, retrieve_timesteps from diffusers.utils.deprecation_utils import deprecate @@ -147,8 +148,8 @@ def encode_prompt( prompt_embeds_list = [] prompts = [prompt, prompt_2] for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): - # if isinstance(self, TextualInversionLoaderMixin): - # prompt = self.maybe_convert_prompt(prompt, tokenizer) + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) text_inputs = tokenizer( prompt, @@ -219,8 +220,8 @@ def encode_prompt( negative_prompt_embeds_list = [] for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): - # if isinstance(self, TextualInversionLoaderMixin): - # negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) max_length = prompt_embeds.shape[1] uncond_input = tokenizer( diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 606ad73f66..3d5e9d1599 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -127,6 +127,59 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images).images self.assertEqual(outputs.shape, (batch_size * 
num_images, height, width, 3)) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_compare_prompt_embeds_to_diffusers_pipeline(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + prompt = ["sailing ship in storm by Leonardo da Vinci"] + device = torch.device("cpu") + num_images_per_prompt = 1 + do_classifier_free_guidance = True + + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + if model_arch == "stable-diffusion-xl": + ( + ort_prompt_embeds, + _, + ort_pooled_prompt_embeds, + _, + ) = ort_pipeline.encode_prompt(prompt) + ( + diffusers_prompt_embeds, + _, + diffusers_pooled_prompt_embeds, + _, + ) = diffusers_pipeline.encode_prompt(prompt) + np.testing.assert_allclose( + ort_prompt_embeds.detach().numpy(), + diffusers_prompt_embeds.detach().numpy(), + atol=1e-4, + rtol=1e-2, + ) + np.testing.assert_allclose( + ort_pooled_prompt_embeds.detach().numpy(), + diffusers_pooled_prompt_embeds.detach().numpy(), + atol=1e-4, + rtol=1e-2, + ) + else: + ort_prompt_embeds, _ = ort_pipeline.encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance + ) + diffusers_prompt_embeds, _ = diffusers_pipeline.encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance + ) + np.testing.assert_allclose( + ort_prompt_embeds.detach().numpy(), + diffusers_prompt_embeds.detach().numpy(), + atol=1e-4, + rtol=1e-2, + ) + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): From 8e35c11d6b7c2179d6f3b143c3c264b79db7f8ab Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 11 Sep 2024 12:19:24 +0200 Subject: [PATCH 28/71] style --- optimum/onnxruntime/modeling_diffusion.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index fc67d99dd7..82caf02495 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -525,8 +525,6 @@ def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): onnx_outputs = self.session.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - print("model_outputs", model_outputs.keys()) - if any("hidden_states" in model_output for model_output in model_outputs): model_outputs["hidden_states"] = [] From 475efdfcca21a34fd43204e5ce3a7d5adc44c17f Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 11 Sep 2024 13:08:56 +0200 Subject: [PATCH 29/71] support testing without diffusers --- optimum/onnxruntime/__init__.py | 16 +++++ optimum/utils/dummy_diffusers_objects.py | 44 ++++++++++++ tests/onnxruntime/test_modeling.py | 91 ++++++++++++++---------- 3 files changed, 113 insertions(+), 38 deletions(-) diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 09a48ec955..a6e3c13979 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -83,6 +83,10 @@ "ORTPipelineForInpainting", "ORTPipelineForText2Image", "ORTDiffusionPipeline", + "ORTModelTextEncoder", + "ORTModelUnet", + "ORTModelVaeDecoder", + "ORTModelVaeEncoder", ] else: _import_structure["modeling_diffusion"] = [ @@ -96,6 +100,10 @@ "ORTPipelineForInpainting", "ORTPipelineForText2Image", "ORTDiffusionPipeline", + "ORTModelTextEncoder", + "ORTModelUnet", + 
"ORTModelVaeDecoder", + "ORTModelVaeEncoder", ] @@ -147,6 +155,10 @@ from ..utils.dummy_diffusers_objects import ( ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTModelTextEncoder, + ORTModelUnet, + ORTModelVaeDecoder, + ORTModelVaeEncoder, ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, @@ -160,6 +172,10 @@ from .modeling_diffusion import ( ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTModelTextEncoder, + ORTModelUnet, + ORTModelVaeDecoder, + ORTModelVaeEncoder, ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index 35d1ffe9fc..f63d3a603c 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -123,3 +123,47 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) + + +class ORTModelTextEncoder(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTModelVaeDecoder(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTModelVaeEncoder(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTModelUnet(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 4b44acb38a..d8dd46e4ad 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -71,6 +71,7 @@ ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME, ONNX_WEIGHTS_NAME, + ORTDiffusionPipeline, ORTModelForAudioClassification, ORTModelForAudioFrameClassification, ORTModelForAudioXVector, @@ -89,15 +90,12 @@ ORTModelForSpeechSeq2Seq, ORTModelForTokenClassification, ORTModelForVision2Seq, - ORTStableDiffusionPipeline, -) -from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder -from optimum.onnxruntime.modeling_diffusion import ( ORTModelTextEncoder, ORTModelUnet, ORTModelVaeDecoder, ORTModelVaeEncoder, ) +from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder from optimum.onnxruntime.modeling_ort import ORTModel from optimum.pipelines import pipeline from optimum.utils import ( @@ -108,7 +106,13 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.testing_utils import grid_parameters, remove_directory, require_hf_token, require_ort_rocm +from optimum.utils.testing_utils import ( + grid_parameters, + remove_directory, + require_diffusers, + require_hf_token, + require_ort_rocm, +) logger = logging.get_logger() @@ -205,12 +209,11 @@ def test_load_seq2seq_model_from_empty_cache(self): with self.assertRaises(Exception): _ = ORTModelForSeq2SeqLM.from_pretrained(self.TINY_ONNX_SEQ2SEQ_MODEL_ID, local_files_only=True) + 
@require_diffusers def test_load_stable_diffusion_model_from_cache(self): - _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching + _ = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching - model = ORTStableDiffusionPipeline.from_pretrained( - self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True - ) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) @@ -218,6 +221,7 @@ def test_load_stable_diffusion_model_from_cache(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + @require_diffusers def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( default_cache_path, "models--" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID.replace("/", "--") @@ -225,9 +229,7 @@ def test_load_stable_diffusion_model_from_empty_cache(self): remove_directory(dirpath) with self.assertRaises(Exception): - _ = ORTStableDiffusionPipeline.from_pretrained( - self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True - ) + _ = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) @require_torch_gpu @pytest.mark.cuda_ep_test @@ -300,18 +302,20 @@ def test_load_seq2seq_model_unknown_provider(self): with self.assertRaises(ValueError): ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="FooExecutionProvider") + @require_diffusers def test_load_stable_diffusion_model_from_hub(self): - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder) self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_load_stable_diffusion_model_cuda_provider(self): - model = ORTStableDiffusionPipeline.from_pretrained( + model = ORTDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider" ) self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) @@ -321,11 +325,12 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test def test_load_stable_diffusion_model_rocm_provider(self): - model = ORTStableDiffusionPipeline.from_pretrained( + model = ORTDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="ROCMExecutionProvider" ) self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) @@ -335,8 +340,9 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): - model = ORTStableDiffusionPipeline.from_pretrained( + model = 
ORTDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CPUExecutionProvider" ) self.assertListEqual(model.providers, ["CPUExecutionProvider"]) @@ -346,9 +352,10 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with self.assertRaises(ValueError): - ORTStableDiffusionPipeline.from_pretrained( + ORTDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="FooExecutionProvider" ) @@ -478,12 +485,11 @@ def test_passing_session_options_seq2seq(self): self.assertEqual(model.encoder.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.decoder.session.get_session_options().intra_op_num_threads, 3) + @require_diffusers def test_passing_session_options_stable_diffusion(self): options = onnxruntime.SessionOptions() options.intra_op_num_threads = 3 - model = ORTStableDiffusionPipeline.from_pretrained( - self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options - ) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options) self.assertEqual(model.unet.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.text_encoder.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.vae_decoder.session.get_session_options().intra_op_num_threads, 3) @@ -772,10 +778,11 @@ def test_seq2seq_model_on_rocm_ep_str(self): self.assertEqual(model.decoder_with_past.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_passing_provider_options_stable_diffusion(self): - model = ORTStableDiffusionPipeline.from_pretrained( + model = ORTDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider" ) self.assertEqual( @@ -791,7 +798,7 @@ def test_passing_provider_options_stable_diffusion(self): self.assertEqual( model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "1" ) - model = ORTStableDiffusionPipeline.from_pretrained( + model = ORTDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider", provider_options={"do_copy_in_default_stream": 0}, @@ -810,8 +817,9 @@ def test_passing_provider_options_stable_diffusion(self): model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "0" ) + @require_diffusers def test_stable_diffusion_model_on_cpu(self): - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") model.to(cpu) self.assertEqual(model.device, cpu) @@ -825,9 +833,9 @@ def test_stable_diffusion_model_on_cpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) - # test string device input for to() + @require_diffusers def test_stable_diffusion_model_on_cpu_str(self): - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = 
ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") model.to("cpu") self.assertEqual(model.device, cpu) @@ -841,10 +849,11 @@ def test_stable_diffusion_model_on_cpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu(self): - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) gpu = torch.device("cuda") model.to(gpu) self.assertEqual(model.device, torch.device("cuda:0")) @@ -858,11 +867,12 @@ def test_stable_diffusion_model_on_gpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test def test_stable_diffusion_model_on_rocm_ep(self): - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) gpu = torch.device("cuda") model.to(gpu) self.assertEqual(model.device, torch.device("cuda:0")) @@ -876,34 +886,35 @@ def test_stable_diffusion_model_on_rocm_ep(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") def test_stable_diffusion_model_on_gpu_id(self): - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to(torch.device("cuda:1")) self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to(1) self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to("cuda:1") self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") 
self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") - # test string device input for to() + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu_str(self): - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to("cuda") self.assertEqual(model.device, torch.device("cuda:0")) self.assertEqual(model.unet.device, torch.device("cuda:0")) @@ -916,11 +927,12 @@ def test_stable_diffusion_model_on_gpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test def test_stable_diffusion_model_on_rocm_ep_str(self): - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to("cuda") self.assertEqual(model.device, torch.device("cuda:0")) self.assertEqual(model.unet.device, torch.device("cuda:0")) @@ -975,9 +987,10 @@ def test_save_seq2seq_model_without_past(self): self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) self.assertTrue(CONFIG_NAME in folder_contents) + @require_diffusers def test_save_stable_diffusion_model(self): with tempfile.TemporaryDirectory() as tmpdirname: - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) self.assertIn(model.config_name, folder_contents) @@ -1050,10 +1063,11 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") remove_directory(tmpdirname) + @require_diffusers def test_save_load_stable_diffusion_model_with_external_data(self): with tempfile.TemporaryDirectory() as tmpdirname: os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1" # force exporting small model with external data - model = ORTStableDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True) + model = ORTDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True) model.save_pretrained(tmpdirname) # verify external data is exported @@ -1068,7 +1082,7 @@ def test_save_load_stable_diffusion_model_with_external_data(self): self.assertIn(ONNX_WEIGHTS_NAME + "_data", folder_contents) # verify loading from local folder works - model = ORTStableDiffusionPipeline.from_pretrained(tmpdirname, export=False) + model = ORTDiffusionPipeline.from_pretrained(tmpdirname, export=False) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") remove_directory(tmpdirname) @@ -1180,11 +1194,12 @@ def test_push_seq2seq_model_with_external_data_to_hub(self): ) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + @require_diffusers @require_hf_token def test_push_stable_diffusion_model_with_external_data_to_hub(self): with tempfile.TemporaryDirectory() as tmpdirname: os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1" # force exporting small model with external data - model = ORTStableDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], 
export=True) + model = ORTDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True) model.save_pretrained( tmpdirname + "/onnx", token=os.environ.get("HF_AUTH_TOKEN", None), @@ -1194,7 +1209,7 @@ def test_push_stable_diffusion_model_with_external_data_to_hub(self): ) # verify loading from hub works - model = ORTStableDiffusionPipeline.from_pretrained( + model = ORTDiffusionPipeline.from_pretrained( MODEL_NAMES["stable-diffusion"] + "-onnx", export=False, token=os.environ.get("HF_AUTH_TOKEN", None), From e2ad89a8ca72a1a77a960b0092728553fced5ab1 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 11 Sep 2024 13:11:41 +0200 Subject: [PATCH 30/71] remove unnecessary --- optimum/utils/testing_utils.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index 6579e230dc..76fe9a05b1 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -84,17 +84,6 @@ def require_ort_rocm(test_case): ) -def require_ort_cuda(test_case): - """Decorator marking a test that requires CUDAExecutionProvider for ONNX Runtime.""" - import onnxruntime as ort - - providers = ort.get_available_providers() - - return unittest.skipUnless("CUDAExecutionProvider" == providers[0], "test requires CUDAExecutionProvider")( - test_case - ) - - def require_hf_token(test_case): """ Decorator marking a test that requires huggingface hub token. From 7b4b5bdd614694e87830ffa03749b8b0184fb48a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 11 Sep 2024 13:53:17 +0200 Subject: [PATCH 31/71] revert --- tests/onnxruntime/test_modeling.py | 52 +++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index d8dd46e4ad..edcab8b228 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -71,7 +71,6 @@ ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME, ONNX_WEIGHTS_NAME, - ORTDiffusionPipeline, ORTModelForAudioClassification, ORTModelForAudioFrameClassification, ORTModelForAudioXVector, @@ -94,6 +93,7 @@ ORTModelUnet, ORTModelVaeDecoder, ORTModelVaeEncoder, + ORTStableDiffusionPipeline, ) from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder from optimum.onnxruntime.modeling_ort import ORTModel @@ -211,9 +211,9 @@ def test_load_seq2seq_model_from_empty_cache(self): @require_diffusers def test_load_stable_diffusion_model_from_cache(self): - _ = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching + _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) @@ -229,7 +229,7 @@ def test_load_stable_diffusion_model_from_empty_cache(self): remove_directory(dirpath) with self.assertRaises(Exception): - _ = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) + _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) @require_torch_gpu @pytest.mark.cuda_ep_test @@ -304,7 +304,7 @@ def test_load_seq2seq_model_unknown_provider(self): 
@require_diffusers def test_load_stable_diffusion_model_from_hub(self): - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder) @@ -315,7 +315,7 @@ def test_load_stable_diffusion_model_from_hub(self): @require_torch_gpu @pytest.mark.cuda_ep_test def test_load_stable_diffusion_model_cuda_provider(self): - model = ORTDiffusionPipeline.from_pretrained( + model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider" ) self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) @@ -330,7 +330,7 @@ def test_load_stable_diffusion_model_cuda_provider(self): @require_ort_rocm @pytest.mark.rocm_ep_test def test_load_stable_diffusion_model_rocm_provider(self): - model = ORTDiffusionPipeline.from_pretrained( + model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="ROCMExecutionProvider" ) self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) @@ -342,7 +342,7 @@ def test_load_stable_diffusion_model_rocm_provider(self): @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): - model = ORTDiffusionPipeline.from_pretrained( + model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CPUExecutionProvider" ) self.assertListEqual(model.providers, ["CPUExecutionProvider"]) @@ -355,7 +355,7 @@ def test_load_stable_diffusion_model_cpu_provider(self): @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with self.assertRaises(ValueError): - ORTDiffusionPipeline.from_pretrained( + ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="FooExecutionProvider" ) @@ -489,7 +489,7 @@ def test_passing_session_options_seq2seq(self): def test_passing_session_options_stable_diffusion(self): options = onnxruntime.SessionOptions() options.intra_op_num_threads = 3 - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options) self.assertEqual(model.unet.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.text_encoder.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.vae_decoder.session.get_session_options().intra_op_num_threads, 3) @@ -782,7 +782,7 @@ def test_seq2seq_model_on_rocm_ep_str(self): @require_torch_gpu @pytest.mark.cuda_ep_test def test_passing_provider_options_stable_diffusion(self): - model = ORTDiffusionPipeline.from_pretrained( + model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider" ) self.assertEqual( @@ -798,7 +798,7 @@ def test_passing_provider_options_stable_diffusion(self): self.assertEqual( model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "1" ) - model = ORTDiffusionPipeline.from_pretrained( + model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider", 
provider_options={"do_copy_in_default_stream": 0}, @@ -819,7 +819,7 @@ def test_passing_provider_options_stable_diffusion(self): @require_diffusers def test_stable_diffusion_model_on_cpu(self): - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") model.to(cpu) self.assertEqual(model.device, cpu) @@ -835,7 +835,7 @@ def test_stable_diffusion_model_on_cpu(self): @require_diffusers def test_stable_diffusion_model_on_cpu_str(self): - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") model.to("cpu") self.assertEqual(model.device, cpu) @@ -853,7 +853,7 @@ def test_stable_diffusion_model_on_cpu_str(self): @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu(self): - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) gpu = torch.device("cuda") model.to(gpu) self.assertEqual(model.device, torch.device("cuda:0")) @@ -872,7 +872,7 @@ def test_stable_diffusion_model_on_gpu(self): @require_ort_rocm @pytest.mark.rocm_ep_test def test_stable_diffusion_model_on_rocm_ep(self): - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) gpu = torch.device("cuda") model.to(gpu) self.assertEqual(model.device, torch.device("cuda:0")) @@ -889,21 +889,21 @@ def test_stable_diffusion_model_on_rocm_ep(self): @require_diffusers @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") def test_stable_diffusion_model_on_gpu_id(self): - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to(torch.device("cuda:1")) self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to(1) self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to("cuda:1") self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") 
self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") @@ -914,7 +914,7 @@ def test_stable_diffusion_model_on_gpu_id(self): @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu_str(self): - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to("cuda") self.assertEqual(model.device, torch.device("cuda:0")) self.assertEqual(model.unet.device, torch.device("cuda:0")) @@ -932,7 +932,7 @@ def test_stable_diffusion_model_on_gpu_str(self): @require_ort_rocm @pytest.mark.rocm_ep_test def test_stable_diffusion_model_on_rocm_ep_str(self): - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to("cuda") self.assertEqual(model.device, torch.device("cuda:0")) self.assertEqual(model.unet.device, torch.device("cuda:0")) @@ -990,7 +990,7 @@ def test_save_seq2seq_model_without_past(self): @require_diffusers def test_save_stable_diffusion_model(self): with tempfile.TemporaryDirectory() as tmpdirname: - model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) self.assertIn(model.config_name, folder_contents) @@ -1067,7 +1067,7 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): def test_save_load_stable_diffusion_model_with_external_data(self): with tempfile.TemporaryDirectory() as tmpdirname: os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1" # force exporting small model with external data - model = ORTDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True) + model = ORTStableDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True) model.save_pretrained(tmpdirname) # verify external data is exported @@ -1082,7 +1082,7 @@ def test_save_load_stable_diffusion_model_with_external_data(self): self.assertIn(ONNX_WEIGHTS_NAME + "_data", folder_contents) # verify loading from local folder works - model = ORTDiffusionPipeline.from_pretrained(tmpdirname, export=False) + model = ORTStableDiffusionPipeline.from_pretrained(tmpdirname, export=False) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") remove_directory(tmpdirname) @@ -1199,7 +1199,7 @@ def test_push_seq2seq_model_with_external_data_to_hub(self): def test_push_stable_diffusion_model_with_external_data_to_hub(self): with tempfile.TemporaryDirectory() as tmpdirname: os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1" # force exporting small model with external data - model = ORTDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True) + model = ORTStableDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True) model.save_pretrained( tmpdirname + "/onnx", token=os.environ.get("HF_AUTH_TOKEN", None), @@ -1209,7 +1209,7 @@ def test_push_stable_diffusion_model_with_external_data_to_hub(self): ) # verify loading from hub works - model = ORTDiffusionPipeline.from_pretrained( + model = ORTStableDiffusionPipeline.from_pretrained( MODEL_NAMES["stable-diffusion"] + "-onnx", export=False, token=os.environ.get("HF_AUTH_TOKEN", None), From 7a8396c7fc25ab4c9ce7a304b2eab5867101baee Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil 
Date: Thu, 12 Sep 2024 12:11:31 +0200 Subject: [PATCH 32/71] export vae encoder by returning its latent distribution parameters --- optimum/exporters/onnx/model_configs.py | 10 +++++----- optimum/exporters/utils.py | 24 +++--------------------- 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d4b15b2968..b8b8a14ebb 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1111,8 +1111,8 @@ def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]: class VaeEncoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-2 - # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu + ATOL_FOR_VALIDATION = 1e-4 + # The ONNX export of a VaeEncoder architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1131,13 +1131,13 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, + "latent_parameters": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } class VaeDecoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-3 - # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu + ATOL_FOR_VALIDATION = 1e-4 + # The ONNX export of a VaeDecoder architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index e2125736c4..949b54f468 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -46,11 +46,6 @@ from diffusers import ( DiffusionPipeline, - LatentConsistencyModelImg2ImgPipeline, - LatentConsistencyModelPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, @@ -92,27 +87,13 @@ def _get_submodels_for_export_diffusion( Returns the components of a Stable Diffusion model. """ - is_stable_diffusion = isinstance( - pipeline, (StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline) - ) is_stable_diffusion_xl = isinstance( pipeline, (StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline) ) - is_latent_consistency_model = isinstance( - pipeline, (LatentConsistencyModelPipeline, LatentConsistencyModelImg2ImgPipeline) - ) - if is_stable_diffusion_xl: projection_dim = pipeline.text_encoder_2.config.projection_dim - elif is_stable_diffusion: - projection_dim = pipeline.text_encoder.config.projection_dim - elif is_latent_consistency_model: - projection_dim = pipeline.text_encoder.config.projection_dim else: - raise ValueError( - f"The export of a DiffusionPipeline model with the class name {pipeline.__class__.__name__} is currently not supported in Optimum. " - "Please open an issue or submit a PR to add the support." 
- ) + projection_dim = pipeline.text_encoder.config.projection_dim models_for_export = {} @@ -139,7 +120,8 @@ def _get_submodels_for_export_diffusion( vae_encoder = copy.deepcopy(pipeline.vae) if not is_torch_greater_or_equal_than_2_1: vae_encoder = override_diffusers_2_0_attn_processors(vae_encoder) - vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()} + # we return the distribution parameters to be able to recreate it in the decoder + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} models_for_export["vae_encoder"] = vae_encoder # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 From 8458705e3316e2d77f0ff9c2e7be18c54b9c1597 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 12:27:01 +0200 Subject: [PATCH 33/71] fix the modeling to handle distributions --- optimum/onnxruntime/modeling_diffusion.py | 61 ++++++++++++++--------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 82caf02495..4638637d6d 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -33,6 +33,7 @@ LatentConsistencyModelPipeline, LMSDiscreteScheduler, PNDMScheduler, + SchedulerMixin, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, @@ -41,6 +42,7 @@ ) from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import VaeImageProcessor +from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available from huggingface_hub import snapshot_download @@ -90,34 +92,33 @@ class ORTPipeline(ORTModel): def __init__( self, - vae_decoder_session: ort.InferenceSession, - unet_session: ort.InferenceSession, - tokenizer: CLIPTokenizer, config: Dict[str, Any], - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + tokenizer: CLIPTokenizer, + scheduler: SchedulerMixin, + unet_session: ort.InferenceSession, feature_extractor: Optional[CLIPFeatureExtractor] = None, vae_encoder_session: Optional[ort.InferenceSession] = None, + vae_decoder_session: Optional[ort.InferenceSession] = None, text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, tokenizer_2: Optional[CLIPTokenizer] = None, use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + **kwargs, ): """ Args: - vae_decoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the VAE decoder - unet_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the U-NET. + config (`Dict[str, Any]`): + A config dictionary from which the model components will be instantiated. Make sure to only load + configuration files of compatible classes. tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) for the text encoder. - config (`Dict[str, Any]`): - A config dictionary from which the model components will be instantiated. Make sure to only load - configuration files of compatible classes. 
scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`): A scheduler to be used in combination with the U-NET component to denoise the encoded image latents. + unet_session (`ort.InferenceSession`): + The ONNX Runtime inference session associated to the U-NET. feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`): A model extracting features from generated images to be used as inputs for the `safety_checker` vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): @@ -134,14 +135,9 @@ def __init__( model_save_dir (`Optional[str]`, defaults to `None`): The directory under which the model exported to ONNX was saved. """ - self.shared_attributes_init( - model=vae_decoder_session, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) self._internal_dict = config - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_decoder_model_path = Path(vae_decoder_session._model_path) + self.shared_attributes_init(model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir) + self.unet = ORTModelUnet(unet_session, self) self.unet_model_path = Path(unet_session._model_path) @@ -153,11 +149,18 @@ def __init__( self.text_encoder = None if vae_encoder_session is not None: - self.vae_encoder_model_path = Path(vae_encoder_session._model_path) self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) + self.vae_encoder_model_path = Path(vae_encoder_session._model_path) else: - self.vae_encoder_model_path = None self.vae_encoder = None + self.vae_encoder_model_path = None + + if vae_decoder_session is not None: + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) + self.vae_decoder_model_path = Path(vae_decoder_session._model_path) + else: + self.vae_decoder = None + self.vae_decoder_model_path = None if text_encoder_2_session is not None: self.text_encoder_2_model_path = Path(text_encoder_2_session._model_path) @@ -166,11 +169,11 @@ def __init__( self.text_encoder_2_model_path = None self.text_encoder_2 = None + self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 - self.scheduler = scheduler self.feature_extractor = feature_extractor - self.safety_checker = None + self.safety_checker = kwargs.get("safety_checker", None) sub_models = { DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, @@ -185,7 +188,17 @@ def __init__( self._internal_dict[name] = ( ("diffusers", "OnnxRuntimeModel") if sub_models[name] is not None else (None, None) ) - self._internal_dict.pop("vae", None) + + # Create an Vae object to be used by the pipeline mixin with minimal changes + class Vae: + if self.vae_encoder is not None: + config = self.vae_encoder.config + decode = self.vae_decoder + if self.vae_decoder is not None: + config = self.vae_decoder.config + encode = self.vae_encoder + + self.vae = Vae() self.vae_scale_factor = 2 ** (len(self.vae_decoder.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -594,6 +607,8 @@ def forward(self, sample: Union[np.ndarray, torch.Tensor]): if "latent_sample" in model_outputs: model_outputs["latents"] = model_outputs.pop("latent_sample") + elif "latent_parameters" in model_outputs: + model_outputs["latent_dist"] = DiagonalGaussianDistribution(model_outputs.pop("latent_parameters")) return ModelOutput(**model_outputs) From 76e7f018011c03fd67a79d0a75216c94ab67ee63 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 12:27:23 +0200 Subject: [PATCH 
34/71] create vae class to minimize changes in pipeline mixins --- .../diffusers/pipeline_latent_consistency.py | 2 +- .../diffusers/pipeline_stable_diffusion.py | 44 ++----------------- .../pipeline_stable_diffusion_img2img.py | 6 +-- .../pipeline_stable_diffusion_inpaint.py | 10 ++--- .../diffusers/pipeline_stable_diffusion_xl.py | 15 ++++--- .../pipeline_stable_diffusion_xl_img2img.py | 42 +++++++----------- 6 files changed, 37 insertions(+), 82 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py index 505a8890ff..89e0bc00f1 100644 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py @@ -421,7 +421,7 @@ def __call__( denoised = denoised.to(prompt_embeds.dtype) if not output_type == "latent": image = self.vae_decoder( - denoised / self.vae_decoder.config.scaling_factor, + denoised / self.vae.config.scaling_factor, # return_dict=False, )[0] image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 5de9e3b682..a0ae2cb44f 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -225,30 +225,6 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds - def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): - dtype = next(self.image_encoder.parameters()).dtype - - if not isinstance(image, torch.Tensor): - image = self.feature_extractor(image, return_tensors="pt").pixel_values - - image = image.to(device=device, dtype=dtype) - if output_hidden_states: - image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] - image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) - uncond_image_enc_hidden_states = self.image_encoder( - torch.zeros_like(image), output_hidden_states=True - ).hidden_states[-2] - uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( - num_images_per_prompt, dim=0 - ) - return image_enc_hidden_states, uncond_image_enc_hidden_states - else: - image_embeds = self.image_encoder(image).image_embeds - image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - uncond_image_embeds = torch.zeros_like(image_embeds) - - return image_embeds, uncond_image_embeds - def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: has_nsfw_concept = None @@ -263,17 +239,6 @@ def run_safety_checker(self, image, device, dtype): ) return image, has_nsfw_concept - def decode_latents(self, latents): - latents = 1 / self.vae_decoder.config.scaling_factor * latents - image = self.vae_decoder( - latents, - # return_dict=False, - )[0] - image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - return image - def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
@@ -699,13 +664,10 @@ def __call__( callback(step_idx, t, latents) if not output_type == "latent": - image = self.vae_decoder( - latents / self.vae_decoder.config.get("scaling_factor"), + image = self.vae.decode( + latents / self.vae.config.get("scaling_factor"), # return_dict=False, - # generator=generator, - # TODO: in some models, it might be mandatory to pass generator here for reproducibility - ) - image = next(iter(image.values())) + )[0] image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) else: image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index f98daa3dc1..90f5666d11 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -139,7 +139,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt else: init_latents = retrieve_latents(self.vae_encoder(image), generator=generator) - init_latents = self.vae_decoder.config.scaling_factor * init_latents + init_latents = self.vae.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size @@ -438,7 +438,7 @@ def __call__( generator, ) - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + # 7. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7.1 Add image embeds for IP-Adapter @@ -508,7 +508,7 @@ def __call__( if not output_type == "latent": image = self.vae_decoder( - latents / self.vae_decoder.config.scaling_factor, + latents / self.vae.config.scaling_factor, # return_dict=False, # generator=generator, )[0] diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py index 2791b37b32..a232a75721 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py @@ -184,14 +184,14 @@ def prepare_latents( def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): if isinstance(generator, list): image_latents = [ - retrieve_latents(self.vae_encoder(image[i : i + 1]), generator=generator[i]) + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0]) ] image_latents = torch.cat(image_latents, dim=0) else: - image_latents = retrieve_latents(self.vae_encoder(image), generator=generator) + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) - image_latents = self.vae_encoder.config.scaling_factor * image_latents + image_latents = self.vae.config.scaling_factor * image_latents return image_latents @@ -590,7 +590,7 @@ def __call__( init_image = init_image.to(dtype=torch.float32) # 6. 
Prepare latent variables - num_channels_latents = self.vae_decoder.config.latent_channels + num_channels_latents = self.vae.config.latent_channels num_channels_unet = self.unet.config.in_channels return_image_latents = num_channels_unet == 4 @@ -753,7 +753,7 @@ def __call__( # mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype) # condition_kwargs = {"image": init_image_condition, "mask": mask_condition} image = self.vae_decoder( - latents / self.vae_decoder.config.scaling_factor, + latents / self.vae.config.scaling_factor, # return_dict=False, # generator=generator, **condition_kwargs, diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 5a48592626..66a64c20cd 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -943,30 +943,31 @@ def __call__( # unscale/denormalize the latents # denormalize with the mean and std if available and not None has_latents_mean = ( - hasattr(self.vae_decoder.config, "latents_mean") and self.vae_decoder.config.latents_mean is not None + hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None ) has_latents_std = ( - hasattr(self.vae_decoder.config, "latents_std") and self.vae_decoder.config.latents_std is not None + hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None ) if has_latents_mean and has_latents_std: latents_mean = ( - torch.tensor(self.vae_decoder.config.latents_mean) + torch.tensor(self.vae.config.latents_mean) .view(1, 4, 1, 1) .to(latents.device, latents.dtype) ) latents_std = ( - torch.tensor(self.vae_decoder.config.latents_std) + torch.tensor(self.vae.config.latents_std) .view(1, 4, 1, 1) .to(latents.device, latents.dtype) ) - latents = latents * latents_std / self.vae_decoder.config.scaling_factor + latents_mean + latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean else: - latents = latents / self.vae_decoder.config.scaling_factor + latents = latents / self.vae.config.scaling_factor - image = self.vae_decoder( + image = self.vae.decode( latents, # return_dict=False, )[0] + # cast back to fp16 if needed # if needs_upcasting: # self.vae.to(dtype=torch.float16) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py index 66b7c79320..64984ff22b 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py @@ -192,10 +192,10 @@ def prepare_latents( ) latents_mean = latents_std = None - if hasattr(self.vae_decoder.config, "latents_mean") and self.vae_decoder.config.latents_mean is not None: - latents_mean = torch.tensor(self.vae_decoder.config.latents_mean).view(1, 4, 1, 1) - if hasattr(self.vae_decoder.config, "latents_std") and self.vae_decoder.config.latents_std is not None: - latents_std = torch.tensor(self.vae_decoder.config.latents_std).view(1, 4, 1, 1) + if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None: + latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1) + if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None: + latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1) # Offload text encoder if `enable_model_cpu_offload` was enabled if hasattr(self, 
"final_offload_hook") and self.final_offload_hook is not None: @@ -211,7 +211,7 @@ def prepare_latents( else: # make sure the VAE is in float32 mode, as it overflows in float16 - # if self.vae_decoder.config.force_upcast: + # if self.vae.config.force_upcast: # image = image.float() # self.vae_decoder.to(dtype=torch.float32) @@ -230,23 +230,23 @@ def prepare_latents( ) init_latents = [ - retrieve_latents(self.vae_encoder(image[i : i + 1]), generator=generator[i]) + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) ] init_latents = torch.cat(init_latents, dim=0) else: - init_latents = retrieve_latents(self.vae_encoder(image), generator=generator) + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) - # if self.vae_decoder.config.force_upcast: + # if self.vae.config.force_upcast: # self.vae_decoder.to(dtype) init_latents = init_latents.to(dtype) if latents_mean is not None and latents_std is not None: latents_mean = latents_mean.to(device=device, dtype=dtype) latents_std = latents_std.to(device=device, dtype=dtype) - init_latents = (init_latents - latents_mean) * self.vae_decoder.config.scaling_factor / latents_std + init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std else: - init_latents = self.vae_decoder.config.scaling_factor * init_latents + init_latents = self.vae.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size @@ -877,7 +877,7 @@ def denoising_value_valid(dnv): if not output_type == "latent": # make sure the VAE is in float32 mode, as it overflows in float16 - # needs_upcasting = self.vae_decoder.dtype == torch.float16 and self.vae_decoder.config.force_upcast + # needs_upcasting = self.vae_decoder.dtype == torch.float16 and self.vae.config.force_upcast # if needs_upcasting: # self.upcast_vae() @@ -889,26 +889,18 @@ def denoising_value_valid(dnv): # unscale/denormalize the latents # denormalize with the mean and std if available and not None - has_latents_mean = ( - hasattr(self.vae_decoder.config, "latents_mean") and self.vae_decoder.config.latents_mean is not None - ) - has_latents_std = ( - hasattr(self.vae_decoder.config, "latents_std") and self.vae_decoder.config.latents_std is not None - ) + has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None + has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None if has_latents_mean and has_latents_std: latents_mean = ( - torch.tensor(self.vae_decoder.config.latents_mean) - .view(1, 4, 1, 1) - .to(latents.device, latents.dtype) + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) ) latents_std = ( - torch.tensor(self.vae_decoder.config.latents_std) - .view(1, 4, 1, 1) - .to(latents.device, latents.dtype) + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) ) - latents = latents * latents_std / self.vae_decoder.config.scaling_factor + latents_mean + latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean else: - latents = latents / self.vae_decoder.config.scaling_factor + latents = latents / self.vae.config.scaling_factor image = self.vae_decoder( latents, From 24ee099799f7ba3a4f0dd878f570c1d2d50a7893 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 12:27:41 +0200 Subject: [PATCH 35/71] remove 
unnecessary tests --- tests/onnxruntime/test_diffusion.py | 91 ++++------------------------- 1 file changed, 12 insertions(+), 79 deletions(-) diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 3d5e9d1599..d70f4f5663 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -127,59 +127,6 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images).images self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_prompt_embeds_to_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - - prompt = ["sailing ship in storm by Leonardo da Vinci"] - device = torch.device("cpu") - num_images_per_prompt = 1 - do_classifier_free_guidance = True - - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - - if model_arch == "stable-diffusion-xl": - ( - ort_prompt_embeds, - _, - ort_pooled_prompt_embeds, - _, - ) = ort_pipeline.encode_prompt(prompt) - ( - diffusers_prompt_embeds, - _, - diffusers_pooled_prompt_embeds, - _, - ) = diffusers_pipeline.encode_prompt(prompt) - np.testing.assert_allclose( - ort_prompt_embeds.detach().numpy(), - diffusers_prompt_embeds.detach().numpy(), - atol=1e-4, - rtol=1e-2, - ) - np.testing.assert_allclose( - ort_pooled_prompt_embeds.detach().numpy(), - diffusers_pooled_prompt_embeds.detach().numpy(), - atol=1e-4, - rtol=1e-2, - ) - else: - ort_prompt_embeds, _ = ort_pipeline.encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance - ) - diffusers_prompt_embeds, _ = diffusers_pipeline.encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance - ) - np.testing.assert_allclose( - ort_prompt_embeds.detach().numpy(), - diffusers_prompt_embeds.detach().numpy(), - atol=1e-4, - rtol=1e-2, - ) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): @@ -192,7 +139,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - for output_type in ["latent", "np"]: + for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images @@ -382,11 +329,6 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - - # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): @@ -473,10 +415,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, 
generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -587,11 +532,6 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - - # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): @@ -678,20 +618,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - latents_shape = ( - batch_size, - ort_pipeline.vae_decoder.config["latent_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, - ) - - np_latents = np.random.rand(*latents_shape).astype(np.float32) - torch_latents = torch.from_numpy(np_latents) + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, latents=np_latents).images - diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers From 83f2dcc88144d308775c9b04ea1c3a6fe5f2f1d4 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 12:31:20 +0200 Subject: [PATCH 36/71] style --- .../diffusers/pipeline_stable_diffusion_xl.py | 16 ++++------------ tests/onnxruntime/test_modeling.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 66a64c20cd..704c6fd3ad 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -942,22 +942,14 @@ def __call__( # unscale/denormalize the latents # denormalize with the mean and std if available and not None - has_latents_mean = ( - hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None - ) - has_latents_std = ( - hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None - ) + has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None + has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None if has_latents_mean and has_latents_std: latents_mean = ( - 
torch.tensor(self.vae.config.latents_mean) - .view(1, 4, 1, 1) - .to(latents.device, latents.dtype) + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) ) latents_std = ( - torch.tensor(self.vae.config.latents_std) - .view(1, 4, 1, 1) - .to(latents.device, latents.dtype) + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) ) latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean else: diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index edcab8b228..af3d47f29d 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -213,7 +213,9 @@ def test_load_seq2seq_model_from_empty_cache(self): def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) + model = ORTStableDiffusionPipeline.from_pretrained( + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True + ) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) @@ -229,7 +231,9 @@ def test_load_stable_diffusion_model_from_empty_cache(self): remove_directory(dirpath) with self.assertRaises(Exception): - _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) + _ = ORTStableDiffusionPipeline.from_pretrained( + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True + ) @require_torch_gpu @pytest.mark.cuda_ep_test @@ -489,7 +493,9 @@ def test_passing_session_options_seq2seq(self): def test_passing_session_options_stable_diffusion(self): options = onnxruntime.SessionOptions() options.intra_op_num_threads = 3 - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options) + model = ORTStableDiffusionPipeline.from_pretrained( + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options + ) self.assertEqual(model.unet.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.text_encoder.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.vae_decoder.session.get_session_options().intra_op_num_threads, 3) From 036dc46b09b43a1c189e234768b79cdbdb54c7a0 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 12:32:57 +0200 Subject: [PATCH 37/71] style --- tests/onnxruntime/test_modeling.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index edcab8b228..af3d47f29d 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -213,7 +213,9 @@ def test_load_seq2seq_model_from_empty_cache(self): def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) + model = ORTStableDiffusionPipeline.from_pretrained( + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True + ) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) @@ -229,7 +231,9 @@ def 
test_load_stable_diffusion_model_from_empty_cache(self): remove_directory(dirpath) with self.assertRaises(Exception): - _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True) + _ = ORTStableDiffusionPipeline.from_pretrained( + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True + ) @require_torch_gpu @pytest.mark.cuda_ep_test @@ -489,7 +493,9 @@ def test_passing_session_options_seq2seq(self): def test_passing_session_options_stable_diffusion(self): options = onnxruntime.SessionOptions() options.intra_op_num_threads = 3 - model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options) + model = ORTStableDiffusionPipeline.from_pretrained( + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options + ) self.assertEqual(model.unet.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.text_encoder.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.vae_decoder.session.get_session_options().intra_op_num_threads, 3) From c1a75c4c8adf28180416f799ac49c33b388b87b8 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 12:58:24 +0200 Subject: [PATCH 38/71] update diffusion models export test --- tests/exporters/onnx/test_onnx_export.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index d1471aa218..13aed4c8e3 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -292,27 +292,22 @@ def _onnx_export( gc.collect() - def _onnx_export_sd(self, model_type: str, model_name: str, device="cpu"): + def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device="cpu"): pipeline = TasksManager.get_model_from_task(model_type, model_name, device=device) models_and_onnx_configs = get_diffusion_models_for_export(pipeline) - output_names = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] - model, _ = models_and_onnx_configs["vae_encoder"] - model.forward = lambda sample: {"latent_sample": model.encode(x=sample)["latent_dist"].parameters} with TemporaryDirectory() as tmpdirname: _, onnx_outputs = export_models( models_and_onnx_configs=models_and_onnx_configs, opset=14, output_dir=Path(tmpdirname), - output_names=output_names, device=device, ) validate_models_outputs( models_and_onnx_configs=models_and_onnx_configs, onnx_named_outputs=onnx_outputs, output_dir=Path(tmpdirname), - atol=1e-3, - onnx_files_subpaths=output_names, + atol=1e-4, use_subprocess=False, ) @@ -403,7 +398,7 @@ def test_tensorflow_export( @require_vision @require_diffusers def test_pytorch_export_for_diffusion_models(self, model_type, model_name): - self._onnx_export_sd(model_type, model_name) + self._onnx_export_diffusion_models(model_type, model_name) @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch @@ -414,7 +409,7 @@ def test_pytorch_export_for_diffusion_models(self, model_type, model_name): @pytest.mark.run_slow @pytest.mark.gpu_test def test_pytorch_export_for_diffusion_models_cuda(self, model_type, model_name): - self._onnx_export_sd(model_type, model_name, device="cuda") + self._onnx_export_diffusion_models(model_type, model_name, device="cuda") class CustomWhisperOnnxConfig(WhisperOnnxConfig): From 48f1329b7607982bb9f01849a1a61eb57c566349 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 
13:21:16 +0200 Subject: [PATCH 39/71] style --- tests/exporters/onnx/test_onnx_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index 13aed4c8e3..7671d6cd2e 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -43,7 +43,7 @@ from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.onnx.model_configs import WhisperOnnxConfig from optimum.exporters.onnx.utils import get_speecht5_models_for_export -from optimum.utils import ONNX_WEIGHTS_NAME, DummyPastKeyValuesGenerator, NormalizedTextConfig +from optimum.utils import DummyPastKeyValuesGenerator, NormalizedTextConfig from optimum.utils.save_utils import maybe_load_preprocessors from optimum.utils.testing_utils import grid_parameters, require_diffusers From 5814a340eaa2df5c50a557f051f51863209bb3d5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 16:22:48 +0200 Subject: [PATCH 40/71] fall back for when block_out_channels is not in vae config --- optimum/onnxruntime/modeling_diffusion.py | 93 ++++++++--------------- optimum/pipelines/diffusers/watermark.py | 31 -------- 2 files changed, 30 insertions(+), 94 deletions(-) delete mode 100644 optimum/pipelines/diffusers/watermark.py diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 4638637d6d..7a9926635c 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -29,10 +29,7 @@ AutoPipelineForInpainting, AutoPipelineForText2Image, ConfigMixin, - DDIMScheduler, LatentConsistencyModelPipeline, - LMSDiscreteScheduler, - PNDMScheduler, SchedulerMixin, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, @@ -80,6 +77,9 @@ ) +if is_invisible_watermark_available(): + from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + logger = logging.getLogger(__name__) @@ -169,6 +169,17 @@ def __init__( self.text_encoder_2_model_path = None self.text_encoder_2 = None + # Create an Vae object to be used by the pipeline mixin with minimal changes + class Vae: + if self.vae_decoder is not None: + config = self.vae_decoder.config + encode = self.vae_encoder + + if self.vae_encoder is not None: + decode = self.vae_decoder + + self.vae = Vae() + self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 @@ -189,18 +200,11 @@ def __init__( ("diffusers", "OnnxRuntimeModel") if sub_models[name] is not None else (None, None) ) - # Create an Vae object to be used by the pipeline mixin with minimal changes - class Vae: - if self.vae_encoder is not None: - config = self.vae_encoder.config - decode = self.vae_decoder - if self.vae_decoder is not None: - config = self.vae_decoder.config - encode = self.vae_encoder - - self.vae = Vae() + if hasattr(self.vae.config, "block_out_channels"): + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + else: + self.vae_scale_factor = 8 - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.mask_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True @@ -661,55 +665,8 @@ class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMi __call__ = 
LatentConsistencyPipelineMixin.__call__ -class ORTStableDiffusionXLPipelineBase(ORTPipeline): - def __init__( - self, - vae_decoder_session: ort.InferenceSession, - text_encoder_session: ort.InferenceSession, - unet_session: ort.InferenceSession, - config: Dict[str, Any], - tokenizer: CLIPTokenizer, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - feature_extractor: Optional[CLIPFeatureExtractor] = None, - vae_encoder_session: Optional[ort.InferenceSession] = None, - text_encoder_2_session: Optional[ort.InferenceSession] = None, - tokenizer_2: Optional[CLIPTokenizer] = None, - use_io_binding: Optional[bool] = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - add_watermarker: Optional[bool] = None, - ): - super().__init__( - vae_decoder_session=vae_decoder_session, - text_encoder_session=text_encoder_session, - unet_session=unet_session, - config=config, - tokenizer=tokenizer, - scheduler=scheduler, - feature_extractor=feature_extractor, - vae_encoder_session=vae_encoder_session, - text_encoder_2_session=text_encoder_2_session, - tokenizer_2=tokenizer_2, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) - - add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() - - if add_watermarker: - if not is_invisible_watermark_available(): - raise ImportError( - "`add_watermarker` requires invisible-watermark to be installed, which can be installed with `pip install invisible-watermark`." - ) - - from ..pipelines.diffusers.watermark import StableDiffusionXLWatermarker - - self.watermark = StableDiffusionXLWatermarker() - else: - self.watermark = None - - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLPipelineMixin): +class ORTStableDiffusionXLPipeline(ORTPipeline, StableDiffusionXLPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ @@ -719,9 +676,19 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu __call__ = StableDiffusionXLPipelineMixin.__call__ + def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): + super().__init__(*args, **kwargs) + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin): +class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
""" diff --git a/optimum/pipelines/diffusers/watermark.py b/optimum/pipelines/diffusers/watermark.py deleted file mode 100644 index b3cd622eda..0000000000 --- a/optimum/pipelines/diffusers/watermark.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np -from imwatermark import WatermarkEncoder - - -WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110 -WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] - - -# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion_xl/watermark.py#L12 -class StableDiffusionXLWatermarker: - def __init__(self): - self.watermark = WATERMARK_BITS - self.encoder = WatermarkEncoder() - self.encoder.set_watermark("bits", self.watermark) - - def apply_watermark(self, images: np.array): - # can't encode images that are smaller than 256 - if images.shape[-1] < 256: - return images - - # cv2 doesn't support float16 - if images.dtype == np.float16: - images = images.astype(np.float32) - - images = (255 * (images / 2 + 0.5)).transpose((0, 2, 3, 1)) - - images = np.array([self.encoder.encode(image, "dwtDct") for image in images]).transpose((0, 3, 1, 2)) - - np.clip(2 * (images / 255 - 0.5), -1.0, 1.0, out=images) - - return images From afbb9afc99c556a4dae3cbc2207f1d62e045388b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 12 Sep 2024 16:30:10 +0200 Subject: [PATCH 41/71] remove model parts from optimum.onnxruntime --- optimum/onnxruntime/__init__.py | 16 --------- optimum/utils/dummy_diffusers_objects.py | 44 ------------------------ tests/onnxruntime/test_modeling.py | 16 ++++++--- 3 files changed, 11 insertions(+), 65 deletions(-) diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index a6e3c13979..09a48ec955 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -83,10 +83,6 @@ "ORTPipelineForInpainting", "ORTPipelineForText2Image", "ORTDiffusionPipeline", - "ORTModelTextEncoder", - "ORTModelUnet", - "ORTModelVaeDecoder", - "ORTModelVaeEncoder", ] else: _import_structure["modeling_diffusion"] = [ @@ -100,10 +96,6 @@ "ORTPipelineForInpainting", "ORTPipelineForText2Image", "ORTDiffusionPipeline", - "ORTModelTextEncoder", - "ORTModelUnet", - "ORTModelVaeDecoder", - "ORTModelVaeEncoder", ] @@ -155,10 +147,6 @@ from ..utils.dummy_diffusers_objects import ( ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, @@ -172,10 +160,6 @@ from .modeling_diffusion import ( ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index f63d3a603c..35d1ffe9fc 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -123,47 +123,3 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) - - -class ORTModelTextEncoder(metaclass=DummyObject): - _backends = ["diffusers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["diffusers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["diffusers"]) - - -class 
ORTModelVaeDecoder(metaclass=DummyObject): - _backends = ["diffusers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["diffusers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["diffusers"]) - - -class ORTModelVaeEncoder(metaclass=DummyObject): - _backends = ["diffusers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["diffusers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["diffusers"]) - - -class ORTModelUnet(metaclass=DummyObject): - _backends = ["diffusers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["diffusers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["diffusers"]) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index af3d47f29d..199b96342e 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -89,11 +89,6 @@ ORTModelForSpeechSeq2Seq, ORTModelForTokenClassification, ORTModelForVision2Seq, - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, - ORTStableDiffusionPipeline, ) from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder from optimum.onnxruntime.modeling_ort import ORTModel @@ -106,6 +101,7 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) +from optimum.utils.import_utils import is_diffusers_available from optimum.utils.testing_utils import ( grid_parameters, remove_directory, @@ -115,6 +111,16 @@ ) +if is_diffusers_available(): + from optimum.onnxruntime.modeling_diffusion import ( + ORTModelTextEncoder, + ORTModelUnet, + ORTModelVaeDecoder, + ORTModelVaeEncoder, + ORTStableDiffusionPipeline, + ) + + logger = logging.get_logger() From 53eedc6646c51ddca94a9a2b061247969be06671 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 13 Sep 2024 11:06:24 +0200 Subject: [PATCH 42/71] added .to to model parts --- optimum/onnxruntime/base.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index 0e54bafed7..845780cafa 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -71,6 +71,25 @@ def dtype(self): return None + def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, torch.device): + device = arg + elif isinstance(arg, torch.dtype): + dtype = arg + + if device is not None and device != self.device: + raise ValueError( + "Cannot change the device of a model part without changing the device of the parent model. " + "Please use the `to` method of the parent model to change the device." + ) + + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " + f"Please export the model with the desired dtype." 
+ ) + @abstractmethod def forward(self, *args, **kwargs): pass From a70620475d2325a31d9a569e07b7d602c255dae2 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 13 Sep 2024 11:17:31 +0200 Subject: [PATCH 43/71] remove custom mixins --- optimum/onnxruntime/modeling_diffusion.py | 281 +++-- optimum/onnxruntime/utils.py | 20 + .../diffusers/pipeline_latent_consistency.py | 445 -------- .../diffusers/pipeline_stable_diffusion.py | 732 ------------- .../pipeline_stable_diffusion_img2img.py | 533 ---------- .../pipeline_stable_diffusion_inpaint.py | 782 -------------- .../diffusers/pipeline_stable_diffusion_xl.py | 982 ------------------ .../pipeline_stable_diffusion_xl_img2img.py | 928 ----------------- optimum/pipelines/diffusers/pipeline_utils.py | 54 - tests/onnxruntime/test_diffusion.py | 14 +- 10 files changed, 233 insertions(+), 4538 deletions(-) delete mode 100644 optimum/pipelines/diffusers/pipeline_latent_consistency.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py delete mode 100644 optimum/pipelines/diffusers/pipeline_utils.py diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 7a9926635c..6dfc3dbac1 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -53,12 +53,6 @@ from ..exporters.onnx import main_export from ..onnx.utils import _get_external_data_paths -from ..pipelines.diffusers.pipeline_latent_consistency import LatentConsistencyPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin from ..utils import ( DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, @@ -72,6 +66,7 @@ from .utils import ( ONNX_WEIGHTS_NAME, get_provider_for_device, + np_to_pt, parse_device, validate_provider_availability, ) @@ -135,12 +130,8 @@ def __init__( model_save_dir (`Optional[str]`, defaults to `None`): The directory under which the model exported to ONNX was saved. 
""" - self._internal_dict = config - self.shared_attributes_init(model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir) - - self.unet = ORTModelUnet(unet_session, self) - self.unet_model_path = Path(unet_session._model_path) + # Text encoder if text_encoder_session is not None: self.text_encoder_model_path = Path(text_encoder_session._model_path) self.text_encoder = ORTModelTextEncoder(text_encoder_session, self) @@ -148,20 +139,11 @@ def __init__( self.text_encoder_model_path = None self.text_encoder = None - if vae_encoder_session is not None: - self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) - self.vae_encoder_model_path = Path(vae_encoder_session._model_path) - else: - self.vae_encoder = None - self.vae_encoder_model_path = None - - if vae_decoder_session is not None: - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_decoder_model_path = Path(vae_decoder_session._model_path) - else: - self.vae_decoder = None - self.vae_decoder_model_path = None + # U-Net + self.unet = ORTModelUnet(unet_session, self) + self.unet_model_path = Path(unet_session._model_path) + # Text encoder 2 if text_encoder_2_session is not None: self.text_encoder_2_model_path = Path(text_encoder_2_session._model_path) self.text_encoder_2 = ORTModelTextEncoder(text_encoder_2_session, self) @@ -169,16 +151,20 @@ def __init__( self.text_encoder_2_model_path = None self.text_encoder_2 = None - # Create an Vae object to be used by the pipeline mixin with minimal changes - class Vae: - if self.vae_decoder is not None: - config = self.vae_decoder.config - encode = self.vae_encoder + # VAE + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) + self.vae_decoder_model_path = Path(vae_decoder_session._model_path) - if self.vae_encoder is not None: - decode = self.vae_decoder + if vae_encoder_session is not None: + self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) + self.vae_encoder_model_path = Path(vae_encoder_session._model_path) + else: + self.vae_encoder = None + self.vae_encoder_model_path = None - self.vae = Vae() + # We create VAE encoder & decoder and wrap them in one object to + # be used by the pipeline mixins with minimal code changes (simulating the diffusers API) + self.vae = ORTVaeWrapper(self.vae_encoder, self.vae_decoder, self) self.scheduler = scheduler self.tokenizer = tokenizer @@ -186,29 +172,30 @@ class Vae: self.feature_extractor = feature_extractor self.safety_checker = kwargs.get("safety_checker", None) + if hasattr(self.vae.config, "block_out_channels"): + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + else: + self.vae_scale_factor = 8 + + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) + sub_models = { - DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder, + DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2, } # Modify config to keep the resulting model compatible with diffusers pipelines for name in sub_models.keys(): - self._internal_dict[name] = ( - ("diffusers", "OnnxRuntimeModel") if sub_models[name] is not None else (None, None) - ) - - if 
hasattr(self.vae.config, "block_out_channels"): - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - else: - self.vae_scale_factor = 8 + config[name] = ("diffusers", "OnnxRuntimeModel") if sub_models[name] is not None else (None, None) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.mask_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True - ) + self._internal_dict = FrozenDict(config) + self.shared_attributes_init(model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir) @staticmethod def load_model( @@ -512,12 +499,25 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): def _save_config(self, save_directory): self.save_config(save_directory) + @property + def _execution_device(self): + return self.device + + def __call__(self, *args, **kwargs): + device = self._execution_device + + for i in range(len(args)): + args[i] = np_to_pt(args[i], device) + + for k, v in kwargs.items(): + kwargs[k] = np_to_pt(v, device) + + return self.auto_model_class.__call__(self, *args, **kwargs) -class ORTPipelinePart(ORTModelPart): - CONFIG_NAME = "config.json" +class ORTPipelinePart(ORTModelPart): def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): - config_path = Path(session._model_path).parent / self.CONFIG_NAME + config_path = Path(session._model_path).parent / "config.json" if config_path.is_file(): self.config = FrozenDict(parent_model._dict_from_json_file(config_path)) @@ -533,7 +533,13 @@ def input_dtype(self): class ORTModelTextEncoder(ORTPipelinePart): - def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): + def forward( + self, + input_ids: Union[np.ndarray, torch.Tensor], + attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, + output_hidden_states: Optional[bool] = None, + return_dict: bool = False, + ): use_torch = isinstance(input_ids, torch.Tensor) model_inputs = {"input_ids": input_ids} @@ -542,15 +548,18 @@ def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): onnx_outputs = self.session.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - if any("hidden_states" in model_output for model_output in model_outputs): + if output_hidden_states: model_outputs["hidden_states"] = [] - for i in range(self.config.num_hidden_layers): model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) - - # exporter doesnt duplicate last hidden state for some reason + # exporter doesnt duplicate last hidden state so we need to add it manually # (only returned once as last_hidden_state and not part of the list of hidden_states) model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) + else: + model_outputs.pop("hidden_states", None) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) @@ -564,9 +573,15 @@ def forward( text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = False, ): use_torch = isinstance(sample, torch.Tensor) + if len(timestep.shape) == 0: + timestep = timestep.unsqueeze(0) + model_inputs = { "sample": sample, "timestep": timestep, @@ -574,20 +589,25 @@ def forward( 
"text_embeds": text_embeds, "time_ids": time_ids, "timestep_cond": timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), } onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + if return_dict: + return model_outputs + return ModelOutput(**model_outputs) -class ORTModelVaeDecoder(ORTPipelinePart): - def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): - use_torch = isinstance(latent_sample, torch.Tensor) +class ORTModelVaeEncoder(ORTPipelinePart): + def forward(self, sample: Union[np.ndarray, torch.Tensor], return_dict: bool = False): + use_torch = isinstance(sample, torch.Tensor) - model_inputs = {"latent_sample": latent_sample} + model_inputs = {"sample": sample} onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) @@ -595,15 +615,27 @@ def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): if "latent_sample" in model_outputs: model_outputs["latents"] = model_outputs.pop("latent_sample") + elif "latent_parameters" in model_outputs: + model_outputs["latent_dist"] = DiagonalGaussianDistribution( + parameters=model_outputs.pop("latent_parameters") + ) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) -class ORTModelVaeEncoder(ORTPipelinePart): - def forward(self, sample: Union[np.ndarray, torch.Tensor]): - use_torch = isinstance(sample, torch.Tensor) +class ORTModelVaeDecoder(ORTPipelinePart): + def forward( + self, + latent_sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): + use_torch = isinstance(latent_sample, torch.Tensor) - model_inputs = {"sample": sample} + model_inputs = {"latent_sample": latent_sample} onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) @@ -611,14 +643,24 @@ def forward(self, sample: Union[np.ndarray, torch.Tensor]): if "latent_sample" in model_outputs: model_outputs["latents"] = model_outputs.pop("latent_sample") - elif "latent_parameters" in model_outputs: - model_outputs["latent_dist"] = DiagonalGaussianDistribution(model_outputs.pop("latent_parameters")) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) +class ORTVaeWrapper(ORTPipelinePart): + def __init__(self, vae_encoder: ORTModelVaeEncoder, vae_decoder: ORTModelVaeDecoder, parent_model: ORTPipeline): + self.encode = vae_encoder.forward + self.decode = vae_decoder.forward + + super().__init__(vae_decoder.session, parent_model) + + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). 
""" @@ -626,11 +668,9 @@ class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): main_input_name = "prompt" auto_model_class = StableDiffusionPipeline - __call__ = StableDiffusionPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ @@ -638,11 +678,9 @@ class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipel main_input_name = "prompt" auto_model_class = StableDiffusionImg2ImgPipeline - __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ @@ -650,11 +688,9 @@ class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipel main_input_name = "prompt" auto_model_class = StableDiffusionInpaintPipeline - __call__ = StableDiffusionInpaintPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyModelPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ @@ -662,11 +698,9 @@ class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMi main_input_name = "prompt" auto_model_class = LatentConsistencyModelPipeline - __call__ = LatentConsistencyPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLPipeline(ORTPipeline, StableDiffusionXLPipelineMixin): +class ORTStableDiffusionXLPipeline(ORTPipeline, StableDiffusionXLPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). 
""" @@ -674,11 +708,14 @@ class ORTStableDiffusionXLPipeline(ORTPipeline, StableDiffusionXLPipelineMixin): main_input_name = "prompt" auto_model_class = StableDiffusionXLPipeline - __call__ = StableDiffusionXLPipelineMixin.__call__ - def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): super().__init__(*args, **kwargs) + requires_aesthetics_score = kwargs.get("requires_aesthetics_score", False) + force_zeros_for_empty_prompt = kwargs.get("force_zeros_for_empty_prompt", True) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() if add_watermarker: @@ -686,9 +723,28 @@ def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): else: self.watermark = None + # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + # passed_add_embed_dim = ( + # self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + # ) + # expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + # if expected_add_embed_dim != passed_add_embed_dim: + # raise ValueError( + # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + # ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipelineMixin): +class ORTStableDiffusionXLImg2ImgPipeline(ORTPipeline, StableDiffusionXLImg2ImgPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
""" @@ -696,7 +752,72 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipeline, StableDi main_input_name = "prompt" auto_model_class = StableDiffusionXLImg2ImgPipeline - __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): + super().__init__(*args, **kwargs) + + requires_aesthetics_score = kwargs.get("requires_aesthetics_score", False) + force_zeros_for_empty_prompt = kwargs.get("force_zeros_for_empty_prompt", True) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + # passed_add_embed_dim = ( + # self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + # ) + # expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + # if ( + # expected_add_embed_dim > passed_add_embed_dim + # and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim + # ): + # raise ValueError( + # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." + # ) + # elif ( + # expected_add_embed_dim < passed_add_embed_dim + # and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim + # ): + # raise ValueError( + # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." + # ) + # elif expected_add_embed_dim != passed_add_embed_dim: + # raise ValueError( + # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
+ # ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids SUPPORTED_ORT_PIPELINES = [ diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index ad40af92b9..67bde37a33 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -401,3 +401,23 @@ def evaluation_loop( metrics = {} return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset)) + + +def np_to_pt(np_object, device): + if isinstance(np_object, np.ndarray): + if np_object.ndim == 4: + return torch.from_numpy(np_object).permute(0, 3, 1, 2) + elif np_object.ndim == 3: + return torch.from_numpy(np_object).permute(2, 0, 1) + else: + return torch.from_numpy(np_object) + elif isinstance(np_object, list) and isinstance(np_object[0], np.ndarray): + return [np_to_pt(a, device) for a in np_object] + elif isinstance(np_object, dict) and isinstance(next(iter(np_object.values())), np.ndarray): + return {k: np_to_pt(v, device) for k, v in np_object.items()} + elif isinstance(np_object, np.random.RandomState): + return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0])) + elif isinstance(np_object, list) and isinstance(np_object[0], np.random.RandomState): + return [torch.Generator(device=device).manual_seed(int(a.get_state()[1][0])) for a in np_object] + else: + return np_object diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py deleted file mode 100644 index 89e0bc00f1..0000000000 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import torch -from diffusers.image_processor import PipelineImageInput -from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_timesteps -from diffusers.utils.deprecation_utils import deprecate - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -logger = logging.getLogger(__name__) - - -class LatentConsistencyPipelineMixin(StableDiffusionPipelineMixin): - _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] - _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"] - - def get_guidance_scale_embedding( - self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.Tensor: - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - w (`torch.Tensor`): - Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings. 
- embedding_dim (`int`, *optional*, defaults to 512): - Dimension of the embeddings to generate. - dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): - Data type of the generated embeddings. - - Returns: - `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. - """ - assert len(w.shape) == 1 - w = w * 1000.0 - - half_dim = embedding_dim // 2 - emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) - emb = w.to(dtype)[:, None] * emb[None, :] - emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) - if embedding_dim % 2 == 1: # zero pad - emb = torch.nn.functional.pad(emb, (0, 1)) - assert emb.shape == (w.shape[0], embedding_dim) - return emb - - # Currently StableDiffusionPipeline.check_inputs with negative prompt stuff removed - def check_inputs( - self, - prompt: Union[str, List[str]], - height: int, - width: int, - callback_steps: int, - prompt_embeds: Optional[torch.Tensor] = None, - ip_adapter_image=None, - ip_adapter_image_embeds=None, - callback_on_step_end_tensor_inputs=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if ip_adapter_image is not None and ip_adapter_image_embeds is not None: - raise ValueError( - "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
- ) - - if ip_adapter_image_embeds is not None: - if not isinstance(ip_adapter_image_embeds, list): - raise ValueError( - f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" - ) - elif ip_adapter_image_embeds[0].ndim not in [3, 4]: - raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" - ) - - @property - def guidance_scale(self): - return self._guidance_scale - - @property - def cross_attention_kwargs(self): - return self._cross_attention_kwargs - - @property - def clip_skip(self): - return self._clip_skip - - @property - def do_classifier_free_guidance(self): - return False - - @property - def num_timesteps(self): - return self._num_timesteps - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 4, - original_inference_steps: int = None, - timesteps: List[int] = None, - guidance_scale: float = 8.5, - num_images_per_prompt: Optional[int] = 1, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.Tensor] = None, - prompt_embeds: Optional[torch.Tensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - original_inference_steps (`int`, *optional*): - The original number of inference steps use to generate a linearly-spaced timestep schedule, from which - we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule, - following the Skipping-Step method in the paper (see Section 4.3). If not set this will default to the - scheduler's `original_inference_steps` attribute. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps on the original LCM training/distillation timestep schedule are used. Must be in descending - order. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - Note that the original latent consistency models paper uses a different CFG formulation where the - guidance scales are decreased by 1 (so in the paper formulation CFG is enabled when `guidance_scale > - 0`). 
- num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.Tensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): - Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of - IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should - contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. 
- """ - # must have a compatible torch device - device = self.device - - # convert numpy arrays to torch tensors - latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents - prompt_embeds = ( - self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds - ) - ip_adapter_image_embeds = ( - [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] - if ip_adapter_image_embeds is not None - else ip_adapter_image_embeds - ) - - for k, v in kwargs.items(): - if isinstance(v, np.ndarray): - kwargs[k] = self.np_to_pt(v, device) - elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): - kwargs[k] = [self.np_to_pt(i, device) for i in v] - elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): - kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} - - generator = ( - self.np_to_pt(generator, device) - if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) - or isinstance(generator, np.random.RandomState) - else generator - ) - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - prompt_embeds, - ip_adapter_image, - ip_adapter_image_embeds, - callback_on_step_end_tensor_inputs, - ) - self._guidance_scale = guidance_scale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, - ip_adapter_image_embeds, - device, - batch_size * num_images_per_prompt, - self.do_classifier_free_guidance, - ) - - # 3. Encode input prompt - lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - - # NOTE: when a LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided - # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the - # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts. - prompt_embeds, _ = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - self.do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=None, - lora_scale=lora_scale, - clip_skip=self.clip_skip, - ) - - # 4. 
Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, original_inference_steps=original_inference_steps - ) - - # 5. Prepare latent variable - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - bs = batch_size * num_images_per_prompt - - # 6. Get Guidance Scale Embedding - # NOTE: We use the Imagen CFG formulation that StableDiffusionPipeline uses rather than the original LCM paper - # CFG formulation, so we need to subtract 1 from the input guidance_scale. - # LCM CFG formulation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond), (cfg_scale > 0.0 using CFG) - w = torch.tensor(self.guidance_scale - 1).repeat(bs) - w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.time_cond_proj_dim).to( - device=device, dtype=latents.dtype - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None) - - # 7.1 Add image embeds for IP-Adapter - added_cond_kwargs = ( - {"image_embeds": image_embeds} - if ip_adapter_image is not None or ip_adapter_image_embeds is not None - else None - ) - - # 8. LCM MultiStep Sampling Loop: - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - self._num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - latents = latents.to(prompt_embeds.dtype) - - # model prediction (v-prediction, eps, x) - model_pred = self.unet( - latents, - timestep=t.unsqueeze(0), - timestep_cond=w_embedding, - encoder_hidden_states=prompt_embeds, - **(cross_attention_kwargs or {}), - **(added_cond_kwargs or {}), - # cross_attention_kwargs=cross_attention_kwargs, - # added_cond_kwargs=added_cond_kwargs, - # return_dict=False, - )[0] - - # compute the previous noisy sample x_t -> x_t-1 - latents, denoised = self.scheduler.step(model_pred, t, latents, **extra_step_kwargs, return_dict=False) - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - w_embedding = callback_outputs.pop("w_embedding", w_embedding) - denoised = callback_outputs.pop("denoised", denoised) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - denoised = denoised.to(prompt_embeds.dtype) - if not output_type == "latent": - image = self.vae_decoder( - denoised / self.vae.config.scaling_factor, - # return_dict=False, - )[0] - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = denoised - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, 
do_denormalize=do_denormalize) - - # Offload all models - # self.maybe_free_model_hooks() - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py deleted file mode 100644 index a0ae2cb44f..0000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ /dev/null @@ -1,732 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import torch -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback -from diffusers.image_processor import PipelineImageInput -from diffusers.loaders.textual_inversion import TextualInversionLoaderMixin -from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg, retrieve_timesteps -from diffusers.utils.deprecation_utils import deprecate -from diffusers.utils.torch_utils import randn_tensor - -from .pipeline_utils import DiffusionPipelineMixin - - -logger = logging.getLogger(__name__) - - -class StableDiffusionPipelineMixin(DiffusionPipelineMixin, TextualInversionLoaderMixin): - _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] - _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] - - def encode_prompt( - self, - prompt: str, - device: Optional[torch.device] = None, - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - lora_scale: Optional[float] = None, - clip_skip: Optional[int] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - """ - device = device or self.device - - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - # if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): - # self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - # if not USE_PEFT_BACKEND: - # adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - # else: - # scale_lora_layers(self.text_encoder, lora_scale) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: process multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - # if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - # attention_mask = text_inputs.attention_mask.to(device) - # else: - # attention_mask = None - - if clip_skip is None: - prompt_embeds = self.text_encoder( - text_input_ids.to(device), - # attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - else: - prompt_embeds = self.text_encoder( - text_input_ids.to(device), - # attention_mask=attention_mask, - # output_hidden_states=True, - ) - # Access the `hidden_states` first, that contains a tuple of - # all the hidden states from the encoder layers. Then index into - # the tuple to access the hidden states from the desired layer. - prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] - # We also need to apply the final LayerNorm here to not mess with the - # representations. The `last_hidden_states` that we typically use for - # obtaining the final prompt representations passes through the LayerNorm - # layer. 
- prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) - - if self.text_encoder is not None: - prompt_embeds_dtype = self.text_encoder.dtype - elif self.unet is not None: - prompt_embeds_dtype = self.unet.dtype - else: - prompt_embeds_dtype = prompt_embeds.dtype - - prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: process multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - # if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - # attention_mask = uncond_input.attention_mask.to(device) - # else: - # attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), - # attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - # if self.text_encoder is not None: - # if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: - # # Retrieve the original scale by scaling back the LoRA layers - # unscale_lora_layers(self.text_encoder, lora_scale) - - return prompt_embeds, negative_prompt_embeds - - def run_safety_checker(self, image, device, dtype): - if self.safety_checker is None: - has_nsfw_concept = None - else: - if torch.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) - ) - return image, has_nsfw_concept - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ip_adapter_image=None, - ip_adapter_image_embeds=None, - callback_on_step_end_tensor_inputs=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if ip_adapter_image is not None and ip_adapter_image_embeds is not None: - raise ValueError( - "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
- ) - - if ip_adapter_image_embeds is not None: - if not isinstance(ip_adapter_image_embeds, list): - raise ValueError( - f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" - ) - elif ip_adapter_image_embeds[0].ndim not in [3, 4]: - raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = ( - batch_size, - num_channels_latents, - int(height) // self.vae_scale_factor, - int(width) // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @torch.no_grad() - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.Tensor] = None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. 
- sigmas (`List[float]`, *optional*): - Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in - their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed - will be used. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.Tensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of - IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should - contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when - using zero terminal SNR. 
- clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): - A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of - each denoising step during the inference. with the following arguments: `callback_on_step_end(self: - DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a - list of all tensors as specified by `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. - """ - # must have a compatible torch device - device = self.device - - # convert numpy arrays to torch tensors - latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents - prompt_embeds = ( - self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds - ) - negative_prompt_embeds = ( - self.np_to_pt(negative_prompt_embeds, device) - if isinstance(negative_prompt_embeds, np.ndarray) - else negative_prompt_embeds - ) - ip_adapter_image_embeds = ( - [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] - if ip_adapter_image_embeds is not None - else ip_adapter_image_embeds - ) - - for k, v in kwargs.items(): - if isinstance(v, np.ndarray): - kwargs[k] = self.np_to_pt(v, device) - elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): - kwargs[k] = [self.np_to_pt(i, device) for i in v] - elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): - kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} - - generator = ( - self.np_to_pt(generator, device) - if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) - or isinstance(generator, np.random.RandomState) - else generator - ) - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - - # 0. 
Default height and width to unet - height = height or self.unet.config.get("sample_size") * self.vae_scale_factor - width = width or self.unet.config.get("sample_size") * self.vae_scale_factor - # to deal with lora scaling and other possible forward hooks - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ip_adapter_image, - ip_adapter_image_embeds, - callback_on_step_end_tensor_inputs, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1 and self.unet.config.get("time_cond_proj_dim", None) is None - - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, - clip_skip=clip_skip, - ) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, - ip_adapter_image_embeds, - device, - batch_size * num_images_per_prompt, - do_classifier_free_guidance, - ) - else: - image_embeds = None - - # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.get("in_channels") - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 6.1 Add image embeds for IP-Adapter - added_cond_kwargs = ( - {"image_embeds": image_embeds} - if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) - else None - ) - - # 6.2 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.get("time_cond_proj_dim", None) is not None: - guidance_scale_tensor = torch.tensor(guidance_scale - 1).repeat(batch_size * num_images_per_prompt) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.get("time_cond_proj_dim", None) - ).to(device=device, dtype=latents.dtype) - - # 7. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - sample=latent_model_input, - timestep=t.unsqueeze(0), - encoder_hidden_states=prompt_embeds, - timestep_cond=timestep_cond, - **(cross_attention_kwargs or {}), - **(added_cond_kwargs or {}), - # cross_attention_kwargs=cross_attention_kwargs, - # added_cond_kwargs=added_cond_kwargs, - # return_dict=False, - ) - noise_pred = next(iter(noise_pred.values())) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - if do_classifier_free_guidance and guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if not output_type == "latent": - image = self.vae.decode( - latents / self.vae.config.get("scaling_factor"), - # return_dict=False, - )[0] - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - # # Offload all models - # self.maybe_free_model_hooks() - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength, device): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - if hasattr(self.scheduler, "set_begin_index"): - self.scheduler.set_begin_index(t_start * self.scheduler.order) - - return timesteps, num_inference_steps - 
t_start - - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding - def get_guidance_scale_embedding( - self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.Tensor: - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - w (`torch.Tensor`): - Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings. - embedding_dim (`int`, *optional*, defaults to 512): - Dimension of the embeddings to generate. - dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): - Data type of the generated embeddings. - - Returns: - `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. - """ - assert len(w.shape) == 1 - w = w * 1000.0 - - half_dim = embedding_dim // 2 - emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) - emb = w.to(dtype)[:, None] * emb[None, :] - emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) - if embedding_dim % 2 == 1: # zero pad - emb = torch.nn.functional.pad(emb, (0, 1)) - assert emb.shape == (w.shape[0], embedding_dim) - return emb diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py deleted file mode 100644 index 90f5666d11..0000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ /dev/null @@ -1,533 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
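# Editor's sketch (not part of the original files): the `get_guidance_scale_embedding` helper removed
# above builds a sinusoidal embedding of the Imagen-style guidance weight, following
# https://github.com/google-research/vdm (model_vdm.py#L298). A minimal standalone version for even
# `embedding_dim`, assuming only torch is available:

import torch


def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512) -> torch.Tensor:
    # w is 1-D, e.g. torch.tensor([guidance_scale - 1.0]) repeated once per sample in the batch
    w = w * 1000.0
    half_dim = embedding_dim // 2
    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -(torch.log(torch.tensor(10000.0)) / (half_dim - 1)))
    emb = w.to(torch.float32)[:, None] * freqs[None, :]
    return torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)  # shape (len(w), embedding_dim)


# e.g. guidance_scale = 8.0 for a single image -> embedding of shape (1, 512)
emb = guidance_scale_embedding(torch.tensor([7.0]))

# Relatedly, `get_timesteps` above keeps int(num_inference_steps * strength) steps:
# with num_inference_steps=50 and strength=0.8, t_start = 50 - 40 = 10 and 40 denoising steps run.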
- -import logging -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback -from diffusers.image_processor import PipelineImageInput -from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents, retrieve_timesteps -from diffusers.utils.deprecation_utils import deprecate -from diffusers.utils.torch_utils import randn_tensor - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -logger = logging.getLogger(__name__) - - -class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin): - _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] - _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] - - def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ip_adapter_image=None, - ip_adapter_image_embeds=None, - callback_on_step_end_tensor_inputs=None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if ip_adapter_image is not None and ip_adapter_image_embeds is not None: - raise ValueError( - "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
- ) - - if ip_adapter_image_embeds is not None: - if not isinstance(ip_adapter_image_embeds, list): - raise ValueError( - f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" - ) - elif ip_adapter_image_embeds[0].ndim not in [3, 4]: - raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" - ) - - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): - if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - image = image.to(device=device, dtype=dtype) - - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - - else: - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - elif isinstance(generator, list): - if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: - image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) - elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " - ) - - init_latents = [ - retrieve_latents(self.vae_encoder(image[i : i + 1]), generator=generator[i]) - for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) - else: - init_latents = retrieve_latents(self.vae_encoder(image), generator=generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = torch.cat([init_latents], dim=0) - - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - - return latents - - @property - def guidance_scale(self): - return self._guidance_scale - - @property - def clip_skip(self): - return self._clip_skip - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. 
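    # Editor's note (illustrative, not part of the original file): with guidance_scale = 7.5 the
    # denoising loop below combines the two unet passes as
    #     noise_pred = noise_pred_uncond + 7.5 * (noise_pred_text - noise_pred_uncond)
    # so guidance_scale = 1.0 collapses to the text-conditioned prediction alone, which is why the
    # property below only enables classifier-free guidance for guidance_scale > 1 (and only when the
    # unet has no `time_cond_proj_dim`, i.e. it is not an LCM-style guidance-embedded model).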
- @property - def do_classifier_free_guidance(self): - return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None - - @property - def cross_attention_kwargs(self): - return self._cross_attention_kwargs - - @property - def num_timesteps(self): - return self._num_timesteps - - @property - def interrupt(self): - return self._interrupt - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: PipelineImageInput = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: int = None, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both - numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list - or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a - list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image - latents as `image`, but if passing latents directly it is not encoded again. - strength (`float`, *optional*, defaults to 0.8): - Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a - starting point and more noise is added the higher the `strength`. The number of denoising steps depends - on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising - process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 - essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter is modulated by `strength`. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - sigmas (`List[float]`, *optional*): - Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in - their `set_timesteps` method. 
If not defined, the default behavior when `num_inference_steps` is passed - will be used. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of - IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should - contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): - A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of - each denoising step during the inference. with the following arguments: `callback_on_step_end(self: - DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a - list of all tensors as specified by `callback_on_step_end_tensor_inputs`. 
- callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. - """ - # must have a compatible torch device - device = self.device - - # convert numpy arrays to torch tensors - prompt_embeds = ( - self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds - ) - negative_prompt_embeds = ( - self.np_to_pt(negative_prompt_embeds, device) - if isinstance(negative_prompt_embeds, np.ndarray) - else negative_prompt_embeds - ) - ip_adapter_image_embeds = ( - [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] - if ip_adapter_image_embeds is not None - else ip_adapter_image_embeds - ) - - for k, v in kwargs.items(): - if isinstance(v, np.ndarray): - kwargs[k] = self.np_to_pt(v, device) - elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): - kwargs[k] = [self.np_to_pt(i, device) for i in v] - elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): - kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} - - generator = ( - self.np_to_pt(generator, device) - if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) - or isinstance(generator, np.random.RandomState) - else generator - ) - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - strength, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ip_adapter_image, - ip_adapter_image_embeds, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._interrupt = False - self._do_classifier_free_guidance = self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # 3. 
Encode input prompt - text_encoder_lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - self._do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - clip_skip=self.clip_skip, - ) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if self._do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, - ip_adapter_image_embeds, - device, - batch_size * num_images_per_prompt, - self._do_classifier_free_guidance, - ) - - # 4. Preprocess image - image = self.image_processor.preprocess(image) - - # 5. set timesteps - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) - latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - - # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - device, - generator, - ) - - # 7. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7.1 Add image embeds for IP-Adapter - added_cond_kwargs = ( - {"image_embeds": image_embeds} - if ip_adapter_image is not None or ip_adapter_image_embeds is not None - else None - ) - - # 7.2 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - self._num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if self._do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - timestep=t.unsqueeze(0), - encoder_hidden_states=prompt_embeds, - timestep_cond=timestep_cond, - **(cross_attention_kwargs or {}), - **(added_cond_kwargs or {}), - # cross_attention_kwargs=cross_attention_kwargs, - # added_cond_kwargs=added_cond_kwargs, - # return_dict=False, - )[0] - - # perform guidance - if self._do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if not output_type == "latent": - image = self.vae_decoder( - latents / self.vae.config.scaling_factor, - # return_dict=False, - # generator=generator, - )[0] - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - # Offload all models - # self.maybe_free_model_hooks() - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py deleted file mode 100644 index a232a75721..0000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ /dev/null @@ -1,782 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback -from diffusers.image_processor import PipelineImageInput -from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents, retrieve_timesteps -from diffusers.utils.deprecation_utils import deprecate -from diffusers.utils.torch_utils import randn_tensor - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -class StableDiffusionInpaintPipelineMixin(StableDiffusionPipelineMixin): - def check_inputs( - self, - prompt, - image, - mask_image, - height, - width, - strength, - callback_steps, - output_type, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ip_adapter_image=None, - ip_adapter_image_embeds=None, - callback_on_step_end_tensor_inputs=None, - padding_mask_crop=None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
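For readers skimming the deleted mixin, the argument checks above boil down to a few range and exclusivity rules. A condensed, standalone sketch of those rules (a hypothetical helper written for illustration, not part of the pipeline):

```py
# Condensed sketch of the rules enforced by the deleted check_inputs() above;
# a hypothetical standalone helper, shown for illustration only.
def validate_core_args(prompt=None, prompt_embeds=None, strength=1.0, height=512, width=512, vae_scale_factor=8):
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
    if height % vae_scale_factor != 0 or width % vae_scale_factor != 0:
        raise ValueError(f"`height` and `width` have to be divisible by {vae_scale_factor} but are {height} and {width}.")
    if prompt is not None and prompt_embeds is not None:
        raise ValueError("Cannot forward both `prompt` and `prompt_embeds`; pass only one of the two.")
    if prompt is None and prompt_embeds is None:
        raise ValueError("Provide either `prompt` or `prompt_embeds`.")


validate_core_args(prompt="a cat", strength=0.8, height=512, width=512)  # passes silently
```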
- ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - if padding_mask_crop is not None: - if not isinstance(image, PIL.Image.Image): - raise ValueError( - f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}." - ) - if not isinstance(mask_image, PIL.Image.Image): - raise ValueError( - f"The mask image should be a PIL image when inpainting mask crop, but is of type" - f" {type(mask_image)}." - ) - if output_type != "pil": - raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.") - - if ip_adapter_image is not None and ip_adapter_image_embeds is not None: - raise ValueError( - "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." - ) - - if ip_adapter_image_embeds is not None: - if not isinstance(ip_adapter_image_embeds, list): - raise ValueError( - f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" - ) - elif ip_adapter_image_embeds[0].ndim not in [3, 4]: - raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" - ) - - def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - device, - generator, - latents=None, - image=None, - timestep=None, - is_strength_max=True, - return_noise=False, - return_image_latents=False, - ): - shape = ( - batch_size, - num_channels_latents, - int(height) // self.vae_scale_factor, - int(width) // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if (image is None or timestep is None) and not is_strength_max: - raise ValueError( - "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." - "However, either the image or the noise timestep has not been provided." - ) - - if return_image_latents or (latents is None and not is_strength_max): - image = image.to(device=device, dtype=dtype) - - if image.shape[1] == 4: - image_latents = image - else: - image_latents = self._encode_vae_image(image=image, generator=generator) - image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) - - if latents is None: - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - # if strength is 1. 
then initialise the latents to noise, else initial to image + noise - latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) - # if pure noise then scale the initial latents by the Scheduler's init sigma - latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents - else: - noise = latents.to(device) - latents = noise * self.scheduler.init_noise_sigma - - outputs = (latents,) - - if return_noise: - outputs += (noise,) - - if return_image_latents: - outputs += (image_latents,) - - return outputs - - def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): - if isinstance(generator, list): - image_latents = [ - retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) - for i in range(image.shape[0]) - ] - image_latents = torch.cat(image_latents, dim=0) - else: - image_latents = retrieve_latents(self.vae.encode(image), generator=generator) - - image_latents = self.vae.config.scaling_factor * image_latents - - return image_latents - - def prepare_mask_latents( - self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance - ): - # resize the mask to latents shape as we concatenate the mask to the latents - # we do that before converting to dtype to avoid breaking in case we're using cpu_offload - # and half precision - mask = torch.nn.functional.interpolate( - mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) - ) - mask = mask.to(device=device, dtype=dtype) - - masked_image = masked_image.to(device=device, dtype=dtype) - - if masked_image.shape[1] == 4: - masked_image_latents = masked_image - else: - masked_image_latents = self._encode_vae_image(masked_image, generator=generator) - - # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method - if mask.shape[0] < batch_size: - if not batch_size % mask.shape[0] == 0: - raise ValueError( - "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" - f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" - " of masks that you pass is divisible by the total requested batch size." - ) - mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) - if masked_image_latents.shape[0] < batch_size: - if not batch_size % masked_image_latents.shape[0] == 0: - raise ValueError( - "The passed images and the required batch size don't match. Images are supposed to be duplicated" - f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." - " Make sure the number of images that you pass is divisible by the total requested batch size." - ) - masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1) - - mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = ( - torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # aligning device to prevent device errors when concating it with the latent model input - masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) - return mask, masked_image_latents - - @property - def guidance_scale(self): - return self._guidance_scale - - @property - def clip_skip(self): - return self._clip_skip - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` - # corresponds to doing no classifier free guidance. - @property - def do_classifier_free_guidance(self): - return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None - - @property - def cross_attention_kwargs(self): - return self._cross_attention_kwargs - - @property - def num_timesteps(self): - return self._num_timesteps - - @property - def interrupt(self): - return self._interrupt - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: PipelineImageInput = None, - mask_image: PipelineImageInput = None, - masked_image_latents: "torch.Tensor" = None, - height: Optional[int] = None, - width: Optional[int] = None, - padding_mask_crop: Optional[int] = None, - strength: float = 1.0, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional["torch.Tensor"] = None, - prompt_embeds: Optional["torch.Tensor"] = None, - negative_prompt_embeds: Optional["torch.Tensor"] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List["torch.Tensor"]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: int = None, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to - be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch - tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the - expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the - expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but - if passing latents directly it is not encoded again. - mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask - are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a - single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one - color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, - H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, - 1)`, or `(H, W)`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. 
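The do_classifier_free_guidance property above gates the guidance step used in the denoising loops of these pipelines. A minimal sketch of that combination, the same formula as the noise_pred lines in the loops, shown here in isolation:

```py
# Minimal sketch of classifier-free guidance as applied in the denoising loops:
# the UNet is run on a doubled batch, the result is split into unconditional and
# text-conditional halves, and the two are recombined with guidance_scale.
import torch

def apply_cfg(noise_pred: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

noise_pred = torch.randn(2, 4, 64, 64)        # [uncond, text] halves stacked on the batch dim
guided = apply_cfg(noise_pred, guidance_scale=7.5)
print(guided.shape)                            # torch.Size([1, 4, 64, 64])
```

With guidance_scale equal to 1 this reduces to the text-conditional prediction alone, which is why the property treats that case as "no guidance".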
- width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to - image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region - with the same aspect ration of the image and contains all masked area, and then expand that area based - on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before - resizing to the original image size for inpainting. This is useful when the masked area is small while - the image is large and contain information irrelevant for inpainting, such as background. - strength (`float`, *optional*, defaults to 1.0): - Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a - starting point and more noise is added the higher the `strength`. The number of denoising steps depends - on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising - process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 - essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter is modulated by `strength`. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - sigmas (`List[float]`, *optional*): - Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in - their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed - will be used. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.Tensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. 
Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of - IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should - contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): - A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of - each denoising step during the inference. with the following arguments: `callback_on_step_end(self: - DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a - list of all tensors as specified by `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - Examples: - - ```py - >>> import PIL - >>> import requests - >>> import torch - >>> from io import BytesIO - - >>> from diffusers import StableDiffusionInpaintPipeline - - - >>> def download_image(url): - ... response = requests.get(url) - ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - - >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" - >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" - - >>> init_image = download_image(img_url).resize((512, 512)) - >>> mask_image = download_image(mask_url).resize((512, 512)) - - >>> pipe = StableDiffusionInpaintPipeline.from_pretrained( - ... "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16 - ... 
) - >>> pipe = pipe.to("cuda") - - >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. - """ - # must have a compatible torch device - device = self.device - - # convert numpy arrays to torch tensors - latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents - prompt_embeds = ( - self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds - ) - negative_prompt_embeds = ( - self.np_to_pt(negative_prompt_embeds, device) - if isinstance(negative_prompt_embeds, np.ndarray) - else negative_prompt_embeds - ) - ip_adapter_image_embeds = ( - [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] - if ip_adapter_image_embeds is not None - else ip_adapter_image_embeds - ) - - for k, v in kwargs.items(): - if isinstance(v, np.ndarray): - kwargs[k] = self.np_to_pt(v, device) - elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): - kwargs[k] = [self.np_to_pt(i, device) for i in v] - elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): - kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} - - generator = ( - self.np_to_pt(generator, device) - if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) - or isinstance(generator, np.random.RandomState) - else generator - ) - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs - self.check_inputs( - prompt, - image, - mask_image, - height, - width, - strength, - callback_steps, - output_type, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ip_adapter_image, - ip_adapter_image_embeds, - callback_on_step_end_tensor_inputs, - padding_mask_crop, - ) - - self._guidance_scale = guidance_scale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._interrupt = False - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # 3. 
Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - self.do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - clip_skip=self.clip_skip, - ) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if self.do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, - ip_adapter_image_embeds, - device, - batch_size * num_images_per_prompt, - self.do_classifier_free_guidance, - ) - - # 4. set timesteps - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) - timesteps, num_inference_steps = self.get_timesteps( - num_inference_steps=num_inference_steps, strength=strength, device=device - ) - # check that number of inference steps is not < 1 - as this doesn't make sense - if num_inference_steps < 1: - raise ValueError( - f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" - f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." - ) - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise - is_strength_max = strength == 1.0 - - # 5. Preprocess mask and image - - if padding_mask_crop is not None: - crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop) - resize_mode = "fill" - else: - crops_coords = None - resize_mode = "default" - - original_image = image - init_image = self.image_processor.preprocess( - image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode - ) - init_image = init_image.to(dtype=torch.float32) - - # 6. Prepare latent variables - num_channels_latents = self.vae.config.latent_channels - num_channels_unet = self.unet.config.in_channels - return_image_latents = num_channels_unet == 4 - - latents_outputs = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - return_noise=True, - return_image_latents=return_image_latents, - ) - - if return_image_latents: - latents, noise, image_latents = latents_outputs - else: - latents, noise = latents_outputs - - # 7. 
Prepare mask latent variables - mask_condition = self.mask_processor.preprocess( - mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords - ) - - if masked_image_latents is None: - masked_image = init_image * (mask_condition < 0.5) - else: - masked_image = masked_image_latents - - mask, masked_image_latents = self.prepare_mask_latents( - mask_condition, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - prompt_embeds.dtype, - device, - generator, - self.do_classifier_free_guidance, - ) - - # 8. Check that sizes of mask, masked image and latents match - if num_channels_unet == 9: - # default case for runwayml/stable-diffusion-inpainting - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." - ) - elif num_channels_unet != 4: - raise ValueError( - f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}." - ) - - # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9.1 Add image embeds for IP-Adapter - added_cond_kwargs = ( - {"image_embeds": image_embeds} - if ip_adapter_image is not None or ip_adapter_image_embeds is not None - else None - ) - - # 9.2 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - # 10. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - self._num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - - # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - if num_channels_unet == 9: - latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - timestep=t.unsqueeze(0), - encoder_hidden_states=prompt_embeds, - timestep_cond=timestep_cond, - **(cross_attention_kwargs or {}), - **(added_cond_kwargs or {}), - # cross_attention_kwargs=cross_attention_kwargs, - # added_cond_kwargs=added_cond_kwargs, - # return_dict=False, - )[0] - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - if num_channels_unet == 4: - init_latents_proper = image_latents - if self.do_classifier_free_guidance: - init_mask, _ = mask.chunk(2) - else: - init_mask = mask - - if i < len(timesteps) - 1: - noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - init_latents_proper, noise, torch.tensor([noise_timestep]) - ) - - latents = (1 - init_mask) * init_latents_proper + init_mask * latents - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - mask = callback_outputs.pop("mask", mask) - masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if not output_type == "latent": - condition_kwargs = {} - # if isinstance(self.vae, AsymmetricAutoencoderKL): - # init_image = init_image.to(device=device, dtype=masked_image_latents.dtype) - # init_image_condition = init_image.clone() - # init_image = self._encode_vae_image(init_image, generator=generator) - # mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype) - # condition_kwargs = {"image": init_image_condition, "mask": mask_condition} - image = self.vae_decoder( - latents / self.vae.config.scaling_factor, - # return_dict=False, - # generator=generator, - **condition_kwargs, - )[0] - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - 
do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if padding_mask_crop is not None: - image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image] - - # Offload all models - # self.maybe_free_model_hooks() - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py deleted file mode 100644 index 704c6fd3ad..0000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ /dev/null @@ -1,982 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback -from diffusers.image_processor import PipelineImageInput -from diffusers.loaders.textual_inversion import TextualInversionLoaderMixin -from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput -from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import rescale_noise_cfg, retrieve_timesteps -from diffusers.utils.deprecation_utils import deprecate - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -logger = logging.getLogger(__name__) - - -class StableDiffusionXLPipelineMixin(StableDiffusionPipelineMixin): - _optional_components = [ - "tokenizer", - "tokenizer_2", - "text_encoder", - "text_encoder_2", - "image_encoder", - "feature_extractor", - ] - _callback_tensor_inputs = [ - "latents", - "prompt_embeds", - "negative_prompt_embeds", - "add_text_embeds", - "add_time_ids", - "negative_pooled_prompt_embeds", - "negative_add_time_ids", - ] - - def encode_prompt( - self, - prompt: str, - prompt_2: Optional[str] = None, - device: Optional[torch.device] = None, - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Optional[str] = None, - negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - pooled_prompt_embeds: Optional[torch.Tensor] = None, - negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, - lora_scale: Optional[float] = None, - clip_skip: Optional[int] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. 
If not defined, `prompt` is - used in both text-encoders - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. 
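The dual-encoder scheme documented here is easiest to see as a shape contract. A small illustrative sketch; the hidden sizes are assumptions for a typical SDXL checkpoint, not values read from this file:

```py
# Illustrative only: how the two encoders' outputs are combined further down in
# encode_prompt. Per-token hidden states are concatenated on the feature axis;
# the pooled embedding comes from the second text encoder alone.
# Hidden sizes (768 / 1280) are assumptions for a typical SDXL checkpoint.
import torch

batch_size, seq_len = 1, 77
hidden_1 = torch.randn(batch_size, seq_len, 768)    # text_encoder hidden states (assumed size)
hidden_2 = torch.randn(batch_size, seq_len, 1280)   # text_encoder_2 hidden states (assumed size)

prompt_embeds = torch.cat([hidden_1, hidden_2], dim=-1)   # (1, 77, 2048)
pooled_prompt_embeds = torch.randn(batch_size, 1280)      # stands in for text_encoder_2's pooled output
print(prompt_embeds.shape)                                # torch.Size([1, 77, 2048])
```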
- """ - device = device or self.device - - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - # if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): - # self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - # if self.text_encoder is not None: - # if not USE_PEFT_BACKEND: - # adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - # else: - # scale_lora_layers(self.text_encoder, lora_scale) - - # if self.text_encoder_2 is not None: - # if not USE_PEFT_BACKEND: - # adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) - # else: - # scale_lora_layers(self.text_encoder_2, lora_scale) - - prompt = [prompt] if isinstance(prompt, str) else prompt - - if prompt is not None: - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - - if prompt_embeds is None: - prompt_2 = prompt_2 or prompt - prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 - - # textual inversion: process multi-vector tokens if necessary - prompt_embeds_list = [] - prompts = [prompt, prompt_2] - for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, tokenizer) - - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = text_encoder( - text_input_ids.to(device), - # output_hidden_states=True, - ) - - # We are only ALWAYS interested in the pooled output of the final text encoder - pooled_prompt_embeds = prompt_embeds[0] - if clip_skip is None: - prompt_embeds = prompt_embeds.hidden_states[-2] - else: - # "2" because SDXL always indexes from the penultimate layer. 
- prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] - - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = torch.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - negative_prompt_2 = negative_prompt_2 or negative_prompt - - # normalize str to list - negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt - negative_prompt_2 = ( - batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 - ) - - uncond_tokens: List[str] - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = [negative_prompt, negative_prompt_2] - - negative_prompt_embeds_list = [] - for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): - if isinstance(self, TextualInversionLoaderMixin): - negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - negative_prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - negative_prompt_embeds = text_encoder( - uncond_input.input_ids.to(device), - # output_hidden_states=True, - ) - # We are only ALWAYS interested in the pooled output of the final text encoder - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] - - negative_prompt_embeds_list.append(negative_prompt_embeds) - - negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) - - if self.text_encoder_2 is not None: - prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) - else: - prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - if self.text_encoder_2 is not None: - negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) - else: - negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = 
negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( - bs_embed * num_images_per_prompt, -1 - ) - if do_classifier_free_guidance: - negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( - bs_embed * num_images_per_prompt, -1 - ) - - # if self.text_encoder is not None: - # if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: - # # Retrieve the original scale by scaling back the LoRA layers - # unscale_lora_layers(self.text_encoder, lora_scale) - - # if self.text_encoder_2 is not None: - # if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: - # # Retrieve the original scale by scaling back the LoRA layers - # unscale_lora_layers(self.text_encoder_2, lora_scale) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - def check_inputs( - self, - prompt, - prompt_2, - height, - width, - callback_steps, - negative_prompt=None, - negative_prompt_2=None, - prompt_embeds=None, - negative_prompt_embeds=None, - pooled_prompt_embeds=None, - negative_pooled_prompt_embeds=None, - ip_adapter_image=None, - ip_adapter_image_embeds=None, - callback_on_step_end_tensor_inputs=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt_2 is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): - raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
- ) - elif negative_prompt_2 is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if prompt_embeds is not None and pooled_prompt_embeds is None: - raise ValueError( - "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." - ) - - if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." - ) - - if ip_adapter_image is not None and ip_adapter_image_embeds is not None: - raise ValueError( - "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." - ) - - if ip_adapter_image_embeds is not None: - if not isinstance(ip_adapter_image_embeds, list): - raise ValueError( - f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" - ) - elif ip_adapter_image_embeds[0].ndim not in [3, 4]: - raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" - ) - - def _get_add_time_ids( - self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None - ): - add_time_ids = list(original_size + crops_coords_top_left + target_size) - - # passed_add_embed_dim = ( - # self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim - # ) - # expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features - - # if expected_add_embed_dim != passed_add_embed_dim: - # raise ValueError( - # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
- # ) - - add_time_ids = torch.tensor([add_time_ids], dtype=dtype) - return add_time_ids - - # def upcast_vae(self): - # dtype = self.vae.dtype - # self.vae.to(dtype=torch.float32) - # use_torch_2_0_or_xformers = isinstance( - # self.vae.decoder.mid_block.attentions[0].processor, - # ( - # AttnProcessor2_0, - # XFormersAttnProcessor, - # FusedAttnProcessor2_0, - # ), - # ) - # # if xformers or torch_2_0 is used attention block does not need - # # to be in float32 which can save lots of memory - # if use_torch_2_0_or_xformers: - # self.vae.post_quant_conv.to(dtype) - # self.vae.decoder.conv_in.to(dtype) - # self.vae.decoder.mid_block.to(dtype) - - @property - def guidance_scale(self): - return self._guidance_scale - - @property - def guidance_rescale(self): - return self._guidance_rescale - - @property - def clip_skip(self): - return self._clip_skip - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - @property - def do_classifier_free_guidance(self): - return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None - - @property - def cross_attention_kwargs(self): - return self._cross_attention_kwargs - - @property - def denoising_end(self): - return self._denoising_end - - @property - def num_timesteps(self): - return self._num_timesteps - - @property - def interrupt(self): - return self._interrupt - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - prompt_2: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - denoising_end: Optional[float] = None, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.Tensor] = None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - pooled_prompt_embeds: Optional[torch.Tensor] = None, - negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - negative_original_size: Optional[Tuple[int, int]] = None, - negative_crops_coords_top_left: Tuple[int, int] = (0, 0), - negative_target_size: Optional[Tuple[int, int]] = None, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. 
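A hypothetical usage example in the same style as the inpaint docstring example earlier in this patch; the model id is taken from the links in the parameter descriptions below, and running it requires the corresponding diffusers weights and a CUDA device:

```py
# Hypothetical usage sketch mirroring the call signature above; not taken from
# this file. Assumes the SDXL base weights are available and a CUDA device.
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

prompt = "Astronaut in a jungle, cold color palette, highly detailed"
image = pipe(prompt=prompt, num_inference_steps=50, guidance_scale=5.0).images[0]
```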
- prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in both text-encoders - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. This is set to 1024 by default for the best results. - Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. This is set to 1024 by default for the best results. - Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - sigmas (`List[float]`, *optional*): - Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in - their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed - will be used. - denoising_end (`float`, *optional*): - When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be - completed before it is intentionally prematurely terminated. As a result, the returned sample will - still retain a substantial amount of noise as determined by the discrete timesteps selected by the - scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a - "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image - Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) - guidance_scale (`float`, *optional*, defaults to 5.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. 
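The denoising_end fraction described above is easiest to understand as a cutoff on the descending timestep schedule. A sketch under stated assumptions (the 1000-step training schedule and the cutoff arithmetic are assumptions for illustration, not this pipeline's exact code):

```py
# Sketch of how a denoising_end fraction can be turned into a timestep cutoff.
# Assumes a descending schedule and num_train_timesteps=1000 (a typical default);
# an illustration of the idea, not the exact code of this pipeline.
import torch

def truncate_timesteps(timesteps: torch.Tensor, denoising_end: float, num_train_timesteps: int = 1000) -> torch.Tensor:
    cutoff = int(round(num_train_timesteps * (1.0 - denoising_end)))
    return timesteps[timesteps >= cutoff]  # keep only the early (noisier) portion

timesteps = torch.linspace(999, 0, steps=50).long()           # descending, as required
print(len(truncate_timesteps(timesteps, denoising_end=0.8)))  # 40, i.e. 80% of the steps
```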
- eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of - IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should - contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. 
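Both `guidance_scale` and `guidance_rescale` act only on the combined noise prediction. A minimal sketch of that arithmetic, mirroring the denoising loop further down (the function name and tensors are illustrative, not part of the pipeline API):

import torch


def classifier_free_guidance(noise_pred_uncond, noise_pred_text, guidance_scale, guidance_rescale=0.0):
    # move the prediction away from the unconditional branch, towards the text-conditional one
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    if guidance_rescale > 0.0:
        # rescale towards the std of the text prediction to counter overexposure (arXiv:2305.08891, eq. 16)
        std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
        std_cfg = noise_pred.std(dim=list(range(1, noise_pred.ndim)), keepdim=True)
        rescaled = noise_pred * (std_text / std_cfg)
        noise_pred = guidance_rescale * rescaled + (1.0 - guidance_rescale) * noise_pred
    return noise_pred


noise_pred = classifier_free_guidance(torch.zeros(2, 4, 8, 8), torch.ones(2, 4, 8, 8), guidance_scale=5.0)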
- original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - For most cases, `target_size` should be set to the desired height and width of the generated image. If - not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in - section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a specific image resolution. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a target image resolution. It should be as same - as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): - A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of - each denoising step during the inference. with the following arguments: `callback_on_step_end(self: - DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a - list of all tensors as specified by `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. 
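At call time, the `original_size`/`crops_coords_top_left`/`target_size` triplet described above is all that SDXL's micro-conditioning amounts to: `_get_add_time_ids` concatenates the three tuples into one time-ids tensor. A rough sketch with assumed default values:

import torch

# assumed values for a default 1024x1024 generation with no cropping
original_size = (1024, 1024)
crops_coords_top_left = (0, 0)
target_size = (1024, 1024)

# mirrors what `_get_add_time_ids` builds before it is fed to the unet as an added condition
add_time_ids = list(original_size + crops_coords_top_left + target_size)
add_time_ids = torch.tensor([add_time_ids], dtype=torch.float32)
print(add_time_ids.shape)  # torch.Size([1, 6])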
- - Examples: - - Returns: - [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ - # must have a compatible torch device - device = self.device - - # convert numpy arrays to torch tensors - latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents - prompt_embeds = ( - self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds - ) - negative_prompt_embeds = ( - self.np_to_pt(negative_prompt_embeds, device) - if isinstance(negative_prompt_embeds, np.ndarray) - else negative_prompt_embeds - ) - pooled_prompt_embeds = ( - self.np_to_pt(pooled_prompt_embeds, device) - if isinstance(pooled_prompt_embeds, np.ndarray) - else pooled_prompt_embeds - ) - negative_pooled_prompt_embeds = ( - self.np_to_pt(negative_pooled_prompt_embeds, device) - if isinstance(negative_pooled_prompt_embeds, np.ndarray) - else negative_pooled_prompt_embeds - ) - ip_adapter_image_embeds = ( - [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] - if ip_adapter_image_embeds is not None - else ip_adapter_image_embeds - ) - - for k, v in kwargs.items(): - if isinstance(v, np.ndarray): - kwargs[k] = self.np_to_pt(v, device) - elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): - kwargs[k] = [self.np_to_pt(i, device) for i in v] - elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): - kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} - - generator = ( - self.np_to_pt(generator, device) - if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) - or isinstance(generator, np.random.RandomState) - else generator - ) - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - - # 0. Default height and width to unet - height = height or self.default_sample_size * self.vae_scale_factor - width = width or self.default_sample_size * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - prompt_2, - height, - width, - callback_steps, - negative_prompt, - negative_prompt_2, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ip_adapter_image, - ip_adapter_image_embeds, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._guidance_rescale = guidance_rescale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._denoising_end = denoising_end - self._interrupt = False - - # 2. 
Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # 3. Encode input prompt - lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self.encode_prompt( - prompt=prompt, - prompt_2=prompt_2, - device=device, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=self.do_classifier_free_guidance, - negative_prompt=negative_prompt, - negative_prompt_2=negative_prompt_2, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - lora_scale=lora_scale, - clip_skip=self.clip_skip, - ) - - # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - if self.text_encoder_2 is None: - text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) - else: - text_encoder_projection_dim = self.text_encoder_2.config.projection_dim - - add_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - dtype=prompt_embeds.dtype, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - if negative_original_size is not None and negative_target_size is not None: - negative_add_time_ids = self._get_add_time_ids( - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype=prompt_embeds.dtype, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - else: - negative_add_time_ids = add_time_ids - - if self.do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) - add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) - add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) - - prompt_embeds = prompt_embeds.to(device) - add_text_embeds = add_text_embeds.to(device) - add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) - - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, - ip_adapter_image_embeds, - device, - batch_size * num_images_per_prompt, - self.do_classifier_free_guidance, - ) - - # 8. 
Denoising loop - num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) - - # 8.1 Apply denoising_end - if ( - self.denoising_end is not None - and isinstance(self.denoising_end, float) - and self.denoising_end > 0 - and self.denoising_end < 1 - ): - discrete_timestep_cutoff = int( - round( - self.scheduler.config.num_train_timesteps - - (self.denoising_end * self.scheduler.config.num_train_timesteps) - ) - ) - num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) - timesteps = timesteps[:num_inference_steps] - - # 9. Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - self._num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - added_cond_kwargs["image_embeds"] = image_embeds - noise_pred = self.unet( - latent_model_input, - timestep=t.unsqueeze(0), - encoder_hidden_states=prompt_embeds, - timestep_cond=timestep_cond, - **(cross_attention_kwargs or {}), - **(added_cond_kwargs or {}), - # cross_attention_kwargs=cross_attention_kwargs, - # added_cond_kwargs=added_cond_kwargs, - # return_dict=False, - )[0] - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - latents_dtype = latents.dtype - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - if latents.dtype != latents_dtype: - if torch.backends.mps.is_available(): - # some platforms (eg. 
apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 - latents = latents.to(latents_dtype) - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) - negative_pooled_prompt_embeds = callback_outputs.pop( - "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds - ) - add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) - negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - # if XLA_AVAILABLE: - # xm.mark_step() - - if not output_type == "latent": - # make sure the VAE is in float32 mode, as it overflows in float16 - # needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast - - # if needs_upcasting: - # self.upcast_vae() - # latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) - # elif latents.dtype != self.vae.dtype: - # if torch.backends.mps.is_available(): - # # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 - # self.vae = self.vae.to(latents.dtype) - - # unscale/denormalize the latents - # denormalize with the mean and std if available and not None - has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None - has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None - if has_latents_mean and has_latents_std: - latents_mean = ( - torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) - ) - latents_std = ( - torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) - ) - latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean - else: - latents = latents / self.vae.config.scaling_factor - - image = self.vae.decode( - latents, - # return_dict=False, - )[0] - - # cast back to fp16 if needed - # if needs_upcasting: - # self.vae.to(dtype=torch.float16) - else: - image = latents - - if not output_type == "latent": - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - - image = self.image_processor.postprocess(image, output_type=output_type) - - # Offload all models - # self.maybe_free_model_hooks() - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py deleted file mode 100644 index 64984ff22b..0000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ /dev/null @@ -1,928 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback -from diffusers.image_processor import PipelineImageInput -from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput -from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img import ( - rescale_noise_cfg, - retrieve_latents, - retrieve_timesteps, -) -from diffusers.utils.deprecation_utils import deprecate -from diffusers.utils.torch_utils import randn_tensor - -from .pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin - - -logger = logging.getLogger(__name__) - - -class StableDiffusionXLImg2ImgPipelineMixin(StableDiffusionXLPipelineMixin): - _optional_components = [ - "tokenizer", - "tokenizer_2", - "text_encoder", - "text_encoder_2", - "image_encoder", - "feature_extractor", - ] - _callback_tensor_inputs = [ - "latents", - "prompt_embeds", - "negative_prompt_embeds", - "add_text_embeds", - "add_time_ids", - "negative_pooled_prompt_embeds", - "add_neg_time_ids", - ] - - def check_inputs( - self, - prompt, - prompt_2, - strength, - num_inference_steps, - callback_steps, - negative_prompt=None, - negative_prompt_2=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ip_adapter_image=None, - ip_adapter_image_embeds=None, - callback_on_step_end_tensor_inputs=None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - if num_inference_steps is None: - raise ValueError("`num_inference_steps` cannot be None.") - elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0: - raise ValueError( - f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" - f" {type(num_inference_steps)}." - ) - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt_2 is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." 
- ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): - raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - elif negative_prompt_2 is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if ip_adapter_image is not None and ip_adapter_image_embeds is not None: - raise ValueError( - "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." - ) - - if ip_adapter_image_embeds is not None: - if not isinstance(ip_adapter_image_embeds, list): - raise ValueError( - f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" - ) - elif ip_adapter_image_embeds[0].ndim not in [3, 4]: - raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" - ) - - def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): - # get the original timestep using init_timestep - if denoising_start is None: - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - t_start = max(num_inference_steps - init_timestep, 0) - - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - if hasattr(self.scheduler, "set_begin_index"): - self.scheduler.set_begin_index(t_start * self.scheduler.order) - - return timesteps, num_inference_steps - t_start - - else: - # Strength is irrelevant if we directly request a timestep to start at; - # that is, strength is determined by the denoising_start instead. - discrete_timestep_cutoff = int( - round( - self.scheduler.config.num_train_timesteps - - (denoising_start * self.scheduler.config.num_train_timesteps) - ) - ) - - num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item() - if self.scheduler.order == 2 and num_inference_steps % 2 == 0: - # if the scheduler is a 2nd order scheduler we might have to do +1 - # because `num_inference_steps` might be even given that every timestep - # (except the highest one) is duplicated. If `num_inference_steps` is even it would - # mean that we cut the timesteps in the middle of the denoising step - # (between 1st and 2nd derivative) which leads to incorrect results. 
By adding 1 - # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler - num_inference_steps = num_inference_steps + 1 - - # because t_n+1 >= t_n, we slice the timesteps starting from the end - t_start = len(self.scheduler.timesteps) - num_inference_steps - timesteps = self.scheduler.timesteps[t_start:] - if hasattr(self.scheduler, "set_begin_index"): - self.scheduler.set_begin_index(t_start) - return timesteps, num_inference_steps - - def prepare_latents( - self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True - ): - if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - latents_mean = latents_std = None - if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None: - latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1) - if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None: - latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1) - - # Offload text encoder if `enable_model_cpu_offload` was enabled - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.text_encoder_2.to("cpu") - torch.cuda.empty_cache() - - image = image.to(device=device, dtype=dtype) - - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - - else: - # make sure the VAE is in float32 mode, as it overflows in float16 - # if self.vae.config.force_upcast: - # image = image.float() - # self.vae_decoder.to(dtype=torch.float32) - - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
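To make the `strength` arithmetic in `get_timesteps` above concrete, a small worked example with assumed values (first-order scheduler, no `denoising_start`):

# assumed inputs; mirrors the first branch of `get_timesteps`
num_inference_steps, strength = 50, 0.3

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 15
t_start = max(num_inference_steps - init_timestep, 0)  # 35
# scheduler.timesteps[t_start:] keeps only the last 15 entries,
# so an img2img call with strength=0.3 actually runs 15 denoising steps
print(init_timestep, num_inference_steps - t_start)  # 15 15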
- ) - - elif isinstance(generator, list): - if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: - image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) - elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " - ) - - init_latents = [ - retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) - for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) - else: - init_latents = retrieve_latents(self.vae.encode(image), generator=generator) - - # if self.vae.config.force_upcast: - # self.vae_decoder.to(dtype) - - init_latents = init_latents.to(dtype) - if latents_mean is not None and latents_std is not None: - latents_mean = latents_mean.to(device=device, dtype=dtype) - latents_std = latents_std.to(device=device, dtype=dtype) - init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std - else: - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = torch.cat([init_latents], dim=0) - - if add_noise: - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - - latents = init_latents - - return latents - - def _get_add_time_ids( - self, - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype, - text_encoder_projection_dim=None, - ): - if self.config.get("requires_aesthetics_score", False): - add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) - add_neg_time_ids = list( - negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) - ) - else: - add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) - - # passed_add_embed_dim = ( - # self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim - # ) - # expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features - - # if ( - # expected_add_embed_dim > passed_add_embed_dim - # and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim - # ): - # raise ValueError( - # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." 
- # ) - # elif ( - # expected_add_embed_dim < passed_add_embed_dim - # and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim - # ): - # raise ValueError( - # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." - # ) - # elif expected_add_embed_dim != passed_add_embed_dim: - # raise ValueError( - # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." - # ) - - add_time_ids = torch.tensor([add_time_ids], dtype=dtype) - add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - - return add_time_ids, add_neg_time_ids - - @property - def guidance_scale(self): - return self._guidance_scale - - @property - def guidance_rescale(self): - return self._guidance_rescale - - @property - def clip_skip(self): - return self._clip_skip - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - @property - def do_classifier_free_guidance(self): - return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None - - @property - def cross_attention_kwargs(self): - return self._cross_attention_kwargs - - @property - def denoising_end(self): - return self._denoising_end - - @property - def denoising_start(self): - return self._denoising_start - - @property - def num_timesteps(self): - return self._num_timesteps - - @property - def interrupt(self): - return self._interrupt - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - prompt_2: Optional[Union[str, List[str]]] = None, - image: PipelineImageInput = None, - strength: float = 0.3, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - denoising_start: Optional[float] = None, - denoising_end: Optional[float] = None, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.Tensor] = None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - pooled_prompt_embeds: Optional[torch.Tensor] = None, - negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Tuple[int, int] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Tuple[int, int] = None, - negative_original_size: Optional[Tuple[int, int]] = None, - negative_crops_coords_top_left: Tuple[int, int] = (0, 0), - negative_target_size: Optional[Tuple[int, int]] = None, - aesthetic_score: float = 6.0, - 
negative_aesthetic_score: float = 2.5, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in both text-encoders - image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): - The image(s) to modify with the pipeline. - strength (`float`, *optional*, defaults to 0.3): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of - `denoising_start` being declared as an integer, the value of `strength` will be ignored. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - sigmas (`List[float]`, *optional*): - Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in - their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed - will be used. - denoising_start (`float`, *optional*): - When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be - bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and - it is assumed that the passed `image` is a partly denoised image. Note that when this is specified, - strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline - is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image - Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality). - denoising_end (`float`, *optional*): - When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be - completed before it is intentionally prematurely terminated. As a result, the returned sample will - still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be - denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the - final 20% of the scheduler. 
The denoising_end parameter should ideally be utilized when this pipeline - forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refine Image - Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality). - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of - IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should - contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - For most cases, `target_size` should be set to the desired height and width of the generated image. If - not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in - section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a specific image resolution. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. 
- negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a target image resolution. It should be as same - as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - aesthetic_score (`float`, *optional*, defaults to 6.0): - Used to simulate an aesthetic score of the generated image by influencing the positive text condition. - Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_aesthetic_score (`float`, *optional*, defaults to 2.5): - Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to - simulate an aesthetic score of the generated image by influencing the negative text condition. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): - A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of - each denoising step during the inference. with the following arguments: `callback_on_step_end(self: - DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a - list of all tensors as specified by `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a - `tuple. When returning a tuple, the first element is a list with the generated images. 
- """ - # must have a compatible torch device - device = self.device - - # convert numpy arrays to torch tensors - latents = self.np_to_pt(latents, device) if isinstance(latents, np.ndarray) else latents - prompt_embeds = ( - self.np_to_pt(prompt_embeds, device) if isinstance(prompt_embeds, np.ndarray) else prompt_embeds - ) - negative_prompt_embeds = ( - self.np_to_pt(negative_prompt_embeds, device) - if isinstance(negative_prompt_embeds, np.ndarray) - else negative_prompt_embeds - ) - pooled_prompt_embeds = ( - self.np_to_pt(pooled_prompt_embeds, device) - if isinstance(pooled_prompt_embeds, np.ndarray) - else pooled_prompt_embeds - ) - negative_pooled_prompt_embeds = ( - self.np_to_pt(negative_pooled_prompt_embeds, device) - if isinstance(negative_pooled_prompt_embeds, np.ndarray) - else negative_pooled_prompt_embeds - ) - ip_adapter_image_embeds = ( - [self.np_to_pt(i, device) if isinstance(i, np.ndarray) else i for i in ip_adapter_image_embeds] - if ip_adapter_image_embeds is not None - else ip_adapter_image_embeds - ) - - for k, v in kwargs.items(): - if isinstance(v, np.ndarray): - kwargs[k] = self.np_to_pt(v, device) - elif isinstance(v, list) and all(isinstance(i, np.ndarray) for i in v): - kwargs[k] = [self.np_to_pt(i, device) for i in v] - elif isinstance(v, dict) and all(isinstance(i, np.ndarray) for i in v.values()): - kwargs[k] = {k: self.np_to_pt(v, device) for k, v in v.items()} - - generator = ( - self.np_to_pt(generator, device) - if (isinstance(generator, list) and isinstance(generator[0], np.random.RandomState)) - or isinstance(generator, np.random.RandomState) - else generator - ) - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - prompt_2, - strength, - num_inference_steps, - callback_steps, - negative_prompt, - negative_prompt_2, - prompt_embeds, - negative_prompt_embeds, - ip_adapter_image, - ip_adapter_image_embeds, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._guidance_rescale = guidance_rescale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._denoising_end = denoising_end - self._denoising_start = denoising_start - self._interrupt = False - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # 3. 
Encode input prompt - text_encoder_lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self.encode_prompt( - prompt=prompt, - prompt_2=prompt_2, - device=device, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=self.do_classifier_free_guidance, - negative_prompt=negative_prompt, - negative_prompt_2=negative_prompt_2, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - lora_scale=text_encoder_lora_scale, - clip_skip=self.clip_skip, - ) - - # 4. Preprocess image - image = self.image_processor.preprocess(image) - - # 5. Prepare timesteps - def denoising_value_valid(dnv): - return isinstance(dnv, float) and 0 < dnv < 1 - - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) - timesteps, num_inference_steps = self.get_timesteps( - num_inference_steps, - strength, - device, - denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None, - ) - latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - - add_noise = True if self.denoising_start is None else False - - # 6. Prepare latent variables - if latents is None: - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - device, - generator, - add_noise, - ) - # 7. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - height, width = latents.shape[-2:] - height = height * self.vae_scale_factor - width = width * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 8. 
Prepare added time ids & embeddings - if negative_original_size is None: - negative_original_size = original_size - if negative_target_size is None: - negative_target_size = target_size - - add_text_embeds = pooled_prompt_embeds - if self.text_encoder_2 is None: - text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) - else: - text_encoder_projection_dim = self.text_encoder_2.config.projection_dim - - add_time_ids, add_neg_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype=prompt_embeds.dtype, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1) - - if self.do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) - add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) - add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1) - add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0) - - prompt_embeds = prompt_embeds.to(device) - add_text_embeds = add_text_embeds.to(device) - add_time_ids = add_time_ids.to(device) - - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, - ip_adapter_image_embeds, - device, - batch_size * num_images_per_prompt, - self.do_classifier_free_guidance, - ) - - # 9. Denoising loop - num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) - - # 9.1 Apply denoising_end - if ( - self.denoising_end is not None - and self.denoising_start is not None - and denoising_value_valid(self.denoising_end) - and denoising_value_valid(self.denoising_start) - and self.denoising_start >= self.denoising_end - ): - raise ValueError( - f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: " - + f" {self.denoising_end} when using type float." 
- ) - elif self.denoising_end is not None and denoising_value_valid(self.denoising_end): - discrete_timestep_cutoff = int( - round( - self.scheduler.config.num_train_timesteps - - (self.denoising_end * self.scheduler.config.num_train_timesteps) - ) - ) - num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) - timesteps = timesteps[:num_inference_steps] - - # 9.2 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - self._num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - added_cond_kwargs["image_embeds"] = image_embeds - noise_pred = self.unet( - latent_model_input, - timestep=t.unsqueeze(0), - encoder_hidden_states=prompt_embeds, - timestep_cond=timestep_cond, - **(cross_attention_kwargs or {}), - **(added_cond_kwargs or {}), - # cross_attention_kwargs=cross_attention_kwargs, - # added_cond_kwargs=added_cond_kwargs, - # return_dict=False, - )[0] - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - latents_dtype = latents.dtype - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - if latents.dtype != latents_dtype: - if torch.backends.mps.is_available(): - # some platforms (eg. 
apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 - latents = latents.to(latents_dtype) - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) - negative_pooled_prompt_embeds = callback_outputs.pop( - "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds - ) - add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) - add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - # if XLA_AVAILABLE: - # xm.mark_step() - - if not output_type == "latent": - # make sure the VAE is in float32 mode, as it overflows in float16 - # needs_upcasting = self.vae_decoder.dtype == torch.float16 and self.vae.config.force_upcast - - # if needs_upcasting: - # self.upcast_vae() - # latents = latents.to(next(iter(self.vae_decoder.post_quant_conv.parameters())).dtype) - # elif latents.dtype != self.vae_decoder.dtype: - # if torch.backends.mps.is_available(): - # # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 - # self.vae = self.vae_decoder.to(latents.dtype) - - # unscale/denormalize the latents - # denormalize with the mean and std if available and not None - has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None - has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None - if has_latents_mean and has_latents_std: - latents_mean = ( - torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) - ) - latents_std = ( - torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) - ) - latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean - else: - latents = latents / self.vae.config.scaling_factor - - image = self.vae_decoder( - latents, - # return_dict=False, - )[0] - - # cast back to fp16 if needed - # if needs_upcasting: - # self.vae_decoder.to(dtype=torch.float16) - else: - image = latents - - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - - image = self.image_processor.postprocess(image, output_type=output_type) - - # Offload all models - # self.maybe_free_model_hooks() - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py deleted file mode 100644 index dba41381a4..0000000000 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-from typing import Union
-
-import numpy as np
-import torch
-from diffusers import ConfigMixin
-from tqdm.auto import tqdm
-
-
-logger = logging.getLogger(__name__)
-
-
-class DiffusionPipelineMixin(ConfigMixin):
-    @staticmethod
-    def np_to_pt(
-        np_object: Union[np.ndarray, np.random.RandomState], device: str
-    ) -> Union[torch.Tensor, torch.Generator]:
-        if isinstance(np_object, np.ndarray):
-            return torch.from_numpy(np_object).to(device)
-        elif isinstance(np_object, np.random.RandomState):
-            return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0]))
-        else:
-            raise ValueError(f"Unsupported type {type(np_object)}")
-
-    # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L827
-    def progress_bar(self, iterable=None, total=None):
-        if not hasattr(self, "_progress_bar_config"):
-            self._progress_bar_config = {}
-        elif not isinstance(self._progress_bar_config, dict):
-            raise ValueError(
-                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
-            )
-
-        if iterable is not None:
-            return tqdm(iterable, **self._progress_bar_config)
-        elif total is not None:
-            return tqdm(total=total, **self._progress_bar_config)
-        else:
-            raise ValueError("Either `total` or `iterable` has to be defined.")
diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py
index d70f4f5663..379a23d0c6 100644
--- a/tests/onnxruntime/test_diffusion.py
+++ b/tests/onnxruntime/test_diffusion.py
@@ -240,10 +240,20 @@ def test_negative_prompt(self, model_arch: str):
                 inputs["negative_prompt_embeds"],
                 inputs["pooled_prompt_embeds"],
                 inputs["negative_pooled_prompt_embeds"],
-            ) = pipeline.encode_prompt(prompt=prompt, negative_prompt=negative_prompt)
+            ) = pipeline.encode_prompt(
+                prompt=prompt,
+                num_images_per_prompt=1,
+                device=torch.device("cpu"),
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
         else:
             inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt(
-                prompt=prompt, negative_prompt=negative_prompt
+                prompt=prompt,
+                num_images_per_prompt=1,
+                device=torch.device("cpu"),
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
             )

         images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images

From 002ed68bea9eadb2cfb0d578a6f18becbacbd52e Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 13 Sep 2024 12:29:52 +0200
Subject: [PATCH 44/71] style

---
 optimum/onnxruntime/modeling_diffusion.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 6dfc3dbac1..0ec8cbdbf6 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -658,7 +658,6 @@ def __init__(self, vae_encoder: ORTModelVaeEncoder, vae_decoder: ORTModelVaeDeco
         super().__init__(vae_decoder.session, parent_model)

-
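The test change above passes the arguments that diffusers' encode_prompt expects explicitly. Outside the test suite, the same call could look roughly like this for the non-XL pipelines, which return a pair of embeddings (a sketch only; the local model path is a placeholder):

    import torch
    from optimum.onnxruntime import ORTPipelineForText2Image

    pipeline = ORTPipelineForText2Image.from_pretrained("./exported-onnx-sd")  # placeholder path
    prompt_embeds, negative_prompt_embeds = pipeline.encode_prompt(
        prompt="sailing ship in storm by Leonardo da Vinci",
        num_images_per_prompt=1,
        device=torch.device("cpu"),
        do_classifier_free_guidance=True,
        negative_prompt="low quality",
    )
    # Precomputed embeddings can then be fed back to the pipeline instead of raw prompts.
    images = pipeline(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds).images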
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipeline):
     """

From 70e4577b29854a7e89d70f7ec6aa91ae4bf0b6ef Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Fri, 13 Sep 2024 18:35:30 +0200
Subject: [PATCH 45/71] Update optimum/exporters/onnx/model_configs.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 optimum/exporters/onnx/model_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index b8b8a14ebb..e378c1fe74 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -1112,7 +1112,7 @@ def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]:

 class VaeEncoderOnnxConfig(VisionOnnxConfig):
     ATOL_FOR_VALIDATION = 1e-4
-    # The ONNX export of a VaeEncoder architecture, an other Stable Diffusion component, needs the Trilu
+    # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu
     # operator support, available since opset 14
     DEFAULT_ONNX_OPSET = 14

From 381977c48599774b2d1371ac54d90934f73fcdb7 Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Fri, 13 Sep 2024 18:35:54 +0200
Subject: [PATCH 46/71] Update optimum/exporters/onnx/model_configs.py

---
 optimum/exporters/onnx/model_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index e378c1fe74..f64484ae3e 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -1137,7 +1137,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]:

 class VaeDecoderOnnxConfig(VisionOnnxConfig):
     ATOL_FOR_VALIDATION = 1e-4
-    # The ONNX export of a VaeDecoder architecture, an other Stable Diffusion component, needs the Trilu
+    # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu
     # operator support, available since opset 14
     DEFAULT_ONNX_OPSET = 14

From 9de3f71727106d850cc583f052d1fcfb1c3fbc42 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Sat, 14 Sep 2024 16:47:45 +0200
Subject: [PATCH 47/71] conversion to numpy always work

---
 optimum/onnxruntime/modeling_ort.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py
index 254b771e33..2eda4dbff8 100644
--- a/optimum/onnxruntime/modeling_ort.py
+++ b/optimum/onnxruntime/modeling_ort.py
@@ -938,7 +938,7 @@ def _prepare_onnx_inputs(
             onnx_inputs[input_name] = inputs.pop(input_name)

             if use_torch:
-                onnx_inputs[input_name] = onnx_inputs[input_name].cpu().detach().numpy()
+                onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True)

             if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]:
                 onnx_inputs[input_name] = onnx_inputs[input_name].astype(

From b2274a1b9543b38b6646f06625b19f26ac5c562e Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Sat, 14 Sep 2024 21:14:31 +0200
Subject: [PATCH 48/71] test adding two new pipelines

---
 optimum/onnxruntime/modeling_diffusion.py | 135 ++++++++++++++--------
 tests/onnxruntime/test_diffusion.py       |  13 ++-
 2 files changed, 92 insertions(+), 56 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 0ec8cbdbf6..996dbf2f49 100644
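For context on the .numpy(force=True) change in PATCH 47 above: Tensor.numpy(force=True) is available from PyTorch 1.13 and detaches the tensor and copies it to CPU when needed, so it should match the older chained form. A minimal illustration:

    import torch

    t = torch.ones(2, 2, requires_grad=True)
    a = t.numpy(force=True)        # new form used by the patch
    b = t.cpu().detach().numpy()   # previous form
    assert (a == b).all()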
--- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -29,12 +29,14 @@ AutoPipelineForInpainting, AutoPipelineForText2Image, ConfigMixin, + LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline, SchedulerMixin, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, ) from diffusers.configuration_utils import FrozenDict @@ -552,11 +554,10 @@ def forward( model_outputs["hidden_states"] = [] for i in range(self.config.num_hidden_layers): model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) - # exporter doesnt duplicate last hidden state so we need to add it manually - # (only returned once as last_hidden_state and not part of the list of hidden_states) model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) else: - model_outputs.pop("hidden_states", None) + for i in range(self.config.num_hidden_layers): + model_outputs.pop(f"hidden_states.{i}", None) if return_dict: return model_outputs @@ -652,11 +653,11 @@ def forward( class ORTVaeWrapper(ORTPipelinePart): def __init__(self, vae_encoder: ORTModelVaeEncoder, vae_decoder: ORTModelVaeDecoder, parent_model: ORTPipeline): + super().__init__(vae_decoder.session, parent_model) + self.encode = vae_encoder.forward self.decode = vae_decoder.forward - super().__init__(vae_decoder.session, parent_model) - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipeline): @@ -674,7 +675,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipel ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ - main_input_name = "prompt" + main_input_name = "image" auto_model_class = StableDiffusionImg2ImgPipeline @@ -688,16 +689,6 @@ class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipel auto_model_class = StableDiffusionInpaintPipeline -@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyModelPipeline): - """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). - """ - - main_input_name = "prompt" - auto_model_class = LatentConsistencyModelPipeline - - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionXLPipeline(ORTPipeline, StableDiffusionXLPipeline): """ @@ -728,17 +719,8 @@ def _get_add_time_ids( ): add_time_ids = list(original_size + crops_coords_top_left + target_size) - # passed_add_embed_dim = ( - # self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim - # ) - # expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features - - # if expected_add_embed_dim != passed_add_embed_dim: - # raise ValueError( - # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
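The _get_add_time_ids override above keeps only the tensor construction and drops the commented-out dimension checks. As a worked example of what it produces for a default 1024x1024 SDXL text-to-image call (values are illustrative):

    import torch

    original_size = (1024, 1024)
    crops_coords_top_left = (0, 0)
    target_size = (1024, 1024)
    add_time_ids = torch.tensor([list(original_size + crops_coords_top_left + target_size)], dtype=torch.float32)
    # tensor([[1024., 1024., 0., 0., 1024., 1024.]]), passed to the UNet as the "time_ids" input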
- # ) - add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids @@ -789,29 +771,58 @@ def _get_add_time_ids( add_time_ids = list(original_size + crops_coords_top_left + target_size) add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) - # passed_add_embed_dim = ( - # self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim - # ) - # expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features - - # if ( - # expected_add_embed_dim > passed_add_embed_dim - # and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim - # ): - # raise ValueError( - # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." - # ) - # elif ( - # expected_add_embed_dim < passed_add_embed_dim - # and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim - # ): - # raise ValueError( - # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." - # ) - # elif expected_add_embed_dim != passed_add_embed_dim: - # raise ValueError( - # f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." - # ) + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + + +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTStableDiffusionXLInpaintPipeline(ORTPipeline, StableDiffusionXLInpaintPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). 
+ """ + + main_input_name = "image" + auto_model_class = StableDiffusionXLInpaintPipeline + + def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): + super().__init__(*args, **kwargs) + + requires_aesthetics_score = kwargs.get("requires_aesthetics_score", False) + force_zeros_for_empty_prompt = kwargs.get("force_zeros_for_empty_prompt", True) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline._get_add_time_ids + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) add_time_ids = torch.tensor([add_time_ids], dtype=dtype) add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) @@ -819,13 +830,35 @@ def _get_add_time_ids( return add_time_ids, add_neg_time_ids +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyModelPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). + """ + + main_input_name = "prompt" + auto_model_class = LatentConsistencyModelPipeline + + +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTLatentConsistencyModelImg2ImgPipeline(ORTPipeline, LatentConsistencyModelImg2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). 
+ """ + + main_input_name = "image" + auto_model_class = LatentConsistencyModelImg2ImgPipeline + + SUPPORTED_ORT_PIPELINES = [ ORTStableDiffusionPipeline, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, - ORTLatentConsistencyModelPipeline, ORTStableDiffusionXLPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, + ORTLatentConsistencyModelPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ] @@ -878,12 +911,14 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): [ ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ] ) ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLInpaintPipeline), ] ) diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 379a23d0c6..c80899ab77 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -70,7 +70,7 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class ORTPipelineForText2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] ORTMODEL_CLASS = ORTPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -301,14 +301,14 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st class ORTPipelineForImage2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] AUTOMODEL_CLASS = AutoPipelineForImage2Image ORTMODEL_CLASS = ORTPipelineForImage2Image TASK = "image-to-image" - def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="np"): + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( @@ -497,7 +497,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st class ORTPipelineForInpaintingTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] AUTOMODEL_CLASS = AutoPipelineForInpainting ORTMODEL_CLASS = ORTPipelineForInpainting @@ -511,12 +511,13 @@ def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_ inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( - height=height, width=width, batch_size=1, channel=channel, input_type="pil" + height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type )[0] inputs["mask_image"] = _generate_images( - height=height, width=width, batch_size=1, channel=channel, input_type="pil" + height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type )[0] + inputs["strength"] = 0.75 inputs["height"] = height inputs["width"] = width From 79cd9ac900a4e14284992c0c40712e1a806fa2ef Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Sep 2024 11:13:15 +0200 Subject: [PATCH 49/71] remove duplicated tests --- tests/onnxruntime/test_diffusion.py | 104 +++++++++------------------- 1 file changed, 31 
insertions(+), 73 deletions(-) diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index c80899ab77..233ac29513 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -33,7 +33,7 @@ ORTPipelineForInpainting, ORTPipelineForText2Image, ) -from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm +from optimum.utils.testing_utils import grid_parameters, require_diffusers def get_generator(framework, seed): @@ -261,41 +261,29 @@ def test_negative_prompt(self, model_arch: str): np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) ) - @require_torch_gpu + @pytest.mark.rocm_ep_test @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu @require_diffusers def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 32, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 64, 32, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) @@ -453,34 +441,19 @@ def test_image_reproducibility(self, model_arch: str): np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) ) - @require_torch_gpu + @pytest.mark.rocm_ep_test @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], 
provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) + @pytest.mark.trt_ep_test @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) @@ -657,34 +630,19 @@ def test_image_reproducibility(self, model_arch: str): np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) ) - @require_torch_gpu + @pytest.mark.rocm_ep_test @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) + @pytest.mark.trt_ep_test @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) From b70b641f3478e247451abc23aeacaf1868d6a14a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Sep 2024 12:44:24 +0200 Subject: [PATCH 50/71] match diffusers numpy input --- optimum/onnxruntime/modeling_diffusion.py | 4 ++-- optimum/onnxruntime/utils.py | 23 +++++++++++------------ tests/onnxruntime/test_diffusion.py | 12 +++++------- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 996dbf2f49..a0a33d3cb7 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -879,7 +879,7 @@ class ORTDiffusionPipeline(ConfigMixin): @classmethod @validate_hf_hub_args - def from_pretrained(cls, pretrained_model_or_path, **kwargs): + def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTPipeline: load_config_kwargs = { "force_download": kwargs.get("force_download", False), "resume_download": kwargs.get("resume_download", None), @@ -953,7 +953,7 @@ class ORTPipelineForTask(ConfigMixin): 
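The merged GPU test above selects the execution provider at load time; the same pattern outside the test suite looks roughly like this (the local path is a placeholder, and a CUDA-enabled onnxruntime build is assumed):

    from optimum.onnxruntime import ORTPipelineForText2Image

    pipeline = ORTPipelineForText2Image.from_pretrained(
        "./exported-onnx-sd",  # placeholder path
        provider="CUDAExecutionProvider",
    )
    assert pipeline.device.type == "cuda"
    images = pipeline(prompt="a photo of an astronaut", height=64, width=64).images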
config_name = "model_index.json" @classmethod - def from_pretrained(cls, pretrained_model_or_path, **kwargs): + def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTPipeline: load_config_kwargs = { "force_download": kwargs.get("force_download", False), "resume_download": kwargs.get("resume_download", None), diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 1a1f84c884..1da49a65a2 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -407,19 +407,18 @@ def evaluation_loop( def np_to_pt(np_object, device): if isinstance(np_object, np.ndarray): - if np_object.ndim == 4: - return torch.from_numpy(np_object).permute(0, 3, 1, 2) - elif np_object.ndim == 3: - return torch.from_numpy(np_object).permute(2, 0, 1) - else: - return torch.from_numpy(np_object) - elif isinstance(np_object, list) and isinstance(np_object[0], np.ndarray): - return [np_to_pt(a, device) for a in np_object] - elif isinstance(np_object, dict) and isinstance(next(iter(np_object.values())), np.ndarray): - return {k: np_to_pt(v, device) for k, v in np_object.items()} + return torch.from_numpy(np_object) elif isinstance(np_object, np.random.RandomState): return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0])) - elif isinstance(np_object, list) and isinstance(np_object[0], np.random.RandomState): - return [torch.Generator(device=device).manual_seed(int(a.get_state()[1][0])) for a in np_object] + elif isinstance(np_object, np.random.Generator): + return torch.Generator(device=device).manual_seed(int(np_object.bit_generator.state[1][0])) + elif isinstance(np_object, list) and isinstance( + np_object[0], (np.ndarray, np.random.RandomState, np.random.Generator) + ): + return [np_to_pt(a, device) for a in np_object] + elif isinstance(np_object, dict) and isinstance( + next(iter(np_object.values())), (np.ndarray, np.random.RandomState, np.random.Generator) + ): + return {k: np_to_pt(v, device) for k, v in np_object.items()} else: return np_object diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 233ac29513..cdcee7f613 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -62,7 +62,7 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= "/in_paint/overture-creations-5sI6fQgYIuo.png" ).resize((width, height)) elif input_type == "np": - image = np.random.rand(height, width, channel) + image = np.random.rand(channel, height, width) elif input_type == "pt": image = torch.rand((channel, height, width)) @@ -461,10 +461,9 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device.type, "cuda") + outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) @@ -650,9 +649,8 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device, "cuda") + outputs = pipeline(**inputs).images - # 
Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) From 7f77b1cc98999fbc6d895f6876aeefbe0272da91 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Sep 2024 15:06:10 +0200 Subject: [PATCH 51/71] simplify model saving --- optimum/onnx/utils.py | 16 +++ optimum/onnxruntime/modeling_diffusion.py | 155 +++++++++------------- 2 files changed, 82 insertions(+), 89 deletions(-) diff --git a/optimum/onnx/utils.py b/optimum/onnx/utils.py index b52c4f4cda..c014c1b342 100644 --- a/optimum/onnx/utils.py +++ b/optimum/onnx/utils.py @@ -71,6 +71,22 @@ def _get_external_data_paths(src_paths: List[Path], dst_paths: List[Path]) -> Tu return src_paths, dst_paths +def _get_model_external_data_paths(model_path: Path) -> List[Path]: + """ + Gets external data paths from the model. + """ + + onnx_model = onnx.load(str(model_path), load_external_data=False) + model_tensors = _get_initializer_tensors(onnx_model) + # filter out tensors that are not external data + model_tensors_ext = [ + ExternalDataInfo(tensor).location + for tensor in model_tensors + if tensor.HasField("data_location") and tensor.data_location == onnx.TensorProto.EXTERNAL + ] + return [model_path.parent / tensor_name for tensor_name in model_tensors_ext] + + def check_model_uses_external_data(model: onnx.ModelProto) -> bool: """ Checks if the model uses external data. diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index a0a33d3cb7..e91142dfa8 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -24,14 +24,15 @@ import numpy as np import torch -from diffusers import ( +from diffusers.configuration_utils import ConfigMixin, FrozenDict +from diffusers.image_processor import VaeImageProcessor +from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution +from diffusers.pipelines import ( AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, - ConfigMixin, LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline, - SchedulerMixin, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, @@ -39,9 +40,7 @@ StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, ) -from diffusers.configuration_utils import FrozenDict -from diffusers.image_processor import VaeImageProcessor -from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution +from diffusers.schedulers import SchedulerMixin from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available from huggingface_hub import snapshot_download @@ -54,7 +53,7 @@ import onnxruntime as ort from ..exporters.onnx import main_export -from ..onnx.utils import _get_external_data_paths +from ..onnx.utils import _get_model_external_data_paths from ..utils import ( DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, @@ -90,15 +89,15 @@ class ORTPipeline(ORTModel): def __init__( self, config: Dict[str, Any], - tokenizer: CLIPTokenizer, scheduler: SchedulerMixin, - unet_session: ort.InferenceSession, - feature_extractor: Optional[CLIPFeatureExtractor] = None, - vae_encoder_session: Optional[ort.InferenceSession] = None, - vae_decoder_session: Optional[ort.InferenceSession] = None, - text_encoder_session: Optional[ort.InferenceSession] = 
None, - text_encoder_2_session: Optional[ort.InferenceSession] = None, + unet: ort.InferenceSession, + vae_encoder: Optional[ort.InferenceSession] = None, + vae_decoder: Optional[ort.InferenceSession] = None, + text_encoder: Optional[ort.InferenceSession] = None, + text_encoder_2: Optional[ort.InferenceSession] = None, + tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, + feature_extractor: Optional[CLIPFeatureExtractor] = None, use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, @@ -114,13 +113,13 @@ def __init__( for the text encoder. scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`): A scheduler to be used in combination with the U-NET component to denoise the encoded image latents. - unet_session (`ort.InferenceSession`): + unet (`ort.InferenceSession`): The ONNX Runtime inference session associated to the U-NET. feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`): A model extracting features from generated images to be used as inputs for the `safety_checker` - vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): + vae_encoder (`Optional[ort.InferenceSession]`, defaults to `None`): The ONNX Runtime inference session associated to the VAE encoder. - text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): + text_encoder (`Optional[ort.InferenceSession]`, defaults to `None`): The ONNX Runtime inference session associated to the text encoder. tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`): Tokenizer of class @@ -133,36 +132,11 @@ def __init__( The directory under which the model exported to ONNX was saved. """ - # Text encoder - if text_encoder_session is not None: - self.text_encoder_model_path = Path(text_encoder_session._model_path) - self.text_encoder = ORTModelTextEncoder(text_encoder_session, self) - else: - self.text_encoder_model_path = None - self.text_encoder = None - - # U-Net - self.unet = ORTModelUnet(unet_session, self) - self.unet_model_path = Path(unet_session._model_path) - - # Text encoder 2 - if text_encoder_2_session is not None: - self.text_encoder_2_model_path = Path(text_encoder_2_session._model_path) - self.text_encoder_2 = ORTModelTextEncoder(text_encoder_2_session, self) - else: - self.text_encoder_2_model_path = None - self.text_encoder_2 = None - - # VAE - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_decoder_model_path = Path(vae_decoder_session._model_path) - - if vae_encoder_session is not None: - self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) - self.vae_encoder_model_path = Path(vae_encoder_session._model_path) - else: - self.vae_encoder = None - self.vae_encoder_model_path = None + self.unet = ORTModelUnet(unet, self) + self.vae_encoder = ORTModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None + self.vae_decoder = ORTModelVaeDecoder(vae_decoder, self) if vae_decoder is not None else None + self.text_encoder = ORTModelTextEncoder(text_encoder, self) if text_encoder is not None else None + self.text_encoder_2 = ORTModelTextEncoder(text_encoder_2, self) if text_encoder_2 is not None else None # We create VAE encoder & decoder and wrap them in one object to # be used by the pipeline mixins with minimal code changes (simulating the diffusers API) @@ -185,19 +159,19 @@ def __init__( ) sub_models = { - DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet, - DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: 
self.vae_decoder, - DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder, - DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, - DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2, + self.unet: DIFFUSION_MODEL_UNET_SUBFOLDER, + self.vae_decoder: DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, + self.vae_encoder: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, + self.text_encoder: DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, + self.text_encoder_2: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, } # Modify config to keep the resulting model compatible with diffusers pipelines - for name in sub_models.keys(): - config[name] = ("diffusers", "OnnxRuntimeModel") if sub_models[name] is not None else (None, None) + for model, model_name in sub_models.items(): + config[model_name] = ("optimum", model.__class__.__name__) if model is not None else (None, None) self._internal_dict = FrozenDict(config) - self.shared_attributes_init(model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir) + self.shared_attributes_init(model=unet, use_io_binding=use_io_binding, model_save_dir=model_save_dir) @staticmethod def load_model( @@ -253,41 +227,37 @@ def load_model( def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) - src_to_dst_path = { - self.vae_decoder_model_path: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.text_encoder_model_path: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.unet_model_path: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, - } sub_models_to_save = { - self.vae_encoder_model_path: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - self.text_encoder_2_model_path: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + self.unet: DIFFUSION_MODEL_UNET_SUBFOLDER, + self.vae_decoder: DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, + self.vae_encoder: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, + self.text_encoder: DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, + self.text_encoder_2: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, } - for path, subfolder in sub_models_to_save.items(): - if path is not None: - src_to_dst_path[path] = save_directory / subfolder / ONNX_WEIGHTS_NAME - - # TODO: Modify _get_external_data_paths to give dictionnary - src_paths = list(src_to_dst_path.keys()) - dst_paths = list(src_to_dst_path.values()) - # Add external data paths in case of large models - src_paths, dst_paths = _get_external_data_paths(src_paths, dst_paths) - - for src_path, dst_path in zip(src_paths, dst_paths): - dst_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copyfile(src_path, dst_path) - config_path = src_path.parent / self.sub_component_config_name - if config_path.is_file(): - shutil.copyfile(config_path, dst_path.parent / self.sub_component_config_name) + + for model, model_folder in sub_models_to_save.items(): + if model is not None: + model_path = Path(model.session._model_path) + model_save_path = save_directory / model_folder / ONNX_WEIGHTS_NAME + model_save_path.parent.mkdir(parents=True, exist_ok=True) + # copy onnx model + shutil.copyfile(model_path, model_save_path) + # copy external data + external_data_paths = _get_model_external_data_paths(model_path) + for external_data_path in external_data_paths: + shutil.copyfile(external_data_path, model_save_path.parent / external_data_path.name) + # copy config + shutil.copyfile(model_path.parent / CONFIG_NAME, model_save_path.parent / CONFIG_NAME) self.scheduler.save_pretrained(save_directory / "scheduler") - if 
self.feature_extractor is not None: - self.feature_extractor.save_pretrained(save_directory / "feature_extractor") if self.tokenizer is not None: self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.feature_extractor is not None: + self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @classmethod def _from_pretrained( @@ -389,16 +359,16 @@ def _from_pretrained( ) return cls( - vae_decoder_session=vae_decoder, - text_encoder_session=text_encoder, - unet_session=unet, + unet=unet, config=config, - tokenizer=sub_models.get("tokenizer", None), + vae_encoder=vae_encoder, + vae_decoder=vae_decoder, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, scheduler=sub_models.get("scheduler"), - feature_extractor=sub_models.get("feature_extractor", None), + tokenizer=sub_models.get("tokenizer", None), tokenizer_2=sub_models.get("tokenizer_2", None), - vae_encoder_session=vae_encoder, - text_encoder_2_session=text_encoder_2, + feature_extractor=sub_models.get("feature_extractor", None), use_io_binding=use_io_binding, model_save_dir=model_save_dir, ) @@ -482,13 +452,20 @@ def to(self, device: Union[torch.device, str, int]): if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self - self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) - self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) self.unet.session.set_providers([provider], provider_options=[provider_options]) if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) + if self.vae_decoder is not None: + self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) + + if self.text_encoder is not None: + self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) + + if self.text_encoder_2 is not None: + self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) + self.providers = self.vae_decoder.session.get_providers() self._device = device From 4933c7ce1378516b2895ea75312bb5791e428bfa Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 17 Sep 2024 14:19:45 +0200 Subject: [PATCH 52/71] extend tests and only translate generators --- optimum/onnxruntime/modeling_diffusion.py | 58 +++++++++++---- optimum/onnxruntime/modeling_seq2seq.py | 2 +- optimum/onnxruntime/utils.py | 16 ++--- tests/onnxruntime/test_diffusion.py | 86 ++++++++++++----------- 4 files changed, 94 insertions(+), 68 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index e91142dfa8..4d52dd9cd9 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -67,7 +67,7 @@ from .utils import ( ONNX_WEIGHTS_NAME, get_provider_for_device, - np_to_pt, + np_to_pt_generators, parse_device, validate_provider_availability, ) @@ -248,7 +248,10 @@ def _save_pretrained(self, save_directory: Union[str, Path]): for external_data_path in external_data_paths: shutil.copyfile(external_data_path, model_save_path.parent / external_data_path.name) # copy config - shutil.copyfile(model_path.parent / CONFIG_NAME, model_save_path.parent / CONFIG_NAME) + shutil.copyfile( + model_path.parent / self.sub_component_config_name, + model_save_path.parent / self.sub_component_config_name, + ) 
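With the simplified saving path above, save_pretrained copies each component's .onnx file, any external data files next to it, and its config into per-component subfolders, so a plain round trip should be enough (paths are placeholders):

    from optimum.onnxruntime import ORTStableDiffusionPipeline

    pipeline = ORTStableDiffusionPipeline.from_pretrained("./exported-onnx-sd")
    pipeline.save_pretrained("./saved-onnx-sd")
    reloaded = ORTStableDiffusionPipeline.from_pretrained("./saved-onnx-sd")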
self.scheduler.save_pretrained(save_directory / "scheduler") @@ -486,28 +489,28 @@ def __call__(self, *args, **kwargs): device = self._execution_device for i in range(len(args)): - args[i] = np_to_pt(args[i], device) + args[i] = np_to_pt_generators(args[i], device) for k, v in kwargs.items(): - kwargs[k] = np_to_pt(v, device) + kwargs[k] = np_to_pt_generators(v, device) return self.auto_model_class.__call__(self, *args, **kwargs) class ORTPipelinePart(ORTModelPart): def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): - config_path = Path(session._model_path).parent / "config.json" - - if config_path.is_file(): - self.config = FrozenDict(parent_model._dict_from_json_file(config_path)) - else: - self.config = FrozenDict({}) - super().__init__(session, parent_model) + config_path = Path(session._model_path).parent / "config.json" + config_dict = parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} + self.config = FrozenDict(config_dict) + @property def input_dtype(self): - # for backward compatibility and diffusion mixins (will be standardized in the future) + logger.warning( + "The `input_dtype` property is deprecated and will be removed in the next release. " + "Please use `input_dtypes` along with `TypeHelper` to get the `numpy` types." + ) return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} @@ -593,7 +596,8 @@ def forward(self, sample: Union[np.ndarray, torch.Tensor], return_dict: bool = F if "latent_sample" in model_outputs: model_outputs["latents"] = model_outputs.pop("latent_sample") - elif "latent_parameters" in model_outputs: + + if "latent_parameters" in model_outputs: model_outputs["latent_dist"] = DiagonalGaussianDistribution( parameters=model_outputs.pop("latent_parameters") ) @@ -631,9 +635,32 @@ def forward( class ORTVaeWrapper(ORTPipelinePart): def __init__(self, vae_encoder: ORTModelVaeEncoder, vae_decoder: ORTModelVaeDecoder, parent_model: ORTPipeline): super().__init__(vae_decoder.session, parent_model) + self.vae_encoder = vae_encoder + self.vae_decoder = vae_decoder + + def encode( + self, + sample: Union[np.ndarray, torch.Tensor], + return_dict: bool = False, + ): + return self.vae_encoder(sample, return_dict) - self.encode = vae_encoder.forward - self.decode = vae_decoder.forward + def decode( + self, + latent_sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): + return self.vae_decoder(latent_sample, generator, return_dict) + + def forward( + self, + sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): + latent_sample = self.encode(sample).latent_dist.sample(generator=generator) + return self.decode(latent_sample, generator, return_dict) @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -930,6 +957,7 @@ class ORTPipelineForTask(ConfigMixin): config_name = "model_index.json" @classmethod + @validate_hf_hub_args def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTPipeline: load_config_kwargs = { "force_download": kwargs.get("force_download", False), diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 3cecadafe3..30f042dcc3 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -68,7 +68,7 @@ if check_if_transformers_greater("4.25.0"): from transformers.generation import GenerationMixin else: - from 
transformers.generation_utils import GenerationMixin + from transformers.generation_utils import GenerationMixin # type: ignore if check_if_transformers_greater("4.43.0"): diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 1da49a65a2..128e2406f1 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -405,20 +405,16 @@ def evaluation_loop( return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset)) -def np_to_pt(np_object, device): - if isinstance(np_object, np.ndarray): - return torch.from_numpy(np_object) - elif isinstance(np_object, np.random.RandomState): +def np_to_pt_generators(np_object, device): + if isinstance(np_object, np.random.RandomState): return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0])) elif isinstance(np_object, np.random.Generator): return torch.Generator(device=device).manual_seed(int(np_object.bit_generator.state[1][0])) - elif isinstance(np_object, list) and isinstance( - np_object[0], (np.ndarray, np.random.RandomState, np.random.Generator) - ): - return [np_to_pt(a, device) for a in np_object] + elif isinstance(np_object, list) and isinstance(np_object[0], (np.random.RandomState, np.random.Generator)): + return [np_to_pt_generators(a, device) for a in np_object] elif isinstance(np_object, dict) and isinstance( - next(iter(np_object.values())), (np.ndarray, np.random.RandomState, np.random.Generator) + next(iter(np_object.values())), (np.random.RandomState, np.random.Generator) ): - return {k: np_to_pt(v, device) for k, v in np_object.items()} + return {k: np_to_pt_generators(v, device) for k, v in np_object.items()} else: return np_object diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index cdcee7f613..7bb6128878 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -62,7 +62,7 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= "/in_paint/overture-creations-5sI6fQgYIuo.png" ).resize((width, height)) elif input_type == "np": - image = np.random.rand(channel, height, width) + image = np.random.rand(height, width, channel) elif input_type == "pt": image = torch.rand((channel, height, width)) @@ -115,17 +115,16 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): def test_num_images_per_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - height, width, batch_size = 64, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) 
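np_to_pt_generators above now only translates numpy random states into torch generators and leaves arrays untouched. The seed-recovery trick it relies on can be seen in isolation (for a RandomState seeded with a small scalar, the first element of the Mersenne Twister key is that seed):

    import numpy as np
    import torch

    rng = np.random.RandomState(42)
    torch_generator = torch.Generator(device="cpu").manual_seed(int(rng.get_state()[1][0]))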
@parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -184,17 +183,21 @@ def __call__(self, *args, **kwargs) -> None: def test_shape(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - height, width, batch_size = 128, 64, 1 + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + height, width, batch_size = 128, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -334,16 +337,14 @@ def test_num_images_per_prompt(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -383,18 +384,21 @@ def test_shape(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - height, width, batch_size = 32, 64, 1 - for input_type in ["np", "pil", "pt"]: + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -477,17 +481,14 @@ class ORTPipelineForInpaintingTest(ORTModelTestMixin): TASK = "inpainting" def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): - assert batch_size == 1, "Inpainting models only support batch_size=1" - assert input_type == "pil", "Inpainting models only support input_type='pil'" - inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( height=height, width=width, 
batch_size=batch_size, channel=channel, input_type=input_type - )[0] + ) inputs["mask_image"] = _generate_images( - height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type - )[0] + height=height, width=width, batch_size=batch_size, channel=1, input_type=input_type + ) inputs["strength"] = 0.75 inputs["height"] = height @@ -522,16 +523,14 @@ def test_num_images_per_prompt(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -571,18 +570,21 @@ def test_shape(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - height, width, batch_size = 32, 64, 1 - for input_type in ["pil"]: + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, From 7d50df36ad696e7349faf085bf9a4242787f24f7 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 18 Sep 2024 09:01:18 +0200 Subject: [PATCH 53/71] cleanup --- optimum/onnxruntime/modeling_diffusion.py | 163 +++++++++------------- 1 file changed, 68 insertions(+), 95 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 4d52dd9cd9..0c965d76bc 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -16,7 +16,6 @@ import logging import os import shutil -import warnings from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory @@ -31,6 +30,7 @@ AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, + DiffusionPipeline, LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline, StableDiffusionImg2ImgPipeline, @@ -42,7 +42,7 @@ ) from diffusers.schedulers import SchedulerMixin from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available +from 
diffusers.utils import is_invisible_watermark_available from huggingface_hub import snapshot_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.utils import validate_hf_hub_args @@ -79,7 +79,7 @@ logger = logging.getLogger(__name__) -class ORTPipeline(ORTModel): +class ORTPipeline(ORTModel, ConfigMixin): auto_model_class = None model_type = "onnx_pipeline" @@ -89,12 +89,12 @@ class ORTPipeline(ORTModel): def __init__( self, config: Dict[str, Any], - scheduler: SchedulerMixin, - unet: ort.InferenceSession, + unet: Optional[ort.InferenceSession] = None, vae_encoder: Optional[ort.InferenceSession] = None, vae_decoder: Optional[ort.InferenceSession] = None, text_encoder: Optional[ort.InferenceSession] = None, text_encoder_2: Optional[ort.InferenceSession] = None, + scheduler: Optional[SchedulerMixin] = None, tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, feature_extractor: Optional[CLIPFeatureExtractor] = None, @@ -151,52 +151,51 @@ def __init__( if hasattr(self.vae.config, "block_out_channels"): self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) else: - self.vae_scale_factor = 8 + self.vae_scale_factor = 8 # for old configs without block_out_channels self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.mask_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True ) - sub_models = { + # Modify config to keep the resulting model compatible with diffusers pipelines + models_to_subfolder = { self.unet: DIFFUSION_MODEL_UNET_SUBFOLDER, self.vae_decoder: DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, self.vae_encoder: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, self.text_encoder: DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, self.text_encoder_2: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, } - - # Modify config to keep the resulting model compatible with diffusers pipelines - for model, model_name in sub_models.items(): - config[model_name] = ("optimum", model.__class__.__name__) if model is not None else (None, None) + for model, model_subfolder in models_to_subfolder.items(): + config[model_subfolder] = ("optimum", model.__class__.__name__) if model is not None else (None, None) self._internal_dict = FrozenDict(config) self.shared_attributes_init(model=unet, use_io_binding=use_io_binding, model_save_dir=model_save_dir) @staticmethod def load_model( - vae_decoder_path: Union[str, Path], - text_encoder_path: Union[str, Path], unet_path: Union[str, Path], vae_encoder_path: Optional[Union[str, Path]] = None, + vae_decoder_path: Optional[Union[str, Path]] = None, + text_encoder_path: Optional[Union[str, Path]] = None, text_encoder_2_path: Optional[Union[str, Path]] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict] = None, ): """ - Creates three inference sessions for respectively the VAE decoder, the text encoder and the U-NET models. + Creates three inference sessions for the components of a Diffusion Pipeline (U-NET, VAE, Text Encoders). The default provider is `CPUExecutionProvider` to match the default behaviour in PyTorch/TensorFlow/JAX. Args: - vae_decoder_path (`Union[str, Path]`): - The path to the VAE decoder ONNX model. - text_encoder_path (`Union[str, Path]`): - The path to the text encoder ONNX model. unet_path (`Union[str, Path]`): The path to the U-NET ONNX model. 
vae_encoder_path (`Union[str, Path]`, defaults to `None`): The path to the VAE encoder ONNX model. + vae_decoder_path (`Union[str, Path]`, defaults to `None`): + The path to the VAE decoder ONNX model. + text_encoder_path (`Union[str, Path]`, defaults to `None`): + The path to the text encoder ONNX model. text_encoder_2_path (`Union[str, Path]`, defaults to `None`): The path to the second text decoder ONNX model. provider (`str`, defaults to `"CPUExecutionProvider"`): @@ -208,50 +207,48 @@ def load_model( Provider option dictionary corresponding to the provider used. See available options for each provider: https://onnxruntime.ai/docs/api/c/group___global.html . Defaults to `None`. """ - vae_decoder = ORTModel.load_model(vae_decoder_path, provider, session_options, provider_options) - unet = ORTModel.load_model(unet_path, provider, session_options, provider_options) - - sessions = { + paths = { + "unet": unet_path, "vae_encoder": vae_encoder_path, + "vae_decoder": vae_decoder_path, "text_encoder": text_encoder_path, "text_encoder_2": text_encoder_2_path, } - for key, value in sessions.items(): - if value is not None and value.is_file(): - sessions[key] = ORTModel.load_model(value, provider, session_options, provider_options) + sessions = {} + for model_name, model_path in paths.items(): + if model_path is not None and model_path.is_file(): + sessions[model_name] = ORTModel.load_model(model_path, provider, session_options, provider_options) else: - sessions[key] = None + sessions[model_name] = None - return vae_decoder, sessions["text_encoder"], unet, sessions["vae_encoder"], sessions["text_encoder_2"] + return sessions def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) - sub_models_to_save = { - self.unet: DIFFUSION_MODEL_UNET_SUBFOLDER, - self.vae_decoder: DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, - self.vae_encoder: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - self.text_encoder: DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, - self.text_encoder_2: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + models_to_save_paths = { + self.unet: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, + self.vae_decoder: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, + self.vae_encoder: save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, + self.text_encoder: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, + self.text_encoder_2: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / ONNX_WEIGHTS_NAME, } - - for model, model_folder in sub_models_to_save.items(): + for model, model_save_path in models_to_save_paths.items(): if model is not None: model_path = Path(model.session._model_path) - model_save_path = save_directory / model_folder / ONNX_WEIGHTS_NAME model_save_path.parent.mkdir(parents=True, exist_ok=True) # copy onnx model shutil.copyfile(model_path, model_save_path) - # copy external data + # copy external onnx data external_data_paths = _get_model_external_data_paths(model_path) for external_data_path in external_data_paths: shutil.copyfile(external_data_path, model_save_path.parent / external_data_path.name) - # copy config - shutil.copyfile( - model_path.parent / self.sub_component_config_name, - model_save_path.parent / self.sub_component_config_name, - ) + # copy model config + config_path = model_path.parent / self.sub_component_config_name + if config_path.is_file(): + config_save_path = model_save_path.parent / self.sub_component_config_name + 
shutil.copyfile(config_path, config_save_path) self.scheduler.save_pretrained(save_directory / "scheduler") @@ -267,16 +264,15 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], - use_auth_token: Optional[Union[bool, str]] = None, - token: Optional[Union[bool, str]] = None, + local_files_only: bool = False, revision: Optional[str] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, - vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, - text_encoder_file_name: str = ONNX_WEIGHTS_NAME, + token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, + vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, + text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, - local_files_only: bool = False, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, @@ -284,18 +280,6 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - - if provider == "TensorrtExecutionProvider": - raise ValueError("The provider `'TensorrtExecutionProvider'` is not supported") - model_id = str(model_id) patterns = set(config.keys()) sub_models_to_load = patterns.intersection({"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}) @@ -305,13 +289,13 @@ def _from_pretrained( allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} allow_patterns.update( { - vae_decoder_file_name, - text_encoder_file_name, unet_file_name, vae_encoder_file_name, + vae_decoder_file_name, + text_encoder_file_name, text_encoder_2_file_name, + cls.sub_component_config_name, SCHEDULER_CONFIG_NAME, - CONFIG_NAME, cls.config_name, } ) @@ -340,14 +324,18 @@ def _from_pretrained( else: sub_models[name] = load_method(new_model_save_dir) - vae_decoder, text_encoder, unet, vae_encoder, text_encoder_2 = cls.load_model( - vae_decoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, - text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, - unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=( + model_paths = { + "unet_path": new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "vae_encoder_path": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + "vae_decoder_path": new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + "text_encoder_path": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, + "text_encoder_2_path": ( new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name ), + } + + model_sessions = cls.load_model( + **model_paths, provider=provider, session_options=session_options, provider_options=provider_options, @@ -362,29 +350,21 @@ def _from_pretrained( ) return cls( - unet=unet, config=config, - vae_encoder=vae_encoder, - 
vae_decoder=vae_decoder, - text_encoder=text_encoder, - text_encoder_2=text_encoder_2, - scheduler=sub_models.get("scheduler"), - tokenizer=sub_models.get("tokenizer", None), - tokenizer_2=sub_models.get("tokenizer_2", None), - feature_extractor=sub_models.get("feature_extractor", None), + **model_sessions, + **sub_models, use_io_binding=use_io_binding, model_save_dir=model_save_dir, ) @classmethod - def _from_transformers( + def _export( cls, model_id: str, - config: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, + config: Optional[Dict[str, Any]] = None, token: Optional[Union[bool, str]] = None, - revision: str = "main", - force_download: bool = True, + revision: Optional[str] = None, + force_download: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, @@ -395,15 +375,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTPipeline": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - if task is None: task = cls._auto_model_to_task(cls.auto_model_class) @@ -866,7 +837,7 @@ class ORTLatentConsistencyModelImg2ImgPipeline(ORTPipeline, LatentConsistencyMod ] -def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): +def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: if ( ort_pipeline_class.__name__ == pipeline_class_name @@ -879,6 +850,7 @@ def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool class ORTDiffusionPipeline(ConfigMixin): + auto_model_class = DiffusionPipeline config_name = "model_index.json" @classmethod @@ -898,7 +870,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTPipeline: config = config[0] if isinstance(config, tuple) else config class_name = config["_class_name"] - ort_pipeline_class = _get_pipeline_class(class_name) + ort_pipeline_class = _get_ort_class(class_name) return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) @@ -933,7 +905,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTPipeline: ] -def _get_task_class(mapping, pipeline_class_name): +def _get_task_ort_class(mapping, pipeline_class_name): def _get_model_name(pipeline_class_name): for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS: for model_name, ort_pipeline_class in ort_pipelines_mapping.items(): @@ -954,6 +926,7 @@ def _get_model_name(pipeline_class_name): class ORTPipelineForTask(ConfigMixin): + auto_model_class = DiffusionPipeline config_name = "model_index.json" @classmethod @@ -972,7 +945,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTPipeline: config = config[0] if isinstance(config, tuple) else config class_name = config["_class_name"] - ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name) + ort_pipeline_class = _get_task_ort_class(cls.ort_pipelines_mapping, class_name) return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) From cce0ee8f717975b3cb9475e920e406c8132d9094 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 18 Sep 2024 09:09:30 +0200 Subject: [PATCH 54/71] reduce parent model usage 
in model parts --- optimum/onnxruntime/modeling_diffusion.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 0c965d76bc..99e618c5de 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -468,13 +468,17 @@ def __call__(self, *args, **kwargs): return self.auto_model_class.__call__(self, *args, **kwargs) -class ORTPipelinePart(ORTModelPart): +class ORTPipelinePart(ORTModelPart, ConfigMixin): + config_name: str = "config.json" + def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): super().__init__(session, parent_model) - config_path = Path(session._model_path).parent / "config.json" - config_dict = parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - self.config = FrozenDict(config_dict) + config_path = Path(session._model_path).parent / self.config_name + config_dict = self.load_config(config_path) if config_path.is_file() else {} + config_dict = config_dict[0] if isinstance(config_dict, tuple) else config_dict + + self._internal_dict = FrozenDict(config_dict) @property def input_dtype(self): @@ -605,10 +609,11 @@ def forward( class ORTVaeWrapper(ORTPipelinePart): def __init__(self, vae_encoder: ORTModelVaeEncoder, vae_decoder: ORTModelVaeDecoder, parent_model: ORTPipeline): - super().__init__(vae_decoder.session, parent_model) self.vae_encoder = vae_encoder self.vae_decoder = vae_decoder + super().__init__(vae_decoder.session, parent_model) + def encode( self, sample: Union[np.ndarray, torch.Tensor], From 86c2b7ebfa59c448690e1ba3b348c0ba4a391560 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 19 Sep 2024 15:20:48 +0200 Subject: [PATCH 55/71] fix --- optimum/onnxruntime/modeling_diffusion.py | 587 +++++++++++----------- 1 file changed, 300 insertions(+), 287 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 99e618c5de..28d8e284e8 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -13,9 +13,11 @@ # limitations under the License. 
import importlib +import inspect import logging import os import shutil +from abc import abstractmethod from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory @@ -24,7 +26,6 @@ import numpy as np import torch from diffusers.configuration_utils import ConfigMixin, FrozenDict -from diffusers.image_processor import VaeImageProcessor from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution from diffusers.pipelines import ( AutoPipelineForImage2Image, @@ -42,7 +43,7 @@ ) from diffusers.schedulers import SchedulerMixin from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from diffusers.utils import is_invisible_watermark_available +from diffusers.utils.constants import CONFIG_NAME from huggingface_hub import snapshot_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.utils import validate_hf_hub_args @@ -61,7 +62,6 @@ DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) -from .base import ORTModelPart from .io_binding import TypeHelper from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel from .utils import ( @@ -73,103 +73,98 @@ ) -if is_invisible_watermark_available(): - from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker - logger = logging.getLogger(__name__) class ORTPipeline(ORTModel, ConfigMixin): auto_model_class = None - model_type = "onnx_pipeline" config_name = "model_index.json" - sub_component_config_name = "config.json" def __init__( self, - config: Dict[str, Any], + # diffusers specific arguments unet: Optional[ort.InferenceSession] = None, vae_encoder: Optional[ort.InferenceSession] = None, vae_decoder: Optional[ort.InferenceSession] = None, text_encoder: Optional[ort.InferenceSession] = None, text_encoder_2: Optional[ort.InferenceSession] = None, - scheduler: Optional[SchedulerMixin] = None, - tokenizer: Optional[CLIPTokenizer] = None, - tokenizer_2: Optional[CLIPTokenizer] = None, - feature_extractor: Optional[CLIPFeatureExtractor] = None, + image_encoder: Optional[ort.InferenceSession] = None, + safety_checker: Optional[ort.InferenceSession] = None, + scheduler: Optional["SchedulerMixin"] = None, + tokenizer: Optional["CLIPTokenizer"] = None, + tokenizer_2: Optional["CLIPTokenizer"] = None, + feature_extractor: Optional["CLIPFeatureExtractor"] = None, + # stable diffusion xl specific arguments + requires_aesthetics_score: bool = False, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + # onnxruntime specific arguments use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - """ - Args: - config (`Dict[str, Any]`): - A config dictionary from which the model components will be instantiated. Make sure to only load - configuration files of compatible classes. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) - for the text encoder. - scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`): - A scheduler to be used in combination with the U-NET component to denoise the encoded image latents. - unet (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the U-NET. 
- feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`): - A model extracting features from generated images to be used as inputs for the `safety_checker` - vae_encoder (`Optional[ort.InferenceSession]`, defaults to `None`): - The ONNX Runtime inference session associated to the VAE encoder. - text_encoder (`Optional[ort.InferenceSession]`, defaults to `None`): - The ONNX Runtime inference session associated to the text encoder. - tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) - for the second text encoder. - use_io_binding (`Optional[bool]`, defaults to `None`): - Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to - `True` if the device is CUDA, otherwise defaults to `False`. - model_save_dir (`Optional[str]`, defaults to `None`): - The directory under which the model exported to ONNX was saved. - """ - - self.unet = ORTModelUnet(unet, self) - self.vae_encoder = ORTModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None - self.vae_decoder = ORTModelVaeDecoder(vae_decoder, self) if vae_decoder is not None else None - self.text_encoder = ORTModelTextEncoder(text_encoder, self) if text_encoder is not None else None - self.text_encoder_2 = ORTModelTextEncoder(text_encoder_2, self) if text_encoder_2 is not None else None + ############################################################################################################ + self.unet = ORTModelUnet(unet, self, subfolder=DIFFUSION_MODEL_UNET_SUBFOLDER) + self.vae_encoder = ( + ORTModelVaeEncoder(vae_encoder, self, subfolder=DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) + if vae_encoder is not None + else None + ) + self.vae_decoder = ( + ORTModelVaeDecoder(vae_decoder, self, subfolder=DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) + if vae_decoder is not None + else None + ) + self.text_encoder = ( + ORTModelTextEncoder(text_encoder, self, subfolder=DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) + if text_encoder is not None + else None + ) + self.text_encoder_2 = ( + ORTModelTextEncoder(text_encoder_2, self, subfolder=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) + if text_encoder_2 is not None + else None + ) + self.image_encoder = image_encoder # TODO: maybe implement ORTModelImageEncoder + self.safety_checker = safety_checker # TODO: maybe implement ORTModelSafetyChecker - # We create VAE encoder & decoder and wrap them in one object to - # be used by the pipeline mixins with minimal code changes (simulating the diffusers API) - self.vae = ORTVaeWrapper(self.vae_encoder, self.vae_decoder, self) + # We wrap the VAE encoder and decoder in a single object to simplify the API + self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 self.feature_extractor = feature_extractor - self.safety_checker = kwargs.get("safety_checker", None) - if hasattr(self.vae.config, "block_out_channels"): - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - else: - self.vae_scale_factor = 8 # for old configs without block_out_channels + all_possible_init_args = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "image_encoder": self.image_encoder, + "safety_checker": self.safety_checker, + "scheduler": self.scheduler, + "tokenizer": self.tokenizer, + "tokenizer_2": 
self.tokenizer_2, + "feature_extractor": self.feature_extractor, + "requires_aesthetics_score": requires_aesthetics_score, + "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, + "add_watermarker": add_watermarker, + } - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.mask_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True - ) + diffusers_pipeline_args = {} + for key in inspect.signature(self.auto_model_class).parameters.keys(): + if key in all_possible_init_args: + diffusers_pipeline_args[key] = all_possible_init_args[key] - # Modify config to keep the resulting model compatible with diffusers pipelines - models_to_subfolder = { - self.unet: DIFFUSION_MODEL_UNET_SUBFOLDER, - self.vae_decoder: DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, - self.vae_encoder: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - self.text_encoder: DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, - self.text_encoder_2: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, - } - for model, model_subfolder in models_to_subfolder.items(): - config[model_subfolder] = ("optimum", model.__class__.__name__) if model is not None else (None, None) + # init stuff like config, vae_scale_factor, image_processor, etc. + self.auto_model_class.__init__(self, **diffusers_pipeline_args) + # not registered correctly in the config + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + ############################################################################################################ - self._internal_dict = FrozenDict(config) self.shared_attributes_init(model=unet, use_io_binding=use_io_binding, model_save_dir=model_save_dir) @staticmethod @@ -181,7 +176,7 @@ def load_model( text_encoder_2_path: Optional[Union[str, Path]] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, - provider_options: Optional[Dict] = None, + provider_options: Optional[Dict[str, Any]] = None, ): """ Creates three inference sessions for the components of a Diffusion Pipeline (U-NET, VAE, Text Encoders). @@ -203,7 +198,7 @@ def load_model( for possible providers. session_options (`Optional[ort.SessionOptions]`, defaults to `None`): ONNX Runtime session options to use for loading the model. Defaults to `None`. - provider_options (`Optional[Dict]`, defaults to `None`): + provider_options (`Optional[Dict[str, Any]]`, defaults to `None`): Provider option dictionary corresponding to the provider used. See available options for each provider: https://onnxruntime.ai/docs/api/c/group___global.html . Defaults to `None`. 
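        A minimal usage sketch (component paths are illustrative; components whose path is
        omitted or missing come back as `None`):

            from pathlib import Path

            sessions = ORTPipeline.load_model(
                unet_path=Path("exported/unet/model.onnx"),
                vae_decoder_path=Path("exported/vae_decoder/model.onnx"),
                text_encoder_path=Path("exported/text_encoder/model.onnx"),
                provider="CPUExecutionProvider",
            )
            assert sessions["text_encoder_2"] is None  # not provided above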
""" @@ -229,8 +224,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): models_to_save_paths = { self.unet: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.vae_decoder: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, self.vae_encoder: save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, + self.vae_decoder: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, self.text_encoder: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, self.text_encoder_2: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / ONNX_WEIGHTS_NAME, } @@ -245,9 +240,9 @@ def _save_pretrained(self, save_directory: Union[str, Path]): for external_data_path in external_data_paths: shutil.copyfile(external_data_path, model_save_path.parent / external_data_path.name) # copy model config - config_path = model_path.parent / self.sub_component_config_name + config_path = model_path.parent / CONFIG_NAME if config_path.is_file(): - config_save_path = model_save_path.parent / self.sub_component_config_name + config_save_path = model_save_path.parent / CONFIG_NAME shutil.copyfile(config_path, config_save_path) self.scheduler.save_pretrained(save_directory / "scheduler") @@ -264,6 +259,8 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], + subfolder: str = "", + force_download: bool = False, local_files_only: bool = False, revision: Optional[str] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, @@ -273,20 +270,18 @@ def _from_pretrained( vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, + use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, - use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - model_id = str(model_id) - patterns = set(config.keys()) - sub_models_to_load = patterns.intersection({"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}) + all_components = {key for key in config.keys() if not key.startswith("_")} + all_components.update({"vae_encoder", "vae_decoder"}) - if not os.path.isdir(model_id): - patterns.update({"vae_encoder", "vae_decoder"}) - allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} + if not os.path.isdir(str(model_id)): + allow_patterns = {os.path.join(component, "*") for component in all_components} allow_patterns.update( { unet_file_name, @@ -294,64 +289,57 @@ def _from_pretrained( vae_decoder_file_name, text_encoder_file_name, text_encoder_2_file_name, - cls.sub_component_config_name, SCHEDULER_CONFIG_NAME, cls.config_name, + CONFIG_NAME, } ) - # Downloads all repo's files matching the allowed patterns model_id = snapshot_download( model_id, cache_dir=cache_dir, + force_download=force_download, local_files_only=local_files_only, - token=token, revision=revision, + token=token, allow_patterns=allow_patterns, ignore_patterns=["*.msgpack", "*.safetensors", "*.bin", "*.xml"], ) - new_model_save_dir = Path(model_id) + + model_save_path = Path(model_id) sub_models = {} - for name in sub_models_to_load: - library_name, library_classes = config[name] + for name in {"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}: + library_name, library_classes = config.get(name, (None, None)) if 
library_classes is not None: library = importlib.import_module(library_name) class_obj = getattr(library, library_classes) load_method = getattr(class_obj, "from_pretrained") # Check if the module is in a subdirectory - if (new_model_save_dir / name).is_dir(): - sub_models[name] = load_method(new_model_save_dir / name) + if (model_save_path / name).is_dir(): + sub_models[name] = load_method(model_save_path / name) else: - sub_models[name] = load_method(new_model_save_dir) - - model_paths = { - "unet_path": new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - "vae_encoder_path": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - "vae_decoder_path": new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, - "text_encoder_path": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, - "text_encoder_2_path": ( - new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name - ), - } + sub_models[name] = load_method(model_save_path) - model_sessions = cls.load_model( - **model_paths, - provider=provider, - session_options=session_options, - provider_options=provider_options, + paths = { + "unet_path": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "vae_encoder_path": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + "vae_decoder_path": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + "text_encoder_path": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, + "text_encoder_2_path": model_save_path + / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER + / text_encoder_2_file_name, + } + models = cls.load_model( + **paths, provider=provider, session_options=session_options, provider_options=provider_options ) - if model_save_dir is None: - model_save_dir = new_model_save_dir - if use_io_binding: raise ValueError( "IOBinding is not yet available for stable diffusion model, please set `use_io_binding` to False." 
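# For reference, the on-disk layout this loader expects with the default file names
# (an illustrative sketch; folder names follow the DIFFUSION_MODEL_*_SUBFOLDER constants):
#   model_index.json
#   scheduler/scheduler_config.json
#   tokenizer/ , tokenizer_2/ , feature_extractor/        (present only if listed in model_index.json)
#   unet/model.onnx            + unet/config.json
#   vae_encoder/model.onnx     + vae_encoder/config.json
#   vae_decoder/model.onnx     + vae_decoder/config.json
#   text_encoder/model.onnx    + text_encoder/config.json
#   text_encoder_2/model.onnx  + text_encoder_2/config.json   (only when the pipeline uses a second text encoder)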
) return cls( - config=config, - **model_sessions, + **models, **sub_models, use_io_binding=use_io_binding, model_save_dir=model_save_dir, @@ -361,49 +349,53 @@ def _from_pretrained( def _export( cls, model_id: str, - config: Optional[Dict[str, Any]] = None, - token: Optional[Union[bool, str]] = None, - revision: Optional[str] = None, - force_download: bool = False, - cache_dir: str = HUGGINGFACE_HUB_CACHE, + config: Dict[str, Any], subfolder: str = "", + force_download: bool = False, local_files_only: bool = False, + revision: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, trust_remote_code: bool = False, + use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, - use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTPipeline": if task is None: task = cls._auto_model_to_task(cls.auto_model_class) - save_dir = TemporaryDirectory() - save_dir_path = Path(save_dir.name) + # we continue passing the model_save_dir from here on to avoid it being cleaned up + # might be better to use a persistent temporary directory such as the one implemented in + # https://gist.github.com/twolfson/2929dc1163b0a76d2c2b66d51f9bc808 + model_save_dir = TemporaryDirectory() + model_save_path = Path(model_save_dir.name) main_export( - model_name_or_path=model_id, - output=save_dir_path, - task=task, + model_id, + output=model_save_path, do_validation=False, no_post_process=True, - subfolder=subfolder, + token=token, revision=revision, cache_dir=cache_dir, - token=token, - local_files_only=local_files_only, + subfolder=subfolder, force_download=force_download, + local_files_only=local_files_only, trust_remote_code=trust_remote_code, + library_name="diffusers", + task=task, ) return cls._from_pretrained( - save_dir_path, + model_save_path, config=config, provider=provider, session_options=session_options, provider_options=provider_options, use_io_binding=use_io_binding, - model_save_dir=save_dir, + model_save_dir=model_save_dir, ) def to(self, device: Union[torch.device, str, int]): @@ -440,7 +432,7 @@ def to(self, device: Union[torch.device, str, int]): if self.text_encoder_2 is not None: self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) - self.providers = self.vae_decoder.session.get_providers() + self.providers = self.unet.session.get_providers() self._device = device return self @@ -453,32 +445,51 @@ def _save_config(self, save_directory): self.save_config(save_directory) @property - def _execution_device(self): - return self.device + def components(self) -> Dict[str, Any]: + components = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "image_encoder": self.image_encoder, + "safety_checker": self.safety_checker, + } + components = {k: v for k, v in components.items() if v is not None} + return components def __call__(self, *args, **kwargs): - device = self._execution_device + # we keep numpy random states support for now + args = list(args) for i in range(len(args)): - args[i] = np_to_pt_generators(args[i], device) + args[i] = np_to_pt_generators(args[i], self.device) for k, v in kwargs.items(): - kwargs[k] = np_to_pt_generators(v, device) + kwargs[k] = np_to_pt_generators(v, self.device) return self.auto_model_class.__call__(self, *args, **kwargs) -class ORTPipelinePart(ORTModelPart, 
ConfigMixin): - config_name: str = "config.json" +class ORTPipelinePart: + def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): + self.session = session + self.subfolder = subfolder + self.parent_pipeline = parent_pipeline + + self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} + self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} + self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()} - def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): - super().__init__(session, parent_model) + self.model_save_dir = Path(self.session._model_path).parent + config_path = self.model_save_dir / CONFIG_NAME - config_path = Path(session._model_path).parent / self.config_name - config_dict = self.load_config(config_path) if config_path.is_file() else {} - config_dict = config_dict[0] if isinstance(config_dict, tuple) else config_dict + if not config_path.is_file(): + # config is necessary for the model to work + raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_path}") - self._internal_dict = FrozenDict(config_dict) + config_dict = parent_pipeline._dict_from_json_file(config_path) + self.config = FrozenDict(**config_dict) @property def input_dtype(self): @@ -488,6 +499,83 @@ def input_dtype(self): ) return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} + @property + def device(self): + return self.parent_pipeline.device + + @property + def dtype(self): + for dtype in self.input_dtypes.values(): + torch_dtype = TypeHelper.ort_type_to_torch_type(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = TypeHelper.ort_type_to_torch_type(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + + def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, torch.device): + device = arg + elif isinstance(arg, str): + device = torch.device(arg) + elif isinstance(arg, torch.dtype): + dtype = arg + + if device is not None and device != self.device: + raise ValueError( + "Cannot change the device of a pipeline part without changing the device of the parent pipeline. " + "Please use the `to` method of the parent pipeline to change the device." + ) + + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the pipeline from {self.dtype} to {dtype}. " + f"Please export the pipeline with the desired dtype." 
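# Behaviour sketch for part-level moves (illustrative; assumes an fp32 export currently placed on CPU):
#   part.to(torch.float16)  -> NotImplementedError, the dtype is fixed by the exported ONNX graph
#   part.to("cuda")         -> ValueError, device changes must go through the parent pipeline
#   pipeline.to("cuda")     -> re-binds every component session to the provider matching the device
#                              (e.g. CUDAExecutionProvider) and refreshes pipeline.providers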
+ ) + + def prepare_onnx_inputs(self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray]) -> Dict[str, np.ndarray]: + onnx_inputs = {} + + # converts pytorch inputs into numpy inputs for onnx + for input_name in self.input_names.keys(): + onnx_inputs[input_name] = inputs.pop(input_name) + + if use_torch: + onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True) + + if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: + onnx_inputs[input_name] = onnx_inputs[input_name].astype( + TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name]) + ) + + return onnx_inputs + + def prepare_onnx_outputs( + self, use_torch: bool, *onnx_outputs: np.ndarray + ) -> Dict[str, Union[torch.Tensor, np.ndarray]]: + model_outputs = {} + + # converts onnxruntime outputs into tensor for standard outputs + for output_name, idx in self.output_names.items(): + model_outputs[output_name] = onnx_outputs[idx] + + if use_torch: + model_outputs[output_name] = torch.from_numpy(model_outputs[output_name]).to(self.device) + + return model_outputs + + @abstractmethod + def forward(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + class ORTModelTextEncoder(ORTPipelinePart): def forward( @@ -501,9 +589,9 @@ def forward( model_inputs = {"input_ids": input_ids} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) if output_hidden_states: model_outputs["hidden_states"] = [] @@ -521,6 +609,12 @@ def forward( class ORTModelUnet(ORTPipelinePart): + # def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): + # super().__init__(session, parent_pipeline, subfolder) + + # if not hasattr(self.config, "time_cond_proj_dim"): + # self.config = FrozenDict(**self.config, time_cond_proj_dim=None) + def forward( self, sample: Union[np.ndarray, torch.Tensor], @@ -549,25 +643,48 @@ def forward( **(added_cond_kwargs or {}), } - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) if return_dict: return model_outputs return ModelOutput(**model_outputs) + @property + def add_embedding(self): + return FrozenDict( + linear_1=FrozenDict( + # this is a hacky way to get the attribute in add_embedding.linear_1.in_features + # (StableDiffusionXLImg2ImgPipeline/StableDiffusionXLInpaintPipeline)._get_add_time_ids + in_features=self.config.addition_time_embed_dim + * ( + 5 # list(original_size + crops_coords_top_left + (aesthetic_score,)) + if self.parent_pipeline.config.requires_aesthetics_score + else 6 # list(original_size + crops_coords_top_left + target_size) + ) + + self.parent_pipeline.text_encoder.config.projection_dim + ) + ) + class ORTModelVaeEncoder(ORTPipelinePart): + def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): + super().__init__(session, parent_pipeline, subfolder) + + if not hasattr(self.config, "scaling_factor"): + scaling_factor = 2 ** (len(self.config.block_out_channels) - 1) + self.config = 
FrozenDict(**self.config, scaling_factor=scaling_factor) + def forward(self, sample: Union[np.ndarray, torch.Tensor], return_dict: bool = False): use_torch = isinstance(sample, torch.Tensor) model_inputs = {"sample": sample} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) if "latent_sample" in model_outputs: model_outputs["latents"] = model_outputs.pop("latent_sample") @@ -584,6 +701,13 @@ def forward(self, sample: Union[np.ndarray, torch.Tensor], return_dict: bool = F class ORTModelVaeDecoder(ORTPipelinePart): + # def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): + # super().__init__(session, parent_pipeline, subfolder) + + # if not hasattr(self.config, "scaling_factor"): + # scaling_factor = 2 ** (len(self.config.block_out_channels) - 1) + # self.config = FrozenDict(**self.config, scaling_factor=scaling_factor) + def forward( self, latent_sample: Union[np.ndarray, torch.Tensor], @@ -594,9 +718,9 @@ def forward( model_inputs = {"latent_sample": latent_sample} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) if "latent_sample" in model_outputs: model_outputs["latents"] = model_outputs.pop("latent_sample") @@ -607,36 +731,36 @@ def forward( return ModelOutput(**model_outputs) -class ORTVaeWrapper(ORTPipelinePart): - def __init__(self, vae_encoder: ORTModelVaeEncoder, vae_decoder: ORTModelVaeDecoder, parent_model: ORTPipeline): - self.vae_encoder = vae_encoder - self.vae_decoder = vae_decoder +class ORTWrapperVae(ORTPipelinePart): + def __init__(self, encoder: ORTModelVaeEncoder, decoder: ORTModelVaeDecoder): + if encoder is not None: + self.encoder = encoder - super().__init__(vae_decoder.session, parent_model) + self.decoder = decoder - def encode( - self, - sample: Union[np.ndarray, torch.Tensor], - return_dict: bool = False, - ): - return self.vae_encoder(sample, return_dict) + @property + def config(self): + return self.decoder.config - def decode( - self, - latent_sample: Union[np.ndarray, torch.Tensor], - generator: Optional[torch.Generator] = None, - return_dict: bool = False, - ): - return self.vae_decoder(latent_sample, generator, return_dict) + @property + def dtype(self): + return self.decoder.dtype - def forward( - self, - sample: Union[np.ndarray, torch.Tensor], - generator: Optional[torch.Generator] = None, - return_dict: bool = False, - ): - latent_sample = self.encode(sample).latent_dist.sample(generator=generator) - return self.decode(latent_sample, generator, return_dict) + @property + def device(self): + return self.decoder.device + + def encode(self, *args, **kwargs): + return self.encoder(*args, **kwargs) + + def decode(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + def to(self, *args, **kwargs): + if self.encoder is not None: + self.encoder.to(*args, **kwargs) + + self.decoder.to(*args, **kwargs) @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) @@ -678,31 +802,6 @@ class ORTStableDiffusionXLPipeline(ORTPipeline, StableDiffusionXLPipeline): main_input_name 
= "prompt" auto_model_class = StableDiffusionXLPipeline - def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): - super().__init__(*args, **kwargs) - - requires_aesthetics_score = kwargs.get("requires_aesthetics_score", False) - force_zeros_for_empty_prompt = kwargs.get("force_zeros_for_empty_prompt", True) - self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) - self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) - - add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() - - if add_watermarker: - self.watermark = StableDiffusionXLWatermarker() - else: - self.watermark = None - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids - def _get_add_time_ids( - self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None - ): - add_time_ids = list(original_size + crops_coords_top_left + target_size) - - add_time_ids = torch.tensor([add_time_ids], dtype=dtype) - - return add_time_ids - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionXLImg2ImgPipeline(ORTPipeline, StableDiffusionXLImg2ImgPipeline): @@ -713,49 +812,6 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTPipeline, StableDiffusionXLImg2ImgP main_input_name = "prompt" auto_model_class = StableDiffusionXLImg2ImgPipeline - def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): - super().__init__(*args, **kwargs) - - requires_aesthetics_score = kwargs.get("requires_aesthetics_score", False) - force_zeros_for_empty_prompt = kwargs.get("force_zeros_for_empty_prompt", True) - self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) - self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) - - add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() - - if add_watermarker: - self.watermark = StableDiffusionXLWatermarker() - else: - self.watermark = None - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids - def _get_add_time_ids( - self, - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype, - text_encoder_projection_dim=None, - ): - if self.config.requires_aesthetics_score: - add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) - add_neg_time_ids = list( - negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) - ) - else: - add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) - - add_time_ids = torch.tensor([add_time_ids], dtype=dtype) - add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - - return add_time_ids, add_neg_time_ids - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionXLInpaintPipeline(ORTPipeline, StableDiffusionXLInpaintPipeline): @@ -766,49 +822,6 @@ class ORTStableDiffusionXLInpaintPipeline(ORTPipeline, StableDiffusionXLInpaintP main_input_name = "image" auto_model_class = StableDiffusionXLInpaintPipeline - def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): - super().__init__(*args, **kwargs) - 
- requires_aesthetics_score = kwargs.get("requires_aesthetics_score", False) - force_zeros_for_empty_prompt = kwargs.get("force_zeros_for_empty_prompt", True) - self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) - self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) - - add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() - - if add_watermarker: - self.watermark = StableDiffusionXLWatermarker() - else: - self.watermark = None - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline._get_add_time_ids - def _get_add_time_ids( - self, - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype, - text_encoder_projection_dim=None, - ): - if self.config.requires_aesthetics_score: - add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) - add_neg_time_ids = list( - negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) - ) - else: - add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) - - add_time_ids = torch.tensor([add_time_ids], dtype=dtype) - add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - - return add_time_ids, add_neg_time_ids - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyModelPipeline): From 5a443ac26d2d1c5f839cf2e9b484044b1e6a7524 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 19 Sep 2024 21:01:21 +0200 Subject: [PATCH 56/71] new tiny onnx diffusion model with configs --- optimum/onnxruntime/modeling_diffusion.py | 25 ++++++----------------- tests/onnxruntime/test_modeling.py | 12 ++++++++--- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 28d8e284e8..791cf5a0db 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -31,7 +31,6 @@ AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, - DiffusionPipeline, LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline, StableDiffusionImg2ImgPipeline, @@ -77,8 +76,6 @@ class ORTPipeline(ORTModel, ConfigMixin): - auto_model_class = None - config_name = "model_index.json" def __init__( @@ -491,14 +488,6 @@ def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, config_dict = parent_pipeline._dict_from_json_file(config_path) self.config = FrozenDict(**config_dict) - @property - def input_dtype(self): - logger.warning( - "The `input_dtype` property is deprecated and will be removed in the next release. " - "Please use `input_dtypes` along with `TypeHelper` to get the `numpy` types." 
- ) - return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} - @property def device(self): return self.parent_pipeline.device @@ -521,7 +510,7 @@ def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtyp for arg in args: if isinstance(arg, torch.device): device = arg - elif isinstance(arg, str): + elif isinstance(arg, (int, str)): device = torch.device(arg) elif isinstance(arg, torch.dtype): dtype = arg @@ -670,12 +659,12 @@ def add_embedding(self): class ORTModelVaeEncoder(ORTPipelinePart): - def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): - super().__init__(session, parent_pipeline, subfolder) + # def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): + # super().__init__(session, parent_pipeline, subfolder) - if not hasattr(self.config, "scaling_factor"): - scaling_factor = 2 ** (len(self.config.block_out_channels) - 1) - self.config = FrozenDict(**self.config, scaling_factor=scaling_factor) + # if not hasattr(self.config, "scaling_factor"): + # scaling_factor = 2 ** (len(self.config.block_out_channels) - 1) + # self.config = FrozenDict(**self.config, scaling_factor=scaling_factor) def forward(self, sample: Union[np.ndarray, torch.Tensor], return_dict: bool = False): use_torch = isinstance(sample, torch.Tensor) @@ -868,7 +857,6 @@ def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tr class ORTDiffusionPipeline(ConfigMixin): - auto_model_class = DiffusionPipeline config_name = "model_index.json" @classmethod @@ -944,7 +932,6 @@ def _get_model_name(pipeline_class_name): class ORTPipelineForTask(ConfigMixin): - auto_model_class = DiffusionPipeline config_name = "model_index.json" @classmethod diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 199b96342e..f7b8ec3392 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -144,7 +144,7 @@ def __init__(self, *args, **kwargs): self.ONNX_SEQ2SEQ_MODEL_ID = "optimum/t5-small" self.LARGE_ONNX_SEQ2SEQ_MODEL_ID = "facebook/mbart-large-en-ro" self.TINY_ONNX_SEQ2SEQ_MODEL_ID = "fxmarty/sshleifer-tiny-mbart-onnx" - self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "IlyasMoutawwakil/tiny-stable-diffusion-onnx" def test_load_model_from_local_path(self): model = ORTModel.from_pretrained(self.LOCAL_MODEL_PATH) @@ -218,11 +218,9 @@ def test_load_seq2seq_model_from_empty_cache(self): @require_diffusers def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching - model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True ) - self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder) @@ -320,6 +318,7 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder) self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="cat", num_inference_steps=2) @require_diffusers @require_torch_gpu @@ -334,6 +333,7 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_decoder.session.get_providers(), 
model.providers) self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="cat", num_inference_steps=2) @require_diffusers @require_torch_gpu @@ -349,6 +349,7 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_decoder.session.get_providers(), model.providers) self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="cat", num_inference_steps=2) @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): @@ -361,6 +362,7 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_decoder.session.get_providers(), model.providers) self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + model(prompt="cat", num_inference_steps=2) @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): @@ -844,6 +846,7 @@ def test_stable_diffusion_model_on_cpu(self): self.assertEqual(model.vae_decoder.session.get_providers()[0], "CPUExecutionProvider") self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) + model(prompt="cat", num_inference_steps=2) @require_diffusers def test_stable_diffusion_model_on_cpu_str(self): @@ -860,6 +863,7 @@ def test_stable_diffusion_model_on_cpu_str(self): self.assertEqual(model.vae_decoder.session.get_providers()[0], "CPUExecutionProvider") self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) + model(prompt="cat", num_inference_steps=2) @require_diffusers @require_torch_gpu @@ -878,6 +882,7 @@ def test_stable_diffusion_model_on_gpu(self): self.assertEqual(model.vae_decoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + model(prompt="cat", num_inference_steps=2) @require_diffusers @require_torch_gpu @@ -897,6 +902,7 @@ def test_stable_diffusion_model_on_rocm_ep(self): self.assertEqual(model.vae_decoder.session.get_providers()[0], "ROCMExecutionProvider") self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + model(prompt="cat", num_inference_steps=2) @require_diffusers @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") From 15c94bc6a68cbb19c0272f9e90abc26b618d7fa5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 20 Sep 2024 11:57:22 +0200 Subject: [PATCH 57/71] model_save_path --- optimum/onnxruntime/modeling_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 791cf5a0db..14e41f2781 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -339,7 +339,7 @@ def _from_pretrained( **models, **sub_models, use_io_binding=use_io_binding, - model_save_dir=model_save_dir, + model_save_dir=model_save_dir or model_save_path, ) @classmethod From 2e242e5d6da2339e3ef63ceaec31a670619566e1 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil 
<57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Thu, 3 Oct 2024 17:15:04 +0200 Subject: [PATCH 58/71] Update optimum/onnxruntime/modeling_diffusion.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/onnxruntime/modeling_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 14e41f2781..bf6dc002f9 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -560,7 +560,7 @@ def prepare_onnx_outputs( @abstractmethod def forward(self, *args, **kwargs): - pass + raise NotImplementedError def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) From 9ed1afdeb50a25f2ba493020ac38a8f026355abf Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 7 Oct 2024 00:37:03 +0200 Subject: [PATCH 59/71] migrate tiny-stable-diffusion-onnx --- tests/onnxruntime/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index f7b8ec3392..326f6d9d23 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -144,7 +144,7 @@ def __init__(self, *args, **kwargs): self.ONNX_SEQ2SEQ_MODEL_ID = "optimum/t5-small" self.LARGE_ONNX_SEQ2SEQ_MODEL_ID = "facebook/mbart-large-en-ro" self.TINY_ONNX_SEQ2SEQ_MODEL_ID = "fxmarty/sshleifer-tiny-mbart-onnx" - self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "IlyasMoutawwakil/tiny-stable-diffusion-onnx" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "optimum-internal-testing/tiny-stable-diffusion-onnx" def test_load_model_from_local_path(self): model = ORTModel.from_pretrained(self.LOCAL_MODEL_PATH) From 6c45e8c17822a6e8f78543ec6641ec458a4d340a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 7 Oct 2024 00:55:09 +0200 Subject: [PATCH 60/71] resolve breaking change and mandatory arguments --- optimum/onnxruntime/modeling_diffusion.py | 61 ++++++++++++----------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index bf6dc002f9..bf78e89b2d 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -80,18 +80,17 @@ class ORTPipeline(ORTModel, ConfigMixin): def __init__( self, - # diffusers specific arguments - unet: Optional[ort.InferenceSession] = None, - vae_encoder: Optional[ort.InferenceSession] = None, - vae_decoder: Optional[ort.InferenceSession] = None, - text_encoder: Optional[ort.InferenceSession] = None, - text_encoder_2: Optional[ort.InferenceSession] = None, - image_encoder: Optional[ort.InferenceSession] = None, - safety_checker: Optional[ort.InferenceSession] = None, - scheduler: Optional["SchedulerMixin"] = None, - tokenizer: Optional["CLIPTokenizer"] = None, - tokenizer_2: Optional["CLIPTokenizer"] = None, + # diffusers mandatory arguments + tokenizer: Optional["CLIPTokenizer"], + scheduler: Optional["SchedulerMixin"], + unet_session: Optional[ort.InferenceSession], + vae_decoder_session: Optional[ort.InferenceSession], + # diffusers optional arguments + vae_encoder_session: Optional[ort.InferenceSession] = None, + text_encoder_session: Optional[ort.InferenceSession] = None, + text_encoder_2_session: Optional[ort.InferenceSession] = None, feature_extractor: Optional["CLIPFeatureExtractor"] = None, + tokenizer_2: Optional["CLIPTokenizer"] = None, # stable diffusion 
xl specific arguments requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, @@ -101,34 +100,38 @@ def __init__( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - ############################################################################################################ - self.unet = ORTModelUnet(unet, self, subfolder=DIFFUSION_MODEL_UNET_SUBFOLDER) - self.vae_encoder = ( - ORTModelVaeEncoder(vae_encoder, self, subfolder=DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) - if vae_encoder is not None - else None + if kwargs: + logger.warning(f"{self.__class__.__name__} received additional arguments that are not used.") + + # mandatory components + self.unet = ORTModelUnet(unet_session, self, subfolder=DIFFUSION_MODEL_UNET_SUBFOLDER) + self.vae_decoder = ORTModelVaeDecoder( + vae_decoder_session, self, subfolder=DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER ) - self.vae_decoder = ( - ORTModelVaeDecoder(vae_decoder, self, subfolder=DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) - if vae_decoder is not None + + # optional components + self.vae_encoder = ( + ORTModelVaeEncoder(vae_encoder_session, self, subfolder=DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) + if vae_encoder_session is not None else None ) self.text_encoder = ( - ORTModelTextEncoder(text_encoder, self, subfolder=DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) - if text_encoder is not None + ORTModelTextEncoder(text_encoder_session, self, subfolder=DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) + if text_encoder_session is not None else None ) self.text_encoder_2 = ( - ORTModelTextEncoder(text_encoder_2, self, subfolder=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) - if text_encoder_2 is not None + ORTModelTextEncoder(text_encoder_2_session, self, subfolder=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) + if text_encoder_2_session is not None else None ) - self.image_encoder = image_encoder # TODO: maybe implement ORTModelImageEncoder - self.safety_checker = safety_checker # TODO: maybe implement ORTModelSafetyChecker # We wrap the VAE encoder and decoder in a single object to simplify the API self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) + self.image_encoder = None # TODO: maybe implement ORTModelImageEncoder + self.safety_checker = None # TODO: maybe implement ORTModelSafetyChecker + self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 @@ -155,14 +158,14 @@ def __init__( if key in all_possible_init_args: diffusers_pipeline_args[key] = all_possible_init_args[key] - # init stuff like config, vae_scale_factor, image_processor, etc. + # inits stuff like config, vae_scale_factor, image_processor, etc. 
self.auto_model_class.__init__(self, **diffusers_pipeline_args) + # not registered correctly in the config self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) - ############################################################################################################ - self.shared_attributes_init(model=unet, use_io_binding=use_io_binding, model_save_dir=model_save_dir) + self.shared_attributes_init(model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir) @staticmethod def load_model( From 3e27e4ca9ae3cbe906e34c3d2d27d1e2589b3f15 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 7 Oct 2024 01:10:31 +0200 Subject: [PATCH 61/71] overwrite _get_add_time_ids --- optimum/onnxruntime/modeling_diffusion.py | 127 +++++++++++++--------- 1 file changed, 78 insertions(+), 49 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index bf78e89b2d..ff044db32f 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -170,8 +170,8 @@ def __init__( @staticmethod def load_model( unet_path: Union[str, Path], + vae_decoder_path: Optional[Union[str, Path]], vae_encoder_path: Optional[Union[str, Path]] = None, - vae_decoder_path: Optional[Union[str, Path]] = None, text_encoder_path: Optional[Union[str, Path]] = None, text_encoder_2_path: Optional[Union[str, Path]] = None, provider: str = "CPUExecutionProvider", @@ -185,10 +185,10 @@ def load_model( Args: unet_path (`Union[str, Path]`): The path to the U-NET ONNX model. + vae_decoder_path (`Union[str, Path]`): + The path to the VAE decoder ONNX model. vae_encoder_path (`Union[str, Path]`, defaults to `None`): The path to the VAE encoder ONNX model. - vae_decoder_path (`Union[str, Path]`, defaults to `None`): - The path to the VAE decoder ONNX model. text_encoder_path (`Union[str, Path]`, defaults to `None`): The path to the text encoder ONNX model. 
text_encoder_2_path (`Union[str, Path]`, defaults to `None`): @@ -204,8 +204,8 @@ def load_model( """ paths = { "unet": unet_path, - "vae_encoder": vae_encoder_path, "vae_decoder": vae_decoder_path, + "vae_encoder": vae_encoder_path, "text_encoder": text_encoder_path, "text_encoder_2": text_encoder_2_path, } @@ -224,8 +224,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): models_to_save_paths = { self.unet: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.vae_encoder: save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, self.vae_decoder: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, + self.vae_encoder: save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, self.text_encoder: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, self.text_encoder_2: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / ONNX_WEIGHTS_NAME, } @@ -245,21 +245,20 @@ def _save_pretrained(self, save_directory: Union[str, Path]): config_save_path = model_save_path.parent / CONFIG_NAME shutil.copyfile(config_path, config_save_path) + self.tokenizer.save_pretrained(save_directory / "tokenizer") self.scheduler.save_pretrained(save_directory / "scheduler") - if self.tokenizer is not None: - self.tokenizer.save_pretrained(save_directory / "tokenizer") - if self.tokenizer_2 is not None: - self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") if self.feature_extractor is not None: self.feature_extractor.save_pretrained(save_directory / "feature_extractor") + if self.tokenizer_2 is not None: + self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") @classmethod def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], - subfolder: str = "", + subfolder: str = "", # not used ? 
force_download: bool = False, local_files_only: bool = False, revision: Optional[str] = None, @@ -420,12 +419,11 @@ def to(self, device: Union[torch.device, str, int]): self.unet.session.set_providers([provider], provider_options=[provider_options]) + self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) + if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) - if self.vae_decoder is not None: - self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) - if self.text_encoder is not None: self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) @@ -601,12 +599,6 @@ def forward( class ORTModelUnet(ORTPipelinePart): - # def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): - # super().__init__(session, parent_pipeline, subfolder) - - # if not hasattr(self.config, "time_cond_proj_dim"): - # self.config = FrozenDict(**self.config, time_cond_proj_dim=None) - def forward( self, sample: Union[np.ndarray, torch.Tensor], @@ -644,31 +636,8 @@ def forward( return ModelOutput(**model_outputs) - @property - def add_embedding(self): - return FrozenDict( - linear_1=FrozenDict( - # this is a hacky way to get the attribute in add_embedding.linear_1.in_features - # (StableDiffusionXLImg2ImgPipeline/StableDiffusionXLInpaintPipeline)._get_add_time_ids - in_features=self.config.addition_time_embed_dim - * ( - 5 # list(original_size + crops_coords_top_left + (aesthetic_score,)) - if self.parent_pipeline.config.requires_aesthetics_score - else 6 # list(original_size + crops_coords_top_left + target_size) - ) - + self.parent_pipeline.text_encoder.config.projection_dim - ) - ) - class ORTModelVaeEncoder(ORTPipelinePart): - # def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): - # super().__init__(session, parent_pipeline, subfolder) - - # if not hasattr(self.config, "scaling_factor"): - # scaling_factor = 2 ** (len(self.config.block_out_channels) - 1) - # self.config = FrozenDict(**self.config, scaling_factor=scaling_factor) - def forward(self, sample: Union[np.ndarray, torch.Tensor], return_dict: bool = False): use_torch = isinstance(sample, torch.Tensor) @@ -693,13 +662,6 @@ def forward(self, sample: Union[np.ndarray, torch.Tensor], return_dict: bool = F class ORTModelVaeDecoder(ORTPipelinePart): - # def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): - # super().__init__(session, parent_pipeline, subfolder) - - # if not hasattr(self.config, "scaling_factor"): - # scaling_factor = 2 ** (len(self.config.block_out_channels) - 1) - # self.config = FrozenDict(**self.config, scaling_factor=scaling_factor) - def forward( self, latent_sample: Union[np.ndarray, torch.Tensor], @@ -794,6 +756,19 @@ class ORTStableDiffusionXLPipeline(ORTPipeline, StableDiffusionXLPipeline): main_input_name = "prompt" auto_model_class = StableDiffusionXLPipeline + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + dtype, + text_encoder_projection_dim=None, + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionXLImg2ImgPipeline(ORTPipeline, StableDiffusionXLImg2ImgPipeline): @@ -804,6 +779,33 @@ class 
ORTStableDiffusionXLImg2ImgPipeline(ORTPipeline, StableDiffusionXLImg2ImgP main_input_name = "prompt" auto_model_class = StableDiffusionXLImg2ImgPipeline + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionXLInpaintPipeline(ORTPipeline, StableDiffusionXLInpaintPipeline): @@ -814,6 +816,33 @@ class ORTStableDiffusionXLInpaintPipeline(ORTPipeline, StableDiffusionXLInpaintP main_input_name = "image" auto_model_class = StableDiffusionXLInpaintPipeline + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyModelPipeline): From 65a366c58b41962815e2420a87600f12ca37f07c Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 7 Oct 2024 09:15:27 +0200 Subject: [PATCH 62/71] fix --- optimum/onnxruntime/modeling_diffusion.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index ff044db32f..852ef1bf80 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -211,11 +211,11 @@ def load_model( } sessions = {} - for model_name, model_path in paths.items(): - if model_path is not None and model_path.is_file(): - sessions[model_name] = ORTModel.load_model(model_path, provider, session_options, provider_options) + for key, path in paths.items(): + if path is not None and path.is_file(): + sessions[f"{key}_session"] = ORTModel.load_model(path, provider, session_options, provider_options) else: - sessions[model_name] = None + sessions[f"{key}_session"] = None return sessions @@ -321,8 +321,8 @@ def _from_pretrained( paths = { "unet_path": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - "vae_encoder_path": model_save_path / 
DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "vae_decoder_path": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + "vae_encoder_path": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "text_encoder_path": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, "text_encoder_2_path": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER From 7789a7f0ae6ae4fc81c8fb9a8ba3202cea8a783a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 7 Oct 2024 09:30:14 +0200 Subject: [PATCH 63/71] remove inference calls from loading tests --- tests/onnxruntime/test_modeling.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 326f6d9d23..0fd34eabf8 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -318,7 +318,6 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder) self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) - model(prompt="cat", num_inference_steps=2) @require_diffusers @require_torch_gpu @@ -333,7 +332,6 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_decoder.session.get_providers(), model.providers) self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) - model(prompt="cat", num_inference_steps=2) @require_diffusers @require_torch_gpu @@ -349,7 +347,6 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_decoder.session.get_providers(), model.providers) self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) - model(prompt="cat", num_inference_steps=2) @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): @@ -362,7 +359,6 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_decoder.session.get_providers(), model.providers) self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) - model(prompt="cat", num_inference_steps=2) @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): @@ -846,7 +842,6 @@ def test_stable_diffusion_model_on_cpu(self): self.assertEqual(model.vae_decoder.session.get_providers()[0], "CPUExecutionProvider") self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) - model(prompt="cat", num_inference_steps=2) @require_diffusers def test_stable_diffusion_model_on_cpu_str(self): @@ -863,7 +858,6 @@ def test_stable_diffusion_model_on_cpu_str(self): self.assertEqual(model.vae_decoder.session.get_providers()[0], "CPUExecutionProvider") self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) - model(prompt="cat", num_inference_steps=2) @require_diffusers @require_torch_gpu @@ -882,7 +876,6 @@ def test_stable_diffusion_model_on_gpu(self): self.assertEqual(model.vae_decoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, 
["CUDAExecutionProvider", "CPUExecutionProvider"]) - model(prompt="cat", num_inference_steps=2) @require_diffusers @require_torch_gpu @@ -902,7 +895,6 @@ def test_stable_diffusion_model_on_rocm_ep(self): self.assertEqual(model.vae_decoder.session.get_providers()[0], "ROCMExecutionProvider") self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) - model(prompt="cat", num_inference_steps=2) @require_diffusers @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") From 913e415da799b683db2192c05dfb0dc2001401d9 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 7 Oct 2024 11:51:18 +0200 Subject: [PATCH 64/71] misc --- optimum/onnxruntime/modeling_diffusion.py | 125 +++++++++++----------- 1 file changed, 63 insertions(+), 62 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 852ef1bf80..ccadc47722 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -80,36 +80,31 @@ class ORTPipeline(ORTModel, ConfigMixin): def __init__( self, - # diffusers mandatory arguments - tokenizer: Optional["CLIPTokenizer"], - scheduler: Optional["SchedulerMixin"], - unet_session: Optional[ort.InferenceSession], - vae_decoder_session: Optional[ort.InferenceSession], - # diffusers optional arguments + scheduler: "SchedulerMixin", + unet_session: ort.InferenceSession, + vae_decoder_session: ort.InferenceSession, + # optional pipeline models vae_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, - feature_extractor: Optional["CLIPFeatureExtractor"] = None, + # optional pipeline submodels + tokenizer: Optional["CLIPTokenizer"] = None, tokenizer_2: Optional["CLIPTokenizer"] = None, + feature_extractor: Optional["CLIPFeatureExtractor"] = None, # stable diffusion xl specific arguments - requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, + requires_aesthetics_score: bool = False, add_watermarker: Optional[bool] = None, # onnxruntime specific arguments use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if kwargs: - logger.warning(f"{self.__class__.__name__} received additional arguments that are not used.") - - # mandatory components self.unet = ORTModelUnet(unet_session, self, subfolder=DIFFUSION_MODEL_UNET_SUBFOLDER) self.vae_decoder = ORTModelVaeDecoder( vae_decoder_session, self, subfolder=DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER ) - # optional components self.vae_encoder = ( ORTModelVaeEncoder(vae_encoder_session, self, subfolder=DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) if vae_encoder_session is not None @@ -126,24 +121,25 @@ def __init__( else None ) - # We wrap the VAE encoder and decoder in a single object to simplify the API + # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) - self.image_encoder = None # TODO: maybe implement ORTModelImageEncoder - self.safety_checker = None # TODO: maybe implement ORTModelSafetyChecker - self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 self.feature_extractor = feature_extractor - all_possible_init_args = { + # we allow passing these as torch models for now + 
self.image_encoder = kwargs.pop("image_encoder", None) # TODO: maybe implement ORTModelImageEncoder + self.safety_checker = kwargs.pop("safety_checker", None) # TODO: maybe implement ORTModelSafetyChecker + + all_pipeline_init_args = { "vae": self.vae, "unet": self.unet, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, - "image_encoder": self.image_encoder, "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, "scheduler": self.scheduler, "tokenizer": self.tokenizer, "tokenizer_2": self.tokenizer_2, @@ -155,17 +151,17 @@ def __init__( diffusers_pipeline_args = {} for key in inspect.signature(self.auto_model_class).parameters.keys(): - if key in all_possible_init_args: - diffusers_pipeline_args[key] = all_possible_init_args[key] + if key in all_pipeline_init_args: + diffusers_pipeline_args[key] = all_pipeline_init_args[key] # inits stuff like config, vae_scale_factor, image_processor, etc. self.auto_model_class.__init__(self, **diffusers_pipeline_args) - - # not registered correctly in the config - self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) - self.shared_attributes_init(model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir) + self.shared_attributes_init( + model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs + ) @staticmethod def load_model( @@ -175,8 +171,8 @@ def load_model( text_encoder_path: Optional[Union[str, Path]] = None, text_encoder_2_path: Optional[Union[str, Path]] = None, provider: str = "CPUExecutionProvider", - session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, + session_options: Optional[ort.SessionOptions] = None, ): """ Creates three inference sessions for the components of a Diffusion Pipeline (U-NET, VAE, Text Encoders). @@ -196,11 +192,11 @@ def load_model( provider (`str`, defaults to `"CPUExecutionProvider"`): ONNX Runtime provider to use for loading the model. See https://onnxruntime.ai/docs/execution-providers/ for possible providers. - session_options (`Optional[ort.SessionOptions]`, defaults to `None`): - ONNX Runtime session options to use for loading the model. Defaults to `None`. provider_options (`Optional[Dict[str, Any]]`, defaults to `None`): Provider option dictionary corresponding to the provider used. See available options for each provider: https://onnxruntime.ai/docs/api/c/group___global.html . Defaults to `None`. + session_options (`Optional[ort.SessionOptions]`, defaults to `None`): + ONNX Runtime session options to use for loading the model. Defaults to `None`. 
""" paths = { "unet": unet_path, @@ -209,7 +205,6 @@ def load_model( "text_encoder": text_encoder_path, "text_encoder_2": text_encoder_2_path, } - sessions = {} for key, path in paths.items(): if path is not None and path.is_file(): @@ -223,13 +218,13 @@ def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) models_to_save_paths = { - self.unet: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.vae_decoder: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.vae_encoder: save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.text_encoder: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.text_encoder_2: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / ONNX_WEIGHTS_NAME, + (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME), + (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME), + (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME), + (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME), + (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / ONNX_WEIGHTS_NAME), } - for model, model_save_path in models_to_save_paths.items(): + for model, model_save_path in models_to_save_paths: if model is not None: model_path = Path(model.session._model_path) model_save_path.parent.mkdir(parents=True, exist_ok=True) @@ -245,37 +240,43 @@ def _save_pretrained(self, save_directory: Union[str, Path]): config_save_path = model_save_path.parent / CONFIG_NAME shutil.copyfile(config_path, config_save_path) - self.tokenizer.save_pretrained(save_directory / "tokenizer") self.scheduler.save_pretrained(save_directory / "scheduler") - if self.feature_extractor is not None: - self.feature_extractor.save_pretrained(save_directory / "feature_extractor") + if self.tokenizer is not None: + self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.feature_extractor is not None: + self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @classmethod def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], - subfolder: str = "", # not used ? + subfolder: str = "", force_download: bool = False, local_files_only: bool = False, revision: Optional[str] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, - vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, + vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", - session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, + session_options: Optional[ort.SessionOptions] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): + if use_io_binding: + raise ValueError( + "IOBinding is not yet available for diffusion pipelines, please set `use_io_binding` to False." 
+ ) + all_components = {key for key in config.keys() if not key.startswith("_")} all_components.update({"vae_encoder", "vae_decoder"}) @@ -284,8 +285,8 @@ def _from_pretrained( allow_patterns.update( { unet_file_name, - vae_encoder_file_name, vae_decoder_file_name, + vae_encoder_file_name, text_encoder_file_name, text_encoder_2_file_name, SCHEDULER_CONFIG_NAME, @@ -293,6 +294,10 @@ def _from_pretrained( CONFIG_NAME, } ) + + if subfolder: + allow_patterns = {os.path.join(subfolder, pattern) for pattern in allow_patterns} + model_id = snapshot_download( model_id, cache_dir=cache_dir, @@ -306,7 +311,10 @@ def _from_pretrained( model_save_path = Path(model_id) - sub_models = {} + if subfolder: + model_save_path = model_save_path / subfolder + + submodels = {} for name in {"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}: library_name, library_classes = config.get(name, (None, None)) if library_classes is not None: @@ -315,31 +323,24 @@ def _from_pretrained( load_method = getattr(class_obj, "from_pretrained") # Check if the module is in a subdirectory if (model_save_path / name).is_dir(): - sub_models[name] = load_method(model_save_path / name) + submodels[name] = load_method(model_save_path / name) else: - sub_models[name] = load_method(model_save_path) + submodels[name] = load_method(model_save_path) - paths = { - "unet_path": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - "vae_decoder_path": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, - "vae_encoder_path": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - "text_encoder_path": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, - "text_encoder_2_path": model_save_path - / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER - / text_encoder_2_file_name, - } models = cls.load_model( - **paths, provider=provider, session_options=session_options, provider_options=provider_options + unet_path=model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + vae_decoder_path=model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + vae_encoder_path=model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + text_encoder_path=model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, + text_encoder_2_path=model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + provider=provider, + provider_options=provider_options, + session_options=session_options, ) - if use_io_binding: - raise ValueError( - "IOBinding is not yet available for stable diffusion model, please set `use_io_binding` to False." 
- ) - return cls( **models, - **sub_models, + **submodels, use_io_binding=use_io_binding, model_save_dir=model_save_dir or model_save_path, ) @@ -391,8 +392,8 @@ def _export( model_save_path, config=config, provider=provider, - session_options=session_options, provider_options=provider_options, + session_options=session_options, use_io_binding=use_io_binding, model_save_dir=model_save_dir, ) From 34e278999bb83fa39f89267776ae593944c4a4a5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 7 Oct 2024 13:17:30 +0200 Subject: [PATCH 65/71] better compatibility between model parts and parent pipeline --- optimum/onnxruntime/modeling_diffusion.py | 56 +++++++++-------------- tests/onnxruntime/test_modeling.py | 10 ++++ 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index ccadc47722..9ea4aa8597 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -25,7 +25,7 @@ import numpy as np import torch -from diffusers.configuration_utils import ConfigMixin, FrozenDict +from diffusers.configuration_utils import ConfigMixin from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution from diffusers.pipelines import ( AutoPipelineForImage2Image, @@ -100,27 +100,15 @@ def __init__( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - self.unet = ORTModelUnet(unet_session, self, subfolder=DIFFUSION_MODEL_UNET_SUBFOLDER) - self.vae_decoder = ORTModelVaeDecoder( - vae_decoder_session, self, subfolder=DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER - ) - - self.vae_encoder = ( - ORTModelVaeEncoder(vae_encoder_session, self, subfolder=DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) - if vae_encoder_session is not None - else None - ) + self.unet = ORTModelUnet(unet_session, self) + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) + self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None self.text_encoder = ( - ORTModelTextEncoder(text_encoder_session, self, subfolder=DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) - if text_encoder_session is not None - else None + ORTModelTextEncoder(text_encoder_session, self) if text_encoder_session is not None else None ) self.text_encoder_2 = ( - ORTModelTextEncoder(text_encoder_2_session, self, subfolder=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) - if text_encoder_2_session is not None - else None + ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None ) - # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) @@ -154,11 +142,10 @@ def __init__( if key in all_pipeline_init_args: diffusers_pipeline_args[key] = all_pipeline_init_args[key] - # inits stuff like config, vae_scale_factor, image_processor, etc. 
+ # inits the diffusers pipeline specific attributes self.auto_model_class.__init__(self, **diffusers_pipeline_args) - self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) - self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + # inits ort specific attributes self.shared_attributes_init( model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs ) @@ -469,10 +456,21 @@ def __call__(self, *args, **kwargs): return self.auto_model_class.__call__(self, *args, **kwargs) -class ORTPipelinePart: - def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, subfolder: str): +class ORTPipelinePart(ConfigMixin): + config_name: str = CONFIG_NAME + + def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline): + config_file_path = Path(session._model_path).parent / self.config_name + + if not config_file_path.is_file(): + # config is mandatory for the model part to be used for inference + raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}") + + config_dict = parent_pipeline._dict_from_json_file(config_file_path) + self.register_to_config(**config_dict) + + # ort model part self.session = session - self.subfolder = subfolder self.parent_pipeline = parent_pipeline self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} @@ -480,16 +478,6 @@ def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline, self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()} - self.model_save_dir = Path(self.session._model_path).parent - config_path = self.model_save_dir / CONFIG_NAME - - if not config_path.is_file(): - # config is necessary for the model to work - raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_path}") - - config_dict = parent_pipeline._dict_from_json_file(config_path) - self.config = FrozenDict(**config_dict) - @property def device(self): return self.parent_pipeline.device diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 0fd34eabf8..1724d487e8 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -227,6 +227,8 @@ def test_load_stable_diffusion_model_from_cache(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( @@ -319,6 +321,8 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test @@ -333,6 +337,8 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers @require_torch_gpu @require_ort_rocm @@ -348,6 +354,8 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) 
self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( @@ -360,6 +368,8 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with self.assertRaises(ValueError): From e4c71844c4091bfcf76d6ba13e2f401175ac74cc Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 7 Oct 2024 17:22:20 +0200 Subject: [PATCH 66/71] remove subfolder --- optimum/onnxruntime/modeling_diffusion.py | 25 ++++++++--------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 9ea4aa8597..51adf8c510 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -77,6 +77,7 @@ class ORTPipeline(ORTModel, ConfigMixin): config_name = "model_index.json" + auto_model_class = None def __init__( self, @@ -241,7 +242,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], - subfolder: str = "", force_download: bool = False, local_files_only: bool = False, revision: Optional[str] = None, @@ -264,10 +264,8 @@ def _from_pretrained( "IOBinding is not yet available for diffusion pipelines, please set `use_io_binding` to False." ) - all_components = {key for key in config.keys() if not key.startswith("_")} - all_components.update({"vae_encoder", "vae_decoder"}) - if not os.path.isdir(str(model_id)): + all_components = {key for key in config.keys() if not key.startswith("_")} | {"vae_encoder", "vae_decoder"} allow_patterns = {os.path.join(component, "*") for component in all_components} allow_patterns.update( { @@ -281,10 +279,6 @@ def _from_pretrained( CONFIG_NAME, } ) - - if subfolder: - allow_patterns = {os.path.join(subfolder, pattern) for pattern in allow_patterns} - model_id = snapshot_download( model_id, cache_dir=cache_dir, @@ -298,8 +292,8 @@ def _from_pretrained( model_save_path = Path(model_id) - if subfolder: - model_save_path = model_save_path / subfolder + if model_save_dir is None: + model_save_dir = model_save_path submodels = {} for name in {"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}: @@ -329,7 +323,7 @@ def _from_pretrained( **models, **submodels, use_io_binding=use_io_binding, - model_save_dir=model_save_dir or model_save_path, + model_save_dir=model_save_dir, ) @classmethod @@ -437,8 +431,8 @@ def components(self) -> Dict[str, Any]: "unet": self.unet, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, - "image_encoder": self.image_encoder, "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, } components = {k: v for k, v in components.items() if v is not None} return components @@ -466,17 +460,16 @@ def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline): # config is mandatory for the model part to be used for inference raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}") - config_dict = parent_pipeline._dict_from_json_file(config_file_path) + config_dict = self._dict_from_json_file(config_file_path) 
self.register_to_config(**config_dict) - # ort model part self.session = session self.parent_pipeline = parent_pipeline self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} - self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} - self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()} + self.input_dtypes = {input_key.name: input_key.type for input_key in self.session.get_inputs()} + self.output_dtypes = {output_key.name: output_key.type for output_key in self.session.get_outputs()} @property def device(self): From f369b084bc4f5f7c04c389aa2b750fb5a6b62479 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 8 Oct 2024 01:32:18 +0200 Subject: [PATCH 67/71] misc --- optimum/onnxruntime/modeling_diffusion.py | 164 ++++++++-------------- 1 file changed, 61 insertions(+), 103 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 51adf8c510..ae31eb44d7 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -110,18 +110,19 @@ def __init__( self.text_encoder_2 = ( ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None ) + # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) + # we allow passing these as torch models for now + self.image_encoder = kwargs.pop("image_encoder", None) # TODO: maybe implement ORTModelImageEncoder + self.safety_checker = kwargs.pop("safety_checker", None) # TODO: maybe implement ORTModelSafetyChecker + self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 self.feature_extractor = feature_extractor - # we allow passing these as torch models for now - self.image_encoder = kwargs.pop("image_encoder", None) # TODO: maybe implement ORTModelImageEncoder - self.safety_checker = kwargs.pop("safety_checker", None) # TODO: maybe implement ORTModelSafetyChecker - all_pipeline_init_args = { "vae": self.vae, "unet": self.unet, @@ -142,8 +143,6 @@ def __init__( for key in inspect.signature(self.auto_model_class).parameters.keys(): if key in all_pipeline_init_args: diffusers_pipeline_args[key] = all_pipeline_init_args[key] - - # inits the diffusers pipeline specific attributes self.auto_model_class.__init__(self, **diffusers_pipeline_args) # inits ort specific attributes @@ -151,81 +150,30 @@ def __init__( model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs ) - @staticmethod - def load_model( - unet_path: Union[str, Path], - vae_decoder_path: Optional[Union[str, Path]], - vae_encoder_path: Optional[Union[str, Path]] = None, - text_encoder_path: Optional[Union[str, Path]] = None, - text_encoder_2_path: Optional[Union[str, Path]] = None, - provider: str = "CPUExecutionProvider", - provider_options: Optional[Dict[str, Any]] = None, - session_options: Optional[ort.SessionOptions] = None, - ): - """ - Creates three inference sessions for the components of a Diffusion Pipeline (U-NET, VAE, Text Encoders). - The default provider is `CPUExecutionProvider` to match the default behaviour in PyTorch/TensorFlow/JAX. - - Args: - unet_path (`Union[str, Path]`): - The path to the U-NET ONNX model. 
- vae_decoder_path (`Union[str, Path]`): - The path to the VAE decoder ONNX model. - vae_encoder_path (`Union[str, Path]`, defaults to `None`): - The path to the VAE encoder ONNX model. - text_encoder_path (`Union[str, Path]`, defaults to `None`): - The path to the text encoder ONNX model. - text_encoder_2_path (`Union[str, Path]`, defaults to `None`): - The path to the second text decoder ONNX model. - provider (`str`, defaults to `"CPUExecutionProvider"`): - ONNX Runtime provider to use for loading the model. See https://onnxruntime.ai/docs/execution-providers/ - for possible providers. - provider_options (`Optional[Dict[str, Any]]`, defaults to `None`): - Provider option dictionary corresponding to the provider used. See available options - for each provider: https://onnxruntime.ai/docs/api/c/group___global.html . Defaults to `None`. - session_options (`Optional[ort.SessionOptions]`, defaults to `None`): - ONNX Runtime session options to use for loading the model. Defaults to `None`. - """ - paths = { - "unet": unet_path, - "vae_decoder": vae_decoder_path, - "vae_encoder": vae_encoder_path, - "text_encoder": text_encoder_path, - "text_encoder_2": text_encoder_2_path, - } - sessions = {} - for key, path in paths.items(): - if path is not None and path.is_file(): - sessions[f"{key}_session"] = ORTModel.load_model(path, provider, session_options, provider_options) - else: - sessions[f"{key}_session"] = None - - return sessions - def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) models_to_save_paths = { - (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME), - (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME), - (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME), - (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME), - (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / ONNX_WEIGHTS_NAME), + (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER), + (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER), + (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), + (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), + (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), } - for model, model_save_path in models_to_save_paths: + for model, save_path in models_to_save_paths: if model is not None: model_path = Path(model.session._model_path) - model_save_path.parent.mkdir(parents=True, exist_ok=True) + save_path.mkdir(parents=True, exist_ok=True) # copy onnx model - shutil.copyfile(model_path, model_save_path) + shutil.copyfile(model_path, save_path / ONNX_WEIGHTS_NAME) # copy external onnx data external_data_paths = _get_model_external_data_paths(model_path) for external_data_path in external_data_paths: - shutil.copyfile(external_data_path, model_save_path.parent / external_data_path.name) + shutil.copyfile(external_data_path, save_path / external_data_path.name) # copy model config config_path = model_path.parent / CONFIG_NAME if config_path.is_file(): - config_save_path = model_save_path.parent / CONFIG_NAME + config_save_path = save_path / CONFIG_NAME shutil.copyfile(config_path, config_save_path) self.scheduler.save_pretrained(save_directory / "scheduler") @@ -242,9 +190,11 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], 
+ subfolder: str = "", force_download: bool = False, local_files_only: bool = False, revision: Optional[str] = None, + trust_remote_code: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, @@ -295,35 +245,45 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = model_save_path + model_paths = { + "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, + "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + } + + sessions = {} + for model, path in model_paths.items(): + if kwargs.get(model, None) is not None: + # this allows passing a model directly to from_pretrained + sessions[f"{model}_session"] = kwargs.pop(model) + elif path.is_file(): + sessions[f"{model}_session"] = ORTModel.load_model(path, provider, session_options, provider_options) + else: + sessions[f"{model}_session"] = None + submodels = {} - for name in {"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}: - library_name, library_classes = config.get(name, (None, None)) - if library_classes is not None: + for submodel in {"scheduler", "tokenizer", "tokenizer_2", "feature_extractor"}: + if kwargs.get(submodel, None) is not None: + submodels[submodel] = kwargs.pop(submodel) + elif config.get(submodel, (None, None))[0] is not None: + library_name, library_classes = config.get(submodel) library = importlib.import_module(library_name) class_obj = getattr(library, library_classes) load_method = getattr(class_obj, "from_pretrained") # Check if the module is in a subdirectory - if (model_save_path / name).is_dir(): - submodels[name] = load_method(model_save_path / name) + if (model_save_path / submodel).is_dir(): + submodels[submodel] = load_method(model_save_path / submodel, trust_remote_code=trust_remote_code) else: - submodels[name] = load_method(model_save_path) - - models = cls.load_model( - unet_path=model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - vae_decoder_path=model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, - vae_encoder_path=model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_path=model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, - text_encoder_2_path=model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, - provider=provider, - provider_options=provider_options, - session_options=session_options, - ) + submodels[submodel] = load_method(model_save_path, trust_remote_code=trust_remote_code) return cls( - **models, + **sessions, **submodels, use_io_binding=use_io_binding, model_save_dir=model_save_dir, + **kwargs, ) @classmethod @@ -394,21 +354,18 @@ def to(self, device: Union[torch.device, str, int]): device, provider_options = parse_device(device) provider = get_provider_for_device(device) - validate_provider_availability(provider) # raise error if the provider is not available + validate_provider_availability(provider) if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self self.unet.session.set_providers([provider], provider_options=[provider_options]) - 
self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) - if self.text_encoder is not None: self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) - if self.text_encoder_2 is not None: self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) @@ -454,15 +411,6 @@ class ORTPipelinePart(ConfigMixin): config_name: str = CONFIG_NAME def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline): - config_file_path = Path(session._model_path).parent / self.config_name - - if not config_file_path.is_file(): - # config is mandatory for the model part to be used for inference - raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}") - - config_dict = self._dict_from_json_file(config_file_path) - self.register_to_config(**config_dict) - self.session = session self.parent_pipeline = parent_pipeline @@ -471,6 +419,13 @@ def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline): self.input_dtypes = {input_key.name: input_key.type for input_key in self.session.get_inputs()} self.output_dtypes = {output_key.name: output_key.type for output_key in self.session.get_outputs()} + config_file_path = Path(session._model_path).parent / self.config_name + if not config_file_path.is_file(): + # config is mandatory for the model part to be used for inference + raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}") + config_dict = self._dict_from_json_file(config_file_path) + self.register_to_config(**config_dict) + @property def device(self): return self.parent_pipeline.device @@ -620,7 +575,12 @@ def forward( class ORTModelVaeEncoder(ORTPipelinePart): - def forward(self, sample: Union[np.ndarray, torch.Tensor], return_dict: bool = False): + def forward( + self, + sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): use_torch = isinstance(sample, torch.Tensor) model_inputs = {"sample": sample} @@ -669,11 +629,10 @@ def forward( class ORTWrapperVae(ORTPipelinePart): def __init__(self, encoder: ORTModelVaeEncoder, decoder: ORTModelVaeDecoder): + self.decoder = decoder if encoder is not None: self.encoder = encoder - self.decoder = decoder - @property def config(self): return self.decoder.config @@ -693,11 +652,10 @@ def decode(self, *args, **kwargs): return self.decoder(*args, **kwargs) def to(self, *args, **kwargs): + self.decoder.to(*args, **kwargs) if self.encoder is not None: self.encoder.to(*args, **kwargs) - self.decoder.to(*args, **kwargs) - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipeline): From 7dd8f587470a831d709be85d326d621fb0244b5d Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 8 Oct 2024 10:18:56 +0200 Subject: [PATCH 68/71] update --- optimum/onnxruntime/modeling_diffusion.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index ae31eb44d7..41618f16e8 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -194,7 +194,6 @@ def _from_pretrained( force_download: bool = False, local_files_only: bool = False, revision: Optional[str] = None, - 
trust_remote_code: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, @@ -258,10 +257,10 @@ def _from_pretrained( if kwargs.get(model, None) is not None: # this allows passing a model directly to from_pretrained sessions[f"{model}_session"] = kwargs.pop(model) - elif path.is_file(): - sessions[f"{model}_session"] = ORTModel.load_model(path, provider, session_options, provider_options) else: - sessions[f"{model}_session"] = None + sessions[f"{model}_session"] = ( + ORTModel.load_model(path, provider, session_options, provider_options) if path.is_file() else None + ) submodels = {} for submodel in {"scheduler", "tokenizer", "tokenizer_2", "feature_extractor"}: @@ -274,9 +273,9 @@ def _from_pretrained( load_method = getattr(class_obj, "from_pretrained") # Check if the module is in a subdirectory if (model_save_path / submodel).is_dir(): - submodels[submodel] = load_method(model_save_path / submodel, trust_remote_code=trust_remote_code) + submodels[submodel] = load_method(model_save_path / submodel) else: - submodels[submodel] = load_method(model_save_path, trust_remote_code=trust_remote_code) + submodels[submodel] = load_method(model_save_path) return cls( **sessions, From a61a5c0bf40b8461ed4dcd958c6fb8440533aa04 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 8 Oct 2024 11:21:11 +0200 Subject: [PATCH 69/71] support passing safety checker --- optimum/onnxruntime/modeling_diffusion.py | 3 + tests/onnxruntime/test_diffusion.py | 101 ++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 41618f16e8..ec2b0df554 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -191,6 +191,7 @@ def _from_pretrained( model_id: Union[str, Path], config: Dict[str, Any], subfolder: str = "", + trust_remote_code: bool = False, force_download: bool = False, local_files_only: bool = False, revision: Optional[str] = None, @@ -302,6 +303,7 @@ def _export( session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, task: Optional[str] = None, + **kwargs, ) -> "ORTPipeline": if task is None: task = cls._auto_model_to_task(cls.auto_model_class) @@ -336,6 +338,7 @@ def _export( session_options=session_options, use_io_binding=use_io_binding, model_save_dir=model_save_dir, + **kwargs, ) def to(self, device: Union[torch.device, str, int]): diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 7bb6128878..956566f0e1 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -22,6 +22,7 @@ AutoPipelineForText2Image, DiffusionPipeline, ) +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.utils import load_image from parameterized import parameterized from transformers.testing_utils import require_torch_gpu @@ -290,6 +291,39 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = 
StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) + class ORTPipelineForImage2ImageTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] @@ -471,6 +505,40 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) + class ORTPipelineForInpaintingTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] @@ -656,3 +724,36 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): outputs = pipeline(**inputs).images self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + 
@parameterized.expand(["stable-diffusion"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) From 43dff36963f5e42f65c64bd5e0ddec080fa33fe7 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 9 Oct 2024 10:53:27 +0200 Subject: [PATCH 70/71] dummies --- optimum/onnxruntime/__init__.py | 8 ++++++++ tests/onnxruntime/utils_onnxruntime_tests.py | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 1cb5b7c47b..4e25a43690 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -79,7 +79,9 @@ "ORTStableDiffusionInpaintPipeline", "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", + "ORTStableDiffusionXLInpaintPipeline", "ORTLatentConsistencyModelPipeline", + "ORTLatentConsistencyModelImg2ImgPipeline", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", "ORTPipelineForText2Image", @@ -92,6 +94,8 @@ "ORTStableDiffusionInpaintPipeline", "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", + "ORTStableDiffusionXLInpaintPipeline", + "ORTLatentConsistencyModelImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", @@ -148,6 +152,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( ORTDiffusionPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, @@ -156,11 +161,13 @@ ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: from .modeling_diffusion import ( ORTDiffusionPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, @@ -169,6 +176,7 @@ ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py 
b/tests/onnxruntime/utils_onnxruntime_tests.py
index 17f3b391b0..5071d0081a 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -171,6 +171,11 @@ class ORTModelTestMixin(unittest.TestCase):
         "np": np.ndarray,
     }

+    TASK = None
+
+    ORTMODEL_CLASS = None
+    AUTOMODEL_CLASS = None
+
     @classmethod
     def setUpClass(cls):
         cls.onnx_model_dirs = {}

From db1f04d9371428deaad9616226c2294515b2e1ea Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Wed, 9 Oct 2024 11:59:27 +0200
Subject: [PATCH 71/71] remove the need for ORTPipeline

---
 optimum/onnxruntime/modeling_diffusion.py | 196 +++++++++++++---------
 1 file changed, 115 insertions(+), 81 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index ec2b0df554..87fcb68c7e 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -40,6 +40,7 @@
     StableDiffusionXLInpaintPipeline,
     StableDiffusionXLPipeline,
 )
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import SchedulerMixin
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils.constants import CONFIG_NAME
@@ -75,9 +76,12 @@
 logger = logging.getLogger(__name__)


-class ORTPipeline(ORTModel, ConfigMixin):
+# TODO: support from_pipe()
+# TODO: Instead of ORTModel, it makes sense to have a compositional ORTMixin
+# TODO: instead of one bloated __init__, we should consider an __init__ per pipeline
+class ORTDiffusionPipeline(ORTModel, DiffusionPipeline):
     config_name = "model_index.json"
-    auto_model_class = None
+    auto_model_class = DiffusionPipeline

     def __init__(
         self,
@@ -110,7 +114,6 @@ def __init__(
         self.text_encoder_2 = (
            ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None
         )
-
         # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API
         self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder)
@@ -143,6 +146,7 @@ def __init__(
         for key in inspect.signature(self.auto_model_class).parameters.keys():
             if key in all_pipeline_init_args:
                 diffusers_pipeline_args[key] = all_pipeline_init_args[key]
+
         # inits diffusers pipeline specific attributes (registers modules and config)
         self.auto_model_class.__init__(self, **diffusers_pipeline_args)
         # inits ort specific attributes
@@ -191,10 +195,10 @@ def _from_pretrained(
         model_id: Union[str, Path],
         config: Dict[str, Any],
         subfolder: str = "",
-        trust_remote_code: bool = False,
         force_download: bool = False,
         local_files_only: bool = False,
         revision: Optional[str] = None,
+        trust_remote_code: bool = False,
         cache_dir: str = HUGGINGFACE_HUB_CACHE,
         token: Optional[Union[bool, str]] = None,
         unet_file_name: str = ONNX_WEIGHTS_NAME,
@@ -229,7 +233,7 @@ def _from_pretrained(
                     CONFIG_NAME,
                 }
             )
-            model_id = snapshot_download(
+            model_save_folder = snapshot_download(
                 model_id,
                 cache_dir=cache_dir,
                 force_download=force_download,
@@ -239,8 +243,10 @@ def _from_pretrained(
                 allow_patterns=allow_patterns,
                 ignore_patterns=["*.msgpack", "*.safetensors", "*.bin", "*.xml"],
             )
+        else:
+            model_save_folder = str(model_id)

-        model_save_path = Path(model_id)
+        model_save_path = Path(model_save_folder)

         if model_save_dir is None:
             model_save_dir = model_save_path
@@ -278,7 +284,14 @@ def _from_pretrained(
                 else:
                     submodels[submodel] = load_method(model_save_path)

-        return cls(
+        # same as DiffusionPipeline.from_pretrained, if called directly, it loads the class in the config
+        if cls.__name__
== "ORTDiffusionPipeline": + class_name = config["_class_name"] + ort_pipeline_class = _get_ort_class(class_name) + else: + ort_pipeline_class = cls + + ort_pipeline = ort_pipeline_class( **sessions, **submodels, use_io_binding=use_io_binding, @@ -286,6 +299,11 @@ def _from_pretrained( **kwargs, ) + # same as in DiffusionPipeline.from_pretrained, we save where the model was instantiated from + ort_pipeline.register_to_config(_name_or_path=config.get("_name_or_path", str(model_id))) + + return ort_pipeline + @classmethod def _export( cls, @@ -295,16 +313,16 @@ def _export( force_download: bool = False, local_files_only: bool = False, revision: Optional[str] = None, + trust_remote_code: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, - trust_remote_code: bool = False, use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, task: Optional[str] = None, **kwargs, - ) -> "ORTPipeline": + ) -> "ORTDiffusionPipeline": if task is None: task = cls._auto_model_to_task(cls.auto_model_class) @@ -397,7 +415,8 @@ def components(self) -> Dict[str, Any]: return components def __call__(self, *args, **kwargs): - # we keep numpy random states support for now + # we do this to keep numpy random states support for now + # TODO: deprecate and add warnings when a random state is passed args = list(args) for i in range(len(args)): @@ -412,7 +431,7 @@ def __call__(self, *args, **kwargs): class ORTPipelinePart(ConfigMixin): config_name: str = CONFIG_NAME - def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTPipeline): + def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTDiffusionPipeline): self.session = session self.parent_pipeline = parent_pipeline @@ -506,38 +525,18 @@ def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) -class ORTModelTextEncoder(ORTPipelinePart): - def forward( - self, - input_ids: Union[np.ndarray, torch.Tensor], - attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, - output_hidden_states: Optional[bool] = None, - return_dict: bool = False, - ): - use_torch = isinstance(input_ids, torch.Tensor) - - model_inputs = {"input_ids": input_ids} - - onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) - onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) - - if output_hidden_states: - model_outputs["hidden_states"] = [] - for i in range(self.config.num_hidden_layers): - model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) - model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) - else: - for i in range(self.config.num_hidden_layers): - model_outputs.pop(f"hidden_states.{i}", None) - - if return_dict: - return model_outputs - - return ModelOutput(**model_outputs) - - class ORTModelUnet(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "time_cond_proj_dim"): + logger.warning( + "The `time_cond_proj_dim` attribute is missing from the UNet configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(time_cond_proj_dim=None) + def forward( self, sample: Union[np.ndarray, torch.Tensor], @@ -576,7 +575,49 @@ def forward( return ModelOutput(**model_outputs) +class ORTModelTextEncoder(ORTPipelinePart): + def forward( + self, + input_ids: Union[np.ndarray, torch.Tensor], + attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, + output_hidden_states: Optional[bool] = None, + return_dict: bool = False, + ): + use_torch = isinstance(input_ids, torch.Tensor) + + model_inputs = {"input_ids": input_ids} + + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if output_hidden_states: + model_outputs["hidden_states"] = [] + for i in range(self.config.num_hidden_layers): + model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) + model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) + else: + for i in range(self.config.num_hidden_layers): + model_outputs.pop(f"hidden_states.{i}", None) + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) + + class ORTModelVaeEncoder(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE encoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." + ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + def forward( self, sample: Union[np.ndarray, torch.Tensor], @@ -606,6 +647,17 @@ def forward( class ORTModelVaeDecoder(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE decoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." + ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + def forward( self, latent_sample: Union[np.ndarray, torch.Tensor], @@ -632,8 +684,7 @@ def forward( class ORTWrapperVae(ORTPipelinePart): def __init__(self, encoder: ORTModelVaeEncoder, decoder: ORTModelVaeDecoder): self.decoder = decoder - if encoder is not None: - self.encoder = encoder + self.encoder = encoder @property def config(self): @@ -647,12 +698,12 @@ def dtype(self): def device(self): return self.decoder.device - def encode(self, *args, **kwargs): - return self.encoder(*args, **kwargs) - def decode(self, *args, **kwargs): return self.decoder(*args, **kwargs) + def encode(self, *args, **kwargs): + return self.encoder(*args, **kwargs) + def to(self, *args, **kwargs): self.decoder.to(*args, **kwargs) if self.encoder is not None: @@ -660,42 +711,46 @@ def to(self, *args, **kwargs): @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipeline): +class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). 
""" main_input_name = "prompt" + export_feature = "text-to-image" auto_model_class = StableDiffusionPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipeline): +class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ main_input_name = "image" + export_feature = "image-to-image" auto_model_class = StableDiffusionImg2ImgPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipeline): +class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ main_input_name = "prompt" + export_feature = "inpainting" auto_model_class = StableDiffusionInpaintPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLPipeline(ORTPipeline, StableDiffusionXLPipeline): +class ORTStableDiffusionXLPipeline(ORTDiffusionPipeline, StableDiffusionXLPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ main_input_name = "prompt" + export_feature = "text-to-image" auto_model_class = StableDiffusionXLPipeline def _get_add_time_ids( @@ -713,12 +768,13 @@ def _get_add_time_ids( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLImg2ImgPipeline(ORTPipeline, StableDiffusionXLImg2ImgPipeline): +class ORTStableDiffusionXLImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionXLImg2ImgPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). """ main_input_name = "prompt" + export_feature = "image-to-image" auto_model_class = StableDiffusionXLImg2ImgPipeline def _get_add_time_ids( @@ -750,12 +806,13 @@ def _get_add_time_ids( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLInpaintPipeline(ORTPipeline, StableDiffusionXLInpaintPipeline): +class ORTStableDiffusionXLInpaintPipeline(ORTDiffusionPipeline, StableDiffusionXLInpaintPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). 
""" main_input_name = "image" + export_feature = "inpainting" auto_model_class = StableDiffusionXLInpaintPipeline def _get_add_time_ids( @@ -787,22 +844,24 @@ def _get_add_time_ids( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyModelPipeline): +class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyModelPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ main_input_name = "prompt" + export_feature = "text-to-image" auto_model_class = LatentConsistencyModelPipeline @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelImg2ImgPipeline(ORTPipeline, LatentConsistencyModelImg2ImgPipeline): +class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsistencyModelImg2ImgPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). """ main_input_name = "image" + export_feature = "image-to-image" auto_model_class = LatentConsistencyModelImg2ImgPipeline @@ -830,31 +889,6 @@ def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tr raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") -class ORTDiffusionPipeline(ConfigMixin): - config_name = "model_index.json" - - @classmethod - @validate_hf_hub_args - def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTPipeline: - load_config_kwargs = { - "force_download": kwargs.get("force_download", False), - "resume_download": kwargs.get("resume_download", None), - "local_files_only": kwargs.get("local_files_only", False), - "cache_dir": kwargs.get("cache_dir", None), - "revision": kwargs.get("revision", None), - "proxies": kwargs.get("proxies", None), - "token": kwargs.get("token", None), - } - - config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) - config = config[0] if isinstance(config, tuple) else config - class_name = config["_class_name"] - - ort_pipeline_class = _get_ort_class(class_name) - - return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) - - ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionPipeline), @@ -910,7 +944,7 @@ class ORTPipelineForTask(ConfigMixin): @classmethod @validate_hf_hub_args - def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTPipeline: + def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTDiffusionPipeline: load_config_kwargs = { "force_download": kwargs.get("force_download", False), "resume_download": kwargs.get("resume_download", None),