Merge pull request #76 from okotaku/feat/deepfloyd_if

[Feature] Support DeepFloyd IF
okotaku · Oct 20, 2023 · c462ada · c462ada
2 parents 0472480 + 4df8cc7
commit c462ada
Show file tree

Hide file tree

Showing 19 changed files with 766 additions and 2 deletions.
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -2,6 +2,7 @@
     "dockerComposeFile": ["../docker-compose.yml"],
     "service": "diffengine",
     "workspaceFolder": "/workspace",
+    "postCreateCommand": "pre-commit install",
     "customizations": {
       "vscode": {
         "extensions": [

diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,7 @@
 FROM nvcr.io/nvidia/pytorch:23.07-py3
 
 RUN apt update -y && apt install -y \
-    git
+    git tmux
 RUN apt-get update && apt-get install -y \
     vim \
     libgl1-mesa-dev \
@@ -17,7 +17,8 @@ WORKDIR /diffengine
 COPY ./ /diffengine
 RUN pip install --upgrade pip && \
     pip install --no-cache-dir openmim==0.3.9 && \
-    pip install .
+    pip install . && \
+    pip install pre-commit
 
 # Language settings
 ENV LANG C.UTF-8

diff --git a/README.md b/README.md
@@ -151,6 +151,9 @@ For detailed user guides and advanced guides, please refer to our [Documentation
       <td>
         <b>Stable Diffusion XLs</b>
       </td>
+      <td>
+        <b>DeepFloyd IFs</b>
+      </td>
     </tr>
     <tr valign="top">
       <td>
@@ -174,6 +177,12 @@ For detailed user guides and advanced guides, please refer to our [Documentation
           <li><a href="configs/esd/README.md">Erasing Concepts from Diffusion Models (2023)</a></li>
         </ul>
       </td>
+      <td>
+        <ul>
+            <li><a href="configs/deepfloyd_if/README.md">DeepFloyd IF (2023)</a></li>
+            <li><a href="configs/deepfloyd_if_dreambooth/README.md">DreamBooth (CVPR'2023)</a></li>
+      </ul>
+      </td>
     </tr>
 </td>
     </tr>

diff --git a/configs/_base_/datasets/dog_dreambooth_if.py b/configs/_base_/datasets/dog_dreambooth_if.py
@@ -0,0 +1,32 @@
+train_pipeline = [
+    dict(type="torchvision/Resize", size=64, interpolation="bilinear"),
+    dict(type="RandomCrop", size=64),
+    dict(type="RandomHorizontalFlip", p=0.5),
+    dict(type="torchvision/ToTensor"),
+    dict(type="torchvision/Normalize", mean=[0.5], std=[0.5]),
+    dict(type="PackInputs"),
+]
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    dataset=dict(
+        type="HFDreamBoothDataset",
+        dataset="diffusers/dog-example",
+        instance_prompt="a photo of sks dog",
+        pipeline=train_pipeline),
+    sampler=dict(type="InfiniteSampler", shuffle=True),
+)
+
+val_dataloader = None
+val_evaluator = None
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
+
+custom_hooks = [
+    dict(
+        type="VisualizationHook",
+        prompt=["A photo of sks dog in a bucket"] * 4,
+        by_epoch=False,
+        interval=100),
+    dict(type="LoRASaveHook"),
+]
diff --git a/configs/_base_/datasets/pokemon_blip_if.py b/configs/_base_/datasets/pokemon_blip_if.py
@@ -0,0 +1,27 @@
+train_pipeline = [
+    dict(type="torchvision/Resize", size=64, interpolation="bilinear"),
+    dict(type="RandomCrop", size=64),
+    dict(type="RandomHorizontalFlip", p=0.5),
+    dict(type="torchvision/ToTensor"),
+    dict(type="torchvision/Normalize", mean=[0.5], std=[0.5]),
+    dict(type="PackInputs"),
+]
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    dataset=dict(
+        type="HFDataset",
+        dataset="lambdalabs/pokemon-blip-captions",
+        pipeline=train_pipeline),
+    sampler=dict(type="DefaultSampler", shuffle=True),
+)
+
+val_dataloader = None
+val_evaluator = None
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
+
+custom_hooks = [
+    dict(type="VisualizationHook", prompt=["yoda pokemon"] * 4),
+    dict(type="SDCheckpointHook"),
+]
diff --git a/configs/_base_/models/deepfloyd_if_l.py b/configs/_base_/models/deepfloyd_if_l.py
@@ -0,0 +1,4 @@
+model = dict(
+    type="DeepFloydIF",
+    model="DeepFloyd/IF-I-L-v1.0",
+    gradient_checkpointing=True)
diff --git a/configs/_base_/models/deepfloyd_if_xl_lora.py b/configs/_base_/models/deepfloyd_if_xl_lora.py
@@ -0,0 +1,5 @@
+model = dict(
+    type="DeepFloydIF",
+    model="DeepFloyd/IF-I-XL-v1.0",
+    lora_config=dict(rank=8),
+    gradient_checkpointing=True)
diff --git a/configs/deepfloyd_if/README.md b/configs/deepfloyd_if/README.md
@@ -0,0 +1,80 @@
+# DeepFloyd IF
+
+[DeepFloyd IF](https://www.deepfloyd.ai/deepfloyd-if)
+
+## Abstract
+
+We introduce DeepFloyd IF, a novel state-of-the-art open-source text-to-image model with a high degree of photorealism and language understanding. DeepFloyd IF is a modular model composed of a frozen text encoder and three cascaded pixel diffusion modules: a base model that generates a 64x64 px image based on a text prompt and two super-resolution models, each designed to generate images of increasing resolution: 256x256 px and 1024x1024 px. All stages of the model utilize a frozen text encoder based on the T5 transformer to extract text embeddings, which are then fed into a UNet architecture enhanced with cross-attention and attention pooling. The result is a highly efficient model that outperforms current state-of-the-art models, achieving a zero-shot FID score of 6.66 on the COCO dataset. Our work underscores the potential of larger UNet architectures in the first stage of cascaded diffusion models and depicts a promising future for text-to-image synthesis.
+
+<div align=center>
+<img src="https://github.com/okotaku/diffengine/assets/24734142/a1b1a31f-5fb7-4a62-8502-2c8e4330f165"/>
+</div>
+
+## Citation
+
+```
+```
+
+## Run Training
+
+Start training with one of the following commands:
+
+```
+# single gpu
+$ mim train diffengine ${CONFIG_FILE}
+# multi gpus
+$ mim train diffengine ${CONFIG_FILE} --gpus 2 --launcher pytorch
+
+# Example.
+$ mim train diffengine configs/deepfloyd_if/deepfloyd_if_l_pokemon_blip.py
+```
+
+## Inference with diffusers
+
+Once you have trained a model, specify the path to the saved model and utilize it for inference using the `diffusers.pipeline` module.
+
+Before running inference, convert the trained weights to the diffusers format:
+
+```bash
+$ mim run diffengine publish_model2diffusers ${CONFIG_FILE} ${INPUT_FILENAME} ${OUTPUT_DIR} --save-keys ${SAVE_KEYS}
+# Example
+$ mim run diffengine publish_model2diffusers configs/deepfloyd_if/deepfloyd_if_l_pokemon_blip.py work_dirs/deepfloyd_if_l_pokemon_blip/epoch_50.pth work_dirs/deepfloyd_if_l_pokemon_blip --save-keys unet
+```
+
+Then we can run inference.
+
+```py
+import torch
+from diffusers import DiffusionPipeline, UNet2DConditionModel
+
+prompt = 'yoda pokemon'
+checkpoint = 'work_dirs/deepfloyd_if_l_pokemon_blip'
+
+unet = UNet2DConditionModel.from_pretrained(
+    checkpoint, subfolder='unet')
+pipe = DiffusionPipeline.from_pretrained(
+    'DeepFloyd/IF-I-L-v1.0', unet=unet)
+pipe.to('cuda')
+
+image = pipe(
+    prompt,
+    num_inference_steps=50,
+).images[0]
+image.save('demo.png')
+```
+
+We also provide inference demo scripts:
+
+```bash
+$ mim run diffengine demo_if "yoda pokemon" work_dirs/deepfloyd_if_l_pokemon_blip
+```
+
+## Results Example
+
+#### deepfloyd_if_l_pokemon_blip
+
+Stage1 output example
+![example](https://github.com/okotaku/diffengine/assets/24734142/e1e56e8e-59ec-4256-82a1-0e2941b6ee24)
+
+Stage3 output example
+![example](https://github.com/okotaku/diffengine/assets/24734142/c71b2a64-e016-4e39-b077-457d065ee8da)
diff --git a/configs/deepfloyd_if/deepfloyd_if_l_pokemon_blip.py b/configs/deepfloyd_if/deepfloyd_if_l_pokemon_blip.py
@@ -0,0 +1,11 @@
+_base_ = [
+    "../_base_/models/deepfloyd_if_l.py",
+    "../_base_/datasets/pokemon_blip_if.py",
+    "../_base_/schedules/stable_diffusion_50e.py",
+    "../_base_/default_runtime.py",
+]
+
+optim_wrapper = dict(
+    _delete_=True,
+    optimizer=dict(type="AdamW", lr=1e-4, weight_decay=1e-2),
+    clip_grad=dict(max_norm=1.0))
diff --git a/configs/deepfloyd_if_dreambooth/README.md b/configs/deepfloyd_if_dreambooth/README.md
@@ -0,0 +1,74 @@
+# DeepFloyd IF DreamBooth
+
+[DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https://arxiv.org/abs/2208.12242)
+
+## Abstract
+
+Large text-to-image models achieved a remarkable leap in the evolution of AI, enabling high-quality and diverse synthesis of images from a given text prompt. However, these models lack the ability to mimic the appearance of subjects in a given reference set and synthesize novel renditions of them in different contexts. In this work, we present a new approach for "personalization" of text-to-image diffusion models. Given as input just a few images of a subject, we fine-tune a pretrained text-to-image model such that it learns to bind a unique identifier with that specific subject. Once the subject is embedded in the output domain of the model, the unique identifier can be used to synthesize novel photorealistic images of the subject contextualized in different scenes. By leveraging the semantic prior embedded in the model with a new autogenous class-specific prior preservation loss, our technique enables synthesizing the subject in diverse scenes, poses, views and lighting conditions that do not appear in the reference images. We apply our technique to several previously-unassailable tasks, including subject recontextualization, text-guided view synthesis, and artistic rendering, all while preserving the subject's key features. We also provide a new dataset and evaluation protocol for this new task of subject-driven generation.
+
+<div align=center>
+<img src="https://github.com/okotaku/dethub/assets/24734142/33b1953d-ce42-4f9a-bcbc-87050cfe4f6f"/>
+</div>
+
+## Citation
+
+```
+@inproceedings{ruiz2023dreambooth,
+  title={Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation},
+  author={Ruiz, Nataniel and Li, Yuanzhen and Jampani, Varun and Pritch, Yael and Rubinstein, Michael and Aberman, Kfir},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  year={2023}
+}
+```
+
+## Run Training
+
+Start training with one of the following commands:
+
+```
+# single gpu
+$ mim train diffengine ${CONFIG_FILE}
+# multi gpus
+$ mim train diffengine ${CONFIG_FILE} --gpus 2 --launcher pytorch
+
+# Example.
+$ mim train diffengine configs/deepfloyd_if_dreambooth/deepfloyd_if_xl_dreambooth_lora_dog.py
+```
+
+## Inference with diffusers
+
+Once you have trained a model, specify the path to the saved model and utilize it for inference using the `diffusers.pipeline` module.
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+
+checkpoint = 'work_dirs/deepfloyd_if_xl_dreambooth_lora_dog/step999'
+prompt = 'A photo of sks dog in a bucket'
+
+pipe = DiffusionPipeline.from_pretrained('DeepFloyd/IF-I-XL-v1.0')
+pipe.to('cuda')
+pipe.load_lora_weights(checkpoint)
+
+image = pipe(
+    prompt,
+    num_inference_steps=50,
+).images[0]
+image.save('demo.png')
+```
+
+We also provide inference demo scripts:
+
+```bash
+$ mim run diffengine demo_if_lora "A photo of sks dog in a bucket" work_dirs/deepfloyd_if_xl_dreambooth_lora_dog/step999
+```
+
+## Results Example
+
+#### deepfloyd_if_xl_dreambooth_lora_dog
+
+Stage1 output example
+![example](https://github.com/okotaku/diffengine/assets/24734142/eb4082f1-a155-4d6a-b536-36a93aa19a96)
+
+Stage3 output example
+![example](https://github.com/okotaku/diffengine/assets/24734142/0c6ee93e-661a-4242-9ce5-f7d15d7a1855)
diff --git a/configs/deepfloyd_if_dreambooth/deepfloyd_if_xl_dreambooth_lora_dog.py b/configs/deepfloyd_if_dreambooth/deepfloyd_if_xl_dreambooth_lora_dog.py
@@ -0,0 +1,11 @@
+_base_ = [
+    "../_base_/models/deepfloyd_if_xl_lora.py",
+    "../_base_/datasets/dog_dreambooth_if.py",
+    "../_base_/schedules/stable_diffusion_1k.py",
+    "../_base_/default_runtime.py",
+]
+
+optim_wrapper = dict(
+    _delete_=True,
+    optimizer=dict(type="AdamW", lr=5e-6, weight_decay=1e-2),
+    clip_grad=dict(max_norm=1.0))
diff --git a/diffengine/datasets/hf_dreambooth_datasets.py b/diffengine/datasets/hf_dreambooth_datasets.py
@@ -122,6 +122,8 @@ def generate_class_image(self, class_image_config):
         pipeline = DiffusionPipeline.from_pretrained(
             class_image_config["model"],
             safety_checker=None,
+            torch_dtype=(torch.float16 if class_image_config["device"] != "cpu"
+                         else torch.float32),
         )
         pipeline.set_progress_bar_config(disable=True)
         pipeline.to(class_image_config["device"])

diff --git a/diffengine/models/editors/__init__.py b/diffengine/models/editors/__init__.py
@@ -1,3 +1,4 @@
+from .deepfloyd_if import *  # noqa: F403
 from .distill_sd import *  # noqa: F403
 from .esd import *  # noqa: F403
 from .ip_adapter import *  # noqa: F403

diff --git a/diffengine/models/editors/deepfloyd_if/__init__.py b/diffengine/models/editors/deepfloyd_if/__init__.py
@@ -0,0 +1,3 @@
+from .deepfloyd_if import DeepFloydIF
+
+__all__ = ["DeepFloydIF"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .deepfloyd_if import DeepFloydIF

		__all__ = ["DeepFloydIF"]