Merge Dev to Master (#1803)
ZeroCool940711 authored Jul 26, 2023
2 parents 5342f37 + 300052c commit f1b3b09
Showing 235 changed files with 35,354 additions and 20,027 deletions.
2 changes: 1 addition & 1 deletion .dockerignore
@@ -1,3 +1,3 @@
outputs/
src/
-configs/webui/userconfig_streamlit.yaml
+configs/webui/userconfig_streamlit.yaml
2 changes: 1 addition & 1 deletion .gitattributes
@@ -1,4 +1,4 @@
* text=auto
*.{cmd,[cC][mM][dD]} text eol=crlf
*.{bat,[bB][aA][tT]} text eol=crlf
-*.sh text eol=lf
+*.sh text eol=lf
6 changes: 3 additions & 3 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -40,7 +40,7 @@ body:
- type: dropdown
id: os
attributes:
-label: Where are you running the webui?
+label: Where are you running the webui?
multiple: true
options:
- Windows
@@ -52,7 +52,7 @@
attributes:
label: Custom settings
description: If you are running the webui with specifi settings, please paste them here for reference (like --nitro)
-render: shell
+render: shell
- type: textarea
id: logs
attributes:
@@ -66,4 +66,4 @@ body:
description: By submitting this issue, you agree to follow our [Code of Conduct](https://docs.github.com/en/site-policy/github-terms/github-community-code-of-conduct)
options:
- label: I agree to follow this project's Code of Conduct
-required: true
+required: true
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -13,4 +13,4 @@ Closes: # (issue)
- [ ] I have changed the base branch to `dev`
- [ ] I have performed a self-review of my own code
- [ ] I have commented my code in hard-to-understand areas
-- [ ] I have made corresponding changes to the documentation
+- [ ] I have made corresponding changes to the documentation
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
@@ -37,4 +37,4 @@ jobs:
# The GH actions bot is used by default if you didn't specify the two fields.
# You can swap them out with your own user credentials.
user_name: github-actions[bot]
-user_email: 41898282+github-actions[bot]@users.noreply.github.com
+user_email: 41898282+github-actions[bot]@users.noreply.github.com
2 changes: 1 addition & 1 deletion .github/workflows/test-deploy.yml
@@ -21,4 +21,4 @@ jobs:
- name: Install dependencies
run: yarn install
- name: Test build website
-run: yarn build
+run: yarn build
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -15,13 +15,13 @@ repos:
- id: check-yaml
- id: check-added-large-files

-- repo: https://github.com/charliermarsh/ruff-pre-commit
-rev: "v0.0.272"
+- repo: https://github.com/astral-sh/ruff-pre-commit
+rev: "v0.0.278"
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]

- repo: https://github.com/psf/black
-rev: 23.3.0
+rev: 23.7.0
hooks:
- id: black
28 changes: 14 additions & 14 deletions README.md
@@ -6,7 +6,7 @@

## Installation instructions for:

-- **[Windows](https://sygil-dev.github.io/sygil-webui/docs/Installation/windows-installation)**
+- **[Windows](https://sygil-dev.github.io/sygil-webui/docs/Installation/windows-installation)**
- **[Linux](https://sygil-dev.github.io/sygil-webui/docs/Installation/linux-installation)**

### Want to ask a question or request a feature?
@@ -34,10 +34,10 @@ Check the [Contribution Guide](CONTRIBUTING.md)

* Run additional upscaling models on CPU to save VRAM

-* Textual inversion: [Reaserch Paper](https://textual-inversion.github.io/)
+* Textual inversion: [Reaserch Paper](https://textual-inversion.github.io/)

* K-Diffusion Samplers: A great collection of samplers to use, including:

- `k_euler`
- `k_lms`
- `k_euler_a`
@@ -95,8 +95,8 @@ An easy way to work with Stable Diffusion right from your browser.
To give a token (tag recognized by the AI) a specific or increased weight (emphasis), add `:0.##` to the prompt, where `0.##` is a decimal that will specify the weight of all tokens before the colon.
Ex: `cat:0.30, dog:0.70` or `guy riding a bicycle :0.7, incoming car :0.30`

-Negative prompts can be added by using `###` , after which any tokens will be seen as negative.
-Ex: `cat playing with string ### yarn` will negate `yarn` from the generated image.
+Negative prompts can be added by using `###` , after which any tokens will be seen as negative.
+Ex: `cat playing with string ### yarn` will negate `yarn` from the generated image.

Negatives are a very powerful tool to get rid of contextually similar or related topics, but **be careful when adding them since the AI might see connections you can't**, and end up outputting gibberish

@@ -131,7 +131,7 @@ Lets you improve faces in pictures using the GFPGAN model. There is a checkbox i

If you want to use GFPGAN to improve generated faces, you need to install it separately.
Download [GFPGANv1.4.pth](https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/GFPGANv1.4.pth) and put it
-into the `/sygil-webui/models/gfpgan` directory.
+into the `/sygil-webui/models/gfpgan` directory.

### RealESRGAN

@@ -141,7 +141,7 @@ Lets you double the resolution of generated images. There is a checkbox in every
There is also a separate tab for using RealESRGAN on any picture.

Download [RealESRGAN_x4plus.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth) and [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth).
-Put them into the `sygil-webui/models/realesrgan` directory.
+Put them into the `sygil-webui/models/realesrgan` directory.

### LSDR

@@ -174,8 +174,8 @@ which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF

[Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion
model.
-Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
-Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487),
+Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
+Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487),
this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
See [this section](#stable-diffusion-v1) below and the [model card](https://huggingface.co/CompVis/stable-diffusion).
@@ -184,26 +184,26 @@ See [this section](#stable-diffusion-v1) below and the [model card](https://hugg

Stable Diffusion v1 refers to a specific configuration of the model
architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet
-and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and
+and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and
then finetuned on 512x512 images.

*Note: Stable Diffusion v1 is a general text-to-image diffusion model and therefore mirrors biases and (mis-)conceptions that are present
-in its training data.
+in its training data.
Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion).

## Comments

- Our code base for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion)
-and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch).
+and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch).
Thanks for open-sourcing!

-- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).
+- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).

## BibTeX

```
@misc{rombach2021highresolution,
-title={High-Resolution Image Synthesis with Latent Diffusion Models},
+title={High-Resolution Image Synthesis with Latent Diffusion Models},
author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
year={2021},
eprint={2112.10752},
19 changes: 9 additions & 10 deletions Stable_Diffusion_v1_Model_Card.md
@@ -21,7 +21,7 @@ This model card focuses on the model associated with the Stable Diffusion model,

# Uses

-## Direct Use
+## Direct Use
The model is intended for research purposes only. Possible research areas and
tasks include

@@ -68,11 +68,11 @@ Using the model to generate content that is cruel to individuals is a misuse of
considerations.

### Bias
-While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
-Stable Diffusion v1 was trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
-which consists of images that are primarily limited to English descriptions.
-Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
-This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
+While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
+Stable Diffusion v1 was trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
+which consists of images that are primarily limited to English descriptions.
+Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
+This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.


@@ -84,7 +84,7 @@ The model developers used the following dataset for training the model:
- LAION-2B (en) and subsets thereof (see next section)

**Training Procedure**
-Stable Diffusion v1 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training,
+Stable Diffusion v1 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training,

- Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4
- Text prompts are encoded through a ViT-L/14 text-encoder.
@@ -108,12 +108,12 @@ filtered to images with an original size `>= 512x512`, estimated aesthetics scor
- **Batch:** 32 x 8 x 2 x 4 = 2048
- **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant

-## Evaluation Results
+## Evaluation Results
Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling
steps show the relative improvements of the checkpoints:

-![pareto](assets/v1-variants-scores.jpg)
+![pareto](assets/v1-variants-scores.jpg)

Evaluated using 50 PLMS steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution. Not optimized for FID scores.
## Environmental Impact
@@ -137,4 +137,3 @@ Based on that information, we estimate the following CO2 emissions using the [Ma
}

*This model card was written by: Robin Rombach and Patrick Esser and is based on the [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
-
2 changes: 1 addition & 1 deletion Web_based_UI_for_Stable_Diffusion_colab.ipynb
@@ -582,4 +582,4 @@
"outputs": []
}
]
-}
+}
6 changes: 3 additions & 3 deletions blog/2022-10-20/1.Textual inversion usage competitio.md
@@ -23,7 +23,7 @@ Hopefully demand will be high, we want to train **hundreds** of new concepts!

# What does `most inventive use` mean?

-Whatever you want it to mean! be creative! experiment!
+Whatever you want it to mean! be creative! experiment!

There are several categories we will look at:

@@ -33,7 +33,7 @@ There are several categories we will look at:

* composition; meaning anything related to how big things are, their position, the angle, etc

-* styling;
+* styling;

![image](https://user-images.githubusercontent.com/106811348/197045629-029ba6f5-1f79-475c-9ce7-969aaf3d253b.png)

@@ -45,7 +45,7 @@ There are several categories we will look at:

## `The Sims(TM): Stable Diffusion edition` ?

-For this event the theme is “The Sims: Stable Diffusion edition”.
+For this event the theme is “The Sims: Stable Diffusion edition”.

So we have selected a subset of [products from Amazon Berkely Objects dataset](https://github.com/sd-webui/abo).

2 changes: 1 addition & 1 deletion configs/blip/bert_config.json
@@ -17,5 +17,5 @@
"type_vocab_size": 2,
"vocab_size": 30522,
"encoder_width": 768,
-"add_cross_attention": true
+"add_cross_attention": true
}
3 changes: 1 addition & 2 deletions configs/blip/caption_coco.yaml
@@ -21,7 +21,7 @@ init_lr: 1e-5
image_size: 384

# generation configs
-max_length: 20
+max_length: 20
min_length: 5
num_beams: 3
prompt: 'a picture of '
@@ -30,4 +30,3 @@ prompt: 'a picture of '
weight_decay: 0.05
min_lr: 0
max_epoch: 5
-
2 changes: 1 addition & 1 deletion configs/blip/med_config.json
@@ -17,5 +17,5 @@
"type_vocab_size": 2,
"vocab_size": 30524,
"encoder_width": 768,
-"add_cross_attention": true
+"add_cross_attention": true
}
7 changes: 3 additions & 4 deletions configs/blip/nlvr.yaml
@@ -1,13 +1,13 @@
-image_root: '/export/share/datasets/vision/NLVR2/'
+image_root: '/export/share/datasets/vision/NLVR2/'
ann_root: 'annotation'

# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth'

#size of vit model; base or large
vit: 'base'
-batch_size_train: 16
-batch_size_test: 64
+batch_size_train: 16
+batch_size_test: 64
vit_grad_ckpt: False
vit_ckpt_layer: 0
max_epoch: 15
@@ -18,4 +18,3 @@ image_size: 384
weight_decay: 0.05
init_lr: 3e-5
min_lr: 0
-
2 changes: 1 addition & 1 deletion configs/blip/nocaps.yaml
@@ -12,4 +12,4 @@ image_size: 384
max_length: 20
min_length: 5
num_beams: 3
-prompt: 'a picture of '
+prompt: 'a picture of '
5 changes: 1 addition & 4 deletions configs/blip/pretrain.yaml
@@ -1,7 +1,7 @@
train_file: ['/export/share/junnan-li/VL_pretrain/annotation/coco_karpathy_train.json',
'/export/share/junnan-li/VL_pretrain/annotation/vg_caption.json',
]
-laion_path: ''
+laion_path: ''

# size of vit model; base or large
vit: 'base'
@@ -22,6 +22,3 @@ warmup_lr: 1e-6
lr_decay_rate: 0.9
max_epoch: 20
warmup_steps: 3000
-
-
-
1 change: 0 additions & 1 deletion configs/blip/retrieval_coco.yaml
@@ -31,4 +31,3 @@ negative_all_rank: True
weight_decay: 0.05
min_lr: 0
max_epoch: 6
-
1 change: 0 additions & 1 deletion configs/blip/retrieval_flickr.yaml
@@ -31,4 +31,3 @@ negative_all_rank: False
weight_decay: 0.05
min_lr: 0
max_epoch: 6
-
2 changes: 1 addition & 1 deletion configs/blip/retrieval_msrvtt.yaml
@@ -9,4 +9,4 @@ vit: 'base'
batch_size: 64
k_test: 128
image_size: 384
-num_frm_test: 8
+num_frm_test: 8
6 changes: 3 additions & 3 deletions configs/blip/vqa.yaml
@@ -8,8 +8,8 @@ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/mo

# size of vit model; base or large
vit: 'base'
-batch_size_train: 16
-batch_size_test: 32
+batch_size_train: 16
+batch_size_test: 32
vit_grad_ckpt: False
vit_ckpt_layer: 0
init_lr: 2e-5
@@ -22,4 +22,4 @@ inference: 'rank'
# optimizer
weight_decay: 0.05
min_lr: 0
-max_epoch: 10
+max_epoch: 10
2 changes: 1 addition & 1 deletion configs/latent-diffusion/celebahq-ldm-vq-4.yaml
@@ -83,4 +83,4 @@ lightning:
increase_log_steps: False

trainer:
-benchmark: True
+benchmark: True
2 changes: 1 addition & 1 deletion configs/latent-diffusion/cin-ldm-vq-f8.yaml
@@ -95,4 +95,4 @@ lightning:
increase_log_steps: False

trainer:
-benchmark: True
+benchmark: True