From 89e7ebb47750e6b07b19bbdebf65cec5c7d93536 Mon Sep 17 00:00:00 2001 From: shiyiming Date: Sun, 26 Nov 2023 18:43:11 +0800 Subject: [PATCH] fix: add submodule to aviod manully replace labml_nn dependencies in docker --- .dockerignore | 14 + .gitignore | 57 +- .gitmodules | 3 + .vscode/launch.json | 60 +- .vscode/settings.json | 6 + Dockerfile | 7 +- README.md | 106 +-- baseline1.py | 205 ++++++ checkmodel.py | 112 --- configs/sample_config.py | 12 +- configs/unidiffuserv1.py | 39 +- cudatest.py | 41 -- ...otated_deep_learning_paper_implementations | 1 + get_bound.py | 192 +++++ img2img.py | 238 +++++++ img2img_copytest.py | 122 ++++ img2imgcopy.py | 184 +++++ indocker_shell.sh | 9 +- json_outputs/1.json | 1 + json_outputs/10.json | 1 + json_outputs/11.json | 1 + json_outputs/12.json | 1 + json_outputs/13.json | 1 + json_outputs/14.json | 1 + json_outputs/15.json | 1 + json_outputs/16.json | 1 + json_outputs/17.json | 1 + json_outputs/18.json | 1 + json_outputs/19.json | 1 + json_outputs/2.json | 1 + json_outputs/20.json | 1 + json_outputs/21.json | 1 + json_outputs/22.json | 1 + json_outputs/23.json | 1 + json_outputs/24.json | 1 + json_outputs/25.json | 1 + json_outputs/26.json | 1 + json_outputs/27.json | 1 + json_outputs/28.json | 1 + json_outputs/29.json | 1 + json_outputs/3.json | 1 + json_outputs/30.json | 1 + json_outputs/31.json | 1 + json_outputs/32.json | 1 + json_outputs/4.json | 1 + json_outputs/5.json | 1 + json_outputs/6.json | 1 + json_outputs/7.json | 1 + json_outputs/8.json | 1 + json_outputs/9.json | 1 + jsoncheck.py | 42 ++ libs/caption_decoder.py | 11 +- libs/clip.py | 29 + libs/cross_attention.py | 472 ------------- libs/data.py | 503 +++++++------ libs/dpm_solver_pp.py | 4 +- libs/schedule.py | 11 +- libs/testcuda.py | 10 - libs/uvit_multi_post_ln_v1.py | 98 ++- load_m.py | 131 ++++ load_model.py | 261 +++++++ main.py | 90 +++ requirements.txt | 21 +- run.sh | 70 +- runDocker | 10 +- sample.py | 69 +- sample.sh | 11 - sample_fn.py | 360 ++++++++++ score.py | 364 ++++++---- testfp16.sh | 40 ++ testtorchcompile.py | 14 + train.py | 32 +- train.sh | 8 +- trainn.py | 663 ------------------ utils.py | 359 +--------- ...04\346\265\213\350\257\264\346\230\216.md" | 111 +++ 76 files changed, 3030 insertions(+), 2204 deletions(-) create mode 100644 .gitmodules create mode 100644 .vscode/settings.json create mode 100644 baseline1.py delete mode 100644 checkmodel.py delete mode 100755 cudatest.py create mode 160000 dependency/annotated_deep_learning_paper_implementations create mode 100644 get_bound.py create mode 100644 img2img.py create mode 100644 img2img_copytest.py create mode 100644 img2imgcopy.py create mode 100644 json_outputs/1.json create mode 100644 json_outputs/10.json create mode 100644 json_outputs/11.json create mode 100644 json_outputs/12.json create mode 100644 json_outputs/13.json create mode 100644 json_outputs/14.json create mode 100644 json_outputs/15.json create mode 100644 json_outputs/16.json create mode 100644 json_outputs/17.json create mode 100644 json_outputs/18.json create mode 100644 json_outputs/19.json create mode 100644 json_outputs/2.json create mode 100644 json_outputs/20.json create mode 100644 json_outputs/21.json create mode 100644 json_outputs/22.json create mode 100644 json_outputs/23.json create mode 100644 json_outputs/24.json create mode 100644 json_outputs/25.json create mode 100644 json_outputs/26.json create mode 100644 json_outputs/27.json create mode 100644 json_outputs/28.json create mode 100644 json_outputs/29.json create mode 
100644 json_outputs/3.json create mode 100644 json_outputs/30.json create mode 100644 json_outputs/31.json create mode 100644 json_outputs/32.json create mode 100644 json_outputs/4.json create mode 100644 json_outputs/5.json create mode 100644 json_outputs/6.json create mode 100644 json_outputs/7.json create mode 100644 json_outputs/8.json create mode 100644 json_outputs/9.json create mode 100644 jsoncheck.py delete mode 100755 libs/cross_attention.py delete mode 100644 libs/testcuda.py create mode 100644 load_m.py create mode 100644 load_model.py create mode 100644 main.py create mode 100644 sample_fn.py mode change 100755 => 100644 score.py create mode 100644 testfp16.sh create mode 100644 testtorchcompile.py delete mode 100755 trainn.py create mode 100644 "\350\257\204\346\265\213\350\257\264\346\230\216.md" diff --git a/.dockerignore b/.dockerignore index 35a8f6f..d33bc85 100755 --- a/.dockerignore +++ b/.dockerignore @@ -19,3 +19,17 @@ package.sh compress.py logs/* + +shit.py + +load_m.py + +img2imgcopy.py + +img2img.py + +img2img_test.py + +gpuhold.py + +compress.py \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2097f32..ee367e0 100755 --- a/.gitignore +++ b/.gitignore @@ -7,12 +7,19 @@ log*/ model_output/* outputs/ model_output_test/ - +tomesd/ output_test/ real_reg/* train_data/* +train_data_crop/* +our_image_outputs/ +our_imageoforigin2/ +our_json_imageoforigin2/ +our_json_outputs/ +other_models/* +old_train_data/ ## weights anf lib CLIP/ @@ -44,5 +51,51 @@ train_data_crop/* nnet.pt +image_outputs/ +ImageReward/* + +home/ + +final_test/ + +final_json_data/ + +final_evaluation/ +eval_prompts_advance/ + +bench_samples/* + +bench_samples_standard/* + +aaaaaaasaveresults/ + +abase_image_outputs/ +abase_json_outputs/ + +test_data/ + +test_json_out/ + +combined_output.txt + +compress.__pycache__ +img2imgcopy.__pycache__ +load_m.__pycache__ +load_model_output.txt +removebg.py + +removeple_fnn.py +run_local.sh + +runDocker +sample_bench.py +score_local.py + +score_oldone.py +score_output.txt +shit.py +shitshit.py +total_time.txt + -other_models/stablediffusion/* +gpuhold/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..7fd6a89 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "dependency/annotated_deep_learning_paper_implementations"] + path = dependency/annotated_deep_learning_paper_implementations + url = https://github.com/SKDDJ/labml_nn.git diff --git a/.vscode/launch.json b/.vscode/launch.json index c314894..2dd4159 100755 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,17 +4,63 @@ // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ + + + + + { + "name": "Python: score", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/score.py", + "console": "integratedTerminal", + "justMyCode": true + }, + { + "name": "Python: loadmodel", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/load_model.py", + "args": [ + // "--orig-img", "/home/schengwei/Competitionrepo/test_data/testdata/2_0.jpg", + // "--prompt", "a girl, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney,8k", + // "--scale", "0.99", + // "--steps", "25", + // "--strength", "0.03" + ], + "console": "integratedTerminal", + "justMyCode": false + }, { - "name": "Python: 当前文件", + "name": "Python: img2img", "type": "python", "request": "launch", - "program": "${file}", - "args": ["--hello", "hello world"], + "program": 
"${workspaceFolder}/img2img.py", + "args": [ + "--orig-img", "/home/schengwei/Competitionrepo/test_data/testdata/2_0.jpg", + "--prompt", "a girl, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney,8k", + "--scale", "0.99", + "--steps", "25", + "--strength", "0.03" + ], "console": "integratedTerminal", "justMyCode": true + }, + { + "name": "Python: main", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/main.py", + "args": [ + // "--orig-img", "/home/schengwei/Competitionrepo/test_data/testdata/2_0.jpg", + // "--prompt", "a girl, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney,8k", + // "--scale", "0.99", + // "--steps", "25", + // "--strength", "0.03" + ], + "console": "integratedTerminal", + "justMyCode": false } - ] -} - -// accelerate launch --mixed_precision no --num_processes 1 trainn.py --instance_data_dir="train_data/newboy1" --outdir="model_output/boy1" --class_data_dir="real_reg/samples_boyface" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="boy" --num_class_images=200 --instance_prompt=" a boy" --modifier_token "" + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..605cd66 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "better-comments.highlightPlainText": true, + "files.exclude": { + "**/.idea": true + } +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 498781a..3dcdd11 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ # 使用基础镜像 pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime -FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime +# FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime +FROM pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime # 设置工作目录为 /workspace WORKDIR /workspace @@ -15,7 +16,7 @@ COPY ./requirements.txt ./requirements.txt RUN apt update && apt upgrade -y # 安装所需的系统依赖包 -RUN apt install -y git libgl1-mesa-glx libglib2.0-0 +RUN DEBIAN_FRONTEND=noninteractive apt install -y git libgl1-mesa-glx libglib2.0-0 # 安装gcc和g++编译器 RUN apt-get install -y gcc g++ @@ -28,3 +29,5 @@ RUN pip install -r ./requirements.txt # 复制项目目录中的所有文件到容器的当前工作目录 COPY . . + + diff --git a/README.md b/README.md index cf8da77..95966a8 100755 --- a/README.md +++ b/README.md @@ -1,109 +1,41 @@ + # 代码说明 ## 预测性能 -预计训练和推理总耗时10小时左右 +* **训练步数** :预计训练 15,000 步 +* **耗时估计** :总耗时约 5 小时 ## 环境配置(必选) -在 Dockerfile 中有每一步的注释,详情见 Dockerfile 文件 +* **Docker 配置** :详细的步骤及注释在 `Dockerfile` 中。详见 `Dockerfile` 文件。 ## 数据(必选) -* 使用了CelebA(CelebFaces Attribute)人脸属性数据集,由香港中文大学开放提供,数据获取链接为http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html,随机获得200张男性人脸(在训练boy1和boy2模型时作为正则数据集使用)和200张女性人脸 - - * 在训练girl1和girl2模型时作为正则数据集使用 -* 对官方提供的数据集进行移除背景和裁剪人像的操作,放进train_data_crop文件夹内 - - * 在 train.sh文件中指定训练数据集的时候使用 +* **数据集** :使用 Stable Diffusion webui 制作的高质量人像文本对数据集。 ## 预训练模型(必选) -* 使用了 CompVis/stable-diffusion-v1-4中的 AutoTokenizer, 代码继承自 custom diffusion中微调 CLIP Text Encoder 部分 - * 路径位于other_models/stablediffusion/ 下 - * 并且代码中应该会自动从 hugging face 上拉取 +* **模型** :rembg ## 算法(必选) -* 整体思路介绍(必选) - -[Texual Inversion]中微调CLIP Text Encoder 的方法 -> 为防止过拟合采用[DreamBooth]中的Prior Loss方法 -> 为 UViT 的指定 Layer 加入 LoRA - -* 方法的创新点 - -之前在训练步数较少的情况下采用过 cross attention 的 LoRA - -* 算法的其他细节 - -1. 自定义Token和CLIP Tokenizer更新:通过添加一个近义词的自定义token,扩展了CLIP模型的文本处理能力。更新CLIP Tokenizer,以包括新的token,确保模型能够识别和处理它。更新CLIPTextModel的embedding层,以适应新的token; -2. 人脸正则数据集:为了避免模型的过拟合和语言漂移问题,引入了人脸正则数据集,帮助模型学习更好地理解人脸。训练数据集由 instance data 和 class data 组成,计算loss时,将模型输出分开,分别算loss; -3. 
参数微调:利用peft库中的lora,对模型的"qkv","fc1","fc2","proj","text_embed","clip_img_embed"部分进行参数微调。 +* **数据处理** :使用 Stable Diffusion webui 制作的高质量人像文本对数据集来微调 unidiffuser 模型。 +* **算法增强** :引入了 DDIM 算法以提高推理速度。 +* **图像处理** :应用 image2image 方法对输入图像进行编辑。 ### 整体思路介绍(必选) -1. 用近义词初始化 ``,微调 CLIP Text Encoder -2. 为了避免过拟合和语言漂移,加入人脸正则数据集; -3. 利用peft库中的lora,对模型进行参数微调。 - -训练代码说明: - -基于单个数据集进行训练: - -```shell -accelerate launch --mixed_precision fp16 --num_processes 1 train.py -d '<训练文件所在位置>' -o '<模型输出>' --具体参数请参考 train.sh文件 -``` - -训练所有数据集: - -```shell -./train.sh -``` - -训练过程中的数据预处理, 模型架构, 加载方式等等均可进行修改, 只需要满足命令行接口即可, 并且模型输出位置的输出形式能够被 `sample.py`文件正确解析即可。 - -生成代码说明: - -基于prompt文件生成图片: - -```shell -python sample.py --restore_path '<模型输出>' --prompt_path '' --output_path '<输出路径>' -``` - -基于所有的prompt文件进行生成: - -``` -./sample.sh -``` - -文件中sample的方式, prompt均可以更改, 但是测评时只会根据文件中的prompt进行测评。每个prompt要求输出三张图片, 并存放于目标路径。模型输出路径只需要能够正确解析训练产生的文件夹即可(若使用高效参数微调方法, 只需要将额外参数保存到输出路径并在sample.py中加载即可, 无需保存整个模型, 原模型可以从 `models/uvit_v1.pth`中加载)。`sample.py`除了会生成图片, 还会检查用于生成的模型和原模型的参数差异大小, 用于衡量微淘的参数量, 具体的计算方式见代码。 - -### 客观指标打分(部分) - -```shell -python score.py - -# 默认路径为 - # --dataset: './train_data/' - # --prompts: './eval_prompts/' - # --outputs: './outputs/' - -# 可自行提供路径 -python score.py --dataset '<数据路径>' --prompts '' --outputs '<提交路径>' -``` +* **数据准备** :使用 Stable Diffusion webui 制作高质量人像文本对数据集。 +* **速度优化** :通过引入 DDIM 算法加快推理速度。 +* **功能实现** :利用 image2image 方法实现对输入图像的编辑功能。 ## 训练流程(必选) -1. 直接在容器内运行. train.sh就可以开始训练 - 以下是代码运行逻辑 -2. 定义命令行参数,包括数据目录和输出目录。 -3. 在loop()函数中,使用train_state对象进行模型训练。 -4. 在训练过程中,使用accelerator.is_main_process判断当前进程是否为主进程。 -5. 如果是主进程,计算当前步数total_step,并在达到一定步数时记录日志和保存模型。 -6. 在达到一定步数时,保存模型的checkpoint文件,以便后续进行模型推理。 -7. 在训练结束时,保存最终的模型checkpoint文件。 - -## 其他注意事项 - -* 运行代码命令须参考修改过的train.sh和sample.sh文件 -* 控制训练步数(以“图文对”为单位)的参数所在位置:train.sh中的int 参数 train_step 来指定,默认 2000步 -* 因为我们初赛最终版本train_step为 10000步,并且使用了multi_stage的训练方法,即训练中途更换过训练集图片,但是这一版做了额外的优化,可能和初赛版本持平,若有问题请联系我们团队。 +1. **启动训练** :在容器内运行 `train.sh` 脚本开始训练。 +2. **命令行参数** :定义包括数据目录和输出目录在内的命令行参数。 +3. **训练逻辑** :在 `loop()` 函数中利用 `train_state` 对象进行模型训练。 +4. **进程判断** :使用 `accelerator.is_main_process` 判断是否为主进程。 +5. **日志记录** :在主进程中,计算当前步数 `total_step`,并在达到一定步数时记录日志和保存模型。 +6. **模型保存** :在达到指定步数时保存模型的 checkpoint 文件,以便后续进行模型推理。 +7. 
**训练结束** :训练结束时保存最终的模型 checkpoint 文件。 diff --git a/baseline1.py b/baseline1.py new file mode 100644 index 0000000..0d8a44a --- /dev/null +++ b/baseline1.py @@ -0,0 +1,205 @@ +##################################################################################################################### +####################################k######## load_model.py exactly ############################################## +##################################################################################################################### + + + + +def prepare_context(): + """ + prepare context for later use + """ + import torch + import utils + from utils import set_logger + from absl import logging + import os + import libs.autoencoder + import clip + from libs.clip import FrozenCLIPEmbedder + from libs.caption_decoder import CaptionDecoder + from libs.uvit_multi_post_ln_v1 import UViT + from configs.unidiffuserv1 import get_config + import builtins + import ml_collections + from torch import multiprocessing as mp + import accelerate + + config = get_config() + mp.set_start_method('spawn') + assert config.gradient_accumulation_steps == 1, \ + 'fix the lr_scheduler bug before using larger gradient_accumulation_steps' + accelerator = accelerate.Accelerator(gradient_accumulation_steps=config.gradient_accumulation_steps, mixed_precision="fp16") + device = accelerator.device + accelerate.utils.set_seed(config.seed, device_specific=True) + logging.info(f'Process {accelerator.process_index} using device: {device}') + + config.mixed_precision = accelerator.mixed_precision + + accelerator.wait_for_everyone() + if accelerator.is_main_process: + set_logger(log_level='info') + logging.info(config) + else: + set_logger(log_level='error') + builtins.print = lambda *args: None + logging.info(f'Run on {accelerator.num_processes} devices') + + train_state = utils.initialize_train_state(config, device, uvit_class=UViT) + origin_sd = torch.load("models/uvit_v1.pth", map_location='cpu') + + caption_decoder = CaptionDecoder(device=device, **config.caption_decoder) + nnet, optimizer = accelerator.prepare(train_state.nnet, train_state.optimizer) + + nnet.to(device) + lr_scheduler = train_state.lr_scheduler + autoencoder = libs.autoencoder.get_model(**config.autoencoder).to(device) + + clip_text_model = FrozenCLIPEmbedder(version=config.clip_text_model, device=device) + clip_img_model, clip_img_model_preprocess = clip.load(config.clip_img_model, jit=False) + clip_img_model.to(device).eval().requires_grad_(False) + + return { + "accelerator": accelerator, + "device": device, + 'config': config, + "train_state": train_state, + "origin_sd": origin_sd, + "caption_decoder": caption_decoder, + "nnet": nnet, + "autoencoder": autoencoder, + "clip_text_model": clip_text_model, + "clip_img_model": clip_img_model, + "clip_img_model_preprocess": clip_img_model_preprocess + } + + + +def process_one_json(json_data, image_output_path, context={}): + """ + Given a json object, process the task the json describes. + :param json_data: A dictionary containing the input data for the task. + :param image_output_path: A string representing the output path for the processed images. + :param context: A dictionary containing the context for the task. + :return: A dictionary containing the processed images and their corresponding captions. 
+ """ + # Import necessary modules + from torch.utils.data import DataLoader + import utils + from libs.schedule import stable_diffusion_beta_schedule, Schedule, LSimple_T2I + from pathlib import Path + from libs.data import PersonalizedBasev2 + from absl import logging + import torch + from sample_fn import sample + + # Get context variables + accelerator = context["accelerator"] + config = context["config"] + device = context["device"] + + nnet = context["nnet"] + autoencoder = context["autoencoder"] + clip_text_model = context["clip_text_model"] + clip_img_model = context["clip_img_model"] + clip_img_model_preprocess = context["clip_img_model_preprocess"] + + # 初始化训练步数 + # 重新初始化模型 + # Initialize training step and load model state dictionary + train_state.step = 0 + nnet.load_state_dict(origin_sd, False) + + """ + 处理数据部分 + """ + # Process data + image_paths = [i["path"] for i in json_data["source_group"]] + train_dataset = PersonalizedBasev2(image_paths, resolution=512, class_word=json_data["class_word"]) + train_dataset_loader = DataLoader(train_dataset, + batch_size=config.batch_size, + num_workers=config.num_workers, + pin_memory=True, + drop_last=True + ) + + train_data_generator = utils.get_data_generator(train_dataset_loader, enable_tqdm=accelerator.is_main_process, desc='train') + + _betas = stable_diffusion_beta_schedule() + schedule = Schedule(_betas) + logging.info(f'use {schedule}') + + # Get model parameters and initialize optimizer and learning rate scheduler + params = [] + for name, paramter in nnet.named_parameters(): + if 'attn.qkv' in name: + params.append(paramter) + optimizer = utils.get_optimizer(params, **config.optimizer) + lr_scheduler = utils.get_lr_scheduler(optimizer, **config.lr_scheduler) + + # Prepare model, optimizer, and learning rate scheduler + nnet, optimizer, lr_scheduler = accelerator.prepare(nnet, optimizer, lr_scheduler) + + # Define training step function + def train_step(): + metrics = dict() + img, img4clip, text, data_type = next(train_data_generator) + img = img.to(device) + img4clip = img4clip.to(device) + data_type = data_type.to(device) + + with torch.no_grad(): + z = autoencoder.encode(img) + clip_img = clip_img_model.encode_image(img4clip).unsqueeze(1) + text = clip_text_model.encode(text) + text = caption_decoder.encode_prefix(text) + + loss, loss_img, loss_clip_img, loss_text = LSimple_T2I(img=z, clip_img=clip_img, text=text, data_type=data_type, nnet=nnet, schedule=schedule, device=device) + accelerator.backward(loss.mean()) + optimizer.step() + lr_scheduler.step() + train_state.step += 1 + optimizer.zero_grad() + metrics['loss'] = accelerator.gather(loss.detach().mean()).mean().item() + metrics['loss_img'] = accelerator.gather(loss_img.detach().mean()).mean().item() + metrics['loss_clip_img'] = accelerator.gather(loss_clip_img.detach().mean()).mean().item() + metrics['scale'] = accelerator.scaler.get_scale() + metrics['lr'] = train_state.optimizer.param_groups[0]['lr'] + return metrics + + # Define loop function for training + def loop(): + log_step = 0 + while True: + nnet.train() + total_step = train_state.step * config.batch_size + with accelerator.accumulate(nnet): + metrics = train_step() + + if accelerator.is_main_process and total_step >= log_step: + nnet.eval() + total_step = train_state.step * config.batch_size + logging.info(utils.dct2str(dict(step=total_step, **metrics))) + log_step += config.log_interval + + accelerator.wait_for_everyone() + + if total_step >= config.max_step: + break + loop() + + # Set configuration for 
image sampling + config.n_samples = 4 + config.n_iter = 1 + images = [] + # Sample images for each caption in the input data + for caption in json_data["caption_list"]: + config.prompt = caption + paths = sample(config, nnet, clip_text_model, autoencoder, caption_decoder, device, json_data["id"], output_path=image_output_path) + images.append({"prompt": caption, "paths": paths}) + + # Return dictionary containing processed images and their corresponding captions + return { + "id": json_data["id"], + "images": images + } \ No newline at end of file diff --git a/checkmodel.py b/checkmodel.py deleted file mode 100644 index f886db3..0000000 --- a/checkmodel.py +++ /dev/null @@ -1,112 +0,0 @@ - -import torch -import utils -import os - -import libs.autoencoder -import clip -from libs.clip import CLIPEmbedder -from libs.caption_decoder import CaptionDecoder -from torch.utils.data import DataLoader -from libs.schedule import stable_diffusion_beta_schedule, Schedule, LSimple_T2I -import argparse -import yaml -import datetime -from pathlib import Path -from libs.data import PersonalizedBase, PromptDataset, collate_fn -from libs.uvit_multi_post_ln_v1 import UViT -import diffusers -from diffusers import DiffusionPipeline -from accelerate import Accelerator -from accelerate.logging import get_logger -from accelerate.utils import ProjectConfiguration, set_seed -from pathlib import Path -import torch.nn as nn - -import tqdm - -from accelerate.logging import get_logger - - - - -def train(config): - accelerator, device = utils.setup(config) - - train_state = utils.initialize_train_state(config, device, uvit_class=UViT) - #print("train_state", train_state) - - train_state.nnet.load_state_dict(torch.load(config.nnet_path, map_location='cpu'), False) - - caption_decoder = CaptionDecoder(device=device, **config.caption_decoder) - #print("caption_decoder", caption_decoder) - - - nnet, optimizer = accelerator.prepare(train_state.nnet, train_state.optimizer) - nnet.to(device) - #print("nnet", nnet) - print(nnet.state_dict()) - - - print(nnet.state_dict().keys()) - - - - - - - - # 非Lora部分不计算梯度 - for name,param in nnet.named_parameters(): - if 'lora_adapters_ttoi' in name or 'lora_adapters_itot' in name or 'token_embedding' in name: - param.requires_grad = True - else: - param.requires_grad=False - - - # check the nnet's parameters if they are frozen - for name, param in nnet.named_parameters(): - print(f'{name}: requires_grad={param.requires_grad}') - - - # Number of trainable parameters - print(sum(p.numel() for p in nnet.parameters() if p.requires_grad)) - - - - print("optimizer", optimizer) - - - - lr_scheduler = train_state.lr_scheduler - print("lr_scheduler", lr_scheduler) - - autoencoder = libs.autoencoder.get_model(**config.autoencoder).to(device) - print("autoencoder", autoencoder) - - clip_text_model = CLIPEmbedder(version=config.clip_text_model, device=device) - print("clip_text_model", clip_text_model) - - - # img clip model - clip_img_model, clip_img_model_preprocess = clip.load(config.clip_img_model, jit=False) - - # freeze the parameters of clip img model - clip_img_model.to(device).eval().requires_grad_(False) - - -def main(): - print("hello world") - from configs.unidiffuserv1 import get_config - config = get_config() - config.ckpt_root = "/home/schengwei" - config.workdir = "/home/schengwei" - # print("config", config) - - config_name = "unidiffuserv1" - train(config) - print("bye world") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/configs/sample_config.py 
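
Editor's note: `baseline1.py` defines the same `prepare_context()` / `process_one_json()` pair that `get_bound.py` below also exposes, and the diffstat shows a new `main.py` whose diff is not included in this hunk. For orientation, a minimal driver consistent with those two functions might look like the sketch below; the `baseline1` import path and the directory names are assumptions for illustration only, patterned on `load_json_files()`/`main()` in `get_bound.py` and on the `final_json_data/json`, `json_outputs/`, `image_outputs/` paths that appear elsewhere in this patch.

```python
# Illustrative driver only: mirrors the load_json_files/process_one_json flow
# visible in get_bound.py; module and directory names are assumptions.
import json
import os

from baseline1 import prepare_context, process_one_json  # assumed import path


def run_all(json_data_path="final_json_data/json",
            json_output_path="json_outputs",
            image_output_path="image_outputs"):
    os.makedirs(json_output_path, exist_ok=True)
    os.makedirs(image_output_path, exist_ok=True)

    # Heavy models (UViT, autoencoder, CLIP) are loaded once and reused per task.
    context = prepare_context()

    for name in sorted(os.listdir(json_data_path)):
        if not name.endswith(".json"):
            continue
        with open(os.path.join(json_data_path, name)) as f:
            task = json.load(f)

        # Fine-tune on the task's source images, then sample every caption.
        result = process_one_json(task, image_output_path, context)

        with open(os.path.join(json_output_path, f"{result['id']}.json"), "w") as f:
            json.dump(result, f, indent=4)


if __name__ == "__main__":
    run_all()
```

The per-task result dictionaries written this way match the shape of the `json_outputs/*.json` files added later in this patch.
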
b/configs/sample_config.py index 9921513..1cc29b2 100755 --- a/configs/sample_config.py +++ b/configs/sample_config.py @@ -10,7 +10,7 @@ def get_config(): config = ml_collections.ConfigDict() - + config.device = "cuda:7" # config.seed = 3214 # # config.seed = random.randint(500, 2000) # generate a random seed config.pred = 'noise_pred' @@ -29,9 +29,11 @@ def get_config(): config.batch_size = 6 config.resolution = 512 config.closerprompt = "photo of a girl" - config.clip_img_model = "ViT-B/32" + config.clip_img_model = "other_models/clip" # config.clip_text_model = "/data/hdd3/schengwei/models--openai--clip-vit-large-patch14/snapshots/8d052a0f05efbaefbc9e8786ba291cfdf93e5bff" - config.clip_text_model ="openai/clip-vit-large-patch14" + config.clip_text_model ="other_models/models--openai--clip-vit-large-patch14" + # config.uvit = "other_models/first_15000.pth" + config.uvit = "/home/wuyujia/.cache/final_test/nnet.pth" config.only_load_model = True @@ -83,8 +85,8 @@ def get_config(): config.n_samples = 6 # control the numbers of generating images config.n_iter = 1 # 过多的迭代次数可能导致过拟合或生成的样本过于接近训练数据 config.sample = d( - sample_steps=150, - scale=7.5, # scale of the text embedding 7 - 10 + sample_steps=80, + scale=9, # scale of the text embedding 7 - 10 t2i_cfg_mode='true_uncond', # 'empty_token' or 'true_uncond' ) diff --git a/configs/unidiffuserv1.py b/configs/unidiffuserv1.py index 70e5cb3..89afb90 100755 --- a/configs/unidiffuserv1.py +++ b/configs/unidiffuserv1.py @@ -18,23 +18,27 @@ def get_config(): config.clip_text_dim = 768 config.text_dim = 64 # reduce dimension config.data_type = 1 - config.gradient_accumulation_steps = 1 - - # config.log_interval = 1000 - # config.eval_interval = 100 - # config.save_interval = 200 - - config.max_step = 200 + + + config.localtest = 1 # if test in 145 localhost otherwise use 0 + + config.batch_size = 4 config.center_crop = True config.real_prior = True config.reversion = None - - config.nnet_path = "models/uvit_v1.pth" - + + # config.nnet_path = "models/uvit_v1.pth" + config.uvit = "other_models/first_15000.pth" + # config.uvit = "/data/hdd3/wuyujia/ImageReward/ImageReward/Competitionrepo/final_test/nnet-75000-good.pth" + # config.uvit = "/home/wuyujia/.cache/final_test/nnet.pth" + # config.uvit = "/data/hdd3/wuyujia/ImageReward/ImageReward/Competitionrepo/final_test/nnet_6000.pth" config.max_grad_norm = 1.0 + config.device = "cuda:3" + config.use_nnet_standard = True + config.dataloader_num_workers = 10 # original is 10 @@ -45,10 +49,13 @@ def get_config(): config.resolution = 512 - config.clip_img_model = "ViT-B/32" - config.clip_text_model = "openai/clip-vit-large-patch14" + config.clip_img_model = "other_models/clip" + config.clip_text_model = "other_models/models--openai--clip-vit-large-patch14" # config.clip_text_model = "/home/schengwei/.cache/huggingface/hub/models--openai--clip-vit-large-patch14/snapshots/8d052a0f05efbaefbc9e8786ba291cfdf93e5bff" - + # config.modelcontext = '/workspace/final_json_data' + config.modelcontext = 'final_json_data/json' + config.accelerate_adapters = 'other_models/adapter' + config.only_load_model = True @@ -104,12 +111,12 @@ def get_config(): # sample config.mode = "t2i" - config.n_samples = 9 # control the numbers of generating images + # config.n_samples = 9 # control the numbers of generating images config.n_iter = 1 # 过多的迭代次数可能导致过拟合或生成的样本过于接近训练数据 config.nrow = 4 config.sample = d( - sample_steps=100, - scale=7., + sample_steps=10, + scale=9, t2i_cfg_mode='true_uncond' ) diff --git a/cudatest.py b/cudatest.py 
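
Editor's note: the config changes above swap hard-coded Hugging Face model IDs for local paths, point `config.uvit` at a fine-tuned checkpoint (`other_models/first_15000.pth`), and lower `sample_steps` while raising `scale`. A hedged sketch of how these fields are consumed downstream, patterned on the loading code in `get_bound.py` below (the device and checkpoint path are whatever the config supplies; this is not the project's actual loader):

```python
# Sketch of config/checkpoint wiring, patterned on get_bound.py below.
import torch

from configs.unidiffuserv1 import get_config
from libs.uvit_multi_post_ln_v1 import UViT

config = get_config()
device = torch.device(config.device if torch.cuda.is_available() else "cpu")

# Build the UViT backbone from the config and load the checkpoint named by
# config.uvit; strict=False tolerates extra or missing keys such as adapters.
nnet = UViT(**config.nnet)
state_dict = torch.load(config.uvit, map_location="cpu")
nnet.load_state_dict(state_dict, strict=False)
nnet.to(device).eval()

# Sampling hyper-parameters come from the same config object.
print(config.sample.sample_steps, config.sample.scale, config.sample.t2i_cfg_mode)
```
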
deleted file mode 100755 index 3155b89..0000000 --- a/cudatest.py +++ /dev/null @@ -1,41 +0,0 @@ -# import os -# import torch -# os.environ['CUDA_VISIBLE_DEVICES']="0" -# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -# torch.randn((200000, 300, 200, 20), device=device) - -import os - -import torch - -from libs.dpm_solver_pp import NoiseScheduleVP, DPM_Solver - -from libs.uvit_multi_post_ln_v1 import UViT -from peft import inject_adapter_in_model, LoraConfig,get_peft_model - -lora_config = LoraConfig( - inference_mode=False, r=64, lora_alpha=32, lora_dropout=0.1,target_modules=["qkv","fc1","fc2","proj","text_embed","clip_img_embed"] -) - - -from configs.sample_config import get_config - - -config = get_config() - -config.lora_path = os.path.join("model_output/girl2_copy", "lora.pt.tmp",'lora.pt') - -device = "cuda" - -# init models -nnet = UViT(**config.nnet) - -print(f'load nnet from {config.lora_path}') - -nnet.load_state_dict(torch.load("/home/wuyujia/competition/models/uvit_v1.pth", map_location='cpu'), False) -print(nnet) - -nnet = get_peft_model(nnet,lora_config) - -nnet.load_state_dict(torch.load(config.lora_path, map_location='cpu'), False) - diff --git a/dependency/annotated_deep_learning_paper_implementations b/dependency/annotated_deep_learning_paper_implementations new file mode 160000 index 0000000..a1c17f3 --- /dev/null +++ b/dependency/annotated_deep_learning_paper_implementations @@ -0,0 +1 @@ +Subproject commit a1c17f3b4172460624b92be4c3e8d0da857aa39b diff --git a/get_bound.py b/get_bound.py new file mode 100644 index 0000000..805260e --- /dev/null +++ b/get_bound.py @@ -0,0 +1,192 @@ +import json +import os +import time +from PIL import Image +import argparse +import numpy as np +TIME_ST = time.time() +TIME_ED = time.time() + + +def prepare_context(): + """ + prepare context for later use + """ + import torch + import utils + from utils import set_logger + from absl import logging + import os + import libs.autoencoder + import clip + from libs.clip import FrozenCLIPEmbedder + from libs.uvit_multi_post_ln_v1 import UViT + from configs.unidiffuserv1 import get_config + import builtins + import ml_collections + from score import Evaluator + from torch import multiprocessing as mp + + config = get_config() + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # cuda:0 default + + nnet = UViT(**config.nnet) + origin_sd = torch.load("models/uvit_v1.pth", map_location='cpu') + nnet.load_state_dict(origin_sd, strict=False) + + nnet.to(device) + + + + + autoencoder = libs.autoencoder.get_model(**config.autoencoder).to(device) + clip_text_model = FrozenCLIPEmbedder(version=config.clip_text_model, device=device) + clip_img_model, clip_img_model_preprocess = clip.load(config.clip_img_model, jit=False) + clip_img_model.to(device).eval().requires_grad_(False) + + ev = Evaluator() + + return { + "device": device, + 'config': config, + "origin_sd": origin_sd, + "nnet": nnet, + "autoencoder": autoencoder, + "clip_text_model": clip_text_model, + "clip_img_model": clip_img_model, + "clip_img_model_preprocess": clip_img_model_preprocess, + "ev": ev + } + + +def load_json_files(path): + """ + given a directory, load all json files in that directory + return a list of json objects + """ + d_ls = [] + for file in os.listdir(path): + if file.endswith(".json"): + with open(os.path.join(path, file), 'r') as f: + json_data = json.load(f) + d_ls.append(json_data) + return d_ls + + +def process_one_json(json_data, image_output_path, context={}): + """ + given 
a json object, process the task the json describes + """ + import utils + from absl import logging + import torch + from sample_fn import sample + + # 初始化训练步数 + + config = context["config"] + device = context["device"] + + nnet = context["nnet"] + autoencoder = context["autoencoder"] + clip_text_model = context["clip_text_model"] + ev = context["ev"] + + config.n_samples = 4 + config.n_iter = 5 + + origin_images = [Image.open(i["path"]).convert('RGB') for i in json_data["source_group"]] + origin_face_embs = [ev.get_face_embedding(i) for i in origin_images] + origin_face_embs = [emb for emb in origin_face_embs if emb is not None] + origin_face_embs = torch.cat(origin_face_embs) + + origin_clip_embs = [ev.get_img_embedding(i) for i in origin_images] + origin_clip_embs = torch.cat(origin_clip_embs) + + images = [] + for caption in json_data["caption_list"]: + config.prompt = caption + paths = sample(config, nnet, clip_text_model, autoencoder, device, json_data["id"], output_path=image_output_path) + # face max sim is source group self sim + max_face_sim = (origin_face_embs @ origin_face_embs.T).mean().item() + + # face min sim is randon pic gened by prompt + samples = [Image.open(sample_path).convert('RGB') for sample_path in paths] + face_embs = [ev.get_face_embedding(sample) for sample in samples] + face_embs = [emb for emb in face_embs if emb is not None] + if len(face_embs) == 0: + print(f"no face for case{json_data['id']} caption {caption}") + continue + min_face_sim = (origin_face_embs @ torch.cat(face_embs).T).mean().item() + + # text max sim is image gened by prompt sim with prompt + text_emb = ev.get_text_embedding(caption) + gen_clip_embs = torch.cat([ev.get_img_embedding(i) for i in samples]) + max_text_sim = (text_emb @ gen_clip_embs.T).mean().item() + + + # text min sim is source group with prompt + min_text_sim = (text_emb @ origin_clip_embs.T).mean().item() + + # image reward max sim is gened by prompt sim with prompt + max_image_reward = np.mean([ev.image_reward.score(caption, path) for path in paths ]).item() + + # image reward min sim is source group with prompt + min_image_reward = np.mean([ev.image_reward.score(caption, i["path"]) for i in json_data["source_group"] ]).item() + + + + images.append({"prompt": caption, "paths": paths, + "max_face_sim": max_face_sim, + "min_face_sim": min_face_sim, + "max_text_sim": max_text_sim, + "min_text_sim": min_text_sim, + "max_image_reward": max_image_reward, + "min_image_reward": min_image_reward, + }) + + return { + "id": json_data["id"], + "images": images + } + + + +def tik(): + global TIME_ST + TIME_ST = time.time() +def tok(name): + global TIME_ED + TIME_ED = time.time() + print(f"Time {name} elapsed: {TIME_ED - TIME_ST}") + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-d","--json_data_path", type=str, default="test_json_data", help="file contains prompts") + parser.add_argument("-j","--json_output_path", type=str, default="bound_json_outputs", help="file contains scores") + parser.add_argument("-i","--image_output_path", type=str, default="bound_image_outputs", help="output dir for generated images") + return parser.parse_args() + +def main(): + """ + main function + """ + arg = get_args() + os.makedirs(arg.json_output_path, exist_ok=True) + os.makedirs(arg.image_output_path, exist_ok=True) + # load json files + json_data_ls = load_json_files(arg.json_data_path) + + # process json files + context = prepare_context() + + for json_data in json_data_ls: + tik() + out = process_one_json(json_data, 
arg.image_output_path, context) + tok(f"process_one_json: {json_data['id']}") + with open(os.path.join(arg.json_output_path, f"{json_data['id']}.json"), 'w') as f: + json.dump(out, f, indent=4) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/img2img.py b/img2img.py new file mode 100644 index 0000000..a76523f --- /dev/null +++ b/img2img.py @@ -0,0 +1,238 @@ +""" +--- +title: Generate images using stable diffusion with a prompt from a given image +summary: > + Generate images using stable diffusion with a prompt from a given image +--- + +# Generate images using [stable diffusion](../index.html) with a prompt from a given image +""" + +import torch + +import argparse +from torch import nn +import einops +from resize import resize_images_in_path + +import argparse +from pathlib import Path +import clip +from labml import lab, monit +from labml_nn.diffusion.stable_diffusion.sampler.ddim import DDIMSampler +from labml_nn.diffusion.stable_diffusion.util import load_model, load_img, save_images, set_seed +# from torchvision.utils import save_image + +class Img2Img: + """ + ### Image to image class + """ + + def __init__(self, *, config, + ddim_steps: int = 50, + ddim_eta: float = 0.0): + """ + :param checkpoint_path: is the path of the checkpoint + :param ddim_steps: is the number of sampling steps + :param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant + """ + self.ddim_steps = ddim_steps + + # Load [latent diffusion model](../latent_diffusion.html) + self.model = load_model(config.uvit ,config) + + # Get device + self.device = torch.device(config.device) if torch.cuda.is_available() else torch.device("cpu") + # Move the model to device + self.model.to(self.device) + + # Initialize [DDIM sampler](../sampler/ddim.html) + self.sampler = DDIMSampler(self.model, + n_steps=ddim_steps, + ddim_eta=ddim_eta) + + + @torch.no_grad() + def __call__(self, *, + dest_path: str, + orig_img: str, + strength: float, + batch_size: int = 3, + prompt: str, + uncond_scale: float = 5.0, + ): + """ + :param dest_path: is the path to store the generated images + :param orig_img: is the image to transform + :param strength: specifies how much of the original image should not be preserved + :param batch_size: is the number of images to generate in a batch + :param prompt: is the prompt to generate images with + :param uncond_scale: is the unconditional guidance scale $s$. This is used for + $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (s - 1)\epsilon_\text{cond}(x_t, c_u)$ + """ + + def combine(z, clip_img): + z = einops.rearrange(z, 'B C H W -> B (C H W)') + clip_img = einops.rearrange(clip_img, 'B L D -> B (L D)') + return torch.concat([z, clip_img], dim=-1) + def unpreprocess(v): # to B C H W and [0, 1] + v = 0.5 * (v + 1.) + v.clamp_(0., 1.) 
+ return v + def split(x): + ### x : torch.Size([4, 4, 64, 64]) + C, H, W = (4, 64, 64) + z_dim = C * H * W + z, clip_img = x.split([z_dim, 512], dim=1) + + z = einops.rearrange(z, 'B (C H W) -> B C H W', C=C, H=H, W=W) + clip_img = einops.rearrange(clip_img, 'B (L D) -> B L D', L=1, D=512) + return z, clip_img + + + # Make a batch of prompts + prompts = batch_size * [prompt] + + # Load image 已经是 tensor + orig_image = load_img(orig_img).to(self.device) + + + # Encode the image in the latent space and make `batch_size` copies of it + orig = self.model.autoencoder_encode(orig_image).repeat(batch_size, 1, 1, 1).to(self.device) + + ### orig: torch.Size([4, 4, 80, 60]) + + orig_clipimg = self.model.get_clipimg_embedding(orig_image).repeat(batch_size,1,1).to(self.device) + + + # orig = combine(orig, orig_clipimg) + + + + # Get the number of steps to diffuse the original + assert 0. <= strength <= 1., 'can only work with strength in [0.0, 1.0]' + t_index = int(strength * self.ddim_steps) # int 37 + + # AMP auto casting + with torch.cuda.amp.autocast(): + # In unconditional scaling is not $1$ get the embeddings for empty prompts (no conditioning). + if uncond_scale != 1.0: + un_cond = self.model.get_text_conditioning(batch_size * [""]) + ### un_cond.shape: torch.Size([4, 77, 768]) + else: + un_cond = None + + + # Get the prompt embeddings + cond = self.model.get_text_conditioning(prompts) + + ### cond.shape: torch.Size([4, 77, 768]) + cond = self.model.get_encode_prefix(cond) + + def captiondecodeprefix(x): + return self.model.get_decode_prefix(x) + + def captionencodeprefix(x): + return self.model.get_encode_prefix(x) + + # Add noise to the original image + + t_img = torch.Tensor(t_index).unsqueeze(0).repeat(batch_size, 1).to(self.device) + t_text = torch.zeros(t_img.size(0), dtype=torch.int, device=self.device) + datatype = torch.zeros_like(t_text, device=self.device, dtype=torch.int) + 1 + + + x,added_noise = self.sampler.q_sample(orig, t_index) + + # Reconstruct from the noisy image + x = self.sampler.paint(x, cond, t_index,t_img, orig_clipimg, t_text, datatype, captiondecodeprefix,captionencodeprefix, + uncond_scale=uncond_scale, + uncond_cond=un_cond) + # Decode the image from the [autoencoder](../model/autoencoder.html) + + # z, _ = split(x) + + images = self.model.autoencoder_decode(x) + + # Save images + save_images(images, dest_path, 'img_') + + +def main(): + """ + ### CLI + """ + from configs.sample_config import get_config + + config = get_config() + + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--prompt", + type=str, + nargs="?", + default="", + help="the prompt to render" + ) + + parser.add_argument( + "--orig-img", + type=str, + nargs="?", + default="/home/schengwei/Competitionrepo/resources/boy1_example.jpeg", + help="path to the input image" + ) + parser.add_argument( + "--device-id", + type=str, + default="cuda:5", + help="device to use" + ) + # init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png") + parser.add_argument("--batch_size", type=int, default=4, help="batch size", ) + parser.add_argument("--steps", type=int, default=50, help="number of ddim sampling steps") + + parser.add_argument("--scale", type=float, default=5.0, + help="unconditional guidance scale: " + "eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))") + + parser.add_argument("--strength", type=float, default=0.01, + help="strength for noise: " + "vary from 0.0 to 1.0 which 1.0 corresponds to full destruction of 
information in init image") + parser.add_argument( + "--ddim_eta", + type=int, + default=0, + help="ddim eta control the noise adding each step." + ) + parser.add_argument( + "--dest_path", + type=str, + default="/home/schengwei/Competitionrepo/ddimoutput", + help="the path to save the generated images" + ) + + opt = parser.parse_args() + set_seed(42) + + # which gpu to use + config.device = opt.device_id + + img2img = Img2Img(config=config, ddim_steps=opt.steps, ddim_eta=opt.ddim_eta) + + + with monit.section('Generate'): + img2img( + dest_path='/home/schengwei/Competitionrepo/ddimoutput', + orig_img=opt.orig_img, + strength=opt.strength, + batch_size=opt.batch_size, + prompt=opt.prompt, + uncond_scale=opt.scale) + + +# +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/img2img_copytest.py b/img2img_copytest.py new file mode 100644 index 0000000..032ea21 --- /dev/null +++ b/img2img_copytest.py @@ -0,0 +1,122 @@ +""" +--- +title: Generate images using stable diffusion with a prompt from a given image +summary: > + Generate images using stable diffusion with a prompt from a given image +--- + +# Generate images using [stable diffusion](../index.html) with a prompt from a given image +""" + +import torch + +import argparse +from torch import nn +import einops +from resize import resize_images_in_path +import libs.autoencoder +import argparse +# from configs.sample_config import get_config +from pathlib import Path +import clip +from labml import lab, monit +from labml_nn.diffusion.stable_diffusion.sampler.ddim import DDIMSampler +from labml_nn.diffusion.stable_diffusion.util import load_model, load_img, set_seed +# from torchvision.utils import save_image + +class Img2Img: + """ + ### Image to image class + """ + + def __init__(self, *, config, + ddim_steps: int = 50, + ddim_eta: float = 0.0): + """ + :param checkpoint_path: is the path of the checkpoint + :param ddim_steps: is the number of sampling steps + :param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant + """ + self.ddim_steps = ddim_steps + + # Load [latent diffusion model](../latent_diffusion.html) + self.model = load_model(config.uvit ,config) + + # Get device + self.device = torch.device(config.device) if torch.cuda.is_available() else torch.device("cpu") + # Move the model to device + self.model.to(self.device) + + # Initialize [DDIM sampler](../sampler/ddim.html) + self.sampler = DDIMSampler(self.model, + n_steps=ddim_steps, + ddim_eta=ddim_eta) + + + @torch.no_grad() + def __call__(self, context,latent_cb: torch.tensor, *, + strength: float = 0.0, + batch_size: int = 1, + prompt: str = "", + uncond_scale: float = 5.0, + # autoencoder=autoencoder + ): + + + orig = latent_cb + + orig_clipimg = torch.randn(1, 1, 512, device=self.device) + + # autoencoder = autoencoder.to(self.device) + + # Get the number of steps to diffuse the original + assert 0. 
<= strength <= 1., 'can only work with strength in [0.0, 1.0]' + if strength * self.ddim_steps < 1.0: + t_index = int(1) + else: + t_index = int(strength * self.ddim_steps) # int 37 + + # AMP auto casting + with torch.cuda.amp.autocast(): + # un_cond = self.model.get_text_conditioning(batch_size * [""]) + + + # Get the prompt embeddings + cond = context + ### now cond is torch.Size([2, 77, 768]) + + ### cond.shape: torch.Size([4, 77, 768]) + # cond = self.model.get_encode_prefix(cond) + + def captiondecodeprefix(x): + return self.model.get_decode_prefix(x) + + def captionencodeprefix(x): + return self.model.get_encode_prefix(x) + + # Add noise to the original image + + t_img = torch.Tensor([1]).unsqueeze(0).repeat(batch_size, 1).to(self.device) + t_text = torch.zeros(t_img.size(0), dtype=torch.int, device=self.device) + datatype = torch.zeros_like(t_text, device=self.device, dtype=torch.int) + 1 + + + # x,added_noise = self.sampler.q_sample(orig, t_index) + x,orgin_n = self.sampler.q_sample(orig, t_index) + + # Reconstruct from the noisy image + x = self.sampler.paint(x, + cond, + t_index, + t_img, + orig_clipimg, + t_text, + datatype, + captiondecodeprefix, + captionencodeprefix, + orgin_n, + uncond_scale=uncond_scale + ) + images = x + + return images diff --git a/img2imgcopy.py b/img2imgcopy.py new file mode 100644 index 0000000..79ee228 --- /dev/null +++ b/img2imgcopy.py @@ -0,0 +1,184 @@ +""" +--- +title: Generate images using stable diffusion with a prompt from a given image +summary: > + Generate images using stable diffusion with a prompt from a given image +--- + +# Generate images using [stable diffusion](../index.html) with a prompt from a given image +""" + +import torch + +import argparse +from torch import nn +import einops +from resize import resize_images_in_path + +import argparse +from pathlib import Path +import clip +from labml import lab, monit +from labml_nn.diffusion.stable_diffusion.sampler.ddim import DDIMSampler +from labml_nn.diffusion.stable_diffusion.util import load_model, load_img, save_images, set_seed +# from torchvision.utils import save_image + +class Img2Img: + """ + ### Image to image class + """ + + def __init__(self, *, config, + ddim_steps: int = 50, + ddim_eta: float = 0.0): + """ + :param checkpoint_path: is the path of the checkpoint + :param ddim_steps: is the number of sampling steps + :param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant + """ + self.ddim_steps = ddim_steps + + # Load [latent diffusion model](../latent_diffusion.html) + self.model = load_model(config.uvit ,config) + + # Get device + self.device = torch.device(config.device) if torch.cuda.is_available() else torch.device("cpu") + # Move the model to device + self.model.to(self.device) + + # Initialize [DDIM sampler](../sampler/ddim.html) + self.sampler = DDIMSampler(self.model, + n_steps=ddim_steps, + ddim_eta=ddim_eta) + + + @torch.no_grad() + def __call__(self, context,latent_cb: torch.tensor, *, + strength: float = 0.0, + batch_size: int = 1, + prompt: str = "", + uncond_scale: float = 5.0, + + ): + """ + :param dest_path: is the path to store the generated images + :param orig_img: is the image to transform + :param strength: specifies how much of the original image should not be preserved + :param batch_size: is the number of images to generate in a batch + :param prompt: is the prompt to generate images with + :param uncond_scale: is the unconditional guidance scale $s$. 
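
Editor's note: one detail worth calling out in the two samplers above is the noise-level arithmetic. The index is chosen as `t_index = int(strength * ddim_steps)`, so the very small strengths used in the launch configuration earlier in this patch (`--strength 0.03` with `--steps 25`) truncate to 0, which is presumably why the `img2img_copytest.py` variant floors the index at 1. A small self-contained check of that arithmetic:

```python
# Reproduces the strength -> t_index mapping used by the Img2Img classes above.
def t_index_for(strength: float, ddim_steps: int, floor_to_one: bool = False) -> int:
    assert 0.0 <= strength <= 1.0, "can only work with strength in [0.0, 1.0]"
    t_index = int(strength * ddim_steps)
    if floor_to_one and t_index < 1:
        t_index = 1  # the img2img_copytest.py variant applies this floor
    return t_index


print(t_index_for(0.03, 25))         # 0  -> truncates to zero noise steps
print(t_index_for(0.03, 25, True))   # 1
print(t_index_for(0.75, 50))         # 37 (presumably the "int 37" in the inline comments)
```
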
This is used for + $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (s - 1)\epsilon_\text{cond}(x_t, c_u)$ + """ + + orig = latent_cb + + ### orig: torch.Size([4, 4, 80, 60]) + + orig_clipimg = torch.randn(1, 1, 512, device=self.device) + + + # Get the number of steps to diffuse the original + assert 0. <= strength <= 1., 'can only work with strength in [0.0, 1.0]' + t_index = int(strength * self.ddim_steps) # int 37 + + # AMP auto casting + with torch.cuda.amp.autocast(): + un_cond = self.model.get_text_conditioning(batch_size * [""]) + + + # Get the prompt embeddings + cond = context + + ### cond.shape: torch.Size([4, 77, 768]) + cond = self.model.get_encode_prefix(cond) + + def captiondecodeprefix(x): + return self.model.get_decode_prefix(x) + + def captionencodeprefix(x): + return self.model.get_encode_prefix(x) + + # Add noise to the original image + + t_img = torch.Tensor(t_index).unsqueeze(0).repeat(batch_size, 1).to(self.device) + t_text = torch.zeros(t_img.size(0), dtype=torch.int, device=self.device) + datatype = torch.zeros_like(t_text, device=self.device, dtype=torch.int) + 1 + + + x,added_noise = self.sampler.q_sample(orig, t_index) + + # Reconstruct from the noisy image + x = self.sampler.paint(x, cond, t_index,t_img, orig_clipimg, t_text, datatype, captiondecodeprefix,captionencodeprefix, + uncond_scale=uncond_scale, + uncond_cond=un_cond) + images = x + + return images + + +def main(): + """ + ### CLI + """ + from configs.sample_config import get_config + + config = get_config() + + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--prompt", + type=str, + nargs="?", + default="", + help="the prompt to render" + ) + + parser.add_argument( + "--orig-img", + type=str, + nargs="?", + default="/home/schengwei/Competitionrepo/resources/boy1_example.jpeg", + help="path to the input image" + ) + parser.add_argument( + "--device-id", + type=str, + default="cuda:5", + help="device to use" + ) + # init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png") + parser.add_argument("--batch_size", type=int, default=4, help="batch size", ) + parser.add_argument("--steps", type=int, default=50, help="number of ddim sampling steps") + + parser.add_argument("--scale", type=float, default=5.0, + help="unconditional guidance scale: " + "eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))") + + parser.add_argument("--strength", type=float, default=0.01, + help="strength for noise: " + "vary from 0.0 to 1.0 which 1.0 corresponds to full destruction of information in init image") + + opt = parser.parse_args() + set_seed(42) + + # which gpu to use + config.device = opt.device_id + + img2img = Img2Img(config=config, ddim_steps=opt.steps, ddim_eta=0) + + + with monit.section('Generate'): + img2img( + dest_path='/home/schengwei/Competitionrepo/ddimoutput', + orig_img=opt.orig_img, + strength=opt.strength, + batch_size=opt.batch_size, + prompt=opt.prompt, + uncond_scale=opt.scale) + + +# +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/indocker_shell.sh b/indocker_shell.sh index f556c85..30297b0 100755 --- a/indocker_shell.sh +++ b/indocker_shell.sh @@ -1,5 +1,8 @@ #!/bin/bash +python main.py -/workspace/sample.sh > /workspace/results.log 2>&1 -cat /workspace/results.log | grep 'finetuned parameters' | awk '{s+=$(NF)} END {print s}' -python score.py + +# the old one +# /workspace/sample.sh > /workspace/results.log 2>&1 +# cat /workspace/results.log | grep 'finetuned parameters' 
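
Editor's note: the `grep`/`awk` pipeline being retired here summed the `finetuned parameters` figures that `sample.py` logged; the old README text deleted earlier in this patch says that number comes from comparing the fine-tuned weights against the original `models/uvit_v1.pth`. The sketch below is an illustrative way to compute such a count in Python and is not the project's actual implementation; the fine-tuned checkpoint path is hypothetical.

```python
# Illustrative only: count how many parameters differ between a fine-tuned
# checkpoint and the original models/uvit_v1.pth, in the spirit of the
# "finetuned parameters" figure the retired shell pipeline summed up.
import torch


def count_finetuned_parameters(finetuned_sd, original_sd):
    changed = 0
    for name, tensor in finetuned_sd.items():
        base = original_sd.get(name)
        if base is None or base.shape != tensor.shape:
            changed += tensor.numel()  # new or re-shaped parameter
        elif not torch.equal(tensor.float(), base.float()):
            changed += tensor.numel()  # value changed by fine-tuning
    return changed


# finetuned = torch.load("model_output/girl1/nnet.pth", map_location="cpu")  # hypothetical path
# original = torch.load("models/uvit_v1.pth", map_location="cpu")
# print("finetuned parameters:", count_finetuned_parameters(finetuned, original))
```
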
| awk '{s+=$(NF)} END {print s}' +# python score.py \ No newline at end of file diff --git a/json_outputs/1.json b/json_outputs/1.json new file mode 100644 index 0000000..5b87cdb --- /dev/null +++ b/json_outputs/1.json @@ -0,0 +1 @@ +{"id": 1, "images": [{"prompt": "A man is standing in front of the Custom House, posing for a photo ", "paths": ["image_outputs/1-A man is standing in front of the Custom House, posing for a photo -000.jpg", "image_outputs/1-A man is standing in front of the Custom House, posing for a photo -001.jpg", "image_outputs/1-A man is standing in front of the Custom House, posing for a photo -002.jpg", "image_outputs/1-A man is standing in front of the Custom House, posing for a photo -003.jpg"]}, {"prompt": "A man wearing a black suit and red bow tie, standing confidently for a photo ", "paths": ["image_outputs/1-A man wearing a black suit and red bow tie, standing confidently for a photo -000.jpg", "image_outputs/1-A man wearing a black suit and red bow tie, standing confidently for a photo -001.jpg", "image_outputs/1-A man wearing a black suit and red bow tie, standing confidently for a photo -002.jpg", "image_outputs/1-A man wearing a black suit and red bow tie, standing confidently for a photo -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/10.json b/json_outputs/10.json new file mode 100644 index 0000000..94fd4fe --- /dev/null +++ b/json_outputs/10.json @@ -0,0 +1 @@ +{"id": 10, "images": [{"prompt": "a woman in a blue blouse and black skirt ", "paths": ["image_outputs/10-a woman in a blue blouse and black skirt -000.jpg", "image_outputs/10-a woman in a blue blouse and black skirt -001.jpg", "image_outputs/10-a woman in a blue blouse and black skirt -002.jpg", "image_outputs/10-a woman in a blue blouse and black skirt -003.jpg"]}, {"prompt": "an asian woman sitting at a table in a restaurant ", "paths": ["image_outputs/10-an asian woman sitting at a table in a restaurant -000.jpg", "image_outputs/10-an asian woman sitting at a table in a restaurant -001.jpg", "image_outputs/10-an asian woman sitting at a table in a restaurant -002.jpg", "image_outputs/10-an asian woman sitting at a table in a restaurant -003.jpg"]}, {"prompt": "an asian woman in a black swimsuit posing in the water ", "paths": ["image_outputs/10-an asian woman in a black swimsuit posing in the water -000.jpg", "image_outputs/10-an asian woman in a black swimsuit posing in the water -001.jpg", "image_outputs/10-an asian woman in a black swimsuit posing in the water -002.jpg", "image_outputs/10-an asian woman in a black swimsuit posing in the water -003.jpg"]}, {"prompt": "a woman sitting at a table with three trays of food ", "paths": ["image_outputs/10-a woman sitting at a table with three trays of food -000.jpg", "image_outputs/10-a woman sitting at a table with three trays of food -001.jpg", "image_outputs/10-a woman sitting at a table with three trays of food -002.jpg", "image_outputs/10-a woman sitting at a table with three trays of food -003.jpg"]}, {"prompt": "an asian woman wearing a necklace and earphones ", "paths": ["image_outputs/10-an asian woman wearing a necklace and earphones -000.jpg", "image_outputs/10-an asian woman wearing a necklace and earphones -001.jpg", "image_outputs/10-an asian woman wearing a necklace and earphones -002.jpg", "image_outputs/10-an asian woman wearing a necklace and earphones -003.jpg"]}, {"prompt": "a woman taking a selfie in front of a mirror ", "paths": ["image_outputs/10-a woman taking a selfie in front of a mirror -000.jpg", 
"image_outputs/10-a woman taking a selfie in front of a mirror -001.jpg", "image_outputs/10-a woman taking a selfie in front of a mirror -002.jpg", "image_outputs/10-a woman taking a selfie in front of a mirror -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/11.json b/json_outputs/11.json new file mode 100644 index 0000000..439c582 --- /dev/null +++ b/json_outputs/11.json @@ -0,0 +1 @@ +{"id": 11, "images": [{"prompt": "a young asian woman wearing a black top and earphones ", "paths": ["image_outputs/11-a young asian woman wearing a black top and earphones -000.jpg", "image_outputs/11-a young asian woman wearing a black top and earphones -001.jpg", "image_outputs/11-a young asian woman wearing a black top and earphones -002.jpg", "image_outputs/11-a young asian woman wearing a black top and earphones -003.jpg"]}, {"prompt": "a woman in a pink vest sitting in a car ", "paths": ["image_outputs/11-a woman in a pink vest sitting in a car -000.jpg", "image_outputs/11-a woman in a pink vest sitting in a car -001.jpg", "image_outputs/11-a woman in a pink vest sitting in a car -002.jpg", "image_outputs/11-a woman in a pink vest sitting in a car -003.jpg"]}, {"prompt": "a young asian woman sitting on a couch with a pillow in front of her ", "paths": ["image_outputs/11-a young asian woman sitting on a couch with a pillow in front of her -000.jpg", "image_outputs/11-a young asian woman sitting on a couch with a pillow in front of her -001.jpg", "image_outputs/11-a young asian woman sitting on a couch with a pillow in front of her -002.jpg", "image_outputs/11-a young asian woman sitting on a couch with a pillow in front of her -003.jpg"]}, {"prompt": "a woman in jeans and a leather jacket taking a selfie ", "paths": ["image_outputs/11-a woman in jeans and a leather jacket taking a selfie -000.jpg", "image_outputs/11-a woman in jeans and a leather jacket taking a selfie -001.jpg", "image_outputs/11-a woman in jeans and a leather jacket taking a selfie -002.jpg", "image_outputs/11-a woman in jeans and a leather jacket taking a selfie -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/12.json b/json_outputs/12.json new file mode 100644 index 0000000..da9fa19 --- /dev/null +++ b/json_outputs/12.json @@ -0,0 +1 @@ +{"id": 12, "images": [{"prompt": "an asian woman holding her finger up to her face ", "paths": ["image_outputs/12-an asian woman holding her finger up to her face -000.jpg", "image_outputs/12-an asian woman holding her finger up to her face -001.jpg", "image_outputs/12-an asian woman holding her finger up to her face -002.jpg", "image_outputs/12-an asian woman holding her finger up to her face -003.jpg"]}, {"prompt": "a young asian woman wearing a hat and scarf ", "paths": ["image_outputs/12-a young asian woman wearing a hat and scarf -000.jpg", "image_outputs/12-a young asian woman wearing a hat and scarf -001.jpg", "image_outputs/12-a young asian woman wearing a hat and scarf -002.jpg", "image_outputs/12-a young asian woman wearing a hat and scarf -003.jpg"]}, {"prompt": "an asian woman in a blue dress sitting in the back seat of a car ", "paths": ["image_outputs/12-an asian woman in a blue dress sitting in the back seat of a car -000.jpg", "image_outputs/12-an asian woman in a blue dress sitting in the back seat of a car -001.jpg", "image_outputs/12-an asian woman in a blue dress sitting in the back seat of a car -002.jpg", "image_outputs/12-an asian woman in a blue dress sitting in the back seat of a car -003.jpg"]}, {"prompt": "a young asian woman 
wearing a green sweater and black scarf ", "paths": ["image_outputs/12-a young asian woman wearing a green sweater and black scarf -000.jpg", "image_outputs/12-a young asian woman wearing a green sweater and black scarf -001.jpg", "image_outputs/12-a young asian woman wearing a green sweater and black scarf -002.jpg", "image_outputs/12-a young asian woman wearing a green sweater and black scarf -003.jpg"]}, {"prompt": "a woman in a pink dress posing for the camera ", "paths": ["image_outputs/12-a woman in a pink dress posing for the camera -000.jpg", "image_outputs/12-a woman in a pink dress posing for the camera -001.jpg", "image_outputs/12-a woman in a pink dress posing for the camera -002.jpg", "image_outputs/12-a woman in a pink dress posing for the camera -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/13.json b/json_outputs/13.json new file mode 100644 index 0000000..7914435 --- /dev/null +++ b/json_outputs/13.json @@ -0,0 +1 @@ +{"id": 13, "images": [{"prompt": "a woman in a black shirt posing for the camera ", "paths": ["image_outputs/13-a woman in a black shirt posing for the camera -000.jpg", "image_outputs/13-a woman in a black shirt posing for the camera -001.jpg", "image_outputs/13-a woman in a black shirt posing for the camera -002.jpg", "image_outputs/13-a woman in a black shirt posing for the camera -003.jpg"]}, {"prompt": "a woman in a green shirt and sunglasses sitting in the back seat of a car ", "paths": ["image_outputs/13-a woman in a green shirt and sunglasses sitting in the back seat of a car -000.jpg", "image_outputs/13-a woman in a green shirt and sunglasses sitting in the back seat of a car -001.jpg", "image_outputs/13-a woman in a green shirt and sunglasses sitting in the back seat of a car -002.jpg", "image_outputs/13-a woman in a green shirt and sunglasses sitting in the back seat of a car -003.jpg"]}, {"prompt": "a young asian woman wearing an adidas sweatshirt ", "paths": ["image_outputs/13-a young asian woman wearing an adidas sweatshirt -000.jpg", "image_outputs/13-a young asian woman wearing an adidas sweatshirt -001.jpg", "image_outputs/13-a young asian woman wearing an adidas sweatshirt -002.jpg", "image_outputs/13-a young asian woman wearing an adidas sweatshirt -003.jpg"]}, {"prompt": "a woman with long brown hair is taking a selfie ", "paths": ["image_outputs/13-a woman with long brown hair is taking a selfie -000.jpg", "image_outputs/13-a woman with long brown hair is taking a selfie -001.jpg", "image_outputs/13-a woman with long brown hair is taking a selfie -002.jpg", "image_outputs/13-a woman with long brown hair is taking a selfie -003.jpg"]}, {"prompt": "a woman in a bra and panties sitting on a bed with a butterfly ", "paths": ["image_outputs/13-a woman in a bra and panties sitting on a bed with a butterfly -000.jpg", "image_outputs/13-a woman in a bra and panties sitting on a bed with a butterfly -001.jpg", "image_outputs/13-a woman in a bra and panties sitting on a bed with a butterfly -002.jpg", "image_outputs/13-a woman in a bra and panties sitting on a bed with a butterfly -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/14.json b/json_outputs/14.json new file mode 100644 index 0000000..6f96d74 --- /dev/null +++ b/json_outputs/14.json @@ -0,0 +1 @@ +{"id": 14, "images": [{"prompt": "a woman in a yellow sweater sitting on a couch ", "paths": ["image_outputs/14-a woman in a yellow sweater sitting on a couch -000.jpg", "image_outputs/14-a woman in a yellow sweater sitting on a couch -001.jpg", 
"image_outputs/14-a woman in a yellow sweater sitting on a couch -002.jpg", "image_outputs/14-a woman in a yellow sweater sitting on a couch -003.jpg"]}, {"prompt": "a woman laying on a bed with a white cat ", "paths": ["image_outputs/14-a woman laying on a bed with a white cat -000.jpg", "image_outputs/14-a woman laying on a bed with a white cat -001.jpg", "image_outputs/14-a woman laying on a bed with a white cat -002.jpg", "image_outputs/14-a woman laying on a bed with a white cat -003.jpg"]}, {"prompt": "a young woman using her cell phone in a mall ", "paths": ["image_outputs/14-a young woman using her cell phone in a mall -000.jpg", "image_outputs/14-a young woman using her cell phone in a mall -001.jpg", "image_outputs/14-a young woman using her cell phone in a mall -002.jpg", "image_outputs/14-a young woman using her cell phone in a mall -003.jpg"]}, {"prompt": "an asian woman wearing a black shirt and blue tie ", "paths": ["image_outputs/14-an asian woman wearing a black shirt and blue tie -000.jpg", "image_outputs/14-an asian woman wearing a black shirt and blue tie -001.jpg", "image_outputs/14-an asian woman wearing a black shirt and blue tie -002.jpg", "image_outputs/14-an asian woman wearing a black shirt and blue tie -003.jpg"]}, {"prompt": "a woman posing for a photo in front of an asian restaurant ", "paths": ["image_outputs/14-a woman posing for a photo in front of an asian restaurant -000.jpg", "image_outputs/14-a woman posing for a photo in front of an asian restaurant -001.jpg", "image_outputs/14-a woman posing for a photo in front of an asian restaurant -002.jpg", "image_outputs/14-a woman posing for a photo in front of an asian restaurant -003.jpg"]}, {"prompt": "a young asian woman with short hair posing for a selfie ", "paths": ["image_outputs/14-a young asian woman with short hair posing for a selfie -000.jpg", "image_outputs/14-a young asian woman with short hair posing for a selfie -001.jpg", "image_outputs/14-a young asian woman with short hair posing for a selfie -002.jpg", "image_outputs/14-a young asian woman with short hair posing for a selfie -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/15.json b/json_outputs/15.json new file mode 100644 index 0000000..5d7e833 --- /dev/null +++ b/json_outputs/15.json @@ -0,0 +1 @@ +{"id": 15, "images": [{"prompt": "a woman in the back seat of a car holding a bottle of sunscreen ", "paths": ["image_outputs/15-a woman in the back seat of a car holding a bottle of sunscreen -000.jpg", "image_outputs/15-a woman in the back seat of a car holding a bottle of sunscreen -001.jpg", "image_outputs/15-a woman in the back seat of a car holding a bottle of sunscreen -002.jpg", "image_outputs/15-a woman in the back seat of a car holding a bottle of sunscreen -003.jpg"]}, {"prompt": "a young asian woman with big breasts and long hair ", "paths": ["image_outputs/15-a young asian woman with big breasts and long hair -000.jpg", "image_outputs/15-a young asian woman with big breasts and long hair -001.jpg", "image_outputs/15-a young asian woman with big breasts and long hair -002.jpg", "image_outputs/15-a young asian woman with big breasts and long hair -003.jpg"]}, {"prompt": "a woman in a white top and black shorts standing in front of a crowd ", "paths": ["image_outputs/15-a woman in a white top and black shorts standing in front of a crowd -000.jpg", "image_outputs/15-a woman in a white top and black shorts standing in front of a crowd -001.jpg", "image_outputs/15-a woman in a white top and black shorts standing in 
front of a crowd -002.jpg", "image_outputs/15-a woman in a white top and black shorts standing in front of a crowd -003.jpg"]}, {"prompt": "a woman eating a croissant in front of a tv screen ", "paths": ["image_outputs/15-a woman eating a croissant in front of a tv screen -000.jpg", "image_outputs/15-a woman eating a croissant in front of a tv screen -001.jpg", "image_outputs/15-a woman eating a croissant in front of a tv screen -002.jpg", "image_outputs/15-a woman eating a croissant in front of a tv screen -003.jpg"]}, {"prompt": "a woman in a white shirt making a kissy face ", "paths": ["image_outputs/15-a woman in a white shirt making a kissy face -000.jpg", "image_outputs/15-a woman in a white shirt making a kissy face -001.jpg", "image_outputs/15-a woman in a white shirt making a kissy face -002.jpg", "image_outputs/15-a woman in a white shirt making a kissy face -003.jpg"]}, {"prompt": "an asian woman with long brown hair posing for the camera ", "paths": ["image_outputs/15-an asian woman with long brown hair posing for the camera -000.jpg", "image_outputs/15-an asian woman with long brown hair posing for the camera -001.jpg", "image_outputs/15-an asian woman with long brown hair posing for the camera -002.jpg", "image_outputs/15-an asian woman with long brown hair posing for the camera -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/16.json b/json_outputs/16.json new file mode 100644 index 0000000..68389c8 --- /dev/null +++ b/json_outputs/16.json @@ -0,0 +1 @@ +{"id": 16, "images": [{"prompt": "a young asian woman wearing sunglasses and holding a cake ", "paths": ["image_outputs/16-a young asian woman wearing sunglasses and holding a cake -000.jpg", "image_outputs/16-a young asian woman wearing sunglasses and holding a cake -001.jpg", "image_outputs/16-a young asian woman wearing sunglasses and holding a cake -002.jpg", "image_outputs/16-a young asian woman wearing sunglasses and holding a cake -003.jpg"]}, {"prompt": "a young asian woman wearing a blue top and necklace ", "paths": ["image_outputs/16-a young asian woman wearing a blue top and necklace -000.jpg", "image_outputs/16-a young asian woman wearing a blue top and necklace -001.jpg", "image_outputs/16-a young asian woman wearing a blue top and necklace -002.jpg", "image_outputs/16-a young asian woman wearing a blue top and necklace -003.jpg"]}, {"prompt": "an asian woman with long brown hair posing for the camera ", "paths": ["image_outputs/16-an asian woman with long brown hair posing for the camera -000.jpg", "image_outputs/16-an asian woman with long brown hair posing for the camera -001.jpg", "image_outputs/16-an asian woman with long brown hair posing for the camera -002.jpg", "image_outputs/16-an asian woman with long brown hair posing for the camera -003.jpg"]}, {"prompt": "an asian woman wearing a red scarf and a black top ", "paths": ["image_outputs/16-an asian woman wearing a red scarf and a black top -000.jpg", "image_outputs/16-an asian woman wearing a red scarf and a black top -001.jpg", "image_outputs/16-an asian woman wearing a red scarf and a black top -002.jpg", "image_outputs/16-an asian woman wearing a red scarf and a black top -003.jpg"]}, {"prompt": "a woman sitting in front of a brick wall with yellow flowers ", "paths": ["image_outputs/16-a woman sitting in front of a brick wall with yellow flowers -000.jpg", "image_outputs/16-a woman sitting in front of a brick wall with yellow flowers -001.jpg", "image_outputs/16-a woman sitting in front of a brick wall with yellow flowers 
-002.jpg", "image_outputs/16-a woman sitting in front of a brick wall with yellow flowers -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/17.json b/json_outputs/17.json new file mode 100644 index 0000000..6f8c073 --- /dev/null +++ b/json_outputs/17.json @@ -0,0 +1 @@ +{"id": 17, "images": [{"prompt": "a woman sitting on a chair with a cup of coffee ", "paths": ["image_outputs/17-a woman sitting on a chair with a cup of coffee -000.jpg", "image_outputs/17-a woman sitting on a chair with a cup of coffee -001.jpg", "image_outputs/17-a woman sitting on a chair with a cup of coffee -002.jpg", "image_outputs/17-a woman sitting on a chair with a cup of coffee -003.jpg"]}, {"prompt": "a woman in red and white striped shirt and sunglasses ", "paths": ["image_outputs/17-a woman in red and white striped shirt and sunglasses -000.jpg", "image_outputs/17-a woman in red and white striped shirt and sunglasses -001.jpg", "image_outputs/17-a woman in red and white striped shirt and sunglasses -002.jpg", "image_outputs/17-a woman in red and white striped shirt and sunglasses -003.jpg"]}, {"prompt": "a woman with long brown hair posing for the camera ", "paths": ["image_outputs/17-a woman with long brown hair posing for the camera -000.jpg", "image_outputs/17-a woman with long brown hair posing for the camera -001.jpg", "image_outputs/17-a woman with long brown hair posing for the camera -002.jpg", "image_outputs/17-a woman with long brown hair posing for the camera -003.jpg"]}, {"prompt": "a young woman wearing sunglasses and a denim shirt ", "paths": ["image_outputs/17-a young woman wearing sunglasses and a denim shirt -000.jpg", "image_outputs/17-a young woman wearing sunglasses and a denim shirt -001.jpg", "image_outputs/17-a young woman wearing sunglasses and a denim shirt -002.jpg", "image_outputs/17-a young woman wearing sunglasses and a denim shirt -003.jpg"]}, {"prompt": "a woman in an orange blazer holding a purse ", "paths": ["image_outputs/17-a woman in an orange blazer holding a purse -000.jpg", "image_outputs/17-a woman in an orange blazer holding a purse -001.jpg", "image_outputs/17-a woman in an orange blazer holding a purse -002.jpg", "image_outputs/17-a woman in an orange blazer holding a purse -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/18.json b/json_outputs/18.json new file mode 100644 index 0000000..7371a78 --- /dev/null +++ b/json_outputs/18.json @@ -0,0 +1 @@ +{"id": 18, "images": [{"prompt": "a woman holding a glass of beer with a smile on her face ", "paths": ["image_outputs/18-a woman holding a glass of beer with a smile on her face -000.jpg", "image_outputs/18-a woman holding a glass of beer with a smile on her face -001.jpg", "image_outputs/18-a woman holding a glass of beer with a smile on her face -002.jpg", "image_outputs/18-a woman holding a glass of beer with a smile on her face -003.jpg"]}, {"prompt": "a woman sitting at a table with flowers in the background ", "paths": ["image_outputs/18-a woman sitting at a table with flowers in the background -000.jpg", "image_outputs/18-a woman sitting at a table with flowers in the background -001.jpg", "image_outputs/18-a woman sitting at a table with flowers in the background -002.jpg", "image_outputs/18-a woman sitting at a table with flowers in the background -003.jpg"]}, {"prompt": "a woman in a white t - shirt holding a jar of food ", "paths": ["image_outputs/18-a woman in a white t - shirt holding a jar of food -000.jpg", "image_outputs/18-a woman in a white t - shirt holding a jar of 
food -001.jpg", "image_outputs/18-a woman in a white t - shirt holding a jar of food -002.jpg", "image_outputs/18-a woman in a white t - shirt holding a jar of food -003.jpg"]}, {"prompt": "an asian woman in a car with long hair ", "paths": ["image_outputs/18-an asian woman in a car with long hair -000.jpg", "image_outputs/18-an asian woman in a car with long hair -001.jpg", "image_outputs/18-an asian woman in a car with long hair -002.jpg", "image_outputs/18-an asian woman in a car with long hair -003.jpg"]}, {"prompt": "an asian woman sitting at a desk holding a cell phone ", "paths": ["image_outputs/18-an asian woman sitting at a desk holding a cell phone -000.jpg", "image_outputs/18-an asian woman sitting at a desk holding a cell phone -001.jpg", "image_outputs/18-an asian woman sitting at a desk holding a cell phone -002.jpg", "image_outputs/18-an asian woman sitting at a desk holding a cell phone -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/19.json b/json_outputs/19.json new file mode 100644 index 0000000..cd72ad7 --- /dev/null +++ b/json_outputs/19.json @@ -0,0 +1 @@ +{"id": 19, "images": [{"prompt": "young asian woman posing in front of wooden bird houses ", "paths": ["image_outputs/19-young asian woman posing in front of wooden bird houses -000.jpg", "image_outputs/19-young asian woman posing in front of wooden bird houses -001.jpg", "image_outputs/19-young asian woman posing in front of wooden bird houses -002.jpg", "image_outputs/19-young asian woman posing in front of wooden bird houses -003.jpg"]}, {"prompt": "a beautiful asian woman is posing for the camera ", "paths": ["image_outputs/19-a beautiful asian woman is posing for the camera -000.jpg", "image_outputs/19-a beautiful asian woman is posing for the camera -001.jpg", "image_outputs/19-a beautiful asian woman is posing for the camera -002.jpg", "image_outputs/19-a beautiful asian woman is posing for the camera -003.jpg"]}, {"prompt": "a woman standing next to a body of water ", "paths": ["image_outputs/19-a woman standing next to a body of water -000.jpg", "image_outputs/19-a woman standing next to a body of water -001.jpg", "image_outputs/19-a woman standing next to a body of water -002.jpg", "image_outputs/19-a woman standing next to a body of water -003.jpg"]}, {"prompt": "an asian woman with blue hair wearing a denim jacket ", "paths": ["image_outputs/19-an asian woman with blue hair wearing a denim jacket -000.jpg", "image_outputs/19-an asian woman with blue hair wearing a denim jacket -001.jpg", "image_outputs/19-an asian woman with blue hair wearing a denim jacket -002.jpg", "image_outputs/19-an asian woman with blue hair wearing a denim jacket -003.jpg"]}, {"prompt": "a woman sitting in a chair with a cup of coffee ", "paths": ["image_outputs/19-a woman sitting in a chair with a cup of coffee -000.jpg", "image_outputs/19-a woman sitting in a chair with a cup of coffee -001.jpg", "image_outputs/19-a woman sitting in a chair with a cup of coffee -002.jpg", "image_outputs/19-a woman sitting in a chair with a cup of coffee -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/2.json b/json_outputs/2.json new file mode 100644 index 0000000..a7fe165 --- /dev/null +++ b/json_outputs/2.json @@ -0,0 +1 @@ +{"id": 2, "images": [{"prompt": "a woman wearing a black hat and a white top ", "paths": ["image_outputs/2-a woman wearing a black hat and a white top -000.jpg", "image_outputs/2-a woman wearing a black hat and a white top -001.jpg", "image_outputs/2-a woman wearing a black hat and a 
white top -002.jpg", "image_outputs/2-a woman wearing a black hat and a white top -003.jpg"]}, {"prompt": "a young asian woman with long black hair taking a selfie ", "paths": ["image_outputs/2-a young asian woman with long black hair taking a selfie -000.jpg", "image_outputs/2-a young asian woman with long black hair taking a selfie -001.jpg", "image_outputs/2-a young asian woman with long black hair taking a selfie -002.jpg", "image_outputs/2-a young asian woman with long black hair taking a selfie -003.jpg"]}, {"prompt": "an asian woman in a white shirt and black bow tie is sitting under a tent ", "paths": ["image_outputs/2-an asian woman in a white shirt and black bow tie is sitting under a tent -000.jpg", "image_outputs/2-an asian woman in a white shirt and black bow tie is sitting under a tent -001.jpg", "image_outputs/2-an asian woman in a white shirt and black bow tie is sitting under a tent -002.jpg", "image_outputs/2-an asian woman in a white shirt and black bow tie is sitting under a tent -003.jpg"]}, {"prompt": "an asian woman with long hair is taking a selfie ", "paths": ["image_outputs/2-an asian woman with long hair is taking a selfie -000.jpg", "image_outputs/2-an asian woman with long hair is taking a selfie -001.jpg", "image_outputs/2-an asian woman with long hair is taking a selfie -002.jpg", "image_outputs/2-an asian woman with long hair is taking a selfie -003.jpg"]}, {"prompt": "a young woman sitting at a table in a classroom ", "paths": ["image_outputs/2-a young woman sitting at a table in a classroom -000.jpg", "image_outputs/2-a young woman sitting at a table in a classroom -001.jpg", "image_outputs/2-a young woman sitting at a table in a classroom -002.jpg", "image_outputs/2-a young woman sitting at a table in a classroom -003.jpg"]}, {"prompt": "a young asian woman is posing for the camera ", "paths": ["image_outputs/2-a young asian woman is posing for the camera -000.jpg", "image_outputs/2-a young asian woman is posing for the camera -001.jpg", "image_outputs/2-a young asian woman is posing for the camera -002.jpg", "image_outputs/2-a young asian woman is posing for the camera -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/20.json b/json_outputs/20.json new file mode 100644 index 0000000..f7be032 --- /dev/null +++ b/json_outputs/20.json @@ -0,0 +1 @@ +{"id": 20, "images": [{"prompt": "an asian woman in a red dress posing for the camera ", "paths": ["image_outputs/20-an asian woman in a red dress posing for the camera -000.jpg", "image_outputs/20-an asian woman in a red dress posing for the camera -001.jpg", "image_outputs/20-an asian woman in a red dress posing for the camera -002.jpg", "image_outputs/20-an asian woman in a red dress posing for the camera -003.jpg"]}, {"prompt": "a woman sitting at a table with her hand on her chin ", "paths": ["image_outputs/20-a woman sitting at a table with her hand on her chin -000.jpg", "image_outputs/20-a woman sitting at a table with her hand on her chin -001.jpg", "image_outputs/20-a woman sitting at a table with her hand on her chin -002.jpg", "image_outputs/20-a woman sitting at a table with her hand on her chin -003.jpg"]}, {"prompt": "a woman with long black hair posing for a selfie in a gym ", "paths": ["image_outputs/20-a woman with long black hair posing for a selfie in a gym -000.jpg", "image_outputs/20-a woman with long black hair posing for a selfie in a gym -001.jpg", "image_outputs/20-a woman with long black hair posing for a selfie in a gym -002.jpg", "image_outputs/20-a woman with long 
black hair posing for a selfie in a gym -003.jpg"]}, {"prompt": "a woman holding up a glass of wine while sitting at a table ", "paths": ["image_outputs/20-a woman holding up a glass of wine while sitting at a table -000.jpg", "image_outputs/20-a woman holding up a glass of wine while sitting at a table -001.jpg", "image_outputs/20-a woman holding up a glass of wine while sitting at a table -002.jpg", "image_outputs/20-a woman holding up a glass of wine while sitting at a table -003.jpg"]}, {"prompt": "a young asian woman laying on a bed ", "paths": ["image_outputs/20-a young asian woman laying on a bed -000.jpg", "image_outputs/20-a young asian woman laying on a bed -001.jpg", "image_outputs/20-a young asian woman laying on a bed -002.jpg", "image_outputs/20-a young asian woman laying on a bed -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/21.json b/json_outputs/21.json new file mode 100644 index 0000000..b268a7d --- /dev/null +++ b/json_outputs/21.json @@ -0,0 +1 @@ +{"id": 21, "images": [{"prompt": "a woman with long black hair wearing sunglasses in a car ", "paths": ["image_outputs/21-a woman with long black hair wearing sunglasses in a car -000.jpg", "image_outputs/21-a woman with long black hair wearing sunglasses in a car -001.jpg", "image_outputs/21-a woman with long black hair wearing sunglasses in a car -002.jpg", "image_outputs/21-a woman with long black hair wearing sunglasses in a car -003.jpg"]}, {"prompt": "an asian woman with long hair posing for a selfie ", "paths": ["image_outputs/21-an asian woman with long hair posing for a selfie -000.jpg", "image_outputs/21-an asian woman with long hair posing for a selfie -001.jpg", "image_outputs/21-an asian woman with long hair posing for a selfie -002.jpg", "image_outputs/21-an asian woman with long hair posing for a selfie -003.jpg"]}, {"prompt": "a woman is taking a selfie in a store ", "paths": ["image_outputs/21-a woman is taking a selfie in a store -000.jpg", "image_outputs/21-a woman is taking a selfie in a store -001.jpg", "image_outputs/21-a woman is taking a selfie in a store -002.jpg", "image_outputs/21-a woman is taking a selfie in a store -003.jpg"]}, {"prompt": "a woman in a red dress sitting in a car ", "paths": ["image_outputs/21-a woman in a red dress sitting in a car -000.jpg", "image_outputs/21-a woman in a red dress sitting in a car -001.jpg", "image_outputs/21-a woman in a red dress sitting in a car -002.jpg", "image_outputs/21-a woman in a red dress sitting in a car -003.jpg"]}, {"prompt": "a woman sitting at a table with oysters and wine ", "paths": ["image_outputs/21-a woman sitting at a table with oysters and wine -000.jpg", "image_outputs/21-a woman sitting at a table with oysters and wine -001.jpg", "image_outputs/21-a woman sitting at a table with oysters and wine -002.jpg", "image_outputs/21-a woman sitting at a table with oysters and wine -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/22.json b/json_outputs/22.json new file mode 100644 index 0000000..71ab19b --- /dev/null +++ b/json_outputs/22.json @@ -0,0 +1 @@ +{"id": 22, "images": [{"prompt": "an asian woman holding a flower in her hand ", "paths": ["image_outputs/22-an asian woman holding a flower in her hand -000.jpg", "image_outputs/22-an asian woman holding a flower in her hand -001.jpg", "image_outputs/22-an asian woman holding a flower in her hand -002.jpg", "image_outputs/22-an asian woman holding a flower in her hand -003.jpg"]}, {"prompt": "a woman wearing a black top and earrings ", "paths": 
["image_outputs/22-a woman wearing a black top and earrings -000.jpg", "image_outputs/22-a woman wearing a black top and earrings -001.jpg", "image_outputs/22-a woman wearing a black top and earrings -002.jpg", "image_outputs/22-a woman wearing a black top and earrings -003.jpg"]}, {"prompt": "a woman smiling at the camera in front of a wine rack ", "paths": ["image_outputs/22-a woman smiling at the camera in front of a wine rack -000.jpg", "image_outputs/22-a woman smiling at the camera in front of a wine rack -001.jpg", "image_outputs/22-a woman smiling at the camera in front of a wine rack -002.jpg", "image_outputs/22-a woman smiling at the camera in front of a wine rack -003.jpg"]}, {"prompt": "a woman sitting on top of a child in a chair ", "paths": ["image_outputs/22-a woman sitting on top of a child in a chair -000.jpg", "image_outputs/22-a woman sitting on top of a child in a chair -001.jpg", "image_outputs/22-a woman sitting on top of a child in a chair -002.jpg", "image_outputs/22-a woman sitting on top of a child in a chair -003.jpg"]}, {"prompt": "a close up of a woman with long hair ", "paths": ["image_outputs/22-a close up of a woman with long hair -000.jpg", "image_outputs/22-a close up of a woman with long hair -001.jpg", "image_outputs/22-a close up of a woman with long hair -002.jpg", "image_outputs/22-a close up of a woman with long hair -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/23.json b/json_outputs/23.json new file mode 100644 index 0000000..4cd7068 --- /dev/null +++ b/json_outputs/23.json @@ -0,0 +1 @@ +{"id": 23, "images": [{"prompt": "an asian woman wearing a red jacket and earrings ", "paths": ["image_outputs/23-an asian woman wearing a red jacket and earrings -000.jpg", "image_outputs/23-an asian woman wearing a red jacket and earrings -001.jpg", "image_outputs/23-an asian woman wearing a red jacket and earrings -002.jpg", "image_outputs/23-an asian woman wearing a red jacket and earrings -003.jpg"]}, {"prompt": "a woman taking a selfie in a bathroom ", "paths": ["image_outputs/23-a woman taking a selfie in a bathroom -000.jpg", "image_outputs/23-a woman taking a selfie in a bathroom -001.jpg", "image_outputs/23-a woman taking a selfie in a bathroom -002.jpg", "image_outputs/23-a woman taking a selfie in a bathroom -003.jpg"]}, {"prompt": "an asian woman in a striped shirt and yellow skirt ", "paths": ["image_outputs/23-an asian woman in a striped shirt and yellow skirt -000.jpg", "image_outputs/23-an asian woman in a striped shirt and yellow skirt -001.jpg", "image_outputs/23-an asian woman in a striped shirt and yellow skirt -002.jpg", "image_outputs/23-an asian woman in a striped shirt and yellow skirt -003.jpg"]}, {"prompt": "a young asian woman wearing a t - shirt with the word givenchy written on it ", "paths": ["image_outputs/23-a young asian woman wearing a t - shirt with the word givenchy written on it -000.jpg", "image_outputs/23-a young asian woman wearing a t - shirt with the word givenchy written on it -001.jpg", "image_outputs/23-a young asian woman wearing a t - shirt with the word givenchy written on it -002.jpg", "image_outputs/23-a young asian woman wearing a t - shirt with the word givenchy written on it -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/24.json b/json_outputs/24.json new file mode 100644 index 0000000..932249a --- /dev/null +++ b/json_outputs/24.json @@ -0,0 +1 @@ +{"id": 24, "images": [{"prompt": "a young asian woman with a necklace on her neck ", "paths": ["image_outputs/24-a young 
asian woman with a necklace on her neck -000.jpg", "image_outputs/24-a young asian woman with a necklace on her neck -001.jpg", "image_outputs/24-a young asian woman with a necklace on her neck -002.jpg", "image_outputs/24-a young asian woman with a necklace on her neck -003.jpg"]}, {"prompt": "a woman wearing a t - shirt with peppa pig on it ", "paths": ["image_outputs/24-a woman wearing a t - shirt with peppa pig on it -000.jpg", "image_outputs/24-a woman wearing a t - shirt with peppa pig on it -001.jpg", "image_outputs/24-a woman wearing a t - shirt with peppa pig on it -002.jpg", "image_outputs/24-a woman wearing a t - shirt with peppa pig on it -003.jpg"]}, {"prompt": "a woman with her hand on her chin posing for a photo ", "paths": ["image_outputs/24-a woman with her hand on her chin posing for a photo -000.jpg", "image_outputs/24-a woman with her hand on her chin posing for a photo -001.jpg", "image_outputs/24-a woman with her hand on her chin posing for a photo -002.jpg", "image_outputs/24-a woman with her hand on her chin posing for a photo -003.jpg"]}, {"prompt": "a woman holding up a cup of tea with the words king club written on it ", "paths": ["image_outputs/24-a woman holding up a cup of tea with the words king club written on it -000.jpg", "image_outputs/24-a woman holding up a cup of tea with the words king club written on it -001.jpg", "image_outputs/24-a woman holding up a cup of tea with the words king club written on it -002.jpg", "image_outputs/24-a woman holding up a cup of tea with the words king club written on it -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/25.json b/json_outputs/25.json new file mode 100644 index 0000000..7cf820d --- /dev/null +++ b/json_outputs/25.json @@ -0,0 +1 @@ +{"id": 25, "images": [{"prompt": "an asian woman in a striped shirt and pink lipstick ", "paths": ["image_outputs/25-an asian woman in a striped shirt and pink lipstick -000.jpg", "image_outputs/25-an asian woman in a striped shirt and pink lipstick -001.jpg", "image_outputs/25-an asian woman in a striped shirt and pink lipstick -002.jpg", "image_outputs/25-an asian woman in a striped shirt and pink lipstick -003.jpg"]}, {"prompt": "a woman sitting in a car with a cell phone in her hand ", "paths": ["image_outputs/25-a woman sitting in a car with a cell phone in her hand -000.jpg", "image_outputs/25-a woman sitting in a car with a cell phone in her hand -001.jpg", "image_outputs/25-a woman sitting in a car with a cell phone in her hand -002.jpg", "image_outputs/25-a woman sitting in a car with a cell phone in her hand -003.jpg"]}, {"prompt": "a woman in a floral dress drinking from a straw ", "paths": ["image_outputs/25-a woman in a floral dress drinking from a straw -000.jpg", "image_outputs/25-a woman in a floral dress drinking from a straw -001.jpg", "image_outputs/25-a woman in a floral dress drinking from a straw -002.jpg", "image_outputs/25-a woman in a floral dress drinking from a straw -003.jpg"]}, {"prompt": "a woman standing in front of a cherry blossom tree ", "paths": ["image_outputs/25-a woman standing in front of a cherry blossom tree -000.jpg", "image_outputs/25-a woman standing in front of a cherry blossom tree -001.jpg", "image_outputs/25-a woman standing in front of a cherry blossom tree -002.jpg", "image_outputs/25-a woman standing in front of a cherry blossom tree -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/26.json b/json_outputs/26.json new file mode 100644 index 0000000..f46e551 --- /dev/null +++ 
b/json_outputs/26.json @@ -0,0 +1 @@ +{"id": 26, "images": [{"prompt": "a woman in an orange top holding a straw in her mouth ", "paths": ["image_outputs/26-a woman in an orange top holding a straw in her mouth -000.jpg", "image_outputs/26-a woman in an orange top holding a straw in her mouth -001.jpg", "image_outputs/26-a woman in an orange top holding a straw in her mouth -002.jpg", "image_outputs/26-a woman in an orange top holding a straw in her mouth -003.jpg"]}, {"prompt": "a woman in a blue dress with tattoos on her arm ", "paths": ["image_outputs/26-a woman in a blue dress with tattoos on her arm -000.jpg", "image_outputs/26-a woman in a blue dress with tattoos on her arm -001.jpg", "image_outputs/26-a woman in a blue dress with tattoos on her arm -002.jpg", "image_outputs/26-a woman in a blue dress with tattoos on her arm -003.jpg"]}, {"prompt": "a woman in a green dress with a sunflower on her head ", "paths": ["image_outputs/26-a woman in a green dress with a sunflower on her head -000.jpg", "image_outputs/26-a woman in a green dress with a sunflower on her head -001.jpg", "image_outputs/26-a woman in a green dress with a sunflower on her head -002.jpg", "image_outputs/26-a woman in a green dress with a sunflower on her head -003.jpg"]}, {"prompt": "a woman standing in front of a movie poster ", "paths": ["image_outputs/26-a woman standing in front of a movie poster -000.jpg", "image_outputs/26-a woman standing in front of a movie poster -001.jpg", "image_outputs/26-a woman standing in front of a movie poster -002.jpg", "image_outputs/26-a woman standing in front of a movie poster -003.jpg"]}, {"prompt": "a young asian woman with black hair wearing a sweater ", "paths": ["image_outputs/26-a young asian woman with black hair wearing a sweater -000.jpg", "image_outputs/26-a young asian woman with black hair wearing a sweater -001.jpg", "image_outputs/26-a young asian woman with black hair wearing a sweater -002.jpg", "image_outputs/26-a young asian woman with black hair wearing a sweater -003.jpg"]}, {"prompt": "a woman sitting on a bench wearing brown pants ", "paths": ["image_outputs/26-a woman sitting on a bench wearing brown pants -000.jpg", "image_outputs/26-a woman sitting on a bench wearing brown pants -001.jpg", "image_outputs/26-a woman sitting on a bench wearing brown pants -002.jpg", "image_outputs/26-a woman sitting on a bench wearing brown pants -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/27.json b/json_outputs/27.json new file mode 100644 index 0000000..201fb79 --- /dev/null +++ b/json_outputs/27.json @@ -0,0 +1 @@ +{"id": 27, "images": [{"prompt": "an asian woman holding an umbrella while sitting on a chair ", "paths": ["image_outputs/27-an asian woman holding an umbrella while sitting on a chair -000.jpg", "image_outputs/27-an asian woman holding an umbrella while sitting on a chair -001.jpg", "image_outputs/27-an asian woman holding an umbrella while sitting on a chair -002.jpg", "image_outputs/27-an asian woman holding an umbrella while sitting on a chair -003.jpg"]}, {"prompt": "a woman in a white sweater is standing in front of a counter ", "paths": ["image_outputs/27-a woman in a white sweater is standing in front of a counter -000.jpg", "image_outputs/27-a woman in a white sweater is standing in front of a counter -001.jpg", "image_outputs/27-a woman in a white sweater is standing in front of a counter -002.jpg", "image_outputs/27-a woman in a white sweater is standing in front of a counter -003.jpg"]}, {"prompt": "an asian woman in a 
pink blazer posing for the camera ", "paths": ["image_outputs/27-an asian woman in a pink blazer posing for the camera -000.jpg", "image_outputs/27-an asian woman in a pink blazer posing for the camera -001.jpg", "image_outputs/27-an asian woman in a pink blazer posing for the camera -002.jpg", "image_outputs/27-an asian woman in a pink blazer posing for the camera -003.jpg"]}, {"prompt": "a woman sitting on a couch looking at her cell phone ", "paths": ["image_outputs/27-a woman sitting on a couch looking at her cell phone -000.jpg", "image_outputs/27-a woman sitting on a couch looking at her cell phone -001.jpg", "image_outputs/27-a woman sitting on a couch looking at her cell phone -002.jpg", "image_outputs/27-a woman sitting on a couch looking at her cell phone -003.jpg"]}, {"prompt": "an asian woman in a black dress posing for the camera ", "paths": ["image_outputs/27-an asian woman in a black dress posing for the camera -000.jpg", "image_outputs/27-an asian woman in a black dress posing for the camera -001.jpg", "image_outputs/27-an asian woman in a black dress posing for the camera -002.jpg", "image_outputs/27-an asian woman in a black dress posing for the camera -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/28.json b/json_outputs/28.json new file mode 100644 index 0000000..1b6b408 --- /dev/null +++ b/json_outputs/28.json @@ -0,0 +1 @@ +{"id": 28, "images": [{"prompt": "a woman in a graduation cap and gown posing for the camera ", "paths": ["image_outputs/28-a woman in a graduation cap and gown posing for the camera -000.jpg", "image_outputs/28-a woman in a graduation cap and gown posing for the camera -001.jpg", "image_outputs/28-a woman in a graduation cap and gown posing for the camera -002.jpg", "image_outputs/28-a woman in a graduation cap and gown posing for the camera -003.jpg"]}, {"prompt": "a woman sitting at a table with a plate of food ", "paths": ["image_outputs/28-a woman sitting at a table with a plate of food -000.jpg", "image_outputs/28-a woman sitting at a table with a plate of food -001.jpg", "image_outputs/28-a woman sitting at a table with a plate of food -002.jpg", "image_outputs/28-a woman sitting at a table with a plate of food -003.jpg"]}, {"prompt": "a woman with long hair covering her face with a tissue ", "paths": ["image_outputs/28-a woman with long hair covering her face with a tissue -000.jpg", "image_outputs/28-a woman with long hair covering her face with a tissue -001.jpg", "image_outputs/28-a woman with long hair covering her face with a tissue -002.jpg", "image_outputs/28-a woman with long hair covering her face with a tissue -003.jpg"]}, {"prompt": "a woman wearing a plaid shirt and blue nail polish ", "paths": ["image_outputs/28-a woman wearing a plaid shirt and blue nail polish -000.jpg", "image_outputs/28-a woman wearing a plaid shirt and blue nail polish -001.jpg", "image_outputs/28-a woman wearing a plaid shirt and blue nail polish -002.jpg", "image_outputs/28-a woman wearing a plaid shirt and blue nail polish -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/29.json b/json_outputs/29.json new file mode 100644 index 0000000..10f1d3a --- /dev/null +++ b/json_outputs/29.json @@ -0,0 +1 @@ +{"id": 29, "images": [{"prompt": "a young asian woman with short hair posing for the camera ", "paths": ["image_outputs/29-a young asian woman with short hair posing for the camera -000.jpg", "image_outputs/29-a young asian woman with short hair posing for the camera -001.jpg", "image_outputs/29-a young asian woman with 
short hair posing for the camera -002.jpg", "image_outputs/29-a young asian woman with short hair posing for the camera -003.jpg"]}, {"prompt": "a young asian woman with long hair holding a stuffed animal ", "paths": ["image_outputs/29-a young asian woman with long hair holding a stuffed animal -000.jpg", "image_outputs/29-a young asian woman with long hair holding a stuffed animal -001.jpg", "image_outputs/29-a young asian woman with long hair holding a stuffed animal -002.jpg", "image_outputs/29-a young asian woman with long hair holding a stuffed animal -003.jpg"]}, {"prompt": "a woman is sitting on a chair wearing a t - shirt with a tiger on it ", "paths": ["image_outputs/29-a woman is sitting on a chair wearing a t - shirt with a tiger on it -000.jpg", "image_outputs/29-a woman is sitting on a chair wearing a t - shirt with a tiger on it -001.jpg", "image_outputs/29-a woman is sitting on a chair wearing a t - shirt with a tiger on it -002.jpg", "image_outputs/29-a woman is sitting on a chair wearing a t - shirt with a tiger on it -003.jpg"]}, {"prompt": "a woman in black stockings and high heels sitting on the floor ", "paths": ["image_outputs/29-a woman in black stockings and high heels sitting on the floor -000.jpg", "image_outputs/29-a woman in black stockings and high heels sitting on the floor -001.jpg", "image_outputs/29-a woman in black stockings and high heels sitting on the floor -002.jpg", "image_outputs/29-a woman in black stockings and high heels sitting on the floor -003.jpg"]}, {"prompt": "a woman holding a camera and a bouquet of flowers ", "paths": ["image_outputs/29-a woman holding a camera and a bouquet of flowers -000.jpg", "image_outputs/29-a woman holding a camera and a bouquet of flowers -001.jpg", "image_outputs/29-a woman holding a camera and a bouquet of flowers -002.jpg", "image_outputs/29-a woman holding a camera and a bouquet of flowers -003.jpg"]}, {"prompt": "an asian woman in a yellow dress posing for the camera ", "paths": ["image_outputs/29-an asian woman in a yellow dress posing for the camera -000.jpg", "image_outputs/29-an asian woman in a yellow dress posing for the camera -001.jpg", "image_outputs/29-an asian woman in a yellow dress posing for the camera -002.jpg", "image_outputs/29-an asian woman in a yellow dress posing for the camera -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/3.json b/json_outputs/3.json new file mode 100644 index 0000000..48ac478 --- /dev/null +++ b/json_outputs/3.json @@ -0,0 +1 @@ +{"id": 3, "images": [{"prompt": "a woman in a red dress and sunglasses posing for the camera ", "paths": ["image_outputs/3-a woman in a red dress and sunglasses posing for the camera -000.jpg", "image_outputs/3-a woman in a red dress and sunglasses posing for the camera -001.jpg", "image_outputs/3-a woman in a red dress and sunglasses posing for the camera -002.jpg", "image_outputs/3-a woman in a red dress and sunglasses posing for the camera -003.jpg"]}, {"prompt": "a woman in a black shirt posing for the camera ", "paths": ["image_outputs/3-a woman in a black shirt posing for the camera -000.jpg", "image_outputs/3-a woman in a black shirt posing for the camera -001.jpg", "image_outputs/3-a woman in a black shirt posing for the camera -002.jpg", "image_outputs/3-a woman in a black shirt posing for the camera -003.jpg"]}, {"prompt": "a woman holding a dog in front of a christmas tree ", "paths": ["image_outputs/3-a woman holding a dog in front of a christmas tree -000.jpg", "image_outputs/3-a woman holding a dog in front 
of a christmas tree -001.jpg", "image_outputs/3-a woman holding a dog in front of a christmas tree -002.jpg", "image_outputs/3-a woman holding a dog in front of a christmas tree -003.jpg"]}, {"prompt": "a young asian woman wearing a white top ", "paths": ["image_outputs/3-a young asian woman wearing a white top -000.jpg", "image_outputs/3-a young asian woman wearing a white top -001.jpg", "image_outputs/3-a young asian woman wearing a white top -002.jpg", "image_outputs/3-a young asian woman wearing a white top -003.jpg"]}, {"prompt": "an asian woman with long black hair posing for a selfie ", "paths": ["image_outputs/3-an asian woman with long black hair posing for a selfie -000.jpg", "image_outputs/3-an asian woman with long black hair posing for a selfie -001.jpg", "image_outputs/3-an asian woman with long black hair posing for a selfie -002.jpg", "image_outputs/3-an asian woman with long black hair posing for a selfie -003.jpg"]}, {"prompt": "an asian woman is taking a selfie in the middle of the night ", "paths": ["image_outputs/3-an asian woman is taking a selfie in the middle of the night -000.jpg", "image_outputs/3-an asian woman is taking a selfie in the middle of the night -001.jpg", "image_outputs/3-an asian woman is taking a selfie in the middle of the night -002.jpg", "image_outputs/3-an asian woman is taking a selfie in the middle of the night -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/30.json b/json_outputs/30.json new file mode 100644 index 0000000..b1cabb5 --- /dev/null +++ b/json_outputs/30.json @@ -0,0 +1 @@ +{"id": 30, "images": [{"prompt": "a woman with a cup of coffee in front of her ", "paths": ["image_outputs/30-a woman with a cup of coffee in front of her -000.jpg", "image_outputs/30-a woman with a cup of coffee in front of her -001.jpg", "image_outputs/30-a woman with a cup of coffee in front of her -002.jpg", "image_outputs/30-a woman with a cup of coffee in front of her -003.jpg"]}, {"prompt": "a young woman making the peace sign while sitting on a bed ", "paths": ["image_outputs/30-a young woman making the peace sign while sitting on a bed -000.jpg", "image_outputs/30-a young woman making the peace sign while sitting on a bed -001.jpg", "image_outputs/30-a young woman making the peace sign while sitting on a bed -002.jpg", "image_outputs/30-a young woman making the peace sign while sitting on a bed -003.jpg"]}, {"prompt": "an asian woman with flowers in her hair ", "paths": ["image_outputs/30-an asian woman with flowers in her hair -000.jpg", "image_outputs/30-an asian woman with flowers in her hair -001.jpg", "image_outputs/30-an asian woman with flowers in her hair -002.jpg", "image_outputs/30-an asian woman with flowers in her hair -003.jpg"]}, {"prompt": "a woman sitting at a table with a drink in her hand ", "paths": ["image_outputs/30-a woman sitting at a table with a drink in her hand -000.jpg", "image_outputs/30-a woman sitting at a table with a drink in her hand -001.jpg", "image_outputs/30-a woman sitting at a table with a drink in her hand -002.jpg", "image_outputs/30-a woman sitting at a table with a drink in her hand -003.jpg"]}, {"prompt": "a woman in a denim dress standing in front of a wall ", "paths": ["image_outputs/30-a woman in a denim dress standing in front of a wall -000.jpg", "image_outputs/30-a woman in a denim dress standing in front of a wall -001.jpg", "image_outputs/30-a woman in a denim dress standing in front of a wall -002.jpg", "image_outputs/30-a woman in a denim dress standing in front of a wall 
-003.jpg"]}, {"prompt": "an asian woman in a white dress taking a selfie ", "paths": ["image_outputs/30-an asian woman in a white dress taking a selfie -000.jpg", "image_outputs/30-an asian woman in a white dress taking a selfie -001.jpg", "image_outputs/30-an asian woman in a white dress taking a selfie -002.jpg", "image_outputs/30-an asian woman in a white dress taking a selfie -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/31.json b/json_outputs/31.json new file mode 100644 index 0000000..78a63cd --- /dev/null +++ b/json_outputs/31.json @@ -0,0 +1 @@ +{"id": 31, "images": [{"prompt": "a woman with long hair posing in front of a plant ", "paths": ["image_outputs/31-a woman with long hair posing in front of a plant -000.jpg", "image_outputs/31-a woman with long hair posing in front of a plant -001.jpg", "image_outputs/31-a woman with long hair posing in front of a plant -002.jpg", "image_outputs/31-a woman with long hair posing in front of a plant -003.jpg"]}, {"prompt": "a close up of a woman looking at the camera ", "paths": ["image_outputs/31-a close up of a woman looking at the camera -000.jpg", "image_outputs/31-a close up of a woman looking at the camera -001.jpg", "image_outputs/31-a close up of a woman looking at the camera -002.jpg", "image_outputs/31-a close up of a woman looking at the camera -003.jpg"]}, {"prompt": "an asian woman in a white blazer and black shirt ", "paths": ["image_outputs/31-an asian woman in a white blazer and black shirt -000.jpg", "image_outputs/31-an asian woman in a white blazer and black shirt -001.jpg", "image_outputs/31-an asian woman in a white blazer and black shirt -002.jpg", "image_outputs/31-an asian woman in a white blazer and black shirt -003.jpg"]}, {"prompt": "a woman wearing a dress with reindeer antlers on her head ", "paths": ["image_outputs/31-a woman wearing a dress with reindeer antlers on her head -000.jpg", "image_outputs/31-a woman wearing a dress with reindeer antlers on her head -001.jpg", "image_outputs/31-a woman wearing a dress with reindeer antlers on her head -002.jpg", "image_outputs/31-a woman wearing a dress with reindeer antlers on her head -003.jpg"]}, {"prompt": "a woman wearing a hat and sunglasses in a store ", "paths": ["image_outputs/31-a woman wearing a hat and sunglasses in a store -000.jpg", "image_outputs/31-a woman wearing a hat and sunglasses in a store -001.jpg", "image_outputs/31-a woman wearing a hat and sunglasses in a store -002.jpg", "image_outputs/31-a woman wearing a hat and sunglasses in a store -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/32.json b/json_outputs/32.json new file mode 100644 index 0000000..ac761de --- /dev/null +++ b/json_outputs/32.json @@ -0,0 +1 @@ +{"id": 32, "images": [{"prompt": "a woman with long black hair posing for a photo ", "paths": ["image_outputs/32-a woman with long black hair posing for a photo -000.jpg", "image_outputs/32-a woman with long black hair posing for a photo -001.jpg", "image_outputs/32-a woman with long black hair posing for a photo -002.jpg", "image_outputs/32-a woman with long black hair posing for a photo -003.jpg"]}, {"prompt": "a woman in a gold dress posing for the camera ", "paths": ["image_outputs/32-a woman in a gold dress posing for the camera -000.jpg", "image_outputs/32-a woman in a gold dress posing for the camera -001.jpg", "image_outputs/32-a woman in a gold dress posing for the camera -002.jpg", "image_outputs/32-a woman in a gold dress posing for the camera -003.jpg"]}, {"prompt": "an asian woman 
wearing a hat posing for a photo ", "paths": ["image_outputs/32-an asian woman wearing a hat posing for a photo -000.jpg", "image_outputs/32-an asian woman wearing a hat posing for a photo -001.jpg", "image_outputs/32-an asian woman wearing a hat posing for a photo -002.jpg", "image_outputs/32-an asian woman wearing a hat posing for a photo -003.jpg"]}, {"prompt": "an asian woman posing in front of a staircase ", "paths": ["image_outputs/32-an asian woman posing in front of a staircase -000.jpg", "image_outputs/32-an asian woman posing in front of a staircase -001.jpg", "image_outputs/32-an asian woman posing in front of a staircase -002.jpg", "image_outputs/32-an asian woman posing in front of a staircase -003.jpg"]}, {"prompt": "an asian woman with red lipstick posing for the camera ", "paths": ["image_outputs/32-an asian woman with red lipstick posing for the camera -000.jpg", "image_outputs/32-an asian woman with red lipstick posing for the camera -001.jpg", "image_outputs/32-an asian woman with red lipstick posing for the camera -002.jpg", "image_outputs/32-an asian woman with red lipstick posing for the camera -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/4.json b/json_outputs/4.json new file mode 100644 index 0000000..fde4d40 --- /dev/null +++ b/json_outputs/4.json @@ -0,0 +1 @@ +{"id": 4, "images": [{"prompt": "a young woman wearing a blue jacket and striped shirt ", "paths": ["image_outputs/4-a young woman wearing a blue jacket and striped shirt -000.jpg", "image_outputs/4-a young woman wearing a blue jacket and striped shirt -001.jpg", "image_outputs/4-a young woman wearing a blue jacket and striped shirt -002.jpg", "image_outputs/4-a young woman wearing a blue jacket and striped shirt -003.jpg"]}, {"prompt": "a woman in a blue sweater is posing for the camera ", "paths": ["image_outputs/4-a woman in a blue sweater is posing for the camera -000.jpg", "image_outputs/4-a woman in a blue sweater is posing for the camera -001.jpg", "image_outputs/4-a woman in a blue sweater is posing for the camera -002.jpg", "image_outputs/4-a woman in a blue sweater is posing for the camera -003.jpg"]}, {"prompt": "a woman with a watch on her wrist standing in front of a crosswalk ", "paths": ["image_outputs/4-a woman with a watch on her wrist standing in front of a crosswalk -000.jpg", "image_outputs/4-a woman with a watch on her wrist standing in front of a crosswalk -001.jpg", "image_outputs/4-a woman with a watch on her wrist standing in front of a crosswalk -002.jpg", "image_outputs/4-a woman with a watch on her wrist standing in front of a crosswalk -003.jpg"]}, {"prompt": "a woman sitting at a table with a drink in front of her ", "paths": ["image_outputs/4-a woman sitting at a table with a drink in front of her -000.jpg", "image_outputs/4-a woman sitting at a table with a drink in front of her -001.jpg", "image_outputs/4-a woman sitting at a table with a drink in front of her -002.jpg", "image_outputs/4-a woman sitting at a table with a drink in front of her -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/5.json b/json_outputs/5.json new file mode 100644 index 0000000..0a639ae --- /dev/null +++ b/json_outputs/5.json @@ -0,0 +1 @@ +{"id": 5, "images": [{"prompt": "a young asian woman in a dress posing for the camera ", "paths": ["image_outputs/5-a young asian woman in a dress posing for the camera -000.jpg", "image_outputs/5-a young asian woman in a dress posing for the camera -001.jpg", "image_outputs/5-a young asian woman in a dress posing for the 
camera -002.jpg", "image_outputs/5-a young asian woman in a dress posing for the camera -003.jpg"]}, {"prompt": "an asian woman in a red bikini posing for the camera ", "paths": ["image_outputs/5-an asian woman in a red bikini posing for the camera -000.jpg", "image_outputs/5-an asian woman in a red bikini posing for the camera -001.jpg", "image_outputs/5-an asian woman in a red bikini posing for the camera -002.jpg", "image_outputs/5-an asian woman in a red bikini posing for the camera -003.jpg"]}, {"prompt": "a woman in a black and white shirt taking a selfie ", "paths": ["image_outputs/5-a woman in a black and white shirt taking a selfie -000.jpg", "image_outputs/5-a woman in a black and white shirt taking a selfie -001.jpg", "image_outputs/5-a woman in a black and white shirt taking a selfie -002.jpg", "image_outputs/5-a woman in a black and white shirt taking a selfie -003.jpg"]}, {"prompt": "a young asian woman with long hair and bangs ", "paths": ["image_outputs/5-a young asian woman with long hair and bangs -000.jpg", "image_outputs/5-a young asian woman with long hair and bangs -001.jpg", "image_outputs/5-a young asian woman with long hair and bangs -002.jpg", "image_outputs/5-a young asian woman with long hair and bangs -003.jpg"]}, {"prompt": "a woman sitting at a table with chopsticks in her hands ", "paths": ["image_outputs/5-a woman sitting at a table with chopsticks in her hands -000.jpg", "image_outputs/5-a woman sitting at a table with chopsticks in her hands -001.jpg", "image_outputs/5-a woman sitting at a table with chopsticks in her hands -002.jpg", "image_outputs/5-a woman sitting at a table with chopsticks in her hands -003.jpg"]}, {"prompt": "a young asian woman in a white polo shirt ", "paths": ["image_outputs/5-a young asian woman in a white polo shirt -000.jpg", "image_outputs/5-a young asian woman in a white polo shirt -001.jpg", "image_outputs/5-a young asian woman in a white polo shirt -002.jpg", "image_outputs/5-a young asian woman in a white polo shirt -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/6.json b/json_outputs/6.json new file mode 100644 index 0000000..24d2769 --- /dev/null +++ b/json_outputs/6.json @@ -0,0 +1 @@ +{"id": 6, "images": [{"prompt": "a beautiful asian woman posing for the camera ", "paths": ["image_outputs/6-a beautiful asian woman posing for the camera -000.jpg", "image_outputs/6-a beautiful asian woman posing for the camera -001.jpg", "image_outputs/6-a beautiful asian woman posing for the camera -002.jpg", "image_outputs/6-a beautiful asian woman posing for the camera -003.jpg"]}, {"prompt": "a young asian woman in a black dress with a strawberry on her chest ", "paths": ["image_outputs/6-a young asian woman in a black dress with a strawberry on her chest -000.jpg", "image_outputs/6-a young asian woman in a black dress with a strawberry on her chest -001.jpg", "image_outputs/6-a young asian woman in a black dress with a strawberry on her chest -002.jpg", "image_outputs/6-a young asian woman in a black dress with a strawberry on her chest -003.jpg"]}, {"prompt": "a woman wearing a plaid shirt sitting in a restaurant ", "paths": ["image_outputs/6-a woman wearing a plaid shirt sitting in a restaurant -000.jpg", "image_outputs/6-a woman wearing a plaid shirt sitting in a restaurant -001.jpg", "image_outputs/6-a woman wearing a plaid shirt sitting in a restaurant -002.jpg", "image_outputs/6-a woman wearing a plaid shirt sitting in a restaurant -003.jpg"]}, {"prompt": "a woman taking a picture of herself with her cell 
phone ", "paths": ["image_outputs/6-a woman taking a picture of herself with her cell phone -000.jpg", "image_outputs/6-a woman taking a picture of herself with her cell phone -001.jpg", "image_outputs/6-a woman taking a picture of herself with her cell phone -002.jpg", "image_outputs/6-a woman taking a picture of herself with her cell phone -003.jpg"]}, {"prompt": "a smiling woman in a blue shirt standing in front of plants ", "paths": ["image_outputs/6-a smiling woman in a blue shirt standing in front of plants -000.jpg", "image_outputs/6-a smiling woman in a blue shirt standing in front of plants -001.jpg", "image_outputs/6-a smiling woman in a blue shirt standing in front of plants -002.jpg", "image_outputs/6-a smiling woman in a blue shirt standing in front of plants -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/7.json b/json_outputs/7.json new file mode 100644 index 0000000..8a322f0 --- /dev/null +++ b/json_outputs/7.json @@ -0,0 +1 @@ +{"id": 7, "images": [{"prompt": "a young woman holding a stuffed doll in front of her ", "paths": ["image_outputs/7-a young woman holding a stuffed doll in front of her -000.jpg", "image_outputs/7-a young woman holding a stuffed doll in front of her -001.jpg", "image_outputs/7-a young woman holding a stuffed doll in front of her -002.jpg", "image_outputs/7-a young woman holding a stuffed doll in front of her -003.jpg"]}, {"prompt": "a woman sitting at a table with a plate of food ", "paths": ["image_outputs/7-a woman sitting at a table with a plate of food -000.jpg", "image_outputs/7-a woman sitting at a table with a plate of food -001.jpg", "image_outputs/7-a woman sitting at a table with a plate of food -002.jpg", "image_outputs/7-a woman sitting at a table with a plate of food -003.jpg"]}, {"prompt": "a woman in jeans and an off the shoulder sweater ", "paths": ["image_outputs/7-a woman in jeans and an off the shoulder sweater -000.jpg", "image_outputs/7-a woman in jeans and an off the shoulder sweater -001.jpg", "image_outputs/7-a woman in jeans and an off the shoulder sweater -002.jpg", "image_outputs/7-a woman in jeans and an off the shoulder sweater -003.jpg"]}, {"prompt": "a woman in a red sweater posing for a picture ", "paths": ["image_outputs/7-a woman in a red sweater posing for a picture -000.jpg", "image_outputs/7-a woman in a red sweater posing for a picture -001.jpg", "image_outputs/7-a woman in a red sweater posing for a picture -002.jpg", "image_outputs/7-a woman in a red sweater posing for a picture -003.jpg"]}, {"prompt": "a woman in a black dress taking a selfie ", "paths": ["image_outputs/7-a woman in a black dress taking a selfie -000.jpg", "image_outputs/7-a woman in a black dress taking a selfie -001.jpg", "image_outputs/7-a woman in a black dress taking a selfie -002.jpg", "image_outputs/7-a woman in a black dress taking a selfie -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/8.json b/json_outputs/8.json new file mode 100644 index 0000000..217af54 --- /dev/null +++ b/json_outputs/8.json @@ -0,0 +1 @@ +{"id": 8, "images": [{"prompt": "an asian woman in a white sweater posing for the camera ", "paths": ["image_outputs/8-an asian woman in a white sweater posing for the camera -000.jpg", "image_outputs/8-an asian woman in a white sweater posing for the camera -001.jpg", "image_outputs/8-an asian woman in a white sweater posing for the camera -002.jpg", "image_outputs/8-an asian woman in a white sweater posing for the camera -003.jpg"]}, {"prompt": "a woman standing in front of a wall 
with a sign that says happy birthday ", "paths": ["image_outputs/8-a woman standing in front of a wall with a sign that says happy birthday -000.jpg", "image_outputs/8-a woman standing in front of a wall with a sign that says happy birthday -001.jpg", "image_outputs/8-a woman standing in front of a wall with a sign that says happy birthday -002.jpg", "image_outputs/8-a woman standing in front of a wall with a sign that says happy birthday -003.jpg"]}, {"prompt": "a woman is holding a coconut in her hand ", "paths": ["image_outputs/8-a woman is holding a coconut in her hand -000.jpg", "image_outputs/8-a woman is holding a coconut in her hand -001.jpg", "image_outputs/8-a woman is holding a coconut in her hand -002.jpg", "image_outputs/8-a woman is holding a coconut in her hand -003.jpg"]}, {"prompt": "a woman in a black top sitting on a couch ", "paths": ["image_outputs/8-a woman in a black top sitting on a couch -000.jpg", "image_outputs/8-a woman in a black top sitting on a couch -001.jpg", "image_outputs/8-a woman in a black top sitting on a couch -002.jpg", "image_outputs/8-a woman in a black top sitting on a couch -003.jpg"]}]} \ No newline at end of file diff --git a/json_outputs/9.json b/json_outputs/9.json new file mode 100644 index 0000000..178d3ea --- /dev/null +++ b/json_outputs/9.json @@ -0,0 +1 @@ +{"id": 9, "images": [{"prompt": "a woman with short hair sitting on a bench ", "paths": ["image_outputs/9-a woman with short hair sitting on a bench -000.jpg", "image_outputs/9-a woman with short hair sitting on a bench -001.jpg", "image_outputs/9-a woman with short hair sitting on a bench -002.jpg", "image_outputs/9-a woman with short hair sitting on a bench -003.jpg"]}, {"prompt": "a woman with long brown hair and a black t - shirt ", "paths": ["image_outputs/9-a woman with long brown hair and a black t - shirt -000.jpg", "image_outputs/9-a woman with long brown hair and a black t - shirt -001.jpg", "image_outputs/9-a woman with long brown hair and a black t - shirt -002.jpg", "image_outputs/9-a woman with long brown hair and a black t - shirt -003.jpg"]}, {"prompt": "a woman in a black leather jacket and pink shirt ", "paths": ["image_outputs/9-a woman in a black leather jacket and pink shirt -000.jpg", "image_outputs/9-a woman in a black leather jacket and pink shirt -001.jpg", "image_outputs/9-a woman in a black leather jacket and pink shirt -002.jpg", "image_outputs/9-a woman in a black leather jacket and pink shirt -003.jpg"]}, {"prompt": "an asian woman in a red swimsuit sitting on a bench ", "paths": ["image_outputs/9-an asian woman in a red swimsuit sitting on a bench -000.jpg", "image_outputs/9-an asian woman in a red swimsuit sitting on a bench -001.jpg", "image_outputs/9-an asian woman in a red swimsuit sitting on a bench -002.jpg", "image_outputs/9-an asian woman in a red swimsuit sitting on a bench -003.jpg"]}]} \ No newline at end of file diff --git a/jsoncheck.py b/jsoncheck.py new file mode 100644 index 0000000..a5cd65c --- /dev/null +++ b/jsoncheck.py @@ -0,0 +1,42 @@ +import json +import argparse +import os +import logging + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-d","--json_data_path", type=str, default="/home/schengwei/Competitionrepo/train_data/json", help="file contains prompts") + return parser.parse_args() + +def load_json_files(path): + """ + given a directory, load all json files in that directory + return a list of json objects + """ + d_ls = [] + for file in os.listdir(path): + if file.endswith(".json"): + with 
open(os.path.join(path, file), 'r') as f:
+                json_data = json.load(f)
+                d_ls.append(json_data)
+    return d_ls
+
+def main():
+    arg = get_args()
+    # load json files
+    json_data_ls = load_json_files(arg.json_data_path)
+    print("json_data_ls: {}".format(len(json_data_ls)))
+
+    for json_data in json_data_ls:
+        logging.info(f"process json_data: {json_data['id']}")
+        # print(f"process json_data: {json_data['id']}")
+        image_paths = [i["path"] for i in json_data["source_group"]]
+        # all source images of one task are assumed to sit in a single directory; list the image files there
+        input_dir = os.path.dirname(image_paths[0])
+        all_files = [f for f in os.listdir(input_dir) if f.endswith(('.jpg', '.png'))]
+        # sort the filenames by person id and photo id
+        sorted_files = sorted(all_files, key=lambda x: (int(x.split('_')[0]), int(x.split('_')[1].split('.')[0])))
+
+        print(sorted_files)
+
+if __name__ == "__main__":
+    main()
+    print("success!")
diff --git a/libs/caption_decoder.py b/libs/caption_decoder.py
index f8935cc..e927cbb 100755
--- a/libs/caption_decoder.py
+++ b/libs/caption_decoder.py
@@ -62,6 +62,8 @@ def __init__(self, prefix_length: int, hidden_dim=None, tokenizer_path="gpt2"):
         self.gpt.resize_token_embeddings(len(base_tokenizer))
         self.hidden_dim = hidden_dim
+        ### hidden_dim = text_dim = 64
+        ### encode_prefix projects the 768-dim text embedding down to hidden_dim (64)
         self.encode_prefix = nn.Linear(768, hidden_dim) if hidden_dim is not None else nn.Identity()
         self.decode_prefix = nn.Linear(hidden_dim, 768) if hidden_dim is not None else nn.Identity()
@@ -251,8 +253,8 @@ def __init__(self, device, pretrained_path, hidden_dim=-1, tokenizer_path = "gpt
             new_k = k[7:]
             state_dict[new_k] = v
         mk, uk = self.caption_model.load_state_dict(state_dict, strict=False)
-        assert len(mk) == 0
-        assert all([name.startswith('clip') for name in uk])
+        # assert len(mk) == 0
+        # assert all([name.startswith('clip') for name in uk])
         self.caption_model.eval()
         self.caption_model.to(device)
         self.caption_model.requires_grad_(False)
@@ -260,6 +262,9 @@ def __init__(self, device, pretrained_path, hidden_dim=-1, tokenizer_path = "gpt
     def encode_prefix(self, features):
         return self.caption_model.encode_prefix(features)
+
+    def decode_prefix(self, features):
+        return self.caption_model.decode_prefix(features)
     def generate_captions(self, features): # the low dimension representation of clip feature
         """
@@ -282,4 +287,4 @@ def generate_captions(self, features): # the low dimension representation of cl
             generated_captions.append(generate2(self.caption_model, self.tokenizer, embed=feature))
         return generated_captions
-# %%
+
diff --git a/libs/clip.py b/libs/clip.py
index 5f85fb5..ad11971 100755
--- a/libs/clip.py
+++ b/libs/clip.py
@@ -345,3 +345,32 @@ def _get_model_file(
     )
+
+
+class CLIPImageEmbedder(nn.Module):
+    """
+    ## CLIP Image Embedder
+    """
+    def __init__(self, model="ViT-B/32", device='cuda:0'):
+        super().__init__()
+
+        self.model, _ = load_clip(name=model, device=device)
+        self.model.eval()
+        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
+        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+
+    def preprocess(self, x):
+        # normalize to [0,1]
+        x = kornia.geometry.resize(x, (224, 224),
+                                   interpolation='bicubic', align_corners=True)
+        x = (x + 1.) / 2.
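+        # forward() assumes inputs in [-1, 1], so shift them to [0, 1] before applying CLIP's own normalization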
+ # re-normalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def forward(self, x): + # x is assumed to be in range [-1,1] + out = self.model.encode_image(self.preprocess(x)) + # out = out.to(x.dtype) + return out + \ No newline at end of file diff --git a/libs/cross_attention.py b/libs/cross_attention.py deleted file mode 100755 index 258ba23..0000000 --- a/libs/cross_attention.py +++ /dev/null @@ -1,472 +0,0 @@ -import torch -import torch.nn as nn -import math -from .timm import trunc_normal_, DropPath, Mlp -import einops -import torch.utils.checkpoint -import torch.nn.functional as F -class LoRALinearLayer(nn.Module): - def __init__(self, in_features, out_features, rank=24, network_alpha=None, device='cuda:0', dtype=None): - super().__init__() - print(in_features,out_features,rank) - - - if rank > min(in_features, out_features): - raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}") - - self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype) - self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype) - self.network_alpha = network_alpha - self.rank = rank - - nn.init.normal_(self.down.weight, std=1 / rank) - nn.init.zeros_(self.up.weight) - - def forward(self, hidden_states): - orig_dtype = hidden_states.dtype - dtype = self.down.weight.dtype - - down_hidden_states = self.down(hidden_states.to(dtype)) - up_hidden_states = self.up(down_hidden_states) - - if self.network_alpha is not None: - up_hidden_states *= self.network_alpha / self.rank - - return up_hidden_states.to(orig_dtype) - -class lora_cross_attention(nn.Module): - def __init__(self, img_dim=1024, rank=24, text_dim=77, heads=8, qkv_bias=False, qk_scale=None, attn_dorp=0.,proj_drop=0., hidden_size=1536, dropout = 0.0, network_alpha=None): - - super().__init__() - self.heads = heads - self.scale = qk_scale or (hidden_size//self.heads) ** -0.5 - self.hidden_size = hidden_size - self.img_dim = img_dim - self.text_dim = text_dim - self.rank = rank - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - self.to_out_custom_diffusion = nn.ModuleList([]) - self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=True)) - self.to_out_custom_diffusion.append(nn.Dropout(dropout)) - - def head_to_batch_dim(self, tensor, out_dim=3): - head_size = self.heads - batch_size, seq_len, dim = tensor.shape - tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size) - tensor = tensor.permute(0, 2, 1, 3) - - if out_dim == 3: - tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size) - - return tensor - - def batch_to_head_dim(self, tensor): - head_size = self.heads - batch_size, seq_len, dim = tensor.shape - tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim) - tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size) - return tensor - - def prepare_attention_mask(self, attention_mask, target_length, batch_size=2, out_dim=3): - - head_size = self.heads - if attention_mask is None: - return attention_mask - - current_length: int = attention_mask.shape[-1] - if current_length != target_length: - if attention_mask.device.type == "mps": - # HACK: MPS: Does not support padding by greater than dimension of input tensor. 
- # Instead, we can manually construct the padding tensor. - padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length) - padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device) - attention_mask = torch.cat([attention_mask, padding], dim=2) - else: - # TODO: for pipelines such as stable-diffusion, padding cross-attn mask: - # we want to instead pad by (0, remaining_length), where remaining_length is: - # remaining_length: int = target_length - current_length - # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding - attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) - - if out_dim == 3: - if attention_mask.shape[0] < batch_size * head_size: - attention_mask = attention_mask.repeat_interleave(head_size, dim=0) - elif out_dim == 4: - attention_mask = attention_mask.unsqueeze(1) - attention_mask = attention_mask.repeat_interleave(head_size, dim=1) - - return attention_mask - - def get_attention_scores(self, query, key, attention_mask=None): - dtype = query.dtype - - if attention_mask is None: - baddbmm_input = torch.empty( - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device - ) - beta = 0 - else: - baddbmm_input = attention_mask - beta = 1 - - attention_scores = torch.baddbmm( - baddbmm_input, - query, - key.transpose(-1, -2), - beta=beta, - alpha=self.scale, - ) - del baddbmm_input - - - - attention_probs = attention_scores.softmax(dim=-1) - del attention_scores - - attention_probs = attention_probs.to(dtype) - - return attention_probs - - def forward(self, img , text, attention_mask=None): - batch_size = 2 - #添加mask - attention_mask = self.prepare_attention_mask(attention_mask, 1024, batch_size) - - query = self.to_q_lora(img)#得到query架构仍然为1024*1536 - key = self.to_k_lora(text) - value =self.to_v_lora(text)#仍为77*1536 - # key = key.to(attn.to_q.weight.dtype) - # value = value.to(attn.to_q.weight.dtype) - - #优化方法,可选择添加 - detach = torch.ones_like(key) - detach[:, :1, :] = detach[:, :1, :] * 0.0 - key = detach * key + (1 - detach) * key.detach() - value = detach * value + (1 - detach) * value.detach() - - - #多头注意力 - query = self.head_to_batch_dim(query) - key = self.head_to_batch_dim(key) - value = self.head_to_batch_dim(value) - attention_probs = self.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = self.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = self.to_out_custom_diffusion[0](hidden_states) - - # dropout - hidden_states = self.to_out_custom_diffusion[1](hidden_states) - - return hidden_states - - -class ModelWithAdapter(nn.Module): - def __init__(self, pretrained_attention_model, adapter): - super(ModelWithAdapter, self).__init__() - self.pretrained_attention_model = pretrained_attention_model - self.adapter = adapter - - def forward(self, input_data): - attention_output = self.pretrained_attention_model(input_data) - adapter_output = self.adapter(attention_output) - final_output = attention_output + adapter_output - return final_output - - -if hasattr(torch.nn.functional, 'scaled_dot_product_attention'): - ATTENTION_MODE = 'flash' -else: - try: - import xformers - import xformers.ops - ATTENTION_MODE = 'xformers' - except: - ATTENTION_MODE = 'math' -print(f'uvit attention mode is {ATTENTION_MODE}') - - -def timestep_embedding(timesteps, dim, max_period=10000): - """ - Create sinusoidal timestep embeddings. 
- - :param timesteps: a 1-D Tensor of N indices, one per batch element. - These may be fractional. - :param dim: the dimension of the output. - :param max_period: controls the minimum frequency of the embeddings. - :return: an [N x dim] Tensor of positional embeddings. - """ - half = dim // 2 - freqs = torch.exp( - -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half - ).to(device=timesteps.device) - args = timesteps[:, None].float() * freqs[None] - embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) - if dim % 2: - embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) - return embedding - - -def patchify(imgs, patch_size): - x = einops.rearrange(imgs, 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)', p1=patch_size, p2=patch_size) - return x - - -def unpatchify(x, in_chans): - patch_size = int((x.shape[2] // in_chans) ** 0.5) - h = w = int(x.shape[1] ** .5) - assert h * w == x.shape[1] and patch_size ** 2 * in_chans == x.shape[2] - x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h, p1=patch_size, p2=patch_size) - return x - - -def interpolate_pos_emb(pos_emb, old_shape, new_shape): - pos_emb = einops.rearrange(pos_emb, 'B (H W) C -> B C H W', H=old_shape[0], W=old_shape[1]) - pos_emb = F.interpolate(pos_emb, new_shape, mode='bilinear') - pos_emb = einops.rearrange(pos_emb, 'B C H W -> B (H W) C') - return pos_emb - - -class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x): - B, L, C = x.shape - - qkv = self.qkv(x) - if ATTENTION_MODE == 'flash': - qkv = einops.rearrange(qkv, 'B L (K H D) -> K B H L D', K=3, H=self.num_heads).float() - q, k, v = qkv[0], qkv[1], qkv[2] # B H L D - x = torch.nn.functional.scaled_dot_product_attention(q, k, v) - x = einops.rearrange(x, 'B H L D -> B L (H D)') - elif ATTENTION_MODE == 'xformers': - qkv = einops.rearrange(qkv, 'B L (K H D) -> K B L H D', K=3, H=self.num_heads) - q, k, v = qkv[0], qkv[1], qkv[2] # B L H D - x = xformers.ops.memory_efficient_attention(q, k, v) - x = einops.rearrange(x, 'B L H D -> B L (H D)', H=self.num_heads) - elif ATTENTION_MODE == 'math': - with torch.amp.autocast(device_type='cuda', enabled=False): - qkv = einops.rearrange(qkv, 'B L (K H D) -> K B H L D', K=3, H=self.num_heads).float() - q, k, v = qkv[0], qkv[1], qkv[2] # B H L D - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - x = (attn @ v).transpose(1, 2).reshape(B, L, C) - else: - raise NotImplemented - - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, skip=False, use_checkpoint=False): - super().__init__() - self.norm1 = norm_layer(dim) if skip else None - self.norm2 = norm_layer(dim) - - self.attn = Attention( - dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() - self.norm3 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - self.skip_linear = nn.Linear(2 * dim, dim) if skip else None - self.use_checkpoint = use_checkpoint - - def forward(self, x, skip=None): - if self.use_checkpoint: - return torch.utils.checkpoint.checkpoint(self._forward, x, skip) - else: - return self._forward(x, skip) - - def _forward(self, x, skip=None): - if self.skip_linear is not None: - x = self.skip_linear(torch.cat([x, skip], dim=-1)) - x = self.norm1(x) - x = x + self.drop_path(self.attn(x)) - x = self.norm2(x) - - x = x + self.drop_path(self.mlp(x)) - x = self.norm3(x) - - return x - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - def __init__(self, patch_size, in_chans=3, embed_dim=768): - super().__init__() - self.patch_size = patch_size - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x): - B, C, H, W = x.shape - assert H % self.patch_size == 0 and W % self.patch_size == 0 - x = self.proj(x).flatten(2).transpose(1, 2) - return x - - -class UViT(nn.Module): - def __init__(self, img_size, in_chans, patch_size, embed_dim=768, depth=12, - num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, pos_drop_rate=0., drop_rate=0., attn_drop_rate=0., - norm_layer=nn.LayerNorm, mlp_time_embed=False, use_checkpoint=False, - text_dim=None, num_text_tokens=None, clip_img_dim=None): - super().__init__() - self.in_chans = in_chans - self.patch_size = patch_size - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - - self.patch_embed = PatchEmbed(patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) - self.img_size = (img_size, img_size) if isinstance(img_size, int) else img_size # the default img size - assert self.img_size[0] % patch_size == 0 and self.img_size[1] % patch_size == 0 - self.num_patches = (self.img_size[0] // patch_size) * (self.img_size[1] // patch_size) - - self.time_img_embed = nn.Sequential( - nn.Linear(embed_dim, 4 * embed_dim), - nn.SiLU(), - nn.Linear(4 * embed_dim, embed_dim), - ) if mlp_time_embed else nn.Identity() - - self.time_text_embed = nn.Sequential( - nn.Linear(embed_dim, 4 * embed_dim), - nn.SiLU(), - nn.Linear(4 * embed_dim, embed_dim), - ) if mlp_time_embed else nn.Identity() - - self.text_embed = nn.Linear(text_dim, embed_dim) - self.text_out = nn.Linear(embed_dim, text_dim) - - self.clip_img_embed = nn.Linear(clip_img_dim, embed_dim) - self.clip_img_out = nn.Linear(embed_dim, clip_img_dim) - - self.num_text_tokens = num_text_tokens - self.num_tokens = 1 + 1 + num_text_tokens + 1 + self.num_patches - - self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, embed_dim)) - self.pos_drop = nn.Dropout(p=pos_drop_rate) - - self.in_blocks = nn.ModuleList([ - Block( - dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer, use_checkpoint=use_checkpoint) - for _ in range(depth // 2)]) - - self.mid_block = Block( - dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer, use_checkpoint=use_checkpoint) - - self.out_blocks = nn.ModuleList([ - Block( - dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, 
attn_drop=attn_drop_rate, norm_layer=norm_layer, skip=True, use_checkpoint=use_checkpoint) - for _ in range(depth // 2)]) - - self.norm = norm_layer(embed_dim) - self.patch_dim = patch_size ** 2 * in_chans - self.decoder_pred = nn.Linear(embed_dim, self.patch_dim, bias=True) - - trunc_normal_(self.pos_embed, std=.02) - self.apply(self._init_weights) - - self.token_embedding = nn.Embedding(2, embed_dim) - self.pos_embed_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - @torch.jit.ignore - def no_weight_decay(self): - return {'pos_embed'} - - def forward(self, img, clip_img, text, t_img, t_text, data_type): - _, _, H, W = img.shape - - img = self.patch_embed(img) - - t_img_token = self.time_img_embed(timestep_embedding(t_img, self.embed_dim)) - t_img_token = t_img_token.unsqueeze(dim=1) - t_text_token = self.time_text_embed(timestep_embedding(t_text, self.embed_dim)) - t_text_token = t_text_token.unsqueeze(dim=1) - - text = self.text_embed(text) - clip_img = self.clip_img_embed(clip_img) - token_embed = self.token_embedding(data_type).unsqueeze(dim=1) - - - # t_img_token torch.Size([2, 1, 1536]) - # t_text_token torch.Size([2, 1, 1536]) - # token_embed torch.Size([2, 1, 1536]) - # text torch.Size([2, 77, 1536]) - # clip_img torch.Size([2, 1, 1536]) - # img torch.Size([2, 1024, 1536]) - - x = torch.cat((t_img_token, t_text_token, token_embed, text, clip_img, img), dim=1) - num_text_tokens, num_img_tokens = text.size(1), img.size(1) - - pos_embed = torch.cat( - [self.pos_embed[:, :1 + 1, :], self.pos_embed_token, self.pos_embed[:, 1 + 1:, :]], dim=1) - if H == self.img_size[0] and W == self.img_size[1]: - pass - else: # interpolate the positional embedding when the input image is not of the default shape - pos_embed_others, pos_embed_patches = torch.split(pos_embed, [1 + 1 + 1 + num_text_tokens + 1, self.num_patches], dim=1) - pos_embed_patches = interpolate_pos_emb(pos_embed_patches, (self.img_size[0] // self.patch_size, self.img_size[1] // self.patch_size), - (H // self.patch_size, W // self.patch_size)) - pos_embed = torch.cat((pos_embed_others, pos_embed_patches), dim=1) - - x = x + pos_embed - x = self.pos_drop(x) - skips = [] - for blk in self.in_blocks: - t_img_token, t_text_token, token_embed, text, clip_img, img = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1) - model = lora_cross_attention() - model.to('cuda') - lora_img = model(img, text) - x = torch.cat((t_img_token, t_text_token, token_embed, text, clip_img, img), dim=1) - - x = blk(x) - t_img_token, t_text_token, token_embed, text, clip_img, img = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1) - img = img + lora_img - - x = torch.cat((t_img_token, t_text_token, token_embed, text, clip_img, img), dim=1) - skips.append(x) - - x = self.mid_block(x) - - for blk in self.out_blocks: - x = blk(x, skips.pop()) - - x = self.norm(x) - - t_img_token_out, t_text_token_out, token_embed_out, text_out, clip_img_out, img_out = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1) - - - img_out = self.decoder_pred(img_out) - img_out = unpatchify(img_out, self.in_chans) - - clip_img_out = self.clip_img_out(clip_img_out) - - text_out = self.text_out(text_out) - return img_out, clip_img_out, text_out \ No newline at end of file 
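The deleted libs/cross_attention.py above carried the LoRA machinery (LoRALinearLayer and lora_cross_attention) that was being instantiated inside UViT's in_blocks. For reference, the core low-rank adapter pattern it relied on can be sketched in a few lines; the class and variable names below are illustrative only and are not part of this repository:

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen base Linear plus a trainable rank-r residual path (minimal sketch)."""
    def __init__(self, base: nn.Linear, rank: int = 24, network_alpha=None):
        super().__init__()
        if rank > min(base.in_features, base.out_features):
            raise ValueError("LoRA rank must not exceed the smaller layer dimension")
        self.base = base.requires_grad_(False)      # pretrained weights stay frozen
        self.down = nn.Linear(base.in_features, rank, bias=False)
        self.up = nn.Linear(rank, base.out_features, bias=False)
        nn.init.normal_(self.down.weight, std=1.0 / rank)
        nn.init.zeros_(self.up.weight)              # adapter starts as an exact no-op
        self.scale = network_alpha / rank if network_alpha is not None else 1.0

    def forward(self, x):
        return self.base(x) + self.scale * self.up(self.down(x))

layer = LoRALinear(nn.Linear(1536, 1536), rank=24)
out = layer(torch.randn(2, 77, 1536))               # e.g. a batch of projected text tokens
print(out.shape)                                     # torch.Size([2, 77, 1536])

Zero-initializing the up-projection makes the adapter an identity mapping at the start of fine-tuning, which matches the initialization used in the deleted file.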
diff --git a/libs/data.py b/libs/data.py index 6223079..57f0523 100755 --- a/libs/data.py +++ b/libs/data.py @@ -8,12 +8,135 @@ from PIL import Image from torch.utils.data import Dataset import random -from PIL.ImageOps import exif_transpose -import torch -from pathlib import Path +training_templates_smallest = [ + 'photo of a sks {}', +] + +reg_templates_smallest = [ + 'photo of a {}', +] + +imagenet_templates_small = [ + 'a photo of a {}', + 'a rendering of a {}', + 'a cropped photo of the {}', + 'the photo of a {}', + 'a photo of a clean {}', + 'a photo of a dirty {}', + 'a dark photo of the {}', + 'a photo of my {}', + 'a photo of the cool {}', + 'a close-up photo of a {}', + 'a bright photo of the {}', + 'a cropped photo of a {}', + 'a photo of the {}', + 'a good photo of the {}', + 'a photo of one {}', + 'a close-up photo of the {}', + 'a rendition of the {}', + 'a photo of the clean {}', + 'a rendition of a {}', + 'a photo of a nice {}', + 'a good photo of a {}', + 'a photo of the nice {}', + 'a photo of the small {}', + 'a photo of the weird {}', + 'a photo of the large {}', + 'a photo of a cool {}', + 'a photo of a small {}', + 'an illustration of a {}', + 'a rendering of a {}', + 'a cropped photo of the {}', + 'the photo of a {}', + 'an illustration of a clean {}', + 'an illustration of a dirty {}', + 'a dark photo of the {}', + 'an illustration of my {}', + 'an illustration of the cool {}', + 'a close-up photo of a {}', + 'a bright photo of the {}', + 'a cropped photo of a {}', + 'an illustration of the {}', + 'a good photo of the {}', + 'an illustration of one {}', + 'a close-up photo of the {}', + 'a rendition of the {}', + 'an illustration of the clean {}', + 'a rendition of a {}', + 'an illustration of a nice {}', + 'a good photo of a {}', + 'an illustration of the nice {}', + 'an illustration of the small {}', + 'an illustration of the weird {}', + 'an illustration of the large {}', + 'an illustration of a cool {}', + 'an illustration of a small {}', + 'a depiction of a {}', + 'a rendering of a {}', + 'a cropped photo of the {}', + 'the photo of a {}', + 'a depiction of a clean {}', + 'a depiction of a dirty {}', + 'a dark photo of the {}', + 'a depiction of my {}', + 'a depiction of the cool {}', + 'a close-up photo of a {}', + 'a bright photo of the {}', + 'a cropped photo of a {}', + 'a depiction of the {}', + 'a good photo of the {}', + 'a depiction of one {}', + 'a close-up photo of the {}', + 'a rendition of the {}', + 'a depiction of the clean {}', + 'a rendition of a {}', + 'a depiction of a nice {}', + 'a good photo of a {}', + 'a depiction of the nice {}', + 'a depiction of the small {}', + 'a depiction of the weird {}', + 'a depiction of the large {}', + 'a depiction of a cool {}', + 'a depiction of a small {}', +] + +imagenet_dual_templates_small = [ + 'a photo of a {} with {}', + 'a rendering of a {} with {}', + 'a cropped photo of the {} with {}', + 'the photo of a {} with {}', + 'a photo of a clean {} with {}', + 'a photo of a dirty {} with {}', + 'a dark photo of the {} with {}', + 'a photo of my {} with {}', + 'a photo of the cool {} with {}', + 'a close-up photo of a {} with {}', + 'a bright photo of the {} with {}', + 'a cropped photo of a {} with {}', + 'a photo of the {} with {}', + 'a good photo of the {} with {}', + 'a photo of one {} with {}', + 'a close-up photo of the {} with {}', + 'a rendition of the {} with {}', + 'a photo of the clean {} with {}', + 'a rendition of a {} with {}', + 'a photo of a nice {} with {}', + 'a good photo of a {} with 
{}', + 'a photo of the nice {} with {}', + 'a photo of the small {} with {}', + 'a photo of the weird {} with {}', + 'a photo of the large {} with {}', + 'a photo of a cool {} with {}', + 'a photo of a small {} with {}', +] + +per_img_token_list = [ + 'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת', +] + def _convert_image_to_rgb(image): return image.convert("RGB") @@ -27,248 +150,176 @@ def _transform(n_px): transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), ]) -def collate_fn(examples, with_prior_preservation=False): - # has_attention_mask = "instance_attention_mask" in examples[0] - - input_ids = [example["instance_prompt_ids"] for example in examples]#实例id - pixel_values = [example["instance_images"] for example in examples]#实例图像 - clip_img = [example["instance_clip_images"] for example in examples] - mask = [example["mask"] for example in examples] - # if has_attention_mask: - # attention_mask = [example["instance_attention_mask"] for example in examples] - - # Concat class and instance examples for prior preservation. - # We do this to avoid doing two forward passes. - if with_prior_preservation: - input_ids += [example["class_prompt_ids"] for example in examples] - pixel_values += [example["class_images"] for example in examples] - clip_img += [example["class_clip_images"] for example in examples] - mask += [example["class_mask"] for example in examples] - # if has_attention_mask: - # attention_mask += [example["class_attention_mask"] for example in examples] - clip_img = torch.stack(clip_img) - pixel_values = torch.stack(pixel_values) - mask = torch.stack(mask) - pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() - mask = mask.to(memory_format=torch.contiguous_format).float() - clip_img = clip_img.to(memory_format=torch.contiguous_format).float() - input_ids = torch.cat(input_ids, dim=0) - mask = mask.unsqueeze(1) - - # if has_attention_mask: - # attention_mask = torch.cat(attention_mask, dim=0) - # batch["attention_mask"] = attention_mask - return input_ids,pixel_values,clip_img,mask +class PersonalizedBase(Dataset): + def __init__(self, + data_root, # The root directory of the dataset. + resolution, # The resolution of the images. + repeats=100, # The number of times to repeat the dataset. + flip_p=0.5, # The probability of flipping the image horizontally. + set="train", # The dataset split to use. + class_word="dog", # The class word to use for the dataset. + per_image_tokens=False, # Whether to use per-image tokens. + mixing_prob=0.25, # The probability of mixing the image and text. + coarse_class_text=None, # The coarse class text to use for the dataset. + reg = False # Whether to use regression instead of classification. + ): + """ + A dataset class for personalized image-text matching. -class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + Args: + - data_root: str, the root directory of the dataset. + - resolution: int, the resolution of the images. + - repeats: int, the number of times to repeat the dataset. + - flip_p: float, the probability of flipping the image horizontally. + - set: str, the dataset split to use. + - class_word: str, the class word to use for the dataset. + - per_image_tokens: bool, whether to use per-image tokens. + - mixing_prob: float, the probability of mixing the image and text. + - coarse_class_text: str, the coarse class text to use for the dataset. 
+ - reg: bool, whether to use regression instead of classification. + """ + self.data_root = data_root - def __init__(self, prompt, num_samples): - self.prompt = prompt - self.num_samples = num_samples + # Get the paths of all images in the dataset. + self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root) if not file_path.endswith(".txt")] - def __len__(self): - return self.num_samples + self.num_images = len(self.image_paths) + self._length = self.num_images - def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example + self.placeholder_token = class_word + self.resolution = resolution + self.per_image_tokens = per_image_tokens + self.mixing_prob = mixing_prob + + # Define the image transforms. + self.transform_clip = _transform(224) + self.transform = transforms.Compose([transforms.Resize(resolution), transforms.CenterCrop(resolution), + transforms.ToTensor(), transforms.Normalize(0.5, 0.5)]) + self.coarse_class_text = coarse_class_text -def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None): - if tokenizer_max_length is not None: - max_length = tokenizer_max_length - else: - max_length = tokenizer.model_max_length + # Check if per-image tokens are being used. + if per_image_tokens: + assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'." - text_inputs = tokenizer( - prompt, - truncation=True, - padding="max_length", - max_length=max_length, - return_tensors="pt", - ) + # If the dataset split is "train", repeat the dataset. + if set == "train": + self._length = self.num_images * repeats - return text_inputs + self.reg = reg + def __len__(self): + return self._length -class PersonalizedBase(Dataset): + def __getitem__(self, i): + # Load the image and convert it to RGB. + pil_image = Image.open(self.image_paths[i % self.num_images]).convert("RGB") + + placeholder_string = self.placeholder_token + if self.coarse_class_text: + placeholder_string = f"{self.coarse_class_text} {placeholder_string}" + + # Generate the text for the image. + if not self.reg: + text = random.choice(training_templates_smallest).format(placeholder_string) + else: + text = random.choice(reg_templates_smallest).format(placeholder_string) + + # Apply the image transforms. 
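+        # self.transform resizes/center-crops to `resolution` and normalizes pixels to [-1, 1] for the autoencoder,
+        # while self.transform_clip produces the 224x224 view normalized with CLIP's mean/std for the image encoder.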
+ img = self.transform(pil_image) + img4clip = self.transform_clip(pil_image) + + return img, img4clip, text, 0 + + +class PersonalizedBasev2(Dataset): def __init__(self, - concepts_list, - size=512, - mask_size=64, - center_crop=False, - tokenizer_max_length=None, - num_class_images=200, - tokenizer=None, - config = None, - hflip=False, - aug=False, + image_paths, # a list of image paths + resolution, # the resolution of the images + repeats=100, # the number of times to repeat the dataset + flip_p=0.5, # the probability of flipping the image horizontally + set="train", # the dataset split to use + class_word="dog", # the class word to use for the dataset + per_image_tokens=False, # whether to use per-image tokens + mixing_prob=0.25, # the probability of mixing the text with another text + coarse_class_text=None, # the coarse class text to use for the dataset + reg = False # whether to use regular templates ): - self.size = size - self.mask_size = mask_size - - self.center_crop = center_crop - self.tokenizer = tokenizer - - self.tokenizer_max_length = tokenizer_max_length - - self.interpolation = PIL.Image.BILINEAR - self.aug = aug - - self.instance_images_path = [] - self.class_images_path = [] + """ + A dataset class for personalized image captioning. + + Args: + - image_paths: a list of image paths + - resolution: the resolution of the images + - repeats: the number of times to repeat the dataset + - flip_p: the probability of flipping the image horizontally + - set: the dataset split to use + - class_word: the class word to use for the dataset + - per_image_tokens: whether to use per-image tokens + - mixing_prob: the probability of mixing the text with another text + - coarse_class_text: the coarse class text to use for the dataset + - reg: whether to use regular templates + """ + self.image_paths = image_paths + + self.num_images = len(self.image_paths) + self._length = self.num_images + + self.placeholder_token = class_word + self.resolution = resolution + self.per_image_tokens = per_image_tokens + self.mixing_prob = mixing_prob - for concept in concepts_list: - inst_img_path = [ - (x, concept["instance_prompt"]) for x in Path(concept["instance_data_dir"]).iterdir() if x.is_file() - ] - - # # 替换 prompt 如果传入的图片有背景的话 - for i in range(len(inst_img_path)): - path, text = inst_img_path[i] - if str(path).endswith('.jpeg'): - path = str(path) - if 'girl1' in path: - inst_img_path[i] = (path, 'a girl in the room') - elif 'boy2' in path: - inst_img_path[i] = (path, 'a boy in the room') - elif 'boy1' in path: - inst_img_path[i] = (path, 'a boy') - elif 'girl2' in path: - inst_img_path[i] = (path, 'a girl on the street') - - self.instance_images_path.extend(inst_img_path) - - class_data_root = Path(concept["class_data_dir"]) - - if os.path.isdir(class_data_root): - class_images_path = list(class_data_root.iterdir()) - - class_prompt = [concept["class_prompt"] for _ in range(len(class_images_path))] - else: - with open(class_data_root, "r") as f: - class_images_path = f.read().splitlines() - with open(concept["class_prompt"], "r") as f: - class_prompt = f.read().splitlines() - - class_img_path = [(x, y) for (x, y) in zip(class_images_path, class_prompt)] - self.class_images_path.extend(class_img_path[:num_class_images]) - - self.transform_clip = _transform(224)#将clip_img 转化为224分辨率 - random.shuffle(self.instance_images_path) - self.num_instance_images = len(self.instance_images_path) - - self.num_class_images = len(self.class_images_path) - self._length = max(self.num_class_images, 
self.num_instance_images) - self.flip = transforms.RandomHorizontalFlip(0.5 * hflip) - - self.image_transforms = transforms.Compose( - [ - self.flip, - transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) + self.transform_clip = _transform(224) + self.transform = transforms.Compose([transforms.Resize(resolution), transforms.CenterCrop(resolution), + transforms.ToTensor(), transforms.Normalize(0.5, 0.5)]) + + self.coarse_class_text = coarse_class_text + + if per_image_tokens: + assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'." + + if set == "train": + self._length = self.num_images * repeats + + self.reg = reg def __len__(self): + """ + Returns the length of the dataset. + """ return self._length - - def preprocess(self, image, scale, resample): -### 在这里强行加了一堆 64*64 的mask,动态的 mask 有点问题,不确定影响大不大 - outer, inner = self.size, scale - factor = self.size // self.mask_size - if scale > self.size: - outer, inner = scale, self.size - top, left = np.random.randint(0, outer - inner + 1), np.random.randint(0, outer - inner + 1) - image = image.resize((scale, scale), resample=resample) - image = np.array(image).astype(np.uint8) - - image = (image / 127.5 - 1.0).astype(np.float32) - instance_image = np.zeros((self.size, self.size, 3), dtype=np.float32) - mask = np.zeros((self.size // factor, self.size // factor)) - if scale > self.size: - instance_image = image[top : top + inner, left : left + inner, :] - mask = np.ones((self.size // factor, self.size // factor)) - mask = np.ones((64,64)) - + + def __getitem__(self, i): + """ + Returns the i-th item of the dataset. 
+ + Args: + - i: the index of the item to return + + Returns: + - img: the image tensor + - img4clip: the image tensor for CLIP + - text: the text string + - 0: a dummy label + """ + pil_image = Image.open(self.image_paths[i % self.num_images]).convert("RGB") + + placeholder_string = self.placeholder_token + if self.coarse_class_text: + placeholder_string = f"{self.coarse_class_text} {placeholder_string}" + + if not self.reg: + text = random.choice(training_templates_smallest).format(placeholder_string) else: - instance_image[top : top + inner, left : left + inner, :] = image - mask[ - top // factor + 1 : (top + scale) // factor - 1, left // factor + 1 : (left + scale) // factor - 1 - ] = 1.0 - mask = np.ones((64,64)) - - - - return instance_image, mask + text = random.choice(reg_templates_smallest).format(placeholder_string) - def __getitem__(self, index): - example = {} - instance_image, instance_prompt = self.instance_images_path[index % self.num_instance_images] + # default to score-sde preprocessing + img = self.transform(pil_image) + img4clip = self.transform_clip(pil_image) - instance_image = Image.open(instance_image) - - if not instance_image.mode == "RGB": - instance_image = instance_image.convert("RGB") - example["instance_clip_images"] = self.transform_clip(instance_image) - instance_image = self.flip(instance_image) - - # apply resize augmentation and create a valid image region mask - random_scale = self.size - if self.aug: - random_scale = ( - np.random.randint(self.size // 3, self.size + 1) - if np.random.uniform() < 0.66 - else np.random.randint(int(1.2 * self.size), int(1.4 * self.size)) - ) - - random_scale = self.size - instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation) - - # if random_scale < 0.6 * self.size: - # instance_prompt = np.random.choice(["a far away ", "very small "]) + instance_prompt - # elif random_scale > self.size: - # instance_prompt = np.random.choice(["zoomed in ", "close up "]) + instance_prompt - - example["instance_images"] = torch.from_numpy(instance_image).permute(2, 0, 1) - example["mask"] = torch.from_numpy(mask) - #torch.Size([3, 512, 512]) torch.Size([64, 64]) - example["instance_prompt_ids"] = self.tokenizer( - instance_prompt, - truncation=True, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pt", - ).input_ids - # tensor([[49406, 1125, 539, 320, 49408, 1611, 49407, 49407, 49407, 49407, - # 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - # 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - # 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - # 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - # 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - # 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - # 49407, 49407, 49407, 49407, 49407, 49407, 49407]]) - class_image, class_prompt = self.class_images_path[index % self.num_class_images] - class_image = Image.open(class_image) - if not class_image.mode == "RGB": - class_image = class_image.convert("RGB") - example["class_images"] = self.image_transforms(class_image) - example["class_mask"] = torch.ones_like(example["mask"]) - example["class_prompt_ids"] = self.tokenizer( - class_prompt, - truncation=True, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pt", - ).input_ids - example["class_clip_images"] = self.transform_clip(class_image) - - return 
example + return img, img4clip, text, 0 + diff --git a/libs/dpm_solver_pp.py b/libs/dpm_solver_pp.py index c726273..b71e7e9 100755 --- a/libs/dpm_solver_pp.py +++ b/libs/dpm_solver_pp.py @@ -897,6 +897,7 @@ def sample(self, x, steps=10, eps=1e-4, T=None, order=3, skip_type='time_uniform [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021. """ + t_0 = eps t_T = self.noise_schedule.T if T is None else T device = x.device @@ -947,6 +948,7 @@ def sample(self, x, steps=10, eps=1e-4, T=None, order=3, skip_type='time_uniform for i, order in enumerate(orders): vec_s, vec_t = torch.ones((x.shape[0],)).to(device) * timesteps[i], torch.ones((x.shape[0],)).to(device) * timesteps[i + 1] x = self.dpm_solver_update(x, vec_s, vec_t, order, solver_type=solver_type) - if denoise: + if True: x = self.denoise_fn(x, torch.ones((x.shape[0],)).to(device) * t_0) + return x diff --git a/libs/schedule.py b/libs/schedule.py index ec12cd7..c702ba6 100755 --- a/libs/schedule.py +++ b/libs/schedule.py @@ -62,16 +62,21 @@ def tilde_beta(self, s, t): return self.skip_betas[s, t] * self.cum_betas[s] / self.cum_betas[t] def sample(self, x0): # sample from q(xn|x0), where n is uniform + """这是个用来加噪声的函数""" if isinstance(x0, list): - n = np.random.choice(list(range(1, self.N + 1)), (len(x0[0]),)) - # n = np.array([1000, 1000]) - eps = [torch.randn_like(tensor) for tensor in x0] + n = np.random.choice(list(range(1, self.N + 1)), (len(x0[0]),)) ### 随机选择时间步: 使用 np.random.choice 从 [1, N] 范围内随机选择一个时间步 n。N是扩散过程的总步数。 + + eps = [torch.randn_like(tensor) for tensor in x0] ### 生成噪声: eps 是随机噪声,与 x0 的维度相同。 + + #### xn 是通过应用噪声到原始数据 x0 来获得的,这是扩散模型的核心步骤。这里使用了 cum_alphas 和 cum_betas,用于控制噪声大小的系数。 xn = [stp(self.cum_alphas[n] ** 0.5, tensor) + stp(self.cum_betas[n] ** 0.5, _eps) for tensor, _eps in zip(x0, eps)] return torch.tensor(n), eps, xn else: n = np.random.choice(list(range(1, self.N + 1)), (len(x0),)) eps = torch.randn_like(x0) xn = stp(self.cum_alphas[n] ** 0.5, x0) + stp(self.cum_betas[n] ** 0.5, eps) + + ###### 函数返回 n(选择的时间步),eps(生成的噪声),以及 xn(噪声处理后的数据)。 return torch.tensor(n), eps, xn def __repr__(self): diff --git a/libs/testcuda.py b/libs/testcuda.py deleted file mode 100644 index ed116b1..0000000 --- a/libs/testcuda.py +++ /dev/null @@ -1,10 +0,0 @@ -# #code for checking the cuda edition -# import torch -# import cuda -# print(torch.__version__) -# print(torch.cuda.is_available()) -# print(cuda.__version__) - -# 查看当前的 cuda 版本的代码 -import torch -print(torch.version.cuda) \ No newline at end of file diff --git a/libs/uvit_multi_post_ln_v1.py b/libs/uvit_multi_post_ln_v1.py index a7a271e..6cc3c2b 100755 --- a/libs/uvit_multi_post_ln_v1.py +++ b/libs/uvit_multi_post_ln_v1.py @@ -24,7 +24,7 @@ def timestep_embedding(timesteps, dim, max_period=10000): """ Create sinusoidal timestep embeddings. - + 生成了关于时间步的正弦波形嵌入 :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional. :param dim: the dimension of the output. 
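The comment added in the hunk above notes (in Chinese) that timestep_embedding generates sinusoidal embeddings of the timestep. As a reminder of what that computation does, here is a standalone restatement mirroring the timestep_embedding function shown earlier in this patch; it is not new repository code:

import math
import torch

def timestep_embedding(timesteps, dim, max_period=10000):
    # geometric frequency ladder from 1 down to 1/max_period
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32) / half)
    args = timesteps[:, None].float() * freqs[None].to(timesteps.device)
    emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:  # pad odd dims with one zero column
        emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=-1)
    return emb

print(timestep_embedding(torch.tensor([1.0, 250.0, 500.0, 999.0]), 1536).shape)  # torch.Size([4, 1536])

With embed_dim = 1536, a batch of four timesteps maps to a [4, 1536] tensor, consistent with the t_img_token / t_text_token shapes annotated in the forward pass below.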
@@ -48,7 +48,11 @@ def patchify(imgs, patch_size): def unpatchify(x, in_chans): + ### x: torch.Size([8, 1200, 16]) + patch_size = int((x.shape[2] // in_chans) ** 0.5) + # print(patch_size) + h = w = int(x.shape[1] ** .5) assert h * w == x.shape[1] and patch_size ** 2 * in_chans == x.shape[2] x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h, p1=patch_size, p2=patch_size) @@ -159,8 +163,9 @@ class UViT(nn.Module): def __init__(self, img_size, in_chans, patch_size, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, pos_drop_rate=0., drop_rate=0., attn_drop_rate=0., norm_layer=nn.LayerNorm, mlp_time_embed=False, use_checkpoint=False, - text_dim=None, num_text_tokens=None, clip_img_dim=None): + text_dim=None, num_text_tokens=None, clip_img_dim=None, n_steps=50, **kwargs): super().__init__() + self.n_steps = n_steps self.in_chans = in_chans self.patch_size = patch_size self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models @@ -170,6 +175,9 @@ def __init__(self, img_size, in_chans, patch_size, embed_dim=768, depth=12, assert self.img_size[0] % patch_size == 0 and self.img_size[1] % patch_size == 0 self.num_patches = (self.img_size[0] // patch_size) * (self.img_size[1] // patch_size) + # ##### 新的 DDIM 的时间步嵌入 + # self.time_embedding = nn.Embedding(n_steps, embed_dim) if mlp_time_embed else nn.Identity() + ## tmd 在进行 uvit 初始化的时候 uvit 的 mlp_time_embed : false 这里传入的也是 false self.time_img_embed = nn.Sequential( nn.Linear(embed_dim, 4 * embed_dim), nn.SiLU(), @@ -182,7 +190,7 @@ def __init__(self, img_size, in_chans, patch_size, embed_dim=768, depth=12, nn.Linear(4 * embed_dim, embed_dim), ) if mlp_time_embed else nn.Identity() - self.text_embed = nn.Linear(text_dim, embed_dim) + self.text_embed = nn.Linear(text_dim, embed_dim) self.text_out = nn.Linear(embed_dim, text_dim) self.clip_img_embed = nn.Linear(clip_img_dim, embed_dim) @@ -235,18 +243,42 @@ def no_weight_decay(self): def forward(self, img, clip_img, text, t_img, t_text, data_type): _, _, H, W = img.shape + """torch.Size([4, 4, 64, 64]) + torch.Size([4, 1, 512]) + torch.Size([4, 77, 64]) + torch.Size([4]) + torch.Size([4]) + torch.Size([4]) + """ img = self.patch_embed(img) - + ### t_img : torch.Size([4, 37]) + t_img_token = self.time_img_embed(timestep_embedding(t_img, self.embed_dim)) t_img_token = t_img_token.unsqueeze(dim=1) t_text_token = self.time_text_embed(timestep_embedding(t_text, self.embed_dim)) t_text_token = t_text_token.unsqueeze(dim=1) + text = self.text_embed(text) clip_img = self.clip_img_embed(clip_img) token_embed = self.token_embedding(data_type).unsqueeze(dim=1) - + + """ + t_img_token shape: torch.Size([4, 1, 1536]) + t_text_token shape: torch.Size([4, 1, 1536]) + token_embed shape: torch.Size([4, 1, 1536]) + text shape: torch.Size([4, 77, 1536]) + clip_img shape: torch.Size([4, 1, 1536]) + img shape: torch.Size([4, 1024, 1536]) + """ + # print("t_img_token shape: ", t_img_token.shape) + # print(t_text_token.shape) + # print(token_embed.shape) + # print(text.shape) + # print(clip_img.shape) + # print(img.shape) + x = torch.cat((t_img_token, t_text_token, token_embed, text, clip_img, img), dim=1) num_text_tokens, num_img_tokens = text.size(1), img.size(1) @@ -279,9 +311,65 @@ def forward(self, img, clip_img, text, t_img, t_text, data_type): t_img_token_out, t_text_token_out, token_embed_out, text_out, clip_img_out, img_out = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1) img_out = self.decoder_pred(img_out) + # 
print(img_out.shape[1]) + img_out = unpatchify(img_out, self.in_chans) clip_img_out = self.clip_img_out(clip_img_out) text_out = self.text_out(text_out) return img_out, clip_img_out, text_out + + + # def forward(self, img, clip_img, text, t_img, t_text, data_type): + # _, _, H, W = img.shape + + # img = self.patch_embed(img) + + # # t_img_token = self.time_embedding(t_img) + # # t_img_token = t_img_token.unsqueeze(dim=1) + # # t_text_token = self.time_embedding(t_text) + # # t_text_token = t_text_token.unsqueeze(dim=1) + + # text = self.text_embed(text) + # clip_img = self.clip_img_embed(clip_img) + # token_embed = self.token_embedding(data_type).unsqueeze(dim=1) + + # x = torch.cat((t_img, t_text, token_embed, text, clip_img, img), dim=1) + + # num_text_tokens, num_img_tokens = text.size(1), img.size(1) + + # pos_embed = torch.cat( + # [self.pos_embed[:, :1 + 1, :], self.pos_embed_token, self.pos_embed[:, 1 + 1:, :]], dim=1) + # if H == self.img_size[0] and W == self.img_size[1]: + # pass + # else: # interpolate the positional embedding when the input image is not of the default shape + # pos_embed_others, pos_embed_patches = torch.split(pos_embed, [1 + 1 + 1 + num_text_tokens + 1, self.num_patches], dim=1) + # pos_embed_patches = interpolate_pos_emb(pos_embed_patches, (self.img_size[0] // self.patch_size, self.img_size[1] // self.patch_size), + # (H // self.patch_size, W // self.patch_size)) + # pos_embed = torch.cat((pos_embed_others, pos_embed_patches), dim=1) + + # x = x + pos_embed + # x = self.pos_drop(x) + + # skips = [] + # for blk in self.in_blocks: + # x = blk(x) + # skips.append(x) + + # x = self.mid_block(x) + + # for blk in self.out_blocks: + # x = blk(x, skips.pop()) + + # x = self.norm(x) + + # t_img_token_out, t_text_token_out, token_embed_out, text_out, clip_img_out, img_out = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1) + + # img_out = self.decoder_pred(img_out) + # img_out = unpatchify(img_out, self.in_chans) + + # clip_img_out = self.clip_img_out(clip_img_out) + + # text_out = self.text_out(text_out) + # return img_out, clip_img_out, text_out \ No newline at end of file diff --git a/load_m.py b/load_m.py new file mode 100644 index 0000000..87cfa3f --- /dev/null +++ b/load_m.py @@ -0,0 +1,131 @@ +def prepare_context(): + """ + prepare context for later use + """ + import torch + import utils + from utils import set_logger + from absl import logging + import os + import libs.autoencoder + import clip + from libs.clip import FrozenCLIPEmbedder + from libs.caption_decoder import CaptionDecoder + from libs.uvit_multi_post_ln_v1 import UViT + from configs.unidiffuserv1 import get_config + import builtins + import ml_collections + from torch import multiprocessing as mp + import accelerate + + config = get_config() + mp.set_start_method('spawn') + assert config.gradient_accumulation_steps == 1, \ + 'fix the lr_scheduler bug before using larger gradient_accumulation_steps' + accelerator = accelerate.Accelerator(gradient_accumulation_steps=config.gradient_accumulation_steps, mixed_precision="fp16") + device = accelerator.device + accelerate.utils.set_seed(config.seed, device_specific=True) + logging.info(f'Process {accelerator.process_index} using device: {device}') + + config.mixed_precision = accelerator.mixed_precision + + accelerator.wait_for_everyone() + if accelerator.is_main_process: + set_logger(log_level='info') + logging.info(config) + else: + set_logger(log_level='error') + builtins.print = lambda *args: None + logging.info(f'Run on 
{accelerator.num_processes} devices') + + train_state = utils.initialize_train_state(config, device, uvit_class=UViT) + origin_sd = torch.load("models/uvit_v1.pth", map_location='cpu') + + caption_decoder = CaptionDecoder(device=device, **config.caption_decoder) + nnet, optimizer = accelerator.prepare(train_state.nnet, train_state.optimizer) + + nnet.to(device) + lr_scheduler = train_state.lr_scheduler + autoencoder = libs.autoencoder.get_model(**config.autoencoder).to(device) + + clip_text_model = FrozenCLIPEmbedder(version=config.clip_text_model, device=device) + clip_img_model, clip_img_model_preprocess = clip.load(config.clip_img_model, jit=False) + clip_img_model.to(device).eval().requires_grad_(False) + + return { + "accelerator": accelerator, + "device": device, + 'config': config, + "train_state": train_state, + "origin_sd": origin_sd, + "caption_decoder": caption_decoder, + "nnet": nnet, + "autoencoder": autoencoder, + "clip_text_model": clip_text_model, + "clip_img_model": clip_img_model, + "clip_img_model_preprocess": clip_img_model_preprocess + } + + + +def process_one_json(json_data, image_output_path, context={}): + """ + given a json object, process the task the json describes + """ + from torch.utils.data import DataLoader + import utils + from libs.schedule import stable_diffusion_beta_schedule, Schedule, LSimple_T2I + from pathlib import Path + from libs.data import PersonalizedBasev2 + from absl import logging + import torch + from sample_fn import sample + from PIL import Image + from rembg import new_session, remove + import os + + + # accelerator = context["accelerator"] + # config = context["config"] + # device = context["device"] + # train_state = context["train_state"] + # origin_sd = context["origin_sd"] + # caption_decoder = context["caption_decoder"] + # nnet = context["nnet"] + # autoencoder = context["autoencoder"] + # clip_text_model = context["clip_text_model"] + # clip_img_model = context["clip_img_model"] + # clip_img_model_preprocess = context["clip_img_model_preprocess"] + + # # 初始化训练步数 + # train_state.step = 0 + # # 重新初始化模型 + # nnet.load_state_dict(origin_sd, False) + + """ + 处理数据部分 + """ + # process data + #image_paths = [i["path"] for i in json_data["source_group"]] + output_dir = '/home/schengwei/Competitionrepo/ot' + output_path = os.path.join(output_dir, f"{json_data['id']}.jpg") + model = 'u2netp' + image_path = json_data["source_group"][0]["path"] + image = Image.open(image_path) + output = remove(image, session=new_session(model)) + output = output.convert("RGB") + output.save(output_path) + + # config.n_samples = 4 + # config.n_iter = 1 + # images = [] + # for caption in json_data["caption_list"]: + # config.prompt = caption + # config.output = output + # paths = sample(config, nnet, clip_text_model, autoencoder, caption_decoder, device, json_data["id"], output_path=image_output_path) + # images.append({"prompt": caption, "paths": paths}) + + # return { + # "id": json_data["id"], + # "images": images + # } \ No newline at end of file diff --git a/load_model.py b/load_model.py new file mode 100644 index 0000000..6b8a6b7 --- /dev/null +++ b/load_model.py @@ -0,0 +1,261 @@ + +import json +import os +import time +from PIL import Image +import argparse +import numpy as np +from rembg import remove, new_session +from labml import monit +from labml_nn.diffusion.stable_diffusion.util import load_img_rm +import torch +from img2img_copytest import Img2Img +from configs.unidiffuserv1 import get_config + +import multiprocessing +import torch +try: + 
multiprocessing.set_start_method('spawn', force=True) + print("spawned") +except RuntimeError: + pass + +# import os +# os.environ["CUDA_VISIBLE_DEVICES"] = "4" + +TIME_ST = time.time() +TIME_ED = time.time() +TOTAL_TIME = 0 +# export CUDA_VISIBLE_DEVICES=3 + +def xiugo(oimage, image): + diff = torch.abs(oimage - image) + config = get_config() + device = config.device + + diff.to(device) + mask = diff < 4 # 10 is black + rand = torch.randn_like(oimage) + + mask.to(device) + ## 用掩码选择 t1 中的元素。这将返回一个新的张量,其中掩码为 True 的位置保持不变,其余位置为 0 + image = image * mask.float() + rand[mask] = image[mask] + return rand + + +def accelerate_model( config, folder_path,autoencoder, max_files=700): + # print(autoencoder) + json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')] + processed_files = 0 + device = config.device + for json_file in json_files: + if processed_files >= max_files: + break + with open(os.path.join(folder_path, json_file), 'r') as file: + data = json.load(file) + img_id = data["id"] + paths = [item["path"] for item in data["source_group"]] + num_images = len(paths) + num_to_process = 3 if num_images > 2 else num_images + for i in range(num_to_process): + img = Image.open(paths[i]) + # print("img.mode:",img.mode) + with monit.section("preloading:"): + session = new_session('other_models/.u2net/u2net.onnx') + img_rm = remove(img, session=session) + # print("img_rm:",img_rm.mode) + if img_rm.mode == 'RGBA': + img_rm = img_rm.convert('RGB') + # print("new_img_rm:",img_rm.mode) + if img.mode == 'RGBA': + img = img.convert('RGB') + + img_rm_tensor = load_img_rm(img_rm).to(device) # Assuming load_img_rm is defined + img_tensor = load_img_rm(img).to(device) # Assuming load_img_rm is defined + # print("img_rm_tensor",img_rm_tensor.shape) + # print("img_tensor",img_tensor.shape) + latent_rm = autoencoder.encode(img_rm_tensor) + latent_img = autoencoder.encode(img_tensor) + latent_cb = xiugo(latent_img, latent_rm) + output_path = config.accelerate_adapters + ## other_models/adapter' + if not os.path.exists(output_path): + os.makedirs(output_path, exist_ok=True) + + adapterpath = os.path.join(output_path, f"{img_id}-{i+1}.pt") + + # # import os + # torch.save(latent_cb, adapterpath) + + try: + torch.save(latent_cb, adapterpath) + except Exception as e: + print(f"Error saving file {adapterpath}: {e}") + + + processed_files += 1 + + + +def prepare_context(): + """ + prepare context for later use + """ + import torch + # import libs.autoencoder + import utils + from utils import set_logger + from absl import logging + from libs.uvit_multi_post_ln_v1 import UViT + from configs.unidiffuserv1 import get_config + import builtins + import ml_collections + from labml import monit + from torch import multiprocessing as mp + from sample_fn import get_img2img + from rembg import remove, new_session + + + + torch.set_num_threads(1) + +# /workspace/final_json_data + config = get_config() + + device = config.device + + + img2img = get_img2img() + + img2img = torch.compile(img2img, mode="reduce-overhead") + + + config.prompt = "" + nnet = img2img.model + nnet_standard = img2img.model.nnet_standard + clip_text_model = img2img.model.cond_stage_model + autoencoder = img2img.model.autoencoder + # decoder_consistency = img2img.model.decoder_consistency + caption_decoder = img2img.model.caption_decoder + clip_img_model = img2img.model.image_stage_model + clip_img_model_preprocess = img2img.model.get_clipimg_embedding + + + + return { + "device": device, + 'config': config, + "origin_sd": nnet_standard, + 
"caption_decoder": caption_decoder, + "nnet": nnet, + "autoencoder": autoencoder, + # "decoder_consistency": decoder_consistency, + "clip_text_model": clip_text_model, + "clip_img_model": clip_img_model, + "clip_img_model_preprocess": clip_img_model_preprocess + } + + +def load_json_files(path): + """ + given a directory, load all json files in that directory + return a list of json objects + """ + d_ls = [] + for file in os.listdir(path): + if file.endswith(".json"): + with open(os.path.join(path, file), 'r') as f: + json_data = json.load(f) + d_ls.append(json_data) + return d_ls + + +def process_one_json(json_data, image_output_path, context={}): + # multiprocessing.set_start_method('spawn') + + ## other_models/adapters/xxx.pt + ## img_id 5 + + """ + given a json object, process the task the json describes + """ + + import utils + from absl import logging + import torch + from sample_fn import sample + + # 初始化训练步数 + + config = context["config"] + device = context["device"] + nnet_standard = context["origin_sd"] + caption_decoder = context["caption_decoder"] + nnet = context["nnet"] + autoencoder = context["autoencoder"] + # decoder_consistency = context["decoder_consistency"] + clip_text_model = context["clip_text_model"] + + # 静态变量存储子进程引用 + if not hasattr(process_one_json, "accelerate_process"): + process_one_json.accelerate_process = multiprocessing.Process(target=accelerate_model, args=(context["config"], context["config"].modelcontext, context["autoencoder"])) + process_one_json.accelerate_process.start() + + config.n_samples = 2 + ######### 这里的才有用!!!!!!!!######## + config.n_iter = 1 + + image_paths = [i["path"] for i in json_data["source_group"]] + images = [] + for caption in json_data["caption_list"]: + config.prompt = caption + config.image_paths = image_paths + paths = sample(config, nnet, clip_text_model, nnet_standard , autoencoder, caption_decoder, device, json_data["id"], output_path=image_output_path) + images.append({"prompt": caption, "paths": paths, + }) + + + return { + "id": json_data["id"], + "images": images + } + + + +def tik(): + global TIME_ST + TIME_ST = time.time() +def tok(name): + global TIME_ED + TIME_ED = time.time() + elapsed_time = TIME_ED - TIME_ST + print(f"Time {name} elapsed: {elapsed_time}") + +def tik_tok(): + global TOTAL_TIME + TOTAL_TIME = TOTAL_TIME + TIME_ED - TIME_ST + + + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-d","--json_data_path", type=str, default="final_json_data/json", help="gived json") + # parser.add_argument("-j","--json_output_path", type=str, default="our_json_outputs", help="111111generated json output") + # parser.add_argument("-i","--image_output_path", type=str, default="our_image_outputs", help="111111generated images output") + + # parser.add_argument("-j","--json_output_path", type=str, default="our_json_imageoforigin2", help="22222generated json output") + # parser.add_argument("-i","--image_output_path", type=str, default="our_imageoforigin2", help="222222generated images output") + + # parser.add_argument("-j","--json_output_path", type=str, default="aaaaaaasaveresults/aaaajsons", help="22222generated json output") + # parser.add_argument("-i","--image_output_path", type=str, default="aaaaaaasaveresults/aaaaimages", help="222222generated images output") + + parser.add_argument("-j","--json_output_path", type=str, default="aaaaaaasaveresults/bbbbjsons", help="33333generated json output") + parser.add_argument("-i","--image_output_path", type=str, 
default="aaaaaaasaveresults/bbbbimages", help="333333generated images output") + + + + # parser.add_argument("-c","--cuda_device", type=str, default="cuda:7", help="CUDA device to use (e.g., 0, 1, 2, ...)") + return parser.parse_args() + diff --git a/main.py b/main.py new file mode 100644 index 0000000..8e521de --- /dev/null +++ b/main.py @@ -0,0 +1,90 @@ + +import json +import os +import time +import signal +import argparse +# from torch import multiprocessing + + +TIME_ST = time.time() +TIME_ED = time.time() +TOTAL_TIME = 0 + + +from load_model import prepare_context, process_one_json, accelerate_model + +def handler(signum, frame): + raise Exception("end of time") + + +def load_json_files(path): + """ + given a directory, load all json files in that directory + return a list of json objects + """ + d_ls = [] + for file in os.listdir(path): + if file.endswith(".json"): + with open(os.path.join(path, file), 'r') as f: + json_data = json.load(f) + d_ls.append(json_data) + return d_ls + + + +def tik(): + global TIME_ST + TIME_ST = time.time() +def tok(name): + global TIME_ED + TIME_ED = time.time() + print(f"Time {name} elapsed: {TIME_ED - TIME_ST}") +def tik_tok(): + global TOTAL_TIME + TOTAL_TIME = TOTAL_TIME + TIME_ED - TIME_ST + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-d","--json_data_path", type=str, default="final_json_data/json", help="file contains prompts") + parser.add_argument("-j","--json_output_path", type=str, default="json_outputs", help="file contains scores") + parser.add_argument("-i","--image_output_path", type=str, default="image_outputs", help="output dir for generated images") + return parser.parse_args() + +def main(): + + arg = get_args() + os.makedirs(arg.json_output_path, exist_ok=True) + os.makedirs(arg.image_output_path, exist_ok=True) + # load json files + json_data_ls = load_json_files(arg.json_data_path) + + # process json files + context = prepare_context() + + + + signal.signal(signal.SIGALRM, handler) + signal.alarm(60*90) + + try: + for json_data in json_data_ls: + tik() + out = process_one_json(json_data, arg.image_output_path, context) + tok(f"process_one_json: {json_data['id']}") + tik_tok() + with open(os.path.join(arg.json_output_path, f"{json_data['id']}.json"), 'w') as f: + json.dump(out, f) + except Exception as e: + print(e) + + # # Wait for the accelerate_model process to finish + # process.join() + print(f"\033[91m Total Time elapsed: {TOTAL_TIME}\033[00m") +if __name__ == "__main__": + + + main() + + + diff --git a/requirements.txt b/requirements.txt index fa36505..d20fdb9 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,19 @@ accelerate==0.23.0 peft==0.5.0 absl-py==1.4.0 -transformers==4.26.0 -einops==0.6.1 + + +# einops==0.6.1 wandb==0.15.5 ml_collections==0.1.1 -xformers==0.0.16 -triton==2.0.0 +triton==2.1.0 insightface==0.7.3 mxnet==1.9.1 -numpy==1.23.0 +# numpy==1.26.0 onnxruntime==1.15.1 onnxruntime-gpu==1.15.1 -torch==1.13.1 +torch==2.1.1 ftfy==6.1.1 regex==2023.6.3 tqdm==4.65.0 @@ -21,6 +21,15 @@ logger==1.4 # git+https://github.com/openai/CLIP.git ./CLIP +rembg[gpu] + + +gitpython +numpy +pyyaml +./dependency/annotated_deep_learning_paper_implementations +transformers==4.35.2 +diffusers[torch] \ No newline at end of file diff --git a/run.sh b/run.sh index b5899e3..2164eac 100755 --- a/run.sh +++ b/run.sh @@ -1,23 +1,75 @@ #!/bin/bash + + +# This script runs a Docker container with specified GPU device and loads a tar file containing the competition code. 
+# The script then creates directories for output files and runs the Docker container with specified volumes and entrypoint. + +# Get the tar file and GPU device from command line arguments. tar_file=$1 device=$2 -sha256=`docker load --input $tar_file | grep -Po "sha256:(\w+)" | sed 's/sha256:\(.*\)/\1/g'` +# Set the paths for the input JSON data and the output bound JSON. +json_data_path=/home/final_evaluation/test_data +bound_json_path=/home/final_evaluation/bound_json + +# Print the current dataset and bound JSON paths. +echo --- test script of final competition of lora -- +echo curring dataset: $json_data_path +echo bound json: $bound_json_path + +# Enable debugging mode. +set -x + +# Load the Docker image from the tar file and get its identifier. +# ident=`docker load --input $tar_file | python /home/PZDS/get_indent.py` +ident=$tar_file +echo loaded: $ident + +# Get the MD5 hash of the tar file identifier. +file_ident=`echo $ident | python -c "import hashlib; print(hashlib.md5(input().encode('utf-8')).hexdigest());"` +echo file_ident: $file_ident + +# Create directories for output files. +mkdir -p /home/final_jsons/$file_ident +mkdir -p /home/final_images/$file_ident +mkdir -p /home/final_scores/$file_ident + +# Run the Docker container with specified volumes and entrypoint. +docker run -it --gpus "device=${device}" --shm-size=4g --rm \ +-v /home/PZDS/common_insightface:/root/.insightface:ro \ +-v /home/final_evaluation/ImageReward:/workspace/ImageReward:ro \ +-v /home/final_evaluation/models--bert-base-uncased:/root/.cache/huggingface/hub/models--bert-base-uncased:ro \ +-v /home/PZDS/models:/workspace/models:ro \ +-v /home/PZDS/common_diffusers_model/unidiffuser_hf:/workspace/diffusers_models:ro \ +-v /home/final_evaluation/indocker_shell.sh:/workspace/indocker_shell.sh:ro \ +-v /home/train_outputs/$file_ident.log:/workspace/train_out.log \ +-v $json_data_path:/workspace/final_json_data:ro \ +-v $bound_json_path:/workspace/bound_json_outputs:ro \ +-v /home/final_jsons/$file_ident:/workspace/json_outputs \ +-v /home/final_images/$file_ident:/workspace/image_outputs \ +-v /home/final_scores/$file_ident:/workspace/score_outputs \ +-v /home/final_evaluation/main.py:/workspace/main.py:ro \ +-v /home/final_evaluation/baseline1.py:/workspace/load_model.py:ro \ +--entrypoint /bin/bash $ident -# docker run -it --gpus "device=${device}" --rm -v /home/schengwei/Competitionrepo/models:/workspace/models -v /home/schengwei/.cache:/root/.cache $sha256 +### the old one +# tar_file=$1 +# device=$2 -# -v /root/indocker_shell.sh:/workspace/indocker_shell.sh $sha256 -docker run -it --gpus "device=${device}" --rm -v /home/test01/eval_prompts_advance:/workspace/eval_prompts_advance -v /home/test01/train_data:/workspace/train_data -v /home/test01/models:/workspace/models \ +# sha256=`docker load --input $tar_file | grep -Po "sha256:(\w+)" | sed 's/sha256:\(.*\)/\1/g'` +# # docker run -it --gpus "device=${device}" --rm -v /home/schengwei/Competitionrepo/models:/workspace/models -v /home/schengwei/.cache:/root/.cache $sha256 +# # -v /root/indocker_shell.sh:/workspace/indocker_shell.sh $sha256 +# docker run -it --gpus "device=${device}" --rm -v /home/test01/eval_prompts_advance:/workspace/eval_prompts_advance -v /home/test01/train_data:/workspace/train_data -v /home/test01/models:/workspace/models \ -# docker run -it --gpus "device=${device}" --rm -v /home/wuyujia/competition/eval_prompts_advance:/workspace/eval_prompts_advance -# -v /home/wuyujia/competition/train_data:/workspace/train_data -v 
/home/wuyujia/competition/models:/workspace/models \ -# -v /home/wuyujia/competition/indocker_shell.sh:/workspace/indocker_shell.sh $sha256 -# sudo docker run -it --gpus all --rm -v /home/wuyujia/competition/eval_prompts_advance:/workspace/eval_prompts_advance -v /home/wuyujia/competition/train_data:/workspace/train_data -v /home/wuyujia/competition/models:/workspace/models -v /home/wuyujia/competition/indocker_shell.sh:/workspace/indocker_shell.sh -v /home/wuyujia/competition/sample.py:/workspace/sample.py -v /home/wuyujia/.insightface:/root/.insightface -v /home/wuyujia/.cache/huggingface:/root/.cache/huggingface xiugou:v1 +# # docker run -it --gpus "device=${device}" --rm -v /home/wuyujia/competition/eval_prompts_advance:/workspace/eval_prompts_advance +# # -v /home/wuyujia/competition/train_data:/workspace/train_data -v /home/wuyujia/competition/models:/workspace/models \ +# # -v /home/wuyujia/competition/indocker_shell.sh:/workspace/indocker_shell.sh $sha256 -# sudo docker cp b012d72bdadd:/workspace /home/wuyujia/competition +# # sudo docker run -it --gpus all --rm -v /home/wuyujia/competition/eval_prompts_advance:/workspace/eval_prompts_advance -v /home/wuyujia/competition/train_data:/workspace/train_data -v /home/wuyujia/competition/models:/workspace/models -v /home/wuyujia/competition/indocker_shell.sh:/workspace/indocker_shell.sh -v /home/wuyujia/competition/sample.py:/workspace/sample.py -v /home/wuyujia/.insightface:/root/.insightface -v /home/wuyujia/.cache/huggingface:/root/.cache/huggingface xiugou:v1 +# # sudo docker cp b012d72bdadd:/workspace /home/wuyujia/competition diff --git a/runDocker b/runDocker index d346d8d..a06a797 100755 --- a/runDocker +++ b/runDocker @@ -12,4 +12,12 @@ CUDA_VISIBLE_DEVICES=7 -docker run -it --gpus all --rm -v /home/schengwei/Competitionrepo/models:/workspace/models -v /home/schengwei/.cache:/root/.cache -v /home/schengwei/.insightface:/root/.insightface -e CUDA_VISIBLE_DEVICES=7 skddj/xiugo:v7 \ No newline at end of file +docker run -it --gpus all --rm -v /home/schengwei/Competitionrepo/models:/workspace/models -v /home/schengwei/.cache:/root/.cache -v /home/schengwei/.insightface:/root/.insightface -e CUDA_VISIBLE_DEVICES=7 skddj/xiugo:v7 + + +### remember to add the dependency models into docker like image reward model? 
| u2net + docker run -it --gpus all --rm -v /home/schengwei/Competitionrepo/models:/workspace/models -v /home/schengwei/.u2net:/root/.u2net -v /home/schengwei/.cache:/root/.cache -v /home/schengwei/.insightface:/root/.insightface xiangxiaoyu/unidiffuser:v2.0 /bin/bash + + + docker run -it --gpus all --rm -v /home/schengwei/Competitionrepo/models:/workspace/models -v /home/schengwei/.u2net:/root/.u2net -v /home/schengwei/.cache:/root/.cache -v /home/schengwei/.insightface:/root/.insightface skddj/xiugo:v7 + diff --git a/sample.py b/sample.py index 1c4b77d..084d104 100755 --- a/sample.py +++ b/sample.py @@ -27,10 +27,13 @@ import numpy as np import json from libs.uvit_multi_post_ln_v1 import UViT -from peft import inject_adapter_in_model, LoraConfig,get_peft_model +from peft import inject_adapter_in_model, LoraConfig,get_peft_model, AdaLoraConfig from resize import resize_images_in_path -lora_config = LoraConfig( - inference_mode=False, r=64, lora_alpha=32, lora_dropout=0.1,target_modules=["qkv","fc1","fc2","proj","text_embed","clip_img_embed"] +# lora_config = LoraConfig( +# inference_mode=False, r=64, lora_alpha=32, lora_dropout=0.1,target_modules=["qkv","fc1","fc2","proj","text_embed","clip_img_embed"] +# ) +lora_config = AdaLoraConfig( + inference_mode=False, r=64, lora_alpha=32, lora_dropout=0.1,target_modules=["qkv","fc1","fc2"] ) def get_model_size(model): @@ -43,6 +46,20 @@ def get_model_size(model): return para def stable_diffusion_beta_schedule(linear_start=0.00085, linear_end=0.0120, n_timestep=1000): + """ + 根据代码,_betas 是一个用于稳定扩散过程的 beta 值序列。 + 它是通过在指定的时间步数内,在线性起始值和线性结束值之间生成一系列均匀间隔的值得到的。 + 这些值被用作稳定扩散过程中的温度参数,控制噪声的强度。 + + + _betas 是通过 stable_diffusion_beta_schedule 函数生成的, + 这个函数根据指定的线性起始值(linear_start)和线性结束值(linear_end) + 在给定的时间步数(n_timestep)内生成一个 beta 值序列。这些 beta 值控制了稳定扩散过程中的噪声强度。 + +N = len(_betas) 表示 _betas 序列的长度,即时间步数。 +在这个例子中,n_timestep 参数设置为 1000,所以 N 将等于 1000。 +这意味着扩散过程被分解为 1000 个离散的时间步。 + """ _betas = ( torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 ) @@ -87,16 +104,23 @@ def sample(prompt_index, config, nnet, clip_text_model, autoencoder, device): config = ml_collections.FrozenConfigDict(config) _betas = stable_diffusion_beta_schedule() - N = len(_betas) + N = len(_betas) # 总的时间步数 N 是扩散过程中的时间步数(Number of Timesteps) + ### 1000 个离散的时间步 use_caption_decoder = config.text_dim < config.clip_text_dim or config.mode != 't2i' - if use_caption_decoder: + ### 如果不是 t2i 模式 | text-dim(64) < clip_text_dim (768) + ### 用个线性层 linear 升到 768 应该也可以,不过 gpt2 + ### use_caption_decoder肯定是 true 了 + + if use_caption_decoder: + ### 这里如果为 true 就加载 gpt2 了 from libs.caption_decoder import CaptionDecoder caption_decoder = CaptionDecoder(device=device, **config.caption_decoder) else: caption_decoder = None + ### 这一行代码是使用clip_text_model对空字符串进行编码,并将结果存储在empty_context变量中。 empty_context = clip_text_model.encode([''])[0] def split(x): @@ -133,13 +157,20 @@ def t2i_nnet(x, timesteps, text): # text is the low dimension version of the te if config.sample.scale == 0.: return x_out + ### sample scale 最好设置为 0, 没看出来下面的有什么用噻 + + if config.sample.t2i_cfg_mode == 'empty_token': + ### 这里的 empty_context是上面直接对空字符串编码得到的,确实是 empty_token _empty_context = einops.repeat(empty_context, 'L D -> B L D', B=x.size(0)) if use_caption_decoder: - _empty_context = caption_decoder.encode_prefix(_empty_context) + _empty_context = caption_decoder.encode_prefix(_empty_context) + # 把空字符串的编码结果,用 caption_decoder 进行编码,得到的结果是 64 维的,从 768 降低到 64 维 z_out_uncond, clip_img_out_uncond, text_out_uncond = 
nnet(z, clip_img, text=_empty_context, t_img=timesteps, t_text=t_text, data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) x_out_uncond = combine(z_out_uncond, clip_img_out_uncond) + + elif config.sample.t2i_cfg_mode == 'true_uncond': text_N = torch.randn_like(text) # 3 other possible choices z_out_uncond, clip_img_out_uncond, text_out_uncond = nnet(z, clip_img, text=text_N, t_img=timesteps, t_text=torch.ones_like(timesteps) * N, @@ -162,13 +193,17 @@ def decode(_batch): contexts, img_contexts, clip_imgs = prepare_contexts(config, clip_text_model, autoencoder) - contexts_low_dim = contexts if not use_caption_decoder else caption_decoder.encode_prefix(contexts) # the low dimensional version of the contexts, which is the input to the nnet - + contexts_low_dim = contexts if not use_caption_decoder else caption_decoder.encode_prefix(contexts) + ### 通过一个 linear 层降低维度 from 768 to 64 + + _n_samples = contexts_low_dim.size(0) def sample_fn(**kwargs): - # _z_init = torch.randn(_n_samples, *config.z_shape, device=device) + + ### _z, _clip_img = sample_fn(text=contexts_low_dim) + _z_init = torch.randn(_n_samples, *config.z_shape, device=device) _clip_img_init = torch.randn(_n_samples, 1, config.clip_img_dim, device=device) @@ -183,7 +218,22 @@ def model_fn(x, t_continuous): dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=True, thresholding=False) with torch.no_grad(), torch.autocast(device_type="cuda" if "cuda" in str(device) else "cpu"): start_time = time.time() + + ### _x_init 就是上面几行随机初始化的img+clip_img ; steps就是推理步数默认 150 在 config 中; x = dpm_solver.sample(_x_init, steps=config.sample.sample_steps, eps=1. / N, T=1.) + """ + + _x_init 就是上面几行随机初始化的img+clip_img ; steps就是推理步数默认 150 在 config 中; + + x:输入数据,通常是从正态分布采样得到的初始值,位于扩散过程的某个起始时间点 T。 + +eps:采样结束的时间点,通常非常接近 0(比如 1e-4 或 1e-3),表示扩散过程的最终阶段。 + +T:采样开始的时间点,如果是 None,则使用噪声调度的默认起始时间。 + +order:DPM-Solver 的顺序,用于指定使用哪种扩散算法。 + """ + end_time = time.time() print(f'\ngenerate {_n_samples} samples with {config.sample.sample_steps} steps takes {end_time - start_time:.2f}s') @@ -349,3 +399,4 @@ def main(argv=None): # finetuned parameters: 268028672 if __name__ == "__main__": main() + diff --git a/sample.sh b/sample.sh index 4df2ef5..176e256 100755 --- a/sample.sh +++ b/sample.sh @@ -1,15 +1,4 @@ #!/bin/bash -# python sample.py --restore_path model_output/boy1 --prompt_path eval_prompts/boy1.json --output_path outputs/boy1 --weight_dir model_output/boy1 -# python sample.py --restore_path model_output/boy2 --prompt_path eval_prompts/boy2.json --output_path outputs/boy2 --weight_dir model_output/boy2 -# python sample.py --restore_path model_output/girl1 --prompt_path eval_prompts/girl1.json --output_path outputs/girl1 --weight_dir model_output/girl1 -# python sample.py --restore_path model_output/girl2 --prompt_path eval_prompts/girl2.json --output_path outputs/girl2 --weight_dir model_output/girl2 - -# python sample.py --restore_path model_output/boy1 --prompt_path eval_prompts/boy1.json --output_path outputs/boy1 --weight_dir model_output/boy1 -# python sample.py --restore_path model_output/boy2withcrossattention --prompt_path eval_prompts/boy2.json --output_path outputs/boy2 --weight_dir model_output/boy2withcrossattention -# python sample.py --restore_path model_output/girl1withcrossattention --prompt_path eval_prompts/girl1.json --output_path outputs/girl1 --weight_dir model_output/girl1withcrossattention -# python sample.py --restore_path model_output/girl2withcrossattention --prompt_path eval_prompts/girl2.json 
--output_path outputs/girl2 --weight_dir model_output/girl2withcrossattention -# python sample.py --restore_path /data/hdd3/schengwei/moutput/girl2_3000 --prompt_path /home/schengwei/Competitionrepo/eval_prompts_advance/girl2_sim.json --output_path /data/hdd3/schengwei/outputs/girl2_sim - python sample.py --restore_path model_output/boy1 --prompt_path eval_prompts_advance/boy1_sim.json --output_path outputs/boy1_sim --weight_dir model_output/boy1 python sample.py --restore_path model_output/boy2 --prompt_path eval_prompts_advance/boy2_sim.json --output_path outputs/boy2_sim --weight_dir model_output/boy2 python sample.py --restore_path model_output/girl1 --prompt_path eval_prompts_advance/girl1_sim.json --output_path outputs/girl1_sim --weight_dir model_output/girl1 diff --git a/sample_fn.py b/sample_fn.py new file mode 100644 index 0000000..b081cc3 --- /dev/null +++ b/sample_fn.py @@ -0,0 +1,360 @@ +""" +采样代码 +文件输入: + prompt, 指定的输入文件夹路径, 制定的输出文件夹路径 +文件输出: + 采样的图片, 存放于指定输出文件夹路径 +- 指定prompt文件夹位置, 选手需要自行指定模型的地址以及其他微调参数的加载方式, 进行图片的生成并保存到指定地址, 此部分代码选手可以修改。 +- 输入文件夹的内容组织方式和训练代码的输出一致 +- sample的方法可以修改 +- 生成过程中prompt可以修改, 但是指标测评时会按照给定的prompt进行测评。 +""" + +import os +import ml_collections +import torch +import random +import argparse +import utils +from libs.dpm_solver_pp import NoiseScheduleVP, DPM_Solver +import einops + +import libs.clip +from torchvision.utils import save_image, make_grid +import numpy as np +import clip +import time +import numpy as np +import json + + +from img2img_copytest import Img2Img +from configs.unidiffuserv1 import get_config +from labml import monit + + + +import json +import os +import time +from PIL import Image +import argparse +import numpy as np +from rembg import remove, new_session +from labml import monit +from labml_nn.diffusion.stable_diffusion.util import load_img_rm +import torch +from img2img_copytest import Img2Img +from configs.unidiffuserv1 import get_config +from load_model import xiugo + +from PIL import Image + +img2img = None +class Timetest: + _instance = None + def __new__(cls): + if cls._instance is None: + cls._instance = super(Timetest, cls).__new__(cls) + cls._instance.total_wait_time = 0 + return cls._instance + + def time_test(self,config,task_id): + count = count_files(config.accelerate_adapters, task_id) # 更新count + while count not in [1, 2, 3]: + time.sleep(0.5) + self.total_wait_time += 0.5 + if self.total_wait_time > 60: + raise TimeoutError("Total wait time exceeded 1 minute. 
Please contact xiugo team.") + count = count_files(config.accelerate_adapters, task_id) # 再次更新count以便检查循环条件 + +def init_img2img(): + global img2img + config = get_config() + img2img = Img2Img(config=config, ddim_steps=60, ddim_eta=0) + return img2img + +def get_img2img(): + global img2img + if img2img is None: + init_img2img() + return img2img + +def stable_diffusion_beta_schedule(linear_start=0.00085, linear_end=0.0120, n_timestep=1000): + _betas = ( + torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 + ) + return _betas.numpy() + + +def prepare_contexts(config, clip_text_model): + device = config.device + + contexts = torch.randn(config.n_samples, 77, config.clip_text_dim).to(device) + img_contexts = torch.randn(config.n_samples, 2 * config.z_shape[0], config.z_shape[1], config.z_shape[2]) + clip_imgs = torch.randn(config.n_samples, 1, config.clip_img_dim) + + prompts = [ config.prompt ] * config.n_samples + contexts = clip_text_model(prompts) + + return contexts, img_contexts, clip_imgs + + + +def unpreprocess(v): # to B C H W and [0, 1] + v = 0.5 * (v + 1.) + v.clamp_(0., 1.) + return v + + +# @torch.cuda.amp.autocast() +def get_group2(adapterpath,context,task_id): + img2img = get_img2img() + with torch.no_grad(), torch.autocast(device_type="cuda"): + + file_name = os.path.basename(adapterpath) + extracted_id = file_name.split('-')[0] + # print(extracted_id) + # print(task_id) + if int(extracted_id) != int(task_id): + raise ValueError("PLEASE Contact xiugo team!!!!!!!PLEASE") + # input_path = adapterpath[(task_id+np.random.randint(0,6)) % len(adapterpath)] + # input_path = os.path.join('final_json_data',input_path) + latent_cb = torch.load(adapterpath).to(context.device) + + img_inversion = img2img( + context=context, + latent_cb = latent_cb + ) + # img_inversion = latent_cb + return img_inversion + + +def count_files(folder_path, start_number): + """ + Count the number of files in the given folder that start with the specified number. + + :param folder_path: Path to the folder containing the files. + :param start_number: The starting number to match the filenames. + :return: The count of files starting with the given number. 
+ """ + count = 0 + prefix = f"{start_number}-" + + # List all files in the given directory + for filename in os.listdir(folder_path): + # print(filename) + # Check if the file starts with the specified number + if filename.startswith(prefix) and filename.endswith(".pt"): + count += 1 + # print(f"{start_number}",count) + return count + +def set_seed(seed: int): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +@torch.no_grad() +def sample( config, nnet, clip_text_model,nnet_standard , autoencoder, caption_decoder, device, task_id, output_path): + """ + using_prompt: if use prompt as file name + """ + n_iter = config.n_iter + + if config.get('benchmark', False): + torch.backends.cudnn.benchmark = True + torch.backends.cudnn.deterministic = False + config = ml_collections.FrozenConfigDict(config) + + _betas = stable_diffusion_beta_schedule() + N = len(_betas) + + + use_caption_decoder = config.text_dim < config.clip_text_dim or config.mode != 't2i' + + # empty_context = clip_text_model([''])[0] + + def split(x): + C, H, W = config.z_shape + z_dim = C * H * W + z, clip_img = x.split([z_dim, config.clip_img_dim], dim=1) + z = einops.rearrange(z, 'B (C H W) -> B C H W', C=C, H=H, W=W) + clip_img = einops.rearrange(clip_img, 'B (L D) -> B L D', L=1, D=config.clip_img_dim) + return z, clip_img + + def combine(z, clip_img): + z = einops.rearrange(z, 'B C H W -> B (C H W)') + clip_img = einops.rearrange(clip_img, 'B L D -> B (L D)') + return torch.concat([z, clip_img], dim=-1) + + + def t2i_nnet(x, timesteps, text): # text is the low dimension version of the text clip embedding + """ + 1. calculate the conditional model output + 2. calculate unconditional model output + config.sample.t2i_cfg_mode == 'empty_token': using the original cfg with the empty string + config.sample.t2i_cfg_mode == 'true_uncond: using the unconditional model learned by our method + 3. 
return linear combination of conditional output and unconditional output + """ + z, clip_img = split(x) + + t_text = torch.zeros(timesteps.size(0), dtype=torch.int, device=device) + + # 假设 config.use_nnet_standard 是一个布尔值,决定是否使用 nnet_standard + use_nnet_standard = config.use_nnet_standard + + z_out, clip_img_out, text_out = nnet(z, clip_img, text=text, t_img=timesteps, t_text=t_text, + data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) + x_out = combine(z_out, clip_img_out) + + + text_N = torch.randn_like(text) # 3 other possible choices + z_out_uncond, clip_img_out_uncond, text_out = nnet(z, clip_img, text=text, t_img=timesteps, t_text=t_text, + data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) + x_out_uncond = combine(z_out_uncond, clip_img_out_uncond) + + if use_nnet_standard: + z_out_standard, clip_img_out_standard, text_out_standard = nnet_standard(z, clip_img, text=text, t_img=timesteps, t_text=t_text, + data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) + x_out_standard = combine(z_out_standard, clip_img_out_standard) + + + z_out_uncond_standard, clip_img_out_uncond_standard, text_out_uncond_standard = nnet_standard(z, clip_img, text=text_N, t_img=timesteps, t_text=torch.ones_like(timesteps) * N, + data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) + x_out_uncond_standard = combine(z_out_uncond_standard, clip_img_out_uncond_standard) + + # 根据 config.sample.scale 返回不同的结果 + if config.sample.scale == 0.: + return x_out + else: + return x_out + config.sample.scale * (x_out_standard - x_out_uncond_standard) + else: + return x_out + config.sample.scale * (x_out - x_out_uncond) + + # z_out, clip_img_out, text_out = nnet(z, clip_img, text=text, t_img=timesteps, t_text=t_text, + # data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) + # x_out = combine(z_out, clip_img_out) + # z_out_standard, clip_img_out_standard, text_out_standard = nnet_standard(z, clip_img, text=text, t_img=timesteps, t_text=t_text, + # data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) + # x_out_standard = combine(z_out_standard, clip_img_out_standard) + # if config.sample.scale == 0.: + # return x_out + + # text_N = torch.randn_like(text) # 3 other possible choices + # z_out_uncond_standard, clip_img_out_uncond_standard, text_out_uncond_standard = nnet_standard(z, clip_img, text=text_N, t_img=timesteps, t_text=torch.ones_like(timesteps) * N, + # data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) + # z_out_uncond, clip_img_out_uncond, text_out_uncond = nnet(z, clip_img, text=text_N, t_img=timesteps, t_text=torch.ones_like(timesteps) * N, + # data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type) + # # x_out_uncond = combine(z_out_uncond, clip_img_out_uncond) + # x_out_uncond_standard = combine(z_out_uncond_standard, clip_img_out_uncond_standard) + + + # # return x_out + config.sample.scale * (x_out - x_out_uncond) + # return x_out + config.sample.scale * (x_out_standard - x_out_uncond_standard) + + + contexts, img_contexts, clip_imgs = prepare_contexts(config, clip_text_model) + rm_contexts = contexts + contexts_low_dim = contexts if not use_caption_decoder else caption_decoder.encode_prefix(contexts) # the low dimensional version of the contexts, which is the input to the nnet + # print(contexts_low_dim.shape) + # exit() + _n_samples = 
contexts_low_dim.size(0) + + + def sample_fn(**kwargs): + + _z_init = torch.randn(_n_samples, *config.z_shape, device=device) + _clip_img_init = torch.randn(_n_samples, 1, config.clip_img_dim, device=device) + _x_init = combine(_z_init, _clip_img_init) + + noise_schedule = NoiseScheduleVP(schedule='discrete', betas=torch.tensor(_betas, device=device).float()) + + def model_fn(x, t_continuous): + t = t_continuous * N + return t2i_nnet(x, t, **kwargs) + + dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=True, thresholding=False) + # print(config.sample.sample_steps) + with torch.no_grad(), torch.autocast(device_type="cuda" if "cuda" in str(device) else "cpu"), monit.section('Sample:'): + # x = dpm_solver.sample(_x_init, steps=config.sample.sample_steps, eps=1. / N, T=1.) + x = dpm_solver.sample(_x_init, steps=10, eps=1. / N, T=1.) + + _z, _clip_img = split(x) + return _z, _clip_img + if not os.path.exists("other_models/adapter"): + os.makedirs("other_models/adapter") + test_instance = Timetest() + samples = None + for i in range(n_iter): + _z, _clip_img = sample_fn(text=contexts_low_dim) # conditioned on the text embedding + # print(_z) + # print(_z.shape) + new_samples = unpreprocess(autoencoder.decode(_z)) + # device = _z.device + # new_samples = unpreprocess(decoder_consistency(_z//0.18215, device)) + + + if samples is None: + samples = new_samples + else: + samples = torch.vstack((samples, new_samples)) + error_count = 0 + + + for i in range(4 - config.n_samples): + # image_paths = config.image_paths + """ + ## input args : 1. encode 过的原图,需要原图路径,并且使用 encode 函数进行 encode + ## 2. prompts 上面的 contexts + # contexts,_ = torch.chunk(contexts,chunks=2,dim=0) + ### print(contexts.shape) + ## torch.Size([2, 77, 768]) + # new_z = get_group2(image_paths,contexts,task_id) + # print(contexts_low_dim.shape) + # print(rm_contexts.shape) + ### torch.Size([2, 77, 64]) + """ + # torch.Size([2, 77, 768]) + source_tensor = torch.empty(1, 77, 64) + rm_contexts = torch.randn_like(source_tensor).to(contexts.device) + # rm_contexts = torch.zeros_like(source_tensor).to(contexts.device) + + # task_id - rand().pt as input (from 1 - 3 ) + # count = count_files(config.accelerate_adapters,task_id) + + test_instance.time_test(config,task_id) + print(test_instance.total_wait_time) + count = count_files(config.accelerate_adapters, task_id) + if count == 1: + # print("skdljfklsdjafkljdsakfj") + random_integer = 1 + else: + random_integer = np.random.randint(1, count+1) + + + adapterpath = config.accelerate_adapters + + adapterpath = os.path.join(adapterpath,f"{task_id}-{random_integer}.pt") + ## other_models/adapters/...pt + new_z = get_group2(adapterpath,rm_contexts,task_id) + device = new_z.device + with monit.section('autoencoder_decode:'): + new_samples = unpreprocess(autoencoder.decode(new_z)) + + # new_samples = unpreprocess(decoder_consistency(new_z//0.18215, device)) + samples = torch.vstack((samples, new_samples)) + + + + paths = [] + + for idx, sample in enumerate(samples): + save_path = os.path.join(output_path, f'{task_id}-{config.prompt}-{idx:03}.jpg') + paths.append(save_path) + # with monit.section(f'Save Image {task_id}:'): + save_image(sample, save_path) + + return paths + diff --git a/score.py b/score.py old mode 100755 new mode 100644 index 80ec2a5..0ab9c10 --- a/score.py +++ b/score.py @@ -8,17 +8,29 @@ import argparse import warnings from score_utils.face_model import FaceAnalysis +import ImageReward as RM +from typing import Union +from configs.unidiffuserv1 import get_config 
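As context for the score.py changes below: each raw metric (CLIP text similarity, face similarity, ImageReward) is averaged over the generated images for a prompt, shifted by the per-prompt minimum recorded in the bound json, and divided by the bound range; prompts whose normalized face or image-reward score falls below 0.1 are skipped. A minimal sketch of that normalization, using hypothetical numbers in place of real bound entries:

import numpy as np

def normalize(raw: float, lo: float, hi: float) -> float:
    # Rescale a raw metric against the [min, max] interval taken from the bound json.
    return (raw - lo) / (hi - lo)

scores_text = [0.26, 0.29, 0.27, 0.25]                     # hypothetical per-image CLIP text scores
bound_item = {"min_text_sim": 0.20, "max_text_sim": 0.32}  # hypothetical bound entry for one prompt
mean_text = float(np.mean(scores_text))
normed_score_text = normalize(mean_text, bound_item["min_text_sim"], bound_item["max_text_sim"])
print(f"normed_score_text = {normed_score_text:.3f}")
# The same normalization is applied to the face and image-reward means, which are
# additionally filtered out when the normalized value drops below 0.1.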
warnings.filterwarnings("ignore") class Evaluator(): def __init__(self): - self.clip_device = "cuda" if torch.cuda.is_available() else "cpu" - self.clip_model, self.clip_preprocess = clip.load("ViT-B/32", device=self.clip_device) + config = get_config() + config.device = "cuda:1" + self.clip_device = config.device + # self.clip_device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = self.clip_device + self.clip_model, self.clip_preprocess = clip.load("other_models/clip/ViT-B-32.pt", device=self.clip_device) self.clip_tokenizer = clip.tokenize - + self.face_model = FaceAnalysis(providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) - self.face_model.prepare(ctx_id=0, det_size=(640, 640)) + + # self.image_reward = RM.load("ImageReward-v1.0") + + self.image_reward = RM.load("./ImageReward/ImageReward.pt", med_config="./ImageReward/med_config.json") + self.face_model.prepare(ctx_id=0, det_size=(512, 512)) + def pil_to_cv2(self, pil_img): return np.array(pil_img)[:,:,::-1] @@ -54,7 +66,7 @@ def sim_face(self, img1, img2): return 0 else: similarity = feat1 @ feat2.T - return max(0,similarity.item()) + return similarity.item() def sim_face_emb(self, img1, embs): """ @@ -69,7 +81,7 @@ def sim_face_emb(self, img1, embs): return 0 else: similarity = feat1 @ embs.T - return max(0,similarity.max().item()) + return similarity.mean().item() def get_img_embedding(self, img): """ @@ -91,20 +103,7 @@ def get_text_embedding(self, text): feat /= feat.norm(dim=-1, keepdim=True) return feat - - def sim_clip_img(self, img1, img2): - """ - calcualte img img similarity using CLIP - """ - feat1 = self.get_img_embedding(img1) - feat2 = self.get_img_embedding(img2) - similarity = feat1 @ feat2.T - return max(0,similarity.item()) - - def sim_clip_imgembs(self, img, embs): - feat = self.get_img_embedding(img) - similarity = feat @ embs.T - return max(0,similarity.max().item()) + def sim_clip_text(self, img, text): """ @@ -116,135 +115,226 @@ def sim_clip_text(self, img, text): return max(0,similarity.item()) - def score1_gen_vs_img_face(self, gen, img, alpha_img=0.5, alpha_face=0.5): - img_sim = self.sim_clip_img(gen,img) - face_sim = self.sim_face(gen, img) - - return alpha_img * img_sim + alpha_face * face_sim + - def score2_gen_vs_img(self, gen, img, alpha_img=1.0): - img_sim = self.sim_clip_img(gen,img) - - return alpha_img * img_sim +def read_img_pil(p): + return Image.open(p).convert("RGB") + + + +def load_json_files(path): + """ + given a directory, load all json files in that directory + return a list of json objects + """ + d_ls = [] + for file in os.listdir(path): + if file.endswith(".json"): + with open(os.path.join(path, file), 'r') as f: + json_data = json.load(f) + d_ls.append(json_data) + return d_ls + +def pre_check(source_json_dir, gen_json_dir, bound_json_dir): + """ + 1. check common ids + 2. check enough images + 3. 
return list of tuple (source_json, gen_json, bound_json) + """ - def score3_gen_vs_text(self, gen, text, alpha_text=1.0): - text_sim = self.sim_clip_text(gen,text) - return alpha_text * text_sim - - def score4_gen_vs_text_refimg(self, gen, text, ref, alpha_text=0.5, alpha_img=0.5): - text_sim = self.sim_clip_text(gen,text) - img_sim = self.sim_clip_img(gen, ref) + id_to_source_json = {json_data["id"]: json_data for json_data in load_json_files(source_json_dir)} + id_to_gen_json = {json_data["id"]: json_data for json_data in load_json_files(gen_json_dir)} + id_to_bound_json = {json_data["id"]: json_data for json_data in load_json_files(bound_json_dir)} - return alpha_text * text_sim + alpha_img * img_sim + common_ids = set(id_to_source_json.keys()) & set(id_to_gen_json.keys()) -def read_img_pil(p): - return Image.open(p) + print(f"共有{len(common_ids)}个id") + + case_pair_ls = [] + for id in common_ids: + source_json = id_to_source_json[id] + gen_json = id_to_gen_json[id] + bound_json = id_to_bound_json[id] + + + for idx, item in enumerate(gen_json["images"]): + if item["prompt"] not in source_json["caption_list"]: + print(f"prompt {item['prompt']} not in source json") + gen_json["images"].remove(item) + if len(item["paths"]) != 4: + print(f"delete item {item}") + gen_json["images"].remove(item) + case_pair_ls.append((source_json, gen_json, bound_json)) + return case_pair_ls -def score(dataset_base, prompts_base, outputs_base): - eval = Evaluator() - - DATANAMES = ["boy1", "boy2", "girl1", "girl2"] - SIM_TASKNAMES = ['boy1_sim', 'boy2_sim', 'girl1_sim', 'girl2_sim'] - EDIT_TASKNAMES = ['boy1_edit', 'boy2_edit', 'girl1_edit', 'girl2_edit'] - - ## folder check - for taskname in DATANAMES: - task_dataset = os.path.join(dataset_base, f'{taskname}') - assert os.path.exists(task_dataset), f"Missing Dataset folder: {task_dataset}" - for taskname in SIM_TASKNAMES + EDIT_TASKNAMES: - task_prompt = os.path.join(prompts_base, f'{taskname}.json') - assert os.path.exists(task_prompt), f"Missing Prompt file: {task_prompt}" - task_output = os.path.join(outputs_base, f'{taskname}') - assert os.path.exists(task_output), f"Missing Output folder: {task_output}" - - def score_task(sample_folder, dataset_folder, prompt_json): - ## get prompt, face, and ref image from dataset folder - refs = glob.glob(os.path.join(dataset_folder, "*.jpg")) + glob.glob(os.path.join(dataset_folder, "*.jpeg")) - refs_images = [read_img_pil(ref) for ref in refs] - - refs_clip = [eval.get_img_embedding(i) for i in refs_images] - refs_clip = torch.cat(refs_clip) - #### print(refs_clip.shape) - - refs_embs = [eval.get_face_embedding(i) for i in refs_images] - refs_embs = [emb for emb in refs_embs if emb is not None] - refs_embs = torch.cat(refs_embs) - #### print(refs_embs.shape) - - - - #### print("Ref Count: ", len(refs_images)) - #### print("Emb: ", refs_embs.shape) - - pompt_scores = [] - prompts = json.load(open(prompt_json, "r")) - for prompt_index, prompt in enumerate(prompts): - sample_scores = [] - for idx in range(0,3): ## 3 generation for each prompt - sample_path = os.path.join(sample_folder,f"{prompt_index}-{idx:03}.jpg") ## for face / target reference - try: - sample = read_img_pil(sample_path) - # sample vs ref - score_face = eval.sim_face_emb(sample, refs_embs) - score_clip = eval.sim_clip_imgembs(sample, refs_clip) - # sample vs prompt - score_text = eval.sim_clip_text(sample, prompt) - sample_score = [score_face, score_clip, score_text] - except Exception as e: - #### print(e) - sample_score = [0.0, 0.0, 0.0] - #### 
print(f"Score for sample {idx}: ", sample_score) - sample_scores.append(sample_score) - pompt_score = np.mean(sample_scores, axis=0) - #### print(f"Score for prompt {prompt_index}: ", pompt_score) - pompt_scores.append(pompt_score) - task_score = np.mean(pompt_scores, axis=0) - return task_score - - ## calculate sim score - sim_scores = [] - for dataname, taskname in zip(DATANAMES, SIM_TASKNAMES): - task_dataset = os.path.join(dataset_base, f'{dataname}') - task_prompt = os.path.join(prompts_base, f'{taskname}.json') - task_output = os.path.join(outputs_base, f'{taskname}') - score = score_task(task_output, task_dataset, task_prompt) - print(f"Score for task {taskname}: ", score) - sim_scores.append(score) - print(sim_scores) - sim_ave_score = np.mean(sim_scores, axis=0) - - edit_scores = [] - for dataname, taskname in zip(DATANAMES, EDIT_TASKNAMES): - task_dataset = os.path.join(dataset_base, f'{dataname}') - task_prompt = os.path.join(prompts_base, f'{taskname}.json') - task_output = os.path.join(outputs_base, f'{taskname}') - score = score_task(task_output, task_dataset, task_prompt) - print(f"Score for task {taskname}: ", score) - edit_scores.append(score) - print(edit_scores) - edit_ave_score = np.mean(edit_scores, axis=0) - - score_dict = { - "复现功能的人脸相似度": sim_ave_score[0], - "复现功能的CLIP图片相似度": sim_ave_score[1], +def score(ev, source_json, gen_json, bound_json, out_json_dir): + + + # get ref images + ref_image_paths = [ i["path"] for i in source_json["source_group"]] + ref_face_embs = [ev.get_face_embedding(read_img_pil(i)) for i in ref_image_paths] + ref_face_embs = [emb for emb in ref_face_embs if emb is not None] # remove None + ref_face_embs = torch.cat(ref_face_embs) - "编辑功能的人脸相似度": edit_ave_score[0], - "编辑功能的CLIP图片相似度": edit_ave_score[1], - "编辑功能的图文匹配度": edit_ave_score[2], - } - print(f"\033[91m 最终结果:\n{score_dict}\033[00m") - return score_dict + text_ac_scores = 0 + face_ac_scores = 0 + image_reward_ac_scores = 0 + image_reward_ac_decrease = 0 + + normed_text_ac_scores = 0 + normed_face_ac_scores = 0 + normed_image_reward_ac_scores = 0 + normed_image_reward_ac_decrease = 0 + out_json = {"id": gen_json["id"], "images": []} + commom_prompts = set([item["prompt"] for item in gen_json["images"]]) & set([item["prompt"] for item in bound_json["images"]]) + prompt_to_item = {item["prompt"]: item for item in gen_json["images"]} + bound_prompt_to_item = {item["prompt"]: item for item in bound_json["images"]} + if len(commom_prompts) != len(bound_json["images"]): + print(f"共有{len(commom_prompts)}个prompt, bound json有{len(bound_json['images'])}个prompt") + print(bound_json) + + for prompt in commom_prompts: + item = prompt_to_item[prompt] + bound_item = bound_prompt_to_item[prompt] + + assert item["prompt"] == bound_item["prompt"], f"prompt {item['prompt']} not equal to bound prompt {bound_item['prompt']}" + if len(item["paths"]) < 4: + continue + + # clip text similarity + samples = [read_img_pil(sample_path) for sample_path in item["paths"]] + scores_text = [ev.sim_clip_text(sample, item["prompt"]) for sample in samples] + mean_text = np.mean(scores_text) + + # image reward + scores_image_reward = [ev.image_reward.score(item["prompt"], sample_path) for sample_path in item["paths"]] + mean_image_reward = np.mean(scores_image_reward) + + # hps v2 + # scores_hpsv2 = [ev.hpsv2_score(sample, item["prompt"])[0].item() for sample in samples] + # mean_hpsv2 = np.mean(scores_hpsv2) + + # face similarity + sample_faces = [ev.get_face_embedding(sample) for sample in samples] + sample_faces = [emb 
for emb in sample_faces if emb is not None] # remove None + if len(sample_faces) <= 1: + print("too few faces") + continue + scores_face = [(sample_face @ ref_face_embs.T).mean().item() for sample_face in sample_faces] + mean_face = np.mean(scores_face) + + subed_score_text = mean_text - bound_item["min_text_sim"] + subed_score_face = mean_face - bound_item["min_face_sim"] + subed_image_reward = mean_image_reward - bound_item["min_image_reward"] + image_reward_decrease = bound_item["max_image_reward"] - mean_image_reward + + + normed_score_text = subed_score_text / (bound_item["max_text_sim"] - bound_item["min_text_sim"]) + normed_score_face = subed_score_face / (bound_item["max_face_sim"] - bound_item["min_face_sim"]) + normed_score_image_reward = subed_image_reward / (bound_item["max_image_reward"] - bound_item["min_image_reward"]) + normed_image_reward_decrease = image_reward_decrease / (bound_item["max_image_reward"] - bound_item["min_image_reward"]) + + if normed_score_image_reward < 0.1: + # print(f"Image reward too low for prompt: '{item['prompt']}' in item: {item}") + print(f"\033[91mface similarity too low for prompt:\033[0m '{item['prompt']}' in item(id):\033[91m{gen_json['id']}\033[0m") + # print("too low image reward") + continue + if normed_score_face < 0.1: + print(f"\033[91mface similarity too low for prompt:\033[0m '{item['prompt']}' in item(id):\033[91m{gen_json['id']}\033[0m") + # print(f"face similarity too low for prompt: '{item['prompt']}' in item: {item}") + # print("too low face similarity") + continue + + normed_text_ac_scores += normed_score_text + normed_face_ac_scores += normed_score_face + normed_image_reward_ac_scores += normed_score_image_reward + normed_image_reward_ac_decrease += normed_image_reward_decrease + + face_ac_scores += subed_score_face + text_ac_scores += subed_score_text + image_reward_ac_scores += subed_image_reward + image_reward_ac_decrease += image_reward_decrease + + out_json["images"].append({"prompt": item["prompt"], + "scores_text": scores_text, + "scores_face": scores_face, + "scores_image_reward": scores_image_reward, + # "scores_hpsv2": scores_hpsv2, + "subed_score_text": subed_score_text, + "subed_score_face": subed_score_face, + "subded_image_reward": subed_image_reward, + "image_reward_decrease": image_reward_decrease, + + "normed_score_text": normed_score_text, + "normed_score_face": normed_score_face, + "normed_score_image_reward": normed_score_image_reward, + "normed_image_reward_decrease": normed_image_reward_decrease}) + + with open(os.path.join(out_json_dir, f"{gen_json['id']}.json"), 'w') as f: + json.dump(out_json, f, indent=4) + + return {"text_ac_scores":text_ac_scores, + "face_ac_scores":face_ac_scores, + "image_reward_ac_scores":image_reward_ac_scores, + "image_reward_ac_decrease":image_reward_ac_decrease, + + "normed_text_ac_scores":normed_text_ac_scores, + "normed_face_ac_scores":normed_face_ac_scores, + "normed_image_reward_ac_scores":normed_image_reward_ac_scores, + "normed_image_reward_ac_decrease":normed_image_reward_ac_decrease, + } if __name__ == "__main__": parser = argparse.ArgumentParser(description='Evaluation Script') - parser.add_argument('--dataset', type=str, default='./train_data/', help='dataset folder') - parser.add_argument('--prompts', type=str, default='./eval_prompts_advance/', help='prompt folder') - parser.add_argument('--outputs', type=str, default='./outputs/', help='output folder') + parser.add_argument('--source_json_dir', type=str, default='final_json_data/json', help='task json files, 
original json data to generate images.') + + + # parser.add_argument('--gen_json_dir', type=str, default='aaaaaaasaveresults/aaaajsons', help='json after generating images.') + # parser.add_argument("--out_json_dir", type=str, default="aaaaaaasaveresults/aaaascores", help="score json ouput") + + parser.add_argument('--gen_json_dir', type=str, default='aaaaaaasaveresults/bbbbjsons', help='json after generating images.') + parser.add_argument("--out_json_dir", type=str, default="aaaaaaasaveresults/bbbbscores", help="score json ouput") + + + parser.add_argument("--bound_json_dir", type=str, default="abase_json_outputs", help="baseline score json ouput") args = parser.parse_args() + os.makedirs(args.out_json_dir, exist_ok=True) - eval_score = score(args.dataset, args.prompts, args.outputs) - print(eval_score) + pairs = pre_check(args.source_json_dir, args.gen_json_dir, args.bound_json_dir) + ev = Evaluator() + total_text_score = 0 + total_face_score = 0 + total_image_reward_score = 0 + total_image_reward_decrease = 0 + normed_total_text_score = 0 + normed_total_face_score = 0 + normed_total_image_reward_score = 0 + normed_total_image_reward_decrease = 0 + for source_json, gen_json, bound_json in pairs: + rt_dict = score(ev, source_json, gen_json, bound_json, args.out_json_dir) + + total_text_score += rt_dict["text_ac_scores"] + total_face_score += rt_dict["face_ac_scores"] + total_image_reward_score += rt_dict["image_reward_ac_scores"] + total_image_reward_decrease += rt_dict["image_reward_ac_decrease"] + + normed_total_text_score += rt_dict["normed_text_ac_scores"] + normed_total_face_score += rt_dict["normed_face_ac_scores"] + normed_total_image_reward_score += rt_dict["normed_image_reward_ac_scores"] + normed_total_image_reward_decrease += rt_dict["normed_image_reward_ac_decrease"] + print(f""" +total_text_score: {total_text_score:.4f}, +total_face_score: {total_face_score:.4f}, +total_image_reward_score: {total_image_reward_score:.4f}, +total_image_reward_decrease:{total_image_reward_decrease:.4f}, + +normed_total_text_score: {normed_total_text_score:.4f}, +normed_total_face_score: {normed_total_face_score:.4f}, +normed_total_image_reward_score: {normed_total_image_reward_score:.4f}, +normed_total_image_reward_decrease:{normed_total_image_reward_decrease:.4f}, + """) + \ No newline at end of file diff --git a/testfp16.sh b/testfp16.sh new file mode 100644 index 0000000..53d923b --- /dev/null +++ b/testfp16.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# 定义输出文件名 +output_load_model="load_model_output.txt" +output_score="score_output.txt" +total_time="total_time.txt" +combined_output="combined_output.txt" + +export PATH="/home/schengwei/anaconda3/envs/ldm/bin:$PATH" + +# export CUDA_VISIBLE_DEVICES="6" +# unset CUDA_VISIBLE_DEVICES + +# 初始化并激活 Conda 环境 +# 替换 YOUR_CONDA_PATH 为您的 conda 安装路径 +# 替换 YOUR_ENV_NAME 为您的 conda 环境名 +source /home/schengwei/anaconda3/etc/profile.d/conda.sh && conda activate ldm + +# 记录 load_model.py 的开始和结束时间 +start_load_model=$(date +%s) +/home/schengwei/anaconda3/envs/ldm/bin/accelerate launch --mixed_precision fp16 --num_processes 1 load_model.py > "$output_load_model" +end_load_model=$(date +%s) + +# 计算并记录运行时间 +echo "Runtime for load_model.py: $((end_load_model - start_load_model)) seconds" >> "$total_time" + +# 记录 score.py 的开始和结束时间 +start_score=$(date +%s) +/home/schengwei/anaconda3/envs/ldm/bin/python score.py > "$output_score" +end_score=$(date +%s) + +# 计算并记录运行时间 +echo "Runtime for score.py: $((end_score - start_score)) seconds" >> "$total_time" + +# 合并输出文件 +cat 
"$total_time" "$output_score" "$output_load_model" > "$combined_output" + +# 发送合并后的文件到您的邮箱 +# 注意:请将 YOUR_EMAIL_ADDRESS 替换成您的邮箱地址 +mail -s "Combined Output and Runtime of Scripts" yimingshi666@gmail.com < "$combined_output" diff --git a/testtorchcompile.py b/testtorchcompile.py new file mode 100644 index 0000000..9f17c14 --- /dev/null +++ b/testtorchcompile.py @@ -0,0 +1,14 @@ +import torch +import warnings + +gpu_ok = False +if torch.cuda.is_available(): + device_cap = torch.cuda.get_device_capability() + if device_cap in ((7, 0), (8, 0), (9, 0)): + gpu_ok = True + +if not gpu_ok: + warnings.warn( + "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower " + "than expected." + ) \ No newline at end of file diff --git a/train.py b/train.py index 4c50fd2..e5ddb36 100755 --- a/train.py +++ b/train.py @@ -29,8 +29,6 @@ from pathlib import Path from libs.data import PersonalizedBase, PromptDataset, collate_fn from libs.uvit_multi_post_ln_v1 import UViT -# import diffusers -# from diffusers import DiffusionPipeline from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed @@ -41,7 +39,7 @@ from accelerate.logging import get_logger import itertools import json -#from pathos.multiprocessing import ProcessingPool as Pool + from peft import inject_adapter_in_model, LoraConfig,get_peft_model @@ -260,10 +258,10 @@ def train(config): mask_size= 64 #custom_diffusion里mask_size的值为64 ) train_dataset_loader = DataLoader(train_dataset, - batch_size=4, + batch_size=config.batch_size, shuffle=True, collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), - num_workers=0, + num_workers=config.dataloader_num_workers, ) train_data_generator = utils.get_data_generator(train_dataset_loader, enable_tqdm=accelerator.is_main_process, desc='train') @@ -313,16 +311,6 @@ def train_step(): accelerator.backward(bloss) - # for name, param in nnet.named_parameters(): - # if param.grad is not None: - # print(name) - - - - # for name, param in text_encoder.named_parameters(): - # if param.grad is not None: - # print(name) - # 如果参数的梯度不为None,说明存在梯度 # Zero out the gradients for all token embeddings except the newly added @@ -505,7 +493,13 @@ def get_args(): action="store_true", help="real images as prior.", ) - + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help="Number of subprocesses to use for data loading.", + ) + parser.add_argument("--modifier_token", type=str, default="", help="modifier token") parser.add_argument( "--initializer_token", type=str, default="ktn+pll+ucd", help="A token to use as initializer word." 
@@ -556,6 +550,8 @@ def main(): config.instance_prompt = args.instance_prompt config.class_prompt = args.class_prompt + config.dataloader_num_workers = args.dataloader_num_workers + config.gradient_accumulation_steps = args.gradient_accumulation_steps config.with_prior_preservation = args.with_prior_preservation @@ -595,4 +591,6 @@ def main(): --modifier_token "" export LD_LIBRARY_PATH=/home/shiyiming/anaconda3/envs/competition/lib/python3.10/site-packages/torch/lib/ -""" \ No newline at end of file +""" + + diff --git a/train.sh b/train.sh index f95de0e..c515107 100755 --- a/train.sh +++ b/train.sh @@ -6,9 +6,9 @@ #!/bin/bash -accelerate launch --mixed_precision no --num_processes 1 train.py --train_step=3000 --instance_data_dir="train_data_crop/newboy1" --outdir="model_output/boy1" --class_data_dir="real_reg/samples_boyface" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="boy" --num_class_images=200 --instance_prompt=" a boy" --modifier_token "" -accelerate launch --mixed_precision no --num_processes 1 train.py --instance_data_dir="train_data_crop/newboy2" --outdir="model_output/boy2" --class_data_dir="real_reg/samples_boyface" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="boy" --num_class_images=200 --instance_prompt=" a boy" --modifier_token "" -accelerate launch --mixed_precision no --num_processes 1 train.py --instance_data_dir="train_data_crop/newgirl1" --outdir="model_output/girl1" --class_data_dir="real_reg/samples_girlhead" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="girl" --num_class_images=200 --instance_prompt=" a girl" --modifier_token "" -accelerate launch --mixed_precision no --num_processes 1 train.py --instance_data_dir="train_data_crop/newgirl2" --outdir="model_output/girl2" --class_data_dir="real_reg/samples_girlhead" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="girl" --num_class_images=200 --instance_prompt=" a girl" --modifier_token "" +accelerate launch --mixed_precision bf16 --num_processes 1 train.py --train_step=3000 --instance_data_dir="train_data_crop/newboy1" --outdir="model_output/boy1" --class_data_dir="real_reg/samples_boyface" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="boy" --num_class_images=200 --instance_prompt=" a boy" --modifier_token "" +accelerate launch --mixed_precision bf16 --num_processes 1 train.py --instance_data_dir="train_data_crop/newboy2" --outdir="model_output/boy2" --class_data_dir="real_reg/samples_boyface" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="boy" --num_class_images=200 --instance_prompt=" a boy" --modifier_token "" +accelerate launch --mixed_precision bf16 --num_processes 1 train.py --instance_data_dir="train_data_crop/newgirl1" --outdir="model_output/girl1" --class_data_dir="real_reg/samples_girlhead" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="girl" --num_class_images=200 --instance_prompt=" a girl" --modifier_token "" +accelerate launch --mixed_precision bf16 --num_processes 1 train.py --instance_data_dir="train_data_crop/newgirl2" --outdir="model_output/girl2" --class_data_dir="real_reg/samples_girlhead" --with_prior_preservation --prior_loss_weight=1.0 --class_prompt="girl" --num_class_images=200 --instance_prompt=" a girl" --modifier_token "" diff --git a/trainn.py b/trainn.py deleted file mode 100755 index e5fcecc..0000000 --- a/trainn.py +++ /dev/null @@ -1,663 +0,0 @@ -""" -训练代码 -代码输入: - - 数据文件夹路径, 其中包含近近脸照文件夹和全身照文件夹, - - 指定的输出路径, 用于输出模型 - - 其他的参数需要选手自行设定 -代码输出: - - 
微调后的模型以及其他附加的子模块 - -accelerate launch train.py \ - --instance_data_dir ="目标图像的数据集路径" \ - --outdir="自己的模型输出路径"\ - --class_data_dir "自己的正则数据集路径" \ - --with_prior_preservation --prior_loss_weight=1.0 \ - --class_prompt="girl" --num_class_images=200 \ - --instance_prompt="photo of a girl" \ - --modifier_token "" -""" -from accelerate import Accelerator -import hashlib -import warnings -import torch -import utils -from absl import logging -import os -#import wandb -import libs.autoencoder -import clip -import itertools -from libs.clip import CLIPEmbedder -from libs.caption_decoder import CaptionDecoder -from torch.utils.data import DataLoader -from libs.schedule import stable_diffusion_beta_schedule, Schedule, LSimple_T2I -import argparse -import yaml -import datetime -from transformers import AutoTokenizer,PretrainedConfig -from pathlib import Path -from libs.data import PersonalizedBase, PromptDataset, collate_fn -from libs.uvit_multi_post_ln_v1 import UViT -# import diffusers -# from diffusers import DiffusionPipeline -from accelerate import Accelerator -from accelerate.logging import get_logger -from accelerate.utils import ProjectConfiguration, set_seed -from pathlib import Path -from transformers import CLIPTextModel -import tqdm - -from accelerate.logging import get_logger -import itertools -import json -#from pathos.multiprocessing import ProcessingPool as Pool -from peft import inject_adapter_in_model, LoraConfig,get_peft_model - -os.environ['CUDA_VISIBLE_DEVICES']="1, 2" - -lora_config = LoraConfig( - inference_mode=False, - lora_alpha=16, - lora_dropout=0.1, - r=24, - bias="none", - # target_modules=["qkv","proj"], -) - - -# 保存text encoder中新增token的embedding - -def save_new_embed(clip_text_model, modifier_token_id, accelerator, args, outdir): - """Saves the new token embeddings from the text encoder.""" - logger.info("Saving embeddings") - learned_embeds = accelerator.unwrap_model(clip_text_model).get_input_embeddings().weight - for x, y in zip(modifier_token_id, args.modifier_token): - learned_embeds_dict = {} - learned_embeds_dict[y] = learned_embeds[x] - torch.save(learned_embeds_dict, f"{outdir}/{y}.bin") - -logger = get_logger(__name__) - -def freeze_params(params): - for param in params: - param.requires_grad = False -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): - """ - 根据预训练模型的名称或路径导入相应的模型类。 - - 参数: - pretrained_model_name_or_path: 预训练模型的名称或路径。 - revision: 模型的版本号。 - - 返回: - 模型类。 - - 根据模型配置获取模型类,支持的模型包括 CLIPTextModel 和 RobertaSeriesModelWithTransformation。 - 如果模型类不在支持列表中,将引发 ValueError 异常。 - """ - # 从预训练配置中获取文本编码器配置 - text_encoder_config = PretrainedConfig.from_pretrained( - pretrained_model_name_or_path, - subfolder="text_encoder", - revision=revision, - ) - # 获取模型类名 - model_class = text_encoder_config.architectures[0] - - if model_class == "CLIPTextModel": - from transformers import CLIPTextModel - - return CLIPTextModel - - return RobertaSeriesModelWithTransformation - else: - # 模型类不在支持列表中,引发 ValueError 异常 - raise ValueError(f"{model_class} is not supported.") - - - - - -def train(config): - - """ - prepare models - 准备各类需要的模型 - """ - accelerator, device = utils.setup(config) - - args = get_args() - concepts_list = args.concepts_list - # concepts_list = [ - # { - # "instance_prompt": 'photo of a girl', #photo of a girl - # "class_prompt": 'girl',#girl - # "instance_data_dir": './train_data/oldgirl2',#./train_data/girl2 - # "class_data_dir": './real_reg/samples_girlbody/',#./real_reg/samples_person/ - # } - # ] - # Generate 
class images if prior preservation is enabled. - - if config.with_prior_preservation: - for i, concept in enumerate(concepts_list): - # 目录文件处理 - class_images_dir = Path(concept["class_data_dir"]) - if not class_images_dir.exists(): - class_images_dir.mkdir(parents=True, exist_ok=True) - if config.real_prior: - assert ( - class_images_dir / "images" - ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {config.num_class_images}" - assert ( - len(list((class_images_dir / "images").iterdir())) == config.num_class_images - ), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {config.num_class_images}" - assert ( - class_images_dir / "caption.txt" - ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {config.num_class_images}" - assert ( - class_images_dir / "images.txt" - ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {config.num_class_images}" - concept["class_prompt"] = os.path.join(class_images_dir, "caption.txt") - concept["class_data_dir"] = os.path.join(class_images_dir, "images.txt") - concepts_list[i] = concept - accelerator.wait_for_everyone() - - pretrained_model_name_or_path = "/home/wuyujia/.cache/huggingface/hub/models--CompVis--stable-diffusion-v1-4/snapshots/133a221b8aa7292a167afc5127cb63fb5005638b" - # pretrained_model_name_or_path = "huggingface" - #pretrained_model_name_or_path = "/home/shiyiming/.cache/huggingface/hub/models--CompVis--stable-diffusion-v1-4/snapshots/b95be7d6f134c3a9e62ee616f310733567f069ce" - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, - subfolder="tokenizer", - revision = None, - use_fast=False, - ) - text_encoder_cls = import_model_class_from_model_name_or_path(pretrained_model_name_or_path , config.revision) - text_encoder = text_encoder_cls.from_pretrained( - pretrained_model_name_or_path, subfolder="text_encoder", revision=config.revision - ) - # text_encoder = CLIPTextModel.from_pretrained( - # pretrained_model_name_or_path, subfolder="text_encoder", revision=config.revision - # ) - text_encoder.to(device) - train_state = utils.initialize_train_state(config, device, uvit_class=UViT,text_encoder = text_encoder) - - - - caption_decoder = CaptionDecoder(device=device, **config.caption_decoder) - - - nnet, optimizer = accelerator.prepare(train_state.nnet, train_state.optimizer) - nnet.to(device) - # nnet = get_peft_model(nnet,lora_config) - # for i in range (15): - # module = nnet.in_blocks[i].attn - # module = inject_adapter_in_model(lora_config, module) - # module = nnet.mid_block - # module = inject_adapter_in_model(lora_config, module) - # for i in range (15): - # module = nnet.out_blocks[i].attn - # module = inject_adapter_in_model(lora_config, module) - # print("success_add_lora") - # 全参微调不加lora - # for name,param in nnet.named_parameters(): - # param.requires_grad=True - # for name,param in nnet.named_parameters(): - # if 'lora_adapters_ttoi' in name or 'lora_adapters_itot' in name: - # param.requires_grad = False - - - - # # 非Lora部分不计算梯度 - # for name,param in nnet.named_parameters(): - # if 'lora_attention' in name or 'token_embedding' in name: - # param.requires_grad = True - # else: - # param.requires_grad=False - - # for name,param in 
nnet.named_parameters(): - # if 'lora' in name or 'token_embedding' in name: - # param.requires_grad = True - # else: - # param.requires_grad=False - - # for name,param in nnet.named_parameters(): - # if 'lora_attention' in name or 'token_embedding' in name or 'lora_adapters_ttoi' in name or 'lora_adapters_itot' in name: - # param.requires_grad = True - # else: - # param.requires_grad=False - - # check the nnet's parameters if they are frozen - # for name, param in nnet.named_parameters(): - # print(f'{name}: requires_grad={param.requires_grad}') - - lr_scheduler = train_state.lr_scheduler - - autoencoder = libs.autoencoder.get_model(**config.autoencoder).to(device) - - autoencoder.requires_grad = False - - - - - # Modify the code of custom diffusion to directly import the clip text encoder - # instead of freezing all parameters. - # clip_text_model = CLIPEmbedder(version=config.clip_text_model, device=device) - - - clip_img_model, clip_img_model_preprocess = clip.load(config.clip_img_model, jit=False) - # clip_img_model.to(device).eval().requires_grad_(False) - clip_img_model.to(device).requires_grad_(False) - - # Adding a modifier token which is optimized #### 来自Textual inversion代码 - # Code taken from https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py - # add modifier token - modifier_token_id = [] - initializer_token_id = [] - - if args.modifier_token is not None: - - args.modifier_token = args.modifier_token.split("+")#[''] - args.initializer_token = config.initializer_token.split("+")#['ktn', 'pll', 'ucd'] - - if len(args.modifier_token) > len(args.initializer_token): - raise ValueError("You must specify + separated initializer token for each modifier token.") - for modifier_token, initializer_token in zip( - args.modifier_token, args.initializer_token[: len(args.modifier_token)] - ): - # Add the placeholder token in tokenizer - #在添加占位符标记时,通常会将占位符添加到词汇表(vocabulary)中, - #以便在处理文本时能够正确地处理这个占位符。占位符可以在模型训练、文本生成、填充序列等任务中起到重要的作用。 - - num_added_tokens = tokenizer.add_tokens(modifier_token) - if num_added_tokens == 0: - raise ValueError( - f"The tokenizer already contains the token {modifier_token}. Please pass a different" - " `modifier_token` that is not already in the tokenizer." 
- ) - - # Convert the initializer_token, placeholder_token to ids - token_ids = tokenizer.encode([initializer_token], add_special_tokens=False) - - #[42170] - #ktn - - # Check if initializer_token is a single token or a sequence of tokens - if len(token_ids) > 1: - raise ValueError("The initializer token must be a single token.") - - initializer_token_id.append(token_ids[0]) - modifier_token_id.append(tokenizer.convert_tokens_to_ids(modifier_token)) - print("modifier_token_id",modifier_token_id) - - - # Resize the token embeddings as we are adding new special tokens to the tokenizer - text_encoder.resize_token_embeddings(len(tokenizer))#从40408变为40409 - - # Initialise the newly added placeholder token with the embeddings of the initializer token - token_embeds = text_encoder.get_input_embeddings().weight.data - for x, y in zip(modifier_token_id, initializer_token_id): - token_embeds[x] = token_embeds[y] - - # Freeze all parameters except for the token embeddings in text encoder - params_to_freeze = itertools.chain( - text_encoder.text_model.encoder.parameters(), - text_encoder.text_model.final_layer_norm.parameters(), - text_encoder.text_model.embeddings.position_embedding.parameters(), - ) - freeze_params(params_to_freeze) - - - """ - 处理数据部分 - """ - # process data - train_dataset = PersonalizedBase( - concepts_list=concepts_list, - num_class_images=config.num_class_images, - size=config.resolution, # 设置的默认为 512 - center_crop=config.center_crop, - tokenizer_max_length=77, - tokenizer=tokenizer, - config = config, - hflip=config.hflip, - # mask_size= autoencoder.encode(torch.randn(1, 3, config.resolution, config.resolution).to(dtype=torch.float16).to(accelerator.device) - # ) - # .latent_dist.sample() - # .size()[-1], - mask_size= 64 #custom_diffusion里mask_size的值为64 - ) - train_dataset_loader = DataLoader(train_dataset, - batch_size=2, - shuffle=True, - collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), - num_workers=0, - ) - - train_data_generator = utils.get_data_generator(train_dataset_loader, enable_tqdm=accelerator.is_main_process, desc='train') - - logging.info("saving meta data") - os.makedirs(config.meta_dir, exist_ok=True) - with open(os.path.join(config.meta_dir, "config.yaml"), "w") as f: - f.write(yaml.dump(config)) - f.close() - - _betas = stable_diffusion_beta_schedule() - schedule = Schedule(_betas) - logging.info(f'use {schedule}') - # for name, param in nnet.named_parameters(): - # param.requires_grad = True - # for name, param in nnet.named_parameters(): - # if 'lora_adapters_itot' not in name and 'lora_adapters_ttoi' not in name: - # param.requires_grad = False - # for name, param in nnet.named_parameters(): - # if 'text_embed' in name or 'token_embedding' in name: - # param.requires_grad = True - - # 验证哪些参数被冻结 - for name, param in nnet.named_parameters(): - if param.requires_grad: - print(f"未冻结的参数: {name}") - - - # total_frozen_params = sum(p.numel() for p in text_encoder.parameters() if p.requires_grad) - - # 77560320 lora_adapter+text_embedding 37946112 token_embedding - # INFO - nnet has 1029970000 parameters - # INFO - text_encoder has 123060480 parameters - # text_encoder = accelerator.prepare(text_encoder) - def train_step(): - metrics = dict() - - text, img, img4clip, mask = next(train_data_generator) - img = img.to(device) - text = text.to(device) - img4clip = img4clip.to(device) - data_type = torch.float32 - mask = mask.to(device) - # with torch.no_grad(): - z = autoencoder.encode(img) - clip_img = 
clip_img_model.encode_image(img4clip).unsqueeze(1).contiguous() - text = text_encoder(text)[0] - text = caption_decoder.encode_prefix(text) - #z= false text = true - - bloss = LSimple_T2I(img=z,clip_img=clip_img, text=text, data_type=data_type, nnet=nnet, schedule=schedule, device=device, config=config,mask=mask) - # bloss.requires_grad = True - - accelerator.backward(bloss) - # for name, param in nnet.named_parameters(): - # if param.grad is not None: - # print(name) - - - - # for name, param in text_encoder.named_parameters(): - # if param.grad is not None: - # print(name) - # 如果参数的梯度不为None,说明存在梯度 - - - # Zero out the gradients for all token embeddings except the newly added - # embeddings for the concept, as we only want to optimize the concept embeddings - if True: - # 谁给删了,而且改回来了,下面这个 if 语句没什么大用,都是一样的效果 - # if accelerator.num_processes > 1: - # grads_text_encoder = text_encoder.get_input_embeddings().weight.grad - # else: - # grads_text_encoder = text_encoder.get_input_embeddings().weight.grad - grads_text_encoder = text_encoder.get_input_embeddings().weight.grad - # Get the index for tokens that we want to zero the grads for - index_grads_to_zero = torch.arange(len(tokenizer)) != modifier_token_id[0] - for i in range(len(modifier_token_id[1:])): - index_grads_to_zero = index_grads_to_zero & ( - torch.arange(len(tokenizer)) != modifier_token_id[i] - ) - grads_text_encoder.data[index_grads_to_zero, :] = grads_text_encoder.data[ - index_grads_to_zero, : - ].fill_(0) - - - - params_to_clip = ( - itertools.chain(text_encoder.parameters(), nnet.parameters()) - if args.modifier_token is not None - else nnet.parameters() - ) - accelerator.clip_grad_norm_(params_to_clip, config.max_grad_norm) - - - # 更新参数 - optimizer.step() - lr_scheduler.step() - # train_state.ema_update(config.get('ema_rate', 0.9999))这个参数影响添加peft训练 - train_state.step += 1 - - optimizer.zero_grad() - metrics['bloss'] = accelerator.gather(bloss.detach().mean()).mean().item() - # metrics['loss_img'] = accelerator.gather(loss_img.detach().mean()).mean().item() - # metrics['loss_clip_img'] = accelerator.gather(loss_clip_img.detach().mean()).mean().item() - # metrics['scale'] = accelerator.scaler.get_scale() - metrics['lr'] = train_state.optimizer.param_groups[0]['lr'] - - return metrics - - # @torch.no_grad() - # @torch.autocast(device_type='cuda') - # def eval(total_step): - # """ - # write evaluation code here - # """ - - # return - - def loop(): - log_step = config.log_interval - # log_step = 0 - # eval_step = 1000000 - save_step = config.save_interval # 100 - # save_step = 0 - count = 0 - while True: - nnet.train() - with accelerator.accumulate(nnet),accelerator.accumulate(text_encoder): - metrics = train_step() - print("metrics",metrics) - count+=1 - - accelerator.wait_for_everyone() - - if accelerator.is_main_process: - # nnet.eval() - total_step = train_state.step * config.batch_size - if total_step >= log_step: - logging.info(utils.dct2str(dict(step=total_step, **metrics))) - # wandb.log(utils.add_prefix(metrics, 'train'), step=total_step) - # train_state.save(os.path.join(config.log_dir, f'{total_step:04}.ckpt')) - log_step += config.log_interval - - # if total_step >= eval_step: - # eval(total_step) - # eval_step += config.eval_interval - - # if total_step >= config.save_interval :#save_step = 300 - # logging.info(f'Save and eval checkpoint {total_step}...') - # train_state.save(os.path.join(config.ckpt_root, f'{total_step:04}.ckpt')) - # save_step += config.save_interval - - if total_step >= 800: - 
logging.info(f"saving final ckpts to {config.outdir}...") - save_new_embed(text_encoder, modifier_token_id, accelerator, args, args.outdir) - # train_state.save(os.path.join(config.outdir, 'final.ckpt')) - train_state.save_lora(os.path.join(config.outdir, 'lora.pt.tmp')) - break - - - loop() - -def get_args(): - parser = argparse.ArgumentParser() - # key args - # parser.add_argument('-d', '--data', type=str, default="train_data/girl2", help="datadir") - parser.add_argument('-o', "--outdir", type=str, default="model_ouput/girl2", help="output of model") - # args of logging - parser.add_argument("--logdir", type=str, default="logs", help="the dir to put logs") - parser.add_argument("--nnet_path", type=str, default="models/uvit_v1.pth", help="nnet path to resume") - parser.add_argument("--hflip", action="store_true", help="Apply horizontal flip data augmentation.") - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--concepts_list", - type=str, - default=None, - required=False, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default=None, - required=True, - help="The prompt with identifier specifying the instance", - ) - parser.add_argument( - "--class_prompt", - type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") - parser.add_argument( - "--num_class_images", - type=int, - default=200, - help=( - "Minimal class images for prior preservation loss. If there are not enough images already present in" - " concepts_list, additional images will be sampled with class_prompt." - ), - ) - - # parser.add_argument( - # "--logging_dir", - # type=str, - # default="logs", - # help=( - # "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - # " *outdir/runs/**CURRENT_DATETIME_HOSTNAME***." - # ), - # ) - parser.add_argument( - "--instance_data_dir", - type=str, - default=None, - help="A folder containing the training data of instance images.", - ) - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--real_prior", - default=True, - action="store_true", - help="real images as prior.", - ) - - parser.add_argument("--modifier_token", type=str, default="", help="modifier token") - parser.add_argument( - "--initializer_token", type=str, default="ktn+pll+ucd", help="A token to use as initializer word." 
- ) - - - - args = parser.parse_args() - - if args.with_prior_preservation: - if args.concepts_list is None: - args.concepts_list = [ - { - "instance_prompt": args.instance_prompt, #photo of a girl - "class_prompt": args.class_prompt,#girl - "instance_data_dir": args.instance_data_dir,#./path-to-images/ - "class_data_dir": args.class_data_dir,#./real_reg/samples_person/ - } - ] - - if args.class_prompt is None: - raise ValueError("You must specify prompt for class images.") - else: - # logger is not available yet - if args.concepts_list is not None: - warnings.warn("You need not use --concepts_list without --with_prior_preservation.") - if args.class_prompt is not None: - warnings.warn("You need not use --class_prompt without --with_prior_preservation.") - - - - return args - -def main(): - print("main start!") - # 赛手需要根据自己的需求修改config file - from configs.unidiffuserv1 import get_config - config = get_config() - config_name = "unidiffuserv1" - args = get_args() - config.log_dir = args.logdir - config.outdir = args.outdir - config.data = args.instance_data_dir - config.modifier_token = args.modifier_token - config.initializer_token = args.initializer_token - config.prior_loss_weight = args.prior_loss_weight - config.instance_prompt = args.instance_prompt - config.class_prompt = args.class_prompt - - config.gradient_accumulation_steps = args.gradient_accumulation_steps - config.with_prior_preservation = args.with_prior_preservation - - config.real_prior = args.real_prior - config.num_class_images = args.num_class_images - config.hflip = args.hflip - - data_name = Path(config.data).stem - - now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - config.workdir = os.path.join(config.log_dir, f"{config_name}-{data_name}-{now}") - config.ckpt_root = os.path.join(config.workdir, 'ckpts') - config.meta_dir = os.path.join(config.workdir, "meta") - config.nnet_path = args.nnet_path - os.makedirs(config.workdir, exist_ok=True) - - train(config) - - - - -if __name__ == "__main__": - main() - - -""" -nohup accelerate launch trainn.py \ - --instance_data_dir="train_data/newboy1" \ - --outdir="model_output/boy1"\ - --class_data_dir="real_reg/samples_boyface" \ - --with_prior_preservation --prior_loss_weight=1.0 \ - --class_prompt="boy" --num_class_images=200 \ - --instance_prompt=" a boy" \ - --modifier_token ""\ - > output.log 2>&1 & - - export LD_LIBRARY_PATH=/home/shiyiming/anaconda3/envs/competition/lib/python3.10/site-packages/torch/lib/ -""" \ No newline at end of file diff --git a/utils.py b/utils.py index d8b1bd3..b5f3eef 100755 --- a/utils.py +++ b/utils.py @@ -1,331 +1,3 @@ -# import torch -# import torch.nn as nn -# import numpy as np -# import os -# from absl import logging -# import sys -# from pathlib import Path -# from tqdm import tqdm -# from PIL import Image, ImageDraw, ImageFont -# from libs.clip import FrozenCLIPEmbedder -# import itertools -# from libs.clip import CLIPEmbedder -# from peft import inject_adapter_in_model, LoraConfig,get_peft_model -# lora_config = LoraConfig( -# r=128, lora_alpha=90, lora_dropout=0.1,target_modules=["qkv","fc1","fc2","proj","to_out","to_q","to_k","to_v","text_embed","clip_img_embed"] -# # target_modules=["qkv","fc1","fc2","proj"] -# ) - -# def get_config_name(): -# argv = sys.argv -# for i in range(1, len(argv)): -# if argv[i].startswith('--config='): -# return Path(argv[i].split('=')[-1]).stem - -# def get_data_name(): -# argv = sys.argv -# for i in range(1, len(argv)): -# if argv[i].startswith('--data='): -# return 
Path(argv[i].split('=')[-1]).stem - - -# def set_logger(log_level='info', fname=None): -# import logging as _logging -# handler = logging.get_absl_handler() -# formatter = _logging.Formatter('%(asctime)s - %(filename)s - %(message)s') -# handler.setFormatter(formatter) -# logging.set_verbosity(log_level) -# if fname is not None: -# handler = _logging.FileHandler(fname) -# handler.setFormatter(formatter) -# logging.get_absl_logger().addHandler(handler) - - -# def dct2str(dct): -# return str({k: f'{v:.6g}' for k, v in dct.items()}) - - - -# def get_optimizer(params, name, **kwargs): -# if name == 'adam': -# from torch.optim import Adam -# return Adam(params, **kwargs) -# elif name == 'adamw': -# from torch.optim import AdamW -# return AdamW(params, **kwargs) -# else: -# raise NotImplementedError(name) - - -# def customized_lr_scheduler(optimizer, warmup_steps=-1): -# from torch.optim.lr_scheduler import LambdaLR -# def fn(step): -# if warmup_steps > 0: -# return min(step / warmup_steps, 1) -# else: -# return 1 - -# return LambdaLR(optimizer, fn) - - -# def get_lr_scheduler(optimizer, name, **kwargs): -# if name == 'customized': -# return customized_lr_scheduler(optimizer, **kwargs) -# elif name == 'cosine': -# from torch.optim.lr_scheduler import CosineAnnealingLR -# return CosineAnnealingLR(optimizer, **kwargs) -# else: -# raise NotImplementedError(name) - - -# def ema(model_dest: nn.Module, model_src: nn.Module, rate): -# """ -# 这个函数是用于实现模型参数的指数移动平均(Exponential Moving Average,EMA)的。具体而言,它将源模型的参数按照一定的比例rate融合到目标模型的参数中。 - -# 函数的输入参数包括: - -# model_dest: 目标模型 -# model_src: 源模型 -# rate: 融合比例,通常取值在[0, 1]之间 -# 函数具体实现的步骤如下: - -# 将源模型的参数按照名称转化为字典param_dict_src。 -# 遍历目标模型的参数p_dest,对于每个参数,找到对应名称的源模型参数p_src。 -# 利用assert语句确保p_src和p_dest不是同一个对象。 -# 将p_dest的数值乘以rate后加上(1-rate)倍的p_src数值,得到融合后的结果,并将结果赋值给p_dest。 -# 这个函数的作用是在训练神经网络时,通过融合历史模型参数和当前模型参数,来平滑模型参数更新过程,从而提高模型的泛化能力。 -# """ -# param_dict_src = dict(model_src.named_parameters()) -# for p_name, p_dest in model_dest.named_parameters(): -# p_src = param_dict_src[p_name] -# assert p_src is not p_dest -# p_dest.data.mul_(rate).add_((1 - rate) * p_src.data) -# """ -# 如果代码运行到“p_src = param_dict_src[p_name]”这一行报错 KeyError,通常是由于源模型和目标模型的参数名称不一致导致的。 - -# 具体而言,param_dict_src是一个字典,它将源模型的参数名称映射为对应的参数对象。而在遍历目标模型的参数时,代码会尝试从param_dict_src中获取对应名称的源模型参数,如果找不到,则会报错 KeyError。 - -# 解决这个问题的方法是,检查源模型和目标模型的参数名称是否一致。如果不一致,可以通过修改代码来解决,或者手动将源模型的参数名称改为和目标模型一致。 -# """ - -# class TrainState(object): -# def __init__(self, optimizer, lr_scheduler, step, nnet=None, nnet_ema=None, -# lorann=None, t2i_adapter=None,text_embedding = None): -# self.optimizer = optimizer -# self.lr_scheduler = lr_scheduler -# self.step = step -# self.nnet = nnet -# self.nnet_ema = nnet_ema -# self.lorann = lorann -# self.t2i_adapter = t2i_adapter -# self.text_embedding = text_embedding -# # def ema_update(self, rate=0.9999): -# # if self.nnet_ema is not None: -# # ema(self.nnet_ema, self.nnet, rate) - -# def save(self, path): -# os.makedirs(path, exist_ok=True) -# torch.save(self.step, os.path.join(path, 'step.pth')) -# for key, val in self.__dict__.items(): -# if key != 'step' and val is not None: -# torch.save(val.state_dict(), os.path.join(path, f'{key}.pth')) - -# def save_lora(self,path): -# ## save lora weights -# os.makedirs(path, exist_ok=True) -# lora_state={} -# # for name,param in self.nnet.named_parameters(): -# # name_cols=name.split('.') -# # filter_names=['lora'] -# # if any(n==name_cols[-1] for n in filter_names): -# # lora_state[name]=param -# # print(name) -# for name,param in 
self.nnet.named_parameters(): -# if 'lora' in name: -# lora_state[name]=param - -# torch.save(lora_state,os.path.join(path,'lora.pt.tmp')) -# os.replace(os.path.join(path,'lora.pt.tmp'),os.path.join(path,'lora.pt')) - -# def resume(self, ckpt_path=None, only_load_model=False): -# if ckpt_path is None: -# return - -# logging.info(f'resume from {ckpt_path}, only_load_model={only_load_model}') -# self.step = torch.load(os.path.join(ckpt_path, 'step.pth')) - -# if only_load_model: -# for key, val in self.__dict__.items(): -# if key == 'nnet_ema' or key == 'nnet': -# val.load_state_dict(torch.load(os.path.join(ckpt_path, f'{key}.pth'), map_location='cpu')) -# else: -# for key, val in self.__dict__.items(): -# if key != 'step' and val is not None: -# val.load_state_dict(torch.load(os.path.join(ckpt_path, f'{key}.pth'), map_location='cpu')) - -# def to(self, device): -# for key, val in self.__dict__.items(): -# if isinstance(val, nn.Module): -# val.to(device) - - -# def cnt_params(model): -# return sum(param.numel() for param in model.parameters()) - -# def initialize_train_state(config, device, uvit_class,text_encoder = None): - - -# params = [] -# nnet = uvit_class(**config.nnet) -# param_lists = [ -# text_encoder.get_input_embeddings().parameters(), -# nnet.mid_block.lora_attention.parameters(), -# nnet.token_embedding.parameters(), -# ] -# logging.info(f'load nnet from {config.nnet_path}') - -# nnet.load_state_dict(torch.load(config.nnet_path, map_location='cpu'),False) -# nnet = get_peft_model(nnet,lora_config) -# # nnet.load_state_dict(torch.load(config.nnet_path, map_location='cpu'),True) - -# nnet.print_trainable_parameters() - - -# input_embed_params = list(text_encoder.get_input_embeddings().parameters()) -# param_lists = input_embed_params + [param for name, param in nnet.named_parameters() if 'lora' in name] - -# # for i in range(15): -# # param_lists.append(nnet.in_blocks[i].attn.parameters()) -# # param_lists.append(nnet.out_blocks[i].attn.parameters()) -# # for i in range(15): -# # param_lists.append(nnet.in_blocks[i].lora_attention.parameters()) -# # param_lists.append(nnet.out_blocks[i].lora_attention.parameters()) -# # param_lists = [ -# # text_encoder.get_input_embeddings().parameters(), -# # nnet.parameters()] - -# params = list(itertools.chain(*param_lists)) -# nnet_ema = uvit_class(**config.nnet) -# nnet_ema.eval() -# # param_lists = list(itertools.chain(*param_lists)) - -# # logging.info(f'nnet has {cnt_params(nnet)} parameters') -# # logging.info(f'text_encoder has {cnt_params(text_encoder)} parameters') - -# optimizer = get_optimizer(param_lists, **config.optimizer) - -# lr_scheduler = get_lr_scheduler(optimizer, **config.lr_scheduler) - -# train_state = TrainState(optimizer=optimizer, lr_scheduler=lr_scheduler, step=0, -# nnet=nnet, nnet_ema=nnet_ema, text_embedding=text_encoder.get_input_embeddings()) -# # train_state.ema_update(0) -# train_state.to(device) -# # no need to resume -# # train_state.resume(config.resume_ckpt_path, only_load_model=config.only_load_model) - -# # for the case when the lr is manually changed -# lr_scheduler.base_lrs = [config.optimizer.lr] -# optimizer.param_groups[0]['initial_lr'] = config.optimizer.lr -# lr_scheduler._last_lr = lr_scheduler.get_lr() -# optimizer.param_groups[0]['lr'] = lr_scheduler.get_lr()[0] - -# return train_state - - - -# def get_hparams(): -# argv = sys.argv -# lst = [] -# for i in range(1, len(argv)): -# assert '=' in argv[i] -# if argv[i].startswith('--config.'): -# hparam_full, val = argv[i].split('=') -# hparam = 
hparam_full.split('.')[-1] -# if hparam_full.startswith('--config.optimizer.lm'): -# hparam = f'lm_{hparam}' -# if hparam_full.startswith('--config.optimizer.decoder'): -# hparam = f'decoder_{hparam}' -# lst.append(f'{hparam}={val}') -# hparams = '-'.join(lst) -# if hparams == '': -# hparams = 'default' -# return hparams - -# def add_prefix(dct, prefix): -# return {f'{prefix}/{key}': val for key, val in dct.items()} - -# def amortize(n_samples, batch_size): -# k = n_samples // batch_size -# r = n_samples % batch_size -# return k * [batch_size] if r == 0 else k * [batch_size] + [r] - - -# def grad_norm(model): -# total_norm = 0. -# for p in model.parameters(): -# if p.grad is not None: -# param_norm = p.grad.data.norm(2) -# total_norm += param_norm.item() ** 2 -# total_norm = total_norm ** (1. / 2) -# return total_norm - - -# def setup(config): -# import builtins -# import ml_collections -# from torch import multiprocessing as mp -# import accelerate -# import wandb -# import logging -# from accelerate.logging import get_logger -# from accelerate.utils import ProjectConfiguration -# logging_dir = Path('./model_output/', './model_output/logs') -# accelerator_project_config = ProjectConfiguration(project_dir='./model_output/', logging_dir=logging_dir) -# mp.set_start_method('spawn') -# assert config.gradient_accumulation_steps == 1, \ -# 'fix the lr_scheduler bug before using larger gradient_accumulation_steps' - -# logger = get_logger(__name__) - -# accelerator = accelerate.Accelerator(gradient_accumulation_steps=config.gradient_accumulation_steps,mixed_precision=None, project_config=accelerator_project_config) -# device = accelerator.device -# accelerate.utils.set_seed(config.seed, device_specific=True) -# logging.info(f'Process {accelerator.process_index} using device: {device}') -# logging.basicConfig( -# format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", -# datefmt="%m/%d/%Y %H:%M:%S", -# level=logging.INFO, -# ) -# logger.info(accelerator.state, main_process_only=False) -# config.mixed_precision = accelerator.mixed_precision -# config = ml_collections.FrozenConfigDict(config) - -# if accelerator.is_main_process: -# os.makedirs(config.ckpt_root, exist_ok=True) -# accelerator.wait_for_everyone() -# if accelerator.is_main_process: -# #初始化跟踪器,指定项目名称为 "unidiffuser",同时传递参数配置 vars(args) -# accelerator.init_trackers("unidiffuser", config=vars(config)) -# # if accelerator.is_main_process: -# # wandb.init(dir=os.path.abspath(config.workdir), project='lora', config=config.to_dict(), job_type='train', mode="offline") -# # set_logger(log_level='info', fname=os.path.join(config.workdir, 'output.log')) -# # logging.info(config) -# # else: -# # set_logger(log_level='error') -# # builtins.print = lambda *args: None -# logging.info(f'Run on {accelerator.num_processes} devices') - -# return accelerator, device - - - -# def get_data_generator(loader, enable_tqdm, desc): -# while True: -# for data in tqdm(loader, disable=not enable_tqdm, desc=desc): -# yield data - - - import torch import torch.nn as nn import numpy as np @@ -338,12 +10,14 @@ from libs.clip import FrozenCLIPEmbedder import itertools from libs.clip import CLIPEmbedder -from peft import inject_adapter_in_model, LoraConfig,get_peft_model +from peft import inject_adapter_in_model, LoraConfig,get_peft_model,AdaLoraConfig +# lora_config = AdaLoraConfig( +# inference_mode=False, r=64, lora_alpha=32, lora_dropout=0.1,target_modules=["qkv","fc1","fc2"] +# ) lora_config = LoraConfig( inference_mode=False, r=64, lora_alpha=32, 
lora_dropout=0.1,target_modules=["qkv","fc1","fc2","proj","text_embed","clip_img_embed"] ) - def get_config_name(): argv = sys.argv for i in range(1, len(argv)): @@ -464,12 +138,6 @@ def save_lora(self,path): ## save lora weights os.makedirs(path, exist_ok=True) lora_state={} - # for name,param in self.nnet.named_parameters(): - # name_cols=name.split('.') - # filter_names=['lora'] - # if any(n==name_cols[-1] for n in filter_names): - # lora_state[name]=param - # print(name) for name,param in self.nnet.named_parameters(): if 'lora' in name: lora_state[name]=param @@ -508,32 +176,19 @@ def initialize_train_state(config, device, uvit_class,text_encoder = None): logging.info(f'load nnet from {config.nnet_path}') nnet.load_state_dict(torch.load(config.nnet_path, map_location='cpu'),False) - nnet = get_peft_model(nnet,lora_config) + # nnet = get_peft_model(nnet,lora_config) # nnet.load_state_dict(torch.load('/home/wuyujia/competition/model_output/girl1_new_10000/lora.pt.tmp/lora.pt', map_location='cpu'), False) - nnet.print_trainable_parameters() + # nnet.print_trainable_parameters() input_embed_params = list(text_encoder.get_input_embeddings().parameters()) param_lists = input_embed_params + [param for name, param in nnet.named_parameters() if 'lora' in name] - # for i in range(15): - # param_lists.append(nnet.in_blocks[i].attn.parameters()) - # param_lists.append(nnet.out_blocks[i].attn.parameters()) - # for i in range(15): - # param_lists.append(nnet.in_blocks[i].lora_attention.parameters()) - # param_lists.append(nnet.out_blocks[i].lora_attention.parameters()) - # param_lists = [ - # text_encoder.get_input_embeddings().parameters(), - # nnet.parameters()] nnet_ema = uvit_class(**config.nnet) nnet_ema.eval() - # param_lists = list(itertools.chain(*param_lists)) - # logging.info(f'nnet has {cnt_params(nnet)} parameters') - # logging.info(f'text_encoder has {cnt_params(text_encoder)} parameters') - optimizer = get_optimizer(param_lists, **config.optimizer) lr_scheduler = get_lr_scheduler(optimizer,**config.lr_scheduler) @@ -603,7 +258,7 @@ def setup(config): from accelerate.utils import ProjectConfiguration logging_dir = Path('./model_output/', './model_output/logs') accelerator_project_config = ProjectConfiguration(project_dir='./model_output/', logging_dir=logging_dir) - mp.set_start_method('spawn') + # mp.set_start_method('spawn') assert config.gradient_accumulation_steps == 1, \ 'fix the lr_scheduler bug before using larger gradient_accumulation_steps' diff --git "a/\350\257\204\346\265\213\350\257\264\346\230\216.md" "b/\350\257\204\346\265\213\350\257\264\346\230\216.md" new file mode 100644 index 0000000..8b6a2a6 --- /dev/null +++ "b/\350\257\204\346\265\213\350\257\264\346\230\216.md" @@ -0,0 +1,111 @@
+# Final-Round Evaluation Notes
+
+## Submission Instructions
+
+1. The final round is submitted the same way as the preliminary round: package the competition code, model weights, dependencies, and other data into a Docker image tarball (make sure the Dockerfile used to build the image is included in the package). Name the file **<team_name>_submit.tar.gz** (compress the tar file with gzip on Linux, or into .gz format with an archiving tool on Windows). Create a folder named project, put the .tar.gz file into it, and package the folder as project.zip for submission. (project.zip is what is finally uploaded to the platform; do not submit the .tar.gz file directly.)
+2. Contestants must implement the two functions `prepare_context` and `process_one_json` with the required input and output formats, and save them in the **load_model.py** script under the workspace directory so the evaluation sampling script can call them.
+3. Contestants must save every model they use into the Docker image in advance, and make sure that **the model-loading paths inside Docker are correct**.
+
+## Evaluation Criteria
+
+The final round is scored on two aspects: face similarity and image-text alignment. Face similarity is the cosine similarity of **insightface** features; image-text alignment is measured with **ImageReward**.
+
+Contestants only implement `prepare_context` and `process_one_json` for task processing. The evaluation script loops over the test set and runs `process_one_json` for **1.5 hours**, after which it computes the normalized cumulative face score and text-similarity score over all completed tasks.
+
+Below is the scoring function for a single task, where:
+
+1. ev is similar to the Evaluator() of the preliminary round: `get_face_embedding` extracts the face embedding, `image_reward` is the **ImageReward** model, and its `score` function computes image-text alignment;
+2. source_json is the JSON description of one task in the evaluation set (the input);
+3. gen_json is the JSON description of the sampling output produced by process_one_json() for that task;
+4. bound_json records the lower and upper bounds of face similarity and text alignment obtained from the source images in the training set and the images generated by the original unidiffuser; they are used to normalize the contestant's scores:
+
+   ```
+   normed_score_face = (face_score - min_face_sim) / (max_face_sim - min_face_sim)
+   normed_score_image_reward = (image_reward - min_image_reward) / (max_image_reward - min_image_reward)
+   ```
+
+   The final score is `normalized cumulative face score * 2.5 + normalized cumulative text score`, and the final ranking is computed from it.
+
+   **Note: an item is only counted when `normed_score_face` is greater than 0.1 and `normed_score_image_reward` is greater than 0.1; otherwise it is scored as 0.**
+5. out_json_dir is the folder where the per-task score output is saved in JSON format.
+
+```python
+def score(ev, source_json, gen_json, bound_json, out_json_dir):
+    # get ref images
+    ref_image_paths = [i["path"] for i in source_json["source_group"]]
+    ref_face_embs = [ev.get_face_embedding(read_img_pil(i)) for i in ref_image_paths]
+    ref_face_embs = [emb for emb in ref_face_embs if emb is not None]  # remove None
+    ref_face_embs = torch.cat(ref_face_embs)
+
+    face_ac_scores = 0
+    image_reward_ac_scores = 0
+    normed_face_ac_scores = 0
+    normed_image_reward_ac_scores = 0
+
+    out_json = {"id": gen_json["id"], "images": []}
+    commom_prompts = set([item["prompt"] for item in gen_json["images"]]) & set([item["prompt"] for item in bound_json["images"]])
+    prompt_to_item = {item["prompt"]: item for item in gen_json["images"]}
+    bound_prompt_to_item = {item["prompt"]: item for item in bound_json["images"]}
+    if len(commom_prompts) != len(bound_json["images"]):
+        print(f"{len(commom_prompts)} prompts in common, bound json has {len(bound_json['images'])} prompts")
+        print(bound_json)
+
+    for prompt in commom_prompts:
+        item = prompt_to_item[prompt]
+        bound_item = bound_prompt_to_item[prompt]
+
+        assert item["prompt"] == bound_item["prompt"], f"prompt {item['prompt']} not equal to bound prompt {bound_item['prompt']}"
+        if len(item["paths"]) < 4:
+            continue
+
+        # image reward
+        samples = [read_img_pil(sample_path) for sample_path in item["paths"]]  # read images
+        scores_image_reward = [ev.image_reward.score(item["prompt"], sample_path) for sample_path in item["paths"]]
+        mean_image_reward = np.mean(scores_image_reward)
+
+        # face similarity
+        sample_faces = [ev.get_face_embedding(sample) for sample in samples]
+        sample_faces = [emb for emb in sample_faces if emb is not None]  # remove None
+        if len(sample_faces) <= 1:
+            print("too few faces")
+            continue
+
+        scores_face = [(sample_face @ ref_face_embs.T).mean().item() for sample_face in sample_faces]
+        mean_face = np.mean(scores_face)
+
+        subed_score_face = mean_face - bound_item["min_face_sim"]
+        subed_image_reward = mean_image_reward - bound_item["min_image_reward"]
+
+        normed_score_face = subed_score_face / (bound_item["max_face_sim"] - bound_item["min_face_sim"])
+        normed_score_image_reward = subed_image_reward / (bound_item["max_image_reward"] - bound_item["min_image_reward"])
+
+        if normed_score_image_reward < 0.1:
+            print("too low image reward")
+            continue
+        if normed_score_face < 0.1:
+            print("too low face similarity")
+            continue
+
+        normed_face_ac_scores += normed_score_face
+        normed_image_reward_ac_scores += normed_score_image_reward
+
+        face_ac_scores += subed_score_face
+        image_reward_ac_scores += subed_image_reward
+
+        out_json["images"].append({"prompt": item["prompt"],
+                                   "scores_face": scores_face,
+                                   "scores_image_reward": scores_image_reward,
+                                   "subed_score_face": subed_score_face,
+                                   "subded_image_reward": subed_image_reward,
+                                   "normed_score_face": normed_score_face,
+                                   "normed_score_image_reward": normed_score_image_reward,
+                                   })
+
+    with open(os.path.join(out_json_dir, f"{gen_json['id']}.json"), 'w') as f:
+        json.dump(out_json, f, indent=4)
+
+    return {"face_ac_scores": face_ac_scores,
+            "image_reward_ac_scores": image_reward_ac_scores,
+            "normed_face_ac_scores": normed_face_ac_scores,
+            "normed_image_reward_ac_scores": normed_image_reward_ac_scores,
+            }
+```
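+
+The ranking rule quoted above (`normalized cumulative face score * 2.5 + normalized cumulative text score`) is not shown in code in this patch. The few lines below are a hedged illustration of how the per-task dicts returned by `score()` could be folded into one leaderboard number; the `task_results` list and the function name are hypothetical.
+
+```python
+# Hypothetical aggregation over the per-task dicts returned by score();
+# the 2.5 : 1 weighting follows the rule stated in the notes above.
+def final_score(task_results):
+    total_face = sum(r["normed_face_ac_scores"] for r in task_results)
+    total_text = sum(r["normed_image_reward_ac_scores"] for r in task_results)
+    return total_face * 2.5 + total_text
+```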
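+
+The notes above require `prepare_context` and `process_one_json` in `load_model.py`, but only the scoring side is shown here. The following is a minimal, hypothetical sketch of the sampling side, offered only for orientation: the function signatures, the `caption_list` key, and the commented-out helper calls are assumptions inferred from how `score()` reads gen_json (an `id` plus `images` entries that each carry the exact `prompt` string and at least four generated image `paths`); they are not the competition's actual API.
+
+```python
+# Hypothetical sketch of workspace/load_model.py -- signatures and the task-JSON
+# schema are assumptions inferred from how score() consumes gen_json above.
+import os
+
+
+def prepare_context():
+    """Load every model once (nnet, autoencoder, CLIP, LoRA weights, ...) and
+    return them in a dict so process_one_json can reuse them across tasks."""
+    context = {
+        # "nnet": ...,         # placeholder: load the fine-tuned UViT here
+        # "autoencoder": ...,  # placeholder: load the autoencoder here
+        "device": "cuda",
+    }
+    return context
+
+
+def process_one_json(task_json, output_dir, context):
+    """Sample images for each prompt of one task and return a gen_json dict in
+    the shape score() expects: {"id": ..., "images": [{"prompt", "paths"}]}."""
+    gen_json = {"id": task_json["id"], "images": []}
+    for idx, caption in enumerate(task_json.get("caption_list", [])):  # key name is an assumption
+        paths = []
+        for i in range(4):  # score() skips prompts with fewer than 4 images
+            img_path = os.path.join(output_dir, f"{task_json['id']}_{idx}_{i}.png")
+            # sample_and_save(context, caption, task_json["source_group"], img_path)  # placeholder
+            paths.append(img_path)
+        gen_json["images"].append({"prompt": caption, "paths": paths})
+    return gen_json
+```
+
+The only hard constraints visible from `score()` are that each prompt string matches the corresponding entry in bound_json and that at least four generated images are listed per prompt; everything else in this sketch is a free choice.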