diff --git a/examples/conversation_with_stablediffusion_model/README.md b/examples/conversation_with_stablediffusion_model/README.md
new file mode 100644
index 000000000..59e3a7270
--- /dev/null
+++ b/examples/conversation_with_stablediffusion_model/README.md
@@ -0,0 +1,129 @@
+# Conversation with Stable-diffusion model
+
+This example will show
+
+- How to use Stable Diffusion models in AgentScope.
+
+In this example, you can interact in a conversational format to generate images.
+Once the image is generated, the agent will respond with the local file path where the image is saved.
+
+## Minimum Hardware Requirements
+
+- **GPU**: NVIDIA GPU with at least 6.9GB of VRAM
+- **CPU**: Modern multi-core CPU (e.g., Intel i5 or AMD Ryzen 5)
+- **RAM**: Minimum 8GB
+- **Storage**: At least 10GB of available hard drive space
+
+## How to Run
+
+You need to satisfy the following requirements to run this example:
+
+### Step 0: Install Stable Diffusion Web UI and AgentScope
+
+- Install Stable Diffusion Web UI by following the instructions at [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui).
+- Install the latest version of AgentScope by
+  ```bash
+  git clone https://github.com/modelscope/agentscope.git
+  cd agentscope
+  pip install -e .
+  ```
+
+### Step 1: Download the required checkpoints
+
+Before starting the Stable Diffusion Web UI, you need to download at least one model to ensure normal operation.
+Download the model to `stable-diffusion-webui/models/Stable-diffusion` directory.
+
+### Step 2: Launch the Stable Diffusion Web UI
+
+We've provided a convenient shell script to quickly start the Stable Diffusion Web UI:
+`scripts/stable_diffusion_webui/sd_setup.sh`
+
+Activate the virtual environment first. Then run the following command in your terminal, replacing YOUR-SD-WEBUI-PATH with the actual path to your Stable Diffusion Web UI directory:
+
+```bash
+bash scripts/stable_diffusion_webui/sd_setup.sh -s YOUR-SD-WEBUI-PATH
+```
+
+If you choose to start it on your own, you need to launch the Stable Diffusion Web UI with the following arguments: `--api --port=7862`. For more detailed instructions on starting the WebUI, refer to the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui).
+
+### Step 3: Running the Example
+
+Run the example and input your prompt.
+
+```bash
+python conversation_with_stablediffusion_model.py
+```
+
+## Customization Options
+
+### `model_config` Example:
+
+```json
+{
+    "model_type": "sd_txt2img",
+    "config_name": "sd",
+    "options": {
+        "sd_model_checkpoint": "Anything-V3.0-pruned",
+        "sd_lora": "add_detail",
+        "CLIP_stop_at_last_layers": 2
+    },
+    "generate_args": {
+        "steps": 50,
+        "n_iter": 1,
+        "override_settings": {
+            "CLIP_stop_at_last_layers": 3
+        }
+    }
+}
+```
+
+### Parameter Explanation:
+
+- `options`: Global configuration that directly affects the WebUI settings.
+- `generate_args`: Controls parameters for individual image generation requests, such as `steps` (number of sampling steps) and `n_iter` (number of iterations).
+  - `override_settings`: Overrides WebUI settings for a single request, taking precedence over `options`.
+
+Notes:
+
+- `override_settings` only affects the current request, while changes made to `options` persist.
+- Both parameters can set the same options, but `override_settings` has a higher priority.
+
+As shown in the example, the final image will be generated with the following settings:
+
+steps: 50
+n_iter: 1
+sd_model_checkpoint: Anything-V3.0-pruned
+sd_lora: add_detail
+CLIP_stop_at_last_layers: 3
+
+However, the web UI will always display the following settings:
+
+sd_model_checkpoint: Anything-V3.0-pruned
+sd_lora: add_detail
+CLIP_stop_at_last_layers: 2
+
+### Available Parameter Lists:
+
+If you've successfully enabled the Stable Diffusion Web UI API, you should be able to access its documentation at http://127.0.0.1:7862/docs (or whatever URL you're using + /docs).
+
+- `generate_args`: {url}/docs#/default/text2imgapi_sdapi_v1_txt2img_post
+- `options` and `override_settings`: {url}/docs#/default/get_config_sdapi_v1_options_get
+
+For this project, the "options" parameter will be posted to the /sdapi/v1/options API endpoint,
+and the "generate_args" parameter will be posted to the /sdapi/v1/txt2img API endpoint.
+You can refer to https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/API for a more detailed parameter reference.
+
+## A Running Example
+
+- Conversation history with Stable Diffusion Web UI.
+  ```bash
+  User input:Horses on Mars
+  User: Horses on Mars
+  Assistant: Image saved to path\agentscope\runs\run_20240920-142208_rqsvhh\file\image_20240920-142522_HTF38X.png
+  User input: boy eating ice-cream
+  User: boy eating ice-cream
+  Assistant: Image saved to path\agentscope\runs\run_20240920-142208_rqsvhh\file\image_20240920-142559_2xGtUs.png
+  ```
+- Image
+Horses on Mars
+boy eating ice-cream
\ No newline at end of file
diff --git a/examples/conversation_with_stablediffusion_model/conversation_with_stablediffusion_model.py b/examples/conversation_with_stablediffusion_model/conversation_with_stablediffusion_model.py
new file mode 100644
index 000000000..9a185f9c8
--- /dev/null
+++ b/examples/conversation_with_stablediffusion_model/conversation_with_stablediffusion_model.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+"""conversation between user and stable-diffusion agent."""
+import agentscope
+from agentscope.agents import DialogAgent
+from agentscope.agents.user_agent import UserAgent
+
+
+def main() -> None:
+    """A basic conversation demo"""
+
+    agentscope.init(
+        model_configs=[
+            {
+                "model_type": "sd_txt2img",
+                "config_name": "sd",
+                "options": {
+                    "sd_model_checkpoint": "xxxxxx",
+                    "CLIP_stop_at_last_layers": 2,
+                },
+                "generate_args": {
+                    "steps": 50,
+                    "n_iter": 1,
+                },
+            },
+        ],
+        project="txt2img-Agent Conversation",
+        save_api_invoke=True,
+    )
+
+    # Init two agents
+    dialog_agent = DialogAgent(
+        name="Assistant",
+        sys_prompt="dreamy",  # replace by your image style prompts
+        model_config_name="sd",  # replace by your model config name
+    )
+    user_agent = UserAgent()
+
+    # start the conversation between user and assistant
+    msg = None
+    while True:
+        msg = user_agent(msg)
+        if msg.content == "exit":
+            break
+        msg = dialog_agent(msg)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/stable_diffusion_webui/model_config.json b/scripts/stable_diffusion_webui/model_config.json
new file mode 100644
index 000000000..823ea406e
--- /dev/null
+++ b/scripts/stable_diffusion_webui/model_config.json
@@ -0,0 +1,15 @@
+{
+    "model_type": "sd_txt2img",
+    "config_name": "stable_diffusion_txt2img",
+    "host": "127.0.0.1",
+    "port": 7862,
+    "options": {
+        "sd_model_checkpoint": "Anything-V3.0-pruned",
+        "sd_lora": "add_detail",
+        "CLIP_stop_at_last_layers": 2
+    },
+    "generate_args": {
+        "steps": 50,
+        "n_iter": 1
+    }
+}
\ No newline at end of file
diff --git a/scripts/stable_diffusion_webui/sd_setup.sh b/scripts/stable_diffusion_webui/sd_setup.sh
new file mode 100644
index 000000000..ce71baba8
--- /dev/null
+++ b/scripts/stable_diffusion_webui/sd_setup.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# set VENV_DIR=%~dp0%venv
+# call "%VENV_DIR%\Scripts\activate.bat"
+
+# stable_diffusion_webui_path="YOUR_PATH_TO_STABLE_DIFFUSION_WEBUI"
+
+port=7862
+
+while getopts ":p:s:" opt
+do
+    # shellcheck disable=SC2220
+    case $opt in
+        p) port="$OPTARG";;
+        s) stable_diffusion_webui_path="$OPTARG"
+        ;;
+    esac
+done
+
+stable_diffusion_webui_path=${stable_diffusion_webui_path%/}
+launch_py_path="$stable_diffusion_webui_path/launch.py"
+
+# Check if the launch.py script exists
+if [[ ! -f "$launch_py_path" ]]; then
+    echo "The launch.py script was not found at $launch_py_path."
+    echo "Please ensure you have specified the correct path to your Stable Diffusion WebUI using the -s option."
+    echo "Example: ./sd_setup.sh -s /path/to/your/stable-diffusion-webui"
+    echo "Alternatively, you can set the path directly in the script."
+    exit 1
+fi
+
+cd "$stable_diffusion_webui_path" || exit 1
+
+python ./launch.py --api --port=$port
diff --git a/setup.py b/setup.py
index cd577d5b8..e29fdd8f9 100644
--- a/setup.py
+++ b/setup.py
@@ -90,6 +90,7 @@
 extra_litellm_requires = ["litellm"]
 extra_zhipuai_requires = ["zhipuai"]
 extra_ollama_requires = ["ollama>=0.1.7"]
+extra_sd_webuiapi_requires = ["webuiapi"]
 
 # Full requires
 extra_full_requires = (
@@ -102,6 +103,7 @@
     + extra_litellm_requires
     + extra_zhipuai_requires
     + extra_ollama_requires
+    + extra_sd_webuiapi_requires
 )
 
 # For online workstation
@@ -140,6 +142,7 @@
     "litellm": extra_litellm_requires,
     "zhipuai": extra_zhipuai_requires,
     "gemini": extra_gemini_requires,
+    "stablediffusion": extra_sd_webuiapi_requires,
     # For service functions
     "service": extra_service_requires,
     # For distribution mode
diff --git a/src/agentscope/models/__init__.py b/src/agentscope/models/__init__.py
index 0a6894b35..9cde1cd85 100644
--- a/src/agentscope/models/__init__.py
+++ b/src/agentscope/models/__init__.py
@@ -41,6 +41,9 @@
 from .yi_model import (
     YiChatWrapper,
 )
+from .stablediffusion_model import (
+    StableDiffusionImageSynthesisWrapper,
+)
 
 __all__ = [
     "ModelWrapperBase",
@@ -64,6 +67,7 @@
     "ZhipuAIEmbeddingWrapper",
     "LiteLLMChatWrapper",
     "YiChatWrapper",
+    "StableDiffusionImageSynthesisWrapper",
 ]
 
 
diff --git a/src/agentscope/models/stablediffusion_model.py b/src/agentscope/models/stablediffusion_model.py
new file mode 100644
index 000000000..e1948e38a
--- /dev/null
+++ b/src/agentscope/models/stablediffusion_model.py
@@ -0,0 +1,229 @@
+# -*- coding: utf-8 -*-
+"""Model wrapper for stable diffusion models."""
+from abc import ABC
+from typing import Any, Union, Sequence
+
+try:
+    import webuiapi
+except ImportError:
+    webuiapi = None
+
+from . import ModelWrapperBase, ModelResponse
+from ..message import Msg
+from ..manager import FileManager
+from ..utils.common import _convert_to_str
+
+
+class StableDiffusionWrapperBase(ModelWrapperBase, ABC):
+    """The base class for stable-diffusion model wrappers.
+
+    To use SD-webui API, please
+    1. First download stable-diffusion-webui from
+    https://github.com/AUTOMATIC1111/stable-diffusion-webui and
+    install it
+    2. Move your checkpoint to 'models/Stable-diffusion' folder
+    3. Start launch.py with the '--api --port=7862' parameter
+    4. Install the 'webuiapi' package by 'pip install webuiapi'
+    After that, you can use the SD-webui API and
+    query the available parameters on the http://localhost:7862/docs page
+    """
+
+    model_type: str = "stable_diffusion"
+
+    def __init__(
+        self,
+        config_name: str,
+        generate_args: dict = None,
+        options: dict = None,
+        host: str = "127.0.0.1",
+        port: int = 7862,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initializes the SD-webui API client.
+
+        Args:
+            config_name (`str`):
+                The name of the model config.
+            generate_args (`dict`, default `None`):
+                The extra keyword arguments used in SD api generation,
+                e.g. `{"steps": 50}`.
+            options (`dict`, default `None`):
+                The keyword arguments to change the sd-webui settings
+                such as model or CLIP skip, these changes will persist.
+                e.g. `{"sd_model_checkpoint": "Anything-V3.0-pruned"}`.
+            host (`str`, default `"127.0.0.1"`):
+                The host of the stable-diffusion webui server.
+            port (`int`, default `7862`):
+                The port of the stable-diffusion webui server.
+        """
+        # Fail fast with an actionable message if the optional
+        # webuiapi dependency is missing.
+        if webuiapi is None:
+            raise ImportError(
+                "Cannot find webuiapi package, please install it by "
+                "`pip install webuiapi`",
+            )
+
+        # Initialize the SD-webui API
+        self.api = webuiapi.WebUIApi(host=host, port=port, **kwargs)
+        self.generate_args = generate_args or {}
+
+        # Set options if provided
+        if options:
+            self.api.set_options(options)
+
+        # Get the default model name from the web-options
+        model_name = (
+            self.api.get_options()["sd_model_checkpoint"].split("[")[0].strip()
+        )
+        # Update the model name
+        if self.generate_args.get("override_settings"):
+            model_name = self.generate_args["override_settings"].get(
+                "sd_model_checkpoint",
+                model_name,
+            )
+
+        super().__init__(config_name=config_name, model_name=model_name)
+
+
+class StableDiffusionImageSynthesisWrapper(StableDiffusionWrapperBase):
+    """Stable Diffusion Text-to-Image (txt2img) API Wrapper"""
+
+    model_type: str = "sd_txt2img"
+
+    def __call__(
+        self,
+        prompt: str,
+        save_local: bool = True,
+        **kwargs: Any,
+    ) -> ModelResponse:
+        """
+        Args:
+            prompt (`str`):
+                The prompt string to generate images from.
+            save_local (`bool`, default `True`):
+                Whether to save the generated images locally.
+            **kwargs (`Any`):
+                The keyword arguments to SD-webui txt2img API, e.g.
+                `n_iter`, `steps`, `seed`, `width`, etc. Please refer to
+                https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/API
+                or http://localhost:7862/docs
+                for more detailed arguments.
+        Returns:
+            `ModelResponse`:
+                A list of image local urls in image_urls field and the
+                raw response in raw field.
+        """
+        # step1: prepare keyword arguments
+        payload = {
+            "prompt": prompt,
+            **self.generate_args,
+            **kwargs,  # call-time kwargs override config defaults
+        }
+
+        # step2: forward to generate response
+        response = self.api.txt2img(**payload)
+
+        # step3: save model invocation and update monitor
+        self._save_model_invocation_and_update_monitor(
+            payload=payload,
+            response=response.json,
+        )
+
+        # step4: parse the response
+        PIL_images = response.images
+
+        file_manager = FileManager.get_instance()
+        if save_local:
+            # Save images
+            image_urls = [file_manager.save_image(_) for _ in PIL_images]
+            text = "Image saved to " + "\n".join(image_urls)
+        else:
+            image_urls = PIL_images
+            text = ""  # Just a placeholder
+
+        return ModelResponse(
+            text=text,
+            image_urls=image_urls,
+            raw=response.json,
+        )
+
+    def _save_model_invocation_and_update_monitor(
+        self,
+        payload: dict,
+        response: dict,
+    ) -> None:
+        """Save the model invocation and update the monitor accordingly.
+
+        Args:
+            payload (`dict`):
+                The keyword arguments posted to the SD-webui txt2img API.
+            response (`dict`):
+                The response object returned by the SD-webui txt2img API.
+        """
+        self._save_model_invocation(
+            arguments=payload,
+            response=response,
+        )
+
+        session_parameters = response["parameters"]
+        size = f"{session_parameters['width']}*{session_parameters['height']}"
+        image_count = (
+            session_parameters["batch_size"] * session_parameters["n_iter"]
+        )
+
+        self.monitor.update_image_tokens(
+            model_name=self.model_name,
+            image_count=image_count,
+            resolution=size,
+        )
+
+    def format(self, *args: Union[Msg, Sequence[Msg]]) -> str:
+        # This is a temporary implementation to focus on the prompt
+        # on single-turn image generation by preserving only the system prompt
+        # and the last user message. This logic might change in the future
+        # to support more complex conversational scenarios
+        if len(args) == 0:
+            raise ValueError(
+                "At least one message should be provided. An empty message "
+                "list is not allowed.",
+            )
+
+        # Parse all information into a list of messages
+        input_msgs = []
+        for _ in args:
+            if _ is None:
+                continue
+            if isinstance(_, Msg):
+                input_msgs.append(_)
+            elif isinstance(_, list) and all(isinstance(__, Msg) for __ in _):
+                input_msgs.extend(_)
+            else:
+                raise TypeError(
+                    f"The input should be a Msg object or a list "
+                    f"of Msg objects, got {type(_)}.",
+                )
+
+        # record user message history as a list of strings
+        user_messages = []
+        sys_prompt = None
+        for i, unit in enumerate(input_msgs):
+            if i == 0 and unit.role == "system":
+                # if system prompt is available, place it at the beginning
+                sys_prompt = _convert_to_str(unit.content)
+            elif unit.role == "user":
+                # Merge user messages into a conversation history prompt
+                user_messages.append(_convert_to_str(unit.content))
+            else:
+                continue
+
+        content_components = []
+        # Add system prompt at the beginning if provided
+        if sys_prompt:
+            content_components.append(sys_prompt)
+        # Add the last user message if the user messages is not empty
+        if len(user_messages) > 0:
+            content_components.append(user_messages[-1])
+
+        prompt = ",".join(content_components)
+
+        return prompt
diff --git a/src/agentscope/service/__init__.py b/src/agentscope/service/__init__.py
index 7d33e6501..20c1af051 100644
--- a/src/agentscope/service/__init__.py
+++ b/src/agentscope/service/__init__.py
@@ -45,6 +45,7 @@
     openai_edit_image,
     openai_create_image_variation,
 )
+from .multi_modality.stablediffusion_services import sd_text_to_image
 from .service_response import ServiceResponse
 from .service_toolkit import ServiceToolkit
 
@@ -117,6 +118,7 @@ def get_help() -> None:
     "openai_image_to_text",
     "openai_edit_image",
     "openai_create_image_variation",
+    "sd_text_to_image",
     "tripadvisor_search",
     "tripadvisor_search_location_photos",
     "tripadvisor_search_location_details",
diff --git a/src/agentscope/service/multi_modality/stablediffusion_services.py b/src/agentscope/service/multi_modality/stablediffusion_services.py
new file mode 100644
index 000000000..4547aa115
--- /dev/null
+++ b/src/agentscope/service/multi_modality/stablediffusion_services.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+"""Use StableDiffusion-webui API to generate images
+"""
+import os
+from typing import Optional
+
+from ...models import StableDiffusionImageSynthesisWrapper
+
+from ...manager import FileManager
+from ..service_response import (
+    ServiceResponse,
+    ServiceExecStatus,
+)
+from ...utils.common import (
+    _get_timestamp,
+    _generate_random_code,
+)
+from ...constants import _DEFAULT_IMAGE_NAME
+
+
+def sd_text_to_image(
+    prompt: str,
+    n_iter: int = 1,
+    width: int = 1024,
+    height: int = 1024,
+    options: dict = None,
+    baseurl: str = None,
+    save_dir: Optional[str] = None,
+) -> ServiceResponse:
+    """Generate image(s) based on the given prompt, and return image url(s).
+
+    Args:
+        prompt (`str`):
+            The text prompt to generate image.
+        n_iter (`int`, defaults to `1`):
+            The number of images to generate.
+        width (`int`, defaults to `1024`):
+            Width of the image.
+        height (`int`, defaults to `1024`):
+            Height of the image.
+        options (`dict`, defaults to `None`):
+            The options to override the sd-webui default settings.
+            If not specified, will use the default settings.
+        baseurl (`str`, defaults to `None`):
+            The base url of the sd-webui.
+        save_dir (`Optional[str]`, defaults to `None`):
+            The directory to save the generated images. If not specified,
+            will return the web urls.
+
+    Returns:
+        ServiceResponse:
+            A dictionary with two variables: `status` and `content`.
+            If `status` is ServiceExecStatus.SUCCESS,
+            the `content` is a dict with key 'image_urls' and
+            value is a list of the paths to the generated images.
+
+    Example:
+
+        .. code-block:: python
+
+            prompt = "A beautiful sunset in the mountains"
+            print(sd_text_to_image(prompt, 2))
+
+        > {
+        >     'status': 'SUCCESS',
+        >     'content': {'image_urls': ['IMAGE_URL1', 'IMAGE_URL2']}
+        > }
+
+    """
+    text2img = StableDiffusionImageSynthesisWrapper(
+        config_name="sd-text-to-image-service",  # Just a placeholder
+        baseurl=baseurl,
+    )
+    try:
+        kwargs = {"n_iter": n_iter, "width": width, "height": height}
+        if options:
+            kwargs["override_settings"] = options
+
+        res = text2img(prompt=prompt, save_local=False, **kwargs)
+        images = res.image_urls
+
+        # save images to save_dir
+        if images is not None:
+            if save_dir:
+                os.makedirs(save_dir, exist_ok=True)
+                urls_local = []
+                # Obtain the image file names in the url
+                for image in images:
+                    image_name = _DEFAULT_IMAGE_NAME.format(
+                        _get_timestamp(
+                            "%Y%m%d-%H%M%S",
+                        ),
+                        _generate_random_code(),
+                    )
+                    image_path = os.path.abspath(
+                        os.path.join(save_dir, image_name),
+                    )
+                    # Download the image
+                    image.save(image_path)
+                    urls_local.append(image_path)
+                return ServiceResponse(
+                    ServiceExecStatus.SUCCESS,
+                    {"image_urls": urls_local},
+                )
+            else:
+                # Return the default urls
+                file_manager = FileManager.get_instance()
+                urls = [file_manager.save_image(_) for _ in images]
+                return ServiceResponse(
+                    ServiceExecStatus.SUCCESS,
+                    {"image_urls": urls},
+                )
+        else:
+            return ServiceResponse(
+                ServiceExecStatus.ERROR,
+                "Error: Failed to generate images",
+            )
+    except Exception as e:
+        return ServiceResponse(
+            ServiceExecStatus.ERROR,
+            str(e),
+        )