add Cogagent #445

Merged
merged 17 commits on Mar 11, 2024
107 changes: 107 additions & 0 deletions paddlemix/examples/cogagent/chat_demo.py
@@ -0,0 +1,107 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

import numpy as np
import paddle

seed = 2024
paddle.seed(seed)
np.random.seed(seed)
random.seed(seed)

from paddlemix.auto.modeling import AutoModelMIX
from paddlemix.auto.tokenizer import AutoTokenizerMIX

"""
This is a demo for using CogAgent and CogVLM from the CLI.
Make sure you have installed the vicuna-7b-v1.5 tokenizer model (https://huggingface.co/lmsys/vicuna-7b-v1.5); the full checkpoint of the vicuna-7b-v1.5 LLM is not required.
In this demo we use the 'chat' template; you can replace it with others such as 'vqa'.
We strongly suggest using a GPU with bfloat16 support; otherwise it will be slow.
Note that only one picture can be processed per conversation, which means you cannot replace or insert another picture during the conversation.
"""
import argparse

from PIL import Image

parser = argparse.ArgumentParser()

parser.add_argument("--from_pretrained", type=str, default="THUDM/cogagent-chat-hf", help="pretrained ckpt")
parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5")
args = parser.parse_args()
Collaborator: There is an extra `local_tokenizer` here.

Contributor Author: Fixed.

MODEL_PATH = args.from_pretrained
TOKENIZER_PATH = args.local_tokenizer
DEVICE = "gpu" if paddle.device.cuda.device_count() >= 1 else "cpu"

tokenizer = AutoTokenizerMIX.from_pretrained(TOKENIZER_PATH)

torch_type = "float32"
print("========Use torch type as:{} with device:{}========\n\n".format(torch_type, DEVICE))
paddle.set_device(DEVICE)
Collaborator: Remove the torch-related naming and the print; DEVICE can stay or go, since Paddle defaults to the GPU.

Contributor Author: Fixed.
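
For illustration, a hedged sketch of the cleanup the reviewer is asking for (the actual committed fix may differ); it reuses `MODEL_PATH` and `AutoModelMIX` from the demo above:

```python
# Hypothetical sketch, not the committed code: use a Paddle-style name instead of
# "torch_type", drop the print, and rely on Paddle's default device selection
# (Paddle picks the GPU by default when one is available).
dtype = "float32"
model = AutoModelMIX.from_pretrained(
    MODEL_PATH,
    dtype=dtype,
    low_cpu_mem_usage=False,
)
model.eval()
```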


model = AutoModelMIX.from_pretrained(
MODEL_PATH,
dtype=torch_type,
low_cpu_mem_usage=False,
).to(DEVICE)
model.eval()

text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
while True:
image_path = input("image path >>>>> ")
if image_path == "":
print("You did not enter image path, the following will be a plain text conversation.")
image = None
text_only_first_query = True
else:
image = Image.open(image_path).convert("RGB")
history = []
while True:
query = input("Human:")
if query == "clear":
break
if image is None:
if text_only_first_query:
query = text_only_template.format(query)
text_only_first_query = False
else:
old_prompt = ""
for _, (old_query, response) in enumerate(history):
old_prompt += old_query + " " + response + "\n"
query = old_prompt + "USER: {} ASSISTANT:".format(query)
if image is None:
input_by_model = model.build_conversation_input_ids(
tokenizer, query=query, history=history, template_version="base"
)
else:
input_by_model = model.build_conversation_input_ids(
tokenizer, query=query, history=history, images=[image]
)
inputs = {
"input_ids": input_by_model["input_ids"].unsqueeze(axis=0).to(DEVICE),
"token_type_ids": input_by_model["token_type_ids"].unsqueeze(axis=0).to(DEVICE),
"attention_mask": input_by_model["attention_mask"].unsqueeze(axis=0).to(DEVICE),
"images": [[input_by_model["images"][0].to(DEVICE).to(torch_type)]] if image is not None else None,
}
if "cross_images" in input_by_model and input_by_model["cross_images"]:
inputs["cross_images"] = [[input_by_model["cross_images"][0].to(DEVICE).to(torch_type)]]
gen_kwargs = {"max_new_tokens": 2048, "do_sample": False}
with paddle.no_grad():
outputs, _ = model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs["input_ids"].shape[1] :]
response = tokenizer.decode(outputs[0])
response = response.split("</s>")[0]
print("\nCog:", response)
history.append((query, response))
27 changes: 27 additions & 0 deletions paddlemix/models/cogagent/README.md
@@ -0,0 +1,27 @@
# CogAgent

## 1. Model Overview

This model is a Paddle implementation of [CogAgent](https://arxiv.org/abs/2312.08914). It is aligned with `THUDM/cogagent-chat-hf` on Hugging Face, and the tokenizer uses `lmsys/vicuna-7b-v1.5` from Hugging Face.

Collaborator: Please make this introduction more detailed; you can refer to the qwen-vl one.


## 2. Demo

### 2.1 Install Dependencies

1) Install the PaddleNLP develop version
Collaborator: If there are no special dependency packages here, you can just point to the environment installation section on the home page instead of spelling it out.

```
pip install --pre --upgrade paddlenlp -f https://www.paddlepaddle.org.cn/whl/paddlenlp.html
```

2) Install the PaddleMix environment dependencies

```
pip install -r requirements.txt
```

### 2.2 Multi-turn Conversation

```bash
python paddlemix/examples/cogagent/chat_demo.py
```
Collaborator: Please add a description of the optional arguments here.

Contributor Author: Made the corresponding changes above~
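
For reference, a hedged sketch of what the optional-argument usage might look like, based on the argparse flags defined in chat_demo.py (the committed README wording may differ):

```bash
# Both flags are optional; the values below mirror the argparse defaults in chat_demo.py.
python paddlemix/examples/cogagent/chat_demo.py \
    --from_pretrained THUDM/cogagent-chat-hf \
    --local_tokenizer lmsys/vicuna-7b-v1.5
```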

65 changes: 65 additions & 0 deletions paddlemix/models/cogagent/configuration.py
@@ -0,0 +1,65 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Literal

from paddlenlp import transformers


class CogAgentConfig(transformers.PretrainedConfig):
_auto_class = "AutoConfig"

def __init__(
self,
vocab_size=32000,
hidden_size=4096,
cross_hidden_size=1024,
cross_compute_hidden_size=1024,
cross_image_size=1120,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-06,
template_version: Literal["base", "chat"] = "chat",
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
use_cache=True,
**kwargs
):
self.hidden_size = hidden_size
self.cross_hidden_size = cross_hidden_size
self.cross_compute_hidden_size = cross_compute_hidden_size
self.cross_image_size = cross_image_size
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.rms_norm_eps = rms_norm_eps
self.initializer_range = initializer_range
self.vocab_size = vocab_size
self.num_hidden_layers = num_hidden_layers
self.hidden_act = hidden_act
self.template_version = template_version
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
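
As a quick usage sketch (not part of this PR), the config can be instantiated with its defaults or with selected fields overridden; the import path below assumes the module location shown in this diff:

```python
# Hypothetical usage sketch; the import path mirrors the file added in this PR.
from paddlemix.models.cogagent.configuration import CogAgentConfig

config = CogAgentConfig(template_version="base")  # override the default "chat" template
print(config.hidden_size)       # 4096 (default)
print(config.cross_image_size)  # 1120 (default)
print(config.template_version)  # "base"
```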