PaddlePaddle · LokeZhou · Mar 11, 2024 · Feb 29, 2024 · Mar 1, 2024 · Mar 1, 2024
diff --git a/paddlemix/examples/cogagent/README.md b/paddlemix/examples/cogagent/README.md
@@ -0,0 +1,38 @@
+# CogAgent
+
+## 1. 模型介绍
+
+该模型是 [CogAgent](https://arxiv.org/abs/2312.08914) 的 paddle 实现。
+
+[CogAgent](https://arxiv.org/abs/2312.08914)是一个基于CogVLM改进的开源视觉语言模型。CogAgent-18B拥有110亿的视觉参数和70亿的语言参数。
+
+CogAgent-18B在9个经典的跨模态基准测试中实现了最先进的全能性能，包括VQAv2、OK-VQA、TextVQA、ST-VQA、ChartQA、infoVQA、DocVQA、MM-Vet和POPE。
+
+除了CogVLM已有的所有功能（视觉多轮对话，视觉定位）之外，CogAgent：
+
+1. 支持更高分辨率的视觉输入和对话式问答。它支持超高分辨率的图像输入，达到1120x1120。
+
+2. 拥有视觉Agent的能力，能够在任何图形用户界面截图上，为任何给定任务返回一个计划，下一步行动，以及带有坐标的特定操作。
+
+3. 增强了与图形用户界面相关的问答能力，使其能够处理关于任何图形用户界面截图的问题，例如网页、PC应用、移动应用等。
+
+4. 通过改进预训练和微调，提高了OCR相关任务的能力。
+
+本仓库提供 paddle 版本的 cogagent-chat 模型。
+
+## 2. 环境准备
+
+1）[安装PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP?tab=readme-ov-file#%E5%AE%89%E8%A3%85)
+
+2）[安装 PaddleMix 环境依赖包](https://github.com/PaddlePaddle/PaddleMIX/tree/b4f97ff859e1964c839fc5fab94f7ba63b1e5959?tab=readme-ov-file#%E5%AE%89%E8%A3%85)
+
+## 3. 快速开始
+完成环境准备后，我们目前提供多轮对话方式使用：
+
+```bash
+python paddlemix/examples/cogagent/chat_demo.py \
+--from_pretrained "THUDM/cogagent-chat"
+```
+
+可配置参数说明：
+  * `from_pretrained`: 指定CogAgent的模型名字或权重路径以及tokenizer，默认 THUDM/cogagent-chat
diff --git a/paddlemix/examples/cogagent/chat_demo.py b/paddlemix/examples/cogagent/chat_demo.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+import numpy as np
+import paddle
+
+# Seed every random source used here so repeated runs start from the same state.
+seed = 2024
+paddle.seed(seed)
+np.random.seed(seed)
+random.seed(seed)
+
+import argparse
+
+from PIL import Image
+
+from paddlemix.auto.modeling import AutoModelMIX
+from paddlemix.auto.tokenizer import AutoTokenizerMIX
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--from_pretrained", type=str, default="THUDM/cogagent-chat", help="pretrained ckpt and tokenizer")
+args = parser.parse_args()
+MODEL_PATH = args.from_pretrained
+# The tokenizer is loaded from the same name/path as the model weights.
+TOKENIZER_PATH = MODEL_PATH
+
+tokenizer = AutoTokenizerMIX.from_pretrained(TOKENIZER_PATH)
+
+data_type = "float32"
+
+model = AutoModelMIX.from_pretrained(
+    MODEL_PATH,
+    dtype=data_type,
+    low_cpu_mem_usage=False,
+)
+model.eval()
+
+# Prompt wrapper applied to the first query of an image-free conversation.
+text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
+# Outer loop: one iteration per session (pick an image, then chat about it).
+while True:
+    image_path = input("image path >>>>> ").strip()
+    # Assigned on every session so the flag is never undefined or stale.
+    text_only_first_query = not image_path
+    if not image_path:
+        print("You did not enter image path, the following will be a plain text conversation.")
+        image = None
+    else:
+        try:
+            image = Image.open(image_path).convert("RGB")
+        except OSError as err:
+            # Missing, unreadable or non-image file: ask again instead of crashing.
+            print(f"Could not open image {image_path!r}: {err}")
+            continue
+    history = []
+    # Inner loop: one iteration per turn; entering "clear" starts a new session.
+    while True:
+        query = input("Human:")
+        if query == "clear":
+            break
+        if image is None:
+            if text_only_first_query:
+                query = text_only_template.format(query)
+                text_only_first_query = False
+            else:
+                # Prepend all earlier turns to the new question.
+                old_prompt = "".join(f"{old_query} {old_response}\n" for old_query, old_response in history)
+                query = old_prompt + "USER: {} ASSISTANT:".format(query)
+            input_by_model = model.build_conversation_input_ids(
+                tokenizer, query=query, history=history, template_version="base"
+            )
+        else:
+            input_by_model = model.build_conversation_input_ids(
+                tokenizer, query=query, history=history, images=[image]
+            )
+        # Add a leading axis of size one to each tensor input.
+        inputs = {
+            "input_ids": input_by_model["input_ids"].unsqueeze(axis=0),
+            "token_type_ids": input_by_model["token_type_ids"].unsqueeze(axis=0),
+            "attention_mask": input_by_model["attention_mask"].unsqueeze(axis=0),
+            "images": [[input_by_model["images"][0].to(data_type)]] if image is not None else None,
+        }
+        if "cross_images" in input_by_model and input_by_model["cross_images"]:
+            inputs["cross_images"] = [[input_by_model["cross_images"][0].to(data_type)]]
+        gen_kwargs = {"max_new_tokens": 2048, "do_sample": False}
+        with paddle.no_grad():
+            outputs, _ = model.generate(**inputs, **gen_kwargs)
+            # Keep only the positions that come after the prompt length.
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            response = tokenizer.decode(outputs[0])
+            response = response.split("</s>")[0]
+            print("\nCog:", response)
+        history.append((query, response))
diff --git a/paddlemix/models/cogagent/configuration.py b/paddlemix/models/cogagent/configuration.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Literal
+
+from paddlenlp import transformers
+
+
+class CogAgentConfig(transformers.PretrainedConfig):
+    """Hyperparameter container for the CogAgent model.
+
+    The special-token ids, ``tie_word_embeddings`` and any extra keyword
+    arguments are forwarded to ``PretrainedConfig.__init__``; every other
+    constructor argument is stored as a same-named attribute.
+    """
+
+    _auto_class = "AutoConfig"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        cross_hidden_size=1024,
+        cross_compute_hidden_size=1024,
+        cross_image_size=1120,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-06,
+        template_version: Literal["base", "chat"] = "chat",
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        use_cache=True,
+        **kwargs
+    ):
+        # Vocabulary size, widths, depth, head count and position limit.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        # Settings carrying the ``cross_`` prefix.
+        self.cross_hidden_size = cross_hidden_size
+        self.cross_compute_hidden_size = cross_compute_hidden_size
+        self.cross_image_size = cross_image_size
+        # Activation name, RMS-norm epsilon and initializer range.
+        self.hidden_act = hidden_act
+        self.rms_norm_eps = rms_norm_eps
+        self.initializer_range = initializer_range
+        # Prompt template variant and cache switch.
+        self.template_version = template_version
+        self.use_cache = use_cache
+        # The base class receives the token ids, the tying flag and all extras.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )