feat: add data agent in the agentUniverse. #100

Merged · 2 commits · Jun 21, 2024
4 changes: 2 additions & 2 deletions agentuniverse_dataflow/node/data/event/planner.py
@@ -39,8 +39,8 @@ def _node_preprocess(self) -> None:
         if self._dataset_in_handler:
             perceived_list = self._dataset_in_handler.read_json_obj_list()
             # one for plan, at least one for verification
-            if not perceived_list or len(perceived_list) <= 1:
-                raise Exception('perceived json list does not provide at least 2 samples!')
+            if not perceived_list or len(perceived_list) < 1:
+                raise Exception('perceived json list does not provide at least 1 sample!')

         for i in range(0, len(perceived_list)):
             json_obj = perceived_list[i]
2 changes: 1 addition & 1 deletion agentuniverse_dataflow/node/data/prompt/input/extend.py
@@ -20,7 +20,7 @@
 class ExtendNode(PromptBase):
     """The ExtendNode class, which is used to define the class of extend node."""

-    extend_times: int = 4
+    extend_times: int = 1

     def _node_preprocess(self) -> None:
         super()._node_preprocess()
2 changes: 1 addition & 1 deletion agentuniverse_dataflow/util/llm/llm_call.py
@@ -25,7 +25,7 @@ async def async_batch_call(prompts: list[str], llm_name: str):
         if llm is None:
             raise Exception('LLM not found for agentuniverse data.')
         messages = [{"role": "user", "content": prompts[i]}]
-        tasks.append(llm.acall(messages=messages, timeout=200))
+        tasks.append(llm.acall(messages=messages, timeout=700))

     task = asyncio.create_task(show_progress(len(prompts), asyncio.get_running_loop()))
     outputs = await asyncio.gather(*tasks, return_exceptions=True)
Binary file added docs/guidebook/_picture/dataflow_dispatch.png
Binary file added docs/guidebook/_picture/dataflow_flowchart.jpg
115 changes: 115 additions & 0 deletions docs/guidebook/en/8_1_agentUniverse-dataflow.md
@@ -0,0 +1,115 @@
# Project Introduction
agentUniverse-dataflow is a project based on the idea of **Agentic Workflow**. Using a multi-agent, flow-orchestration approach, it enhances the data processing workflow of LLMs and agents through **offline adaptive LLM data collection, fine-tuning dataset generation, model inference training, multi-dimensional automatic dataset evaluation, and more.**

It includes:
1. A foundational data framework: **DataFramework**, structured around Dispatch-Flow-Node-Data/Event/Prompt/Answer/Eval, with capabilities for continuous integration and extension.
2. An agent framework: **PVRP** (Planner-Verifier-Reflector-Producer).
3. A set of capabilities:
- Adaptive data event collection: **auto_event**
- Fine-tuning dataset generation: **dataset_build**
- Uploading for training and inference: **train_dump**
- Multi-dimensional objective automatic evaluation: **eval_report**
- Model deployment and activation: **model_deploy**
- System instruction optimization: **instruct_select**

# Quick Start
In the agentUniverse standard project template `sample_standard_app`:
## step1
Configure the prompt loading path in config.toml: add `agentuniverse_dataflow.prompt` so that the built-in prompt files of agentuniverse_dataflow are loaded.
```toml
[CORE_PACKAGE]
# Perform a full component scan and registration for all the paths under this list.
default = ['sample_standard_app.app.core']
# Scan and register agent components for all paths under this list, with priority over the default.
agent = ['sample_standard_app.app.core.agent']
# Scan and register knowledge components for all paths under this list, with priority over the default.
knowledge = ['sample_standard_app.app.core.knowledge']
# Scan and register llm components for all paths under this list, with priority over the default.
llm = ['sample_standard_app.app.core.llm']
# Scan and register planner components for all paths under this list, with priority over the default.
planner = ['sample_standard_app.app.core.planner']
# Scan and register tool components for all paths under this list, with priority over the default.
tool = ['sample_standard_app.app.core.tool']
# Scan and register memory components for all paths under this list, with priority over the default.
memory = ['sample_standard_app.app.core.memory']
# Scan and register service components for all paths under this list, with priority over the default.
service = ['sample_standard_app.app.core.service']
# Scan and register prompt components for all paths under this list, with priority over the default.
prompt = ['sample_standard_app.app.core.prompt', 'agentuniverse_dataflow.prompt']
```
## step2
In the dataflow directory of sample_standard_app, dispatch serves as the entry point: configure the dataflow process in `dispatch.yaml`, then run the `dispatch.py` file (the results of the dataflow run are stored as jsonl files in the dispatch/data directory).

![picture](../_picture/dataflow_dispatch.png)

In the `dispatch.yaml` file, users can customize which dataflow tasks to run. With the configuration below, for example, a dataflow run performs two tasks in sequence: **adaptive data event collection and fine-tuning dataset generation**.
```yaml
name: 'main_dispatch'
description: 'dispatch with multi-dataflows which will execute one after another'
dataflows:
- ../flow/auto_event.yaml
- ../flow/dataset_build.yaml
```
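
Each task writes its results as jsonl under dispatch/data, so a run can be inspected with a few lines of Python. A minimal sketch, assuming a hypothetical output file name (the actual names depend on the flows configured in dispatch.yaml):
```python
import json

# Hypothetical output path for illustration only; check dispatch/data
# after a run for the actual file names.
with open('dispatch/data/dataset_build.jsonl', 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f'{len(records)} records produced by the dataflow run')
```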

# Dataflow Introduction
agentUniverse-dataflow currently supports six kinds of flow, namely `auto_event/dataset_build/train_dump/eval_report/model_deploy/instruct_select`, and each flow is combined through nodes to form a corresponding pipeline.
## Dataflow Flowchart
![picture](../_picture/dataflow_flowchart.jpg)

In the yaml configuration file of each flow, every node carries llm and prompt version configuration. **Users can customize the llm and prompt to achieve version management and quick switching.**
For example, the node in the figure below uses the QWen model of the sample project and a built-in prompt file of agentuniverse_dataflow:

![picture](../_picture/dataflow_dataset_build.png)
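
As an illustration only, a node entry of roughly the following shape would pin a model and a prompt version; the field names here are assumptions, not the authoritative schema (consult the sample yaml files in sample_standard_app for the real one):
```yaml
# Hypothetical node entry -- field names are illustrative assumptions.
- name: 'answer_node'
  llm: 'qwen_llm'                  # LLM defined in the sample project
  prompt_version: 'answer_node.cn' # built-in prompt of agentuniverse_dataflow
```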
## Flow Details
**Special note:**
The `auto event/dataset build/eval report/instruct select` flows in dataflow can be **directly run and experienced by users**. The `train dump` and `model deploy` flows mainly **convey the flow concept**: at present they are deployed and run on Ant Group's internal standard model deployment and training platform, and opening them up in the open-source community version will be considered later.
### Auto Event
The main function of Auto Event is to collect from different data sources, perceive the runtime log information of the corresponding agent in each source, and extract valid input and output to generate specific jsonl files (currently jsonl data sources are supported; more data source types will be opened up over time).

Auto Event contains three nodes, `Perceiver/Planner/Executor`, configured through yaml; the example configuration file is `auto_event.yaml` in sample_standard_app, and a simplified sketch follows the list below.
- Perceiver Node: perceives the data source and extracts the original input and output.
- Planner Node: generates code to extract valid model input and output from the data source.
- Executor Node: executes the generated code to extract valid model input and output.
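
A sketch of what such a flow file might look like; node and field names are illustrative assumptions, not the real `auto_event.yaml`:
```yaml
# Illustrative sketch only -- see auto_event.yaml in sample_standard_app
# for the authoritative configuration.
name: 'auto_event_flow'
description: 'perceive agent runtime logs and extract valid input/output pairs'
nodes:
  - name: 'perceiver_node'
    llm: 'qwen_llm'
  - name: 'planner_node'
    llm: 'qwen_llm'
  - name: 'executor_node'
    llm: 'qwen_llm'
```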

### Dataset Build
The main function of Dataset Build is to generate domain-specific high-quality q&a datasets.

Dataset Build contains six nodes, `Seed/Rewrite/Extend/Dedupe/Answer/Filter`, configured through yaml; the example configuration file is `dataset_build.yaml` in sample_standard_app. A conceptual sketch of the Dedupe step follows the list below.
- Seed Node: generates rough query sets for a user-specified domain.
- Rewrite Node: rewrites rough query sets into standard query sets.
- Extend Node: extends query sets.
- Dedupe Node: de-duplicates query sets.
- Answer Node: calls the LLM on the query sets to generate the q&a datasets.
- Filter Node: filters valid datasets using domain-specific evaluation criteria.
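
As a rough illustration of what Dedupe does conceptually, the sketch below drops exact duplicates from a query set stored as jsonl. This is a simplification and an assumption: the actual node may use semantic rather than exact matching.
```python
import json

def dedupe_queries(in_path: str, out_path: str) -> None:
    """Drop exact-duplicate queries from a jsonl query set (one {"prompt": ...} per line)."""
    seen: set[str] = set()
    with open(in_path, 'r', encoding='utf-8') as fin, \
         open(out_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            if not line.strip():
                continue
            query = json.loads(line).get('prompt', '')
            if query and query not in seen:
                seen.add(query)
                fout.write(json.dumps({'prompt': query}, ensure_ascii=False) + '\n')
```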

### Train Dump
The main function of Train Dump is to upload specified datasets, run SFT training on the model, and dump q&a result sets.

Train Dump contains four nodes, `Upload/Train/PreDeploy/Dump`, configured through yaml; the example configuration file is `train_dump.yaml` in sample_standard_app.
- Upload Node: uploads datasets to the model training platform.
- Train Node: trains the model on the fine-tuning datasets.
- PreDeploy Node: pre-deploys the trained model.
- Dump Node: dumps q&a result sets for the given query sets.

### Eval Report
The main function of Eval Report is to evaluate the quality of q&a result sets and generate evaluation reports.

Eval Report contains two nodes, `Eval/Report`, configured through yaml; the example configuration file is `eval_report.yaml` in sample_standard_app. A toy sketch of the idea follows the list below.
- Eval Node: evaluates the quality of q&a result sets along multiple dimensions.
- Report Node: generates quality assessment reports.
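
One way to picture a multi-dimensional evaluation is scoring each q&a pair on several axes and averaging per axis into a report. The sketch below is purely illustrative; the dimension names and 0-1 scale are assumptions, not the node's actual criteria:
```python
from statistics import mean

# Hypothetical evaluation dimensions; the actual Eval Node defines its own.
DIMENSIONS = ('relevance', 'correctness', 'completeness')

def build_report(scored_pairs: list[dict[str, float]]) -> dict[str, float]:
    """Average per-dimension scores across all q&a pairs."""
    return {dim: mean(pair[dim] for pair in scored_pairs) for dim in DIMENSIONS}

# Example: two q&a pairs already scored per dimension on a 0-1 scale.
report = build_report([
    {'relevance': 0.9, 'correctness': 0.8, 'completeness': 0.7},
    {'relevance': 0.6, 'correctness': 0.9, 'completeness': 0.8},
])
print(report)  # {'relevance': 0.75, 'correctness': 0.85, 'completeness': 0.75}
```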

### Model Deploy
The main function of Model Deploy is to formally deploy the trained model and dump q&a result sets.

Model Deploy contains two nodes, `Deploy/Dump`, configured through yaml; the example configuration file is `model_deploy.yaml` in sample_standard_app.
- Deploy Node: formally deploys the trained model.
- Dump Node: dumps q&a result sets for the given query sets.

### Instruct Select
The main function of Instruct Select is to generate instruction sets, assemble complete prompt sets, dump q&a result sets, and evaluate the quality of result sets.

Instruct Select contains four nodes, `Seed/PromptGen/Dump/Eval`, configured through yaml; the example configuration file is `instruct_select.yaml` in sample_standard_app. A sketch of the PromptGen step follows the list below.
- Seed Node: generates instruction sets based on a user-specified domain.
- PromptGen Node: merges instruction sets and query sets into complete prompt sets.
- Dump Node: dumps q&a result sets for the given prompt sets.
- Eval Node: evaluates the quality of q&a result sets along multiple dimensions.
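
Conceptually, PromptGen can be thought of as a cross product of instructions and queries; the sketch below shows one simple way this could work (the function name and join format are assumptions for illustration):
```python
import itertools

def prompt_gen(instructions: list[str], queries: list[str]) -> list[str]:
    """Combine every instruction with every query into a complete prompt."""
    return [f'{inst}\n{query}'
            for inst, query in itertools.product(instructions, queries)]

prompts = prompt_gen(
    ['You are a financial analysis expert.'],
    ['How should a bank assess credit risk?'],
)
print(prompts[0])
```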
115 changes: 115 additions & 0 deletions docs/guidebook/zh/8_1_agentUniverse-dataflow.md
@@ -0,0 +1,115 @@
# Project Introduction
agentUniverse-dataflow is based on the idea of **Agentic Workflow**; in the form of multiple agents and flow orchestration, it strengthens the data processing pipeline of LLMs and agents through work such as **offline adaptive data collection, fine-tuning dataset generation, model inference training, and multi-dimensional automatic dataset evaluation**.

Specifically, it includes:
1. A foundational data framework: **DataFramework**, structured around Dispatch-Flow-Node-Data/Event/Prompt/Answer/Eval, with capabilities for continuous integration and extension
2. An agent framework: **PVRP** (Planner-Verifier-Reflector-Producer)
3. A set of capabilities:
   - Adaptive data event collection: **auto_event**
   - Fine-tuning dataset generation: **dataset_build**
   - Uploading for training and inference: **train_dump**
   - Multi-dimensional objective automatic evaluation: **eval_report**
   - Model deployment and activation: **model_deploy**
   - System instruction selection: **instruct_select**

# Quick Start
In the agentUniverse standard project template `sample_standard_app`:
## step1
Configure the prompt loading path in config.toml: add `agentuniverse_dataflow.prompt` so that the built-in prompt files of agentuniverse_dataflow are loaded.
```toml
[CORE_PACKAGE]
# Perform a full component scan and registration for all the paths under this list.
default = ['sample_standard_app.app.core']
# Scan and register agent components for all paths under this list, with priority over the default.
agent = ['sample_standard_app.app.core.agent']
# Scan and register knowledge components for all paths under this list, with priority over the default.
knowledge = ['sample_standard_app.app.core.knowledge']
# Scan and register llm components for all paths under this list, with priority over the default.
llm = ['sample_standard_app.app.core.llm']
# Scan and register planner components for all paths under this list, with priority over the default.
planner = ['sample_standard_app.app.core.planner']
# Scan and register tool components for all paths under this list, with priority over the default.
tool = ['sample_standard_app.app.core.tool']
# Scan and register memory components for all paths under this list, with priority over the default.
memory = ['sample_standard_app.app.core.memory']
# Scan and register service components for all paths under this list, with priority over the default.
service = ['sample_standard_app.app.core.service']
# Scan and register prompt components for all paths under this list, with priority over the default.
prompt = ['sample_standard_app.app.core.prompt', 'agentuniverse_dataflow.prompt']
```
## step2
In the dataflow directory of sample_standard_app, dispatch serves as the entry point: configure the dataflow process in `dispatch.yaml`, then run the `dispatch.py` file (the results of the dataflow run are stored as jsonl files in the dispatch/data directory).

![picture](../_picture/dataflow_dispatch.png)

In the `dispatch.yaml` file, users can customize which dataflow tasks to run. With the configuration below, for example, a dataflow run performs two tasks: **adaptive data event collection and fine-tuning dataset generation**.
```yaml
name: 'main_dispatch'
description: 'dispatch with multi-dataflows which will execute one after another'
dataflows:
- ../flow/auto_event.yaml
- ../flow/dataset_build.yaml
```
# Dataflow Introduction
agentUniverse-dataflow currently supports six kinds of flow, namely `auto_event/dataset_build/train_dump/eval_report/model_deploy/instruct_select`; each flow combines nodes to form a corresponding pipeline.
## Dataflow Flowchart
![picture](../_picture/dataflow_flowchart.jpg)

In each flow's yaml configuration file, every node carries llm and prompt version configuration; users can customize the base llm model and prompt to achieve version management and quick switching.
For example, the node in the figure below uses the qwen base model of the sample project and a built-in prompt file of agentuniverse-dataflow:

![picture](../_picture/dataflow_dataset_build.png)
## Flow Details
**Special note:**
The `auto event/dataset build/eval report/instruct select` flows in dataflow can be **directly run and experienced by users**; the `train dump/model deploy` flows are **concept-first**: at present they run on Ant Group's internal standard model deployment and training platform, and opening them up in the open-source community version will be considered later.

### Auto Event
The main function of Auto Event is to collect from different data sources, perceive the runtime log information of the corresponding agent in each source, and extract valid input and output to generate specific jsonl files (currently jsonl data sources are supported; more data source types will be opened up over time).

Auto Event contains three nodes, `Perceiver/Planner/Executor`; the pipeline is configured through yaml, and the example configuration file is auto_event.yaml in sample_standard_app.
- Perceiver Node: perceives the data source and extracts the original input and output
- Planner Node: generates code to extract valid model input and output from the data source
- Executor Node: executes the generated code to extract valid model input and output

### Dataset Build
The main function of Dataset Build is to generate high-quality, domain-specific q&a datasets.

Dataset Build contains six nodes, `Seed/Rewrite/Extend/Dedupe/Answer/Filter`; the pipeline is configured through yaml, and the example configuration file is dataset_build.yaml in sample_standard_app.
- Seed Node: generates rough query sets for a user-specified domain
- Rewrite Node: rewrites rough query sets into standard query sets
- Extend Node: extends query sets
- Dedupe Node: de-duplicates the contents of query sets
- Answer Node: calls the model on the query sets to generate q&a result sets
- Filter Node: filters valid datasets using domain-specific evaluation criteria

### Train Dump
The main function of Train Dump is to upload specified datasets, run SFT training on the model, and dump q&a result sets.

Train Dump contains four nodes, `Upload/Train/PreDeploy/Dump`; the pipeline is configured through yaml, and the example configuration file is train_dump.yaml in sample_standard_app.
- Upload Node: uploads datasets to the model training platform
- Train Node: trains the model on the fine-tuning datasets
- PreDeploy Node: pre-deploys the trained model
- Dump Node: dumps q&a results for the given query sets

### Eval Report
The main function of Eval Report is to evaluate the quality of q&a result sets along multiple dimensions and generate evaluation reports.

Eval Report contains two nodes, `Eval/Report`; the pipeline is configured through yaml, and the example configuration file is eval_report.yaml in sample_standard_app.
- Eval Node: evaluates the quality of q&a result sets along multiple dimensions
- Report Node: generates quality assessment reports

### Model Deploy
The main function of Model Deploy is to formally deploy the trained model and dump q&a result sets.

Model Deploy contains two nodes, `Deploy/Dump`; the pipeline is configured through yaml, and the example configuration file is model_deploy.yaml in sample_standard_app.
- Deploy Node: formally deploys the trained model
- Dump Node: dumps q&a results for the given query sets

### Instruct Select
The main function of Instruct Select is to generate instruction sets, assemble complete prompt sets, dump q&a result sets, and evaluate q&a quality.

Instruct Select contains four nodes, `Seed/PromptGen/Dump/Eval`; the pipeline is configured through yaml, and the example configuration file is instruct_select.yaml in sample_standard_app.
- Seed Node: generates instruction sets for a user-specified domain
- PromptGen Node: merges instruction sets and query sets into complete prompt sets
- Dump Node: dumps q&a results for the given prompt sets
- Eval Node: evaluates the quality of q&a result sets along multiple dimensions
@@ -0,0 +1,10 @@
{"prompt": "怎样判断一个企业的经济状况稳健与否?"}
{"prompt": "做出资本配置决定时,有哪些核心标志可以体现公司的盈利表现?"}
{"prompt": "金融投资领域中,风险控制措施的重要性及其实行方法是怎样的?"}
{"prompt": "加密货币对常规金融市场的冲击与后续走向如何?"}
{"prompt": "在公司收购过程中,财务审查需着重检查哪几个关键点?"}
{"prompt": "如何评估一家银行的信贷风险并制定相应的风险管理策略?"}
{"prompt": "在进行跨国投资时,应考虑哪些宏观经济指标来预测市场趋势?"}
{"prompt": "保险业中的精算模型如何影响产品定价和企业利润?"}
{"prompt": "绿色金融的发展趋势及其对传统能源行业的影响是什么?"}
{"prompt": "金融科技如何改变个人理财规划和财富管理服务?"}