Merge pull request #145 from alipay/dev_fanen

ADD: New Knowledge component
antgroup · Aug 27, 2024 · cf2223d · cf2223d
2 parents 55837b5 + d3b8120
commit cf2223d
Show file tree

Hide file tree

Showing 135 changed files with 3,672 additions and 9,944 deletions.
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/__init__.py b/agentuniverse/agent/action/knowledge/doc_processor/__init__.py
@@ -0,0 +1,7 @@
+# !/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+# @Time    : 2024/7/23 13:59
+# @Author  : fanen.lhy
+# @Email   : fanen.lhy@antgroup.com
+# @FileName: __init__.py
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/character_text_splitter.py b/agentuniverse/agent/action/knowledge/doc_processor/character_text_splitter.py
@@ -0,0 +1,50 @@
+# !/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+# @Time    : 2024/8/5 14:37
+# @Author  : fanen.lhy
+# @Email   : fanen.lhy@antgroup.com
+# @FileName: character_text_splitter.py
+from typing import List, Optional
+from langchain.text_splitter import CharacterTextSplitter as Splitter
+
+from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
+    DocProcessor
+from agentuniverse.agent.action.knowledge.store.document import Document
+from agentuniverse.agent.action.knowledge.store.query import Query
+from agentuniverse.base.config.component_configer.component_configer import \
+    ComponentConfiger
+
+
+class CharacterTextSplitter(DocProcessor):
+    chunk_size: int = 200
+    chunk_overlap: int = 20
+    separator: str = "/n/n"
+    splitter: Optional[Splitter] = None
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.splitter = Splitter(separator=self.separator,
+                                 chunk_size=self.chunk_size,
+                                 chunk_overlap=self.chunk_overlap)
+
+    def _process_docs(self, origin_docs: List[Document], query: Query = None) -> \
+            List[Document]:
+        lc_doc_list = self.splitter.split_documents(Document.as_langchain_list(
+            origin_docs
+        ))
+        return Document.from_langchain_list(lc_doc_list)
+
+    def _initialize_by_component_configer(self,
+                                         doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
+        super()._initialize_by_component_configer(doc_processor_configer)
+        if hasattr(doc_processor_configer, "chunk_size"):
+            self.chunk_size = doc_processor_configer.chunk_size
+        if hasattr(doc_processor_configer, "chunk_overlap"):
+            self.chunk_overlap = doc_processor_configer.chunk_overlap
+        if hasattr(doc_processor_configer, "separator"):
+            self.separator = doc_processor_configer.separator
+        self.splitter = Splitter(separator=self.separator,
+                                 chunk_size=self.chunk_size,
+                                 chunk_overlap=self.chunk_overlap)
+        return self
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/character_text_splitter.yaml b/agentuniverse/agent/action/knowledge/doc_processor/character_text_splitter.yaml
@@ -0,0 +1,9 @@
+name: 'character_text_splitter'
+description: 'langchain character text splitter'
+chunk_size: 200
+chunk_overlap: 20
+separator: "\n\n"
+metadata:
+  type: 'DOC_PROCESSOR'
+  module: 'agentuniverse.agent.action.knowledge.doc_processor.character_text_splitter'
+  class: 'CharacterTextSplitter'
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/dashscope_reranker.py b/agentuniverse/agent/action/knowledge/doc_processor/dashscope_reranker.py
@@ -0,0 +1,67 @@
+# !/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+# @Time    : 2024/8/5 15:48
+# @Author  : fanen.lhy
+# @Email   : fanen.lhy@antgroup.com
+# @FileName: dashscope_reranker.py
+
+from typing import List, Optional
+import dashscope
+from http import HTTPStatus
+
+from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
+    DocProcessor
+from agentuniverse.agent.action.knowledge.store.document import Document
+from agentuniverse.agent.action.knowledge.store.query import Query
+from agentuniverse.base.config.component_configer.component_configer import \
+    ComponentConfiger
+
+# Maps the model names accepted in DashscopeReranker.model_name to the
+# corresponding dashscope.TextReRank.Models constants.
+MODEL_NAME_MAP = {
+    "gte_rerank": dashscope.TextReRank.Models.gte_rerank
+}
+
+
+class DashscopeReranker(DocProcessor):
+    model_name: str = "gte_rerank"
+    top_n: int = 10
+
+    def _process_docs(self, origin_docs: List[Document], query: Query = None) -> \
+            List[Document]:
+        if not query or not query.query_str:
+            raise Exception("Dashscope reranker need an origin string query.")
+        if len(origin_docs)<1:
+            return origin_docs
+        documents_texts = []
+        for _doc in origin_docs:
+            documents_texts.append(_doc.text)
+        resp = dashscope.TextReRank.call(
+            model=MODEL_NAME_MAP.get(self.model_name),
+            query=query.query_str,
+            documents=documents_texts,
+            top_n=self.top_n,
+            return_documents=False
+        )
+        if resp.status_code == HTTPStatus.OK:
+            results = resp.output.results
+        else:
+            raise Exception(f"Dashscope rerank api call error: {resp}")
+        rerank_docs = []
+        for _result in results:
+            index = _result.index
+            if origin_docs[index].metadata:
+                origin_docs[index].metadata["relevance_score"] = _result.relevance_score
+            else:
+                origin_docs[index].metadata = {"relevance_score": _result.relevance_score}
+            rerank_docs.append(origin_docs[index])
+
+        return rerank_docs
+
+    def _initialize_by_component_configer(self,
+                                         doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
+        super()._initialize_by_component_configer(doc_processor_configer)
+        if hasattr(doc_processor_configer, "model_name"):
+            self.model_name = doc_processor_configer.model_name
+        if hasattr(doc_processor_configer, "top_n"):
+            self.top_n = doc_processor_configer.top_n
+        return self
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/dashscope_reranker.yaml b/agentuniverse/agent/action/knowledge/doc_processor/dashscope_reranker.yaml
@@ -0,0 +1,6 @@
+name: 'dashscope_reranker'
+description: 'reranker use dashscope api'
+metadata:
+  type: 'DOC_PROCESSOR'
+  module: 'agentuniverse.agent.action.knowledge.doc_processor.dashscope_reranker'
+  class: 'DashscopeReranker'
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/doc_processor.py b/agentuniverse/agent/action/knowledge/doc_processor/doc_processor.py
@@ -0,0 +1,56 @@
+# !/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+# @Time    : 2024/7/23 14:00
+# @Author  : fanen.lhy
+# @Email   : fanen.lhy@antgroup.com
+# @FileName: doc_processor.py
+
+from abc import abstractmethod
+from typing import List, Optional
+
+from agentuniverse.agent.action.knowledge.store.query import Query
+from agentuniverse.agent.action.knowledge.store.document import Document
+from agentuniverse.base.component.component_base import ComponentEnum
+from agentuniverse.base.component.component_base import ComponentBase
+from agentuniverse.base.config.component_configer.component_configer import \
+    ComponentConfiger
+
+
+class DocProcessor(ComponentBase):
+    """The basic class for doc processor.
+    """
+
+    component_type: ComponentEnum = ComponentEnum.DOC_PROCESSOR
+    name: Optional[str] = None
+    description: Optional[str] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def process_docs(self, origin_docs: List[Document], query: Query = None) -> \
+            List[Document]:
+        """Process input documents，return should also be a document list."""
+        return self._process_docs(origin_docs, query)
+
+    @abstractmethod
+    def _process_docs(self, origin_docs: List[Document],
+                      query: Query = None) -> \
+            List[Document]:
+        """Process input documents，return should also be a document list."""
+        pass
+
+    def _initialize_by_component_configer(self,
+                                         doc_processor_configer: ComponentConfiger) \
+            -> 'DocProcessor':
+        """Initialize the DocProcessor by the ComponentConfiger object.
+
+        Args:
+            doc_processor_configer(ComponentConfiger): A configer contains DocProcessor
+            basic info.
+        Returns:
+            DocProcessor: A DocProcessor instance.
+        """
+        self.name = doc_processor_configer.name
+        self.description = doc_processor_configer.description
+        return self
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/doc_processor_manager.py b/agentuniverse/agent/action/knowledge/doc_processor/doc_processor_manager.py
@@ -0,0 +1,20 @@
+# !/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+# @Time    : 2024/7/23 14:12
+# @Author  : fanen.lhy
+# @Email   : fanen.lhy@antgroup.com
+# @FileName: doc_processor_manager.py
+
+from agentuniverse.base.annotation.singleton import singleton
+from agentuniverse.base.component.component_enum import ComponentEnum
+from agentuniverse.base.component.component_manager_base import ComponentManagerBase
+from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import DocProcessor
+
+
+@singleton
+class DocProcessorManager(ComponentManagerBase[DocProcessor]):
+    """A singleton manager class of the DocProcessor."""
+
+    def __init__(self):
+        super().__init__(ComponentEnum.DOC_PROCESSOR)
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/jieba_keyword_extractor.py b/agentuniverse/agent/action/knowledge/doc_processor/jieba_keyword_extractor.py
@@ -0,0 +1,74 @@
+# !/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+# @Time    : 2024/8/6 10:44
+# @Author  : fanen.lhy
+# @Email   : fanen.lhy@antgroup.com
+# @FileName: jieba_keyword_extractor.py
+from typing import List
+
+from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
+    DocProcessor
+from agentuniverse.agent.action.knowledge.store.document import Document
+from agentuniverse.agent.action.knowledge.store.query import Query
+from agentuniverse.base.config.component_configer.component_configer import \
+    ComponentConfiger
+
+import jieba
+import jieba.analyse
+
+# English stop words taken from the NLTK stop-word list. All entries are
+# lowercase; JiebaKeywordExtractor lowercases a token before looking it up.
+stop_words = {'or', "mustn't", 'how', 'their', 'again', 'few', 'other', 'who',
+              'being', 'theirs', 'during', 'if', 'on', 'she', 'wouldn', 'why',
+              'above', 'll', "weren't", 'your', 'are', 'an', 'over', 'his',
+              'hasn', 'off', 'you', 'he', 'was', "you'd", 'me', 'ain', 'any',
+              'what', 'most', 're', 'haven', "isn't", 'there', "it's", 'same',
+              'm', 'only', 'my', 'needn', 'too', 'into', 'in', 'by', 'between',
+              "that'll", "mightn't", "aren't", 'am', 'up', 'having', "you'll",
+              "you're", 'these', 'mustn', 'himself', 'down', 'such', 'wasn',
+              'ourselves', 'did', 'because', 'should', 'won', 'about', 'aren',
+              'don', 'while', 't', 'isn', 'have', 'whom', 'myself', 'itself',
+              'this', 'will', 'and', 'further', 'no', 'where', 'ma', 'yours',
+              'been', "didn't", 'that', 'had', 'when', 'we', 'herself', 'some',
+              'has', "she's", "needn't", "should've", 'of', "won't", 'both',
+              'which', "haven't", 'yourself', 'through', 'the', 'from',
+              "you've", 'for', 'then', 'hadn', 'a', 'them', 'as', 'after',
+              'themselves', "shouldn't", 'they', 'y', 'doesn', 'didn', 'here',
+              'ours', 'own', 'it', "hadn't", 'each', 'our', 'shouldn', 'all',
+              'out', 'before', 'couldn', 'd', "doesn't", 'hers', "hasn't",
+              'than', 'at', "don't", 'not', 'to', 'is', 'with', 'until',
+              'does', 'yourselves', 'under', 'below', 'i', 'those', "wouldn't",
+              'once', "couldn't", 'just', 's', 'shan', "wasn't", 'him', 'so',
+              'can', 'doing', 'o', 'her', 'were', 'now', 'very', 'weren',
+              'its', 'against', 'do', 've', 'be', 'mightn', 'but', "shan't",
+              'nor', 'more'}
+
+# Chinese stop words (plus '#') that JiebaKeywordExtractor drops before
+# keyword extraction. Tokens are matched against this set exactly as
+# produced by jieba, without lowercasing.
+chinese_stopwords = {'的', '了', '在', '是', '我', '有', '和', '就', '不',
+                     '人', '都', '一', '一个', '上', '也', '很', '到', '说',
+                     '要', '去', '你', '会', '着', '没有', '看', '好', '自己',
+                     '请问', '您', '他', '于', '及', '即', '为', '最', '从', '以',
+                     '了', '将', '与', '吗', '吧', '中', '#', '什么', '怎么', '哪个',
+                     '哪些', '啥', '相关'}
+
+
+class JiebaKeywordExtractor(DocProcessor):
+    top_k: int = 3
+
+    def _process_docs(self, origin_docs: List[Document], query: Query = None) \
+            -> List[Document]:
+        for _doc in origin_docs:
+            words = jieba.lcut(_doc.text)
+            filtered_words = [word for word in words if word not in
+                              chinese_stopwords and word.lower() not in stop_words]
+            keywords = jieba.analyse.extract_tags(" ".join(filtered_words),
+                                                  topK=self.top_k)
+            _doc.keywords.update(keywords)
+
+        return origin_docs
+
+    def _initialize_by_component_configer(self,
+                                         doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
+        super()._initialize_by_component_configer(doc_processor_configer)
+        if hasattr(doc_processor_configer, "top_k"):
+            self.top_k = doc_processor_configer.top_k
+        return self
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/jieba_keyword_extractor.yaml b/agentuniverse/agent/action/knowledge/doc_processor/jieba_keyword_extractor.yaml
@@ -0,0 +1,7 @@
+name: 'jieba_keyword_extractor'
+description: 'extract keywords from text'
+top_k: 3
+metadata:
+  type: 'DOC_PROCESSOR'
+  module: 'agentuniverse.agent.action.knowledge.doc_processor.jieba_keyword_extractor'
+  class: 'JiebaKeywordExtractor'
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/recursive_character_text_splitter.py b/agentuniverse/agent/action/knowledge/doc_processor/recursive_character_text_splitter.py
@@ -0,0 +1,58 @@
+# !/usr/bin/env python3
+# -*- coding:utf-8 -*-
+from typing import List
+
+# @Time    : 2024/7/31 16:19
+# @Author  : fanen.lhy
+# @Email   : fanen.lhy@antgroup.com
+# @FileName: recursive_character_text_splitter.py
+from typing import List, Optional
+from langchain.text_splitter import RecursiveCharacterTextSplitter as Splitter
+
+from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
+    DocProcessor
+from agentuniverse.agent.action.knowledge.store.document import Document
+from agentuniverse.agent.action.knowledge.store.query import Query
+from agentuniverse.base.config.component_configer.component_configer import \
+    ComponentConfiger
+
+
+class RecursiveCharacterTextSplitter(DocProcessor):
+    chunk_size: int = 200
+    chunk_overlap: int = 20
+    separators: List[str] = ["\n\n", "\n", " ", ""]
+    splitter: Optional[Splitter] = None
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.splitter = Splitter(separators=self.separators,
+                                 chunk_size=self.chunk_size,
+                                 chunk_overlap=self.chunk_overlap)
+
+    def _process_docs(self, origin_docs: List[Document], query: Query = None) -> \
+            List[Document]:
+        lc_doc_list = self.splitter.split_documents(Document.as_langchain_list(
+            origin_docs
+        ))
+        return Document.from_langchain_list(lc_doc_list)
+
+    def _initialize_by_component_configer(self,
+                                         doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
+        super()._initialize_by_component_configer(doc_processor_configer)
+        if hasattr(doc_processor_configer, "chunk_size"):
+            self.chunk_size = doc_processor_configer.chunk_size
+        if hasattr(doc_processor_configer, "chunk_overlap"):
+            self.chunk_overlap = doc_processor_configer.chunk_overlap
+        if hasattr(doc_processor_configer, "separators"):
+            self.separators = doc_processor_configer.separators
+        self.splitter = Splitter(separators=self.separators,
+                                 chunk_size=self.chunk_size,
+                                 chunk_overlap=self.chunk_overlap)
+        return self
+
+
+
+
+
+
+
diff --git a/agentuniverse/agent/action/knowledge/doc_processor/recursive_character_text_splitter.yaml b/agentuniverse/agent/action/knowledge/doc_processor/recursive_character_text_splitter.yaml
@@ -0,0 +1,11 @@
+name: 'recursive_character_text_splitter'
+description: 'langchain recursive character text splitter'
+chunk_size: 200
+chunk_overlap: 20
+separators:
+  - "\n\n"
+  - "\n"
+metadata:
+  type: 'DOC_PROCESSOR'
+  module: 'agentuniverse.agent.action.knowledge.doc_processor.recursive_character_text_splitter'
+  class: 'RecursiveCharacterTextSplitter'