-
Notifications
You must be signed in to change notification settings - Fork 122
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #145 from alipay/dev_fanen
ADD: New Knowledge component
- Loading branch information
Showing
135 changed files
with
3,672 additions
and
9,944 deletions.
There are no files selected for viewing
7 changes: 7 additions & 0 deletions
7
agentuniverse/agent/action/knowledge/doc_processor/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# !/usr/bin/env python3 | ||
# -*- coding:utf-8 -*- | ||
|
||
# @Time : 2024/7/23 13:59 | ||
# @Author : fanen.lhy | ||
# @Email : fanen.lhy@antgroup.com | ||
# @FileName: __init__.py |
50 changes: 50 additions & 0 deletions
50
agentuniverse/agent/action/knowledge/doc_processor/character_text_splitter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# !/usr/bin/env python3 | ||
# -*- coding:utf-8 -*- | ||
|
||
# @Time : 2024/8/5 14:37 | ||
# @Author : fanen.lhy | ||
# @Email : fanen.lhy@antgroup.com | ||
# @FileName: character_text_splitter.py | ||
from typing import List, Optional | ||
from langchain.text_splitter import CharacterTextSplitter as Splitter | ||
|
||
from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \ | ||
DocProcessor | ||
from agentuniverse.agent.action.knowledge.store.document import Document | ||
from agentuniverse.agent.action.knowledge.store.query import Query | ||
from agentuniverse.base.config.component_configer.component_configer import \ | ||
ComponentConfiger | ||
|
||
|
||
class CharacterTextSplitter(DocProcessor):
    """Doc processor that splits documents with langchain's CharacterTextSplitter.

    Attributes:
        chunk_size: Forwarded to the langchain splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the langchain splitter as ``chunk_overlap``.
        separator: Forwarded to the langchain splitter as ``separator``.
        splitter: The langchain splitter built from the three values above.
    """
    chunk_size: int = 200
    chunk_overlap: int = 20
    # A blank line (two newlines). The previous default "/n/n" used forward
    # slashes and therefore never matched a paragraph break.
    separator: str = "\n\n"
    splitter: Optional[Splitter] = None

    def __init__(self, **kwargs):
        """Create the processor and build the splitter from the field values."""
        super().__init__(**kwargs)
        self.splitter = Splitter(separator=self.separator,
                                 chunk_size=self.chunk_size,
                                 chunk_overlap=self.chunk_overlap)

    def _process_docs(self, origin_docs: List[Document], query: Query = None) -> \
            List[Document]:
        """Split every input document into chunks.

        Args:
            origin_docs: Documents to split.
            query: Not used by this processor.

        Returns:
            The documents produced by the langchain splitter, converted back
            with ``Document.from_langchain_list``.
        """
        lc_doc_list = self.splitter.split_documents(
            Document.as_langchain_list(origin_docs)
        )
        return Document.from_langchain_list(lc_doc_list)

    def _initialize_by_component_configer(self,
                                          doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Apply configer values and rebuild the splitter.

        Args:
            doc_processor_configer: Configer that may carry ``chunk_size``,
                ``chunk_overlap`` and ``separator`` attributes; absent ones
                leave the current value untouched.

        Returns:
            This instance.
        """
        super()._initialize_by_component_configer(doc_processor_configer)
        if hasattr(doc_processor_configer, "chunk_size"):
            self.chunk_size = doc_processor_configer.chunk_size
        if hasattr(doc_processor_configer, "chunk_overlap"):
            self.chunk_overlap = doc_processor_configer.chunk_overlap
        if hasattr(doc_processor_configer, "separator"):
            self.separator = doc_processor_configer.separator
        # The splitter captured the old settings in __init__, so it has to be
        # rebuilt once the configured values are in place.
        self.splitter = Splitter(separator=self.separator,
                                 chunk_size=self.chunk_size,
                                 chunk_overlap=self.chunk_overlap)
        return self
9 changes: 9 additions & 0 deletions
9
agentuniverse/agent/action/knowledge/doc_processor/character_text_splitter.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
name: 'character_text_splitter'
description: 'langchain character text splitter'
chunk_size: 200
chunk_overlap: 20
# The key is `separator` (singular): that is the attribute CharacterTextSplitter
# reads. The value is a blank line (two newlines), not the literal text "/n/n".
separator: "\n\n"
metadata:
  type: 'DOC_PROCESSOR'
  module: 'agentuniverse.agent.action.knowledge.doc_processor.character_text_splitter'
  class: 'CharacterTextSplitter'
67 changes: 67 additions & 0 deletions
67
agentuniverse/agent/action/knowledge/doc_processor/dashscope_reranker.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# !/usr/bin/env python3 | ||
# -*- coding:utf-8 -*- | ||
|
||
# @Time : 2024/8/5 15:48 | ||
# @Author : fanen.lhy | ||
# @Email : fanen.lhy@antgroup.com | ||
# @FileName: dashscope_reranker.py | ||
|
||
from typing import List, Optional | ||
import dashscope | ||
from http import HTTPStatus | ||
|
||
from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \ | ||
DocProcessor | ||
from agentuniverse.agent.action.knowledge.store.document import Document | ||
from agentuniverse.agent.action.knowledge.store.query import Query | ||
from agentuniverse.base.config.component_configer.component_configer import \ | ||
ComponentConfiger | ||
|
||
# Maps the `model_name` configured on DashscopeReranker to the dashscope rerank model constant. | ||
MODEL_NAME_MAP = { | ||
"gte_rerank": dashscope.TextReRank.Models.gte_rerank | ||
} | ||
|
||
|
||
class DashscopeReranker(DocProcessor):
    """Doc processor that reranks documents against a query via dashscope.

    Attributes:
        model_name: Key looked up in ``MODEL_NAME_MAP``; a name that is not in
            the map is passed to the API unchanged.
        top_n: Forwarded to the rerank API as ``top_n``.
    """
    model_name: str = "gte_rerank"
    top_n: int = 10

    def _process_docs(self, origin_docs: List[Document], query: Query = None) -> \
            List[Document]:
        """Rerank ``origin_docs`` with the dashscope TextReRank API.

        Args:
            origin_docs: Candidate documents; their ``text`` is sent to the API.
            query: Query whose ``query_str`` the documents are ranked against.

        Returns:
            The documents referenced by the API results, in result order, each
            with ``metadata["relevance_score"]`` set. An empty input is
            returned as is.

        Raises:
            Exception: If the query is missing or empty, or if the API response
                status is not OK.
        """
        if not query or not query.query_str:
            raise Exception("Dashscope reranker need an origin string query.")
        if not origin_docs:
            return origin_docs

        resp = dashscope.TextReRank.call(
            # Fall back to the configured string so an unmapped model name is
            # sent as written instead of silently becoming None.
            model=MODEL_NAME_MAP.get(self.model_name, self.model_name),
            query=query.query_str,
            documents=[_doc.text for _doc in origin_docs],
            top_n=self.top_n,
            return_documents=False
        )
        if resp.status_code != HTTPStatus.OK:
            raise Exception(f"Dashscope rerank api call error: {resp}")

        rerank_docs = []
        for _result in resp.output.results:
            # return_documents=False: results carry only the index into the
            # submitted list, so map it back to the original Document.
            _doc = origin_docs[_result.index]
            if _doc.metadata:
                _doc.metadata["relevance_score"] = _result.relevance_score
            else:
                _doc.metadata = {"relevance_score": _result.relevance_score}
            rerank_docs.append(_doc)

        return rerank_docs

    def _initialize_by_component_configer(self,
                                          doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Apply configer values.

        Args:
            doc_processor_configer: Configer that may carry ``model_name`` and
                ``top_n`` attributes; absent ones leave the current value
                untouched.

        Returns:
            This instance.
        """
        super()._initialize_by_component_configer(doc_processor_configer)
        if hasattr(doc_processor_configer, "model_name"):
            self.model_name = doc_processor_configer.model_name
        if hasattr(doc_processor_configer, "top_n"):
            self.top_n = doc_processor_configer.top_n
        return self
6 changes: 6 additions & 0 deletions
6
agentuniverse/agent/action/knowledge/doc_processor/dashscope_reranker.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
name: 'dashscope_reranker' | ||
description: 'reranker use dashscope api' | ||
metadata: | ||
type: 'DOC_PROCESSOR' | ||
module: 'agentuniverse.agent.action.knowledge.doc_processor.dashscope_reranker' | ||
class: 'DashscopeReranker' |
56 changes: 56 additions & 0 deletions
56
agentuniverse/agent/action/knowledge/doc_processor/doc_processor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# !/usr/bin/env python3 | ||
# -*- coding:utf-8 -*- | ||
|
||
# @Time : 2024/7/23 14:00 | ||
# @Author : fanen.lhy | ||
# @Email : fanen.lhy@antgroup.com | ||
# @FileName: doc_processor.py | ||
|
||
from abc import abstractmethod | ||
from typing import List, Optional | ||
|
||
from agentuniverse.agent.action.knowledge.store.query import Query | ||
from agentuniverse.agent.action.knowledge.store.document import Document | ||
from agentuniverse.base.component.component_base import ComponentEnum | ||
from agentuniverse.base.component.component_base import ComponentBase | ||
from agentuniverse.base.config.component_configer.component_configer import \ | ||
ComponentConfiger | ||
|
||
|
||
class DocProcessor(ComponentBase):
    """Base component for document processors.

    Callers use ``process_docs``; concrete processors supply the work in
    ``_process_docs``.
    """

    component_type: ComponentEnum = ComponentEnum.DOC_PROCESSOR
    name: Optional[str] = None
    description: Optional[str] = None

    class Config:
        arbitrary_types_allowed = True

    def process_docs(self, origin_docs: List[Document], query: Query = None) -> \
            List[Document]:
        """Run this processor over ``origin_docs`` and return a document list.

        Args:
            origin_docs: Documents to process.
            query: Optional query handed through to the implementation.

        Returns:
            Whatever ``_process_docs`` returns for the same arguments.
        """
        processed = self._process_docs(origin_docs, query)
        return processed

    @abstractmethod
    def _process_docs(self, origin_docs: List[Document],
                      query: Query = None) -> List[Document]:
        """Process the input documents; the result should also be a document list."""

    def _initialize_by_component_configer(self,
                                          doc_processor_configer: ComponentConfiger) \
            -> 'DocProcessor':
        """Copy the basic info from a ComponentConfiger onto this processor.

        Args:
            doc_processor_configer(ComponentConfiger): Configer holding the
                processor's ``name`` and ``description``.
        Returns:
            DocProcessor: This instance.
        """
        configer = doc_processor_configer
        self.name = configer.name
        self.description = configer.description
        return self
20 changes: 20 additions & 0 deletions
20
agentuniverse/agent/action/knowledge/doc_processor/doc_processor_manager.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# !/usr/bin/env python3 | ||
# -*- coding:utf-8 -*- | ||
|
||
# @Time : 2024/7/23 14:12 | ||
# @Author : fanen.lhy | ||
# @Email : fanen.lhy@antgroup.com | ||
# @FileName: doc_processor_manager.py | ||
|
||
from agentuniverse.base.annotation.singleton import singleton | ||
from agentuniverse.base.component.component_enum import ComponentEnum | ||
from agentuniverse.base.component.component_manager_base import ComponentManagerBase | ||
from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import DocProcessor | ||
|
||
|
||
@singleton
class DocProcessorManager(ComponentManagerBase[DocProcessor]):
    """Singleton manager for DocProcessor components."""

    def __init__(self):
        """Set up the manager for the DOC_PROCESSOR component type."""
        managed_type = ComponentEnum.DOC_PROCESSOR
        super().__init__(managed_type)
74 changes: 74 additions & 0 deletions
74
agentuniverse/agent/action/knowledge/doc_processor/jieba_keyword_extractor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# !/usr/bin/env python3 | ||
# -*- coding:utf-8 -*- | ||
|
||
# @Time : 2024/8/6 10:44 | ||
# @Author : fanen.lhy | ||
# @Email : fanen.lhy@antgroup.com | ||
# @FileName: jieba_keyword_extractor.py | ||
from typing import List | ||
|
||
from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \ | ||
DocProcessor | ||
from agentuniverse.agent.action.knowledge.store.document import Document | ||
from agentuniverse.agent.action.knowledge.store.query import Query | ||
from agentuniverse.base.config.component_configer.component_configer import \ | ||
ComponentConfiger | ||
|
||
import jieba | ||
import jieba.analyse | ||
|
||
# English stop words (NLTK list); JiebaKeywordExtractor checks the lower-cased token against this set. | ||
stop_words = {'or', "mustn't", 'how', 'their', 'again', 'few', 'other', 'who', | ||
'being', 'theirs', 'during', 'if', 'on', 'she', 'wouldn', 'why', | ||
'above', 'll', "weren't", 'your', 'are', 'an', 'over', 'his', | ||
'hasn', 'off', 'you', 'he', 'was', "you'd", 'me', 'ain', 'any', | ||
'what', 'most', 're', 'haven', "isn't", 'there', "it's", 'same', | ||
'm', 'only', 'my', 'needn', 'too', 'into', 'in', 'by', 'between', | ||
"that'll", "mightn't", "aren't", 'am', 'up', 'having', "you'll", | ||
"you're", 'these', 'mustn', 'himself', 'down', 'such', 'wasn', | ||
'ourselves', 'did', 'because', 'should', 'won', 'about', 'aren', | ||
'don', 'while', 't', 'isn', 'have', 'whom', 'myself', 'itself', | ||
'this', 'will', 'and', 'further', 'no', 'where', 'ma', 'yours', | ||
'been', "didn't", 'that', 'had', 'when', 'we', 'herself', 'some', | ||
'has', "she's", "needn't", "should've", 'of', "won't", 'both', | ||
'which', "haven't", 'yourself', 'through', 'the', 'from', | ||
"you've", 'for', 'then', 'hadn', 'a', 'them', 'as', 'after', | ||
'themselves', "shouldn't", 'they', 'y', 'doesn', 'didn', 'here', | ||
'ours', 'own', 'it', "hadn't", 'each', 'our', 'shouldn', 'all', | ||
'out', 'before', 'couldn', 'd', "doesn't", 'hers', "hasn't", | ||
'than', 'at', "don't", 'not', 'to', 'is', 'with', 'until', | ||
'does', 'yourselves', 'under', 'below', 'i', 'those', "wouldn't", | ||
'once', "couldn't", 'just', 's', 'shan', "wasn't", 'him', 'so', | ||
'can', 'doing', 'o', 'her', 'were', 'now', 'very', 'weren', | ||
'its', 'against', 'do', 've', 'be', 'mightn', 'but', "shan't", | ||
'nor', 'more'} | ||
|
||
# Chinese stop words (plus '#'); JiebaKeywordExtractor drops tokens found in this set before keyword extraction. | ||
chinese_stopwords = {'的', '了', '在', '是', '我', '有', '和', '就', '不', | ||
'人', '都', '一', '一个', '上', '也', '很', '到', '说', | ||
'要', '去', '你', '会', '着', '没有', '看', '好', '自己', | ||
'请问', '您', '他', '于', '及', '即', '为', '最', '从', '以', | ||
'了', '将', '与', '吗', '吧', '中', '#', '什么', '怎么', '哪个', | ||
'哪些', '啥', '相关'} | ||
|
||
|
||
class JiebaKeywordExtractor(DocProcessor):
    """Doc processor that attaches jieba-extracted keywords to each document.

    Attributes:
        top_k: Forwarded to ``jieba.analyse.extract_tags`` as ``topK``.
    """
    top_k: int = 3

    def _process_docs(self, origin_docs: List[Document], query: Query = None) \
            -> List[Document]:
        """Extract keywords from every document's text, in place.

        Args:
            origin_docs: Documents to annotate; each one's ``keywords`` is
                updated with the extracted tags.
            query: Not used by this processor.

        Returns:
            The same ``origin_docs`` list that was passed in.
        """
        for document in origin_docs:
            kept_tokens = []
            for token in jieba.lcut(document.text):
                # Chinese stop words are matched as-is, English ones
                # case-insensitively.
                if token in chinese_stopwords:
                    continue
                if token.lower() in stop_words:
                    continue
                kept_tokens.append(token)
            joined_text = " ".join(kept_tokens)
            top_keywords = jieba.analyse.extract_tags(joined_text,
                                                      topK=self.top_k)
            document.keywords.update(top_keywords)

        return origin_docs

    def _initialize_by_component_configer(self,
                                          doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Apply configer values.

        Args:
            doc_processor_configer: Configer that may carry a ``top_k``
                attribute; when absent the current value is kept.

        Returns:
            This instance.
        """
        super()._initialize_by_component_configer(doc_processor_configer)
        configer = doc_processor_configer
        if hasattr(configer, "top_k"):
            self.top_k = configer.top_k
        return self
7 changes: 7 additions & 0 deletions
7
agentuniverse/agent/action/knowledge/doc_processor/jieba_keyword_extractor.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
name: 'jieba_keyword_extractor' | ||
description: 'extract keywords from text' | ||
top_k: 3 | ||
metadata: | ||
type: 'DOC_PROCESSOR' | ||
module: 'agentuniverse.agent.action.knowledge.doc_processor.jieba_keyword_extractor' | ||
class: 'JiebaKeywordExtractor' |
58 changes: 58 additions & 0 deletions
58
agentuniverse/agent/action/knowledge/doc_processor/recursive_character_text_splitter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# !/usr/bin/env python3 | ||
# -*- coding:utf-8 -*- | ||
from typing import List | ||
|
||
# @Time : 2024/7/31 16:19 | ||
# @Author : fanen.lhy | ||
# @Email : fanen.lhy@antgroup.com | ||
# @FileName: recursive_character_text_splitter.py | ||
from typing import List, Optional | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter as Splitter | ||
|
||
from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \ | ||
DocProcessor | ||
from agentuniverse.agent.action.knowledge.store.document import Document | ||
from agentuniverse.agent.action.knowledge.store.query import Query | ||
from agentuniverse.base.config.component_configer.component_configer import \ | ||
ComponentConfiger | ||
|
||
|
||
class RecursiveCharacterTextSplitter(DocProcessor):
    """Doc processor that splits documents with langchain's
    RecursiveCharacterTextSplitter.

    Attributes:
        chunk_size: Forwarded to the langchain splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the langchain splitter as ``chunk_overlap``.
        separators: Forwarded to the langchain splitter as ``separators``.
        splitter: The langchain splitter built from the three values above.
    """
    chunk_size: int = 200
    chunk_overlap: int = 20
    separators: List[str] = ["\n\n", "\n", " ", ""]
    splitter: Optional[Splitter] = None

    def __init__(self, **kwargs):
        """Create the processor and build the splitter from the field values."""
        super().__init__(**kwargs)
        self.splitter = Splitter(
            separators=self.separators,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

    def _process_docs(self, origin_docs: List[Document], query: Query = None) -> \
            List[Document]:
        """Split every input document into chunks.

        Args:
            origin_docs: Documents to split.
            query: Not used by this processor.

        Returns:
            The documents produced by the langchain splitter, converted back
            with ``Document.from_langchain_list``.
        """
        langchain_inputs = Document.as_langchain_list(origin_docs)
        langchain_chunks = self.splitter.split_documents(langchain_inputs)
        return Document.from_langchain_list(langchain_chunks)

    def _initialize_by_component_configer(self,
                                          doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Apply configer values and rebuild the splitter.

        Args:
            doc_processor_configer: Configer that may carry ``chunk_size``,
                ``chunk_overlap`` and ``separators`` attributes; absent ones
                leave the current value untouched.

        Returns:
            This instance.
        """
        super()._initialize_by_component_configer(doc_processor_configer)
        for field_name in ("chunk_size", "chunk_overlap", "separators"):
            if hasattr(doc_processor_configer, field_name):
                setattr(self, field_name,
                        getattr(doc_processor_configer, field_name))
        # Rebuild so the splitter reflects the values applied above.
        self.splitter = Splitter(
            separators=self.separators,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return self
|
||
|
||
|
||
|
||
|
||
|
||
|
11 changes: 11 additions & 0 deletions
11
agentuniverse/agent/action/knowledge/doc_processor/recursive_character_text_splitter.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
name: 'recursive_character_text_splitter' | ||
description: 'langchain recursive character text splitter' | ||
chunk_size: 200 | ||
chunk_overlap: 20 | ||
separators: | ||
- "\n\n" | ||
- "\n" | ||
metadata: | ||
type: 'DOC_PROCESSOR' | ||
module: 'agentuniverse.agent.action.knowledge.doc_processor.recursive_character_text_splitter' | ||
class: 'RecursiveCharacterTextSplitter' |
Oops, something went wrong.