Merge pull request #145 from alipay/dev_fanen
ADD: New Knowledge component
LandJerry authored Aug 27, 2024
2 parents 55837b5 + d3b8120 commit cf2223d
Showing 135 changed files with 3,672 additions and 9,944 deletions.
agentuniverse/agent/action/knowledge/doc_processor/__init__.py
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

# @Time : 2024/7/23 13:59
# @Author : fanen.lhy
# @Email : fanen.lhy@antgroup.com
# @FileName: __init__.py
agentuniverse/agent/action/knowledge/doc_processor/character_text_splitter.py
@@ -0,0 +1,50 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

# @Time : 2024/8/5 14:37
# @Author : fanen.lhy
# @Email : fanen.lhy@antgroup.com
# @FileName: text_splitter.py
from typing import List, Optional
from langchain.text_splitter import CharacterTextSplitter as Splitter

from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
    DocProcessor
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.store.query import Query
from agentuniverse.base.config.component_configer.component_configer import \
    ComponentConfiger


class CharacterTextSplitter(DocProcessor):
    """Split documents into chunks with langchain's CharacterTextSplitter."""

    chunk_size: int = 200
    chunk_overlap: int = 20
    separator: str = "\n\n"
    splitter: Optional[Splitter] = None

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.splitter = Splitter(separator=self.separator,
                                 chunk_size=self.chunk_size,
                                 chunk_overlap=self.chunk_overlap)

    def _process_docs(self, origin_docs: List[Document],
                      query: Query = None) -> List[Document]:
        """Split the input documents; the return value is also a document list."""
        lc_doc_list = self.splitter.split_documents(
            Document.as_langchain_list(origin_docs))
        return Document.from_langchain_list(lc_doc_list)

    def _initialize_by_component_configer(
            self,
            doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Read chunking options from the configer, then rebuild the splitter."""
        super()._initialize_by_component_configer(doc_processor_configer)
        if hasattr(doc_processor_configer, "chunk_size"):
            self.chunk_size = doc_processor_configer.chunk_size
        if hasattr(doc_processor_configer, "chunk_overlap"):
            self.chunk_overlap = doc_processor_configer.chunk_overlap
        if hasattr(doc_processor_configer, "separator"):
            self.separator = doc_processor_configer.separator
        self.splitter = Splitter(separator=self.separator,
                                 chunk_size=self.chunk_size,
                                 chunk_overlap=self.chunk_overlap)
        return self
character_text_splitter.yaml
@@ -0,0 +1,9 @@
name: 'character_text_splitter'
description: 'langchain character text splitter'
chunk_size: 200
chunk_overlap: 20
separator: "\n\n"
metadata:
  type: 'DOC_PROCESSOR'
  module: 'agentuniverse.agent.action.knowledge.doc_processor.character_text_splitter'
  class: 'CharacterTextSplitter'
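
For reference, a minimal usage sketch of the splitter above (assuming Document(text=...) is a valid constructor, as the langchain conversion helpers above imply; the sample text is hypothetical):

# Hedged usage sketch, not part of this commit.
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.doc_processor.character_text_splitter import \
    CharacterTextSplitter

splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20)
docs = [Document(text="First paragraph.\n\nSecond paragraph.\n\nThird paragraph.")]
chunks = splitter.process_docs(docs)  # returns a new List[Document]
print([chunk.text for chunk in chunks])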
agentuniverse/agent/action/knowledge/doc_processor/dashscope_reranker.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

# @Time : 2024/8/5 15:48
# @Author : fanen.lhy
# @Email : fanen.lhy@antgroup.com
# @FileName: dashscope_reranker.py

from typing import List
from http import HTTPStatus

import dashscope

from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
    DocProcessor
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.store.query import Query
from agentuniverse.base.config.component_configer.component_configer import \
    ComponentConfiger

MODEL_NAME_MAP = {
    "gte_rerank": dashscope.TextReRank.Models.gte_rerank
}


class DashscopeReranker(DocProcessor):
    """Rerank documents by relevance with the DashScope text rerank API."""

    model_name: str = "gte_rerank"
    top_n: int = 10

    def _process_docs(self, origin_docs: List[Document],
                      query: Query = None) -> List[Document]:
        """Call the rerank API and return the top_n documents, each annotated
        with its relevance score in metadata."""
        if not query or not query.query_str:
            raise Exception("Dashscope reranker needs an original string query.")
        if len(origin_docs) < 1:
            return origin_docs
        documents_texts = []
        for _doc in origin_docs:
            documents_texts.append(_doc.text)
        resp = dashscope.TextReRank.call(
            model=MODEL_NAME_MAP.get(self.model_name),
            query=query.query_str,
            documents=documents_texts,
            top_n=self.top_n,
            return_documents=False
        )
        if resp.status_code == HTTPStatus.OK:
            results = resp.output.results
        else:
            raise Exception(f"Dashscope rerank api call error: {resp}")
        rerank_docs = []
        for _result in results:
            index = _result.index
            if origin_docs[index].metadata:
                origin_docs[index].metadata["relevance_score"] = \
                    _result.relevance_score
            else:
                origin_docs[index].metadata = {
                    "relevance_score": _result.relevance_score}
            rerank_docs.append(origin_docs[index])

        return rerank_docs

    def _initialize_by_component_configer(
            self,
            doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Read model_name and top_n overrides from the configer."""
        super()._initialize_by_component_configer(doc_processor_configer)
        if hasattr(doc_processor_configer, "model_name"):
            self.model_name = doc_processor_configer.model_name
        if hasattr(doc_processor_configer, "top_n"):
            self.top_n = doc_processor_configer.top_n
        return self
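
A hedged usage sketch for the reranker (assumes Query(query_str=...) is a valid constructor and that the dashscope SDK reads DASHSCOPE_API_KEY from the environment; the sample documents are hypothetical):

# Hedged usage sketch, not part of this commit.
from agentuniverse.agent.action.knowledge.doc_processor.dashscope_reranker import \
    DashscopeReranker
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.store.query import Query

reranker = DashscopeReranker(top_n=2)
docs = [Document(text="Cats are popular pets."),
        Document(text="The weather is cloudy today."),
        Document(text="Dogs need daily walks.")]
ranked = reranker.process_docs(docs, Query(query_str="household pets"))
for doc in ranked:
    print(doc.metadata["relevance_score"], doc.text)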
dashscope_reranker.yaml
@@ -0,0 +1,6 @@
name: 'dashscope_reranker'
description: 'reranker using the dashscope api'
metadata:
  type: 'DOC_PROCESSOR'
  module: 'agentuniverse.agent.action.knowledge.doc_processor.dashscope_reranker'
  class: 'DashscopeReranker'
agentuniverse/agent/action/knowledge/doc_processor/doc_processor.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

# @Time : 2024/7/23 14:00
# @Author : fanen.lhy
# @Email : fanen.lhy@antgroup.com
# @FileName: doc_processor.py

from abc import abstractmethod
from typing import List, Optional

from agentuniverse.agent.action.knowledge.store.query import Query
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.base.component.component_base import ComponentEnum
from agentuniverse.base.component.component_base import ComponentBase
from agentuniverse.base.config.component_configer.component_configer import \
    ComponentConfiger


class DocProcessor(ComponentBase):
    """The basic class for doc processors."""

    component_type: ComponentEnum = ComponentEnum.DOC_PROCESSOR
    name: Optional[str] = None
    description: Optional[str] = None

    class Config:
        arbitrary_types_allowed = True

    def process_docs(self, origin_docs: List[Document],
                     query: Query = None) -> List[Document]:
        """Process the input documents; the return value is also a document list."""
        return self._process_docs(origin_docs, query)

    @abstractmethod
    def _process_docs(self, origin_docs: List[Document],
                      query: Query = None) -> List[Document]:
        """Process the input documents; the return value is also a document list."""
        pass

    def _initialize_by_component_configer(
            self,
            doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Initialize the DocProcessor by the ComponentConfiger object.

        Args:
            doc_processor_configer (ComponentConfiger): A configer containing
                the DocProcessor's basic info.
        Returns:
            DocProcessor: A DocProcessor instance.
        """
        self.name = doc_processor_configer.name
        self.description = doc_processor_configer.description
        return self
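
The extension contract above is small: a subclass only has to implement _process_docs. A hedged sketch of a hypothetical subclass (UpperCaseProcessor is illustrative, not part of this commit):

from typing import List

from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
    DocProcessor
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.store.query import Query


class UpperCaseProcessor(DocProcessor):
    """Hypothetical processor: upper-cases every document's text."""

    def _process_docs(self, origin_docs: List[Document],
                      query: Query = None) -> List[Document]:
        # Transform each document in place and return the same list.
        for doc in origin_docs:
            doc.text = doc.text.upper()
        return origin_docs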
agentuniverse/agent/action/knowledge/doc_processor/doc_processor_manager.py
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

# @Time : 2024/7/23 14:12
# @Author : fanen.lhy
# @Email : fanen.lhy@antgroup.com
# @FileName: doc_processor_manager.py

from agentuniverse.base.annotation.singleton import singleton
from agentuniverse.base.component.component_enum import ComponentEnum
from agentuniverse.base.component.component_manager_base import \
    ComponentManagerBase
from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
    DocProcessor


@singleton
class DocProcessorManager(ComponentManagerBase[DocProcessor]):
    """A singleton manager class of the DocProcessor."""

    def __init__(self):
        super().__init__(ComponentEnum.DOC_PROCESSOR)
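
A hedged lookup sketch (assumes ComponentManagerBase exposes get_instance_obj(), as other agentuniverse managers do, and that the YAML configs in this commit sit on the application's component scan path):

from agentuniverse.agent.action.knowledge.doc_processor.doc_processor_manager import \
    DocProcessorManager

# The name matches the 'name' field of a registered YAML config.
processor = DocProcessorManager().get_instance_obj("character_text_splitter")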
agentuniverse/agent/action/knowledge/doc_processor/jieba_keyword_extractor.py
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

# @Time : 2024/8/6 10:44
# @Author : fanen.lhy
# @Email : fanen.lhy@antgroup.com
# @FileName: jieba_keyword_extractor.py
from typing import List

from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
    DocProcessor
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.store.query import Query
from agentuniverse.base.config.component_configer.component_configer import \
    ComponentConfiger

import jieba
import jieba.analyse

# nltk english stopwords
stop_words = {'or', "mustn't", 'how', 'their', 'again', 'few', 'other', 'who',
              'being', 'theirs', 'during', 'if', 'on', 'she', 'wouldn', 'why',
              'above', 'll', "weren't", 'your', 'are', 'an', 'over', 'his',
              'hasn', 'off', 'you', 'he', 'was', "you'd", 'me', 'ain', 'any',
              'what', 'most', 're', 'haven', "isn't", 'there', "it's", 'same',
              'm', 'only', 'my', 'needn', 'too', 'into', 'in', 'by', 'between',
              "that'll", "mightn't", "aren't", 'am', 'up', 'having', "you'll",
              "you're", 'these', 'mustn', 'himself', 'down', 'such', 'wasn',
              'ourselves', 'did', 'because', 'should', 'won', 'about', 'aren',
              'don', 'while', 't', 'isn', 'have', 'whom', 'myself', 'itself',
              'this', 'will', 'and', 'further', 'no', 'where', 'ma', 'yours',
              'been', "didn't", 'that', 'had', 'when', 'we', 'herself', 'some',
              'has', "she's", "needn't", "should've", 'of', "won't", 'both',
              'which', "haven't", 'yourself', 'through', 'the', 'from',
              "you've", 'for', 'then', 'hadn', 'a', 'them', 'as', 'after',
              'themselves', "shouldn't", 'they', 'y', 'doesn', 'didn', 'here',
              'ours', 'own', 'it', "hadn't", 'each', 'our', 'shouldn', 'all',
              'out', 'before', 'couldn', 'd', "doesn't", 'hers', "hasn't",
              'than', 'at', "don't", 'not', 'to', 'is', 'with', 'until',
              'does', 'yourselves', 'under', 'below', 'i', 'those', "wouldn't",
              'once', "couldn't", 'just', 's', 'shan', "wasn't", 'him', 'so',
              'can', 'doing', 'o', 'her', 'were', 'now', 'very', 'weren',
              'its', 'against', 'do', 've', 'be', 'mightn', 'but', "shan't",
              'nor', 'more'}

chinese_stopwords = {'的', '了', '在', '是', '我', '有', '和', '就', '不',
                     '人', '都', '一', '一个', '上', '也', '很', '到', '说',
                     '要', '去', '你', '会', '着', '没有', '看', '好', '自己',
                     '请问', '您', '他', '于', '及', '即', '为', '最', '从', '以',
                     '了', '将', '与', '吗', '吧', '中', '#', '什么', '怎么', '哪个',
                     '哪些', '啥', '相关'}


class JiebaKeywordExtractor(DocProcessor):
    """Extract keywords from documents with jieba's TF-IDF extractor."""

    top_k: int = 3

    def _process_docs(self, origin_docs: List[Document],
                      query: Query = None) -> List[Document]:
        """Tokenize each document, drop stopwords, then attach the top_k
        TF-IDF keywords to the document's keyword set."""
        for _doc in origin_docs:
            words = jieba.lcut(_doc.text)
            filtered_words = [word for word in words
                              if word not in chinese_stopwords
                              and word.lower() not in stop_words]
            keywords = jieba.analyse.extract_tags(" ".join(filtered_words),
                                                  topK=self.top_k)
            _doc.keywords.update(keywords)

        return origin_docs

    def _initialize_by_component_configer(
            self,
            doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Read the top_k override from the configer."""
        super()._initialize_by_component_configer(doc_processor_configer)
        if hasattr(doc_processor_configer, "top_k"):
            self.top_k = doc_processor_configer.top_k
        return self
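
A hedged usage sketch for the extractor (assumes Document.keywords is a set, as keywords.update(...) above implies; the sample text is hypothetical):

# Hedged usage sketch, not part of this commit.
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.doc_processor.jieba_keyword_extractor import \
    JiebaKeywordExtractor

extractor = JiebaKeywordExtractor(top_k=3)
docs = extractor.process_docs(
    [Document(text="agentUniverse 是一个多智能体应用框架")])
print(docs[0].keywords)  # e.g. a set of up to 3 TF-IDF keywords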
jieba_keyword_extractor.yaml
@@ -0,0 +1,7 @@
name: 'jieba_keyword_extractor'
description: 'extract keywords from text'
top_k: 3
metadata:
  type: 'DOC_PROCESSOR'
  module: 'agentuniverse.agent.action.knowledge.doc_processor.jieba_keyword_extractor'
  class: 'JiebaKeywordExtractor'
agentuniverse/agent/action/knowledge/doc_processor/recursive_character_text_splitter.py
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

# @Time : 2024/7/31 16:19
# @Author : fanen.lhy
# @Email : fanen.lhy@antgroup.com
# @FileName: recursive_character_text_splitter.py
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter as Splitter

from agentuniverse.agent.action.knowledge.doc_processor.doc_processor import \
    DocProcessor
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.store.query import Query
from agentuniverse.base.config.component_configer.component_configer import \
    ComponentConfiger


class RecursiveCharacterTextSplitter(DocProcessor):
    """Split documents recursively on an ordered list of separators."""

    chunk_size: int = 200
    chunk_overlap: int = 20
    separators: List[str] = ["\n\n", "\n", " ", ""]
    splitter: Optional[Splitter] = None

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.splitter = Splitter(separators=self.separators,
                                 chunk_size=self.chunk_size,
                                 chunk_overlap=self.chunk_overlap)

    def _process_docs(self, origin_docs: List[Document],
                      query: Query = None) -> List[Document]:
        """Split the input documents; the return value is also a document list."""
        lc_doc_list = self.splitter.split_documents(
            Document.as_langchain_list(origin_docs))
        return Document.from_langchain_list(lc_doc_list)

    def _initialize_by_component_configer(
            self,
            doc_processor_configer: ComponentConfiger) -> 'DocProcessor':
        """Read chunking options from the configer, then rebuild the splitter."""
        super()._initialize_by_component_configer(doc_processor_configer)
        if hasattr(doc_processor_configer, "chunk_size"):
            self.chunk_size = doc_processor_configer.chunk_size
        if hasattr(doc_processor_configer, "chunk_overlap"):
            self.chunk_overlap = doc_processor_configer.chunk_overlap
        if hasattr(doc_processor_configer, "separators"):
            self.separators = doc_processor_configer.separators
        self.splitter = Splitter(separators=self.separators,
                                 chunk_size=self.chunk_size,
                                 chunk_overlap=self.chunk_overlap)
        return self
recursive_character_text_splitter.yaml
@@ -0,0 +1,11 @@
name: 'recursive_character_text_splitter'
description: 'langchain recursive character text splitter'
chunk_size: 200
chunk_overlap: 20
separators:
  - "\n\n"
  - "\n"
metadata:
  type: 'DOC_PROCESSOR'
  module: 'agentuniverse.agent.action.knowledge.doc_processor.recursive_character_text_splitter'
  class: 'RecursiveCharacterTextSplitter'
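
Taken together, these components compose into a simple post-retrieval pipeline. A hedged sketch chaining the processors by their configured names (assumes the YAMLs above are registered and get_instance_obj() is available on the manager; the sample document and query are hypothetical):

# Hedged pipeline sketch, not part of this commit.
from agentuniverse.agent.action.knowledge.doc_processor.doc_processor_manager import \
    DocProcessorManager
from agentuniverse.agent.action.knowledge.store.document import Document
from agentuniverse.agent.action.knowledge.store.query import Query

docs = [Document(text="agentUniverse is a multi-agent framework.\n\n"
                      "Its knowledge component supports RAG pipelines.")]
query = Query(query_str="what is agentUniverse?")
# Split first, then rerank the resulting chunks against the query.
for name in ("recursive_character_text_splitter", "dashscope_reranker"):
    processor = DocProcessorManager().get_instance_obj(name)
    docs = processor.process_docs(docs, query)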