From 14fc219668ff34cc6d0093c816c9debeae472be5 Mon Sep 17 00:00:00 2001 From: chenshuizhong Date: Wed, 15 May 2024 15:18:17 +0800 Subject: [PATCH 1/8] Updated conversation_api.md document/upload --- docs/conversation_api.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/conversation_api.md b/docs/conversation_api.md index 2dd258fd6f1..13acec8d9e6 100644 --- a/docs/conversation_api.md +++ b/docs/conversation_api.md @@ -315,10 +315,12 @@ This is usually used when upload a file to. ### Parameter: -| name | type | optional | description | -|---------|--------|----------|----------------------------------------| -| file | file | No | Upload file. | -| kb_name | string | No | Choose the upload knowledge base name. | +| name | type | optional | description | +|-----------|--------|----------|---------------------------------------------------------| +| file | file | No | Upload file. | +| kb_name | string | No | Choose the upload knowledge base name. | +| parser_id | string | Yes | Choose the parsing method. | +| run | string | Yes | Parsing will start automatically when the value is "1". | ### Response ```json From c1d3286fa4dfed79d94b159ee0fad6befcd7d0f8 Mon Sep 17 00:00:00 2001 From: guoyuhao2330 Date: Fri, 17 May 2024 14:10:04 +0800 Subject: [PATCH 2/8] list_chunks --- api/apps/api_app.py | 43 +++++++++++++++++++++++++++++ api/db/services/document_service.py | 13 +++++++++ 2 files changed, 56 insertions(+) diff --git a/api/apps/api_app.py b/api/apps/api_app.py index bc4fadf5e29..c2568a2ab22 100644 --- a/api/apps/api_app.py +++ b/api/apps/api_app.py @@ -39,6 +39,9 @@ from api.utils.file_utils import filename_type, thumbnail from rag.utils.minio_conn import MINIO +from rag.utils.es_conn import ELASTICSEARCH +from rag.nlp import search +from elasticsearch_dsl import Q def generate_confirmation_token(tenent_id): serializer = URLSafeTimedSerializer(tenent_id) @@ -347,3 +350,43 @@ def upload(): return server_error_response(e) return get_json_result(data=doc_result.to_json()) + + +@manager.route('/list_chunks', methods=['POST']) +# @login_required +def list_chunks(): + token = request.headers.get('Authorization').split()[1] + objs = APIToken.query(token=token) + if not objs: + return get_json_result( + data=False, retmsg='Token is not valid!"', retcode=RetCode.AUTHENTICATION_ERROR) + + form_data = request.form + + try: + if "filename" in form_data.keys(): + tenant_id = DocumentService.get_tenant_id_by_name(form_data['filename']) + q = Q("match", docnm_kwd=form_data['filename']) + + elif "doc_id" in form_data.keys(): + tenant_id = DocumentService.get_tenant_id(form_data['doc_id']) + q = Q("match", doc_id=form_data['doc_id']) + else: + return get_json_result( + data=False,retmsg="Can't find filename or doc_id" + ) + + res_es_search = ELASTICSEARCH.search(q,idxnm=search.index_name(tenant_id),timeout="600s") + + res = [{} for _ in range(len(res_es_search['hits']['hits']))] + + for index , chunk in enumerate(res_es_search['hits']['hits']): + res[index]['doc_name'] = chunk['_source']['docnm_kwd'] + res[index]['content'] = chunk['_source']['content_with_weight'] + if 'img_id' in chunk['_source'].keys(): + res[index]['img_id'] = chunk['_source']['img_id'] + + except Exception as e: + return server_error_response(e) + + return get_json_result(data=res) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index d85c15070fc..23e6b0597bc 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -180,6 +180,19 @@ def get_tenant_id(cls, doc_id): if not docs: return return docs[0]["tenant_id"] + + @classmethod + @DB.connection_context() + def get_tenant_id_by_name(cls, name): + docs = cls.model.select( + Knowledgebase.tenant_id).join( + Knowledgebase, on=( + Knowledgebase.id == cls.model.kb_id)).where( + cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value) + docs = docs.dicts() + if not docs: + return + return docs[0]["tenant_id"] @classmethod @DB.connection_context() From 53a7b1c414cbccb110c9d791462a59030242c49e Mon Sep 17 00:00:00 2001 From: guoyuhao2330 Date: Fri, 17 May 2024 14:42:39 +0800 Subject: [PATCH 3/8] list_chunks --- api/db/services/document_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 23e6b0597bc..0e344dce02e 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -180,7 +180,7 @@ def get_tenant_id(cls, doc_id): if not docs: return return docs[0]["tenant_id"] - + @classmethod @DB.connection_context() def get_tenant_id_by_name(cls, name): From 2d75c91742ac6959e26ade5ce0cf7f382bb6565c Mon Sep 17 00:00:00 2001 From: guoyuhao2330 Date: Fri, 17 May 2024 14:45:12 +0800 Subject: [PATCH 4/8] list_chunks --- api/db/services/document_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 0e344dce02e..02f3197de96 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -181,7 +181,7 @@ def get_tenant_id(cls, doc_id): return return docs[0]["tenant_id"] - @classmethod + @classmethod @DB.connection_context() def get_tenant_id_by_name(cls, name): docs = cls.model.select( From ec592e1544c29e4e03d39d9126fcbb67972ca598 Mon Sep 17 00:00:00 2001 From: guoyuhao2330 Date: Fri, 17 May 2024 15:00:04 +0800 Subject: [PATCH 5/8] list_chunks --- docs/conversation_api.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/conversation_api.md b/docs/conversation_api.md index 13acec8d9e6..6e9e83f9773 100644 --- a/docs/conversation_api.md +++ b/docs/conversation_api.md @@ -364,3 +364,38 @@ This is usually used when upload a file to. } ``` + +## Get document chunks + +Get the chunks of the document based on filename or doc_id. +### Path: /api/list_chunks/ +### Method: POST + +### Parameter: + +| name | type | optional | description | +|----------|--------|----------|---------------------------------------------------------| +| filename | string | Yes | | +| doc_id | string | Yes | | + + +### Response +```json +{ + "data": [ + { + "content": "Figure 14: Per-request neural-net processingof RL-Cache.\n103\n(sn)\nCPU\n 102\nGPU\n8101\n100\n8\n16 64 256 1K\n4K", + "doc_name": "RL-Cache.pdf", + "img_id": "0335167613f011ef91240242ac120006-b46c3524952f82dbe061ce9b123f2211" + }, + { + "content": "4.3 ProcessingOverheadof RL-CacheACKNOWLEDGMENTSThis section evaluates how e￿ectively our RL-Cache implemen-tation leverages modern multi-core CPUs and GPUs to keep the per-request neural-net processing overhead low. Figure 14 depictsThis researchwas supported inpart by the Regional Government of Madrid (grant P2018/TCS-4499, EdgeData-CM)andU.S. National Science Foundation (grants CNS-1763617 andCNS-1717179).REFERENCES", + "doc_name": "RL-Cache.pdf", + "img_id": "0335167613f011ef91240242ac120006-d4c12c43938eb55d2d8278eea0d7e6d7" + } + ], + "retcode": 0, + "retmsg": "success" +} + +``` From 329c598673e878fe908cbabf99ad7329703cdb6f Mon Sep 17 00:00:00 2001 From: guoyuhao2330 Date: Fri, 17 May 2024 15:14:17 +0800 Subject: [PATCH 6/8] list_chunks --- api/apps/api_app.py | 8 ++++---- docs/conversation_api.md | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/api/apps/api_app.py b/api/apps/api_app.py index c2568a2ab22..04944904193 100644 --- a/api/apps/api_app.py +++ b/api/apps/api_app.py @@ -364,16 +364,16 @@ def list_chunks(): form_data = request.form try: - if "filename" in form_data.keys(): - tenant_id = DocumentService.get_tenant_id_by_name(form_data['filename']) - q = Q("match", docnm_kwd=form_data['filename']) + if "doc_name" in form_data.keys(): + tenant_id = DocumentService.get_tenant_id_by_name(form_data['doc_name']) + q = Q("match", docnm_kwd=form_data['doc_name']) elif "doc_id" in form_data.keys(): tenant_id = DocumentService.get_tenant_id(form_data['doc_id']) q = Q("match", doc_id=form_data['doc_id']) else: return get_json_result( - data=False,retmsg="Can't find filename or doc_id" + data=False,retmsg="Can't find doc_name or doc_id" ) res_es_search = ELASTICSEARCH.search(q,idxnm=search.index_name(tenant_id),timeout="600s") diff --git a/docs/conversation_api.md b/docs/conversation_api.md index 6e9e83f9773..6310a2fab9c 100644 --- a/docs/conversation_api.md +++ b/docs/conversation_api.md @@ -367,16 +367,16 @@ This is usually used when upload a file to. ## Get document chunks -Get the chunks of the document based on filename or doc_id. +Get the chunks of the document based on doc_name or doc_id. ### Path: /api/list_chunks/ ### Method: POST ### Parameter: -| name | type | optional | description | -|----------|--------|----------|---------------------------------------------------------| -| filename | string | Yes | | -| doc_id | string | Yes | | +| name | type | optional | description | +|----------|--------|----------|---------------------------------| +| doc_name | string | Yes | The doc name in knowledge base. | +| doc_id | string | Yes | The doc_id of doc. | ### Response From dc8ced17fb62d93728c86f0f2e95334aa7ced72b Mon Sep 17 00:00:00 2001 From: GYH <43509927+guoyuhao2330@users.noreply.github.com> Date: Fri, 17 May 2024 15:46:40 +0800 Subject: [PATCH 7/8] Update conversation_api.md --- docs/conversation_api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conversation_api.md b/docs/conversation_api.md index 6310a2fab9c..c3f708f527f 100644 --- a/docs/conversation_api.md +++ b/docs/conversation_api.md @@ -375,8 +375,8 @@ Get the chunks of the document based on doc_name or doc_id. | name | type | optional | description | |----------|--------|----------|---------------------------------| -| doc_name | string | Yes | The doc name in knowledge base. | -| doc_id | string | Yes | The doc_id of doc. | +| doc_name | string | Yes | The doc name in knowledge. It can not be empty without doc_id.| +| doc_id | string | Yes | The doc_id of doc. It can not be empty without doc_name.| ### Response From 1f23c031160bfab0d0a63553b2506c303594897a Mon Sep 17 00:00:00 2001 From: GYH <43509927+guoyuhao2330@users.noreply.github.com> Date: Fri, 17 May 2024 15:57:40 +0800 Subject: [PATCH 8/8] Update conversation_api.md --- docs/conversation_api.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/conversation_api.md b/docs/conversation_api.md index c3f708f527f..bc40983e378 100644 --- a/docs/conversation_api.md +++ b/docs/conversation_api.md @@ -373,10 +373,10 @@ Get the chunks of the document based on doc_name or doc_id. ### Parameter: -| name | type | optional | description | +| Name | Type | Optional | Description | |----------|--------|----------|---------------------------------| -| doc_name | string | Yes | The doc name in knowledge. It can not be empty without doc_id.| -| doc_id | string | Yes | The doc_id of doc. It can not be empty without doc_name.| +| `doc_name` | string | Yes | The name of the document in the knowledge base. It must not be empty if `doc_id` is not set.| +| `doc_id` | string | Yes | The ID of the document in the knowledge base. It must not be empty if `doc_name` is not set.| ### Response