Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add api/list_chunks function #821

Merged
merged 11 commits into from
May 17, 2024
43 changes: 43 additions & 0 deletions api/apps/api_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
from api.utils.file_utils import filename_type, thumbnail
from rag.utils.minio_conn import MINIO

from rag.utils.es_conn import ELASTICSEARCH
from rag.nlp import search
from elasticsearch_dsl import Q

def generate_confirmation_token(tenent_id):
serializer = URLSafeTimedSerializer(tenent_id)
Expand Down Expand Up @@ -347,3 +350,43 @@ def upload():
return server_error_response(e)

return get_json_result(data=doc_result.to_json())


@manager.route('/list_chunks', methods=['POST'])
# @login_required
def list_chunks():
token = request.headers.get('Authorization').split()[1]
objs = APIToken.query(token=token)
if not objs:
return get_json_result(
data=False, retmsg='Token is not valid!"', retcode=RetCode.AUTHENTICATION_ERROR)

form_data = request.form

try:
if "doc_name" in form_data.keys():
tenant_id = DocumentService.get_tenant_id_by_name(form_data['doc_name'])
q = Q("match", docnm_kwd=form_data['doc_name'])

elif "doc_id" in form_data.keys():
tenant_id = DocumentService.get_tenant_id(form_data['doc_id'])
q = Q("match", doc_id=form_data['doc_id'])
else:
return get_json_result(
data=False,retmsg="Can't find doc_name or doc_id"
)

res_es_search = ELASTICSEARCH.search(q,idxnm=search.index_name(tenant_id),timeout="600s")

res = [{} for _ in range(len(res_es_search['hits']['hits']))]

for index , chunk in enumerate(res_es_search['hits']['hits']):
res[index]['doc_name'] = chunk['_source']['docnm_kwd']
res[index]['content'] = chunk['_source']['content_with_weight']
if 'img_id' in chunk['_source'].keys():
res[index]['img_id'] = chunk['_source']['img_id']

except Exception as e:
return server_error_response(e)

return get_json_result(data=res)
13 changes: 13 additions & 0 deletions api/db/services/document_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,19 @@ def get_tenant_id(cls, doc_id):
return
return docs[0]["tenant_id"]

@classmethod
@DB.connection_context()
def get_tenant_id_by_name(cls, name):
docs = cls.model.select(
Knowledgebase.tenant_id).join(
Knowledgebase, on=(
Knowledgebase.id == cls.model.kb_id)).where(
cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value)
docs = docs.dicts()
if not docs:
return
return docs[0]["tenant_id"]

@classmethod
@DB.connection_context()
def get_thumbnails(cls, docids):
Expand Down
35 changes: 35 additions & 0 deletions docs/conversation_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -364,3 +364,38 @@ This is usually used when upload a file to.
}

```

## Get document chunks

Get the chunks of the document based on doc_name or doc_id.
### Path: /api/list_chunks/
### Method: POST

### Parameter:

| Name | Type | Optional | Description |
|----------|--------|----------|---------------------------------|
| `doc_name` | string | Yes | The name of the document in the knowledge base. It must not be empty if `doc_id` is not set.|
| `doc_id` | string | Yes | The ID of the document in the knowledge base. It must not be empty if `doc_name` is not set.|


### Response
```json
{
"data": [
{
"content": "Figure 14: Per-request neural-net processingof RL-Cache.\n103\n(sn)\nCPU\n 102\nGPU\n8101\n100\n8\n16 64 256 1K\n4K",
"doc_name": "RL-Cache.pdf",
"img_id": "0335167613f011ef91240242ac120006-b46c3524952f82dbe061ce9b123f2211"
},
{
"content": "4.3 ProcessingOverheadof RL-CacheACKNOWLEDGMENTSThis section evaluates how e￿ectively our RL-Cache implemen-tation leverages modern multi-core CPUs and GPUs to keep the per-request neural-net processing overhead low. Figure 14 depictsThis researchwas supported inpart by the Regional Government of Madrid (grant P2018/TCS-4499, EdgeData-CM)andU.S. National Science Foundation (grants CNS-1763617 andCNS-1717179).REFERENCES",
"doc_name": "RL-Cache.pdf",
"img_id": "0335167613f011ef91240242ac120006-d4c12c43938eb55d2d8278eea0d7e6d7"
}
],
"retcode": 0,
"retmsg": "success"
}

```