Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integration with Infinity #2894

Merged
merged 1 commit into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ jobs:
echo "Waiting for service to be available..."
sleep 5
done
cd sdk/python && poetry install && source .venv/bin/activate && cd test && pytest t_dataset.py t_chat.py t_session.py t_document.py t_chunk.py
cd sdk/python && poetry install && source .venv/bin/activate && cd test && pytest --tb=short t_dataset.py t_chat.py t_session.py t_document.py t_chunk.py

- name: Stop ragflow:dev
if: always() # always run this step even if previous steps failed
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
git clone https://github.com/infiniflow/ragflow.git
cd ragflow/
export POETRY_VIRTUALENVS_CREATE=true POETRY_VIRTUALENVS_IN_PROJECT=true
~/.local/bin/poetry install --sync --no-root # install RAGFlow dependent python modules
~/.local/bin/poetry install --sync --no-root --with=full # install RAGFlow dependent python modules
```

3. Launch the dependent services (MinIO, Elasticsearch, Redis, and MySQL) using Docker Compose:
Expand All @@ -295,7 +295,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .

Add the following line to `/etc/hosts` to resolve all hosts specified in **docker/service_conf.yaml** to `127.0.0.1`:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
In **docker/service_conf.yaml**, update mysql port to `5455` and es port to `1200`, as specified in **docker/.env**.

Expand Down
2 changes: 1 addition & 1 deletion README_ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .

`/etc/hosts` に以下の行を追加して、**docker/service_conf.yaml** に指定されたすべてのホストを `127.0.0.1` に解決します:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
**docker/service_conf.yaml** で mysql のポートを `5455` に、es のポートを `1200` に更新します(**docker/.env** に指定された通り).

Expand Down
2 changes: 1 addition & 1 deletion README_ko.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .

`/etc/hosts` 에 다음 줄을 추가하여 **docker/service_conf.yaml** 에 지정된 모든 호스트를 `127.0.0.1` 로 해결합니다:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
**docker/service_conf.yaml** 에서 mysql 포트를 `5455` 로, es 포트를 `1200` 으로 업데이트합니다( **docker/.env** 에 지정된 대로).

Expand Down
2 changes: 1 addition & 1 deletion README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .

在 `/etc/hosts` 中添加以下代码,将 **docker/service_conf.yaml** 文件中的所有 host 地址都解析为 `127.0.0.1`:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
在文件 **docker/service_conf.yaml** 中,对照 **docker/.env** 的配置将 mysql 端口更新为 `5455`,es 端口更新为 `1200`。

Expand Down
5 changes: 3 additions & 2 deletions api/apps/api_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,13 +529,14 @@ def list_chunks():
return get_json_result(
data=False, message="Can't find doc_name or doc_id"
)
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)

res = retrievaler.chunk_list(doc_id=doc_id, tenant_id=tenant_id)
res = retrievaler.chunk_list(doc_id, tenant_id, kb_ids)
res = [
{
"content": res_item["content_with_weight"],
"doc_name": res_item["docnm_kwd"],
"img_id": res_item["img_id"]
"image_id": res_item["img_id"]
} for res_item in res
]

Expand Down
78 changes: 37 additions & 41 deletions api/apps/chunk_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,22 @@

from flask import request
from flask_login import login_required, current_user
from elasticsearch_dsl import Q

from api.db.services.dialog_service import keyword_extraction
from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import search, rag_tokenizer
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils import rmSpace
from api.db import LLMType, ParserType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import UserTenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.db.services.document_service import DocumentService
from api.settings import RetCode, retrievaler, kg_retrievaler
from api.settings import RetCode, retrievaler, kg_retrievaler, docStoreConn
from api.utils.api_utils import get_json_result
import hashlib
import re


@manager.route('/list', methods=['POST'])
@login_required
@validate_request("doc_id")
Expand All @@ -53,12 +50,13 @@ def list_chunk():
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(message="Document not found!")
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
query = {
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
}
if "available_int" in req:
query["available_int"] = int(req["available_int"])
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
sres = retrievaler.search(query, search.index_name(tenant_id), kb_ids, highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
for id in sres.ids:
d = {
Expand All @@ -69,16 +67,12 @@ def list_chunk():
"doc_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_kwd": sres.field[id].get("important_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"image_id": sres.field[id].get("img_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t")
"positions": json.loads(sres.field[id].get("position_list", "[]")),
}
if len(d["positions"]) % 5 == 0:
poss = []
for i in range(0, len(d["positions"]), 5):
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
d["positions"] = poss
assert isinstance(d["positions"], list)
assert len(d["positions"])==0 or (isinstance(d["positions"][0], list) and len(d["positions"][0]) == 5)
res["chunks"].append(d)
return get_json_result(data=res)
except Exception as e:
Expand All @@ -96,22 +90,20 @@ def get():
tenants = UserTenantService.query(user_id=current_user.id)
if not tenants:
return get_data_error_result(message="Tenant not found!")
res = ELASTICSEARCH.get(
chunk_id, search.index_name(
tenants[0].tenant_id))
if not res.get("found"):
tenant_id = tenants[0].tenant_id

kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
chunk = docStoreConn.get(chunk_id, search.index_name(tenant_id), kb_ids)
if chunk is None:
return server_error_response("Chunk not found")
id = res["_id"]
res = res["_source"]
res["chunk_id"] = id
k = []
for n in res.keys():
for n in chunk.keys():
if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
k.append(n)
for n in k:
del res[n]
del chunk[n]

return get_json_result(data=res)
return get_json_result(data=chunk)
except Exception as e:
if str(e).find("NotFoundError") >= 0:
return get_json_result(data=False, message='Chunk not found!',
Expand Down Expand Up @@ -162,7 +154,7 @@ def set():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Expand All @@ -174,11 +166,11 @@ def set():
def switch():
req = request.json
try:
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
search.index_name(tenant_id)):
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not docStoreConn.update({"id": req["chunk_ids"]}, {"available_int": int(req["available_int"])},
search.index_name(doc.tenant_id), doc.kb_id):
return get_data_error_result(message="Index updating failure")
return get_json_result(data=True)
except Exception as e:
Expand All @@ -191,12 +183,11 @@ def switch():
def rm():
req = request.json
try:
if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
return get_data_error_result(message="Index updating failure")
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not docStoreConn.delete({"id": req["chunk_ids"]}, search.index_name(current_user.id), doc.kb_id):
return get_data_error_result(message="Index updating failure")
deleted_chunk_ids = req["chunk_ids"]
chunk_number = len(deleted_chunk_ids)
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
Expand Down Expand Up @@ -239,7 +230,7 @@ def create():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)

DocumentService.increment_chunk_num(
doc.id, doc.kb_id, c, 1, 0)
Expand All @@ -256,26 +247,27 @@ def retrieval_test():
page = int(req.get("page", 1))
size = int(req.get("size", 30))
question = req["question"]
kb_id = req["kb_id"]
if isinstance(kb_id, str): kb_id = [kb_id]
kb_ids = req["kb_id"]
if isinstance(kb_ids, str):
kb_ids = [kb_ids]
doc_ids = req.get("doc_ids", [])
similarity_threshold = float(req.get("similarity_threshold", 0.0))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024))

try:
tenants = UserTenantService.query(user_id=current_user.id)
for kid in kb_id:
for kb_id in kb_ids:
for tenant in tenants:
if KnowledgebaseService.query(
tenant_id=tenant.tenant_id, id=kid):
tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(
data=False, message='Only owner of knowledgebase authorized for this operation.',
code=RetCode.OPERATING_ERROR)

e, kb = KnowledgebaseService.get_by_id(kb_id[0])
e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
if not e:
return get_data_error_result(message="Knowledgebase not found!")

Expand All @@ -290,7 +282,7 @@ def retrieval_test():
question += keyword_extraction(chat_mdl, question)

retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, page, size,
similarity_threshold, vector_similarity_weight, top,
doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
for c in ranks["chunks"]:
Expand All @@ -309,12 +301,16 @@ def retrieval_test():
@login_required
def knowledge_graph():
doc_id = request.args["doc_id"]
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(doc_id)
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
req = {
"doc_ids":[doc_id],
"knowledge_graph_kwd": ["graph", "mind_map"]
}
tenant_id = DocumentService.get_tenant_id(doc_id)
sres = retrievaler.search(req, search.index_name(tenant_id))
sres = retrievaler.search(req, search.index_name(tenant_id), kb_ids, doc.kb_id)
obj = {"graph": {}, "mind_map": {}}
for id in sres.ids[:2]:
ty = sres.field[id]["knowledge_graph_kwd"]
Expand Down
29 changes: 10 additions & 19 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import re

import flask
from elasticsearch_dsl import Q
from flask import request
from flask_login import login_required, current_user

Expand All @@ -27,14 +26,13 @@
from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import UserTenantService
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType, TaskStatus, ParserType, FileSource
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.settings import RetCode
from api.settings import RetCode, docStoreConn
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail
Expand Down Expand Up @@ -275,18 +273,8 @@ def change_status():
return get_data_error_result(
message="Database error (Document update)!")

if str(req["status"]) == "0":
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=0;",
idxnm=search.index_name(
kb.tenant_id)
)
else:
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=1;",
idxnm=search.index_name(
kb.tenant_id)
)
status = int(req["status"])
docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Expand Down Expand Up @@ -365,8 +353,11 @@ def run():
tenant_id = DocumentService.get_tenant_id(id)
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
e, doc = DocumentService.get_by_id(id)
if not e:
return get_data_error_result(message="Document not found!")
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)

if str(req["run"]) == TaskStatus.RUNNING.value:
TaskService.filter_delete([Task.doc_id == id])
Expand Down Expand Up @@ -490,8 +481,8 @@ def change_parser():
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)

return get_json_result(data=True)
except Exception as e:
Expand Down
5 changes: 5 additions & 0 deletions api/apps/kb_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
from api.db.db_models import File
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from api.settings import docStoreConn
from rag.nlp import search


@manager.route('/create', methods=['post'])
Expand Down Expand Up @@ -166,6 +168,9 @@ def rm():
if not KnowledgebaseService.delete_by_id(req["kb_id"]):
return get_data_error_result(
message="Database error (Knowledgebase removal)!")
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
docStoreConn.deleteIdx(search.index_name(tenant.tenant_id), req["kb_id"])
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Loading