Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 8ea631a
Author: balibabu <cike8899@users.noreply.github.com>
Date:   Mon Dec 16 18:51:45 2024 +0800

    Fix: Every time you switch the page number of a chunk, the PDF document will be reloaded. infiniflow#4046 (infiniflow#4047)

    ### What problem does this PR solve?

    Fix: Every time you switch the page number of a chunk, the PDF document
    will be reloaded. infiniflow#4046

    ### Type of change

    - [x] Bug Fix (non-breaking change which fixes an issue)

commit 7fb67c4
Author: Kevin Hu <kevinhu.sh@gmail.com>
Date:   Mon Dec 16 15:23:49 2024 +0800

    Fix chunk number error after re-parsing. (infiniflow#4043)

    ### What problem does this PR solve?

    ### Type of change

    - [x] Bug Fix (non-breaking change which fixes an issue)

commit 44ac87a
Author: Michael Luo <luoshitou9@gmail.com>
Date:   Mon Dec 16 14:35:21 2024 +0800

    Remove Redundant None Check for vector_similarity_weight (infiniflow#4037)

    ### What problem does this PR solve?
    The removed if statement is unnecessary and adds cognitive load for
    readers.
    The original code:
    ```
    vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
    if vector_similarity_weight is None:
        vector_similarity_weight = 0.3
    ```
    has been simplified to:
    ```
    vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
    ```

    ### Type of change
    - [x] Refactoring

commit 7ddccbb
Author: so95 <is.thaison@gmail.com>
Date:   Mon Dec 16 08:46:59 2024 +0700

    extraction sqlquery (infiniflow#4027)

    clone infiniflow#4023
    Improve the information extraction: most LLMs return results wrapped in a
    markdown code fence, e.g. ```sql <query> ```
  • Loading branch information
isthaison committed Dec 17, 2024
1 parent 76d815d commit c00861d
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 22 deletions.
20 changes: 17 additions & 3 deletions agent/component/exesql.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import psycopg2
from agent.component.base import ComponentBase, ComponentParamBase
import pyodbc

import logging

class ExeSQLParam(ComponentParamBase):
"""
Expand Down Expand Up @@ -65,13 +65,26 @@ def _run(self, history, **kwargs):
self._loop += 1

ans = self.get_input()


ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
ans = re.sub(r'^.*?SELECT ', 'SELECT ', repr(ans), flags=re.IGNORECASE)
if self._param.db_type == 'mssql':
# improve the information extraction, most llm return results in markdown format ```sql query ```
match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
if match:
ans = match.group(1) # Query content
print(ans)
else:
print("no markdown")
ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
else:
ans = re.sub(r'^.*?SELECT ', 'SELECT ', repr(ans), flags=re.IGNORECASE)
ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
ans = re.sub(r';[^;]*$', r';', ans)
if not ans:
raise Exception("SQL statement not found!")

logging.info("db_type: ",self._param.db_type)
if self._param.db_type in ["mysql", "mariadb"]:
db = pymysql.connect(db=self._param.database, user=self._param.username, host=self._param.host,
port=self._param.port, password=self._param.password)
Expand All @@ -96,11 +109,12 @@ def _run(self, history, **kwargs):
if not single_sql:
continue
try:
logging.info("single_sql: ",single_sql)
cursor.execute(single_sql)
if cursor.rowcount == 0:
sql_res.append({"content": "\nTotal: 0\n No record in the database!"})
continue
single_res = pd.DataFrame([i for i in cursor.fetchmany(size=self._param.top_n)])
single_res = pd.DataFrame([i for i in cursor.fetchmany(self._param.top_n)])
single_res.columns = [i[0] for i in cursor.description]
sql_res.append({"content": "\nTotal: " + str(cursor.rowcount) + "\n" + single_res.to_markdown()})
except Exception as e:
Expand Down
2 changes: 0 additions & 2 deletions api/apps/dialog_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ def set_dialog():
req["rerank_id"] = ""
similarity_threshold = req.get("similarity_threshold", 0.1)
vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
if vector_similarity_weight is None:
vector_similarity_weight = 0.3
llm_setting = req.get("llm_setting", {})
default_prompt = {
"system": """你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。
Expand Down
3 changes: 1 addition & 2 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,12 +356,11 @@ def run():
try:
for id in req["doc_ids"]:
info = {"run": str(req["run"]), "progress": 0}
if str(req["run"]) == TaskStatus.RUNNING.value:
if str(req["run"]) == TaskStatus.RUNNING.value and req.get("delete", False):
info["progress_msg"] = ""
info["chunk_num"] = 0
info["token_num"] = 0
DocumentService.update_by_id(id, info)
# if str(req["run"]) == TaskStatus.CANCEL.value:
tenant_id = DocumentService.get_tenant_id(id)
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
Expand Down
11 changes: 8 additions & 3 deletions api/db/services/task_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,15 +248,17 @@ def new_task():

prev_tasks = TaskService.get_tasks(doc["id"])
if prev_tasks:
ck_num = 0
for task in tsks:
reuse_prev_task_chunks(task, prev_tasks, chunking_config)
ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
TaskService.filter_delete([Task.doc_id == doc["id"]])
chunk_ids = []
for task in prev_tasks:
if task["chunk_ids"]:
chunk_ids.extend(task["chunk_ids"].split())
if chunk_ids:
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})

bulk_insert_into_db(Task, tsks, True)
DocumentService.begin2parse(doc["id"])
Expand All @@ -267,14 +269,17 @@ def new_task():
SVR_QUEUE_NAME, message=t
), "Can't access Redis. Please check the Redis' status."


def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
idx = bisect.bisect_left(prev_tasks, task["from_page"], key=lambda x: x["from_page"])
if idx >= len(prev_tasks):
return
return 0
prev_task = prev_tasks[idx]
if prev_task["progress"] < 1.0 or prev_task["digest"] != task["digest"] or not prev_task["chunk_ids"]:
return
return 0
task["chunk_ids"] = prev_task["chunk_ids"]
task["progress"] = 1.0
task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): reused previous task's chunks"
prev_task["chunk_ids"] = ""

return len(task["chunk_ids"].split())
5 changes: 2 additions & 3 deletions web/src/hooks/chunk-hooks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@ export const useFetchNextChunkList = (): ResponseGetType<{
debouncedSearchString,
available,
],

initialData: { data: [], total: 0, documentInfo: {} },
// placeholderData: keepPreviousData,
placeholderData: (previousData) =>
previousData ?? { data: [], total: 0, documentInfo: {} }, // https://github.com/TanStack/query/issues/8183
gcTime: 0,
queryFn: async () => {
const { data } = await kbService.chunk_list({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ const HighlightPopup = ({
// TODO: merge with DocumentPreviewer
const Preview = ({ highlights: state, setWidthAndHeight }: IProps) => {
const url = useGetDocumentUrl();
useCatchDocumentError(url);

const ref = useRef<(highlight: IHighlight) => void>(() => {});
const error = useCatchDocumentError(url);
Expand Down Expand Up @@ -119,12 +118,4 @@ const Preview = ({ highlights: state, setWidthAndHeight }: IProps) => {
);
};

const compare = (oldProps: IProps, newProps: IProps) => {
const arePropsEqual =
oldProps.highlights === newProps.highlights ||
(oldProps.highlights.length === 0 && newProps.highlights.length === 0);

return arePropsEqual;
};

export default memo(Preview);

0 comments on commit c00861d

Please sign in to comment.