Commit
Merge remote-tracking branch 'upstream/main'
isthaison committed Dec 10, 2024
2 parents b72de7e + b844ad6 commit 80fb74c
Showing 13 changed files with 142 additions and 46 deletions.
2 changes: 2 additions & 0 deletions agent/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from beartype.claw import beartype_this_package
beartype_this_package()
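
The `beartype_this_package()` call added to each package's `__init__.py` installs beartype's `claw` import hook, which enforces type hints at runtime for every function in the package. As a rough stdlib-only sketch of the effect (a hand-rolled decorator for one function, not beartype's actual import-hook implementation):

```python
import functools
import inspect

def check_types(func):
    # Validate annotated parameters at call time, roughly what beartype
    # applies package-wide via its import hook.
    hints = func.__annotations__
    sig = inspect.signature(func)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        bound = sig.bind(*args, **kwargs)
        for name, value in bound.arguments.items():
            expected = hints.get(name)
            if isinstance(expected, type) and not isinstance(value, expected):
                raise TypeError(
                    f"{name} must be {expected.__name__}, got {type(value).__name__}"
                )
        return func(*args, **kwargs)
    return wrapper

@check_types
def greet(name: str) -> str:
    return f"Hello, {name}!"
```

With the hook in place, `greet(42)` raises `TypeError` instead of silently producing `"Hello, 42!"`, which is the behavior these two-line additions buy across `agent`, `api`, `deepdoc`, `rag`, and the SDK.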
2 changes: 2 additions & 0 deletions api/__init__.py
@@ -0,0 +1,2 @@
from beartype.claw import beartype_this_package
beartype_this_package()
2 changes: 2 additions & 0 deletions deepdoc/__init__.py
@@ -0,0 +1,2 @@
from beartype.claw import beartype_this_package
beartype_this_package()
2 changes: 1 addition & 1 deletion docs/guides/configure_knowledge_base.md
@@ -58,7 +58,7 @@ You can also change the chunk template for a particular file on the **Datasets**

### Select embedding model

An embedding model converts chunks into embeddings. It cannot be changed once the knowledge base has chunks. To switch to a different embedding model, You must delete all chunks in the knowledge base. The obvious reason is that we *must* ensure that files in a specific knowledge base are converted to embeddings using the *same* embedding model (ensure that they are compared in the same embedding space).
An embedding model converts chunks into embeddings. It cannot be changed once the knowledge base has chunks. To switch to a different embedding model, you must delete all existing chunks in the knowledge base. The obvious reason is that we *must* ensure that files in a specific knowledge base are converted to embeddings using the *same* embedding model (ensure that they are compared in the same embedding space).

The following embedding models can be deployed locally:

102 changes: 99 additions & 3 deletions docs/release_notes.md
@@ -13,7 +13,7 @@ Released on November 29, 2024.

### Improvements

Adds [Infinity's configuration file](https://github.com/infiniflow/ragflow/blob/main/docker/infinity_conf.toml) to facilitate integration and customization of Infinity as a document engine. From this release onwards, updates to Infinity's configuration can be made directly within RAGFlow and will take effect immediately after restarting RAGFlow using `docker compose`. [#3715](https://github.com/infiniflow/ragflow/pull/3715)
Adds [Infinity's configuration file](https://github.com/infiniflow/ragflow/blob/main/docker/infinity_conf.toml) to facilitate integration and customization of [Infinity](https://github.com/infiniflow/infinity) as a document engine. From this release onwards, updates to Infinity's configuration can be made directly within RAGFlow and will take effect immediately after restarting RAGFlow using `docker compose`. [#3715](https://github.com/infiniflow/ragflow/pull/3715)

### Fixed issues

@@ -137,7 +137,7 @@ See [Upgrade RAGFlow](https://ragflow.io/docs/dev/upgrade_ragflow) for instructions

## v0.11.0

Released on September 14, 2024
Released on September 14, 2024.

### New features

@@ -152,4 +152,100 @@ Released on September 14, 2024
- Supports running retrieval benchmarking on the following datasets:
- [ms_marco_v1.1](https://huggingface.co/datasets/microsoft/ms_marco)
- [trivia_qa](https://huggingface.co/datasets/mandarjoshi/trivia_qa)
- [miracl](https://huggingface.co/datasets/miracl/miracl)
- [miracl](https://huggingface.co/datasets/miracl/miracl)

## v0.10.0

Released on August 26, 2024.

### New features

- Introduces a text-to-SQL template in the Agent UI.
- Implements Agent APIs.
- Incorporates monitoring for the task executor.
- Introduces Agent tools **GitHub**, **DeepL**, **BaiduFanyi**, **QWeather**, and **GoogleScholar**.
- Supports chunking of EML files.
- Supports more LLMs or model services: **GPT-4o-mini**, **PerfXCloud**, **TogetherAI**, **Upstage**, **Novita.AI**, **01.AI**, **SiliconFlow**, **XunFei Spark**, **Baidu Yiyan**, and **Tencent Hunyuan**.

## v0.9.0

Released on August 6, 2024.

### New features

- Supports GraphRAG as a chunk method.
- Introduces Agent component **Keyword** and search tools, including **Baidu**, **DuckDuckGo**, **PubMed**, **Wikipedia**, **Bing**, and **Google**.
- Supports speech-to-text recognition for audio files.
- Supports model vendors **Gemini** and **Groq**.
- Supports inference frameworks, engines, and services including **LM Studio**, **OpenRouter**, **LocalAI**, and **Nvidia API**.
- Supports using reranker models in Xinference.

## v0.8.0

Released on July 8, 2024.

### New features

- Supports Agentic RAG, enabling graph-based workflow construction for RAG and agents.
- Supports model vendors **Mistral**, **MiniMax**, **Bedrock**, and **Azure OpenAI**.
- Supports DOCX files in the MANUAL chunk method.
- Supports DOCX, MD, and PDF files in the Q&A chunk method.

## v0.7.0

Released on May 31, 2024.

### New features

- Supports the use of reranker models.
- Integrates reranker and embedding models: [BCE](https://github.com/netease-youdao/BCEmbedding), [BGE](https://github.com/FlagOpen/FlagEmbedding), and [Jina](https://jina.ai/embeddings/).
- Supports LLMs Baichuan and VolcanoArk.
- Implements [RAPTOR](https://arxiv.org/html/2401.18059v1) for improved text retrieval.
- Supports HTML files in the GENERAL chunk method.
- Provides HTTP and Python APIs for deleting documents by ID.
- Supports ARM64 platforms.

:::danger IMPORTANT
While we also test RAGFlow on ARM64 platforms, we do not plan to maintain RAGFlow Docker images for ARM.

If you are on an ARM platform, follow [this guide](https://ragflow.io/docs/dev/build_docker_image) to build a RAGFlow Docker image.
:::

### Related APIs

#### HTTP API

- [Delete documents](https://ragflow.io/docs/dev/http_api_reference#delete-documents)

#### Python API

- [Delete documents](https://ragflow.io/docs/dev/python_api_reference#delete-documents)

## v0.6.0

Released on May 21, 2024.

### New features

- Supports streaming output.
- Provides HTTP and Python APIs for retrieving document chunks.
- Supports monitoring of system components, including Elasticsearch, MySQL, Redis, and MinIO.
- Supports disabling **Layout Recognition** in the GENERAL chunk method to reduce file chunking time.

### Related APIs

#### HTTP API

- [Retrieve chunks](https://ragflow.io/docs/dev/http_api_reference#retrieve-chunks)

#### Python API

- [Retrieve chunks](https://ragflow.io/docs/dev/python_api_reference#retrieve-chunks)

## v0.5.0

Released on May 8, 2024.

### New features

- Supports LLM DeepSeek.
3 changes: 3 additions & 0 deletions intergrations/chatgpt-on-wechat/plugins/__init__.py
@@ -1,3 +1,6 @@
from beartype.claw import beartype_this_package
beartype_this_package()

from .ragflow_chat import RAGFlowChat

__all__ = [
2 changes: 2 additions & 0 deletions rag/__init__.py
@@ -0,0 +1,2 @@
from beartype.claw import beartype_this_package
beartype_this_package()
9 changes: 5 additions & 4 deletions rag/utils/infinity_conn.py
@@ -350,8 +350,9 @@ def insert(
assert "_id" not in d
assert "id" in d
for k, v in d.items():
if k.endswith("_kwd") and isinstance(v, list):
d[k] = " ".join(v)
if k in ["important_kwd", "question_kwd", "entities_kwd"]:
assert isinstance(v, list)
d[k] = "###".join(v)
elif k == 'kb_id':
if isinstance(d[k], list):
d[k] = d[k][0] # since d[k] is a list, but we need a str
@@ -443,9 +444,9 @@ def getFields(self, res, fields: list[str]) -> list[str, dict]:
v = res[fieldnm][i]
if isinstance(v, Series):
v = list(v)
elif fieldnm.endswith("_kwd"):
elif fieldnm in ["important_kwd", "question_kwd", "entities_kwd"]:
assert isinstance(v, str)
v = v.split()
v = [kwd for kwd in v.split("###") if kwd]
elif fieldnm == "position_int":
assert isinstance(v, str)
if v:
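
The two hunks above replace the whitespace join with a `###` sentinel when storing keyword lists in Infinity's string columns, so multi-word keywords survive the round trip; `getFields` splits on the same sentinel and drops empty fragments. A minimal sketch of the round trip (helper names are illustrative, not RAGFlow's API):

```python
SEP = "###"  # sentinel chosen by this commit; assumed never to occur inside a keyword

def pack_keywords(keywords: list[str]) -> str:
    # Flatten the list into a single string column for the document engine.
    return SEP.join(keywords)

def unpack_keywords(value: str) -> list[str]:
    # Mirror `[kwd for kwd in v.split("###") if kwd]`: drop empty fragments.
    return [kwd for kwd in value.split(SEP) if kwd]

kwds = ["machine learning", "vector search", "RAG"]
assert unpack_keywords(pack_keywords(kwds)) == kwds
# The old whitespace join was lossy for multi-word keywords:
assert "machine learning vector search RAG".split() != kwds
```

This is why the fix narrows the handling to the specific `*_kwd` fields that hold lists rather than every field ending in `_kwd`.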
3 changes: 3 additions & 0 deletions sdk/python/ragflow_sdk/__init__.py
@@ -1,3 +1,6 @@
from beartype.claw import beartype_this_package
beartype_this_package()

import importlib.metadata

from .ragflow import RAGFlow
26 changes: 0 additions & 26 deletions web/reducer.js

This file was deleted.

20 changes: 10 additions & 10 deletions web/src/locales/en.ts
@@ -86,7 +86,7 @@ export default {
namePlaceholder: 'Please input name!',
doc: 'Docs',
datasetDescription:
'😉 Questions and answers can only be answered after the parsing is successful.',
'😉 Please wait for your file to finish parsing before starting an AI-powered chat.',
addFile: 'Add file',
searchFiles: 'Search your files',
localFiles: 'Local files',
@@ -158,17 +158,17 @@ export default {
topKTip: `K chunks will be fed into rerank models.`,
delimiter: `Delimiter`,
delimiterTip:
'Supports multiple characters as separators, and the multiple character separators are wrapped with `. For example, if it is configured like this: \n`##`; then the text will be separated by line breaks, two #s and a semicolon, and then assembled according to the size of the "token number".',
'A delimiter or separator can consist of one or multiple special characters. If it is multiple characters, ensure they are enclosed in backticks (``). For example, if you configure your delimiters like this: \n`##`;, then your texts will be separated at line breaks, double hash symbols (##), or semicolons.',
html4excel: 'Excel to HTML',
html4excelTip: `When enabled, the spreadsheet will be parsed into HTML tables, and at most 256 rows for one table. Otherwise, it will be parsed into key-value pairs by row.`,
autoKeywords: 'Auto-keyword',
autoKeywordsTip: `Extract N keywords for each chunk to increase their ranking for queries containing those keywords. You can check or update the added keywords for a chunk from the chunk list. Be aware that extra tokens will be consumed by the LLM specified in 'System model settings'.`,
autoKeywordsTip: `Automatically extract N keywords for each chunk to increase their ranking for queries containing those keywords. You can check or update the added keywords for a chunk from the chunk list. Be aware that extra tokens will be consumed by the LLM specified in 'System model settings'.`,
autoQuestions: 'Auto-question',
autoQuestionsTip: `Extract N questions for each chunk to increase their ranking for queries containing those questions. You can check or update the added questions for a chunk from the chunk list. This feature will not disrupt the chunking process if an error occurs, except that it may add an empty result to the original chunk. Be aware that extra tokens will be consumed by the LLM specified in 'System model settings'.`,
autoQuestionsTip: `Automatically extract N questions for each chunk to increase their ranking for queries containing those questions. You can check or update the added questions for a chunk from the chunk list. This feature will not disrupt the chunking process if an error occurs, except that it may add an empty result to the original chunk. Be aware that extra tokens will be consumed by the LLM specified in 'System model settings'.`,
},
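
The rewritten delimiter tip above describes a spec in which multi-character separators are wrapped in backticks while bare characters stand alone, e.g. a newline, `##`, and a semicolon. A hypothetical parser for such a spec (a sketch of the described behavior, not RAGFlow's actual implementation):

```python
import re

def parse_delimiters(spec: str) -> list[str]:
    # Backtick-wrapped runs are multi-character delimiters; every other
    # character in the spec is a single-character delimiter.
    return [wrapped or single
            for wrapped, single in re.findall(r"`([^`]+)`|(.)", spec, flags=re.S)]

def split_text(text: str, spec: str) -> list[str]:
    # Split on any configured delimiter and discard empty segments.
    pattern = "|".join(re.escape(d) for d in parse_delimiters(spec))
    return [seg for seg in re.split(pattern, text) if seg]

spec = "\n`##`;"
assert parse_delimiters(spec) == ["\n", "##", ";"]
assert split_text("a##b;c\nd", spec) == ["a", "b", "c", "d"]
```

The resulting segments are then merged back up to the configured chunk token number, per the surrounding settings.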
knowledgeConfiguration: {
titleDescription:
'Update your knowledge base configurations here, particularly the chunk method.',
'Update your knowledge base configuration here, particularly the chunk method.',
name: 'Knowledge base name',
photo: 'Knowledge base photo',
description: 'Description',
@@ -180,13 +180,13 @@ export default {
chunkTokenNumber: 'Chunk token number',
chunkTokenNumberMessage: 'Chunk token number is required',
embeddingModelTip:
'The model that converts chunks into embeddings. It cannot be changed once the knowledge base has chunks. To switch to a different embedding model, You must delete all chunks in the knowledge base.',
'The model that converts chunks into embeddings. It cannot be changed once the knowledge base has chunks. To switch to a different embedding model, you must delete all existing chunks in the knowledge base.',
permissionsTip:
"If set to 'Team', all team members will be able to manage the knowledge base.",
chunkTokenNumberTip:
'It sets the token threshold for a chunk. A paragraph with fewer tokens than this threshold will be combined with the following paragraph until the token count exceeds the threshold, at which point a chunk is created.',
chunkMethod: 'Chunk method',
chunkMethodTip: 'Tips are on the right.',
chunkMethodTip: 'View the tips on the right.',
upload: 'Upload',
english: 'English',
chinese: 'Chinese',
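
The chunk token number tip above describes a greedy merge: paragraphs accumulate until the running token count exceeds the threshold, at which point a chunk is emitted. A sketch of that logic, using whitespace word count as a stand-in for the real tokenizer:

```python
def chunk_paragraphs(paragraphs: list[str], threshold: int) -> list[str]:
    # Greedily combine paragraphs; emit a chunk once the token count
    # exceeds the threshold, then start accumulating again.
    chunks, current, tokens = [], [], 0
    for para in paragraphs:
        current.append(para)
        tokens += len(para.split())  # placeholder for the actual tokenizer
        if tokens > threshold:
            chunks.append("\n".join(current))
            current, tokens = [], 0
    if current:
        chunks.append("\n".join(current))
    return chunks

paras = ["one two", "three four five", "six"]
# threshold 3: the first chunk closes after the second paragraph (5 tokens > 3)
assert chunk_paragraphs(paras, 3) == ["one two\nthree four five", "six"]
```
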
@@ -279,12 +279,12 @@ export default {
</p>`,
knowledgeGraph: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>
<p>This approach chunks files using the 'naive'/'General' method. It splits a document into segements and then combines adjacent segments until the token count exceeds the threshold specified by 'Chunk token number', at which point a chunk is created.</p>
<p>This approach chunks files using the 'naive'/'General' method. It splits a document into segments and then combines adjacent segments until the token count exceeds the threshold specified by 'Chunk token number', at which point a chunk is created.</p>
<p>The chunks are then fed to the LLM to extract entities and relationships for a knowledge graph and a mind map.</p>
<p>Ensure that you set the <b>Entity types</b>.</p>`,
useRaptor: 'Use RAPTOR to enhance retrieval',
useRaptorTip:
'Recursive Abstractive Processing for Tree-Organized Retrieval, see https://huggingface.co/papers/2401.18059 for more information',
'Recursive Abstractive Processing for Tree-Organized Retrieval, see https://huggingface.co/papers/2401.18059 for more information.',
prompt: 'Prompt',
promptTip: 'LLM prompt used for summarization.',
promptMessage: 'Prompt is required',
Expand All @@ -305,7 +305,7 @@ The above is the content you need to summarize.`,
entityTypes: 'Entity types',
vietnamese: 'Vietnamese',
pageRank: 'Page rank',
pageRankTip: `This is used to boost the relevance score. The relevance score with all the retrieved chunks will plus this number, When you want to search the given knowledge base at first place, set a higher pagerank score than others.`,
pageRankTip: `This increases the relevance score of the knowledge base. Its value will be added to the relevance score of all retrieved chunks from this knowledge base. Useful when you are searching within multiple knowledge bases and want to assign a higher pagerank score to a specific one.`,
},
chunk: {
chunk: 'Chunk',
7 changes: 5 additions & 2 deletions web/src/pages/chat/markdown-content/index.tsx
@@ -20,9 +20,10 @@ import { useTranslation } from 'react-i18next';

import 'katex/dist/katex.min.css'; // `rehype-katex` does not import the CSS for you

import { replaceTextByOldReg } from '../utils';
import styles from './index.less';

const reg = /(#{2}\d+\${2})/g;
const reg = /(#{2}\d+@{2})/g;
const curReg = /(~{2}\d+\${2})/g;

const getChunkIndex = (match: string) => Number(match.slice(2, -2));
@@ -156,7 +157,9 @@ const MarkdownContent = ({

const renderReference = useCallback(
(text: string) => {
let replacedText = reactStringReplace(text, reg, (match, i) => {
const nextText = replaceTextByOldReg(text);

let replacedText = reactStringReplace(nextText, reg, (match, i) => {
const chunkIndex = getChunkIndex(match);
return (
<Popover content={getPopoverContent(chunkIndex)} key={i}>
8 changes: 8 additions & 0 deletions web/src/pages/chat/utils.ts
@@ -41,3 +41,11 @@ export const buildMessageItemReference = (

return reference ?? { doc_aggs: [], chunks: [], total: 0 };
};

const oldReg = /(#{2}\d+\${2})/g;

export const replaceTextByOldReg = (text: string) => {
return text.replace(oldReg, function (substring) {
return `${substring.slice(0, -2)}@@`;
});
};
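
This commit changes the inline chunk-reference marker from `##N$$` to `##N@@`, and `replaceTextByOldReg` rewrites legacy markers on the fly so older stored messages still render. The same transformation in Python (mirroring the TypeScript above):

```python
import re

# Legacy marker: "##<chunk index>$$"; the new format ends in "@@".
OLD_REG = re.compile(r"#{2}\d+\${2}")

def replace_text_by_old_reg(text: str) -> str:
    # Swap the trailing "$$" for "@@" so legacy references match the new regex.
    return OLD_REG.sub(lambda m: m.group(0)[:-2] + "@@", text)

assert replace_text_by_old_reg("See ##3$$ and ##12$$.") == "See ##3@@ and ##12@@."
# The current-reference marker ~~N$$ (handled by `curReg`) is untouched:
assert replace_text_by_old_reg("~~7$$ stays") == "~~7$$ stays"
```

Migrating at render time rather than rewriting stored chat history keeps the change backward compatible without a data migration.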
