Improved file handling and validation for document processing #976

Open. Wants to merge 3 commits into base: main.
28 changes: 24 additions & 4 deletions lightrag/api/routers/document_routes.py
@@ -215,7 +215,27 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
                 | ".scss"
                 | ".less"
             ):
-                content = file.decode("utf-8")
+                try:
+                    # Try to decode as UTF-8
+                    content = file.decode("utf-8")
+
+                    # Validate content
+                    if not content or len(content.strip()) == 0:
+                        logger.error(f"Empty content in file: {file_path.name}")
+                        return False
+
+                    # Check if content looks like binary data string representation
+                    if content.startswith("b'") or content.startswith('b"'):
+                        logger.error(
+                            f"File {file_path.name} appears to contain binary data representation instead of text"
+                        )
+                        return False
+
+                except UnicodeDecodeError:
+                    logger.error(
+                        f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing."
+                    )
+                    return False
             case ".pdf":
                 if not pm.is_installed("pypdf2"):
                     pm.install("pypdf2")
@@ -229,7 +249,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
             case ".docx":
                 if not pm.is_installed("docx"):
                     pm.install("docx")
-                from docx import Document
+                from docx import Document  # type: ignore
                 from io import BytesIO

                 docx_file = BytesIO(file)
@@ -238,7 +258,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
             case ".pptx":
                 if not pm.is_installed("pptx"):
                     pm.install("pptx")
-                from pptx import Presentation
+                from pptx import Presentation  # type: ignore
                 from io import BytesIO

                 pptx_file = BytesIO(file)
@@ -250,7 +270,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
             case ".xlsx":
                 if not pm.is_installed("openpyxl"):
                     pm.install("openpyxl")
-                from openpyxl import load_workbook
+                from openpyxl import load_workbook  # type: ignore
                 from io import BytesIO

                 xlsx_file = BytesIO(file)
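For reference, the validation added in the text-file branch can be read as a small standalone helper. The sketch below is illustrative only: the function name decode_text_file, its bytes/Path signature, and the module-level logger are assumptions for this example; the PR itself inlines the same checks directly in pipeline_enqueue_file.

import logging
from pathlib import Path

logger = logging.getLogger(__name__)


def decode_text_file(data: bytes, file_path: Path) -> str | None:
    """Decode raw bytes as UTF-8, rejecting empty or binary-looking content."""
    try:
        content = data.decode("utf-8")
    except UnicodeDecodeError:
        logger.error(f"File {file_path.name} is not valid UTF-8 encoded text.")
        return None

    # Reject files that decode to nothing but whitespace
    if not content.strip():
        logger.error(f"Empty content in file: {file_path.name}")
        return None

    # Reject text that is the string form of a bytes object, e.g. "b'\x89PNG'"
    if content.startswith("b'") or content.startswith('b"'):
        logger.error(f"File {file_path.name} contains a bytes representation, not text")
        return None

    return content

Under those assumptions, decode_text_file(b"hello", Path("a.txt")) returns "hello", while an undecodable byte sequence such as b"\xff\xfe" returns None and logs an error, mirroring the False return in the route handler.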
18 changes: 17 additions & 1 deletion lightrag/lightrag.py
@@ -670,8 +670,24 @@ async def apipeline_enqueue_documents(
         all_new_doc_ids = set(new_docs.keys())
         # Exclude IDs of documents that are already in progress
         unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
+
+        # Log ignored document IDs
+        ignored_ids = [
+            doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs
+        ]
+        if ignored_ids:
+            logger.warning(
+                f"Ignoring {len(ignored_ids)} document IDs not found in new_docs"
+            )
+            for doc_id in ignored_ids:
+                logger.warning(f"Ignored document ID: {doc_id}")
+
         # Filter new_docs to only include documents with unique IDs
-        new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids}
+        new_docs = {
+            doc_id: new_docs[doc_id]
+            for doc_id in unique_new_doc_ids
+            if doc_id in new_docs
+        }

         if not new_docs:
             logger.info("No new unique documents were found.")
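The lightrag.py change can likewise be read as a small pure function: keep only the documents whose IDs survived the doc-status filter, and log any filtered ID that has no matching entry. A rough sketch under that reading; the name filter_enqueueable_docs and the plain dict/set signature are assumptions for illustration, since the real code operates on self.doc_status inside apipeline_enqueue_documents.

import logging

logger = logging.getLogger(__name__)


def filter_enqueueable_docs(
    new_docs: dict[str, dict], unique_new_doc_ids: set[str]
) -> dict[str, dict]:
    """Return only docs whose IDs passed the status filter, logging the rest."""
    ignored_ids = [doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs]
    if ignored_ids:
        logger.warning(f"Ignoring {len(ignored_ids)} document IDs not found in new_docs")
        for doc_id in ignored_ids:
            logger.warning(f"Ignored document ID: {doc_id}")

    # The `if doc_id in new_docs` guard keeps the rebuild from raising a KeyError
    # when the status filter returns an ID that is not a key of new_docs
    return {
        doc_id: new_docs[doc_id]
        for doc_id in unique_new_doc_ids
        if doc_id in new_docs
    }

The guard is the defensive part of the change: with the old one-line comprehension, an ID returned by filter_keys but absent from new_docs would raise a KeyError, whereas the new version skips it and logs a warning instead.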
203 changes: 0 additions & 203 deletions run_with_gunicorn.py

This file was deleted.