Skip to content

Commit

Permalink
Merge pull request #554 from lion-agi/fix_file_utils
Browse files Browse the repository at this point in the history
Reader tool: added `list_dir`
  • Loading branch information
ohdearquant authored Jan 29, 2025
2 parents 1d6303a + b62047c commit 837d3c7
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 31 deletions.
8 changes: 7 additions & 1 deletion lionagi/libs/file/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def dir_to_files(
max_workers: int | None = None,
ignore_errors: bool = False,
verbose: bool = False,
recursive: bool = False,
) -> list[Path]:
"""
Process a directory (optionally recursing into subdirectories) and return a list of file paths.
Expand All @@ -33,6 +34,8 @@ def dir_to_files(
If None, uses the default ThreadPoolExecutor behavior.
ignore_errors (bool): If True, log warnings for errors instead of raising exceptions.
verbose (bool): If True, print verbose output.
recursive (bool): If True, process directories recursively.
If False (the default), only process files in the top-level directory.
Returns:
List[Path]: A list of Path objects representing the files found.
Expand All @@ -58,11 +61,14 @@ def process_file(file_path: Path) -> Path | None:
raise ValueError(f"Error processing {file_path}: {e}") from e
return None

file_iterator = (
directory_path.rglob("*") if recursive else directory_path.glob("*")
)
try:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(process_file, f)
for f in directory_path.rglob("*")
for f in file_iterator
if f.is_file()
]
files = [
Expand Down
1 change: 1 addition & 0 deletions lionagi/libs/file/save.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def save_to_file(
)
with file_path.open("w", encoding="utf-8") as file:
file.write(text)
file.close()
if verbose:
logging.info(f"Text saved to: {file_path}")
return file_path
Expand Down
94 changes: 72 additions & 22 deletions lionagi/tools/file/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pydantic import BaseModel, Field, model_validator

from lionagi.operatives.action.tool import Tool
from lionagi.service.endpoints.token_calculator import TokenCalculator
from lionagi.utils import to_num

from ..base import LionTool
Expand All @@ -18,10 +19,12 @@ class ReaderAction(str, Enum):
This enumeration indicates the *type* of action the LLM wants to perform.
- 'open': Convert a file/URL to text and store it internally for partial reads
- 'read': Return a partial slice of the already-opened doc
- 'list_dir': List all files in a directory and store it internally for partial reads
"""

open = "open"
read = "read"
list_dir = "list_dir"


class ReaderRequest(BaseModel):
Expand All @@ -39,6 +42,7 @@ class ReaderRequest(BaseModel):
"Action to perform. Must be one of: "
"- 'open': Convert a file/URL to text and store it internally for partial reads. "
"- 'read': Return a partial slice of the already-opened doc."
"- 'list_dir': List all files in a directory."
),
)

Expand All @@ -54,7 +58,8 @@ class ReaderRequest(BaseModel):
None,
description=(
"Unique ID referencing a previously opened document. "
"This field is REQUIRED if action='read'. If action='open', leave it None."
"This field is REQUIRED if action='read'. Else leave it None."
"this field starts with 'DOC_' for document and 'DIR_' for directory listing."
),
)

Expand All @@ -74,6 +79,22 @@ class ReaderRequest(BaseModel):
),
)

recursive: bool = Field(
False,
description=(
"Whether to recursively list files in subdirectories. Defaults to False."
"Only used if action='list_dir'."
),
)

file_types: list[str] | None = Field(
None,
description=(
"List files with specific extensions. "
"If omitted or None, list all files. Only used if action='list_dir'."
),
)

@model_validator(mode="before")
def _validate_request(cls, values):
for k, v in values.items():
Expand All @@ -96,6 +117,7 @@ class DocumentInfo(BaseModel):

doc_id: str
length: int | None = None
num_tokens: int | None = None


class PartialChunk(BaseModel):
Expand Down Expand Up @@ -150,17 +172,17 @@ class ReaderTool(LionTool):
is_lion_system_tool = True
system_tool_name = "reader_tool"

from lionagi.libs.package.imports import check_import
def __init__(self):
from lionagi.libs.package.imports import check_import

DocumentConverter = check_import(
"docling",
module_name="document_converter",
import_name="DocumentConverter",
)
DocumentConverter = check_import(
"docling",
module_name="document_converter",
import_name="DocumentConverter",
)

def __init__(self):
super().__init__()
self.converter = ReaderTool.DocumentConverter()
self.converter = DocumentConverter()
self.documents = {} # doc_id -> (temp_file_path, doc_length)
self._tool = None

Expand All @@ -174,23 +196,18 @@ def handle_request(self, request: ReaderRequest) -> ReaderResponse:
request = ReaderRequest(**request)
if request.action == "open":
return self._open_doc(request.path_or_url)
elif request.action == "read":
if request.action == "read":
return self._read_doc(
request.doc_id, request.start_offset, request.end_offset
)
if request.action == "list_dir":
return self._list_dir(
request.path_or_url, request.recursive, request.file_types
)
else:
return ReaderResponse(success=False, error="Unknown action type")

def _open_doc(self, source: str) -> ReaderResponse:
try:
result = self.converter.convert(source)
text = result.document.export_to_markdown()
except Exception as e:
return ReaderResponse(
success=False, error=f"Conversion error: {str(e)}"
)

doc_id = f"DOC_{abs(hash(source))}"
def _save_to_temp(self, text, doc_id):
temp_file = tempfile.NamedTemporaryFile(
delete=False, mode="w", encoding="utf-8"
)
Expand All @@ -202,9 +219,26 @@ def _open_doc(self, source: str) -> ReaderResponse:
self.documents[doc_id] = (temp_file.name, doc_len)

return ReaderResponse(
success=True, doc_info=DocumentInfo(doc_id=doc_id, length=doc_len)
success=True,
doc_info=DocumentInfo(
doc_id=doc_id,
length=doc_len,
num_tokens=TokenCalculator.tokenize(text),
),
)

def _open_doc(self, source: str) -> ReaderResponse:
    """
    Convert a file or URL to markdown and register it as a document.

    Args:
        source (str): Path or URL handed to `self.converter.convert`.

    Returns:
        ReaderResponse: Whatever `_save_to_temp` returns for the converted
            text, or a failed response whose error starts with
            "Conversion error:" if the conversion raised.
    """
    try:
        markdown = self.converter.convert(
            source
        ).document.export_to_markdown()
    except Exception as exc:
        message = f"Conversion error: {str(exc)}"
        return ReaderResponse(success=False, error=message)
    else:
        # The ID is derived from the hash of the source string, so opening
        # the same source again within one interpreter run reuses the ID.
        return self._save_to_temp(markdown, f"DOC_{abs(hash(source))}")

def _read_doc(self, doc_id: str, start: int, end: int) -> ReaderResponse:
if doc_id not in self.documents:
return ReaderResponse(
Expand All @@ -230,14 +264,30 @@ def _read_doc(self, doc_id: str, start: int, end: int) -> ReaderResponse:
chunk=PartialChunk(start_offset=s, end_offset=e, content=content),
)

def _list_dir(
    self,
    directory: str,
    recursive: bool = False,
    file_types: list[str] | None = None,
) -> ReaderResponse:
    """
    List the files in a directory and store the listing as a document.

    The listing (one path per line) is passed to `_save_to_temp` under a
    'DIR_'-prefixed ID.

    Args:
        directory (str): Directory to list.
        recursive (bool): Forwarded to `dir_to_files`; if True,
            subdirectories are included. Defaults to False.
        file_types (list[str] | None): Forwarded to `dir_to_files` to
            restrict the listing to specific extensions. None lists all.

    Returns:
        ReaderResponse: Whatever `_save_to_temp` returns for the listing,
            or a failed response whose error starts with
            "Directory listing error:" if the directory could not be listed.
    """
    from lionagi.libs.file.process import dir_to_files

    # Report listing failures the same way `_open_doc` reports conversion
    # failures, so the caller always receives a ReaderResponse.
    try:
        files = dir_to_files(
            directory, recursive=recursive, file_types=file_types
        )
    except Exception as e:
        return ReaderResponse(
            success=False, error=f"Directory listing error: {str(e)}"
        )

    listing = "\n".join(str(f) for f in files)
    doc_id = f"DIR_{abs(hash(directory))}"
    return self._save_to_temp(listing, doc_id)

def to_tool(self):
if self._tool is None:

def reader_tool(**kwargs):
"""
A function that takes ReaderRequest to either:
A function that takes ReaderRequest to do one of:
- open a doc (File/URL) -> returns doc_id, doc length
- read partial text from doc -> returns chunk
- list all files in a directory -> returns list of files as doc format
"""
return self.handle_request(
ReaderRequest(**kwargs)
Expand Down
3 changes: 3 additions & 0 deletions lionagi/tools/query/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Copyright (c) 2023 - 2025, HaiyangLi <quantocean.li at gmail dot com>
#
# SPDX-License-Identifier: Apache-2.0
11 changes: 6 additions & 5 deletions lionagi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,10 +1022,14 @@ def create_path(
The full Path to the new or existing file.
Raises:
ValueError: If no extension or filename invalid.
ValueError: If filename is invalid.
FileExistsError: If file exists and file_exist_ok=False.
"""
if "/" in filename or "\\" in filename:
if "/" in filename:
sub_dir, filename = filename.split("/")[:-1], filename.split("/")[-1]
directory = Path(directory) / "/".join(sub_dir)

if "\\" in filename:
raise ValueError("Filename cannot contain directory separators.")

directory = Path(directory)
Expand All @@ -1036,9 +1040,6 @@ def create_path(
else:
name, ext = filename, extension

if not ext:
raise ValueError("No extension provided for filename.")

# Ensure extension has a single leading dot
ext = f".{ext.lstrip('.')}" if ext else ""

Expand Down
2 changes: 1 addition & 1 deletion lionagi/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.9.3"
__version__ = "0.9.4"
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "lionagi"
version = "0.9.3"
version = "0.9.4"
description = "An Intelligence Operating System."
authors = [
{ name = "HaiyangLi", email = "quantocean.li@gmail.com" },
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 837d3c7

Please sign in to comment.