diff --git a/extract_thinker/document_loader/document_loader_spreadsheet.py b/extract_thinker/document_loader/document_loader_spreadsheet.py index bfe6744..0c28e8a 100644 --- a/extract_thinker/document_loader/document_loader_spreadsheet.py +++ b/extract_thinker/document_loader/document_loader_spreadsheet.py @@ -1,6 +1,6 @@ from operator import attrgetter import openpyxl -from typing import Union +from typing import List, Union from io import BytesIO from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader from cachetools import cachedmethod @@ -14,19 +14,40 @@ def __init__(self, content=None, cache_ttl=300): @cachedmethod(cache=attrgetter('cache'), key=lambda self, file_path: hashkey(file_path)) def load_content_from_file(self, file_path: str) -> Union[str, object]: workbook = openpyxl.load_workbook(file_path) - sheet = workbook.active - data = [] - for row in sheet.iter_rows(values_only=True): - data.append(row) + data = {} + for sheet_name in workbook.sheetnames: + sheet = workbook[sheet_name] + sheet_data = [self._process_row(row) for row in sheet.iter_rows(values_only=True)] + data[sheet_name] = sheet_data self.content = data - return self.content + return {"data": self.content, "is_spreadsheet": True} @cachedmethod(cache=attrgetter('cache'), key=lambda self, stream: hashkey(id(stream))) def load_content_from_stream(self, stream: Union[BytesIO, str]) -> Union[str, object]: workbook = openpyxl.load_workbook(filename=BytesIO(stream.read())) - sheet = workbook.active - data = [] - for row in sheet.iter_rows(values_only=True): - data.append(row) + data = {} + for sheet_name in workbook.sheetnames: + sheet = workbook[sheet_name] + sheet_data = [self._process_row(row) for row in sheet.iter_rows(values_only=True)] + data[sheet_name] = sheet_data self.content = data - return self.content + return {"data": self.content, "is_spreadsheet": True} + + def _process_row(self, row): + if all(cell in (None, '', ' ') for cell in row): + return 
["\n"] + return [cell if cell not in (None, '', ' ') else "" for cell in row] + + def load_content_from_file_list(self, file_paths: List[str]) -> List[Union[str, object]]: + data_list = [] + for file_path in file_paths: + data = self.load_content_from_file(file_path) + data_list.append(data) + return data_list + + def load_content_from_stream_list(self, streams: List[BytesIO]) -> List[Union[str, object]]: + data_list = [] + for stream in streams: + data = self.load_content_from_stream(stream) + data_list.append(data) + return data_list diff --git a/extract_thinker/extractor.py b/extract_thinker/extractor.py index a7c40fc..7888d6d 100644 --- a/extract_thinker/extractor.py +++ b/extract_thinker/extractor.py @@ -15,7 +15,7 @@ from extract_thinker.document_loader.loader_interceptor import LoaderInterceptor from extract_thinker.document_loader.llm_interceptor import LlmInterceptor -from extract_thinker.utils import get_file_extension, encode_image +from extract_thinker.utils import get_file_extension, encode_image, json_to_formatted_string import yaml import litellm @@ -276,6 +276,8 @@ def _extract(self, if content is not None: if isinstance(content, dict): + if content["is_spreadsheet"]: + content = json_to_formatted_string(content["data"]) content = yaml.dump(content) messages.append({"role": "user", "content": "##Content\n\n" + content}) diff --git a/extract_thinker/utils.py b/extract_thinker/utils.py index 1df2561..d43011f 100644 --- a/extract_thinker/utils.py +++ b/extract_thinker/utils.py @@ -100,3 +100,12 @@ def get_file_extension(file_path): _, ext = os.path.splitext(file_path) ext = ext[1:] # remove the dot return ext + + +def json_to_formatted_string(data): + result = [] + for sheet, rows in data.items(): + result.append(f"##{sheet}") + for row in rows: + result.append(','.join(map(str, row))) + return '\n'.join(result) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 46542e2..1592d67 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "extract_thinker" -version = "0.0.8" +version = "0.0.9" description = "Library to extract data from files and documents agnositicaly using LLMs" authors = ["Júlio Almeida "] readme = "README.md"