Skip to content

Commit

Permalink
Merge pull request #24 from enoch3712/23-add-documentloader-for-sprea…
Browse files Browse the repository at this point in the history
…dsheets-xlsx

documentLoader added
  • Loading branch information
enoch3712 authored Jun 19, 2024
2 parents 842a9dc + 825e875 commit 2d2be1f
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 13 deletions.
43 changes: 32 additions & 11 deletions extract_thinker/document_loader/document_loader_spreadsheet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from operator import attrgetter
import openpyxl
from typing import Union
from typing import List, Union
from io import BytesIO
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from cachetools import cachedmethod
Expand All @@ -14,19 +14,40 @@ def __init__(self, content=None, cache_ttl=300):
# Loads the workbook at `file_path` with openpyxl and stores the result on `self.content`.
# The `cachedmethod` decorator keys the cache on `file_path`.
# NOTE(review): this span appears to be a rendered diff hunk whose +/- markers and
# indentation were lost, so pre-change and post-change lines are interleaved below.
@cachedmethod(cache=attrgetter('cache'), key=lambda self, file_path: hashkey(file_path))
def load_content_from_file(self, file_path: str) -> Union[str, object]:
workbook = openpyxl.load_workbook(file_path)
# Presumably the removed version: read only the active sheet into a flat list of row tuples.
sheet = workbook.active
data = []
for row in sheet.iter_rows(values_only=True):
data.append(row)
# Presumably the added version: map every sheet name to its rows, each passed through `_process_row`.
data = {}
for sheet_name in workbook.sheetnames:
sheet = workbook[sheet_name]
sheet_data = [self._process_row(row) for row in sheet.iter_rows(values_only=True)]
data[sheet_name] = sheet_data
self.content = data
# Presumably removed: returned the stored content directly.
return self.content
# Presumably added: wrap the content in a dict tagged with `is_spreadsheet`.
return {"data": self.content, "is_spreadsheet": True}

# Loads a workbook from an in-memory stream with openpyxl and stores the result on `self.content`.
# NOTE(review): this span appears to be a rendered diff hunk whose +/- markers and
# indentation were lost, so pre-change and post-change lines are interleaved below.
# NOTE(review): the cache key is `id(stream)`; Python may reuse an id once the original
# object is garbage-collected, so a later stream could hit a stale entry — confirm.
@cachedmethod(cache=attrgetter('cache'), key=lambda self, stream: hashkey(id(stream)))
def load_content_from_stream(self, stream: Union[BytesIO, str]) -> Union[str, object]:
# The whole stream is read and re-wrapped in a fresh BytesIO before parsing.
# NOTE(review): the annotation allows `str`, but `.read()` is called unconditionally — confirm.
workbook = openpyxl.load_workbook(filename=BytesIO(stream.read()))
# Presumably the removed version: read only the active sheet into a flat list of row tuples.
sheet = workbook.active
data = []
for row in sheet.iter_rows(values_only=True):
data.append(row)
# Presumably the added version: map every sheet name to its rows, each passed through `_process_row`.
data = {}
for sheet_name in workbook.sheetnames:
sheet = workbook[sheet_name]
sheet_data = [self._process_row(row) for row in sheet.iter_rows(values_only=True)]
data[sheet_name] = sheet_data
self.content = data
# Presumably removed: returned the stored content directly.
return self.content
# Presumably added: wrap the content in a dict tagged with `is_spreadsheet`.
return {"data": self.content, "is_spreadsheet": True}

def _process_row(self, row):
if all(cell in (None, '', ' ') for cell in row):
return ["\n"]
return [cell if cell not in (None, '', ' ') else "" for cell in row]

def load_content_from_file_list(self, file_paths: List[str]) -> List[Union[str, object]]:
    """Load several files, one result per path.

    Args:
        file_paths: Paths handed, in order, to ``load_content_from_file``.

    Returns:
        The individual load results in the same order as ``file_paths``.
    """
    return [self.load_content_from_file(path) for path in file_paths]

def load_content_from_stream_list(self, streams: List[BytesIO]) -> List[Union[str, object]]:
    """Load several streams, one result per stream.

    Args:
        streams: Streams handed, in order, to ``load_content_from_stream``.

    Returns:
        The individual load results in the same order as ``streams``.
    """
    return [self.load_content_from_stream(item) for item in streams]
4 changes: 3 additions & 1 deletion extract_thinker/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from extract_thinker.document_loader.loader_interceptor import LoaderInterceptor
from extract_thinker.document_loader.llm_interceptor import LlmInterceptor

from extract_thinker.utils import get_file_extension, encode_image
from extract_thinker.utils import get_file_extension, encode_image, json_to_formatted_string
import yaml
import litellm

Expand Down Expand Up @@ -276,6 +276,8 @@ def _extract(self,

if content is not None:
if isinstance(content, dict):
if content["is_spreadsheet"]:
content = json_to_formatted_string(content["data"])
content = yaml.dump(content)
messages.append({"role": "user", "content": "##Content\n\n" + content})

Expand Down
9 changes: 9 additions & 0 deletions extract_thinker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,12 @@ def get_file_extension(file_path):
_, ext = os.path.splitext(file_path)
ext = ext[1:] # remove the dot
return ext


def json_to_formatted_string(data):
    """Render a mapping of sheet name to rows as plain text.

    Each sheet contributes a ``##<sheet name>`` header line followed by one
    line per row, with the row's cells converted via ``str`` and joined by
    commas. All lines are joined with newlines.

    Args:
        data: Mapping from sheet name to an iterable of rows (iterables of cells).

    Returns:
        The formatted text; an empty string when ``data`` is empty.
    """
    lines = []
    for title, rows in data.items():
        lines.append(f"##{title}")
        lines.extend(','.join(str(cell) for cell in row) for row in rows)
    return '\n'.join(lines)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "extract_thinker"
version = "0.0.8"
version = "0.0.9"
description = "Library to extract data from files and documents agnostically using LLMs"
authors = ["Júlio Almeida <enoch3712@gmail.com>"]
readme = "README.md"
Expand Down

0 comments on commit 2d2be1f

Please sign in to comment.