Skip to content

Commit

Permalink
Merge pull request #1184 from Devparihar5/ExcelParser
Browse files Browse the repository at this point in the history
new: added ExcelParser(tested) to read .xlsx files
  • Loading branch information
dartpain authored Oct 6, 2024
2 parents 4895d38 + 09a15e2 commit c9e95a9
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 3 deletions.
3 changes: 2 additions & 1 deletion application/parser/file/bulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.schema.base import Document

DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
".docx": DocxParser(),
".csv": PandasCSVParser(),
".xlsx":ExcelParser(),
".epub": EpubParser(),
".md": MarkdownParser(),
".rst": RstParser(),
Expand Down
65 changes: 65 additions & 0 deletions application/parser/file/tabular_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,68 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]
return (self._row_joiner).join(text_list)
else:
return text_list


class ExcelParser(BaseParser):
    r"""Excel (.xlsx) parser.

    Parses Excel files using Pandas `read_excel` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a list with one string per row is returned.
            True by default.
        col_joiner (str): Separator to use for joining cols per row.
            Set to ", " by default.
        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.
        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information.
            Defaults to None, which is treated as an empty dict; this means
            pandas will try to figure out the table structure on its own.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: Union[dict, None] = None,
        **kwargs: Any
    ) -> None:
        """Store the joining options and the keyword arguments for pandas."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        # A `{}` default would be one dict object shared by every instance
        # created without an explicit config; build a private copy instead.
        self._pandas_config = dict(pandas_config) if pandas_config else {}

    def _init_parser(self) -> Dict:
        """Init parser. Returns an empty config dict."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse an Excel file into text.

        Args:
            file (Path): Path of the workbook to read.
            errors (str): Not used by this parser; kept so the signature
                matches the other parsers' `parse_file`.

        Returns:
            A single string with all rows joined by `row_joiner` when
            `concat_rows` is True, otherwise a list with one string per row.

        Raises:
            ValueError: If pandas cannot be imported.
        """
        try:
            import pandas as pd
        except ImportError as err:
            # Same exception type and message as before; chaining keeps the
            # original import failure visible in the traceback.
            raise ValueError("pandas module is required to read Excel files.") from err

        loaded = pd.read_excel(file, **self._pandas_config)

        # `read_excel` returns a dict of DataFrames when `sheet_name` is None
        # or a list; handle that by processing every sheet in order.
        frames = list(loaded.values()) if isinstance(loaded, dict) else [loaded]

        text_list: List[str] = []
        for frame in frames:
            text_list.extend(
                frame.apply(
                    lambda row: self._col_joiner.join(row.astype(str).tolist()),
                    axis=1,
                ).tolist()
            )

        if self._concat_rows:
            return self._row_joiner.join(text_list)
        return text_list
1 change: 1 addition & 0 deletions application/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ openapi3-parser==1.1.18
orjson==3.10.7
packaging==24.1
pandas==2.2.3
openpyxl==3.1.5
pathable==0.4.3
pillow==10.4.0
portalocker==2.10.1
Expand Down
2 changes: 1 addition & 1 deletion application/vectorstore/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self, source_id: str, embeddings_key: str, docs_init=None):
else:
self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True)
except Exception:
raise # Just re-raise the exception without assigning to e
raise

self.assert_embedding_dimensions(embeddings)

Expand Down
2 changes: 1 addition & 1 deletion frontend/src/locale/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"remote": "Remote",
"name": "Name",
"choose": "Choose Files",
"info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb",
"info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb",
"uploadedFiles": "Uploaded Files",
"cancel": "Cancel",
"train": "Train",
Expand Down
1 change: 1 addition & 0 deletions frontend/src/upload/Upload.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ function Upload({
'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
['.docx'],
'text/csv': ['.csv'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
},
});

Expand Down

0 comments on commit c9e95a9

Please sign in to comment.