class ExcelParser(BaseParser):
    r"""Excel (.xlsx) parser.

    Parses Excel files using Pandas `read_excel` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.

        col_joiner (str): Separator to use for joining cols per row.
            Set to ", " by default.

        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.

        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information.
            Defaults to None, which is treated as an empty dict; this means
            pandas will try to figure out the table structure on its own.
            If `sheet_name` is given as None or a list, `read_excel` returns
            one table per sheet and the rows of every sheet are emitted in
            sheet order.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: Union[dict, None] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        # A `{}` default would be one dict shared by every instance (default
        # values are evaluated once, at function definition). Store a private
        # copy so neither other parsers nor the caller's dict can alias it.
        self._pandas_config = dict(pandas_config) if pandas_config else {}

    def _init_parser(self) -> Dict:
        """Init parser. No parser state is needed, so an empty dict is returned."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse an Excel file into text.

        Args:
            file (Path): Path of the workbook to read.
            errors (str): Unused here; kept so the signature matches the
                other parsers.

        Returns:
            Union[str, List[str]]: One string with all rows joined by
            `row_joiner` when `concat_rows` is True, otherwise a list with
            one string per row.

        Raises:
            ValueError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as err:
            # Chain the cause so the traceback shows which import failed.
            raise ValueError("pandas module is required to read Excel files.") from err

        loaded = pd.read_excel(file, **self._pandas_config)
        # `read_excel` yields a dict of DataFrames when `sheet_name` is None
        # or a list; normalise both shapes to an iterable of frames.
        frames = loaded.values() if isinstance(loaded, dict) else [loaded]

        text_list: List[str] = []
        for df in frames:
            text_list.extend(
                df.apply(
                    lambda row: self._col_joiner.join(row.astype(str).tolist()),
                    axis=1,
                ).tolist()
            )

        if self._concat_rows:
            return self._row_joiner.join(text_list)
        return text_list
c9b599bf7..f5b48d759 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -79,7 +79,7 @@ "remote": "Remote", "name": "Name", "choose": "Choose Files", - "info": "Please upload .pdf, .txt, .rst, .csv, .docx, .md, .zip limited to 25mb", + "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb", "uploadedFiles": "Uploaded Files", "cancel": "Cancel", "train": "Train", diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 50a6d357a..c09bab533 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -275,6 +275,7 @@ function Upload({ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], 'text/csv': ['.csv'], + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], }, });