feat: Added excel file support #256

Merged · 4 commits · Jul 13, 2024
17 changes: 14 additions & 3 deletions README.md
@@ -46,7 +46,7 @@ It's privacy-focused, storing everything on your device. No ☁️, only ☀️!

It hides your data from the LLMs used by default, but this can be disabled if the data is not deemed sensitive.

It can connect to a variety of data sources (Postgres, Snowflake, MySQL, SQLite, CSV, sas7bdat, and more), execute queries, generate charts, and allow for copying the results to build reports quickly.
It can connect to a variety of data sources (Postgres, Snowflake, MySQL, [Excel](#excel-support), SQLite, CSV, sas7bdat, and more), execute queries, generate charts, and allow for copying the results to build reports quickly.

## Where is it going?

@@ -58,11 +58,11 @@ But you can still influence the direction we go in. We're building this for you,

## Feature Support

- [x] Broad DB support: Postgres, MySQL, Snowflake, CSV, SQLite, and more
- [x] Broad DB support: Postgres, MySQL, Snowflake, [Excel](#excel-support), CSV, SQLite, and more
- [x] Generating and executing SQL from natural language
- [x] Ability to modify SQL results, save them, and re-run
- [x] Better support for explorative questions
- [x] Querying data files like CSV, SQLite, sas7bdat (more connection types)
- [x] Querying data files like CSV, [Excel](#excel-support), SQLite, sas7bdat (more connection types)
- [x] Charting via natural language
- [x] Modifying chart queries and re-rendering/refreshing charts

@@ -168,3 +168,14 @@ For example, running the docker image on a remote server with IP `123.123.12.34`
```bash
docker run -p 7377:7377 -v dataline:/home/.dataline --name dataline -e ALLOWED_ORIGINS="http://123.123.12.34:7377,https://123.123.12.34:7377" ramiawar/dataline:latest
```


### Excel Support

We support Excel files, but for now they need to conform to a certain structure. Multi-sheet workbooks are also supported - each sheet is ingested as a separate table.

Right now, we try to automatically detect the header row and the first data column using local, rule-based processing (no LLM sees your data, so things stay secure). This means detection can go wrong if your sheets have extra rows on top, logos, or other branding elements.

For the best results, make sure the first row of every sheet contains the column names, and remove any padding rows/columns from all sheets. If any sheet fails to parse, the whole import fails.

A future improvement is to optionally let an LLM figure out the header row, reducing the effort required from the user.
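
For example, a compliant workbook written with pandas (sheet and column names below are made up) keeps the header as the very first row of each sheet, with no padding. Each sheet then becomes its own table, with the sheet name lowercased and spaces replaced by underscores:

```python
import pandas as pd

# Two sheets -> two tables: "Monthly Sales" is ingested as `monthly_sales`, "Targets" as `targets`.
sales = pd.DataFrame({"month": ["Jan", "Feb"], "revenue": [1200, 1800]})
targets = pd.DataFrame({"month": ["Jan", "Feb"], "target": [1000, 2000]})

# Requires an Excel writer engine such as openpyxl.
with pd.ExcelWriter("report.xlsx") as writer:
    sales.to_excel(writer, sheet_name="Monthly Sales", index=False)  # header row first, no padding
    targets.to_excel(writer, sheet_name="Targets", index=False)
```
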
@@ -0,0 +1,36 @@
"""added connection type
Revision ID: 4e70c3318aaa
Revises: 1fcab2512ee2
Create Date: 2024-07-11 23:28:01.641837
"""

from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "4e70c3318aaa"
down_revision: Union[str, None] = "1fcab2512ee2"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# Add 'type' column as nullable
op.add_column("connections", sa.Column("type", sa.String(), nullable=True))

# Update 'type' column with values from 'dialect'
op.execute("UPDATE connections SET type = dialect")

# Use batch_alter_table for SQLite compatibility
with op.batch_alter_table("connections") as batch_op:
# Recreate the column as non-nullable
batch_op.alter_column("type", existing_type=sa.String(), nullable=False)


def downgrade() -> None:
# Drop the 'type' column
with op.batch_alter_table("connections") as batch_op:
batch_op.drop_column("type")
1 change: 1 addition & 0 deletions backend/dataline/models/connection/model.py
@@ -14,6 +14,7 @@ class ConnectionModel(DBModel, UUIDMixin, kw_only=True):
dsn: Mapped[str] = mapped_column("dsn", String, nullable=False, unique=True)
database: Mapped[str] = mapped_column("database", String, nullable=False)
name: Mapped[str | None] = mapped_column("name", String)
type: Mapped[str] = mapped_column("type", String, nullable=False)
dialect: Mapped[str | None] = mapped_column("dialect", String)
is_sample: Mapped[bool] = mapped_column("is_sample", Boolean, nullable=False, default=False, server_default="false")

2 changes: 2 additions & 0 deletions backend/dataline/models/connection/schema.py
@@ -16,6 +16,7 @@ class Connection(BaseModel):
dsn: str
database: str
dialect: str
type: str
is_sample: bool


@@ -143,6 +144,7 @@ class FileConnectionType(Enum):
sqlite = "sqlite"
csv = "csv"
sas7bdat = "sas7bdat"
excel = "excel"


class SampleName(Enum):
11 changes: 11 additions & 0 deletions backend/dataline/repositories/connection.py
@@ -1,3 +1,4 @@
from enum import Enum
from typing import Type

from pydantic import BaseModel, ConfigDict
@@ -6,6 +7,14 @@
from dataline.models.connection.model import ConnectionModel
from dataline.repositories.base import AsyncSession, BaseRepository

class ConnectionType(Enum):
csv = "csv"
sqlite = "sqlite"
excel = "excel"
postgres = "postgres"
mysql = "mysql"
snowflake = "snowflake"
sas = "sas"

class ConnectionCreate(BaseModel):
model_config = ConfigDict(from_attributes=True, extra="ignore")
@@ -14,6 +23,7 @@ class ConnectionCreate(BaseModel):
database: str
name: str
dialect: str
type: str
is_sample: bool = False


@@ -24,6 +34,7 @@ class ConnectionUpdate(BaseModel):
database: str | None = None
name: str | None = None
dialect: str | None = None
type: str | None = None
is_sample: bool | None = None


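File uploads (CSV, Excel, sas7bdat) are all materialized as local SQLite databases, so their `dialect` is identical; the new `type` field records what the user actually uploaded, while `create_connection` falls back to `type = dialect` for ordinary database DSNs. A minimal sketch (not part of this PR; the DSN, database, and name values are made up) of what the repository layer receives for an uploaded Excel workbook:

```python
from dataline.repositories.connection import ConnectionCreate, ConnectionType

excel_connection = ConnectionCreate(
    dsn="sqlite:///home/.dataline/a1b2c3d4.sqlite",  # hypothetical path generated by the service
    database="a1b2c3d4.sqlite",                      # assumption: derived from the DSN
    name="Quarterly report",
    dialect="sqlite",                                # the underlying engine is SQLite...
    type=ConnectionType.excel.value,                 # ...but `type` preserves the Excel origin
)
```
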
66 changes: 45 additions & 21 deletions backend/dataline/services/connection.py
@@ -1,6 +1,6 @@
import logging
import sqlite3
import os
import sqlite3
import tempfile
from pathlib import Path
from typing import BinaryIO
@@ -20,8 +20,10 @@
from dataline.repositories.connection import (
ConnectionCreate,
ConnectionRepository,
ConnectionType,
ConnectionUpdate,
)
from dataline.services.file_parsers.excel_parser import ExcelParserService
from dataline.utils.utils import (
forward_connection_errors,
generate_short_uuid,
@@ -37,24 +39,6 @@ class ConnectionService:
def __init__(self, connection_repo: ConnectionRepository = Depends(ConnectionRepository)) -> None:
self.connection_repo = connection_repo

async def create_connection(
self,
session: AsyncSession,
dsn: str,
name: str,
is_sample: bool = False,
) -> ConnectionOut:
# Check if connection can be established before saving it
dialect, database = await self.get_connection_details(dsn)

# Check if connection already exists
await self.check_dsn_already_exists(session, dsn)

connection = await self.connection_repo.create(
session, ConnectionCreate(dsn=dsn, database=database, name=name, dialect=dialect, is_sample=is_sample)
)
return ConnectionOut.model_validate(connection)

async def get_connection(self, session: AsyncSession, connection_id: UUID) -> ConnectionOut:
connection = await self.connection_repo.get_by_uuid(session, connection_id)
return ConnectionOut.model_validate(connection)
@@ -143,6 +127,28 @@ async def update_connection(
updated_connection = await self.connection_repo.update_by_uuid(session, connection_uuid, update)
return ConnectionOut.model_validate(updated_connection)

async def create_connection(
self,
session: AsyncSession,
dsn: str,
name: str,
type: str | None = None,
is_sample: bool = False,
) -> ConnectionOut:
# Check if connection can be established before saving it
dialect, database = await self.get_connection_details(dsn)
if not type:
type = dialect

# Check if connection already exists
await self.check_dsn_already_exists(session, dsn)

connection = await self.connection_repo.create(
session,
ConnectionCreate(dsn=dsn, database=database, name=name, dialect=dialect, type=type, is_sample=is_sample),
)
return ConnectionOut.model_validate(connection)

async def create_sqlite_connection(
self, session: AsyncSession, file: BinaryIO, name: str, is_sample: bool = False
) -> ConnectionOut:
@@ -172,7 +178,23 @@ async def create_csv_connection(self, session: AsyncSession, file: UploadFile, n

# Create connection with the locally copied file
dsn = get_sqlite_dsn(str(file_path.absolute()))
return await self.create_connection(session, dsn=dsn, name=name, is_sample=False)
return await self.create_connection(session, dsn=dsn, name=name, type=ConnectionType.csv.value, is_sample=False)

async def create_excel_connection(self, session: AsyncSession, file: UploadFile, name: str) -> ConnectionOut:
generated_name = generate_short_uuid() + ".sqlite"
file_path = Path(config.data_directory) / generated_name

# Connect to the SQLite database and input data (it will be created if it doesn't exist)
conn = sqlite3.connect(file_path)
ExcelParserService.to_sqlite_offline_secure(file.file, conn, name)
conn.commit()
conn.close()

# Create connection with the locally copied file
dsn = get_sqlite_dsn(str(file_path.absolute()))
return await self.create_connection(
session, dsn=dsn, name=name, type=ConnectionType.excel.value, is_sample=False
)

async def create_sas7bdat_connection(self, session: AsyncSession, file: UploadFile, name: str) -> ConnectionOut:
generated_name = generate_short_uuid() + ".sqlite"
@@ -213,7 +235,9 @@ async def create_sas7bdat_connection(self, session: AsyncSession, file: UploadFi

# Create connection with the locally copied file
dsn = get_sqlite_dsn(str(file_path.absolute()))
return await self.create_connection(session, dsn=dsn, name=name, is_sample=False)
return await self.create_connection(
session, dsn=dsn, name=name, type=ConnectionType.sas.value, is_sample=False
)
finally:
# Clean up the temporary file
os.unlink(temp_file_path)
3 changes: 3 additions & 0 deletions backend/dataline/services/file_parsers/__init__.py
@@ -0,0 +1,3 @@
from dataline.services.file_parsers.excel_parser import ExcelParserService

__all__ = ["ExcelParserService"]
71 changes: 71 additions & 0 deletions backend/dataline/services/file_parsers/excel_parser.py
@@ -0,0 +1,71 @@
import logging
from sqlite3 import Connection
from typing import BinaryIO

import pandas as pd

logger = logging.getLogger(__name__)


def find_non_nan(value: pd.Series) -> bool:
return value.notna().any()


def process_excel(file: BinaryIO) -> dict[str, pd.DataFrame]:
# Storing the processed df for each sheet
processed_sheets: dict[str, pd.DataFrame] = {}

# Get all sheet names
sheet_names = pd.ExcelFile(file).sheet_names
sheet_dfs = pd.read_excel(file, sheet_name=None, header=None)

for sheet_name in sheet_names:
# Reading the sheet (without headers)
sheet_key = str(sheet_name)
logger.debug(f"Processing sheet: {sheet_key}")
df = sheet_dfs[sheet_key]

# Finding the first row with non-NaN values to use as a column header
header_row_idx = df.apply(find_non_nan, axis=1).idxmax()
headers = df.loc[header_row_idx].dropna().values

# Finding the first column with non-NaN values to use as the first column
header_col_idx = int(df.apply(find_non_nan, axis=0).idxmax())

# HEADER CORRECTION
correct_headers = df.loc[header_row_idx]
df.columns = list(correct_headers)
# Remove the header row from the DataFrame if it's now part of the data
df.drop(df.index[int(header_row_idx)], inplace=True)

# REMOVE EMPTY COLUMNS
# Dropping any columns before the identified column header
if header_col_idx > 0:
df.drop(df.columns[:header_col_idx], axis=1, inplace=True)
# Resetting the column names to the identified column headers
df.columns = list(headers)

# CLEANUP
# Dropping any rows and columns that are completely NaN
df.dropna(how="all", axis=0, inplace=True)
df.dropna(how="all", axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

# Storing the processed DataFrame in the dictionary
processed_sheets[sheet_key] = df

return processed_sheets


class ExcelParserService:

@classmethod
def to_sqlite_offline_secure(cls, file: BinaryIO, conn: Connection, name: str) -> None:
"""
Attempt to parse an Excel file manually and store the data in a SQLite database.
"""
processed_sheets = process_excel(file)
for sheet in processed_sheets:
table_name = sheet.replace(" ", "_").lower()
df = processed_sheets[sheet]
df.to_sql(table_name, conn, if_exists="replace", index=False)
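
As a rough illustration of the detection logic above (not part of this PR), the sketch below builds an in-memory workbook whose real header sits one row down and one column in, then checks that `process_excel` still recovers it. It assumes pandas with an Excel engine such as openpyxl is installed:

```python
import io

import pandas as pd

from dataline.services.file_parsers.excel_parser import process_excel

# Row 1 and column A are empty padding: the header is on the second row and the data
# starts in the second column, mimicking a sheet with decorative spacing.
raw = pd.DataFrame(
    [
        [None, None, None],
        [None, "city", "population"],
        [None, "Beirut", 2_400_000],
        [None, "Lyon", 520_000],
    ]
)
buffer = io.BytesIO()
with pd.ExcelWriter(buffer) as writer:
    raw.to_excel(writer, sheet_name="Cities", header=False, index=False)
buffer.seek(0)

sheets = process_excel(buffer)
print(list(sheets["Cities"].columns))  # expected: ['city', 'population']
print(len(sheets["Cities"]))           # expected: 2 data rows, padding removed
```

`to_sqlite_offline_secure` then writes each cleaned DataFrame to its own table, with the sheet name lowercased and spaces replaced by underscores.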