From 4ba5cf3905e225f1a59a755cfcf0341959062a5d Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Sun, 31 Jul 2022 12:47:14 -0400 Subject: [PATCH 01/18] docs: clean up --- .pre-commit-config.yaml | 18 ----- CONTRIBUTING.md | 116 ---------------------------- README.md | 2 +- api-docs/conf.py | 11 ++- api-docs/source/contribute/index.md | 4 +- 5 files changed, 8 insertions(+), 143 deletions(-) delete mode 100644 .pre-commit-config.yaml delete mode 100644 CONTRIBUTING.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index f93de8729..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,18 +0,0 @@ -repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.1.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-added-large-files -- repo: https://github.com/pre-commit/mirrors-autopep8 - rev: v1.5.3 - hooks: - - id: autopep8 - args: ['-i', '--select=E,F', '--max-line-length=88', '--exclude=eva/filters, eva/parser/evaql'] -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 - hooks: - - id: flake8 - additional_dependencies: [flake8-typing-imports==1.6.0] - args: ['--select=E,F', '--max-line-length=88', '--exclude=eva/filters, eva/parser/evaql'] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index dba205ee3..000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,116 +0,0 @@ -# Contributing to EVA - -## Setting up Development Environment - -### Installation - -Installation of EVA involves setting a virtual environment using [miniconda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) and configuring git hooks. - -1. Clone the repository -```shell -git clone https://github.com/georgia-tech-db/eva.git -``` - -2. Install the dependencies. -```shell -sh script/install/before_install.sh -export PATH="$HOME/miniconda/bin:$PATH" -sh script/install/install.sh -``` - - - -### Client Testing - -1. Set up the server and client - -- Activate the conda environment: `conda activate eva` - -- Launch EVA database Server: `python eva/eva_server.py` - -- Launch CLI: `python eva/eva_cmd_client.py` - -2. Run the `UPLOAD` command in the client terminal: -```mysql -UPLOAD INFILE 'data/ua_detrac/ua_detrac.mp4' PATH 'test_video.mp4'; -``` - -3. Run the `LOAD` command in the client terminal: (may take a while) -```mysql -LOAD DATA INFILE 'test_video.mp4' INTO MyVideo; -``` - -4. Below is a basic query that should work on the client -```mysql -SELECT id, data FROM MyVideo WHERE id < 5; -``` - -### Configure GPU (Recommended) - -1. If your workstation has a GPU, you need to first set it up and configure it. You can run the following command first to check your hardware capabilities. - - ``` - ubuntu-drivers devices - ``` - - If you do have an NVIDIA GPU, and its not been configured yet, follow all the steps in this link carefully. `https://towardsdatascience.com/deep-learning-gpu-installation-on-ubuntu-18-4-9b12230a1d31`. - - Some pointers: - - When installing NVIDIA drivers, check the correct driver version for your GPU to avoid compatibiility issues. - - When installing cuDNN, you will have to create an account. Make sure you get the correct deb files for your OS and architecture. - -2. You can run the following code in a jupyter instance to verify your GPU is working well along with PyTorch. - - ``` - import torch - device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - print(device) - ``` - - Output of `cuda:0` indicates the presence of a GPU. 
(Note: 0 indicates the index of the GPU in system. Incase you have multiple GPUs, the index needs to be accordingly changed) - -2. Now configure the `executor` section in `eva/eva.yml` as follows: - - ``` - gpus: {'127.0.1.1': [0]} - ``` - - `127.0.1.1` is the loopback address on which the eva server is started. 0 refers to the GPU index to be used. - -## Commiting and Testing - -1. Install git hooks in your .git/ directory. [optional, but recommended] -```shell -conda activate eva -pre-commit install -``` - -2. Ensure that all the unit test cases (including the ones you have added) run succesfully and the coding style conventions are followed. -```shell -bash script/test/test.sh -``` - -## Packaging New Version of EVA - -1. Generate EVA grammar files. -```shell -bash script/antlr4/generate_parser.sh -``` - -2. Bump up version number in `setup.cfg` along with any additional dependencies. - -3. Create a new build locally. -```shell -python -m build -``` - -4. Upload build to pypi using credentials. -```shell -python -m twine upload dist/* -``` - - -## Issues and PR's - -To file a bug or request a feature, please file a GitHub issue. Pull requests are welcome. diff --git a/README.md b/README.md index c9cca1366..dbe625d05 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ python3 -m venv env38 # to create a virtual envir pip install --upgrade pip sudo -E apt install -y openjdk-8-jdk openjdk-8-jre # to install JAVA sh script/antlr4/generate_parser.sh # to generate the EVA parser -python -m pip install install -e .[dev] +python -m pip install -e . ```
diff --git a/api-docs/conf.py b/api-docs/conf.py index e1b6798b6..88a764afe 100644 --- a/api-docs/conf.py +++ b/api-docs/conf.py @@ -14,7 +14,6 @@ import warnings import os import sys - sys.path.append(os.path.abspath('.')) sys.path.append(os.path.abspath('../')) @@ -26,13 +25,13 @@ # -- Project information ----------------------------------------------------- - -project = 'EVA' -copyright = str(datetime.now().year) + ', Georgia Tech Database Group' -author = 'Georgia Tech Database Group' +project = "evadb" +author = "Georgia Tech Database Group" +copyright = str(datetime.now().year) + f', {author}' # The full version, including alpha/beta/rc tags -release = '0.0.1' +from eva.version import __version__ as version +release = version master_doc = 'index' diff --git a/api-docs/source/contribute/index.md b/api-docs/source/contribute/index.md index a65fe3109..eb1d65e71 100644 --- a/api-docs/source/contribute/index.md +++ b/api-docs/source/contribute/index.md @@ -16,7 +16,7 @@ python3 -m venv env38 # to create a virtual envir pip install --upgrade pip sudo -E apt install -y openjdk-8-jdk openjdk-8-jre # to install JAVA sh script/antlr4/generate_parser.sh # to generate the EVA parser -python -m pip install install -e .[dev] +python -m pip install -e . ``` #### Submitting a contribution @@ -69,4 +69,4 @@ On your local machine, run the following script to auto-format using `black` ``` python script/formatting/formatter.py -``` \ No newline at end of file +``` From cd27791ad4ef03ee260a433d4dc5be0f2de424e9 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Sun, 31 Jul 2022 15:32:46 -0400 Subject: [PATCH 02/18] docs: development guide clean up and version add to docs --- README.md | 2 +- api-docs/conf.py | 25 ++++++++++++++++++------- api-docs/requirements.txt | 4 ++-- api-docs/source/contribute/index.md | 2 +- setup.py | 2 +- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index dbe625d05..8e04cc3c8 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ python3 -m venv env38 # to create a virtual envir pip install --upgrade pip sudo -E apt install -y openjdk-8-jdk openjdk-8-jre # to install JAVA sh script/antlr4/generate_parser.sh # to generate the EVA parser -python -m pip install -e . +pip install -e ".[dev]" ```
diff --git a/api-docs/conf.py b/api-docs/conf.py index 88a764afe..b2fc1a8bb 100644 --- a/api-docs/conf.py +++ b/api-docs/conf.py @@ -1,19 +1,29 @@ -# Configuration file for the Sphinx documentation builder. +# coding=utf-8 +# Copyright 2018-2022 EVA # -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # -- Path setup -------------------------------------------------------------- +import os +import sys +import warnings # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # from datetime import datetime -import warnings -import os -import sys + sys.path.append(os.path.abspath('.')) sys.path.append(os.path.abspath('../')) @@ -31,6 +41,7 @@ # The full version, including alpha/beta/rc tags from eva.version import __version__ as version + release = version master_doc = 'index' diff --git a/api-docs/requirements.txt b/api-docs/requirements.txt index e7044d183..f2bd0e30e 100644 --- a/api-docs/requirements.txt +++ b/api-docs/requirements.txt @@ -11,9 +11,9 @@ commonmark==0.8.1 recommonmark==0.5.0 sphinx==4.3.2 readthedocs-sphinx-ext<1.1 -sphinx-book-theme==0.1.7 +sphinx-book-theme==0.3.3 sphinx-sitemap==2.2.0 -sphinx-external-toc==0.2.3 +sphinx-external-toc==0.3.0 sphinx-thebe==0.1.1 sphinx-jsonschema==1.17.2 diff --git a/api-docs/source/contribute/index.md b/api-docs/source/contribute/index.md index eb1d65e71..40f104758 100644 --- a/api-docs/source/contribute/index.md +++ b/api-docs/source/contribute/index.md @@ -16,7 +16,7 @@ python3 -m venv env38 # to create a virtual envir pip install --upgrade pip sudo -E apt install -y openjdk-8-jdk openjdk-8-jre # to install JAVA sh script/antlr4/generate_parser.sh # to generate the EVA parser -python -m pip install -e . +pip install -e ".[dev]" ``` #### Submitting a contribution diff --git a/setup.py b/setup.py index 6faac69b5..9425dc2ad 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ def version(path): "pandas==1.2.3", "torch==1.7.1", "torchvision==0.8.2", - "Pillow==8.1.2", + "Pillow==9.0.1", "sqlalchemy==1.3.20", "sqlalchemy-utils==0.36.6", "pyspark==3.0.2", From f74b81a963f120aeefbacc4ca150e353d204e0ff Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Sun, 31 Jul 2022 15:35:11 -0400 Subject: [PATCH 03/18] style: fix --- api-docs/conf.py | 93 +++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 37 deletions(-) diff --git a/api-docs/conf.py b/api-docs/conf.py index b2fc1a8bb..b146aedab 100644 --- a/api-docs/conf.py +++ b/api-docs/conf.py @@ -18,33 +18,35 @@ import os import sys import warnings + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. 
If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # from datetime import datetime -sys.path.append(os.path.abspath('.')) -sys.path.append(os.path.abspath('../')) +from eva.version import __version__ as version + +sys.path.append(os.path.abspath(".")) +sys.path.append(os.path.abspath("../")) # Temp. workaround for # https://github.com/agronholm/sphinx-autodoc-typehints/issues/133 warnings.filterwarnings( - 'ignore', message='sphinx.util.inspect.Signature\(\) is deprecated') + "ignore", message="sphinx.util.inspect.Signature() is deprecated" +) # -- Project information ----------------------------------------------------- project = "evadb" author = "Georgia Tech Database Group" -copyright = str(datetime.now().year) + f', {author}' +copyright = str(datetime.now().year) + f", {author}" # The full version, including alpha/beta/rc tags -from eva.version import __version__ as version +release = version -release = version - -master_doc = 'index' +master_doc = "index" # -- General configuration --------------------------------------------------- @@ -54,17 +56,17 @@ # ones. extensions = [ "sphinx_external_toc", - 'sphinx.ext.autosummary', - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.mathjax', - 'sphinx.ext.viewcode', - 'sphinx.ext.napoleon', - 'sphinx.ext.graphviz', + "sphinx.ext.autosummary", + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.graphviz", "myst_nb", - "sphinx-jsonschema" + "sphinx-jsonschema", ] myst_enable_extensions = [ @@ -92,27 +94,37 @@ # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # highlight_language = 'python' # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True -autodoc_mock_imports = ["numpy", "sqlalchemy", "sqlalchemy_utils", - "sqlalchemy.orm", "sqlalchemy.orm.exc", - "sqlalchemy.types", - "petastorm", "yaml", "pyspark", "torch", - "pandas", "cv2", "eva.catalog"] +autodoc_mock_imports = [ + "numpy", + "sqlalchemy", + "sqlalchemy_utils", + "sqlalchemy.orm", + "sqlalchemy.orm.exc", + "sqlalchemy.types", + "petastorm", + "yaml", + "pyspark", + "torch", + "pandas", + "cv2", + "eva.catalog", +] # -- Options for HTML output ------------------------------------------------- @@ -130,7 +142,7 @@ "use_issues_button": True, "use_edit_page_button": True, "path_to_docs": "api-docs/", - "home_page_in_toc": False + "home_page_in_toc": False, } # Add any paths that contain custom themes here, relative to this directory. @@ -140,22 +152,29 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] - -autodoc_default_flags = ['members', 'private-members', 'special-members', - # 'undoc-members', - 'show-inheritance'] +html_static_path = ["_static"] + +autodoc_default_flags = [ + "members", + "private-members", + "special-members", + # 'undoc-members', + "show-inheritance", +] def autodoc_skip_member(app, what, name, obj, skip, options): # Ref: https://stackoverflow.com/a/21449475/ - exclusions = ('__weakref__', # special-members - '__doc__', '__module__', '__dict__', # undoc-members - ) + exclusions = ( + "__weakref__", # special-members + "__doc__", + "__module__", + "__dict__", # undoc-members + ) exclude = name in exclusions return True if exclude else None def setup(app): - app.connect('autodoc-skip-member', autodoc_skip_member) - app.add_css_file('custom.css') + app.connect("autodoc-skip-member", autodoc_skip_member) + app.add_css_file("custom.css") From d88fae5fe9c69cb5d3d15a21ed70613aafc9eca1 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Sun, 31 Jul 2022 16:02:19 -0400 Subject: [PATCH 04/18] style: only style the defualt dir --- script/formatting/formatter.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/script/formatting/formatter.py b/script/formatting/formatter.py index 08f274554..0236afcd6 100755 --- a/script/formatting/formatter.py +++ b/script/formatting/formatter.py @@ -19,6 +19,7 @@ import re import subprocess import sys +from pathlib import Path import pkg_resources @@ -258,7 +259,6 @@ def format_dir(dir_path, add_header, strip_header, format_code): is_tool(BLACK_BINARY) is_tool(FLAKE_BINARY) is_tool(ISORT_BINARY) - if args.file_name: LOG.info("Scanning file: " + "".join(args.file_name)) format_file( @@ -272,7 +272,7 @@ def format_dir(dir_path, add_header, strip_header, format_code): format_dir(args.dir_name, args.add_header, args.strip_header, args.format_code) # BY DEFAULT, WE FIX THE MODIFIED FILES else: - LOG.info("Default fix modified files") + # LOG.info("Default fix modified files") MERGEBASE = subprocess.check_output( "git merge-base origin/master HEAD", shell=True, text=True ).rstrip() @@ -285,14 +285,21 @@ def format_dir(dir_path, add_header, strip_header, format_code): .rstrip() .split("\n") ) - for file in files: - if file != "script/formatting/formatter.py": + valid = False + ## only format the defualt directories + file_path = str(Path(file).absolute()) + for source_dir in DEFAULT_DIRS: + source_path = str(Path(source_dir).resolve()) + if file_path.startswith(source_path): + valid = True + + if valid: LOG.info("Stripping headers : " + file) format_file(file, False, True, False) LOG.info("Adding headers : " + file) format_file(file, True, False, False) - LOG.info("Formatting File : " + file) - format_file(file, False, False, True) + LOG.info("Formatting File : " + file) + format_file(file, False, False, True) From e555aae12d92acc72e96d1a82b683bd34322c8c8 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Sun, 31 Jul 2022 16:10:10 -0400 Subject: [PATCH 05/18] merge --- api-docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api-docs/conf.py b/api-docs/conf.py index b146aedab..982b45196 100644 --- a/api-docs/conf.py +++ b/api-docs/conf.py @@ -25,7 +25,7 @@ # from datetime import datetime -from eva.version import __version__ as version +from eva.version import VERSION as version sys.path.append(os.path.abspath(".")) sys.path.append(os.path.abspath("../")) From dc95ee1fc0ed21e71ba86049896c40f5360b1ae7 Mon Sep 17 00:00:00 2001 From: Rajveer 
Bachkaniwala Date: Sat, 22 Oct 2022 16:55:45 -0400 Subject: [PATCH 06/18] remove spark --- eva/catalog/schema_utils.py | 60 +++++++++++- eva/storage/storage_engine.py | 2 +- eva/storage/structured_storage_engine.py | 115 +++++++++++++++++++++++ 3 files changed, 173 insertions(+), 4 deletions(-) create mode 100644 eva/storage/structured_storage_engine.py diff --git a/eva/catalog/schema_utils.py b/eva/catalog/schema_utils.py index 6985e8965..4b2c74184 100644 --- a/eva/catalog/schema_utils.py +++ b/eva/catalog/schema_utils.py @@ -12,13 +12,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Dict, List import numpy as np import pandas as pd from petastorm.codecs import NdarrayCodec, ScalarCodec from petastorm.unischema import Unischema, UnischemaField from pyspark.sql.types import FloatType, IntegerType, StringType +from sqlalchemy import ( + FLOAT, + INTEGER, + TEXT, + Column, + Float, + Integer, + LargeBinary, +) from eva.catalog.column_type import ColumnType, NdArrayType +from eva.catalog.models.df_column import DataFrameColumn from eva.utils.logging_manager import logger @@ -55,7 +66,11 @@ def get_petastorm_column(df_column): ) elif column_type == ColumnType.TEXT: petastorm_column = UnischemaField( - column_name, np.str_, (), ScalarCodec(StringType()), column_is_nullable + column_name, + np.str_, + (), + ScalarCodec(StringType()), + column_is_nullable, ) elif column_type == ColumnType.NDARRAY: np_type = NdArrayType.to_numpy_type(column_array_type) @@ -82,7 +97,9 @@ def get_petastorm_schema(name, column_list): return petastorm_schema @staticmethod - def petastorm_type_cast(schema: Unischema, df: pd.DataFrame) -> pd.DataFrame: + def petastorm_type_cast( + schema: Unischema, df: pd.DataFrame + ) -> pd.DataFrame: """ Try to cast the type if schema defined in UnischemeField for Petastorm is not consistent with panda DataFrame provided. 
@@ -96,5 +113,42 @@ def petastorm_type_cast(schema: Unischema, df: pd.DataFrame) -> pd.DataFrame: try: df[col] = df[col].apply(lambda x: x.astype(dtype, copy=False)) except Exception: - logger.exception("Failed to cast %s to %s for Petastorm" % (col, dtype)) + logger.exception( + "Failed to cast %s to %s for Petastorm" % (col, dtype) + ) return df + + @staticmethod + def get_sqlalchemy_column(df_column: DataFrameColumn) -> Column: + column_type = df_column.type + + sqlalchemy_column = None + if column_type == ColumnType.INTEGER: + sqlalchemy_column = Column(Integer) + elif column_type == ColumnType.FLOAT: + sqlalchemy_column = Column(Float) + elif column_type == ColumnType.TEXT: + sqlalchemy_column = Column(TEXT) + elif column_type == ColumnType.NDARRAY: + sqlalchemy_column = Column(LargeBinary) + else: + logger.error("Invalid column type: " + str(column_type)) + + return sqlalchemy_column + + @staticmethod + def get_sqlalchemy_schema( + column_list: List[DataFrameColumn], + ) -> Dict[str, Column]: + """Converts the list of DataFrameColumns to SQLAlchemyColumns + + Args: + column_list (List[DataFrameColumn]): columns to be converted + + Returns: + Dict[str, Column]: mapping from column_name to sqlalchemy column object + """ + return { + column.name: SchemaUtils.get_sqlalchemy_column(column) + for column in column_list + } diff --git a/eva/storage/storage_engine.py b/eva/storage/storage_engine.py index daa2cf413..93927e6a9 100644 --- a/eva/storage/storage_engine.py +++ b/eva/storage/storage_engine.py @@ -15,7 +15,7 @@ from eva.configuration.configuration_manager import ConfigurationManager from eva.utils.generic_utils import str_to_class -StorageEngine = str_to_class(ConfigurationManager().get_value("storage", "engine"))() +StorageEngine = str_to_class("eva.storage.structured_storage_engine.SQLStorageEngine")() VideoStorageEngine = str_to_class( ConfigurationManager().get_value("storage", "video_engine") )() diff --git a/eva/storage/structured_storage_engine.py b/eva/storage/structured_storage_engine.py new file mode 100644 index 000000000..3c83bcbd1 --- /dev/null +++ b/eva/storage/structured_storage_engine.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2018-2022 EVA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import shutil +from pathlib import Path +from typing import Iterator, List + +from petastorm.etl.dataset_metadata import materialize_dataset +from petastorm.predicates import in_lambda +from petastorm.unischema import dict_to_spark_row + +from eva.catalog.models.df_metadata import DataFrameMetadata +from eva.catalog.sql_config import SQLConfig +from eva.configuration.configuration_manager import ConfigurationManager +from eva.models.storage.batch import Batch +from eva.readers.petastorm_reader import PetastormReader +from eva.storage.abstract_storage_engine import AbstractStorageEngine +from eva.utils.logging_manager import logger +from eva.catalog.models.base_model import BaseModel +from eva.catalog.schema_utils import SchemaUtils + +# Leveraging Dynamic schema in SQLAlchemy +# https://sparrigan.github.io/sql/sqla/2016/01/03/dynamic-tables.html + + +class SQLStorageEngine(AbstractStorageEngine): + def __init__(self): + """ + Grab the existing sql session + """ + self._sql_session = SQLConfig().session + + def create(self, table: DataFrameMetadata, **kwargs): + """ + Create an empty table in sql. + """ + attr_dict = {"__tablename__": table.name} + sqlalchemy_schema = SchemaUtils.get_sqlalchemy_schema(table.columns) + attr_dict.update(sqlalchemy_schema) + new_table = type("new_table", (BaseModel,), attr_dict) + BaseModel.metadata.create_table(tables=[new_table.__table__]) + + def drop(self, table: DataFrameMetadata): + dir_path = self._spark_url(table) + try: + shutil.rmtree(str(dir_path)) + except Exception as e: + logger.exception(f"Failed to drop the video table {e}") + + def write(self, table: DataFrameMetadata, rows: Batch): + """ + Write rows into the sql table. + + Arguments: + table: table metadata object to write into + rows : batch to be persisted in the storage. + """ + attr_dict = {"__tablename__": table.name} + sqlalchemy_schema = SchemaUtils.get_sqlalchemy_schema(table.columns) + attr_dict.update(sqlalchemy_schema) + new_table = type("new_table", (BaseModel,), attr_dict) + + columns = rows.frames.keys() + data = [] + for record in rows.frames.values: + row_data = { col : record[col] for col in columns} + data.append(row_data) + self._sql_session.bulk_insert_mappings(new_table, data) + self._sql_session.commit() + + def read( + self, + table: DataFrameMetadata, + batch_mem_size: int, + columns: List[str] = None, + predicate_func=None, + ) -> Iterator[Batch]: + """ + Reads the table and return a batch iterator for the + tuples that passes the predicate func. + + Argument: + table: table metadata object to write into + batch_mem_size (int): memory size of the batch read from storage + columns (List[str]): A list of column names to be + considered in predicate_func + predicate_func: customized predicate function returns bool + + Return: + Iterator of Batch read. + """ + predicate = None + if predicate_func and columns: + predicate = in_lambda(columns, predicate_func) + + # ToDo: Handle the sharding logic. 
We might have to maintain a + # context for deciding which shard to read + reader = PetastormReader( + self._spark_url(table), + batch_mem_size=batch_mem_size, + predicate=predicate, + ) + for batch in reader.read(): + yield batch From 5b9cdd668ac3f93638e6b3f88fd314fe0166477f Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 03:29:00 -0400 Subject: [PATCH 07/18] feat: replace spark+petastorm with sqlalchemy+sqlite --- eva/catalog/models/base_model.py | 2 +- eva/catalog/models/df_column.py | 2 +- eva/catalog/models/udf_io.py | 2 +- eva/readers/petastorm_reader.py | 80 ------------- eva/spark/__init__.py | 15 --- eva/spark/session.py | 81 ------------- eva/storage/petastorm_storage_engine.py | 133 ---------------------- eva/storage/sqlite_storage_engine.py | 139 +++++++++++++++++++++++ eva/storage/storage_engine.py | 2 +- eva/storage/structured_storage_engine.py | 115 ------------------- eva/utils/generic_utils.py | 11 ++ test/spark/__init__.py | 14 --- test/spark/test_session.py | 39 ------- 13 files changed, 154 insertions(+), 481 deletions(-) delete mode 100644 eva/readers/petastorm_reader.py delete mode 100644 eva/spark/__init__.py delete mode 100644 eva/spark/session.py delete mode 100644 eva/storage/petastorm_storage_engine.py create mode 100644 eva/storage/sqlite_storage_engine.py delete mode 100644 eva/storage/structured_storage_engine.py delete mode 100644 test/spark/__init__.py delete mode 100644 test/spark/test_session.py diff --git a/eva/catalog/models/base_model.py b/eva/catalog/models/base_model.py index 3d194432d..c32af1afd 100644 --- a/eva/catalog/models/base_model.py +++ b/eva/catalog/models/base_model.py @@ -33,7 +33,7 @@ class CustomModel: """ query = db_session.query_property() - _id = Column("id", Integer, primary_key=True) + _id = Column("_row_id", Integer, primary_key=True) def __init__(self, **kwargs): cls_ = type(self) diff --git a/eva/catalog/models/df_column.py b/eva/catalog/models/df_column.py index 95b348a9e..04164820d 100644 --- a/eva/catalog/models/df_column.py +++ b/eva/catalog/models/df_column.py @@ -31,7 +31,7 @@ class DataFrameColumn(BaseModel): _is_nullable = Column("is_nullable", Boolean, default=False) _array_type = Column("array_type", Enum(NdArrayType), nullable=True) _array_dimensions = Column("array_dimensions", String(100)) - _metadata_id = Column("metadata_id", Integer, ForeignKey("df_metadata.id")) + _metadata_id = Column("metadata_id", Integer, ForeignKey("df_metadata._row_id")) _dataset = relationship("DataFrameMetadata", back_populates="_columns") diff --git a/eva/catalog/models/udf_io.py b/eva/catalog/models/udf_io.py index 2b5b74a80..f0da3bea4 100644 --- a/eva/catalog/models/udf_io.py +++ b/eva/catalog/models/udf_io.py @@ -32,7 +32,7 @@ class UdfIO(BaseModel): _array_type = Column("array_type", Enum(NdArrayType), nullable=True) _array_dimensions = Column("array_dimensions", String(100)) _is_input = Column("is_input", Boolean, default=True) - _udf_id = Column("udf_id", Integer, ForeignKey("udf.id")) + _udf_id = Column("udf_id", Integer, ForeignKey("udf._row_id")) _udf = relationship("UdfMetadata", back_populates="_cols") __table_args__ = (UniqueConstraint("name", "udf_id"), {}) diff --git a/eva/readers/petastorm_reader.py b/eva/readers/petastorm_reader.py deleted file mode 100644 index 41f3f9073..000000000 --- a/eva/readers/petastorm_reader.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2022 EVA -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file 
except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Dict, Iterator - -from petastorm import make_reader - -from eva.configuration.configuration_manager import ConfigurationManager -from eva.readers.abstract_reader import AbstractReader - - -class PetastormReader(AbstractReader): - def __init__( - self, *args, cur_shard=None, shard_count=None, predicate=None, **kwargs - ): - """ - Reads data from the petastorm parquet stores. Note this won't - work for any arbitary parquet store apart from one materialized - using petastorm. In order to generalize, we might have to replace - `make_reader` with `make_batch_reader`. - https://petastorm.readthedocs.io/en/latest/api.html#module-petastorm.reader - - Attributes: - cur_shard (int, optional): Shard number to load from if sharded - shard_count (int, optional): Specify total number of shards if - applicable - predicate (PredicateBase, optional): instance of predicate object - to filter rows to be returned by reader - cache_type (str): the cache type, if desired. - Options are [None, ‘null’, ‘local-disk’] to either have a - null/noop cache or a cache implemented using diskcache. - cache_location (int): the location or path of the cache. - cache_size_limit (int): the size limit of the cache in bytes - cache_row_size_estimate (int): the estimated size of a row - """ - self.cur_shard = cur_shard - self.shard_count = shard_count - self.predicate = predicate - petastorm_config = ConfigurationManager().get_value("storage", "petastorm") - # cache not allowed with predicates - if self.predicate or petastorm_config is None: - petastorm_config = {} - self.cache_type = petastorm_config.get("cache_type", None) - self.cache_location = petastorm_config.get("cache_location", None) - self.cache_size_limit = petastorm_config.get("cache_size_limit", None) - self.cache_row_size_estimate = petastorm_config.get( - "cache_row_size_estimate", None - ) - super().__init__(*args, **kwargs) - if self.cur_shard is not None and self.cur_shard <= 0: - self.cur_shard = None - - if self.shard_count is not None and self.shard_count <= 0: - self.shard_count = None - - def _read(self) -> Iterator[Dict]: - # `Todo`: Generalize this reader - with make_reader( - self.file_url, - shard_count=self.shard_count, - cur_shard=self.cur_shard, - predicate=self.predicate, - cache_type=self.cache_type, - cache_location=self.cache_location, - cache_size_limit=self.cache_size_limit, - cache_row_size_estimate=self.cache_row_size_estimate, - ) as reader: - for row in reader: - yield row._asdict() diff --git a/eva/spark/__init__.py b/eva/spark/__init__.py deleted file mode 100644 index 3765a7c41..000000000 --- a/eva/spark/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2022 EVA -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""creates a spark session""" diff --git a/eva/spark/session.py b/eva/spark/session.py deleted file mode 100644 index b62c44528..000000000 --- a/eva/spark/session.py +++ /dev/null @@ -1,81 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2022 EVA -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pyspark.conf import SparkConf -from pyspark.sql import SparkSession - -from eva.configuration.configuration_manager import ConfigurationManager - - -class Session(object): - """ - Wrapper around Spark Session - """ - - _instance = None - _session = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super(Session, cls).__new__(cls) - return cls._instance - - def __init__(self): - self._config = ConfigurationManager() - name = self._config.get_value("core", "application") - self.init_spark_session(name) - - def init_spark_session(self, application_name, spark_master=None): - """Setup a spark session. - - :param spark_master: A master parameter used by spark session builder. - Use default value (None) to use system - environment configured spark cluster. - Use 'local[*]' to run on a local box. - - :return: spark_session: A spark session - """ - - eva_spark_conf = SparkConf() - pyspark_config = self._config.get_value("pyspark", "property") - for key, value in pyspark_config.items(): - eva_spark_conf.set(key, value) - - session_builder = SparkSession.builder.appName(application_name).config( - conf=eva_spark_conf - ) - - if spark_master: - session_builder.master(spark_master) - - # Gets an existing SparkSession or, - # if there is no existing one, creates a new one based - # on the options set in this builder. - self._session = session_builder.getOrCreate() - - # Configure logging - spark_context = self._session.sparkContext - spark_context.setLogLevel("OFF") - - def get_session(self): - return self._session - - def get_context(self): - return self._session.sparkContext - - def stop(self): - self._session.stop() - - def __del__(self): - self._session.stop() diff --git a/eva/storage/petastorm_storage_engine.py b/eva/storage/petastorm_storage_engine.py deleted file mode 100644 index ba4937168..000000000 --- a/eva/storage/petastorm_storage_engine.py +++ /dev/null @@ -1,133 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2022 EVA -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import shutil -from pathlib import Path -from typing import Iterator, List - -from petastorm.etl.dataset_metadata import materialize_dataset -from petastorm.predicates import in_lambda -from petastorm.unischema import dict_to_spark_row - -from eva.catalog.models.df_metadata import DataFrameMetadata -from eva.configuration.configuration_manager import ConfigurationManager -from eva.models.storage.batch import Batch -from eva.readers.petastorm_reader import PetastormReader -from eva.spark.session import Session -from eva.storage.abstract_storage_engine import AbstractStorageEngine -from eva.utils.logging_manager import logger - - -class PetastormStorageEngine(AbstractStorageEngine): - def __init__(self): - """ - Maintain a long live spark session and context. - """ - self._spark = Session() - self.spark_session = self._spark.get_session() - self.spark_context = self._spark.get_context() - self.coalesce = ConfigurationManager().get_value("pyspark", "coalesce") - - def _spark_url(self, table: DataFrameMetadata) -> str: - """ - Generate a spark/petastorm url given a table - """ - return Path(table.file_url).resolve().as_uri() - - def create(self, table: DataFrameMetadata, **kwargs): - """ - Create an empty dataframe in petastorm. - """ - empty_rdd = self.spark_context.emptyRDD() - - with materialize_dataset( - self.spark_session, self._spark_url(table), table.schema.petastorm_schema - ): - - self.spark_session.createDataFrame( - empty_rdd, table.schema.pyspark_schema - ).coalesce(self.coalesce).write.mode("overwrite").parquet( - self._spark_url(table) - ) - - def drop(self, table: DataFrameMetadata): - dir_path = self._spark_url(table) - try: - shutil.rmtree(str(dir_path)) - except Exception as e: - logger.exception(f"Failed to drop the video table {e}") - - def write(self, table: DataFrameMetadata, rows: Batch): - """ - Write rows into the dataframe. - - Arguments: - table: table metadata object to write into - rows : batch to be persisted in the storage. - """ - - if rows.empty(): - return - # ToDo - # Throw an error if the row schema doesn't match the table schema - - with materialize_dataset( - self.spark_session, self._spark_url(table), table.schema.petastorm_schema - ): - - records = rows.frames - columns = records.keys() - rows_rdd = ( - self.spark_context.parallelize(records.values) - .map(lambda x: dict(zip(columns, x))) - .map(lambda x: dict_to_spark_row(table.schema.petastorm_schema, x)) - ) - self.spark_session.createDataFrame( - rows_rdd, table.schema.pyspark_schema - ).coalesce(self.coalesce).write.mode("append").parquet( - self._spark_url(table) - ) - - def read( - self, - table: DataFrameMetadata, - batch_mem_size: int, - columns: List[str] = None, - predicate_func=None, - ) -> Iterator[Batch]: - """ - Reads the table and return a batch iterator for the - tuples that passes the predicate func. 
- - Argument: - table: table metadata object to write into - batch_mem_size (int): memory size of the batch read from storage - columns (List[str]): A list of column names to be - considered in predicate_func - predicate_func: customized predicate function returns bool - - Return: - Iterator of Batch read. - """ - predicate = None - if predicate_func and columns: - predicate = in_lambda(columns, predicate_func) - - # ToDo: Handle the sharding logic. We might have to maintain a - # context for deciding which shard to read - reader = PetastormReader( - self._spark_url(table), batch_mem_size=batch_mem_size, predicate=predicate - ) - for batch in reader.read(): - yield batch diff --git a/eva/storage/sqlite_storage_engine.py b/eva/storage/sqlite_storage_engine.py new file mode 100644 index 000000000..10b3dcb96 --- /dev/null +++ b/eva/storage/sqlite_storage_engine.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright 2018-2022 EVA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import shutil +from typing import Iterator, List +import pandas as pd + +from eva.catalog.column_type import ColumnType +from eva.catalog.models.df_column import DataFrameColumn + +from eva.catalog.models.df_metadata import DataFrameMetadata +from eva.catalog.sql_config import SQLConfig +from eva.models.storage.batch import Batch +from eva.storage.abstract_storage_engine import AbstractStorageEngine +from eva.utils.generic_utils import PickleSerializer, get_size +from eva.utils.logging_manager import logger +from eva.catalog.models.base_model import BaseModel +from eva.catalog.schema_utils import SchemaUtils +from sqlalchemy.ext.declarative import declarative_base + +# Leveraging Dynamic schema in SQLAlchemy +# https://sparrigan.github.io/sql/sqla/2016/01/03/dynamic-tables.html + +Base = declarative_base() + + +class SQLStorageEngine(AbstractStorageEngine): + def __init__(self): + """ + Grab the existing sql session + """ + self._sql_session = SQLConfig().session + self._sql_engine = SQLConfig().engine + self._serializer = PickleSerializer() + + def _dict_to_sql_row(self, dict_row: dict, columns: List[DataFrameColumn]): + # Serialize numpy data + for col in columns: + if col.type == ColumnType.NDARRAY: + dict_row[col.name] = self._serializer.serialize(dict_row[col.name]) + return dict_row + + def _sql_row_to_dict(self, sql_row: tuple, columns: List[DataFrameColumn]): + # Deserialize numpy data + dict_row = {} + for idx, col in enumerate(columns): + if col.type == ColumnType.NDARRAY: + dict_row[col.name] = self._serializer.deserialize(sql_row[idx]) + else: + dict_row[col.name] = sql_row[idx] + return dict_row + + def create(self, table: DataFrameMetadata, **kwargs): + """ + Create an empty table in sql. 
+ It dynamically constructs schema in sqlaclchemy + to create the table + """ + attr_dict = {"__tablename__": table.name} + sqlalchemy_schema = SchemaUtils.get_sqlalchemy_schema(table.columns) + attr_dict.update(sqlalchemy_schema) + # dynamic schema generation + # https://sparrigan.github.io/sql/sqla/2016/01/03/dynamic-tables.html + new_table = type("__placeholder_class_name", (BaseModel,), attr_dict)() + BaseModel.metadata.tables[table.name].create(self._sql_engine) + self._sql_session.commit() + return new_table + + def drop(self, table: DataFrameMetadata): + dir_path = self._spark_url(table) + try: + shutil.rmtree(str(dir_path)) + except Exception as e: + logger.exception(f"Failed to drop the video table {e}") + + def write(self, table: DataFrameMetadata, rows: Batch): + """ + Write rows into the sql table. + + Arguments: + table: table metadata object to write into + rows : batch to be persisted in the storage. + """ + new_table = BaseModel.metadata.tables[table.name] + columns = rows.frames.keys() + data = [] + # ToDo: validate the data type before inserting into the table + for record in rows.frames.values: + row_data = {col: record[idx] for idx, col in enumerate(columns)} + data.append(self._dict_to_sql_row(row_data, table.columns)) + self._sql_engine.execute(new_table.insert(), data) + self._sql_session.commit() + + def read( + self, + table: DataFrameMetadata, + batch_mem_size: int, + ) -> Iterator[Batch]: + """ + Reads the table and return a batch iterator for the + tuples. + + Argument: + table: table metadata object of teh table to read + batch_mem_size (int): memory size of the batch read from storage + Return: + Iterator of Batch read. + """ + + new_table = BaseModel.metadata.tables[table.name] + result = self._sql_engine.execute( + new_table.select() + ) + data_batch = [] + row_size = None + for row in result: + # Todo: Verfiy the order of columns in row matches the table.columns + # ignore the first dummy (_row_id) primary column + data_batch.append(self._sql_row_to_dict(row[1:], table.columns)) + if row_size is None: + row_size = 0 + row_size = get_size(data_batch) + if len(data_batch) * row_size >= batch_mem_size: + yield Batch(pd.DataFrame(data_batch)) + data_batch = [] + if data_batch: + yield Batch(pd.DataFrame(data_batch)) + diff --git a/eva/storage/storage_engine.py b/eva/storage/storage_engine.py index 93927e6a9..fa54e7da8 100644 --- a/eva/storage/storage_engine.py +++ b/eva/storage/storage_engine.py @@ -15,7 +15,7 @@ from eva.configuration.configuration_manager import ConfigurationManager from eva.utils.generic_utils import str_to_class -StorageEngine = str_to_class("eva.storage.structured_storage_engine.SQLStorageEngine")() +StorageEngine = str_to_class("eva.storage.sqlite_storage_engine.SQLStorageEngine")() VideoStorageEngine = str_to_class( ConfigurationManager().get_value("storage", "video_engine") )() diff --git a/eva/storage/structured_storage_engine.py b/eva/storage/structured_storage_engine.py deleted file mode 100644 index 3c83bcbd1..000000000 --- a/eva/storage/structured_storage_engine.py +++ /dev/null @@ -1,115 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2022 EVA -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import shutil -from pathlib import Path -from typing import Iterator, List - -from petastorm.etl.dataset_metadata import materialize_dataset -from petastorm.predicates import in_lambda -from petastorm.unischema import dict_to_spark_row - -from eva.catalog.models.df_metadata import DataFrameMetadata -from eva.catalog.sql_config import SQLConfig -from eva.configuration.configuration_manager import ConfigurationManager -from eva.models.storage.batch import Batch -from eva.readers.petastorm_reader import PetastormReader -from eva.storage.abstract_storage_engine import AbstractStorageEngine -from eva.utils.logging_manager import logger -from eva.catalog.models.base_model import BaseModel -from eva.catalog.schema_utils import SchemaUtils - -# Leveraging Dynamic schema in SQLAlchemy -# https://sparrigan.github.io/sql/sqla/2016/01/03/dynamic-tables.html - - -class SQLStorageEngine(AbstractStorageEngine): - def __init__(self): - """ - Grab the existing sql session - """ - self._sql_session = SQLConfig().session - - def create(self, table: DataFrameMetadata, **kwargs): - """ - Create an empty table in sql. - """ - attr_dict = {"__tablename__": table.name} - sqlalchemy_schema = SchemaUtils.get_sqlalchemy_schema(table.columns) - attr_dict.update(sqlalchemy_schema) - new_table = type("new_table", (BaseModel,), attr_dict) - BaseModel.metadata.create_table(tables=[new_table.__table__]) - - def drop(self, table: DataFrameMetadata): - dir_path = self._spark_url(table) - try: - shutil.rmtree(str(dir_path)) - except Exception as e: - logger.exception(f"Failed to drop the video table {e}") - - def write(self, table: DataFrameMetadata, rows: Batch): - """ - Write rows into the sql table. - - Arguments: - table: table metadata object to write into - rows : batch to be persisted in the storage. - """ - attr_dict = {"__tablename__": table.name} - sqlalchemy_schema = SchemaUtils.get_sqlalchemy_schema(table.columns) - attr_dict.update(sqlalchemy_schema) - new_table = type("new_table", (BaseModel,), attr_dict) - - columns = rows.frames.keys() - data = [] - for record in rows.frames.values: - row_data = { col : record[col] for col in columns} - data.append(row_data) - self._sql_session.bulk_insert_mappings(new_table, data) - self._sql_session.commit() - - def read( - self, - table: DataFrameMetadata, - batch_mem_size: int, - columns: List[str] = None, - predicate_func=None, - ) -> Iterator[Batch]: - """ - Reads the table and return a batch iterator for the - tuples that passes the predicate func. - - Argument: - table: table metadata object to write into - batch_mem_size (int): memory size of the batch read from storage - columns (List[str]): A list of column names to be - considered in predicate_func - predicate_func: customized predicate function returns bool - - Return: - Iterator of Batch read. - """ - predicate = None - if predicate_func and columns: - predicate = in_lambda(columns, predicate_func) - - # ToDo: Handle the sharding logic. 
We might have to maintain a - # context for deciding which shard to read - reader = PetastormReader( - self._spark_url(table), - batch_mem_size=batch_mem_size, - predicate=predicate, - ) - for batch in reader.read(): - yield batch diff --git a/eva/utils/generic_utils.py b/eva/utils/generic_utils.py index f852fe7cb..968df0261 100644 --- a/eva/utils/generic_utils.py +++ b/eva/utils/generic_utils.py @@ -16,6 +16,8 @@ import importlib import sys import uuid +import pickle + from pathlib import Path from eva.configuration.configuration_manager import ConfigurationManager @@ -130,3 +132,12 @@ def get_size(obj, seen=None): elif hasattr(obj, "__iter__") and not isinstance(obj, (str, bytes, bytearray)): size += sum([get_size(i, seen) for i in obj]) return size + + +class PickleSerializer(object): + + def serialize(self, data): + return pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL) + + def deserialize(self, data): + return pickle.loads(data) \ No newline at end of file diff --git a/test/spark/__init__.py b/test/spark/__init__.py deleted file mode 100644 index ccbb30dee..000000000 --- a/test/spark/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2022 EVA -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/test/spark/test_session.py b/test/spark/test_session.py deleted file mode 100644 index 35f034102..000000000 --- a/test/spark/test_session.py +++ /dev/null @@ -1,39 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2022 EVA -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import unittest - -from pyspark.sql import SparkSession - -from eva.spark.session import Session - - -class SparkSessionTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def setUp(self): - self.session = Session() - - # def tearDown(self): - # self.session = Session() - # self.session.stop() - - def test_session(self): - - spark_session = self.session.get_session() - - session2 = Session() - self.assertEqual(self.session, session2) - self.assertIsInstance(spark_session, SparkSession) From 7ea159e6b886ddeaedadc8977d937160872fcba1 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 03:39:03 -0400 Subject: [PATCH 08/18] feat: remove petastorm related code --- eva/catalog/df_schema.py | 13 --- eva/catalog/schema_utils.py | 102 +------------------ eva/eva.yml | 15 +-- eva/storage/sqlite_storage_engine.py | 34 +++---- eva/storage/storage_engine.py | 2 +- eva/utils/generic_utils.py | 6 +- test/catalog/test_schema.py | 96 ----------------- test/integration_tests/test_load_executor.py | 7 ++ 8 files changed, 30 insertions(+), 245 deletions(-) diff --git a/eva/catalog/df_schema.py b/eva/catalog/df_schema.py index 2536945db..0c7a2090c 100644 --- a/eva/catalog/df_schema.py +++ b/eva/catalog/df_schema.py @@ -15,7 +15,6 @@ from typing import List from eva.catalog.models.df_column import DataFrameColumn -from eva.catalog.schema_utils import SchemaUtils class DataFrameSchema(object): @@ -23,10 +22,6 @@ def __init__(self, name: str, column_list: List[DataFrameColumn]): self._name = name self._column_list = column_list - self._petastorm_schema = SchemaUtils.get_petastorm_schema( - self._name, self._column_list - ) - self._pyspark_schema = self._petastorm_schema.as_spark_schema() def __str__(self): schema_str = "SCHEMA:: (" + self._name + ")\n" @@ -42,14 +37,6 @@ def name(self): def column_list(self): return self._column_list - @property - def petastorm_schema(self): - return self._petastorm_schema - - @property - def pyspark_schema(self): - return self._pyspark_schema - def __eq__(self, other): return self.name == other.name and self._column_list == other.column_list diff --git a/eva/catalog/schema_utils.py b/eva/catalog/schema_utils.py index 4b2c74184..08c58cdfb 100644 --- a/eva/catalog/schema_utils.py +++ b/eva/catalog/schema_utils.py @@ -13,111 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from typing import Dict, List -import numpy as np -import pandas as pd -from petastorm.codecs import NdarrayCodec, ScalarCodec -from petastorm.unischema import Unischema, UnischemaField -from pyspark.sql.types import FloatType, IntegerType, StringType -from sqlalchemy import ( - FLOAT, - INTEGER, - TEXT, - Column, - Float, - Integer, - LargeBinary, -) -from eva.catalog.column_type import ColumnType, NdArrayType +from sqlalchemy import TEXT, Column, Float, Integer, LargeBinary + +from eva.catalog.column_type import ColumnType from eva.catalog.models.df_column import DataFrameColumn from eva.utils.logging_manager import logger class SchemaUtils(object): - @staticmethod - def get_petastorm_column(df_column): - - column_type = df_column.type - column_name = df_column.name - column_is_nullable = df_column.is_nullable - column_array_type = df_column.array_type - column_array_dimensions = df_column.array_dimensions - - # Reference: - # https://github.com/uber/petastorm/blob/master/petastorm/ - # tests/test_common.py - - petastorm_column = None - if column_type == ColumnType.INTEGER: - petastorm_column = UnischemaField( - column_name, - np.int64, - (), - ScalarCodec(IntegerType()), - column_is_nullable, - ) - elif column_type == ColumnType.FLOAT: - petastorm_column = UnischemaField( - column_name, - np.float32, - (), - ScalarCodec(FloatType()), - column_is_nullable, - ) - elif column_type == ColumnType.TEXT: - petastorm_column = UnischemaField( - column_name, - np.str_, - (), - ScalarCodec(StringType()), - column_is_nullable, - ) - elif column_type == ColumnType.NDARRAY: - np_type = NdArrayType.to_numpy_type(column_array_type) - petastorm_column = UnischemaField( - column_name, - np_type, - column_array_dimensions, - NdarrayCodec(), - column_is_nullable, - ) - else: - logger.error("Invalid column type: " + str(column_type)) - - return petastorm_column - - @staticmethod - def get_petastorm_schema(name, column_list): - petastorm_column_list = [] - for _column in column_list: - petastorm_column = SchemaUtils.get_petastorm_column(_column) - petastorm_column_list.append(petastorm_column) - - petastorm_schema = Unischema(name, petastorm_column_list) - return petastorm_schema - - @staticmethod - def petastorm_type_cast( - schema: Unischema, df: pd.DataFrame - ) -> pd.DataFrame: - """ - Try to cast the type if schema defined in UnischemeField for - Petastorm is not consistent with panda DataFrame provided. 
- """ - for unischema in schema.fields.values(): - if not isinstance(unischema.codec, NdarrayCodec): - continue - # We only care when the cell data is np.ndarray - col = unischema.name - dtype = unischema.numpy_dtype - try: - df[col] = df[col].apply(lambda x: x.astype(dtype, copy=False)) - except Exception: - logger.exception( - "Failed to cast %s to %s for Petastorm" % (col, dtype) - ) - return df - @staticmethod def get_sqlalchemy_column(df_column: DataFrameColumn) -> Column: column_type = df_column.type diff --git a/eva/eva.yml b/eva/eva.yml index f53143754..fafae81f8 100644 --- a/eva/eva.yml +++ b/eva/eva.yml @@ -17,23 +17,10 @@ executor: storage: upload_dir: "" - engine: "eva.storage.petastorm_storage_engine.PetastormStorageEngine" + engine: "eva.storage.sqlite_storage_engine.SQLStorageEngine" video_engine: "eva.storage.opencv_storage_engine.OpenCVStorageEngine" video_engine_version: 0 - # https://petastorm.readthedocs.io/en/latest/api.html#module-petastorm.reader - petastorm: {'cache_type' : 'local-disk', - 'cache_location' : '.cache', - 'cache_size_limit' : 4000000000, #4gb - 'cache_row_size_estimate' : 512} - -pyspark: - property: {'spark.logConf': 'true', - 'spark.driver.memory': '10g', - 'spark.sql.execution.arrow.pyspark.enabled': 'true'} - coalesce: 2 - - server: host: "0.0.0.0" port: 5432 diff --git a/eva/storage/sqlite_storage_engine.py b/eva/storage/sqlite_storage_engine.py index 10b3dcb96..7b12c46f9 100644 --- a/eva/storage/sqlite_storage_engine.py +++ b/eva/storage/sqlite_storage_engine.py @@ -12,22 +12,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import shutil from typing import Iterator, List + import pandas as pd +from sqlalchemy.ext.declarative import declarative_base from eva.catalog.column_type import ColumnType +from eva.catalog.models.base_model import BaseModel from eva.catalog.models.df_column import DataFrameColumn - from eva.catalog.models.df_metadata import DataFrameMetadata +from eva.catalog.schema_utils import SchemaUtils from eva.catalog.sql_config import SQLConfig from eva.models.storage.batch import Batch from eva.storage.abstract_storage_engine import AbstractStorageEngine from eva.utils.generic_utils import PickleSerializer, get_size from eva.utils.logging_manager import logger -from eva.catalog.models.base_model import BaseModel -from eva.catalog.schema_utils import SchemaUtils -from sqlalchemy.ext.declarative import declarative_base # Leveraging Dynamic schema in SQLAlchemy # https://sparrigan.github.io/sql/sqla/2016/01/03/dynamic-tables.html @@ -49,7 +48,7 @@ def _dict_to_sql_row(self, dict_row: dict, columns: List[DataFrameColumn]): for col in columns: if col.type == ColumnType.NDARRAY: dict_row[col.name] = self._serializer.serialize(dict_row[col.name]) - return dict_row + return dict_row def _sql_row_to_dict(self, sql_row: tuple, columns: List[DataFrameColumn]): # Deserialize numpy data @@ -59,8 +58,8 @@ def _sql_row_to_dict(self, sql_row: tuple, columns: List[DataFrameColumn]): dict_row[col.name] = self._serializer.deserialize(sql_row[idx]) else: dict_row[col.name] = sql_row[idx] - return dict_row - + return dict_row + def create(self, table: DataFrameMetadata, **kwargs): """ Create an empty table in sql. 
@@ -78,11 +77,13 @@ def create(self, table: DataFrameMetadata, **kwargs): return new_table def drop(self, table: DataFrameMetadata): - dir_path = self._spark_url(table) try: - shutil.rmtree(str(dir_path)) + BaseModel.metadata.tables[table.name].drop() + self._sql_session.commit() except Exception as e: - logger.exception(f"Failed to drop the video table {e}") + logger.exception( + f"Failed to drop the table {table.name} with Exception {str(e)}" + ) def write(self, table: DataFrameMetadata, rows: Batch): """ @@ -112,21 +113,19 @@ def read( tuples. Argument: - table: table metadata object of teh table to read + table: table metadata object of teh table to read batch_mem_size (int): memory size of the batch read from storage Return: Iterator of Batch read. """ - + new_table = BaseModel.metadata.tables[table.name] - result = self._sql_engine.execute( - new_table.select() - ) + result = self._sql_engine.execute(new_table.select()) data_batch = [] row_size = None for row in result: # Todo: Verfiy the order of columns in row matches the table.columns - # ignore the first dummy (_row_id) primary column + # ignore the first dummy (_row_id) primary column data_batch.append(self._sql_row_to_dict(row[1:], table.columns)) if row_size is None: row_size = 0 @@ -136,4 +135,3 @@ def read( data_batch = [] if data_batch: yield Batch(pd.DataFrame(data_batch)) - diff --git a/eva/storage/storage_engine.py b/eva/storage/storage_engine.py index fa54e7da8..daa2cf413 100644 --- a/eva/storage/storage_engine.py +++ b/eva/storage/storage_engine.py @@ -15,7 +15,7 @@ from eva.configuration.configuration_manager import ConfigurationManager from eva.utils.generic_utils import str_to_class -StorageEngine = str_to_class("eva.storage.sqlite_storage_engine.SQLStorageEngine")() +StorageEngine = str_to_class(ConfigurationManager().get_value("storage", "engine"))() VideoStorageEngine = str_to_class( ConfigurationManager().get_value("storage", "video_engine") )() diff --git a/eva/utils/generic_utils.py b/eva/utils/generic_utils.py index 968df0261..51029811d 100644 --- a/eva/utils/generic_utils.py +++ b/eva/utils/generic_utils.py @@ -14,10 +14,9 @@ # limitations under the License. import hashlib import importlib +import pickle import sys import uuid -import pickle - from pathlib import Path from eva.configuration.configuration_manager import ConfigurationManager @@ -135,9 +134,8 @@ def get_size(obj, seen=None): class PickleSerializer(object): - def serialize(self, data): return pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL) def deserialize(self, data): - return pickle.loads(data) \ No newline at end of file + return pickle.loads(data) diff --git a/test/catalog/test_schema.py b/test/catalog/test_schema.py index b536290f9..afad977b6 100644 --- a/test/catalog/test_schema.py +++ b/test/catalog/test_schema.py @@ -13,112 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import unittest -from decimal import Decimal -from unittest.mock import MagicMock, call, patch - -import numpy as np -from petastorm.codecs import NdarrayCodec, ScalarCodec -from petastorm.unischema import UnischemaField -from pyspark.sql.types import FloatType, IntegerType, StringType from eva.catalog.column_type import ColumnType, NdArrayType from eva.catalog.df_schema import DataFrameSchema from eva.catalog.models.df_column import DataFrameColumn -from eva.catalog.schema_utils import SchemaUtils class SchemaTests(unittest.TestCase): # TEST SCHEMA UTILS START - def test_get_petastorm_column(self): - col_name = "frame_id" - col = DataFrameColumn(col_name, ColumnType.INTEGER, False) - petastorm_col = UnischemaField( - col_name, np.int64, (), ScalarCodec(IntegerType()), False - ) - self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col) - - col = DataFrameColumn(col_name, ColumnType.FLOAT, True) - petastorm_col = UnischemaField( - col_name, np.float32, (), ScalarCodec(FloatType()), True - ) - self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col) - - col = DataFrameColumn(col_name, ColumnType.TEXT, False) - petastorm_col = UnischemaField( - col_name, np.str_, (), ScalarCodec(StringType()), False - ) - self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col) - - col = DataFrameColumn(col_name, None, True, [10, 10]) - self.assertEqual(SchemaUtils.get_petastorm_column(col), None) - - def test_get_petastorm_column_ndarray(self): - expected_type = [ - np.int8, - np.uint8, - np.int16, - np.int32, - np.int64, - np.unicode_, - np.bool_, - np.float32, - np.float64, - Decimal, - np.str_, - np.datetime64, - ] - col_name = "frame_id" - for array_type, np_type in zip(NdArrayType, expected_type): - col = DataFrameColumn( - col_name, ColumnType.NDARRAY, True, array_type, [10, 10] - ) - petastorm_col = UnischemaField( - col_name, np_type, [10, 10], NdarrayCodec(), True - ) - self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col) - - def test_raise_exception_when_unkown_array_type(self): - col_name = "frame_id" - col = DataFrameColumn( - col_name, ColumnType.NDARRAY, True, ColumnType.TEXT, [10, 10] - ) - self.assertRaises(ValueError, SchemaUtils.get_petastorm_column, col) - - @patch("eva.catalog.schema_utils.Unischema") - @patch("eva.catalog.schema_utils.SchemaUtils.get_petastorm_column") - def test_get_petastorm_schema(self, mock_get_pc, mock_uni): - cols = [MagicMock() for i in range(2)] - mock_get_pc.side_effect = [1, 2] - self.assertEqual( - SchemaUtils.get_petastorm_schema("name", cols), mock_uni.return_value - ) - mock_get_pc.assert_has_calls([call(cols[0]), call(cols[1])]) - mock_uni.assert_called_once_with("name", [1, 2]) - - # TEST SCHEMA UTILS END - - # TEST DF_SCHEMA START - def test_df_schema(self): - schema_name = "foo" - column_1 = DataFrameColumn("frame_id", ColumnType.INTEGER, False) - column_2 = DataFrameColumn( - "frame_data", ColumnType.NDARRAY, False, NdArrayType.UINT8, [28, 28] - ) - column_3 = DataFrameColumn("frame_label", ColumnType.INTEGER, False) - col_list = [column_1, column_2, column_3] - schema = DataFrameSchema(schema_name, col_list) - expected_schema = SchemaUtils.get_petastorm_schema(schema_name, col_list) - self.assertEqual(schema.name, schema_name) - self.assertEqual(schema.column_list, col_list) - self.assertEqual(schema.petastorm_schema.fields, expected_schema.fields) - for field1, field2 in zip( - schema.petastorm_schema.fields, expected_schema.fields - ): - self.assertEqual(field1, field2) - 
self.assertEqual(schema.pyspark_schema, expected_schema.as_spark_schema()) - def test_schema_equality(self): schema_name = "foo" column_1 = DataFrameColumn("frame_id", ColumnType.INTEGER, False) diff --git a/test/integration_tests/test_load_executor.py b/test/integration_tests/test_load_executor.py index f48fc24b9..b53be07eb 100644 --- a/test/integration_tests/test_load_executor.py +++ b/test/integration_tests/test_load_executor.py @@ -127,3 +127,10 @@ def test_should_load_csv_with_columns_in_table(self): expected_batch = create_dummy_csv_batches(target_columns=select_columns) expected_batch.modify_column_alias("myvideocsv") self.assertEqual(actual_batch, expected_batch) + + +if __name__ == "__main__": + suite = unittest.TestSuite() + suite.addTest(LoadExecutorTest("test_should_load_csv_in_table")) + runner = unittest.TextTestRunner() + runner.run(suite) From 0f37cd0a028cf08bb8d28b02ff80c396afed7da2 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 03:39:28 -0400 Subject: [PATCH 09/18] feat: remove spark+petastorm dependency --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 4f1bcaf45..8ffcb6326 100644 --- a/setup.py +++ b/setup.py @@ -44,8 +44,6 @@ def read(path, encoding="utf-8"): "Pillow==9.0.1", "sqlalchemy==1.3.20", "sqlalchemy-utils==0.36.6", - "pyspark==3.1.3", - "petastorm==0.12.0", "antlr4-python3-runtime==4.10", "pyyaml==5.1", "importlib-metadata<5.0", From 7eb3d5d34916198fb17a5d567e7f0d79e25f7f59 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 03:56:25 -0400 Subject: [PATCH 10/18] test: add sqlite test case --- ...ngine.py => test_sqlite_storage_engine.py} | 42 +++++-------------- 1 file changed, 10 insertions(+), 32 deletions(-) rename test/storage/{test_petastorm_storage_engine.py => test_sqlite_storage_engine.py} (60%) diff --git a/test/storage/test_petastorm_storage_engine.py b/test/storage/test_sqlite_storage_engine.py similarity index 60% rename from test/storage/test_petastorm_storage_engine.py rename to test/storage/test_sqlite_storage_engine.py index f6ce4defd..c3f8c124b 100644 --- a/test/storage/test_petastorm_storage_engine.py +++ b/test/storage/test_sqlite_storage_engine.py @@ -14,15 +14,15 @@ # limitations under the License. 
import shutil import unittest -from test.util import NUM_FRAMES, create_dummy_batches +from test.util import create_dummy_batches from eva.catalog.column_type import ColumnType, NdArrayType from eva.catalog.models.df_column import DataFrameColumn from eva.catalog.models.df_metadata import DataFrameMetadata -from eva.storage.petastorm_storage_engine import PetastormStorageEngine +from eva.storage.sqlite_storage_engine import SQLStorageEngine -class PetastormStorageEngineTest(unittest.TestCase): +class SQLStorageEngineTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.table = None @@ -47,41 +47,19 @@ def tearDown(self): pass def test_should_create_empty_table(self): - petastorm = PetastormStorageEngine() - petastorm.create(self.table) - records = list(petastorm.read(self.table, batch_mem_size=3000)) + sqlengine = SQLStorageEngine() + sqlengine.create(self.table) + records = list(sqlengine.read(self.table, batch_mem_size=3000)) self.assertEqual(records, []) def test_should_write_rows_to_table(self): dummy_batches = list(create_dummy_batches()) - petastorm = PetastormStorageEngine() - petastorm.create(self.table) + sqlengine = SQLStorageEngine() + sqlengine.create(self.table) for batch in dummy_batches: batch.drop_column_alias() - petastorm.write(self.table, batch) + sqlengine.write(self.table, batch) - read_batch = list(petastorm.read(self.table, batch_mem_size=3000)) + read_batch = list(sqlengine.read(self.table, batch_mem_size=3000)) self.assertTrue(read_batch, dummy_batches) - - def test_should_return_even_frames(self): - dummy_batches = list(create_dummy_batches()) - - petastorm = PetastormStorageEngine() - petastorm.create(self.table) - for batch in dummy_batches: - batch.drop_column_alias() - petastorm.write(self.table, batch) - - read_batch = list( - petastorm.read( - self.table, - batch_mem_size=3000, - columns=["id"], - predicate_func=lambda id: id % 2 == 0, - ) - ) - expected_batch = list( - create_dummy_batches(filters=[i for i in range(NUM_FRAMES) if i % 2 == 0]) - ) - self.assertTrue(read_batch, expected_batch) From d033383bc92791140ffd0e151276b32f47f94e02 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 03:57:43 -0400 Subject: [PATCH 11/18] feat: remove petastorm test case --- test/readers/test_petastorm_reader.py | 102 -------------------------- 1 file changed, 102 deletions(-) delete mode 100644 test/readers/test_petastorm_reader.py diff --git a/test/readers/test_petastorm_reader.py b/test/readers/test_petastorm_reader.py deleted file mode 100644 index 61b75b65d..000000000 --- a/test/readers/test_petastorm_reader.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2022 EVA -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import unittest -from test.util import upload_dir_from_config -from unittest.mock import patch - -import numpy as np - -from eva.configuration.configuration_manager import ConfigurationManager -from eva.readers.petastorm_reader import PetastormReader - - -class PetastormLoaderTest(unittest.TestCase): - class DummyRow: - def __init__(self, frame_id, frame_data): - self.frame_id = frame_id - self.frame_data = frame_data - - def _asdict(self): - return {"id": self.frame_id, "data": self.frame_data} - - class DummyReader: - def __init__(self, data): - self.data = data - - def __enter__(self): - return self - - def __iter__(self): - return self.data - - def __exit__(self, exc_type, exc_val, exc_tb): - pass - - @patch("eva.readers.petastorm_reader.make_reader") - def test_should_call_petastorm_make_reader_with_correct_params(self, mock): - petastorm_reader = PetastormReader( - file_url=os.path.join(upload_dir_from_config, "dummy.avi"), - batch_mem_size=3000, - cur_shard=2, - shard_count=3, - predicate="pred", - ) - list(petastorm_reader._read()) - mock.assert_called_once_with( - os.path.join(upload_dir_from_config, "dummy.avi"), - shard_count=3, - cur_shard=2, - predicate="pred", - cache_type=None, - cache_location=None, - cache_size_limit=None, - cache_row_size_estimate=None, - ) - - @patch("eva.readers.petastorm_reader.make_reader") - def test_should_call_petastorm_make_reader_with_negative_shards(self, mock): - petastorm_reader = PetastormReader( - file_url=os.path.join(upload_dir_from_config, "dummy.avi"), - batch_mem_size=3000, - cur_shard=-1, - shard_count=-2, - ) - list(petastorm_reader._read()) - petastorm_config = ConfigurationManager().get_value("storage", "petastorm") - mock.assert_called_once_with( - os.path.join(upload_dir_from_config, "dummy.avi"), - shard_count=None, - cur_shard=None, - predicate=None, - cache_location=petastorm_config.get("cache_location", None), - cache_row_size_estimate=petastorm_config.get( - "cache_row_size_estimate", None - ), - cache_size_limit=petastorm_config.get("cache_size_limit", None), - cache_type=petastorm_config.get("cache_type", None), - ) - - @patch("eva.readers.petastorm_reader.make_reader") - def test_should_read_data_using_petastorm_reader(self, mock): - petastorm_reader = PetastormReader( - file_url=os.path.join(upload_dir_from_config, "dummy.avi"), - batch_mem_size=3000, - ) - dummy_values = map(lambda i: self.DummyRow(i, np.ones((2, 2, 3)) * i), range(3)) - mock.return_value = self.DummyReader(dummy_values) - actual = list(petastorm_reader._read()) - expected = list(dummy_values) - self.assertTrue(all([np.allclose(i, j) for i, j in zip(actual, expected)])) From e9c92b7c04f45b8aaa0bc67f1c91a1deb64a1b45 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 11:12:33 -0400 Subject: [PATCH 12/18] feat: enable drop table for sqlite --- eva/catalog/services/df_service.py | 1 + eva/executor/drop_executor.py | 2 ++ eva/storage/sqlite_storage_engine.py | 7 ++++++- test/integration_tests/test_load_executor.py | 13 ++++++++----- test/integration_tests/test_select_executor.py | 2 +- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/eva/catalog/services/df_service.py b/eva/catalog/services/df_service.py index f0368a400..66c171d1a 100644 --- a/eva/catalog/services/df_service.py +++ b/eva/catalog/services/df_service.py @@ -104,6 +104,7 @@ def drop_dataset_by_name(self, database_name: str, dataset_name: str): try: dataset = self.dataset_object_by_name(database_name, dataset_name) dataset.delete() + return True 
except Exception as e: err_msg = "Delete dataset failed for name {} with error {}".format( dataset_name, str(e) diff --git a/eva/executor/drop_executor.py b/eva/executor/drop_executor.py index 73fd3e31b..e94593a01 100644 --- a/eva/executor/drop_executor.py +++ b/eva/executor/drop_executor.py @@ -16,6 +16,7 @@ from eva.catalog.catalog_manager import CatalogManager from eva.executor.abstract_executor import AbstractExecutor +from eva.executor.executor_utils import ExecutorError from eva.models.storage.batch import Batch from eva.planner.drop_plan import DropPlan from eva.storage.storage_engine import StorageEngine, VideoStorageEngine @@ -57,6 +58,7 @@ def exec(self): if not success: err_msg = "Failed to drop {}".format(table_ref) logger.exception(err_msg) + raise ExecutorError(err_msg) yield Batch( pd.DataFrame( diff --git a/eva/storage/sqlite_storage_engine.py b/eva/storage/sqlite_storage_engine.py index 7b12c46f9..f6b665e10 100644 --- a/eva/storage/sqlite_storage_engine.py +++ b/eva/storage/sqlite_storage_engine.py @@ -78,8 +78,13 @@ def create(self, table: DataFrameMetadata, **kwargs): def drop(self, table: DataFrameMetadata): try: - BaseModel.metadata.tables[table.name].drop() + table_to_remove = BaseModel.metadata.tables[table.name] + table_to_remove.drop() self._sql_session.commit() + # In memory metadata does not automatically sync with the database + # therefore manually removing the table from the in-memory metadata + # https://github.com/sqlalchemy/sqlalchemy/issues/5112 + BaseModel.metadata.remove(table_to_remove) except Exception as e: logger.exception( f"Failed to drop the table {table.name} with Exception {str(e)}" diff --git a/test/integration_tests/test_load_executor.py b/test/integration_tests/test_load_executor.py index b53be07eb..9e76271db 100644 --- a/test/integration_tests/test_load_executor.py +++ b/test/integration_tests/test_load_executor.py @@ -91,6 +91,10 @@ def test_should_load_csv_in_table(self): expected_batch = create_dummy_csv_batches() expected_batch.modify_column_alias("myvideocsv") self.assertEqual(actual_batch, expected_batch) + + # clean up + drop_query = "DROP TABLE MyVideoCSV;" + execute_query_fetch_all(drop_query) def test_should_load_csv_with_columns_in_table(self): @@ -101,12 +105,8 @@ def test_should_load_csv_with_columns_in_table(self): id INTEGER UNIQUE, frame_id INTEGER NOT NULL, video_id INTEGER NOT NULL, - dataset_name TEXT(30) NOT NULL, - label TEXT(30), - bbox NDARRAY FLOAT32(4), - object_id INTEGER + dataset_name TEXT(30) NOT NULL ); - """ execute_query_fetch_all(create_table_query) @@ -128,6 +128,9 @@ def test_should_load_csv_with_columns_in_table(self): expected_batch.modify_column_alias("myvideocsv") self.assertEqual(actual_batch, expected_batch) + # clean up + drop_query = "DROP TABLE MyVideoCSV;" + execute_query_fetch_all(drop_query) if __name__ == "__main__": suite = unittest.TestSuite() diff --git a/test/integration_tests/test_select_executor.py b/test/integration_tests/test_select_executor.py index 66c51cec6..bcc863864 100644 --- a/test/integration_tests/test_select_executor.py +++ b/test/integration_tests/test_select_executor.py @@ -248,7 +248,7 @@ def test_select_and_sample(self): # Disabling it for time being self.assertEqual(actual_batch, expected_batch[0]) - def test_aaselect_and_sample_with_predicate(self): + def test_select_and_sample_with_predicate(self): select_query = ( "SELECT name, id,data FROM MyVideo SAMPLE 2 WHERE id > 5 ORDER BY id;" ) From 177a637535ac5262c68deb8de324fb2ffafdedd7 Mon Sep 17 00:00:00 2001 From: Gaurav 
Tarlok Kakkar Date: Thu, 27 Oct 2022 11:14:28 -0400 Subject: [PATCH 13/18] style: ran black --- eva/storage/sqlite_storage_engine.py | 2 +- test/integration_tests/test_load_executor.py | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/eva/storage/sqlite_storage_engine.py b/eva/storage/sqlite_storage_engine.py index f6b665e10..8b8185ec9 100644 --- a/eva/storage/sqlite_storage_engine.py +++ b/eva/storage/sqlite_storage_engine.py @@ -83,7 +83,7 @@ def drop(self, table: DataFrameMetadata): self._sql_session.commit() # In memory metadata does not automatically sync with the database # therefore manually removing the table from the in-memory metadata - # https://github.com/sqlalchemy/sqlalchemy/issues/5112 + # https://github.com/sqlalchemy/sqlalchemy/issues/5112 BaseModel.metadata.remove(table_to_remove) except Exception as e: logger.exception( diff --git a/test/integration_tests/test_load_executor.py b/test/integration_tests/test_load_executor.py index 9e76271db..4929502f4 100644 --- a/test/integration_tests/test_load_executor.py +++ b/test/integration_tests/test_load_executor.py @@ -91,7 +91,7 @@ def test_should_load_csv_in_table(self): expected_batch = create_dummy_csv_batches() expected_batch.modify_column_alias("myvideocsv") self.assertEqual(actual_batch, expected_batch) - + # clean up drop_query = "DROP TABLE MyVideoCSV;" execute_query_fetch_all(drop_query) @@ -131,9 +131,3 @@ def test_should_load_csv_with_columns_in_table(self): # clean up drop_query = "DROP TABLE MyVideoCSV;" execute_query_fetch_all(drop_query) - -if __name__ == "__main__": - suite = unittest.TestSuite() - suite.addTest(LoadExecutorTest("test_should_load_csv_in_table")) - runner = unittest.TextTestRunner() - runner.run(suite) From 40656da01d113b575a1e414f8d3dd2ee1db037e6 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 12:42:46 -0400 Subject: [PATCH 14/18] docs: remove old file --- api-docs/source/contribute/index.md | 72 ----------------------------- 1 file changed, 72 deletions(-) delete mode 100644 api-docs/source/contribute/index.md diff --git a/api-docs/source/contribute/index.md b/api-docs/source/contribute/index.md deleted file mode 100644 index 40f104758..000000000 --- a/api-docs/source/contribute/index.md +++ /dev/null @@ -1,72 +0,0 @@ -### Contributing -We welcome all kinds of contributions to EVA. -- New features -- Code reviewing of PR -- Documentation -- Tutorials and Applications - -#### Setting up the development environment -To hack on EVA, you need to checkout the repository and build EVA from the source. -Follow the following instructions to build EVA and test your changes locally. -We recommend using a virtual environment and the pip package manager. EVA requires JAVA 8 for generating the parser. -``` -git clone https://github.com/georgia-tech-db/eva.git && cd eva -python3 -m venv env38 # to create a virtual environment -. env38/bin/activate -pip install --upgrade pip -sudo -E apt install -y openjdk-8-jdk openjdk-8-jre # to install JAVA -sh script/antlr4/generate_parser.sh # to generate the EVA parser -pip install -e ".[dev]" -``` - -#### Submitting a contribution -Follow the following steps to contribute to EVA: -* Merge the most recent changes from the master branch -``` - git remote add origin git@github.com:georgia-tech-db/eva.git - git pull . origin/master -``` -* Run the [test script](#testing) to ensure all the test cases pass. -* Run the `setup_git_hooks.sh` to add a git pre-push hook so that it runs the linter before pushing any changes. 
-* If you are adding a new SQL command, please add the example usage to the documentation. - -#### Testing - -Before merging the PR, the code must pass all the unit test cases. You can use the following script to run all the test cases locally. -``` -bash script/test/test.sh -``` -If you want to run a specific test file, use the following command. -``` -python -m pytest test/integration_tests/test_select_executor.py -``` -Use the following command to run a specific test case within a test file. -``` -python -m pytest test/integration_tests/test_select_executor.py -k 'test_should_load_and_select_in_table' -``` - -#### Code Style -We use the [black](https://github.com/psf/black) code style for formatting our python code. For docstrings and documentation, we use [Google pydoc format](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). - -``` -def function_with_types_in_docstring(param1, param2) -> bool: - """Example function with types documented in the docstring. - - Additional explanatory text can be added in paragraphs. - - Args: - param1 (int): The first parameter. - param2 (str): The second parameter. - - Returns: - bool: The return value. True for success, False otherwise. - -``` - -##### Lint and Formatting -Before merging, the PR must pass the code formatting and linting test case. -On your local machine, run the following script to auto-format using `black` - -``` -python script/formatting/formatter.py -``` From 06c21209e4acae50f2ed2a85c4e35dfbfd5afd7b Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 13:17:45 -0400 Subject: [PATCH 15/18] fix: sqlalchemy does not support numpy data types --- eva/storage/sqlite_storage_engine.py | 5 +++++ test/storage/test_sqlite_storage_engine.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/eva/storage/sqlite_storage_engine.py b/eva/storage/sqlite_storage_engine.py index 8b8185ec9..6d51c8ad7 100644 --- a/eva/storage/sqlite_storage_engine.py +++ b/eva/storage/sqlite_storage_engine.py @@ -14,6 +14,7 @@ # limitations under the License. 
from typing import Iterator, List +import numpy as np import pandas as pd from sqlalchemy.ext.declarative import declarative_base @@ -48,6 +49,10 @@ def _dict_to_sql_row(self, dict_row: dict, columns: List[DataFrameColumn]): for col in columns: if col.type == ColumnType.NDARRAY: dict_row[col.name] = self._serializer.serialize(dict_row[col.name]) + elif isinstance(dict_row[col.name], (np.generic,)): + # SqlAlchemy does not understand numpy geenric data types + # convert numpy datatype to python generic datatype + dict_row[col.name] = dict_row[col.name].tolist() return dict_row def _sql_row_to_dict(self, sql_row: tuple, columns: List[DataFrameColumn]): diff --git a/test/storage/test_sqlite_storage_engine.py b/test/storage/test_sqlite_storage_engine.py index c3f8c124b..88dbf9087 100644 --- a/test/storage/test_sqlite_storage_engine.py +++ b/test/storage/test_sqlite_storage_engine.py @@ -51,6 +51,8 @@ def test_should_create_empty_table(self): sqlengine.create(self.table) records = list(sqlengine.read(self.table, batch_mem_size=3000)) self.assertEqual(records, []) + # clean up + sqlengine.drop(self.table) def test_should_write_rows_to_table(self): dummy_batches = list(create_dummy_batches()) @@ -63,3 +65,5 @@ def test_should_write_rows_to_table(self): read_batch = list(sqlengine.read(self.table, batch_mem_size=3000)) self.assertTrue(read_batch, dummy_batches) + # clean up + sqlengine.drop(self.table) From bdf8c6ee58c561bc311d8e4bd23e1f08af62956e Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Thu, 27 Oct 2022 13:51:02 -0400 Subject: [PATCH 16/18] docs: improve docs --- eva/storage/sqlite_storage_engine.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eva/storage/sqlite_storage_engine.py b/eva/storage/sqlite_storage_engine.py index 6d51c8ad7..390166833 100644 --- a/eva/storage/sqlite_storage_engine.py +++ b/eva/storage/sqlite_storage_engine.py @@ -50,8 +50,9 @@ def _dict_to_sql_row(self, dict_row: dict, columns: List[DataFrameColumn]): if col.type == ColumnType.NDARRAY: dict_row[col.name] = self._serializer.serialize(dict_row[col.name]) elif isinstance(dict_row[col.name], (np.generic,)): - # SqlAlchemy does not understand numpy geenric data types + # SqlAlchemy does not consume numpy generic data types # convert numpy datatype to python generic datatype + # eg. 
np.int64 -> int dict_row[col.name] = dict_row[col.name].tolist() return dict_row @@ -86,7 +87,7 @@ def drop(self, table: DataFrameMetadata): table_to_remove = BaseModel.metadata.tables[table.name] table_to_remove.drop() self._sql_session.commit() - # In memory metadata does not automatically sync with the database + # In-memory metadata does not automatically sync with the database # therefore manually removing the table from the in-memory metadata # https://github.com/sqlalchemy/sqlalchemy/issues/5112 BaseModel.metadata.remove(table_to_remove) From b649a474789c9e607f3fed5297f3aa3ef62be558 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Mon, 31 Oct 2022 23:08:17 -0400 Subject: [PATCH 17/18] fix: address PR comments --- eva/storage/sqlite_storage_engine.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/eva/storage/sqlite_storage_engine.py b/eva/storage/sqlite_storage_engine.py index 390166833..a8a1d089a 100644 --- a/eva/storage/sqlite_storage_engine.py +++ b/eva/storage/sqlite_storage_engine.py @@ -16,7 +16,6 @@ import numpy as np import pandas as pd -from sqlalchemy.ext.declarative import declarative_base from eva.catalog.column_type import ColumnType from eva.catalog.models.base_model import BaseModel @@ -32,8 +31,6 @@ # Leveraging Dynamic schema in SQLAlchemy # https://sparrigan.github.io/sql/sqla/2016/01/03/dynamic-tables.html -Base = declarative_base() - class SQLStorageEngine(AbstractStorageEngine): def __init__(self): @@ -51,8 +48,9 @@ def _dict_to_sql_row(self, dict_row: dict, columns: List[DataFrameColumn]): dict_row[col.name] = self._serializer.serialize(dict_row[col.name]) elif isinstance(dict_row[col.name], (np.generic,)): # SqlAlchemy does not consume numpy generic data types - # convert numpy datatype to python generic datatype + # convert numpy datatype to python generic datatype using tolist() # eg. np.int64 -> int + # https://stackoverflow.com/a/53067954 dict_row[col.name] = dict_row[col.name].tolist() return dict_row From 570081a5cc163203e4274671c449f8fc07b5f165 Mon Sep 17 00:00:00 2001 From: Gaurav Tarlok Kakkar Date: Mon, 31 Oct 2022 23:15:02 -0400 Subject: [PATCH 18/18] fix: merge issues --- setup.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/setup.py b/setup.py index 026a954db..1394542c7 100644 --- a/setup.py +++ b/setup.py @@ -44,13 +44,7 @@ def read(path, encoding="utf-8"): "Pillow==9.0.1", "sqlalchemy==1.3.20", "sqlalchemy-utils==0.36.6", -<<<<<<< HEAD - "antlr4-python3-runtime==4.10", -======= - "pyspark==3.0.2", - "petastorm==0.11.5", "antlr4-python3-runtime==4.8", ->>>>>>> c832a21359d0515d7a5346bdc03740545f7bd65a "pyyaml==5.1", "importlib-metadata<5.0" ]
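
Taken together, the patches above replace the Petastorm/Spark row store with a thin SQLite-backed engine: table schemas are built dynamically from catalog columns via SQLAlchemy, NDARRAY cells are pickled before insertion, numpy scalars are unwrapped to plain Python types so SQLite will accept them, and dropping a table also detaches it from the in-memory MetaData so the name can be reused. The sketch below is a minimal, self-contained illustration of that pattern; it is not EVA's actual `SQLStorageEngine`, and the table and column names are invented for the example.

```python
import pickle

import numpy as np
from sqlalchemy import Column, Integer, LargeBinary, MetaData, Table, create_engine

engine = create_engine("sqlite:///:memory:")
metadata = MetaData()

# Dynamic schema: in EVA the column list comes from catalog metadata at runtime;
# here it is hard-coded to keep the example small.
frames = Table(
    "frames",
    metadata,
    Column("_row_id", Integer, primary_key=True, autoincrement=True),
    Column("id", Integer),
    Column("data", LargeBinary),  # NDARRAY cells stored as pickled bytes
)
metadata.create_all(engine)


def to_sql_row(row: dict) -> dict:
    """Same idea as _dict_to_sql_row: serialize arrays, unwrap numpy scalars."""
    out = {}
    for name, value in row.items():
        if isinstance(value, np.ndarray):
            out[name] = pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL)
        elif isinstance(value, np.generic):
            out[name] = value.tolist()  # e.g. np.int64 -> int
        else:
            out[name] = value
    return out


with engine.begin() as conn:
    rows = [{"id": np.int64(i), "data": np.ones((2, 2)) * i} for i in range(3)]
    conn.execute(frames.insert(), [to_sql_row(r) for r in rows])
    for row in conn.execute(frames.select()):
        # skip the _row_id primary key, then deserialize the ndarray cell
        print(row[1], pickle.loads(row[2]))

# Dropping mirrors the patch series: drop in the database, then detach the
# table from the in-memory MetaData so it can be re-created later.
frames.drop(engine)
metadata.remove(frames)
```

Pickling each array cell keeps arbitrary shapes and dtypes without extra schema bookkeeping, at the cost of opaque blobs: SQL cannot filter or index on array contents, so any such filtering has to happen after deserialization, outside the storage layer.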