From 0084a04c9da8a05e7735868b4d38889a0895c8f8 Mon Sep 17 00:00:00 2001
From: thomasthaddeus <thaddeus.r.thomas@gmail.com>
Date: Mon, 6 May 2024 01:03:01 -0700
Subject: [PATCH 1/2] connectors updated

---
 src/data_sources/api_connector.py   |   2 +-
 src/data_sources/excel_connector.py |  14 ++-
 src/data_sources/sql_connector.py   | 183 ++++++++++++++++++++++++----
 3 files changed, 168 insertions(+), 31 deletions(-)

diff --git a/src/data_sources/api_connector.py b/src/data_sources/api_connector.py
index c8c459b..8be9cc8 100644
--- a/src/data_sources/api_connector.py
+++ b/src/data_sources/api_connector.py
@@ -1,4 +1,4 @@
-"""api_connector.py
+"""data_sources/api_connector.py
 _summary_
 
 _extended_summary_
diff --git a/src/data_sources/excel_connector.py b/src/data_sources/excel_connector.py
index a8570ab..7e2f03d 100644
--- a/src/data_sources/excel_connector.py
+++ b/src/data_sources/excel_connector.py
@@ -1,4 +1,4 @@
-"""excel_connector.py
+"""data_sources/excel_connector.py
 _summary_
 
 _extended_summary_
@@ -40,8 +40,10 @@ def load_data(self, sheet_name=0, header=0):
         Load data from a specified sheet in the Excel file.
 
         Args:
-            sheet_name (str or int, optional): The name or index of the sheet to read data from. Defaults to the first sheet.
-            header (int, list of int, optional): Row (0-indexed) to use as the header.
+            sheet_name (str or int, optional): The name or index of the sheet
+              to read data from. Defaults to the first sheet.
+            header (int, list of int, optional): Row (0-indexed) to use as the
+              header.
 
         Returns:
             DataFrame: A pandas DataFrame containing the data from the Excel
@@ -52,7 +54,7 @@ def load_data(self, sheet_name=0, header=0):
         try:
             return pd.read_excel(self.file_path, sheet_name=sheet_name, header=header)
         except Exception as e:
-            raise Exception(f"Error reading Excel file: {e}")
+            raise Exception(f"Error reading Excel file: {e}") from e
 
     def load_all_sheets(self):
         """
@@ -66,7 +68,7 @@ def load_all_sheets(self):
         try:
             return pd.read_excel(self.file_path, sheet_name=None)
         except Exception as e:
-            raise Exception(f"Error reading Excel file: {e}")
+            raise Exception(f"Error reading Excel file: {e}") from e
 
     def preview_sheet(self, sheet_name=0, num_rows=5):
         """
@@ -86,4 +88,4 @@ def preview_sheet(self, sheet_name=0, num_rows=5):
         try:
             return pd.read_excel(self.file_path, sheet_name=sheet_name, nrows=num_rows)
         except Exception as e:
-            raise Exception(f"Error previewing Excel file: {e}")
+            raise Exception(f"Error previewing Excel file: {e}") from e
diff --git a/src/data_sources/sql_connector.py b/src/data_sources/sql_connector.py
index 83f7c75..7fbaa84 100644
--- a/src/data_sources/sql_connector.py
+++ b/src/data_sources/sql_connector.py
@@ -1,42 +1,92 @@
-"""sql_connector.py
-_summary_
+"""data_sources/sql_connector.py
 
-_extended_summary_
+This module provides a SQLConnector class for connecting to and interacting
+with a SQL database. The SQLConnector simplifies executing queries and
+performing database operations like inserting and updating data, using
+SQLAlchemy for database connections and pandas for handling query results.
 
 Raises:
-    Exception: _description_
-    Exception: _description_
-    Exception: _description_
+    SQLConnectorError: Raised when a SQL query execution fails.
+    SQLConnectorError: Raised when inserting data into a database fails.
+    SQLConnectorError: Raised when updating data in the database fails.
 
 Returns:
-    _type_: _description_
+    DataFrame: Returns the result of SQL queries as pandas DataFrames.
 
-# Example usage
+Example usage:
 connector = SQLConnector('postgresql://user:password@localhost:5432/mydatabase')
 data = connector.query_data('SELECT * FROM my_table')
 connector.insert_data(df, 'my_table')
 connector.update_data('UPDATE my_table SET column = value WHERE condition')
 """
 
-import pandas as pd
+import logging
 import sqlalchemy
-from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.exc import SQLAlchemyError, OperationalError
+import pandas as pd
+from pandas.errors import DatabaseError
+
+logger = logging.getLogger(__name__)
+
+
+class SQLConnectorError(Exception):
+    """Database failure carrying the offending query and the original error."""
+
+    def __init__(self, original_exception, query=None):
+        self.query = query
+        self.original_exception = original_exception
+        super().__init__(str(self))
+
+    def __str__(self):
+        """Render the query and the wrapped error as a single message."""
+        return f"Error executing query: {self.query}. Error: {self.original_exception}"
+
+
+class WrappedException(Exception):
+    """Exception pairing an underlying error with a context message."""
+    def __init__(self, err, context_message):
+        self.err, self.context_message = err, context_message
+        super().__init__(self.__str__())
+
+    def __str__(self):
+        return f"{self.context_message}. Original Error: {self.err}"
+
 
 class SQLConnector:
     """
-     _summary_
+    Provides methods to connect to a SQL database and execute queries, insert,
+    and update data.
 
-    _extended_summary_
+    This class uses SQLAlchemy to manage database connections and pandas to
+    convert SQL query results into DataFrame objects for easier manipulation
+    and analysis.
     """
+
     def __init__(self, db_uri):
         """
-        Initialize the SQLConnector with the database URI.
+        Initializes a new instance of the SQLConnector class with a specific database URI.
 
         Args:
-            db_uri (str): The database URI.
+            db_uri (str): The connection string for the database, typically
+            including the type of database, username, password, host, and
+            database name.
         """
         self.db_uri = db_uri
         self.engine = sqlalchemy.create_engine(db_uri)
+        self._logger = None
+
+    @property
+    def logger(self):
+        """Logger named after this module, looked up on first access and
+        cached on the instance afterwards."""
+        cached = self._logger
+        if cached is None:
+            cached = logging.getLogger(__name__)
+            self._logger = cached
+        return cached
+
+    def __repr__(self) -> str:
+        return f"SQLConnector(db_uri={self.engine.url!r})"  # URL repr masks the password
 
     def query_data(self, query):
         """
@@ -47,29 +97,38 @@ def query_data(self, query):
 
         Returns:
             DataFrame: The result of the SQL query.
+
+        Raises:
+            SQLConnectorError: An error occurred when querying the database.
         """
+        self.logger.info("Executing query: %s", query)
         try:
             with self.engine.connect() as connection:
-                return pd.read_sql_query(query, connection)
-        except SQLAlchemyError as e:
-            raise Exception(f"Error executing query: {e}")
+                return pd.DataFrame(pd.read_sql_query(query, connection))
+        except (SQLAlchemyError, DatabaseError) as e:
+            self.logger.error("Error executing query: %s. Error: %s", query, str(e))
+            raise SQLConnectorError(e, query) from e
 
-    def insert_data(self, df, table_name, if_exists='append'):
+    def insert_data(self, df, table_name, if_exists="append"):
         """
         Insert data from a DataFrame into a SQL table.
 
         Args:
             df (DataFrame): The DataFrame to insert into the table.
             table_name (str): The name of the target table.
-            if_exists (str): What to do if the table already exists. Options are 'fail', 'replace', 'append'.
+            if_exists (str): What to do if the table already exists. Options
+              are 'fail', 'replace', 'append'.
 
         Returns:
             None
+
+        Raises:
+            SQLConnectorError: Custom error with detailed information.
         """
         try:
             df.to_sql(table_name, self.engine, if_exists=if_exists, index=False)
-        except SQLAlchemyError as e:
-            raise Exception(f"Error inserting data: {e}")
+        except (SQLAlchemyError, DatabaseError) as e:
+            raise SQLConnectorError(e, f"INSERT INTO {table_name}") from e
 
     def update_data(self, query):
         """
@@ -80,11 +139,87 @@ def update_data(self, query):
 
         Returns:
             None
+
+        Raises:
+            SQLConnectorError: Custom error with detailed information.
         """
         try:
             with self.engine.connect() as connection:
                 connection.execute(query)
-        except SQLAlchemyError as e:
-            raise Exception(f"Error updating data: {e}")
+        except (SQLAlchemyError, OperationalError) as e:
+            raise SQLConnectorError(e, query) from e
+
+    def start_transaction(self):
+        """Opens a connection, begins a transaction on it, and returns both
+        as a ``(connection, transaction)`` tuple."""
+        conn = self.engine.connect()
+        return conn, conn.begin()
+
+    def end_transaction(self, transaction, operation="commit"):
+        """Commits if ``operation`` is "commit"; any other value rolls back."""
+        if operation != "commit":
+            transaction.rollback()
+            return
+        transaction.commit()
+
+    def bulk_insert(self, data, table_name):
+        """Bulk-inserts ``data`` by executing ``table_name.insert()`` on the engine."""
+        self.engine.execute(table_name.insert(), data)
+
+    def create_table(self, create_statement):
+        """Creates a table by executing ``create_statement`` on the engine."""
+        self.engine.execute(create_statement)
+
+    def drop_table(self, table_name):
+        """Drops ``table_name`` if it exists; the name is put into the SQL unescaped."""
+        self.engine.execute(f"DROP TABLE IF EXISTS {table_name}")
+
+    def set_pooling_options(self, pool_size=5, max_overflow=10, timeout=30):
+        """Set options for the SQLAlchemy connection pool.
 
-    # Additional methods for other database interactions can be added here.
+        Args:
+            pool_size (int): The size of the pool to be maintained.
+            max_overflow (int): The maximum number of connections to create above `pool_size`.
+            timeout (int): The number of seconds to wait before giving up on returning a connection.
+        """
+        # Dispose the old engine and create a new one with updated settings
+        self.engine.dispose()
+        self.engine = sqlalchemy.create_engine(
+            self.db_uri,
+            pool_size=pool_size,
+            max_overflow=max_overflow,
+            pool_timeout=timeout,
+        )
+
+    def execute_large_query_in_batches(self, query, batch_size=1000):
+        """Execute a large SQL query in batches.
+
+        Args:
+            query (str): The SQL query to execute.
+            batch_size (int): The number of rows to fetch per batch.
+
+        Yields:
+            DataFrame: A batch of the result set, labelled with the result's
+              column names.
+        """
+        with self.engine.connect() as conn:
+            result = conn.execution_options(stream_results=True).execute(query)
+            columns = list(result.keys())  # read once so every batch shares labels
+            while True:
+                batch = result.fetchmany(batch_size)
+                if not batch:
+                    break
+                yield pd.DataFrame(batch, columns=columns)
+
+    def safe_transaction(self):
+        """Provide a context manager for safe transactions.
+
+        Returns:
+            A context manager from ``Engine.begin()``: entering it yields a
+            connection with a transaction already begun; leaving the block
+            commits, or rolls back if it raised, then releases the connection.
+        """
+        # The previous body was a bare generator (no @contextmanager), so it
+        # could not be used in a ``with`` statement. Engine.begin() already
+        # provides the same commit/rollback/close contract.
+        return self.engine.begin()

From 28e624323661353425491ffc14302f7d3bb7397b Mon Sep 17 00:00:00 2001
From: thomasthaddeus <thaddeus.r.thomas@gmail.com>
Date: Mon, 6 May 2024 01:04:19 -0700
Subject: [PATCH 2/2] update config

---
 src/.conf/plot_config.json        |  18 ++++
 src/__init__.py                   |  81 ++++++++++++++++++
 src/data_analysis_toolkit.py      |  21 -----
 src/visualizer/data_visualizer.py | 135 ++++++++++++++++++++++++------
 4 files changed, 210 insertions(+), 45 deletions(-)
 create mode 100644 src/.conf/plot_config.json

diff --git a/src/.conf/plot_config.json b/src/.conf/plot_config.json
new file mode 100644
index 0000000..3a614f3
--- /dev/null
+++ b/src/.conf/plot_config.json
@@ -0,0 +1,18 @@
+{
+    "global_style": {
+        "style": "whitegrid",
+        "context": "talk",
+        "figure.figsize": [12, 8],
+        "axes.titlesize": 20,
+        "axes.labelsize": 18
+    },
+    "boxplot": {
+        "palette": "viridis"
+    },
+    "scatterplot": {
+        "style": "darkgrid"
+    },
+    "heatmap": {
+        "cmap": "coolwarm"
+    }
+}
diff --git a/src/__init__.py b/src/__init__.py
index f213622..58415b8 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -1,2 +1,83 @@
 # __init__.py
+
+# MIT License
+#
+# Copyright (c) 2023 Thaddeus Thomas
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Metadata about the package
+__version__ = '1.1.1'
+__author__ = 'Thaddeus Thomas'
+__email__ = 'thaddeus@vcwtech.com'
+
+import logging
+import sys
+
+# Convenience imports for users
 from .data_analysis_toolkit import DataAnalysisToolkit
+from .utils import DataImputer
+from .model import FeatureEngineer, ModelEvaluator
+from .preprocessor import DataPreprocessor
+from .generators import ReportGenerator
+from .visualizer import DataVisualizer
+
+# Dependency checks (versions are compared numerically, not as strings)
+required_packages = {
+    'pandas': '1.1.5',
+    'matplotlib': '3.3.4',
+    'scipy': '1.6.0',
+    'sklearn': '0.24.1'
+}
+missing_packages = []
+
+for lib, version in required_packages.items():
+    try:
+        pkg = __import__(lib)
+        found = [int(p) for p in pkg.__version__.split('.')[:3] if p.isdigit()]
+        if found + [0] * (3 - len(found)) < [int(p) for p in version.split('.')]:
+            missing_packages.append(f"{lib}>= {version}")
+    except ImportError:
+        missing_packages.append(f"{lib}>= {version}")
+
+if missing_packages:
+    sys.exit("Missing required packages: " + ', '.join(missing_packages))
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+logger.info("Initializing DataAnalysisToolkit package")
+
+# Initialization code that runs on package import, if any
+def _init_package():
+    """Hook for package-wide initialization; currently only logs a debug message."""
+    logger.debug("Package initialized successfully")
+
+_init_package()
+
+# Ensure that this module only exposes the intended public interface
+__all__ = [
+    "DataAnalysisToolkit",
+    "DataImputer",
+    "DataVisualizer",
+    "FeatureEngineer",
+    "ModelEvaluator",
+    "DataPreprocessor",
+    "ReportGenerator"
+]
diff --git a/src/data_analysis_toolkit.py b/src/data_analysis_toolkit.py
index bec05c1..655e54d 100644
--- a/src/data_analysis_toolkit.py
+++ b/src/data_analysis_toolkit.py
@@ -1,24 +1,3 @@
-# MIT License
-#
-# Copyright (c) 2023 Thaddeus Thomas
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
 """data_analysis_toolkit.py
 
 This module contains a class, DataAnalysisToolkit, for performing various data
diff --git a/src/visualizer/data_visualizer.py b/src/visualizer/data_visualizer.py
index d0ef7ce..bb40b24 100644
--- a/src/visualizer/data_visualizer.py
+++ b/src/visualizer/data_visualizer.py
@@ -9,34 +9,105 @@
 visualizations. The class offers a range of methods for different types of
 plots, making it easier to explore and understand data patterns and
 relationships.
+
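+Plot styling comes from a JSON configuration file that DataVisualizer loads
+at construction time and applies through seaborn and matplotlib.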
 """
 
+import json
 import matplotlib.pyplot as plt
 import seaborn as sns
 
 class DataVisualizer:
     """
-    A class for visualizing data in various forms.
+    A class for visualizing data using configurations loaded from a JSON file
+    and customizable themes.
 
     This class provides methods for creating different types of plots, such as
     box plots, scatter plots, heatmaps, histograms, line plots, and pair plots.
-    It helps in exploring data and extracting insights visually.
+    It helps in exploring data and extracting insights visually using a
+    customizable configuration.
 
     Attributes:
         data (DataFrame): The pandas DataFrame from which visualizations will
-        be generated.
+          be generated.
     """
-    def __init__(self, data):
+    def __init__(self, data, config_path):
         """
-        Initializes the DataVisualizer with the dataset.
+        Initializes the DataVisualizer with the dataset and configuration.
 
         Args:
             data (DataFrame): The pandas DataFrame to be used for generating
-            visualizations.
+              visualizations.
+            config_path (str): Path to the JSON configuration file for plot
+              settings.
         """
         self.data = data
+        self.set_theme()
+        self._load_config(config_path)
+
+    def _load_config(self, config_path):
+        """
+        Reads the JSON plot configuration and applies its global style.
+
+        Args:
+            config_path (str): Path to the configuration file.
+        """
+        with open(config_path, encoding='utf-8') as handle:
+            self.config = json.loads(handle.read())
+        self.apply_global_style()
 
-    def boxplot(self, column, by=None):
+    def apply_global_style(self):
+        """
+        Applies the ``global_style`` config section to seaborn and matplotlib.
+        """
+        settings = self.config.get('global_style', {})
+        if 'style' in settings:
+            sns.set_style(settings['style'])
+        if 'context' in settings:
+            sns.set_context(settings['context'])
+        for option, setting in settings.items():
+            if option.startswith(('figure.', 'axes.')):
+                plt.rcParams[option] = setting
+
+    def apply_plot_style(self, plot_type):
+        """
+        Applies the rcParams entries configured for one plot type; keys that
+        are not matplotlib rcParams (e.g. "palette", "cmap") are skipped.
+
+        Args:
+            plot_type (str): Config section name, e.g. "boxplot".
+        """
+        plot_style = self.config.get(plot_type, {})
+        for key, value in plot_style.items():
+            if key in plt.rcParams:  # assigning an unknown key raises KeyError
+                plt.rcParams[key] = value
+
+    def set_theme(
+        self, style="whitegrid", context="talk", figsize=(12, 8),
+        titlesize=20, labelsize=18,
+    ):
+        """
+        Sets the seaborn style/context and the matplotlib figure and axes sizes.
+
+        Args:
+            style (str): Seaborn style name passed to ``sns.set_style``.
+            context (str): Seaborn context passed to ``sns.set_context``.
+            figsize (tuple): Value stored in ``figure.figsize``.
+            titlesize (int): Value stored in ``axes.titlesize``.
+            labelsize (int): Value stored in ``axes.labelsize``.
+        """
+        sns.set_style(style)
+        sns.set_context(context)
+        rc_overrides = {
+            "figure.figsize": figsize,
+            "axes.titlesize": titlesize,
+            "axes.labelsize": labelsize,
+        }
+        for option, setting in rc_overrides.items():
+            plt.rcParams[option] = setting
+
+    def boxplot(self, col, by=None):
         """
         Generates a box plot for a specified column.
 
@@ -52,10 +123,16 @@ def boxplot(self, column, by=None):
         Returns:
             None: This method shows the plot and does not return a value.
         """
-        sns.boxplot(x=by, y=column, data=self.data)
+        self.apply_plot_style("boxplot")
+        # The "boxplot" section of the JSON config is optional: indexing it
+        # directly raised KeyError when it was absent.
+        boxplot_cfg = self.config.get("boxplot", {})
+        sns.boxplot(
+            x=by, y=col, data=self.data, palette=boxplot_cfg.get("palette", "deep")
+        )
         plt.show()
 
-    def scatterplot(self, x_column, y_column):
+    def scatterplot(self, x_col, y_col):
         """
         Generates a scatter plot between two columns.
 
@@ -69,7 +146,8 @@ def scatterplot(self, x_column, y_column):
         Returns:
             None: This method shows the plot and does not return a value.
         """
-        sns.scatterplot(x=x_column, y=y_column, data=self.data)
+        self.apply_plot_style("scatterplot")
+        sns.scatterplot(x=x_col, y=y_col, data=self.data)
         plt.show()
 
     def heatmap(self):
@@ -82,11 +160,12 @@ def heatmap(self):
         Returns:
             None: This method shows the plot and does not return a value.
         """
+        self.apply_plot_style("heatmap")
         correlation = self.data.corr()
         sns.heatmap(correlation, annot=True)
         plt.show()
 
-    def histogram(self, column):
+    def histogram(self, col):
         """
         Generates a histogram for a specified column.
 
@@ -100,10 +179,11 @@ def histogram(self, column):
         Returns:
             None: This method shows the plot and does not return a value.
         """
-        sns.histplot(self.data[column])
+        self.apply_plot_style("histogram")
+        sns.histplot(self.data[col])
         plt.show()
 
-    def lineplot(self, x_column, y_column):
+    def lineplot(self, x_col, y_col):
         """
         Generates a line plot between two columns.
 
@@ -117,7 +197,8 @@ def lineplot(self, x_column, y_column):
         Returns:
             None: This method shows the plot and does not return a value.
         """
-        sns.lineplot(x=x_column, y=y_column, data=self.data)
+        self.apply_plot_style("lineplot")
+        sns.lineplot(x=x_col, y=y_col, data=self.data)
         plt.show()
 
     def pairplot(self):
@@ -130,10 +211,11 @@ def pairplot(self):
         Returns:
             None: This method shows the plot and does not return a value.
         """
+        self.apply_plot_style("pairplot")
         sns.pairplot(self.data)
         plt.show()
 
-    def barplot(self, x_column, y_column):
+    def barplot(self, x_col, y_col):
         """
         Generates a bar plot for two columns.
 
@@ -148,10 +230,11 @@ def barplot(self, x_column, y_column):
         Returns:
             None: This method shows the plot and does not return a value.
         """
-        sns.barplot(x=x_column, y=y_column, data=self.data)
+        self.apply_plot_style("barplot")
+        sns.barplot(x=x_col, y=y_col, data=self.data)
         plt.show()
 
-    def piechart(self, column):
+    def piechart(self, col):
         """
         Generates a pie chart for a specified column.
 
@@ -164,12 +247,13 @@ def piechart(self, column):
         Returns:
             None: This method shows the plot and does not return a value.
         """
-        pie_data = self.data[column].value_counts()
-        plt.pie(pie_data, labels=pie_data.index, autopct='%1.1f%%')
-        plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
+        self.apply_plot_style("piechart")
+        pie_data = self.data[col].value_counts()
+        plt.pie(pie_data, labels=pie_data.index, autopct="%1.1f%%")
+        plt.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle.
         plt.show()
 
-    def violinplot(self, column, by=None):
+    def violinplot(self, col, by=None):
         """
         Generates a violin plot for a specified column.
 
@@ -177,11 +261,14 @@ def violinplot(self, column, by=None):
         different categories.
 
         Args:
-            column (str): The name of the column for which to generate the violin plot.
-            by (str, optional): A column name to group data by. Defaults to None.
+            col (str): The name of the column for which to generate the
+              violin plot.
+            by (str, optional): A column name to group data by. Defaults to
+              None.
 
         Returns:
             None: This method shows the plot and does not return a value.
         """
-        sns.violinplot(x=by, y=column, data=self.data)
+        self.apply_plot_style("violinplot")
+        sns.violinplot(x=by, y=col, data=self.data)
         plt.show()