From 0084a04c9da8a05e7735868b4d38889a0895c8f8 Mon Sep 17 00:00:00 2001 From: thomasthaddeus Date: Mon, 6 May 2024 01:03:01 -0700 Subject: [PATCH 1/2] connectors updated --- src/data_sources/api_connector.py | 2 +- src/data_sources/excel_connector.py | 14 ++- src/data_sources/sql_connector.py | 183 ++++++++++++++++++++++++---- 3 files changed, 168 insertions(+), 31 deletions(-) diff --git a/src/data_sources/api_connector.py b/src/data_sources/api_connector.py index c8c459b..8be9cc8 100644 --- a/src/data_sources/api_connector.py +++ b/src/data_sources/api_connector.py @@ -1,4 +1,4 @@ -"""api_connector.py +"""data_sources/api_connector.py _summary_ _extended_summary_ diff --git a/src/data_sources/excel_connector.py b/src/data_sources/excel_connector.py index a8570ab..7e2f03d 100644 --- a/src/data_sources/excel_connector.py +++ b/src/data_sources/excel_connector.py @@ -1,4 +1,4 @@ -"""excel_connector.py +"""data_sources/excel_connector.py _summary_ _extended_summary_ @@ -40,8 +40,10 @@ def load_data(self, sheet_name=0, header=0): Load data from a specified sheet in the Excel file. Args: - sheet_name (str or int, optional): The name or index of the sheet to read data from. Defaults to the first sheet. - header (int, list of int, optional): Row (0-indexed) to use as the header. + sheet_name (str or int, optional): The name or index of the sheet + to read data from. Defaults to the first sheet. + header (int, list of int, optional): Row (0-indexed) to use as the + header. 
Returns: DataFrame: A pandas DataFrame containing the data from the Excel @@ -52,7 +54,7 @@ def load_data(self, sheet_name=0, header=0): try: return pd.read_excel(self.file_path, sheet_name=sheet_name, header=header) except Exception as e: - raise Exception(f"Error reading Excel file: {e}") + raise Exception(f"Error reading Excel file: {e}") from e def load_all_sheets(self): """ @@ -66,7 +68,7 @@ def load_all_sheets(self): try: return pd.read_excel(self.file_path, sheet_name=None) except Exception as e: - raise Exception(f"Error reading Excel file: {e}") + raise Exception(f"Error reading Excel file: {e}") from e def preview_sheet(self, sheet_name=0, num_rows=5): """ @@ -86,4 +88,4 @@ def preview_sheet(self, sheet_name=0, num_rows=5): try: return pd.read_excel(self.file_path, sheet_name=sheet_name, nrows=num_rows) except Exception as e: - raise Exception(f"Error previewing Excel file: {e}") + raise Exception(f"Error previewing Excel file: {e}") from e diff --git a/src/data_sources/sql_connector.py b/src/data_sources/sql_connector.py index 83f7c75..7fbaa84 100644 --- a/src/data_sources/sql_connector.py +++ b/src/data_sources/sql_connector.py @@ -1,42 +1,92 @@ -"""sql_connector.py -_summary_ +"""data_sources/sql_connector.py -_extended_summary_ +This module provides a SQLConnector class for connecting to and interacting +with a SQL database. The SQLConnector simplifies executing queries and +performing database operations like inserting and updating data, using +SQLAlchemy for database connections and pandas for handling query results. Raises: - Exception: _description_ - Exception: _description_ - Exception: _description_ + Exception: Raised when a SQL query execution fails. + Exception: Raised when inserting data into a database fails. + Exception: Raised when updating data in the database fails. Returns: - _type_: _description_ + DataFrame: Returns the result of SQL queries as pandas DataFrames. 
-# Example usage +Example usage: connector = SQLConnector('postgresql://user:password@localhost:5432/mydatabase') data = connector.query_data('SELECT * FROM my_table') connector.insert_data(df, 'my_table') connector.update_data('UPDATE my_table SET column = value WHERE condition') """ -import pandas as pd +import logging import sqlalchemy -from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.exc import SQLAlchemyError, OperationalError +import pandas as pd +from pandas.errors import DatabaseError + +logger = logging.getLogger(__name__) + + +class SQLConnectorError(Exception): + """Custom exception class that formats the error message along with query details.""" + + def __init__(self, original_exception, query=None): + self.original_exception = original_exception + self.query = query + super().__init__(self.__str__()) + + def __str__(self): + """Return the formatted exception message.""" + return f"Error executing query: {self.query}. Error: {self.original_exception}" + + +class WrappedException(Exception): + def __init__(self, err, context_message): + self.err = err + self.context_message = context_message + super().__init__(str(self)) + + def __str__(self): + return f"{self.context_message}. Original Error: {self.err}" + class SQLConnector: """ - _summary_ + Provides methods to connect to a SQL database and execute queries, insert, + and update data. - _extended_summary_ + This class uses SQLAlchemy to manage database connections and pandas to + convert SQL query results into DataFrame objects for easier manipulation + and analysis. """ + def __init__(self, db_uri): """ - Initialize the SQLConnector with the database URI. + Initializes a new instance of the SQLConnector class with a specific database URI. Args: - db_uri (str): The database URI. + db_uri (str): The connection string for the database, typically + including the type of database, username, password, host, and + database name.
""" self.db_uri = db_uri self.engine = sqlalchemy.create_engine(db_uri) + self._logger = None + + @property + def logger(self): + """ + Returns a logger instance for this class. The logger is created upon + first access. + """ + if self._logger is None: + self._logger = logging.getLogger(__name__) + return self._logger + + def __repr__(self) -> str: + return f"SQLConnector(db_uri={self.db_uri})" def query_data(self, query): """ @@ -47,29 +97,38 @@ def query_data(self, query): Returns: DataFrame: The result of the SQL query. + + Raises: + SQLConnectorError: If the query fails; wraps the underlying SQLAlchemy or pandas error. """ + self.logger.info("Executing query: %s", query) try: with self.engine.connect() as connection: - return pd.read_sql_query(query, connection) - except SQLAlchemyError as e: - raise Exception(f"Error executing query: {e}") + return pd.DataFrame(pd.read_sql_query(query, connection)) + except (SQLAlchemyError, DatabaseError) as e: + self.logger.error("Error executing query: %s. Error: %s", query, str(e)) + raise SQLConnectorError(e, query) from e - def insert_data(self, df, table_name, if_exists='append'): + def insert_data(self, df, table_name, if_exists="append"): """ Insert data from a DataFrame into a SQL table. Args: df (DataFrame): The DataFrame to insert into the table. table_name (str): The name of the target table. - if_exists (str): What to do if the table already exists. Options are 'fail', 'replace', 'append'. + if_exists (str): What to do if the table already exists. Options + are 'fail', 'replace', 'append'. Returns: None + + Raises: + SQLConnectorError: Custom error with detailed information.
""" try: df.to_sql(table_name, self.engine, if_exists=if_exists, index=False) - except SQLAlchemyError as e: - raise Exception(f"Error inserting data: {e}") + except (SQLAlchemyError, DatabaseError) as e: + raise SQLConnectorError(e, f"INSERT INTO {table_name}") from e def update_data(self, query): """ @@ -80,11 +139,87 @@ def update_data(self, query): Returns: None + + Raises: + SQLConnectorError: Custom error with detailed information. """ try: with self.engine.connect() as connection: connection.execute(query) - except SQLAlchemyError as e: - raise Exception(f"Error updating data: {e}") + except (SQLAlchemyError, OperationalError) as e: + raise SQLConnectorError(e, query) from e + + def start_transaction(self): + """Starts a new transaction.""" + connection = self.engine.connect() + transaction = connection.begin() + return connection, transaction + + def end_transaction(self, transaction, operation="commit"): + """Ends the transaction with either a commit or rollback.""" + if operation == "commit": + transaction.commit() + else: + transaction.rollback() + + def bulk_insert(self, data, table_name): + """Inserts data in bulk to the specified table.""" + self.engine.execute(table_name.insert(), data) + + def create_table(self, create_statement): + """Creates a table in the database.""" + self.engine.execute(create_statement) + + def drop_table(self, table_name): + """Drops a table from the database.""" + self.engine.execute(f"DROP TABLE IF EXISTS {table_name}") + + def set_pooling_options(self, pool_size=5, max_overflow=10, timeout=30): + """Set options for the SQLAlchemy connection pool. - # Additional methods for other database interactions can be added here. + Args: + pool_size (int): The size of the pool to be maintained. + max_overflow (int): The maximum number of connections to create above `pool_size`. + timeout (int): The number of seconds to wait before giving up on returning a connection. 
+ """ + # Dispose the old engine and create a new one with updated settings + self.engine.dispose() + self.engine = sqlalchemy.create_engine( + self.db_uri, + pool_size=pool_size, + max_overflow=max_overflow, + pool_timeout=timeout, + ) + + def execute_large_query_in_batches(self, query, batch_size=1000): + """Execute a large SQL query in batches. + + Args: + query (str): The SQL query to execute. + batch_size (int): The number of rows to fetch per batch. + + Yields: + DataFrame: A batch of the result set as a pandas DataFrame. + """ + with self.engine.connect() as conn: + result_proxy = conn.execution_options(stream_results=True).execute( + query + ) + while True: + batch = result_proxy.fetchmany(batch_size) + if not batch: + break + yield pd.DataFrame(batch) + + def safe_transaction(self): + """Provide a context manager for safe transactions.""" + connection = self.engine.connect() + transaction = connection.begin() + try: + yield connection + transaction.commit() + except: + transaction.rollback() + raise + finally: + connection.close() From 28e624323661353425491ffc14302f7d3bb7397b Mon Sep 17 00:00:00 2001 From: thomasthaddeus Date: Mon, 6 May 2024 01:04:19 -0700 Subject: [PATCH 2/2] update config --- src/.conf/plot_config.json | 18 ++++ src/__init__.py | 81 ++++++++++++++++++ src/data_analysis_toolkit.py | 21 ----- src/visualizer/data_visualizer.py | 135 ++++++++++++++++++++++++------ 4 files changed, 210 insertions(+), 45 deletions(-) create mode 100644 src/.conf/plot_config.json diff --git a/src/.conf/plot_config.json b/src/.conf/plot_config.json new file mode 100644 index 0000000..3a614f3 --- /dev/null +++ b/src/.conf/plot_config.json @@ -0,0 +1,18 @@ +{ + "global_style": { + "style": "whitegrid", + "context": "talk", + "figure.figsize": [12, 8], + "axes.titlesize": 20, + "axes.labelsize": 18 + }, + "boxplot": { + "palette": "viridis" + }, + "scatterplot": { + "style": "darkgrid" + }, + "heatmap": { + "cmap": "coolwarm" + } +} diff --git a/src/__init__.py 
b/src/__init__.py index f213622..58415b8 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,2 +1,83 @@ # __init__.py + +# MIT License +# +# Copyright (c) 2023 Thaddeus Thomas +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# Metadata about the package +__version__ = '1.1.1' +__author__ = 'Thaddeus Thomas' +__email__ = 'thaddeus@vcwtech.com' + +import logging +import sys + +# Convenience imports for users from .data_analysis_toolkit import DataAnalysisToolkit +from .utils import DataImputer +from .model import FeatureEngineer, ModelEvaluator +from .preprocessor import DataPreprocessor +from .generators import ReportGenerator +from .visualizer import DataVisualizer + +# Dependency checks +required_packages = { + 'pandas': '1.1.5', + 'matplotlib': '3.3.4', + 'scipy': '1.6.0', + 'sklearn': '0.24.1' +} + +missing_packages = [] + +for lib, version in required_packages.items(): + try: + pkg = __import__(lib) + if pkg.__version__ < version: + missing_packages.append(f"{lib}>= {version}") + except ImportError: + missing_packages.append(f"{lib}>= {version}") + +if missing_packages: + sys.exit("Missing required packages: " + ', '.join(missing_packages)) + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +logger.info("Initializing DataAnalysisToolkit package") + +# Initialization code that runs on package import, if any +def _init_package(): + # Put any package-wide initialization logic here + logger.debug("Package initialized successfully") + +_init_package() + +# Ensure that this module only exposes the intended public interface +__all__ = [ + "DataAnalysisToolkit", + "DataImputer", + "DataVisualizer", + "FeatureEngineer", + "ModelEvaluator", + "DataPreprocessor", + "ReportGenerator" +] diff --git a/src/data_analysis_toolkit.py b/src/data_analysis_toolkit.py index bec05c1..655e54d 100644 --- a/src/data_analysis_toolkit.py +++ b/src/data_analysis_toolkit.py @@ -1,24 +1,3 @@ -# MIT License -# -# Copyright (c) 2023 Thaddeus Thomas -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including 
without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - """data_analysis_toolkit.py This module contains a class, DataAnalysisToolkit, for performing various data diff --git a/src/visualizer/data_visualizer.py b/src/visualizer/data_visualizer.py index d0ef7ce..bb40b24 100644 --- a/src/visualizer/data_visualizer.py +++ b/src/visualizer/data_visualizer.py @@ -9,34 +9,105 @@ visualizations. The class offers a range of methods for different types of plots, making it easier to explore and understand data patterns and relationships. + +Classes: + DataVisualizer: Builds seaborn/matplotlib plots styled from a JSON config file. """ +import json import matplotlib.pyplot as plt import seaborn as sns class DataVisualizer: """ - A class for visualizing data in various forms. + A class for visualizing data using configurations loaded from a JSON file + and customizable themes. This class provides methods for creating different types of plots, such as box plots, scatter plots, heatmaps, histograms, line plots, and pair plots. - It helps in exploring data and extracting insights visually. + It helps in exploring data and extracting insights visually using a + customizable configuration.
Attributes: data (DataFrame): The pandas DataFrame from which visualizations will - be generated. + be generated. """ - def __init__(self, data): + def __init__(self, data, config_path): """ - Initializes the DataVisualizer with the dataset. + Initializes the DataVisualizer with the dataset and configuration. Args: data (DataFrame): The pandas DataFrame to be used for generating - visualizations. + visualizations. + config_path (str): Path to the JSON configuration file for plot + settings. """ self.data = data + self.set_theme() + self._load_config(config_path) + + def _load_config(self, config_path): + """ + Loads the plot configuration from a JSON file. + + Args: + config_path (str): Path to the configuration file. + """ + with open(config_path, 'r', encoding='utf-8') as file: + self.config = json.load(file) + self.apply_global_style() - def boxplot(self, column, by=None): + def apply_global_style(self): + """ + Applies the global style settings from the configuration. + """ + global_style = self.config.get('global_style', {}) + if 'style' in global_style: + sns.set_style(global_style['style']) + if 'context' in global_style: + sns.set_context(global_style['context']) + for key, value in global_style.items(): + if key.startswith('figure.') or key.startswith('axes.'): + plt.rcParams[key] = value + + def apply_plot_style(self, plot_type): + """ + Applies specific plot style settings, merging them with the global + style. + + Args: + plot_type (str): The type of plot for which to apply specific + styles. + """ + plot_style = self.config.get(plot_type, {}) + for key, value in plot_style.items(): + plt.rcParams[key] = value + + def set_theme( + self, + style="whitegrid", + context="talk", + figsize=(12,8), + titlesize=20, + labelsize=18, + ): + """ + Configures the visual theme for all plots using matplotlib and seaborn. + + Args: + style (str): The base style of the plots. + context (str): The context theme of seaborn. 
+ figsize (list): Size of the figures in inches. + titlesize (int): Size of the titles. + labelsize (int): Size of the labels. + """ + sns.set_style(style) + sns.set_context(context) + plt.rcParams["figure.figsize"] = figsize + plt.rcParams["axes.titlesize"] = titlesize + plt.rcParams["axes.labelsize"] = labelsize + + def boxplot(self, col, by=None): """ Generates a box plot for a specified column. @@ -52,10 +123,16 @@ def boxplot(self, column, by=None): Returns: None: This method shows the plot and does not return a value. """ - sns.boxplot(x=by, y=column, data=self.data) + self.apply_plot_style("boxplot") + sns.boxplot( + x=by, + y=col, + data=self.data, + palette=self.config["boxplot"].get("palette", "deep"), + ) plt.show() - def scatterplot(self, x_column, y_column): + def scatterplot(self, x_col, y_col): """ Generates a scatter plot between two columns. @@ -69,7 +146,8 @@ def scatterplot(self, x_column, y_column): Returns: None: This method shows the plot and does not return a value. """ - sns.scatterplot(x=x_column, y=y_column, data=self.data) + self.apply_plot_style("scatterplot") + sns.scatterplot(x=x_col, y=y_col, data=self.data) plt.show() def heatmap(self): @@ -82,11 +160,12 @@ def heatmap(self): Returns: None: This method shows the plot and does not return a value. """ + self.apply_plot_style("heatmap") correlation = self.data.corr() sns.heatmap(correlation, annot=True) plt.show() - def histogram(self, column): + def histogram(self, col): """ Generates a histogram for a specified column. @@ -100,10 +179,11 @@ def histogram(self, column): Returns: None: This method shows the plot and does not return a value. """ - sns.histplot(self.data[column]) + self.apply_plot_style("histogram") + sns.histplot(self.data[col]) plt.show() - def lineplot(self, x_column, y_column): + def lineplot(self, x_col, y_col): """ Generates a line plot between two columns. 
@@ -117,7 +197,8 @@ def lineplot(self, x_column, y_column): Returns: None: This method shows the plot and does not return a value. """ - sns.lineplot(x=x_column, y=y_column, data=self.data) + self.apply_plot_style("lineplot") + sns.lineplot(x=x_col, y=y_col, data=self.data) plt.show() def pairplot(self): @@ -130,10 +211,11 @@ def pairplot(self): Returns: None: This method shows the plot and does not return a value. """ + self.apply_plot_style("pairplot") sns.pairplot(self.data) plt.show() - def barplot(self, x_column, y_column): + def barplot(self, x_col, y_col): """ Generates a bar plot for two columns. @@ -148,10 +230,11 @@ def barplot(self, x_column, y_column): Returns: None: This method shows the plot and does not return a value. """ - sns.barplot(x=x_column, y=y_column, data=self.data) + self.apply_plot_style("barplot") + sns.barplot(x=x_col, y=y_col, data=self.data) plt.show() - def piechart(self, column): + def piechart(self, col): """ Generates a pie chart for a specified column. @@ -164,12 +247,13 @@ def piechart(self, column): Returns: None: This method shows the plot and does not return a value. """ - pie_data = self.data[column].value_counts() - plt.pie(pie_data, labels=pie_data.index, autopct='%1.1f%%') - plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle. + self.apply_plot_style("piechart") + pie_data = self.data[col].value_counts() + plt.pie(pie_data, labels=pie_data.index, autopct="%1.1f%%") + plt.axis("equal") # Equal aspect ratio ensures that pie is drawn as a circle. plt.show() - def violinplot(self, column, by=None): + def violinplot(self, col, by=None): """ Generates a violin plot for a specified column. @@ -177,11 +261,14 @@ def violinplot(self, column, by=None): different categories. Args: - column (str): The name of the column for which to generate the violin plot. - by (str, optional): A column name to group data by. Defaults to None. + col (str): The name of the column for which to generate the + violin plot.
+ by (str, optional): A column name to group data by. Defaults to + None. Returns: None: This method shows the plot and does not return a value. """ - sns.violinplot(x=by, y=column, data=self.data) + self.apply_plot_style("violinplot") + sns.violinplot(x=by, y=col, data=self.data) plt.show()