Merge pull request #37 from LyzrCore/fix/data-analyzr
Fix bugs 6, 9 and 11
gargimaheshwari authored Apr 4, 2024
2 parents 3b8a711 + d89cc86 commit 4ceb4af
Showing 29 changed files with 721 additions and 322 deletions.
2 changes: 1 addition & 1 deletion build/lib/lyzr/base/prompts/analysis_steps_pt.txt
@@ -17,7 +17,7 @@ B. The value of "steps" should be a list of dictionaries. Each dictionary should
2. Task: The task to be performed. The task can be one of the following: "clean_data", "transform", "math_operation", "analysis"
3. Type: The type of task to be performed.
3a. For task "clean_data", following types are available: "convert_to_datetime", "convert_to_numeric", "convert_to_categorical"
3b. For task "transform", following types are available: "one_hot_encode", "ordinal_encode", "scale", "extract_time_period"
3b. For task "transform", following types are available: "one_hot_encode", "ordinal_encode", "scale", "extract_time_period", "select_indices"
3c. For task "math_operation", following types are available: "add", "subtract", "multiply", "divide"
3d. For task "analysis", following types are available: "sortvalues", "filter", "mean", "sum", "cumsum", "groupby", "correlation", "regression", "classification", "clustering", "forecast"
4. Args: The arguments required to perform the task. The arguments should be in the form of a dictionary.
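A "steps" list conforming to the schema in this prompt might look like the sketch below. This is illustrative only: the column names ("order_date", "month", "revenue") and dataframe contents are hypothetical, not taken from the repository.

```python
# Hypothetical "steps" list following the prompt schema above.
# Column names are illustrative, not from the source repository.
steps = [
    {
        "step": 1,
        "task": "clean_data",
        "type": "convert_to_datetime",
        "args": {"columns": ["order_date"]},
    },
    {
        "step": 2,
        "task": "transform",
        "type": "extract_time_period",
        "args": {"columns": ["order_date"], "period_to_extract": "month"},
    },
    {
        "step": 3,
        "task": "analysis",
        "type": "groupby",
        "args": {"columns": ["month"], "agg": "sum", "agg_col": ["revenue"]},
    },
]
```

Each dictionary carries exactly the four keys the prompt requires, and the step numbers start from 1 as specified.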
34 changes: 2 additions & 32 deletions build/lib/lyzr/base/prompts/plotting_steps_pt.txt
@@ -7,34 +7,7 @@ Take a moment to read and understand their insights. Follow their instructions a
Your answer should be in the form of a python JSON object, following the given format:
{schema}

A. The value of "preprocess" should be a dictionary with keys "df_name" and "steps".
The value of "analysis_df" should be the name of the dataframe on which this analysis is to be performed.
The value of "steps" should be a list of dictionaries. Each dictionary should contain the following keys: "step", "task", "type", "args".
The following values are available for these keys. ONLY USE THESE VALUES.
1. Step: A number indicating the order of the step. Numbering should start from 1.
2. Task: The task to be performed. The task can be one of the following: "clean_data", "transform", "math_operation", "analysis"
3. Type: The type of task to be performed.
3a. For task "clean_data", following types are available: "convert_to_datetime", "convert_to_numeric", "convert_to_categorical"
3b. For task "transform", following types are available: "one_hot_encode", "ordinal_encode", "scale", "extract_time_period"
3c. For task "math_operation", following types are available: "add", "subtract", "multiply", "divide"
3d. For task "analysis", following types are available: "sortvalues", "filter", "mean", "sum", "cumsum", "groupby", "correlation", "regression", "classification", "clustering", "forecast"
4. Args: The arguments required to perform the task. The arguments should be in the form of a dictionary.
4a. For task "clean_data" - "columns": list
4b. For task "transform", type "one_hot_encode", "ordinal_encode", and "scale" - "columns": list
4c. For task "transform", type "extract_time_period" - "columns": list, "period_to_extract": Literal["week", "month", "year", "day", "hour", "minute", "second", "weekday"]
4d. For task "transform", type "select_indices" - "columns": list, "indices": list
4e. For task "math_operation" - "columns": list, "result": str (the name of the column to store the result in)
4f. For task "analysis", type "groupby" - "columns": list, "agg": Union[str, list], "agg_col": Optional[list]
4g. For task "analysis", type "sortvalues" - "columns": list, "ascending": Optional[bool]
4h. For task "analysis", type "filter" - "columns": list, "values": list[Any] (the values to compare the columns to), "relations": list[Literal["lessthan", "greaterthan", "lessthanorequalto", "greaterthanorequalto", "equalto", "notequalto", "startswith", "endswith", "contains"]]
4i. For task "analysis", types "mean", "cumsum", and "sum" - "columns": list
4j. For task "analysis", type "correlation" - "columns": list, "method": Optional[Literal["pearson", "kendall", "spearman"]]
4k. For task "analysis", type "regression" - "x": list, "y": list
4l. For task "analysis", type "classification" - "x": list, "y": list
4m. For task "analysis", type "clustering" - "x": list, "y": list
4n. For task "analysis", type "forecast" - "time_column": str, "y_column": str, "end": Optional[str], "steps": Optional[int] # you must pass either "end" - the date until which to forecast or "steps" - the number of steps to forecast

B. The value of "plot" should be a dictionary. It should contain the following keys: "figsize", "subplots", "title", "plots".
A. The value of "plot" should be a dictionary. It should contain the following keys: "figsize", "subplots", "title", "plots".
1. The value of "figsize" should be a tuple of two integers - the width and height of the figure respectively.
2. The value of "subplots" should be a tuple of two integers - the number of rows and columns of the subplot grid respectively.
3. The value of "title" should be a string - the title of the plot.
@@ -60,7 +33,4 @@ You may now begin.

Question: {question}

{df_details}

Insights from Business Analyst:
{guide}
{df_details}
66 changes: 66 additions & 0 deletions build/lib/lyzr/base/prompts/plotting_steps_with_analysis_pt.txt
@@ -0,0 +1,66 @@
You are a Senior Data Scientist. You have been asked a question on a dataframe.
Your job is to make a plot using {plotting_lib} that depicts the answer to the question.

To assist you, a Business Analyst with domain knowledge has given their insights on the best way to go about your task.
Take a moment to read and understand their insights. Follow their instructions as closely as possible.

Your answer should be in the form of a python JSON object, following the given format:
{schema}

A. The value of "preprocess" should be a dictionary with keys "df_name" and "steps".
The value of "analysis_df" should be the name of the dataframe on which this analysis is to be performed.
The value of "steps" should be a list of dictionaries. Each dictionary should contain the following keys: "step", "task", "type", "args".
The following values are available for these keys. ONLY USE THESE VALUES.
1. Step: A number indicating the order of the step. Numbering should start from 1.
2. Task: The task to be performed. The task can be one of the following: "clean_data", "transform", "math_operation", "analysis"
3. Type: The type of task to be performed.
3a. For task "clean_data", following types are available: "convert_to_datetime", "convert_to_numeric", "convert_to_categorical"
3b. For task "transform", following types are available: "one_hot_encode", "ordinal_encode", "scale", "extract_time_period", "select_indices"
3c. For task "math_operation", following types are available: "add", "subtract", "multiply", "divide"
3d. For task "analysis", following types are available: "sortvalues", "filter", "mean", "sum", "cumsum", "groupby", "correlation", "regression", "classification", "clustering", "forecast"
4. Args: The arguments required to perform the task. The arguments should be in the form of a dictionary.
4a. For task "clean_data" - "columns": list
4b. For task "transform", type "one_hot_encode", "ordinal_encode", and "scale" - "columns": list
4c. For task "transform", type "extract_time_period" - "columns": list, "period_to_extract": Literal["week", "month", "year", "day", "hour", "minute", "second", "weekday"]
4d. For task "transform", type "select_indices" - "columns": list, "indices": list
4e. For task "math_operation" - "columns": list, "result": str (the name of the column to store the result in)
4f. For task "analysis", type "groupby" - "columns": list, "agg": Union[str, list], "agg_col": Optional[list]
4g. For task "analysis", type "sortvalues" - "columns": list, "ascending": Optional[bool]
4h. For task "analysis", type "filter" - "columns": list, "values": list[Any] (the values to compare the columns to), "relations": list[Literal["lessthan", "greaterthan", "lessthanorequalto", "greaterthanorequalto", "equalto", "notequalto", "startswith", "endswith", "contains"]]
4i. For task "analysis", types "mean", "cumsum", and "sum" - "columns": list
4j. For task "analysis", type "correlation" - "columns": list, "method": Optional[Literal["pearson", "kendall", "spearman"]]
4k. For task "analysis", type "regression" - "x": list, "y": list
4l. For task "analysis", type "classification" - "x": list, "y": list
4m. For task "analysis", type "clustering" - "x": list, "y": list
4n. For task "analysis", type "forecast" - "time_column": str, "y_column": str, "end": Optional[str], "steps": Optional[int] # you must pass either "end" - the date until which to forecast or "steps" - the number of steps to forecast

B. The value of "plot" should be a dictionary. It should contain the following keys: "figsize", "subplots", "title", "plots".
1. The value of "figsize" should be a tuple of two integers - the width and height of the figure respectively.
2. The value of "subplots" should be a tuple of two integers - the number of rows and columns of the subplot grid respectively.
3. The value of "title" should be a string - the title of the plot.
4. The value of "plots" should be a list of dictionaries. Each dictionary should contain the following keys: "subplot", "plot_type", "x", "y", "args".
4a. The value of "subplot" should be a tuple of two integers - the row and column number of the subplot respectively.
4b. The value of "plot_type" should be a string - the type of plot to be made, for example "line", "bar", "barh", "scatter", "hist".
4c. The value of "x" should be a string - the name of the column to be plotted on the x-axis.
4d. The value of "y" should be a string - the name of the column to be plotted on the y-axis.
4e. For a histogram, omit "x" and "y". Instead use "by", which should be a list of strings - the names of the columns to be plotted.
4f. The value of "args" should be a dictionary - the arguments required to make the plot.
4f1. For "line" plots, the following arguments are available - xlabel: str, ylabel: str, color: str, linestyle: str, etc.
4f2. For "bar" plots, the following arguments are available - xlabel: str, ylabel: str, color: str, stacked: bool, etc.
4f3. For "barh" plots, the following arguments are available - xlabel: str, ylabel: str, color: str, stacked: bool, etc.
4f4. For "scatter" plots, the following arguments are available - xlabel: str, ylabel: str, color: str, marker: str, markersize: float, etc.
4f5. For "hist" plots, the following arguments are available - xlabel: str, color: str, bins: int, stacked: bool, etc.


Do not give any explanations. Only give the python JSON as the answer.
This JSON will be evaluated using the eval() function in python. Ensure that it is in the correct format, and has no syntax errors.

Before beginning, take a deep breath and relax. You are an expert in your field. You have done this many times before.
You may now begin.

Question: {question}

{df_details}

Insights from Business Analyst:
{guide}
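A "plot" dictionary conforming to section B of the prompt above might be sketched as follows. The column names ("month", "sales") and styling values are hypothetical, chosen only to exercise each key the schema names.

```python
# Hypothetical plot specification following section B of the prompt
# above; "month" and "sales" are illustrative column names.
plot_spec = {
    "figsize": (12, 5),
    "subplots": (1, 2),
    "title": "Monthly Sales Overview",
    "plots": [
        {
            "subplot": (1, 1),
            "plot_type": "line",
            "x": "month",
            "y": "sales",
            "args": {"xlabel": "Month", "ylabel": "Sales", "color": "blue"},
        },
        {
            # Histograms omit "x"/"y" and use "by" instead, per rule 4e.
            "subplot": (1, 2),
            "plot_type": "hist",
            "by": ["sales"],
            "args": {"xlabel": "Sales", "bins": 20, "color": "gray"},
        },
    ],
}
```

Note how the histogram entry replaces "x" and "y" with "by", as rule 4e requires.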
36 changes: 30 additions & 6 deletions build/lib/lyzr/data_analyzr/analyzr.py
@@ -1,12 +1,14 @@
# standard library imports
import os
import time
import uuid
import traceback
from typing import Union, Literal, Optional, Any

# third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# local imports
from lyzr.base.prompt import Prompt
@@ -184,6 +186,11 @@ def _set_logger(self, log_level, print_log):
self.logger.setLevel(numeric_level)

if self.logger.hasHandlers():
for handler in self.logger.handlers:
try:
handler.close()
except Exception:
pass
self.logger.handlers.clear()

if print_log:
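The handler-cleanup change in the hunk above can be sketched in isolation. Closing each handler before clearing the list releases resources (such as file descriptors held by a FileHandler) that `handlers.clear()` alone would leak; the function name below is illustrative, not from the repository.

```python
import logging

def reset_logger_handlers(logger: logging.Logger) -> None:
    """Close and remove all handlers, mirroring the pattern in the diff above."""
    if logger.hasHandlers():
        for handler in logger.handlers:
            try:
                handler.close()
            except Exception:
                # A handler that fails to close should not block cleanup.
                pass
        logger.handlers.clear()

# Usage: re-attach fresh handlers after resetting.
log = logging.getLogger("demo")
log.addHandler(logging.StreamHandler())
reset_logger_handlers(log)
```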
@@ -318,14 +325,10 @@ def visualisation(
return self.visualisation_output

if plot_path is None:
plot_path = Path("generated_plots/plot.png").as_posix()
plot_path = Path(f"generated_plots/{str(uuid.uuid4())}.png").as_posix()
else:
plot_path = Path(plot_path).as_posix()

if self.df_dict is None:
self.logger.info("Fetching dataframes from database to make visualization.")
self.df_dict = self.database_connector.fetch_dataframes_dict()

plot_context = plot_context or self.context
self.user_input = user_input or self.user_input
if self.user_input is None:
@@ -346,6 +349,22 @@
self._plot_model.get("name"),
**self._plot_model_kwargs,
)
use_guide = True
plot_df = self.df_dict
if "analysis_output" in self.__dict__ and isinstance(
self.analysis_output, pd.DataFrame
):
use_guide = False
plot_df = {"dataset": self.analysis_output}
elif self.df_dict is None:
self.logger.info("Fetching dataframes from database to make visualization.")
self.df_dict = self.database_connector.fetch_dataframes_dict()
plot_df = self.df_dict
if plot_df is not None:
df_keys = list(plot_df.keys())
for key in df_keys:
k_new = key.lower().replace(" ", "_")
plot_df[k_new] = plot_df.pop(key)

self.visualisation_output = None
self.start_time = time.time()
@@ -355,17 +374,20 @@
plotter = PlotFactory(
plotting_model=self._plot_model,
plotting_model_kwargs=self._plot_model_kwargs,
df_dict=self.df_dict,
df_dict=plot_df,
logger=self.logger,
plot_context=plot_context,
plot_path=plot_path,
use_guide=use_guide,
)
analysis_steps = plotter.get_analysis_steps(self.user_input)
if analysis_steps is not None and "steps" in analysis_steps:
if len(analysis_steps["steps"]) == 0:
self.plot_df = self.df_dict[analysis_steps["df_name"]]
else:
self.plot_df = self.analysis(user_input, "", analysis_steps)
elif not use_guide:
self.plot_df = plot_df[list(plot_df.keys())[0]]
else:
self.logger.info(
"No analysis steps found. Using first dataframe for plotting.\n"
@@ -375,10 +397,12 @@
self.visualisation_output = plotter.get_visualisation(self.plot_df)
return self.visualisation_output
except RecursionError:
plt.close()
raise RecursionError(
"The request could not be completed. Please wait a while and try again."
)
except Exception as e:
plt.close()
if time.time() - self.start_time > 30:
raise TimeoutError(
"The request could not be completed. Please wait a while and try again."
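The dataframe-key normalization loop added in `visualisation` above can be sketched standalone: dictionary keys are lowercased and spaces become underscores, so lookups by normalized name succeed regardless of how the dataframes were originally labeled. The function name is illustrative.

```python
# Sketch of the key-normalization loop in the diff above: keys are
# lowercased and spaces replaced with underscores, mutating in place.
def normalize_df_keys(df_dict: dict) -> dict:
    for key in list(df_dict.keys()):
        df_dict[key.lower().replace(" ", "_")] = df_dict.pop(key)
    return df_dict
```

Snapshotting `list(df_dict.keys())` before the loop matters: popping and re-inserting while iterating the live view would raise a RuntimeError.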
8 changes: 8 additions & 0 deletions build/lib/lyzr/data_analyzr/db_connector.py
@@ -475,6 +475,14 @@ def create_database(self, db_path: str, df_dict: dict[pd.DataFrame]):
try:
self.conn = sqlite3.connect(self.db_path, check_same_thread=False)
for name, df in df_dict.items():
df = df.rename(
columns=dict(
zip(
df.columns,
[col.replace(" ", "_").lower() for col in df.columns],
)
)
)
df.to_sql(name, con=self.conn, index=False, if_exists="replace")
return self.conn
except sqlite3.Error as e:
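The column rename added in `create_database` above can be sketched as a standalone helper: column names are lowercased with spaces replaced by underscores before `to_sql`, so generated SQL need not quote identifiers. The function name is illustrative, not from the repository.

```python
import pandas as pd

# Sketch of the rename logic in the diff above: normalize column
# names before writing the dataframe to SQLite.
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.rename(
        columns={col: col.replace(" ", "_").lower() for col in df.columns}
    )
```

A dict comprehension is equivalent to the `dict(zip(...))` construction in the diff, just more direct.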
6 changes: 4 additions & 2 deletions build/lib/lyzr/data_analyzr/file_utils.py
@@ -1,5 +1,6 @@
# standard library imports
import os
import uuid
from typing import Literal, Union

# third-party imports
@@ -64,11 +65,12 @@ def get_db_details(
if connector is None:
connector = SQLiteConnector()
connector.create_database(
db_path=config.get("db_path", "./sqlite/sqlite.db"), df_dict=df_dict
db_path=config.get("db_path", f"./sqlite/{str(uuid.uuid4())}.db"),
df_dict=df_dict,
)
if isinstance(vector_store_config, dict):
vector_store = ChromaDBVectorStore(
path=vector_store_config["path"],
path=vector_store_config.get("path", f"./chromadb/{str(uuid.uuid4())}"),
remake_store=vector_store_config.get(
"remake_store", vector_store_config.get("remake", True)
),
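The uuid-based defaults introduced above give each run its own SQLite file and vector-store directory, so concurrent analyses no longer clobber the shared `sqlite.db` path. A minimal sketch of the defaulting pattern (the helper name is hypothetical):

```python
import uuid

# Sketch of the defaulting pattern in the diff above: a per-run uuid
# keeps concurrent runs from colliding on one database file.
def default_db_path(config: dict) -> str:
    return config.get("db_path", f"./sqlite/{uuid.uuid4()}.db")
```

An explicit `db_path` in the config still wins; the uuid only fills the gap when none is supplied.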