Merge pull request #45 from LyzrCore/imp/data-analyzr

Changes for litellm upgrade
LyzrCore · Jul 10, 2024 · f2065ec · f2065ec
2 parents cc86481 + c93beb0
commit f2065ec
Show file tree

Hide file tree

Showing 23 changed files with 133 additions and 157 deletions.
diff --git a/build/lib/lyzr/base/prompt_texts.py b/build/lib/lyzr/base/prompt_texts.py
@@ -8,22 +8,12 @@
         },
         "user": {"inputs": "{df_details}"},
     },
-    "ml_analysis_guide": {
-        "system": {
-            "context": "You are Business Analyst. You are an expert in your field. You are assisting a data analyst.\nYou are given a dataset and a question. Your job is to analyze these two inputs and determine how to answer the question based on the data.\n\n",
-            "external_context": "{context}",
-            "task": "You must determine what type of analysis should be performed on the dataset in order to answer the question.\nYou should then list out the steps that the data analyst should take to perform the analysis.\nLimit your total response to 100 words.\nYou should address the data analyst directly.",
-            "doc_addition_text": "You may use the following documentation to understand the schema of the data:\n{doc}\n",
-        },
-        "user": {"inputs": "{df_details}\nQuestion: {question}"},
-    },
     "analysis_code": {
         "system": {
             "context": "You are an Expert DATA ANALYST and PYTHON CODER. Your task is to RESPOND with precise Python code based on the questions provided by the user.\n\n",
             "external_context": "{context}",
             "task": "Please follow these steps:\n1. READ the user's question CAREFULLY to understand what Python code is being requested.\n2. WRITE the Python code that directly answers the user's question.\n3. ENSURE that your response contains ONLY the Python code without any additional explanations or comments.\n4. VERIFY that your Python code is SYNTACTICALLY CORRECT and adheres to standard Pythonic practices.\n5. You code must SAVE the result to `result`.\n6. Whenever possible your code should OUTPUT a pandas dataframe.\n7. You may use triple backticks ``` before and after the code block.\n8. Do NOT add comments your code.\n\n",
             "closing": "You MUST provide clean and efficient Python code as a response, and remember, I'm going to tip $300K for a BETTER SOLUTION!\n\nNow Take a Deep Breath.\n\n",
-            "guide": "To assist you, a Business Analyst with domain knowledge has given their insights on the best way to go about your task.\nFollow their instructions as closely as possible.\n{guide}\n\n",
             "doc_addition_text": "You may use the following documentation to understand the schema of the data:\n{doc}\n",
             "history": "Also use responses to past questions to guide you.\n\n",
             "locals": "The following local environment variables are available to you:\n{locals}\n\n",
@@ -48,10 +38,10 @@
         "system": {
             "context": "You are an Expert DATA ANALYST and COMMUNICATOR. Your task is to INTERPRET complex analytics results and TRANSLATE them into SIMPLE, UNDERSTANDABLE insights for business users and data analysts.\n\n",
             "external_context": "{context}",
-            "task": "Proceed with the following steps:\n\n1. ANALYZE the user query, the analysis guide, and the analysis output to fully comprehend the results derived from the initial dataset.\n2. SIMPLIFY the findings by creating clear explanations that resonate with both business users and data analysts, ensuring that you use plain language.\n3. ACCURATELY ROUND all relevant numbers to TWO DECIMAL PLACES to complement the analysis output.\n4. RANK your insights based on their significance and SHARE only the top {n_insights}.\n5. FORMAT these insights as BULLET POINTS for clarity and succinctness.\n\nYou MUST adhere to these guidelines:\n\n- Present ONLY THE LIST of insights without titles or additional information.\n- Ensure that each insight is DIRECTLY TIED to a corresponding data point from the analysis output.\n\nI’m going to tip $300K for a BETTER SOLUTION!\n\nTake a Deep Breath.",
+            "task": "Proceed with the following steps:\n\n1. ANALYZE the user query, the analysis code, and the analysis output to fully comprehend the results derived from the initial dataset.\n2. SIMPLIFY the findings by creating clear explanations that resonate with both business users and data analysts, ensuring that you use plain language.\n3. ACCURATELY ROUND all relevant numbers to TWO DECIMAL PLACES to complement the analysis output.\n4. RANK your insights based on their significance and SHARE only the top {n_insights}.\n5. FORMAT these insights as BULLET POINTS for clarity and succinctness.\n\nYou MUST adhere to these guidelines:\n\n- Present ONLY THE LIST of insights without titles or additional information.\n- Ensure that each insight is DIRECTLY TIED to a corresponding data point from the analysis output.\n\nI’m going to tip $300K for a BETTER SOLUTION!\n\nTake a Deep Breath.",
         },
         "user": {
-            "inputs": "Today is {date}.\nuser query: {user_input}\nanalysis guide:\n{analysis_guide}\n\nanalysis output:\n{analysis_output}"
+            "inputs": "Today is {date}.\nuser query: {user_input}\nanalysis code:\n{analysis_code}\n\nanalysis output:\n{analysis_output}"
         },
     },
     "recommendations": {
@@ -84,6 +74,7 @@
             "sql_plot": "Please follow these steps:\n1. READ the user's question CAREFULLY.\n2. UNDERSTAND what plot can be generated to answer the question.\n3. If needed, USE the 'conn' object to query the database with `pd.read_sql('SQL query here', conn.conn)`.\n4. WRITE the Python code that makes a figure `fig` with this plot.\n5. ENSURE that your response contains ONLY the code without any additional explanations or comments.\n4. VERIFY that your code is SYNTACTICALLY CORRECT and adheres to standard practices.\n5. You code must SAVE THE PLOT to `fig`.\n6. You may use triple backticks ``` before and after the code block.\n7. Do NOT add comments to your code.\n\nYou MUST provide clean and efficient code as a response, and remember, I'm going to tip $300K for a BETTER SOLUTION!\n\nNow Take a Deep Breath.\n\n",
             "python_plot": "Please follow these steps:\n1. READ the user's question CAREFULLY.\n2. UNDERSTAND what plot can be generated to answer the question.\n3. WRITE the Python code that makes a figure `fig` with this plot.\n3. ENSURE that your response contains ONLY the Python code without any additional explanations or comments.\n4. VERIFY that your Python code is SYNTACTICALLY CORRECT and adheres to standard Pythonic practices.\n5. You code must SAVE THE PLOT to `fig`.\n6. You may use triple backticks ``` before and after the code block.\n7. Do NOT add comments to your code.\n\nYou MUST provide clean and efficient Python code as a response, and remember, I'm going to tip $300K for a BETTER SOLUTION!\n\nNow Take a Deep Breath.\n\n",
             "doc_addition_text": "You may use the following documentation to understand the schema of the {db_type}:\n{doc}\n",
+            "python_examples_text": "You may use the following examples to guide you:\n{python_examples}\n",
             "sql_examples_text": "You may use the following examples to guide you:\n{sql_examples}\n",
             "history": "Also use responses to past questions to guide you.",
             "locals": "The following local environment variables are available to you:\n{locals}\n",

diff --git a/build/lib/lyzr/data_analyzr/analysis_handler/plotter.py b/build/lib/lyzr/data_analyzr/analysis_handler/plotter.py
@@ -77,9 +77,15 @@ class PlotFactory(FactoryBaseClass):
         _add_sql_examples(user_input: str, system_message_sections: list, system_message_dict: dict):
             Add SQL examples to the system message sections and dictionary based on user input.
 
+        _add_python_examples(user_input: str, system_message_sections: list, system_message_dict: dict):
+            Add Python examples to the system message sections and dictionary based on user input.
+
         extract_and_execute_code(llm_response: str):
             Executes the plotting code extracted from the provided LLM response.
 
+        code_cleaner(code: str) -> str:
+            Handler for cleaning the code by removing print statements and plt.show() calls.
+
         save_plot_image() -> str:
             Saves the current plot to a file specified by `self.plot_path`.
 
@@ -307,6 +313,11 @@ def _get_message_sections_and_dict(self, user_input: str) -> tuple[list, dict]:
                 system_message_sections.append("doc_addition_text")
                 system_message_dict["doc"] = doc_str
                 system_message_dict["db_type"] = "dataframe(s)"
+            system_message_sections, system_message_dict = self._add_python_examples(
+                user_input=user_input,
+                system_message_sections=system_message_sections,
+                system_message_dict=system_message_dict,
+            )
         system_message_dict["locals"] = make_locals_string(self.locals_)
         return system_message_sections, system_message_dict
 
@@ -408,6 +419,37 @@ def _add_sql_examples(
             system_message_dict["sql_examples"] = sql_examples_str
         return system_message_sections, system_message_dict
 
+    def _add_python_examples(
+        self, user_input: str, system_message_sections: list, system_message_dict: dict
+    ):
+        """
+        Add Python examples to the system message sections and dictionary based on user input.
+
+        Args:
+            user_input (str): The input provided by the user.
+            system_message_sections (list): A list of sections in the system message.
+            system_message_dict (dict): A dictionary containing the system message content.
+
+        Returns:
+            tuple: Updated `system_message_sections` and `system_message_dict` with Python examples included if any were found.
+
+        Procedure:
+            - Retrieve Python code examples similar to the user's input from the vector store.
+            - If any examples are found, append them to the system message sections and format them into a string.
+            - Add the formatted string to the system message dictionary under the key "python_examples".
+            - Return the updated system message sections and dictionary.
+        """
+        python_examples = self.vector_store.get_related_python_code(user_input)
+        if len(python_examples) > 0:
+            system_message_sections.append("python_examples_text")
+            python_examples_str = ""
+            for example in python_examples:
+                if example is not None:
+                    if "question" in example and "python_code" in example:
+                        python_examples_str += f"Question: {example['question']}\nAnalysis Code:\n{example['python_code']}\n\n"
+            system_message_dict["python_examples"] = python_examples_str
+        return system_message_sections, system_message_dict
+
     def extract_and_execute_code(self, llm_response: str):
         """
         Executes the plotting code extracted from the provided LLM response.
@@ -459,6 +501,7 @@ def extract_and_execute_code(self, llm_response: str):
         return self.locals_["fig"]
 
     def code_cleaner(self, code: str) -> str:
+        """Handler for cleaning the extracted code before execution."""
         return remove_print_and_plt_show(code)
 
     def save_plot_image(self) -> str:

diff --git a/build/lib/lyzr/data_analyzr/analysis_handler/pythonic.py b/build/lib/lyzr/data_analyzr/analysis_handler/pythonic.py
@@ -43,16 +43,22 @@ class PythonicAnalysisFactory(FactoryBaseClass):
     Methods:
         __init__(llm, logger, context, df_dict, vector_store, max_retries=None, time_limit=None, auto_train=None, **llm_kwargs):
             Initializes a PythonicAnalysisFactory instance.
+
         generate_output(user_input, **kwargs):
             Runs analysis and generates output based on the provided user input.
+
         get_prompt_messages(user_input):
             Generates a list of prompt messages based on the user's input.
-        get_analysis_guide(user_input):
-            Generates an analysis guide based on the user's input.
+
         _get_locals_and_docs(system_message_sections, system_message_dict, user_input):
             Retrieves local variables and related documentation based on user input.
+
         extract_and_execute_code(llm_response):
             Extracts Python code from a given LLM response, processes it, and executes it within a controlled environment.
+
+        code_cleaner(code) -> str:
+            Handler for cleaning the extracted code.
+
         auto_train(user_input, code, **kwargs):
             Adds the user input and generated Python code to the vector store if the auto_train flag is set.
     """
@@ -171,7 +177,7 @@ def get_prompt_messages(self, user_input: str) -> list:
 
         This method constructs a series of messages to be used with the LLM.
         - Incorporates context and examples relevant to the user's input.
-        - List of messages includes system messages with context, guides, local variables,
+        - List of messages includes system messages with context, local variables,
         documentation, and historical examples, followed by the user's input.
 
         Args:
@@ -187,11 +193,6 @@ def get_prompt_messages(self, user_input: str) -> list:
             "closing",
         ]
         system_message_dict = {"context": self.context}
-        # add analysis guide
-        self.guide = self.get_analysis_guide(user_input)
-        if self.guide is not None and self.guide != "":
-            system_message_sections.append("guide")
-            system_message_dict["guide"] = self.guide
         # add locals and docs
         system_message_sections, system_message_dict = self._get_locals_and_docs(
             system_message_sections=system_message_sections,
@@ -219,43 +220,6 @@ def get_prompt_messages(self, user_input: str) -> list:
         messages.append(UserMessage(content=user_input))
         return messages
 
-    def get_analysis_guide(self, user_input: str) -> str:
-        """
-        Generate an analysis guide based on the user's input.
-
-        Args:
-            user_input (str): The input provided by the user for which the analysis guide is to be generated.
-
-        Returns:
-            str: The content of the analysis guide generated by the language model.
-
-        Procedure:
-            - Define system message sections and a dictionary to store system message format strings.
-            - Retrieve related documentation from the vector store based on the user input.
-            - Add the documentation to the system message dictionary.
-            - Generate an analysis guide using the llm based on the system message sections and dictionary.
-        """
-        system_message_sections = [
-            "context",
-            "external_context",
-            "task",
-        ]
-        system_message_dict = {"context": self.context}
-        doc_list = self.vector_store.get_related_documentation(user_input)
-        if len(doc_list) > 0:
-            system_message_sections.append("doc_addition_text")
-            system_message_dict["doc"] = ""
-            for doc_item in doc_list:
-                system_message_dict["doc"] += f"{doc_item}\n"
-        messages = [
-            LyzrPromptFactory("ml_analysis_guide", "system").get_message(
-                use_sections=system_message_sections,
-                **system_message_dict,
-            ),
-        ]
-        llm_response = self.llm.run(messages=messages)
-        return llm_response.message.content.strip()
-
     def _get_locals_and_docs(
         self, system_message_sections: list, system_message_dict: dict, user_input: str
     ) -> tuple[list, dict]:
@@ -345,6 +309,7 @@ def extract_and_execute_code(self, llm_response: str):
         return self.locals_["result"]
 
     def code_cleaner(self, code: str) -> str:
+        """Handler for cleaning the extracted code before execution."""
         return remove_print_and_plt_show(code)
 
     def auto_train(self, user_input: str, code: str, **kwargs):

diff --git a/build/lib/lyzr/data_analyzr/analysis_handler/sql.py b/build/lib/lyzr/data_analyzr/analysis_handler/sql.py
@@ -49,6 +49,9 @@ class TxttoSQLFactory(FactoryBaseClass):
         extract_and_execute_code(llm_response: str):
             Extracts an SQL query from the given LLM response and executes it.
 
+        code_cleaner(code) -> str:
+            Handler for cleaning the extracted code before execution.
+
         _handle_create_table_sql(sql_query: str):
             Handles the execution of a SQL query when table creation is involved.
 
@@ -272,10 +275,10 @@ def extract_and_execute_code(self, llm_response: str):
         else:
             analysis_output = self.connector.run_sql(sql_query)
         self.code = sql_query
-        self.guide = sql_query
         return analysis_output
 
     def code_cleaner(self, code) -> str:
+        """Handler for cleaning the extracted code before execution."""
         return code
 
     def _handle_create_table_sql(self, sql_query: str):

diff --git a/build/lib/lyzr/data_analyzr/analysis_handler/utils.py b/build/lib/lyzr/data_analyzr/analysis_handler/utils.py
@@ -8,6 +8,7 @@
 import time
 import string
 import logging
+import warnings
 import traceback
 from pathlib import Path
 from typing import Any, Sequence, Union
@@ -402,6 +403,7 @@ def process_llm_response(llm_response, **kwargs):
 
     def decorator_wrapper(func):
         def wrapped_func(**kwargs):
+            warnings.filterwarnings("ignore")
             result = None
             start_time = time.time()
             logger.info(

diff --git a/build/lib/lyzr/data_analyzr/analyzr.py b/build/lib/lyzr/data_analyzr/analyzr.py
@@ -35,7 +35,6 @@ class DataAnalyzr:
         analysis_llm (LiteLLM): LLM instance for performing analysis.
         context (ContextDict): Context for analysis and response generation.
         logger (logging.Logger): Logger instance for logging messages.
-        analysis_guide (str): The guide for the analysis process.
         analysis_code (str): The code generated for the analysis.
         analysis_output (Union[str, pd.DataFrame, dict[str, pd.DataFrame], None]): The output of the analysis process.
         plot_code (str): The code generated for the visualization.
@@ -171,15 +170,14 @@ def __init__(
             self.database_connector,
             self.vector_store,
             self.analysis_code,
-            self.analysis_guide,
             self.analysis_output,
             self.plot_code,
             self.plot_output,
             self.insights_output,
             self.recommendations_output,
             self.tasks_output,
             self.ai_queries_output,
-        ) = (None,) * 12
+        ) = (None,) * 11
 
         from lyzr.data_analyzr.utils import logging_decorator
 
@@ -252,7 +250,7 @@ def analysis(
         Perform an analysis based on the provided user input and analysis parameters.
 
         This method determines the type of analysis to be performed (SQL, or Pythonic) and executes it.
-        If the analysis type is set to skip, it sets the analysis guide to "No analysis performed." and returns None.
+        If the analysis type is set to skip, it sets the analysis code and analysis output to None and returns None.
 
         Args:
             user_input (str): The input string provided by the user for analysis.
@@ -276,7 +274,6 @@ def analysis(
         """
         if self.analysis_type is AnalysisTypes.skip:
             self.logger.info("No analysis performed.")
-            self.analysis_guide = "No analysis performed."
             self.analysis_output = None
             self.analysis_code = None
             return self.analysis_output
@@ -299,7 +296,6 @@ def analysis(
             **analyser_args,
         )
         self.analysis_output = analyser.generate_output(user_input)
-        self.analysis_guide = analyser.guide
         self.analysis_code = analyser.code
         return self.analysis_output
 
@@ -394,16 +390,16 @@ def insights(
 
         if insights_context is None:
             insights_context = ""
-        if not hasattr(self, "analysis_guide") or self.analysis_guide is None:
-            self.analysis_guide = ""
+        if not hasattr(self, "analysis_code") or self.analysis_code is None:
+            self.analysis_code = ""
         self.insights_output = self.generator_llm.run(
             messages=[
                 LyzrPromptFactory(name="insights", prompt_type="system").get_message(
                     context=insights_context, n_insights=n_insights
                 ),
                 LyzrPromptFactory(name="insights", prompt_type="user").get_message(
                     user_input=user_input,
-                    analysis_guide=self.analysis_guide,
+                    analysis_code=self.analysis_code,
                     analysis_output=(
                         format_analysis_output(output_df=self.analysis_output)
                         if self.analysis_output is not None