fix: staticmethod LyzrLLMFactory and recommendations prompt

LyzrCore · Apr 18, 2024 · 9fd078b
1 parent 5847686
commit 9fd078b
Show file tree

Hide file tree

Showing 15 changed files with 36 additions and 34 deletions.
diff --git a/build/lib/lyzr/base/file_utils.py b/build/lib/lyzr/base/file_utils.py
@@ -57,7 +57,7 @@ def describe_dataset(
         raise ValueError("Please provide a valid pandas DataFrame.")
 
     if model is None:
-        model = LyzrLLMFactory(
+        model = LyzrLLMFactory.from_defaults(
             api_key=api_key,
             api_type=model_type,
             model=model_name,

diff --git a/build/lib/lyzr/base/llm.py b/build/lib/lyzr/base/llm.py
@@ -15,8 +15,8 @@
 
 
 class LyzrLLMFactory:
-
-    def from_defaults(self, model: str = DEFAULT_LLM, **kwargs) -> LLM:
+    @staticmethod
+    def from_defaults(model: str = DEFAULT_LLM, **kwargs) -> LLM:
         # model_type -> api_type
         # model_name -> model
         # model_prompts -> Sequence[ChatMessage]

diff --git a/build/lib/lyzr/base/prompt_texts.py b/build/lib/lyzr/base/prompt_texts.py
@@ -29,15 +29,15 @@
             "inputs": "The user asked the following question: {user_input}\nGenerate recommendations that enhance the user's question or are related to it."
         }
     },
-    "analysis_guide": {
+    "ml_analysis_guide": {
         "system": {
             "context": "You are Business Analyst. You are an expert in your field. You are assisting a data analyst.\nYou are given a dataset and a question. Your job is to analyze these two inputs and determine how to answer the question based on the data.\n\n",
             "external_context": "{context}",
             "task": "You must determine what type of analysis should be performed on the dataset in order to answer the question.\nYou should then list out the steps that the data analyst should take to perform the analysis.\nLimit your total response to 100 words.\nYou should address the data analyst directly.",
         },
         "user": {"inputs": "{df_details}\nQuestion: {question}"},
     },
-    "analysis_steps": {
+    "ml_analysis_steps": {
         "system": {
             "task": "You are a Senior Data Scientist. You have been asked a question on a dataframe.\nYour job is to analyze the given dataframe `df` to answer the question.\n\nTo assist you, a Business Analyst with domain knowledge has given their insights on the best way to go about your task.\nThe Business Analyst has also shared the names of the columns required in the resultant dataframe.\nFollow their instructions as closely as possible.\n\nMake sure that you clean the data before you analyze it.\n\nYour answer should be in the form of a python JSON object, following the given format:\n{schema}\n\nA. The value of 'analysis_df' should be the name of the dataframe on which this analysis is to be performed.\nB. The value of 'steps' should be a list of dictionaries. Each dictionary should contain the following keys: 'step', 'task', 'type', 'args'.\n    The following values are available for these keys. ONLY USE THESE VALUES.\n    1. Step: A number indicating the order of the step. Numbering should start from 1.\n    2. Task: The task to be performed. The task can be one of the following: 'clean_data', 'transform', 'math_operation', 'analysis'\n    3. Type: The type of task to be performed.\n        3a. For task 'clean_data', following types are available: 'convert_to_datetime', 'convert_to_numeric', 'convert_to_categorical'\n        3b. For task 'transform', following types are available: 'one_hot_encode', 'ordinal_encode', 'scale', 'extract_time_period', 'select_indices'\n        3c. For task 'math_operation', following types are available: 'add', 'subtract', 'multiply', 'divide'\n        3d. For task 'analysis', following types are available: 'sortvalues', 'filter', 'mean', 'sum', 'cumsum', 'groupby', 'correlation', 'regression', 'classification', 'clustering', 'forecast'\n    4. Args: The arguments required to perform the task. The arguments should be in the form of a dictionary.\n        4a. For task 'clean_data' - 'columns': list\n        4b. 
For task 'transform', type 'one_hot_encode', 'ordinal_encode', and 'scale' - 'columns': list\n        4c. For task 'transform', type 'extract_time_period' - 'columns': list, 'period_to_extract': Literal['week', 'month', 'year', 'day', 'hour', 'minute', 'second', 'weekday']\n        4d. For task 'transform', type 'select_indices' - 'columns': list, 'indices': list\n        4e. For task 'math_operation' - 'columns': list, 'result': str (the name of the column to store the result in)\n        4f. For task 'analysis', type 'groupby' - 'columns': list, 'agg': Union[str, list], 'agg_col': Optional[list]\n        4g. For task 'analysis', type 'sortvalues' - columns: list, 'ascending': Optional[bool]\n        4h. For task 'analysis', type 'filter' - 'columns': list, 'values': list[Any] (the values to compare the columns to), 'relations': list[Literal['lessthan', 'greaterthan', 'lessthanorequalto', 'greaterthanorequalto', 'equalto', 'notequalto', 'startswith', 'endswith', 'contains']]\n        4i. For task 'analysis', types 'mean', 'cumsum', and 'sum' - 'columns': list\n        4j. For task 'analysis', type 'correlation' - 'columns': list, 'method': Optional[Literal['pearson', 'kendall', 'spearman']]\n        4k. For task 'analysis', type 'regression' - 'x': list, 'y': list\n        4l. For task 'analysis', type 'classification' - 'x': list, 'y': list\n        4m. For task 'analysis', type 'clustering' - 'x': list, 'y': list\n        4n. For task 'analysis', type 'forecast' - 'time_column': str, 'y_column': str, 'end': Optional[str], 'steps': Optional[int] # you must pass either 'end' - the date until which to forecast or 'steps' - the number of steps to forecast\nC. The value of 'output_columns' should be a list of strings. Each string should be the name of a column in the dataframe. These columns should be the ones that are required to answer the question.\n\nDo not give any explanations. 
Only give the python JSON as the answer.\nThis JSON will be evaluated using the eval() function in python. Ensure that it is in the correct format, and has no syntax errors.\n\nOnly return this JSON with details of steps. Do not return anything else.\n\nBefore beginning, take a deep breath and relax. You are an expert in your field. You have done this many times before.\nYou may now begin."
         },

diff --git a/build/lib/lyzr/data_analyzr/analyzr.py b/build/lib/lyzr/data_analyzr/analyzr.py
@@ -88,7 +88,7 @@ def __init__(
             if analysis_type is None:
                 raise MissingValueError("`analysis_type` is a required parameter.")
             if model is None:
-                self.model = LyzrLLMFactory().from_defaults(
+                self.model = LyzrLLMFactory.from_defaults(
                     model="gpt-4-1106-preview", api_key=api_key, seed=seed
                 )
             elif isinstance(model, LiteLLM):
@@ -125,7 +125,7 @@ def _legacy_usage(
                 warnings.warn(
                     f"The `{param}` parameter is deprecated and will be removed in a future version. Please use the `analysis_model` parameter to set the analysis model, and the `gen_model` parameter to set the generation model."
                 )
-        self.model = model or LyzrLLMFactory().from_defaults(
+        self.model = model or LyzrLLMFactory.from_defaults(
             api_key=api_key,
             api_type=model_type,
             model=model_name or os.environ.get("MODEL_NAME", "gpt-4-1106-preview"),
@@ -246,7 +246,7 @@ def analysis(
                 "No analysis performed. Analysis output is the given dataframe."
             )
             return self.analysis_output
-        analysis_model = LyzrLLMFactory().from_defaults(model="gpt-3.5-turbo")
+        analysis_model = LyzrLLMFactory.from_defaults(model="gpt-3.5-turbo")
         analysis_model.additional_kwargs["logger"] = self.logger
         if self.analysis_type == "sql" and analysis_steps is None:
             return self._txt_to_sql_analysis(
@@ -393,6 +393,7 @@ def recommendations(
         if not use_insights:
             insights = None
             system_message_sections.append("task_no_insights")
+            user_message_dict["insights"] = ""
         else:
             system_message_sections.append("task_with_insights")
             user_message_dict["insights"] = (
@@ -412,7 +413,7 @@ def recommendations(
             ]
         elif output_type.lower().strip() == "text":
             system_message_sections.append("text_type")
-            system_message_dict["n_recommendations"] = n_recommendations
+        system_message_dict["n_recommendations"] = n_recommendations
 
         system_message_sections.append("closing")
         self.recommendations_output = self.model.run(
@@ -461,7 +462,7 @@ def tasks(
                     context=tasks_context.strip() + "\n\n", n_tasks=n_tasks
                 ),
                 LyzrPromptFactory(name="tasks", prompt_type="user").get_message(
-                    user_input=user_input or self.user_input,
+                    user_input=user_input,
                     insights=self.insights_output,
                     recommendations=self.recommendations_output,
                 ),

diff --git a/build/lib/lyzr/data_analyzr/ml_analysis_utils.py b/build/lib/lyzr/data_analyzr/ml_analysis_utils.py
@@ -112,12 +112,12 @@ def _get_analysis_guide(self, user_input: str) -> str:
         output = self.model.run(
             messages=[
                 LyzrPromptFactory(
-                    name="analysis_guide", prompt_type="system"
+                    name="ml_analysis_guide", prompt_type="system"
                 ).get_message(
                     context=self.context,
                 ),
                 LyzrPromptFactory(
-                    name="analysis_guide", prompt_type="user"
+                    name="ml_analysis_guide", prompt_type="user"
                 ).get_message(
                     df_details=print_df_details(self.df_dict, self.df_info_dict),
                     question=user_input,
@@ -162,10 +162,10 @@ def _get_analysis_steps_messages_kwargs(self, user_input: str) -> tuple:
             "output columns": ["col1", "col2", "col3"],
         }
         messages = [
-            LyzrPromptFactory(name="analysis_steps", prompt_type="system").get_message(
-                schema=schema
-            ),
-            LyzrPromptFactory(name="analysis_steps", prompt_type="user").get_message(
+            LyzrPromptFactory(
+                name="ml_analysis_steps", prompt_type="system"
+            ).get_message(schema=schema),
+            LyzrPromptFactory(name="ml_analysis_steps", prompt_type="user").get_message(
                 df_details=print_df_details(self.df_dict, self.df_info_dict),
                 question=user_input,
                 context=self.analysis_guide,

diff --git a/dist/lyzr-0.1.33.tar.gz b/dist/lyzr-0.1.33.tar.gz
diff --git a/dist/lyzr-0.1.33-py3-none-any.whl b/dist/lyzr-0.1.34-py3-none-any.whl
diff --git a/dist/lyzr-0.1.34.tar.gz b/dist/lyzr-0.1.34.tar.gz
diff --git a/lyzr.egg-info/PKG-INFO b/lyzr.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lyzr
-Version: 0.1.33
+Version: 0.1.34
 Summary: UNKNOWN
 Home-page: UNKNOWN
 Author: lyzr

diff --git a/lyzr/base/file_utils.py b/lyzr/base/file_utils.py
@@ -57,7 +57,7 @@ def describe_dataset(
         raise ValueError("Please provide a valid pandas DataFrame.")
 
     if model is None:
-        model = LyzrLLMFactory(
+        model = LyzrLLMFactory.from_defaults(
             api_key=api_key,
             api_type=model_type,
             model=model_name,

diff --git a/lyzr/base/llm.py b/lyzr/base/llm.py
@@ -15,8 +15,8 @@
 
 
 class LyzrLLMFactory:
-
-    def from_defaults(self, model: str = DEFAULT_LLM, **kwargs) -> LLM:
+    @staticmethod
+    def from_defaults(model: str = DEFAULT_LLM, **kwargs) -> LLM:
         # model_type -> api_type
         # model_name -> model
         # model_prompts -> Sequence[ChatMessage]

diff --git a/lyzr/base/prompt_texts.py b/lyzr/base/prompt_texts.py
@@ -29,15 +29,15 @@
             "inputs": "The user asked the following question: {user_input}\nGenerate recommendations that enhance the user's question or are related to it."
         }
     },
-    "analysis_guide": {
+    "ml_analysis_guide": {
         "system": {
             "context": "You are Business Analyst. You are an expert in your field. You are assisting a data analyst.\nYou are given a dataset and a question. Your job is to analyze these two inputs and determine how to answer the question based on the data.\n\n",
             "external_context": "{context}",
             "task": "You must determine what type of analysis should be performed on the dataset in order to answer the question.\nYou should then list out the steps that the data analyst should take to perform the analysis.\nLimit your total response to 100 words.\nYou should address the data analyst directly.",
         },
         "user": {"inputs": "{df_details}\nQuestion: {question}"},
     },
-    "analysis_steps": {
+    "ml_analysis_steps": {
         "system": {
             "task": "You are a Senior Data Scientist. You have been asked a question on a dataframe.\nYour job is to analyze the given dataframe `df` to answer the question.\n\nTo assist you, a Business Analyst with domain knowledge has given their insights on the best way to go about your task.\nThe Business Analyst has also shared the names of the columns required in the resultant dataframe.\nFollow their instructions as closely as possible.\n\nMake sure that you clean the data before you analyze it.\n\nYour answer should be in the form of a python JSON object, following the given format:\n{schema}\n\nA. The value of 'analysis_df' should be the name of the dataframe on which this analysis is to be performed.\nB. The value of 'steps' should be a list of dictionaries. Each dictionary should contain the following keys: 'step', 'task', 'type', 'args'.\n    The following values are available for these keys. ONLY USE THESE VALUES.\n    1. Step: A number indicating the order of the step. Numbering should start from 1.\n    2. Task: The task to be performed. The task can be one of the following: 'clean_data', 'transform', 'math_operation', 'analysis'\n    3. Type: The type of task to be performed.\n        3a. For task 'clean_data', following types are available: 'convert_to_datetime', 'convert_to_numeric', 'convert_to_categorical'\n        3b. For task 'transform', following types are available: 'one_hot_encode', 'ordinal_encode', 'scale', 'extract_time_period', 'select_indices'\n        3c. For task 'math_operation', following types are available: 'add', 'subtract', 'multiply', 'divide'\n        3d. For task 'analysis', following types are available: 'sortvalues', 'filter', 'mean', 'sum', 'cumsum', 'groupby', 'correlation', 'regression', 'classification', 'clustering', 'forecast'\n    4. Args: The arguments required to perform the task. The arguments should be in the form of a dictionary.\n        4a. For task 'clean_data' - 'columns': list\n        4b. 
For task 'transform', type 'one_hot_encode', 'ordinal_encode', and 'scale' - 'columns': list\n        4c. For task 'transform', type 'extract_time_period' - 'columns': list, 'period_to_extract': Literal['week', 'month', 'year', 'day', 'hour', 'minute', 'second', 'weekday']\n        4d. For task 'transform', type 'select_indices' - 'columns': list, 'indices': list\n        4e. For task 'math_operation' - 'columns': list, 'result': str (the name of the column to store the result in)\n        4f. For task 'analysis', type 'groupby' - 'columns': list, 'agg': Union[str, list], 'agg_col': Optional[list]\n        4g. For task 'analysis', type 'sortvalues' - columns: list, 'ascending': Optional[bool]\n        4h. For task 'analysis', type 'filter' - 'columns': list, 'values': list[Any] (the values to compare the columns to), 'relations': list[Literal['lessthan', 'greaterthan', 'lessthanorequalto', 'greaterthanorequalto', 'equalto', 'notequalto', 'startswith', 'endswith', 'contains']]\n        4i. For task 'analysis', types 'mean', 'cumsum', and 'sum' - 'columns': list\n        4j. For task 'analysis', type 'correlation' - 'columns': list, 'method': Optional[Literal['pearson', 'kendall', 'spearman']]\n        4k. For task 'analysis', type 'regression' - 'x': list, 'y': list\n        4l. For task 'analysis', type 'classification' - 'x': list, 'y': list\n        4m. For task 'analysis', type 'clustering' - 'x': list, 'y': list\n        4n. For task 'analysis', type 'forecast' - 'time_column': str, 'y_column': str, 'end': Optional[str], 'steps': Optional[int] # you must pass either 'end' - the date until which to forecast or 'steps' - the number of steps to forecast\nC. The value of 'output_columns' should be a list of strings. Each string should be the name of a column in the dataframe. These columns should be the ones that are required to answer the question.\n\nDo not give any explanations. 
Only give the python JSON as the answer.\nThis JSON will be evaluated using the eval() function in python. Ensure that it is in the correct format, and has no syntax errors.\n\nOnly return this JSON with details of steps. Do not return anything else.\n\nBefore beginning, take a deep breath and relax. You are an expert in your field. You have done this many times before.\nYou may now begin."
         },

diff --git a/lyzr/data_analyzr/analyzr.py b/lyzr/data_analyzr/analyzr.py
@@ -88,7 +88,7 @@ def __init__(
             if analysis_type is None:
                 raise MissingValueError("`analysis_type` is a required parameter.")
             if model is None:
-                self.model = LyzrLLMFactory().from_defaults(
+                self.model = LyzrLLMFactory.from_defaults(
                     model="gpt-4-1106-preview", api_key=api_key, seed=seed
                 )
             elif isinstance(model, LiteLLM):
@@ -125,7 +125,7 @@ def _legacy_usage(
                 warnings.warn(
                     f"The `{param}` parameter is deprecated and will be removed in a future version. Please use the `analysis_model` parameter to set the analysis model, and the `gen_model` parameter to set the generation model."
                 )
-        self.model = model or LyzrLLMFactory().from_defaults(
+        self.model = model or LyzrLLMFactory.from_defaults(
             api_key=api_key,
             api_type=model_type,
             model=model_name or os.environ.get("MODEL_NAME", "gpt-4-1106-preview"),
@@ -246,7 +246,7 @@ def analysis(
                 "No analysis performed. Analysis output is the given dataframe."
             )
             return self.analysis_output
-        analysis_model = LyzrLLMFactory().from_defaults(model="gpt-3.5-turbo")
+        analysis_model = LyzrLLMFactory.from_defaults(model="gpt-3.5-turbo")
         analysis_model.additional_kwargs["logger"] = self.logger
         if self.analysis_type == "sql" and analysis_steps is None:
             return self._txt_to_sql_analysis(
@@ -393,6 +393,7 @@ def recommendations(
         if not use_insights:
             insights = None
             system_message_sections.append("task_no_insights")
+            user_message_dict["insights"] = ""
         else:
             system_message_sections.append("task_with_insights")
             user_message_dict["insights"] = (
@@ -412,7 +413,7 @@ def recommendations(
             ]
         elif output_type.lower().strip() == "text":
             system_message_sections.append("text_type")
-            system_message_dict["n_recommendations"] = n_recommendations
+        system_message_dict["n_recommendations"] = n_recommendations
 
         system_message_sections.append("closing")
         self.recommendations_output = self.model.run(
@@ -461,7 +462,7 @@ def tasks(
                     context=tasks_context.strip() + "\n\n", n_tasks=n_tasks
                 ),
                 LyzrPromptFactory(name="tasks", prompt_type="user").get_message(
-                    user_input=user_input or self.user_input,
+                    user_input=user_input,
                     insights=self.insights_output,
                     recommendations=self.recommendations_output,
                 ),