From dd2225b1aa86097624c9ca2eb6436c4ac6790085 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 6 Jun 2024 11:56:48 +0530 Subject: [PATCH 1/4] Use Text output mode to disambiguate from Default data source lookup Previously if default output was selected by Khoj, we'd end up doing a documents search as well, even when Khoj selected internet or general data source to lookup. This update disambiguates the default information mode from the text output mode. To avoid doing documents search when not deemed necessary by Khoj --- src/khoj/routers/helpers.py | 8 ++++---- src/khoj/utils/helpers.py | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 6af8b9d06..f144044ec 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -287,16 +287,16 @@ async def aget_relevant_output_modes(query: str, conversation_history: dict, is_ response = response.strip() if is_none_or_empty(response): - return ConversationCommand.Default + return ConversationCommand.Text if response in mode_options.keys(): # Check whether the tool exists as a valid ConversationCommand return ConversationCommand(response) - return ConversationCommand.Default - except Exception as e: + return ConversationCommand.Text + except Exception: logger.error(f"Invalid response for determining relevant mode: {response}") - return ConversationCommand.Default + return ConversationCommand.Text async def infer_webpage_urls(q: str, conversation_history: dict, location_data: LocationData) -> List[str]: diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 48be715b9..fd44ba859 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -304,6 +304,7 @@ class ConversationCommand(str, Enum): Online = "online" Webpage = "webpage" Image = "image" + Text = "text" Automation = "automation" AutomatedTask = "automated_task" @@ -330,7 +331,7 @@ class ConversationCommand(str, Enum): 
mode_descriptions_for_llm = { ConversationCommand.Image: "Use this if the user is requesting an image or visual response to their query.", ConversationCommand.Automation: "Use this if the user is requesting a response at a scheduled date or time.", - ConversationCommand.Default: "Use this if the other response modes don't seem to fit the query.", + ConversationCommand.Text: "Use this if the other response modes don't seem to fit the query.", } From 18f7e6e7ed243d9a98759c50c90466e8b6c0f02e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 6 Jun 2024 16:51:26 +0530 Subject: [PATCH 2/4] Remove "Path" prefix from org ancestor heading in compiled entry --- .../processor/content/org_mode/org_to_entries.py | 2 +- tests/test_org_to_entries.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 2dcaa7449..c91c20703 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -182,7 +182,7 @@ def convert_org_nodes_to_entries( # Children nodes do not need ancestors trail as root parent node will have it if not entry_heading: ancestors_trail = " / ".join(parsed_entry.ancestors) or Path(entry_to_file_map[parsed_entry]) - heading = f"* Path: {ancestors_trail}\n{heading}" if heading else f"* Path: {ancestors_trail}." + heading = f"* {ancestors_trail}\n{heading}" if heading else f"* {ancestors_trail}." 
compiled = heading diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index f01f50f3c..56be5fa5c 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -50,14 +50,14 @@ def test_entry_split_when_exceeds_max_tokens(): data = { f"{tmp_path}": entry, } - expected_heading = f"* Path: {tmp_path}\n** Heading" + expected_heading = f"* {tmp_path}\n** Heading" # Act # Extract Entries from specified Org files entries = OrgToEntries.extract_org_entries(org_files=data) # Split each entry from specified Org files by max tokens - entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=6) + entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=5) # Assert assert len(entries) == 2 @@ -139,7 +139,7 @@ def test_parse_org_entry_with_children_as_single_entry_if_small(tmp_path): f"{tmp_path}": entry, } first_expected_entry = f""" -* Path: {tmp_path} +* {tmp_path} ** Heading 1. body line 1 @@ -148,13 +148,13 @@ def test_parse_org_entry_with_children_as_single_entry_if_small(tmp_path): """.lstrip() second_expected_entry = f""" -* Path: {tmp_path} +* {tmp_path} ** Heading 2. body line 2 """.lstrip() third_expected_entry = f""" -* Path: {tmp_path} / Heading 2 +* {tmp_path} / Heading 2 ** Subheading 2.1. longer body line 2.1 @@ -192,7 +192,7 @@ def test_separate_sibling_org_entries_if_all_cannot_fit_in_token_limit(tmp_path) f"{tmp_path}": entry, } first_expected_entry = f""" -* Path: {tmp_path} +* {tmp_path} ** Heading 1. body line 1 @@ -201,7 +201,7 @@ def test_separate_sibling_org_entries_if_all_cannot_fit_in_token_limit(tmp_path) """.lstrip() second_expected_entry = f""" -* Path: {tmp_path} +* {tmp_path} ** Heading 2. body line 2 @@ -210,7 +210,7 @@ def test_separate_sibling_org_entries_if_all_cannot_fit_in_token_limit(tmp_path) """.lstrip() third_expected_entry = f""" -* Path: {tmp_path} +* {tmp_path} ** Heading 3. 
body line 3 From f91cdf8e18d63abc2b2851ffd3acef1b42ecc999 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 6 Jun 2024 16:52:23 +0530 Subject: [PATCH 3/4] Fix showing headings in intermediate step in generating chat response --- src/khoj/routers/api_chat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 702893ff7..b4d370e1a 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -590,9 +590,7 @@ async def send_rate_limit_message(message: str): ) if compiled_references: - headings = "\n- " + "\n- ".join( - set([" ".join(c.get("compiled", c).split("Path: ")[1:]).split("\n ")[0] for c in compiled_references]) - ) + headings = "\n- " + "\n- ".join(set([c.get("compiled", c).split("\n")[0] for c in compiled_references])) await send_status_update(f"**📜 Found Relevant Notes**: {headings}") online_results: Dict = dict() From f440ddbe1df33475c48621e5f35956c9a19ece8d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 9 Jun 2024 07:16:55 +0530 Subject: [PATCH 4/4] Fix openai chat actor, director tests - Update test ChatModelOptions setup since update to its schema - Fix stale function calls using their updated signatures --- tests/conftest.py | 2 +- tests/helpers.py | 15 +++--- tests/test_openai_chat_actors.py | 81 ++++++++++++++++++------------ tests/test_openai_chat_director.py | 10 ++-- 4 files changed, 62 insertions(+), 46 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 28a4f2c5c..405b652fe 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -301,7 +301,7 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa # Initialize Processor from Config if os.getenv("OPENAI_API_KEY"): chat_model = ChatModelOptionsFactory(chat_model="gpt-3.5-turbo", model_type="openai") - OpenAIProcessorConversationConfigFactory() + chat_model.openai_config = OpenAIProcessorConversationConfigFactory() 
UserConversationProcessorConfigFactory(user=user, setting=chat_model) state.anonymous_mode = not require_auth diff --git a/tests/helpers.py b/tests/helpers.py index 686735967..009c8b552 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -36,6 +36,13 @@ class Meta: token = factory.Faker("password") +class OpenAIProcessorConversationConfigFactory(factory.django.DjangoModelFactory): + class Meta: + model = OpenAIProcessorConversationConfig + + api_key = os.getenv("OPENAI_API_KEY") + + class ChatModelOptionsFactory(factory.django.DjangoModelFactory): class Meta: model = ChatModelOptions @@ -44,6 +51,7 @@ class Meta: tokenizer = None chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" model_type = "offline" + openai_config = factory.SubFactory(OpenAIProcessorConversationConfigFactory) class UserConversationProcessorConfigFactory(factory.django.DjangoModelFactory): @@ -54,13 +62,6 @@ class Meta: setting = factory.SubFactory(ChatModelOptionsFactory) -class OpenAIProcessorConversationConfigFactory(factory.django.DjangoModelFactory): - class Meta: - model = OpenAIProcessorConversationConfig - - api_key = os.getenv("OPENAI_API_KEY") - - class ConversationFactory(factory.django.DjangoModelFactory): class Meta: model = Conversation diff --git a/tests/test_openai_chat_actors.py b/tests/test_openai_chat_actors.py index 7c4f5ee3b..ae2a8c551 100644 --- a/tests/test_openai_chat_actors.py +++ b/tests/test_openai_chat_actors.py @@ -250,7 +250,7 @@ def test_answer_from_chat_history_and_currently_retrieved_content(): # Act response_gen = converse( references=[ - "Testatron was born on 1st April 1984 in Testville." 
+ {"compiled": "Testatron was born on 1st April 1984 in Testville.", "file": "background.md"} ], # Assume context retrieved from notes for the user_query user_query="Where was I born?", conversation_log=populate_chat_history(message_list), @@ -304,14 +304,26 @@ def test_answer_requires_current_date_awareness(): "Chat actor should be able to answer questions relative to current date using provided notes" # Arrange context = [ - f"""{datetime.now().strftime("%Y-%m-%d")} "Naco Taco" "Tacos for Dinner" + { + "compiled": f"""{datetime.now().strftime("%Y-%m-%d")} "Naco Taco" "Tacos for Dinner" Expenses:Food:Dining 10.00 USD""", - f"""{datetime.now().strftime("%Y-%m-%d")} "Sagar Ratna" "Dosa for Lunch" + "file": "Ledger.org", + }, + { + "compiled": f"""{datetime.now().strftime("%Y-%m-%d")} "Sagar Ratna" "Dosa for Lunch" Expenses:Food:Dining 10.00 USD""", - f"""2020-04-01 "SuperMercado" "Bananas" + "file": "Ledger.org", + }, + { + "compiled": f"""2020-04-01 "SuperMercado" "Bananas" Expenses:Food:Groceries 10.00 USD""", - f"""2020-01-01 "Naco Taco" "Burittos for Dinner" + "file": "Ledger.org", + }, + { + "compiled": f"""2020-01-01 "Naco Taco" "Burittos for Dinner" Expenses:Food:Dining 10.00 USD""", + "file": "Ledger.org", + }, ] # Act @@ -336,14 +348,26 @@ def test_answer_requires_date_aware_aggregation_across_provided_notes(): "Chat actor should be able to answer questions that require date aware aggregation across multiple notes" # Arrange context = [ - f"""# {datetime.now().strftime("%Y-%m-%d")} "Naco Taco" "Tacos for Dinner" + { + "compiled": f"""# {datetime.now().strftime("%Y-%m-%d")} "Naco Taco" "Tacos for Dinner" Expenses:Food:Dining 10.00 USD""", - f"""{datetime.now().strftime("%Y-%m-%d")} "Sagar Ratna" "Dosa for Lunch" + "file": "Ledger.md", + }, + { + "compiled": f"""{datetime.now().strftime("%Y-%m-%d")} "Sagar Ratna" "Dosa for Lunch" Expenses:Food:Dining 10.00 USD""", - f"""2020-04-01 "SuperMercado" "Bananas" + "file": "Ledger.md", + }, + { + "compiled": 
f"""2020-04-01 "SuperMercado" "Bananas" Expenses:Food:Groceries 10.00 USD""", - f"""2020-01-01 "Naco Taco" "Burittos for Dinner" + "file": "Ledger.md", + }, + { + "compiled": f"""2020-01-01 "Naco Taco" "Burittos for Dinner" Expenses:Food:Dining 10.00 USD""", + "file": "Ledger.md", + }, ] # Act @@ -423,9 +447,9 @@ def test_agent_prompt_should_be_used(openai_agent): "Chat actor should ask be tuned to think like an accountant based on the agent definition" # Arrange context = [ - f"""I went to the store and bought some bananas for 2.20""", - f"""I went to the store and bought some apples for 1.30""", - f"""I went to the store and bought some oranges for 6.00""", + {"compiled": f"""I went to the store and bought some bananas for 2.20""", "file": "Ledger.md"}, + {"compiled": f"""I went to the store and bought some apples for 1.30""", "file": "Ledger.md"}, + {"compiled": f"""I went to the store and bought some oranges for 6.00""", "file": "Ledger.md"}, ] expected_responses = ["9.50", "9.5"] @@ -496,10 +520,10 @@ async def test_websearch_khoj_website_for_info_about_khoj(chat_client): @pytest.mark.parametrize( "user_query, expected_mode", [ - ("What's the latest in the Israel/Palestine conflict?", "default"), - ("Summarize the latest tech news every Monday evening", "reminder"), + ("What's the latest in the Israel/Palestine conflict?", "text"), + ("Summarize the latest tech news every Monday evening", "automation"), ("Paint a scenery in Timbuktu in the winter", "image"), - ("Remind me, when did I last visit the Serengeti?", "default"), + ("Remind me, when did I last visit the Serengeti?", "text"), ], ) async def test_use_default_response_mode(chat_client, user_query, expected_mode): @@ -525,10 +549,10 @@ async def test_select_data_sources_actor_chooses_to_search_notes( chat_client, user_query, expected_conversation_commands ): # Act - conversation_commands = await aget_relevant_information_sources(user_query, {}) + conversation_commands = await 
aget_relevant_information_sources(user_query, {}, False) # Assert - assert expected_conversation_commands in conversation_commands + assert set(expected_conversation_commands) == set(conversation_commands) # ---------------------------------------------------------------------------------------------------- @@ -549,46 +573,37 @@ async def test_infer_webpage_urls_actor_extracts_correct_links(chat_client): @pytest.mark.anyio @pytest.mark.django_db(transaction=True) @pytest.mark.parametrize( - "user_query, location, expected_crontime, expected_qs, unexpected_qs", + "user_query, expected_crontime, expected_qs, unexpected_qs", [ ( "Share the weather forecast for the next day daily at 7:30pm", - ("Ubud", "Bali", "Indonesia"), - "30 11 * * *", # ensure correctly converts to utc - ["weather forecast", "ubud"], + "30 19 * * *", + ["weather forecast"], ["7:30"], ), ( "Notify me when the new President of Brazil is announced", - ("Sao Paulo", "Sao Paulo", "Brazil"), "* *", # crontime is variable ["brazil", "president"], ["notify"], # ensure reminder isn't re-triggered on scheduled query run ), ( "Let me know whenever Elon leaves Twitter. 
Check this every afternoon at 12", - ("Karachi", "Sindh", "Pakistan"), - "0 7 * * *", # ensure correctly converts to utc + "0 12 * * *", # ensure correctly converts to utc ["elon", "twitter"], ["12"], ), ( "Draw a wallpaper every morning using the current weather", - ("Bogota", "Cundinamarca", "Colombia"), "* * *", # daily crontime - ["weather", "wallpaper", "bogota"], + ["weather", "wallpaper"], ["every"], ), ], ) -async def test_infer_task_scheduling_request( - chat_client, user_query, location, expected_crontime, expected_qs, unexpected_qs -): - # Arrange - location_data = LocationData(city=location[0], region=location[1], country=location[2]) - +async def test_infer_task_scheduling_request(chat_client, user_query, expected_crontime, expected_qs, unexpected_qs): # Act - crontime, inferred_query = await schedule_query(user_query, location_data, {}) + crontime, inferred_query, _ = await schedule_query(user_query, {}) inferred_query = inferred_query.lower() # Assert diff --git a/tests/test_openai_chat_director.py b/tests/test_openai_chat_director.py index fffaa0d91..58184ef96 100644 --- a/tests/test_openai_chat_director.py +++ b/tests/test_openai_chat_director.py @@ -516,7 +516,7 @@ async def test_get_correct_tools_online(chat_client): user_query = "What's the weather in Patagonia this week?" # Act - tools = await aget_relevant_information_sources(user_query, {}) + tools = await aget_relevant_information_sources(user_query, {}, False) # Assert tools = [tool.value for tool in tools] @@ -531,7 +531,7 @@ async def test_get_correct_tools_notes(chat_client): user_query = "Where did I go for my first battleship training?" 
# Act - tools = await aget_relevant_information_sources(user_query, {}) + tools = await aget_relevant_information_sources(user_query, {}, False) # Assert tools = [tool.value for tool in tools] @@ -546,7 +546,7 @@ async def test_get_correct_tools_online_or_general_and_notes(chat_client): user_query = "What's the highest point in Patagonia and have I been there?" # Act - tools = await aget_relevant_information_sources(user_query, {}) + tools = await aget_relevant_information_sources(user_query, {}, False) # Assert tools = [tool.value for tool in tools] @@ -563,7 +563,7 @@ async def test_get_correct_tools_general(chat_client): user_query = "How many noble gases are there?" # Act - tools = await aget_relevant_information_sources(user_query, {}) + tools = await aget_relevant_information_sources(user_query, {}, False) # Assert tools = [tool.value for tool in tools] @@ -587,7 +587,7 @@ async def test_get_correct_tools_with_chat_history(chat_client): chat_history = generate_history(chat_log) # Act - tools = await aget_relevant_information_sources(user_query, chat_history) + tools = await aget_relevant_information_sources(user_query, chat_history, False) # Assert tools = [tool.value for tool in tools]