From fdb4c284dedf375f1817be26922540aefc3fc84b Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Wed, 27 Nov 2024 18:44:05 +0700
Subject: [PATCH 01/17] add support for multimodal

---
 .../python/agent/tools/query_engine.py       | 22 ++++++++++++++-----
 .../components/settings/python/settings.py   | 17 +++++++++++++-
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index e78ae0442..ad0a47224 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -1,10 +1,13 @@
 import os
 from typing import Optional
 
+from llama_index.core.query_engine import BaseQueryEngine, SimpleMultiModalQueryEngine
 from llama_index.core.tools.query_engine import QueryEngineTool
 
+from app.settings import get_multi_modal_llm
 
-def create_query_engine(index, **kwargs):
+
+def create_query_engine(index, **kwargs) -> BaseQueryEngine:
     """
     Create a query engine for the given index.
 
@@ -17,11 +20,18 @@
         kwargs["similarity_top_k"] = top_k
     # If the index is a LlamaCloudIndex,
     # use auto_routed mode for better query results
-    if (
-        index.__class__.__name__ == "LlamaCloudIndex"
-        and kwargs.get("auto_routed") is None
-    ):
-        kwargs["auto_routed"] = True
+    if index.__class__.__name__ == "LlamaCloudIndex":
+        if kwargs.get("auto_routed") is None:
+            kwargs["auto_routed"] = True
+        kwargs["retrieve_image_nodes"] = True
+    # TODO: Add support for MultiModalVectorStoreIndex
+    mm_llm = get_multi_modal_llm()
+    if mm_llm:
+        return SimpleMultiModalQueryEngine(
+            retriever=index.as_retriever(**kwargs),
+            multi_modal_llm=mm_llm,
+        )
+
     return index.as_query_engine(**kwargs)
diff --git a/templates/components/settings/python/settings.py b/templates/components/settings/python/settings.py
index bc7270bd8..0068116f0 100644
--- a/templates/components/settings/python/settings.py
+++ b/templates/components/settings/python/settings.py
@@ -1,9 +1,24 @@
 import os
-from typing import Dict
+from typing import Dict, Optional
 
+from llama_index.core.multi_modal_llms import MultiModalLLM
 from llama_index.core.settings import Settings
 
 
+def get_multi_modal_llm() -> Optional[MultiModalLLM]:
+    model_provider = os.getenv("MODEL_PROVIDER")
+    llm_model = os.getenv("MULTIMODAL_LLM_MODEL")
+    if llm_model is None:
+        return None
+    if model_provider == "openai":
+        from llama_index.multi_modal_llms.openai import OpenAIMultiModal
+
+        return OpenAIMultiModal(model=llm_model)
+    else:
+        # TODO: Add support for other providers
+        return None
+
+
 def init_settings():
     model_provider = os.getenv("MODEL_PROVIDER")
     match model_provider:

From 1e647c22186803faff1101903dfd675f6c7c254c Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 08:52:43 +0700
Subject: [PATCH 02/17] update code

---
 .../python/agent/tools/query_engine.py       | 129 ++++++++++++++++--
 .../components/settings/python/settings.py   |  24 ++--
 2 files changed, 128 insertions(+), 25 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index ad0a47224..5fe0c5a06 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -1,10 +1,117 @@
 import os
-from typing import Optional
+from typing import List, Optional, Sequence
 
-from llama_index.core.query_engine import BaseQueryEngine, SimpleMultiModalQueryEngine
+from llama_index.core.base.base_query_engine import BaseQueryEngine
+from llama_index.core.base.response.schema import RESPONSE_TYPE, Response
+from llama_index.core.query_engine import SimpleMultiModalQueryEngine
+from llama_index.core.query_engine.multi_modal import _get_image_and_text_nodes
+from llama_index.core.response_synthesizers import (
+    BaseSynthesizer,
+    get_response_synthesizer,
+)
+from llama_index.core.response_synthesizers.type import ResponseMode
+from llama_index.core.schema import ImageNode, NodeWithScore, QueryBundle
 from llama_index.core.tools.query_engine import QueryEngineTool
 
-from app.settings import get_multi_modal_llm
+from app.settings import multi_modal_llm
+
+
+class MultiModalQueryEngine(SimpleMultiModalQueryEngine):
+    """
+    A multi-modal query engine that splits the retrieval results into chunks and then summarizes each chunk to reduce the number of tokens in the response.
+    """
+
+    def __init__(
+        self,
+        text_synthesizer: Optional[BaseSynthesizer] = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        # Use a response synthesizer for text nodes summarization
+        self._text_synthesizer = text_synthesizer or get_response_synthesizer(
+            streaming=False,
+            response_mode=ResponseMode.TREE_SUMMARIZE,
+        )
+
+    def _summarize_text_nodes(
+        self, query_bundle: QueryBundle, nodes: List[NodeWithScore]
+    ) -> str:
+        """
+        Synthesize a response for the query using the retrieved nodes.
+        """
+        return str(
+            self._text_synthesizer.synthesize(
+                query=query_bundle,
+                nodes=nodes,
+                streaming=False,
+            )
+        )
+
+    def synthesize(
+        self,
+        query_bundle: QueryBundle,
+        nodes: List[NodeWithScore],
+    ) -> RESPONSE_TYPE:
+        image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
+        # Summarize the text nodes
+        text_response = self._summarize_text_nodes(
+            query_bundle=query_bundle,
+            nodes=text_nodes,
+        )
+
+        fmt_prompt = self._text_qa_template.format(
+            context_str=text_response,
+            query_str=query_bundle.query_str,
+        )
+
+        llm_response = self._multi_modal_llm.complete(
+            prompt=fmt_prompt,
+            image_documents=[
+                image_node.node
+                for image_node in image_nodes
+                if isinstance(image_node.node, ImageNode)
+            ],
+        )
+
+        return Response(
+            response=str(llm_response),
+            source_nodes=nodes,
+            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
+        )
+
+    async def asynthesize(
+        self,
+        query_bundle: QueryBundle,
+        nodes: List[NodeWithScore],
+        additional_source_nodes: Optional[Sequence[NodeWithScore]] = None,
+    ) -> RESPONSE_TYPE:
+        image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
+        # Summarize the text nodes to avoid exceeding the token limit
+        text_response = self._summarize_text_nodes(
+            query_bundle=query_bundle,
+            nodes=text_nodes,
+        )
+
+        fmt_prompt = self._text_qa_template.format(
+            context_str=text_response,
+            query_str=query_bundle.query_str,
+        )
+
+        llm_response = await self._multi_modal_llm.acomplete(
+            prompt=fmt_prompt,
+            image_documents=[
+                image_node.node
+                for image_node in image_nodes
+                if isinstance(image_node.node, ImageNode)
+            ],
+        )
+
+        return Response(
+            response=str(llm_response),
+            source_nodes=nodes,
+            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
+        )
 
 
 def create_query_engine(index, **kwargs) -> BaseQueryEngine:
@@ -21,15 +128,15 @@
     # If the index is a LlamaCloudIndex,
     # use auto_routed mode for better query results
     if index.__class__.__name__ == "LlamaCloudIndex":
-        if kwargs.get("auto_routed") is None:
-            kwargs["auto_routed"] = True
-        kwargs["retrieve_image_nodes"] = True
-    # TODO: Add support for MultiModalVectorStoreIndex
-    mm_llm = get_multi_modal_llm()
-    if mm_llm:
-        return SimpleMultiModalQueryEngine(
+        retrieval_mode = kwargs.get("retrieval_mode")
+        if retrieval_mode is None:
+            kwargs["retrieval_mode"] = "auto_routed"
+    if multi_modal_llm:
+        # Note: image nodes are not supported for auto_routed or chunk retrieval mode
+        kwargs["retrieve_image_nodes"] = True
+        return MultiModalQueryEngine(
             retriever=index.as_retriever(**kwargs),
-            multi_modal_llm=mm_llm,
+            multi_modal_llm=multi_modal_llm,
         )
 
     return index.as_query_engine(**kwargs)
diff --git a/templates/components/settings/python/settings.py b/templates/components/settings/python/settings.py
index 0068116f0..ae24ee4ad 100644
--- a/templates/components/settings/python/settings.py
+++ b/templates/components/settings/python/settings.py
@@ -4,19 +4,8 @@
 from llama_index.core.multi_modal_llms import MultiModalLLM
 from llama_index.core.settings import Settings
 
-
-def get_multi_modal_llm() -> Optional[MultiModalLLM]:
-    model_provider = os.getenv("MODEL_PROVIDER")
-    llm_model = os.getenv("MULTIMODAL_LLM_MODEL")
-    if llm_model is None:
-        return None
-    if model_provider == "openai":
-        from llama_index.multi_modal_llms.openai import OpenAIMultiModal
-
-        return OpenAIMultiModal(model=llm_model)
-    else:
-        # TODO: Add support for other providers
-        return None
+# Singleton for multi-modal LLM
+multi_modal_llm: Optional[MultiModalLLM] = None
 
 
 def init_settings():
     model_provider = os.getenv("MODEL_PROVIDER")
     match model_provider:
@@ -75,14 +64,21 @@ def init_openai():
     from llama_index.core.constants import DEFAULT_TEMPERATURE
     from llama_index.embeddings.openai import OpenAIEmbedding
     from llama_index.llms.openai import OpenAI
+    from llama_index.multi_modal_llms.openai import OpenAIMultiModal
+    from llama_index.multi_modal_llms.openai.utils import GPT4V_MODELS
 
     max_tokens = os.getenv("LLM_MAX_TOKENS")
+    model_name = os.getenv("MODEL", "gpt-4o-mini")
     Settings.llm = OpenAI(
-        model=os.getenv("MODEL", "gpt-4o-mini"),
+        model=model_name,
         temperature=float(os.getenv("LLM_TEMPERATURE", DEFAULT_TEMPERATURE)),
         max_tokens=int(max_tokens) if max_tokens is not None else None,
     )
 
+    if model_name in GPT4V_MODELS:
+        global multi_modal_llm
+        multi_modal_llm = OpenAIMultiModal(model=model_name)
+
     dimensions = os.getenv("EMBEDDING_DIM")
     Settings.embed_model = OpenAIEmbedding(
         model=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small"),

From bc8df660a11c5d153e50d04141615232d720dfb9 Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 09:15:11 +0700
Subject: [PATCH 03/17] fix wrong signature override

---
 templates/components/engines/python/agent/tools/query_engine.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index 5fe0c5a06..123eb679f 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -52,6 +52,7 @@
     def synthesize(
         self,
         query_bundle: QueryBundle,
         nodes: List[NodeWithScore],
+        additional_source_nodes: Optional[Sequence[NodeWithScore]] = None,
     ) -> RESPONSE_TYPE:
         image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
         # Summarize the text nodes

From 9d6f94b97226585a04412444453e5cedc42fc529 Mon Sep 17 00:00:00 2001
From: thucpn
Date: Fri, 29 Nov 2024 11:05:46 +0700
Subject: [PATCH 04/17] bump: chat-ui

---
 .../types/streaming/nextjs/app/components/chat-section.tsx | 3 +--
 templates/types/streaming/nextjs/package.json              | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/templates/types/streaming/nextjs/app/components/chat-section.tsx b/templates/types/streaming/nextjs/app/components/chat-section.tsx
index 4a0fe9deb..edf7f3311 100644
--- a/templates/types/streaming/nextjs/app/components/chat-section.tsx
+++ b/templates/types/streaming/nextjs/app/components/chat-section.tsx
@@ -1,8 +1,7 @@
 "use client";
 
 import { ChatSection as ChatSectionUI } from "@llamaindex/chat-ui";
-import "@llamaindex/chat-ui/styles/code.css";
-import "@llamaindex/chat-ui/styles/katex.css";
+import "@llamaindex/chat-ui/styles/markdown.css";
 import "@llamaindex/chat-ui/styles/pdf.css";
 import { useChat } from "ai/react";
 import CustomChatInput from "./ui/chat/chat-input";
diff --git a/templates/types/streaming/nextjs/package.json b/templates/types/streaming/nextjs/package.json
index ff185c709..94e576f07 100644
--- a/templates/types/streaming/nextjs/package.json
+++ b/templates/types/streaming/nextjs/package.json
@@ -16,7 +16,7 @@
     "@radix-ui/react-select": "^2.1.1",
     "@radix-ui/react-slot": "^1.0.2",
     "@radix-ui/react-tabs": "^1.1.0",
-    "@llamaindex/chat-ui": "0.0.11",
+    "@llamaindex/chat-ui": "0.0.12",
     "ai": "4.0.3",
     "ajv": "^8.12.0",
     "class-variance-authority": "^0.7.0",

From a40da22788337cadb14487a0c531892c608d9347 Mon Sep 17 00:00:00 2001
From: Thuc Pham <51660321+thucpn@users.noreply.github.com>
Date: Fri, 29 Nov 2024 11:06:16 +0700
Subject: [PATCH 05/17] Create lazy-insects-share.md

---
 .changeset/lazy-insects-share.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/lazy-insects-share.md

diff --git a/.changeset/lazy-insects-share.md b/.changeset/lazy-insects-share.md
new file mode 100644
index 000000000..4f914968e
--- /dev/null
+++ b/.changeset/lazy-insects-share.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+bump: chat-ui

From 88d291801fe84284df80dbab481990ffc5e01d70 Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 11:52:28 +0700
Subject: [PATCH 06/17] add MultiModalSynthesizer

---
 .../python/agent/tools/query_engine.py       | 104 ++++++------------
 .../components/settings/python/settings.py   |   9 +-
 2 files changed, 37 insertions(+), 76 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index 123eb679f..905d20df9 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -1,102 +1,59 @@
 import os
-from typing import List, Optional, Sequence
+from typing import Any, List, Optional, Sequence
 
 from llama_index.core.base.base_query_engine import BaseQueryEngine
 from llama_index.core.base.response.schema import RESPONSE_TYPE, Response
-from llama_index.core.query_engine import SimpleMultiModalQueryEngine
+from llama_index.core.multi_modal_llms import MultiModalLLM
+from llama_index.core.prompts.base import BasePromptTemplate
+from llama_index.core.prompts.default_prompt_selectors import (
+    DEFAULT_TREE_SUMMARIZE_PROMPT_SEL,
+)
+from llama_index.core.query_engine import (
+    RetrieverQueryEngine,
+)
 from llama_index.core.query_engine.multi_modal import _get_image_and_text_nodes
-from llama_index.core.response_synthesizers import (
-    BaseSynthesizer,
-    get_response_synthesizer,
+from llama_index.core.response_synthesizers import TreeSummarize
+from llama_index.core.response_synthesizers.base import QueryTextType
+from llama_index.core.schema import (
+    ImageNode,
+    NodeWithScore,
 )
-from llama_index.core.response_synthesizers.type import ResponseMode
-from llama_index.core.schema import ImageNode, NodeWithScore, QueryBundle
 from llama_index.core.tools.query_engine import QueryEngineTool
 
 from app.settings import multi_modal_llm
 
 
-class MultiModalQueryEngine(SimpleMultiModalQueryEngine):
+class MultiModalSynthesizer(TreeSummarize):
     """
-    A multi-modal query engine that splits the retrieval results into chunks and then summarizes each chunk to reduce the number of tokens in the response.
+    A synthesizer that summarizes text nodes and uses a multi-modal LLM to generate a response.
     """
 
     def __init__(
         self,
-        text_synthesizer: Optional[BaseSynthesizer] = None,
+        multimodal_model: Optional[MultiModalLLM] = None,
+        text_qa_template: Optional[BasePromptTemplate] = None,
         *args,
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
-        # Use a response synthesizer for text nodes summarization
-        self._text_synthesizer = text_synthesizer or get_response_synthesizer(
-            streaming=False,
-            response_mode=ResponseMode.TREE_SUMMARIZE,
-        )
-
-    def _summarize_text_nodes(
-        self, query_bundle: QueryBundle, nodes: List[NodeWithScore]
-    ) -> str:
-        """
-        Synthesize a response for the query using the retrieved nodes.
-        """
-        return str(
-            self._text_synthesizer.synthesize(
-                query=query_bundle,
-                nodes=nodes,
-                streaming=False,
-            )
-        )
-
-    def synthesize(
-        self,
-        query_bundle: QueryBundle,
-        nodes: List[NodeWithScore],
-        additional_source_nodes: Optional[Sequence[NodeWithScore]] = None,
-    ) -> RESPONSE_TYPE:
-        image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
-        # Summarize the text nodes
-        text_response = self._summarize_text_nodes(
-            query_bundle=query_bundle,
-            nodes=text_nodes,
-        )
-
-        fmt_prompt = self._text_qa_template.format(
-            context_str=text_response,
-            query_str=query_bundle.query_str,
-        )
-
-        llm_response = self._multi_modal_llm.complete(
-            prompt=fmt_prompt,
-            image_documents=[
-                image_node.node
-                for image_node in image_nodes
-                if isinstance(image_node.node, ImageNode)
-            ],
-        )
-
-        return Response(
-            response=str(llm_response),
-            source_nodes=nodes,
-            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
-        )
+        self._multi_modal_llm = multimodal_model
+        self._text_qa_template = text_qa_template or DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
 
     async def asynthesize(
         self,
-        query_bundle: QueryBundle,
+        query: QueryTextType,
         nodes: List[NodeWithScore],
         additional_source_nodes: Optional[Sequence[NodeWithScore]] = None,
+        **response_kwargs: Any,
     ) -> RESPONSE_TYPE:
         image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
+
         # Summarize the text nodes to avoid exceeding the token limit
-        text_response = self._summarize_text_nodes(
-            query_bundle=query_bundle,
-            nodes=text_nodes,
-        )
+        text_response = str(await super().asynthesize(query, nodes))
 
         fmt_prompt = self._text_qa_template.format(
             context_str=text_response,
-            query_str=query_bundle.query_str,
+            query_str=query.query_str,  # type: ignore
         )
 
         llm_response = await self._multi_modal_llm.acomplete(
             prompt=fmt_prompt,
             image_documents=[
                 image_node.node
                 for image_node in image_nodes
                 if isinstance(image_node.node, ImageNode)
             ],
         )
 
         return Response(
             response=str(llm_response),
             source_nodes=nodes,
             metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
         )
 
 
 def create_query_engine(index, **kwargs) -> BaseQueryEngine:
     """
     Create a query engine for the given index.
 
     Args:
         index: The index to create a query engine for.
         params (optional): Additional parameters for the query engine, e.g.: similarity_top_k
     """
+
     top_k = int(os.getenv("TOP_K", 0))
     if top_k != 0 and kwargs.get("filters") is None:
         kwargs["similarity_top_k"] = top_k
     # If the index is a LlamaCloudIndex,
     # use auto_routed mode for better query results
     if index.__class__.__name__ == "LlamaCloudIndex":
         retrieval_mode = kwargs.get("retrieval_mode")
         if retrieval_mode is None:
             kwargs["retrieval_mode"] = "auto_routed"
-    if multi_modal_llm:
-        # Note: image nodes are not supported for auto_routed or chunk retrieval mode
+    mm_model = multi_modal_llm.get()
+    if mm_model:
         kwargs["retrieve_image_nodes"] = True
-        return MultiModalQueryEngine(
+        return RetrieverQueryEngine(
             retriever=index.as_retriever(**kwargs),
-            multi_modal_llm=multi_modal_llm,
+            response_synthesizer=MultiModalSynthesizer(
+                multimodal_model=mm_model
+            ),
         )
 
     return index.as_query_engine(**kwargs)
diff --git a/templates/components/settings/python/settings.py b/templates/components/settings/python/settings.py
index ae24ee4ad..abbc696ed 100644
--- a/templates/components/settings/python/settings.py
+++ b/templates/components/settings/python/settings.py
@@ -1,9 +1,13 @@
 import os
+from contextvars import ContextVar
 from typing import Dict, Optional
 
 from llama_index.core.multi_modal_llms import MultiModalLLM
 from llama_index.core.settings import Settings
 
-# Singleton for multi-modal LLM
-multi_modal_llm: Optional[MultiModalLLM] = None
+multi_modal_llm: ContextVar[Optional[MultiModalLLM]] = ContextVar(
+    "multi_modal_llm", default=None
+)
 
 
 def init_settings():
@@ -76,8 +78,7 @@ def init_openai():
     )
 
     if model_name in GPT4V_MODELS:
-        global multi_modal_llm
-        multi_modal_llm = OpenAIMultiModal(model=model_name)
+        multi_modal_llm.set(OpenAIMultiModal(model=model_name))
 
     dimensions = os.getenv("EMBEDDING_DIM")
     Settings.embed_model = OpenAIEmbedding(
         model=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small"),

From fa17d39e22832bf0176f7a4b27f4212d2c66f220 Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 12:15:47 +0700
Subject: [PATCH 07/17] add synthesize

---
 .../python/agent/tools/query_engine.py | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index 905d20df9..542541bf0 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -39,6 +39,38 @@
         self._multi_modal_llm = multimodal_model
         self._text_qa_template = text_qa_template or DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
 
+    def synthesize(
+        self,
+        query: QueryTextType,
+        nodes: List[NodeWithScore],
+        additional_source_nodes: Optional[Sequence[NodeWithScore]] = None,
+        **response_kwargs: Any,
+    ) -> RESPONSE_TYPE:
+        image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
+
+        # Summarize the text nodes to avoid exceeding the token limit
+        text_response = str(super().synthesize(query, nodes))
+
+        fmt_prompt = self._text_qa_template.format(
+            context_str=text_response,
+            query_str=query.query_str,  # type: ignore
+        )
+
+        llm_response = self._multi_modal_llm.complete(
+            prompt=fmt_prompt,
+            image_documents=[
+                image_node.node
+                for image_node in image_nodes
+                if isinstance(image_node.node, ImageNode)
+            ],
+        )
+
+        return Response(
+            response=str(llm_response),
+            source_nodes=nodes,
+            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
+        )
+
     async def asynthesize(
         self,
         query: QueryTextType,
@@ -93,6 +125,7 @@
         mm_model = multi_modal_llm.get()
         if mm_model:
             kwargs["retrieve_image_nodes"] = True
+            print("Using multi-modal model")
             return RetrieverQueryEngine(
                 retriever=index.as_retriever(**kwargs),
                 response_synthesizer=MultiModalSynthesizer(
                     multimodal_model=mm_model
                 ),
             )

From 0ffb7ff4e1041d4e8b90202be140d50f3519ba36 Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 13:18:24 +0700
Subject: [PATCH 08/17] use getter

---
 .../python/agent/tools/query_engine.py       | 43 +++----------
 .../components/settings/python/settings.py   | 14 +++---
 2 files changed, 14 insertions(+), 43 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index 542541bf0..6f664fe80 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -20,7 +20,7 @@
 )
 from llama_index.core.tools.query_engine import QueryEngineTool
 
-from app.settings import multi_modal_llm
+from app.settings import get_multi_modal_llm
 
 
 class MultiModalSynthesizer(TreeSummarize):
@@ -39,38 +39,6 @@
         self._multi_modal_llm = multimodal_model
         self._text_qa_template = text_qa_template or DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
 
-    def synthesize(
-        self,
-        query: QueryTextType,
-        nodes: List[NodeWithScore],
-        additional_source_nodes: Optional[Sequence[NodeWithScore]] = None,
-        **response_kwargs: Any,
-    ) -> RESPONSE_TYPE:
-        image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
-
-        # Summarize the text nodes to avoid exceeding the token limit
-        text_response = str(super().synthesize(query, nodes))
-
-        fmt_prompt = self._text_qa_template.format(
-            context_str=text_response,
-            query_str=query.query_str,  # type: ignore
-        )
-
-        llm_response = self._multi_modal_llm.complete(
-            prompt=fmt_prompt,
-            image_documents=[
-                image_node.node
-                for image_node in image_nodes
-                if isinstance(image_node.node, ImageNode)
-            ],
-        )
-
-        return Response(
-            response=str(llm_response),
-            source_nodes=nodes,
-            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
-        )
-
     async def asynthesize(
         self,
         query: QueryTextType,
@@ -125,17 +93,14 @@
         retrieval_mode = kwargs.get("retrieval_mode")
         if retrieval_mode is None:
             kwargs["retrieval_mode"] = "auto_routed"
-        mm_model = multi_modal_llm.get()
-        if mm_model:
+        multi_modal_llm = get_multi_modal_llm()
+        if multi_modal_llm:
             kwargs["retrieve_image_nodes"] = True
-            print("Using multi-modal model")
             return RetrieverQueryEngine(
                 retriever=index.as_retriever(**kwargs),
                 response_synthesizer=MultiModalSynthesizer(
-                    multimodal_model=mm_model
+                    multimodal_model=multi_modal_llm
                 ),
             )
 
-    return index.as_query_engine(**kwargs)
+    raise ValueError("Multi-modal LLM is not set")
diff --git a/templates/components/settings/python/settings.py b/templates/components/settings/python/settings.py
index abbc696ed..ff647560c 100644
--- a/templates/components/settings/python/settings.py
+++ b/templates/components/settings/python/settings.py
@@ -1,13 +1,16 @@
 import os
-from contextvars import ContextVar
 from typing import Dict, Optional
 
 from llama_index.core.multi_modal_llms import MultiModalLLM
 from llama_index.core.settings import Settings
 
-multi_modal_llm: ContextVar[Optional[MultiModalLLM]] = ContextVar(
-    "multi_modal_llm", default=None
-)
+# `Settings` does not support setting `MultiModalLLM`
+# so we use a global variable to store it
+_multi_modal_llm: Optional[MultiModalLLM] = None
+
+
+def get_multi_modal_llm():
+    return _multi_modal_llm
 
 
 def init_settings():
@@ -78,7 +81,8 @@ def init_openai():
     )
 
     if model_name in GPT4V_MODELS:
-        multi_modal_llm.set(OpenAIMultiModal(model=model_name))
+        global _multi_modal_llm
+        _multi_modal_llm = OpenAIMultiModal(model=model_name)
 
     dimensions = os.getenv("EMBEDDING_DIM")
     Settings.embed_model = OpenAIEmbedding(

From b9d336c61057f0945b3af7f5bc8d3781510083a3 Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 15:02:33 +0700
Subject: [PATCH 09/17] remove wrong code

---
 .../components/engines/python/agent/tools/query_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index 6f664fe80..c035975a0 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -53,7 +53,7 @@
         fmt_prompt = self._text_qa_template.format(
             context_str=text_response,
-            query_str=query.query_str,  # type: ignore
+            query_str=query.query_str,
         )
@@ -100,7 +100,7 @@
             ),
         )
 
-    raise ValueError("Multi-modal LLM is not set")
+    return index.as_query_engine(**kwargs)

From 235159e057974167b4148be9b9a05f0d6050afed Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 15:20:30 +0700
Subject: [PATCH 10/17] improve code

---
 .../python/agent/tools/query_engine.py | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index c035975a0..fa8e71b31 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -1,19 +1,19 @@
 import os
 from typing import Any, List, Optional, Sequence
 
+from llama_index.core import get_response_synthesizer
 from llama_index.core.base.base_query_engine import BaseQueryEngine
 from llama_index.core.base.response.schema import RESPONSE_TYPE, Response
 from llama_index.core.multi_modal_llms import MultiModalLLM
 from llama_index.core.prompts.base import BasePromptTemplate
 from llama_index.core.prompts.default_prompt_selectors import (
-    DEFAULT_TREE_SUMMARIZE_PROMPT_SEL,
+    DEFAULT_TEXT_QA_PROMPT_SEL,
 )
 from llama_index.core.query_engine import (
     RetrieverQueryEngine,
 )
 from llama_index.core.query_engine.multi_modal import _get_image_and_text_nodes
-from llama_index.core.response_synthesizers import TreeSummarize
-from llama_index.core.response_synthesizers.base import QueryTextType
+from llama_index.core.response_synthesizers.base import BaseSynthesizer, QueryTextType
 from llama_index.core.schema import (
     ImageNode,
     NodeWithScore,
 )
 from llama_index.core.tools.query_engine import QueryEngineTool
 
 from app.settings import get_multi_modal_llm
 
 
-class MultiModalSynthesizer(TreeSummarize):
+class MultiModalSynthesizer(BaseSynthesizer):
     """
     A synthesizer that summarizes text nodes and uses a multi-modal LLM to generate a response.
""" def __init__( self, - multimodal_model: Optional[MultiModalLLM] = None, + multimodal_model: MultiModalLLM, + response_synthesizer: Optional[BaseSynthesizer], text_qa_template: Optional[BasePromptTemplate] = None, *args, **kwargs, ): super().__init__(*args, **kwargs) self._multi_modal_llm = multimodal_model - self._text_qa_template = text_qa_template or DEFAULT_TREE_SUMMARIZE_PROMPT_SEL + self._response_synthesizer = response_synthesizer + self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT_SEL async def asynthesize( self, @@ -49,11 +51,13 @@ async def asynthesize( image_nodes, text_nodes = _get_image_and_text_nodes(nodes) # Summarize the text nodes to avoid exceeding the token limit - text_response = str(await super().asynthesize(query, nodes)) + text_response = str( + await self._response_synthesizer.asynthesize(query, text_nodes) + ) fmt_prompt = self._text_qa_template.format( context_str=text_response, - query_str=query.query_str, + query_str=query.query_str, # type: ignore ) llm_response = await self._multi_modal_llm.acomplete( @@ -90,13 +94,13 @@ def create_query_engine(index, **kwargs) -> BaseQueryEngine: retrieval_mode = kwargs.get("retrieval_mode") if retrieval_mode is None: kwargs["retrieval_mode"] = "auto_routed" - multi_modal_llm = get_multi_modal_llm() - if multi_modal_llm: + if get_multi_modal_llm(): kwargs["retrieve_image_nodes"] = True return RetrieverQueryEngine( retriever=index.as_retriever(**kwargs), response_synthesizer=MultiModalSynthesizer( - multimodal_model=multi_modal_llm + multimodal_model=get_multi_modal_llm(), + response_synthesizer=get_response_synthesizer(), ), ) From 3b14f59cc1ee166837d4ff9732a5bfbef182b83b Mon Sep 17 00:00:00 2001 From: leehuwuj Date: Fri, 29 Nov 2024 15:26:27 +0700 Subject: [PATCH 11/17] fix missing abstract methods --- .../python/agent/tools/query_engine.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py index fa8e71b31..c30f4234c 100644 --- a/templates/components/engines/python/agent/tools/query_engine.py +++ b/templates/components/engines/python/agent/tools/query_engine.py @@ -1,5 +1,5 @@ import os -from typing import Any, List, Optional, Sequence +from typing import Any, Dict, List, Optional, Sequence from llama_index.core import get_response_synthesizer from llama_index.core.base.base_query_engine import BaseQueryEngine @@ -19,6 +19,7 @@ NodeWithScore, ) from llama_index.core.tools.query_engine import QueryEngineTool +from llama_index.core.types import RESPONSE_TEXT_TYPE from app.settings import get_multi_modal_llm @@ -41,6 +42,25 @@ def __init__( self._response_synthesizer = response_synthesizer self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT_SEL + def _get_prompts(self, **kwargs) -> Dict[str, Any]: + return { + "text_qa_template": self._text_qa_template, + } + + def _update_prompts(self, prompts: Dict[str, Any]) -> None: + if "text_qa_template" in prompts: + self._text_qa_template = prompts["text_qa_template"] + + async def aget_response( + self, + *args, + **response_kwargs: Any, + ) -> RESPONSE_TEXT_TYPE: + return await self._response_synthesizer.aget_response(*args, **response_kwargs) + + def get_response(self, *args, **kwargs) -> RESPONSE_TEXT_TYPE: + return self._response_synthesizer.get_response(*args, **kwargs) + async def asynthesize( self, query: QueryTextType, From 81a8bfe4946f39350e36821c8f479ac3761439de Mon Sep 17 
00:00:00 2001 From: leehuwuj Date: Fri, 29 Nov 2024 15:28:10 +0700 Subject: [PATCH 12/17] fix missing abstract methods --- .../python/agent/tools/query_engine.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py index c30f4234c..7ff49fb04 100644 --- a/templates/components/engines/python/agent/tools/query_engine.py +++ b/templates/components/engines/python/agent/tools/query_engine.py @@ -95,6 +95,38 @@ async def asynthesize( metadata={"text_nodes": text_nodes, "image_nodes": image_nodes}, ) + def synthesize( + self, + query: QueryTextType, + nodes: List[NodeWithScore], + additional_source_nodes: Optional[Sequence[NodeWithScore]] = None, + **response_kwargs: Any, + ) -> RESPONSE_TYPE: + image_nodes, text_nodes = _get_image_and_text_nodes(nodes) + + # Summarize the text nodes to avoid exceeding the token limit + text_response = str(self._response_synthesizer.synthesize(query, text_nodes)) + + fmt_prompt = self._text_qa_template.format( + context_str=text_response, + query_str=query.query_str, # type: ignore + ) + + llm_response = self._multi_modal_llm.complete( + prompt=fmt_prompt, + image_documents=[ + image_node.node + for image_node in image_nodes + if isinstance(image_node.node, ImageNode) + ], + ) + + return Response( + response=str(llm_response), + source_nodes=nodes, + metadata={"text_nodes": text_nodes, "image_nodes": image_nodes}, + ) + def create_query_engine(index, **kwargs) -> BaseQueryEngine: """ From 2e8e89a9b419ba731935cd262cbaadc0151fbf5d Mon Sep 17 00:00:00 2001 From: leehuwuj Date: Fri, 29 Nov 2024 15:36:34 +0700 Subject: [PATCH 13/17] use MultiModalSynthesizer as default --- .../python/agent/tools/query_engine.py | 126 +++++++++--------- 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py index 7ff49fb04..64276b160 100644 --- a/templates/components/engines/python/agent/tools/query_engine.py +++ b/templates/components/engines/python/agent/tools/query_engine.py @@ -9,9 +9,6 @@ from llama_index.core.prompts.default_prompt_selectors import ( DEFAULT_TEXT_QA_PROMPT_SEL, ) -from llama_index.core.query_engine import ( - RetrieverQueryEngine, -) from llama_index.core.query_engine.multi_modal import _get_image_and_text_nodes from llama_index.core.response_synthesizers.base import BaseSynthesizer, QueryTextType from llama_index.core.schema import ( @@ -24,6 +21,64 @@ from app.settings import get_multi_modal_llm +def create_query_engine(index, **kwargs) -> BaseQueryEngine: + """ + Create a query engine for the given index. + + Args: + index: The index to create a query engine for. 
+        params (optional): Additional parameters for the query engine, e.g.: similarity_top_k
+    """
+
+    top_k = int(os.getenv("TOP_K", 0))
+    if top_k != 0 and kwargs.get("filters") is None:
+        kwargs["similarity_top_k"] = top_k
+    multimodal_llm = get_multi_modal_llm()
+    if multimodal_llm:
+        kwargs["response_synthesizer"] = MultiModalSynthesizer(
+            multimodal_model=multimodal_llm,
+            response_synthesizer=get_response_synthesizer(),
+        )
+
+    # If the index is a LlamaCloudIndex,
+    # use auto_routed mode for better query results
+    if index.__class__.__name__ == "LlamaCloudIndex":
+        retrieval_mode = kwargs.get("retrieval_mode")
+        if retrieval_mode is None:
+            kwargs["retrieval_mode"] = "auto_routed"
+            if multimodal_llm:
+                kwargs["retrieve_image_nodes"] = True
+    return index.as_query_engine(**kwargs)
+
+
+def get_query_engine_tool(
+    index,
+    name: Optional[str] = None,
+    description: Optional[str] = None,
+    **kwargs,
+) -> QueryEngineTool:
+    """
+    Get a query engine tool for the given index.
+
+    Args:
+        index: The index to create a query engine for.
+        name (optional): The name of the tool.
+        description (optional): The description of the tool.
+    """
+    if name is None:
+        name = "query_index"
+    if description is None:
+        description = (
+            "Use this tool to retrieve information about the text corpus from an index."
+        )
+    query_engine = create_query_engine(index, **kwargs)
+    return QueryEngineTool.from_defaults(
+        query_engine=query_engine,
+        name=name,
+        description=description,
+    )
+
+
 class MultiModalSynthesizer(BaseSynthesizer):
     """
     A synthesizer that summarizes text nodes and uses a multi-modal LLM to generate a response.
@@ -125,6 +125,9 @@
     ) -> RESPONSE_TYPE:
         image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
 
+        if len(image_nodes) == 0:
+            return await self._response_synthesizer.asynthesize(query, text_nodes)
+
         # Summarize the text nodes to avoid exceeding the token limit
         text_response = str(
             await self._response_synthesizer.asynthesize(query, text_nodes)
         )
@@ -162,6 +162,9 @@
     ) -> RESPONSE_TYPE:
         image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
 
+        if len(image_nodes) == 0:
+            return self._response_synthesizer.synthesize(query, text_nodes)
+
         # Summarize the text nodes to avoid exceeding the token limit
         text_response = str(self._response_synthesizer.synthesize(query, text_nodes))
 
@@ -187,62 +187,3 @@
             source_nodes=nodes,
             metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
         )
-
-
-def create_query_engine(index, **kwargs) -> BaseQueryEngine:
-    """
-    Create a query engine for the given index.
-
-    Args:
-        index: The index to create a query engine for.
-        params (optional): Additional parameters for the query engine, e.g.: similarity_top_k
-    """
-
-    top_k = int(os.getenv("TOP_K", 0))
-    if top_k != 0 and kwargs.get("filters") is None:
-        kwargs["similarity_top_k"] = top_k
-    # If the index is a LlamaCloudIndex,
-    # use auto_routed mode for better query results
-    if index.__class__.__name__ == "LlamaCloudIndex":
-        retrieval_mode = kwargs.get("retrieval_mode")
-        if retrieval_mode is None:
-            kwargs["retrieval_mode"] = "auto_routed"
-        if get_multi_modal_llm():
-            kwargs["retrieve_image_nodes"] = True
-            return RetrieverQueryEngine(
-                retriever=index.as_retriever(**kwargs),
-                response_synthesizer=MultiModalSynthesizer(
-                    multimodal_model=get_multi_modal_llm(),
-                    response_synthesizer=get_response_synthesizer(),
-                ),
-            )
-
-    return index.as_query_engine(**kwargs)
-
-
-def get_query_engine_tool(
-    index,
-    name: Optional[str] = None,
-    description: Optional[str] = None,
-    **kwargs,
-) -> QueryEngineTool:
-    """
-    Get a query engine tool for the given index.
-
-    Args:
-        index: The index to create a query engine for.
-        name (optional): The name of the tool.
-        description (optional): The description of the tool.
-    """
-    if name is None:
-        name = "query_index"
-    if description is None:
-        description = (
-            "Use this tool to retrieve information about the text corpus from an index."
-        )
-    query_engine = create_query_engine(index, **kwargs)
-    return QueryEngineTool.from_defaults(
-        query_engine=query_engine,
-        name=name,
-        description=description,
-    )

From 46096ba4cf78a85d65ed2e039b6c80f9535319b0 Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 15:39:53 +0700
Subject: [PATCH 14/17] better code

---
 .../components/engines/python/agent/tools/query_engine.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index 64276b160..8bfe3ff6a 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -37,7 +37,6 @@
     if multimodal_llm:
         kwargs["response_synthesizer"] = MultiModalSynthesizer(
             multimodal_model=multimodal_llm,
-            response_synthesizer=get_response_synthesizer(),
         )
 
     # If the index is a LlamaCloudIndex,
@@ -86,14 +85,14 @@
     def __init__(
         self,
         multimodal_model: MultiModalLLM,
-        response_synthesizer: Optional[BaseSynthesizer],
+        response_synthesizer: Optional[BaseSynthesizer] = None,
         text_qa_template: Optional[BasePromptTemplate] = None,
         *args,
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
         self._multi_modal_llm = multimodal_model
-        self._response_synthesizer = response_synthesizer
+        self._response_synthesizer = response_synthesizer or get_response_synthesizer()
         self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT_SEL

From a7f56a766e259fd2f9c90b3aafcc16ef7a4c3371 Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 15:49:02 +0700
Subject: [PATCH 15/17] simplify code

---
 .../engines/python/agent/tools/query_engine.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index 8bfe3ff6a..8c3c1b1d7 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -1,7 +1,6 @@
 import os
 from typing import Any, Dict, List, Optional, Sequence
 
-from llama_index.core import get_response_synthesizer
 from llama_index.core.base.base_query_engine import BaseQueryEngine
 from llama_index.core.base.response.schema import RESPONSE_TYPE, Response
 from llama_index.core.multi_modal_llms import MultiModalLLM
@@ -42,11 +41,10 @@
     # If the index is a LlamaCloudIndex,
     # use auto_routed mode for better query results
     if index.__class__.__name__ == "LlamaCloudIndex":
-        retrieval_mode = kwargs.get("retrieval_mode")
-        if retrieval_mode is None:
+        if kwargs.get("retrieval_mode") is None:
             kwargs["retrieval_mode"] = "auto_routed"
-            if multimodal_llm:
-                kwargs["retrieve_image_nodes"] = True
+        if multimodal_llm:
+            kwargs["retrieve_image_nodes"] = True
     return index.as_query_engine(**kwargs)
 
@@ -86,14 +84,14 @@
     def __init__(
         self,
         multimodal_model: MultiModalLLM,
-        response_synthesizer: Optional[BaseSynthesizer] = None,
+        response_synthesizer: Optional[BaseSynthesizer],
         text_qa_template: Optional[BasePromptTemplate] = None,
         *args,
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
         self._multi_modal_llm = multimodal_model
-        self._response_synthesizer = response_synthesizer or get_response_synthesizer()
+        self._response_synthesizer = response_synthesizer
         self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT_SEL

From af026747f6717ef0c7484140374b7a3e62a91b09 Mon Sep 17 00:00:00 2001
From: leehuwuj
Date: Fri, 29 Nov 2024 15:59:22 +0700
Subject: [PATCH 16/17] fix wrong params

---
 .../components/engines/python/agent/tools/query_engine.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/templates/components/engines/python/agent/tools/query_engine.py b/templates/components/engines/python/agent/tools/query_engine.py
index 8c3c1b1d7..396fb1d6e 100644
--- a/templates/components/engines/python/agent/tools/query_engine.py
+++ b/templates/components/engines/python/agent/tools/query_engine.py
@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, List, Optional, Sequence
 
+from llama_index.core import get_response_synthesizer
 from llama_index.core.base.base_query_engine import BaseQueryEngine
 from llama_index.core.base.response.schema import RESPONSE_TYPE, Response
 from llama_index.core.multi_modal_llms import MultiModalLLM
@@ -85,14 +85,14 @@
     def __init__(
         self,
         multimodal_model: MultiModalLLM,
-        response_synthesizer: Optional[BaseSynthesizer],
+        response_synthesizer: Optional[BaseSynthesizer] = None,
         text_qa_template: Optional[BasePromptTemplate] = None,
         *args,
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
         self._multi_modal_llm = multimodal_model
-        self._response_synthesizer = response_synthesizer
+        self._response_synthesizer = response_synthesizer or get_response_synthesizer()
         self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT_SEL

From 7516b06951b6f9ab7fe28488ff040d01a20b105f Mon Sep 17 00:00:00 2001
From: Marcus Schiesser
Date: Fri, 29 Nov 2024 18:00:49 +0700
Subject: [PATCH 17/17] docs: changeset

---
 .changeset/blue-hornets-boil.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/blue-hornets-boil.md

diff --git a/.changeset/blue-hornets-boil.md b/.changeset/blue-hornets-boil.md
new file mode 100644
index 000000000..e8c2928d4
--- /dev/null
+++ b/.changeset/blue-hornets-boil.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Add support for multimodal indexes (e.g. from LlamaCloud)
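
The sketch below shows how the query path that this patch series converges on might be exercised from a generated backend. It is a minimal, hypothetical example: the module paths app.settings and app.engine.tools.query_engine, the data/ directory, and the environment values are assumptions for illustration (the template files above are copied into a generated project's app package, where exact paths may differ), not part of the patches themselves.

    import os

    from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

    # Assumed import locations in a generated project (see note above).
    from app.settings import get_multi_modal_llm, init_settings
    from app.engine.tools.query_engine import get_query_engine_tool

    # "gpt-4o" is assumed to be in GPT4V_MODELS, so init_openai() also stores
    # the module-level OpenAIMultiModal that create_query_engine() picks up.
    os.environ["MODEL_PROVIDER"] = "openai"
    os.environ["MODEL"] = "gpt-4o"

    init_settings()
    assert get_multi_modal_llm() is not None  # the multimodal path is active

    # Build a local index and wrap it in the query-engine tool; with the
    # multi-modal LLM set, answers are produced by MultiModalSynthesizer,
    # which first tree-summarizes the text nodes and then sends the summary
    # plus any image nodes to the multi-modal model.
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)
    tool = get_query_engine_tool(index)
    response = tool.query_engine.query("What do the images in the corpus show?")
    print(response)

With a LlamaCloudIndex in place of the local VectorStoreIndex, create_query_engine() additionally defaults retrieval_mode to "auto_routed" and sets retrieve_image_nodes=True, as introduced in PATCH 13 and refined in PATCH 15.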