From 6361fd37011460bf61aaaf17b98120bd02c0acd1 Mon Sep 17 00:00:00 2001
From: Stan Girard
Date: Sat, 16 Mar 2024 00:52:57 -0700
Subject: [PATCH] Add hybrid search and relevance ranking to document retrieval

---
 backend/modules/brain/rags/quivr_rag.py       |  6 +-
 backend/packages/files/parsers/common.py      |  4 +-
 backend/vectorstore/supabase.py               | 18 +++-
 supabase/migrations/20240316075202_hybrid.sql | 86 +++++++++++++++++++
 4 files changed, 109 insertions(+), 5 deletions(-)
 create mode 100644 supabase/migrations/20240316075202_hybrid.sql

diff --git a/backend/modules/brain/rags/quivr_rag.py b/backend/modules/brain/rags/quivr_rag.py
index 3fcc4a1ab316..c98b7bd299da 100644
--- a/backend/modules/brain/rags/quivr_rag.py
+++ b/backend/modules/brain/rags/quivr_rag.py
@@ -50,6 +50,7 @@
 When answering use markdown to make it concise and neat.
 Use the following pieces of context from files provided by the user that are store in a brain to answer the users question in the same language as the user question. Your name is Quivr. You're a helpful assistant.
 If you don't know the answer with the context provided from the files, just say that you don't know, don't try to make up an answer.
+The relevance of each piece of context is ranked from 0 to 2, with 2 being the most relevant and 0 the least. Give more weight to the more relevant context when answering.
 User instruction to follow if provided to answer: {custom_instructions}
 """
 
@@ -65,7 +66,7 @@
 
 # How we format documents
 DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
-    template="File: {file_name} Content: {page_content}"
+    template="File: {file_name} Content: {page_content} Relevance: {similarity}"
 )
 
 
@@ -226,6 +227,7 @@ def get_chain(self):
             | CONDENSE_QUESTION_PROMPT
             | ChatLiteLLM(temperature=0, model=self.model, api_base=api_base)
             | StrOutputParser(),
+            "question": lambda x: x["question"],
         }
 
         prompt_custom_user = self.prompt_to_use()
@@ -236,7 +238,7 @@ def get_chain(self):
         # Now we retrieve the documents
         retrieved_documents = {
             "docs": itemgetter("standalone_question") | retriever_doc,
-            "question": lambda x: x["standalone_question"],
+            "question": itemgetter("question"),
             "custom_instructions": lambda x: prompt_to_use,
         }
 
diff --git a/backend/packages/files/parsers/common.py b/backend/packages/files/parsers/common.py
index fa149f73e7fb..d00b118ff612 100644
--- a/backend/packages/files/parsers/common.py
+++ b/backend/packages/files/parsers/common.py
@@ -42,9 +42,11 @@ async def process_file(
     for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
         new_metadata = metadata.copy()
         len_chunk = len(enc.encode(doc.page_content))
-        page_content_encoded = doc.page_content.encode("unicode_escape").decode(
+        # Replace newlines with spaces before escaping the content
+        page_content_encoded = doc.page_content.replace("\n", " ")
+        page_content_encoded = page_content_encoded.encode("unicode_escape").decode(
             "ascii", "replace"
         )
         new_metadata["chunk_size"] = len_chunk
 
         doc_with_metadata = DocumentSerializable(
diff --git a/backend/vectorstore/supabase.py b/backend/vectorstore/supabase.py
index ea6a313e76c0..be6db33669ba 100644
--- a/backend/vectorstore/supabase.py
+++ b/backend/vectorstore/supabase.py
@@ -68,19 +68,28 @@ def find_brain_closest_query(
 
     def similarity_search(
         self,
         query: str,
+        full_text_weight: float = 2.0,  # weight of the full-text rank in the fused score
+        semantic_weight: float = 1.0,  # weight of the semantic rank in the fused score
+        rrf_k: int = 1,  # smoothing constant for reciprocal rank fusion
         k: int = 40,
-        table: str = "match_vectors",
+        table: str = "hybrid_match_vectors",
         threshold: float = 0.5,
         **kwargs: Any,
     ) -> List[Document]:
         vectors = self._embedding.embed_documents([query])
         query_embedding = vectors[0]
+        query_lower = query.lower()
         res = self._client.rpc(
             table,
             {
+                "query_text": query_lower,
+                "match_count": 500,
                 "query_embedding": query_embedding,
-                "max_chunk_sum": self.max_input,
+                "full_text_weight": full_text_weight,
+                "semantic_weight": semantic_weight,
+                "rrf_k": rrf_k,
                 "p_brain_id": str(self.brain_id),
+                "max_chunk_sum": self.max_input,
             },
         ).execute()

@@ -96,5 +105,10 @@ def similarity_search(
             for search in res.data
             if search.get("content")
         ]
+        for search in res.data:
+            if search.get("content"):
+                logger.info("ft_rank: %s", search.get("ft_rank", 0.0))
+                logger.info("similarity: %s", search.get("similarity", 0.0))
+                logger.info("rank_ix: %s", search.get("rank_ix", 0))
 
         return match_result
diff --git a/supabase/migrations/20240316075202_hybrid.sql b/supabase/migrations/20240316075202_hybrid.sql
new file mode 100644
index 000000000000..c5dc3d9cdfa1
--- /dev/null
+++ b/supabase/migrations/20240316075202_hybrid.sql
@@ -0,0 +1,86 @@
+alter table "public"."vectors" add column "fts" tsvector generated always as (to_tsvector('english'::regconfig, content)) stored;
+
+CREATE INDEX vectors_fts_idx ON public.vectors USING gin (fts);
+
+set check_function_bodies = off;
+
+CREATE OR REPLACE FUNCTION public.hybrid_match_vectors(query_text text, query_embedding vector, p_brain_id uuid, match_count integer, max_chunk_sum integer, full_text_weight double precision DEFAULT 1.0, semantic_weight double precision DEFAULT 1.0, rrf_k integer DEFAULT 50)
+ RETURNS TABLE(id uuid, brain_id uuid, content text, metadata jsonb, embedding vector, similarity double precision, ft_rank double precision, rank_ix integer)
+ LANGUAGE plpgsql
+AS $function$
+BEGIN
+RETURN QUERY
+WITH full_text AS (
+  SELECT
+    v.id,
+    ts_rank_cd(v.fts, websearch_to_tsquery(query_text))::double precision AS ft_rank,
+    row_number() OVER (ORDER BY ts_rank_cd(v.fts, websearch_to_tsquery(query_text)) DESC)::integer AS rank_ix,
+    (v.metadata->>'chunk_size')::integer AS chunk_size
+  FROM
+    vectors v
+  INNER JOIN
+    brains_vectors bv ON v.id = bv.vector_id
+  WHERE
+    bv.brain_id = p_brain_id AND
+    v.fts @@ websearch_to_tsquery(query_text)
+  ORDER BY rank_ix
+  LIMIT LEAST(match_count, 30) * 2
+), semantic AS (
+  SELECT
+    v.id,
+    (1 - (v.embedding <#> query_embedding))::double precision AS semantic_similarity,
+    row_number() OVER (ORDER BY (v.embedding <#> query_embedding))::integer AS rank_ix
+  FROM
+    vectors v
+  INNER JOIN
+    brains_vectors bv ON v.id = bv.vector_id
+  WHERE
+    bv.brain_id = p_brain_id
+  ORDER BY rank_ix
+  LIMIT LEAST(match_count, 30) * 2
+), combined AS (
+  SELECT
+    coalesce(ft.id, st.id) AS id,
+    (coalesce(1.0 / (rrf_k + ft.rank_ix), 0)::double precision * full_text_weight + coalesce(1.0 / (rrf_k + st.rank_ix), 0)::double precision * semantic_weight)::double precision AS combined_score,
+    ft.ft_rank,
+    ft.rank_ix,
+    ft.chunk_size
+  FROM
+    full_text ft
+  FULL OUTER JOIN
+    semantic st ON ft.id = st.id
+), ranked_vectors AS (
+  SELECT
+    c.id,
+    c.combined_score,
+    sum(c.chunk_size) OVER (ORDER BY c.combined_score DESC, c.rank_ix)::integer AS running_total,
+    c.ft_rank,
+    c.rank_ix,
+    c.chunk_size
+  FROM
+    combined c
+)
+SELECT
+  v.id,
+  bv.brain_id,
+  v.content,
+  v.metadata,
+  v.embedding,
+  c.combined_score::double precision AS similarity,
+  c.ft_rank::double precision,
+  c.rank_ix::integer
+FROM
+  ranked_vectors c
+JOIN
+  vectors v ON v.id = c.id
+JOIN
+  brains_vectors bv ON v.id = bv.vector_id
+WHERE
+  c.running_total <= max_chunk_sum
+ORDER BY
+  c.combined_score DESC, c.rank_ix
+LIMIT
+  LEAST(match_count, 30);
+END;
+$function$
+;
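
Note: the similarity value returned by hybrid_match_vectors is a weighted
reciprocal rank fusion (RRF) of the two rank positions, not a cosine
similarity. A minimal Python sketch of the same scoring, with illustrative
rank positions (the rrf_score helper below is not part of the patch):

    from typing import Optional

    def rrf_score(
        ft_rank_ix: Optional[int],
        sem_rank_ix: Optional[int],
        full_text_weight: float = 2.0,
        semantic_weight: float = 1.0,
        rrf_k: int = 1,
    ) -> float:
        # Mirrors the SQL: coalesce(1.0 / (rrf_k + rank_ix), 0) * weight, summed.
        # A missing rank (the document only appears in one result list)
        # contributes 0, like the FULL OUTER JOIN plus coalesce in the migration.
        ft = full_text_weight / (rrf_k + ft_rank_ix) if ft_rank_ix is not None else 0.0
        sem = semantic_weight / (rrf_k + sem_rank_ix) if sem_rank_ix is not None else 0.0
        return ft + sem

    # A chunk ranked 1st by full text and 3rd semantically, with the defaults
    # the backend passes (full_text_weight=2.0, semantic_weight=1.0, rrf_k=1):
    print(rrf_score(1, 3))  # 2.0/(1+1) + 1.0/(1+3) = 1.25

With rrf_k=1 (the backend default) the score decays steeply with rank
position; larger values such as the SQL default of 50 flatten the curve so
lower-ranked chunks still contribute.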
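
To exercise the new RPC by hand after applying the migration, it can be
called the same way SupabaseVectorStore.similarity_search does. A sketch
assuming an already configured supabase-py client; query_embedding, brain_id,
and the max_chunk_sum value are placeholders you must supply:

    # client is a supabase.Client; query_embedding comes from the same
    # embedding model used at ingestion time; brain_id is an existing brain.
    res = client.rpc(
        "hybrid_match_vectors",
        {
            "query_text": "how do i reset my password",
            "match_count": 500,
            "query_embedding": query_embedding,
            "full_text_weight": 2.0,
            "semantic_weight": 1.0,
            "rrf_k": 1,
            "p_brain_id": str(brain_id),
            "max_chunk_sum": 4000,  # placeholder token budget (self.max_input in the backend)
        },
    ).execute()

    # Each row carries the fused score plus the full-text diagnostics
    # surfaced by the migration.
    for row in res.data:
        print(row["similarity"], row["ft_rank"], row["rank_ix"], row["content"][:80])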