From 6361fd37011460bf61aaaf17b98120bd02c0acd1 Mon Sep 17 00:00:00 2001
From: Stan Girard
Date: Sat, 16 Mar 2024 00:52:57 -0700
Subject: [PATCH] Add hybrid search and relevance ranking to document retrieval

---
 backend/modules/brain/rags/quivr_rag.py       |  6 +-
 backend/packages/files/parsers/common.py      |  4 +-
 backend/vectorstore/supabase.py               | 18 +++-
 supabase/migrations/20240316075202_hybrid.sql | 86 +++++++++++++++++++
 4 files changed, 109 insertions(+), 5 deletions(-)
 create mode 100644 supabase/migrations/20240316075202_hybrid.sql

diff --git a/backend/modules/brain/rags/quivr_rag.py b/backend/modules/brain/rags/quivr_rag.py
index 3fcc4a1ab316..c98b7bd299da 100644
--- a/backend/modules/brain/rags/quivr_rag.py
+++ b/backend/modules/brain/rags/quivr_rag.py
@@ -50,6 +50,7 @@
 When answering use markdown to make it concise and neat.
 Use the following pieces of context from files provided by the user that are store in a brain to answer the users question in the same language as the user question. Your name is Quivr. You're a helpful assistant.
 If you don't know the answer with the context provided from the files, just say that you don't know, don't try to make up an answer.
+The relevance of each piece of context is ranked from 0 to 2, with 2 being the most relevant and 0 the least. Give more weight to the more relevant context when answering.
 User instruction to follow if provided to answer: {custom_instructions}
 """
 
@@ -65,7 +66,7 @@
 
 # How we format documents
 DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
-    template="File: {file_name} Content: {page_content}"
+    template="File: {file_name} Content: {page_content} Relevance: {similarity}"
 )
 
 
@@ -226,6 +227,7 @@ def get_chain(self):
             | CONDENSE_QUESTION_PROMPT
             | ChatLiteLLM(temperature=0, model=self.model, api_base=api_base)
             | StrOutputParser(),
+            "question": lambda x: x["question"],
         }
 
         prompt_custom_user = self.prompt_to_use()
@@ -236,7 +238,7 @@ def get_chain(self):
         # Now we retrieve the documents
         retrieved_documents = {
             "docs": itemgetter("standalone_question") | retriever_doc,
-            "question": lambda x: x["standalone_question"],
+            "question": itemgetter("question"),
             "custom_instructions": lambda x: prompt_to_use,
         }
 
diff --git a/backend/packages/files/parsers/common.py b/backend/packages/files/parsers/common.py
index fa149f73e7fb..d00b118ff612 100644
--- a/backend/packages/files/parsers/common.py
+++ b/backend/packages/files/parsers/common.py
@@ -42,9 +42,11 @@ async def process_file(
     for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
         new_metadata = metadata.copy()
         len_chunk = len(enc.encode(doc.page_content))
-        page_content_encoded = doc.page_content.encode("unicode_escape").decode(
+        # Replace newlines with spaces before escaping the content
+        page_content_encoded = doc.page_content.replace("\n", " ")
+        page_content_encoded = page_content_encoded.encode("unicode_escape").decode(
             "ascii", "replace"
         )
         new_metadata["chunk_size"] = len_chunk
 
         doc_with_metadata = DocumentSerializable(
diff --git a/backend/vectorstore/supabase.py b/backend/vectorstore/supabase.py
index ea6a313e76c0..be6db33669ba 100644
--- a/backend/vectorstore/supabase.py
+++ b/backend/vectorstore/supabase.py
@@ -68,19 +68,28 @@ def find_brain_closest_query(
 
     def similarity_search(
         self,
         query: str,
+        full_text_weight: float = 2.0,  # weight of the full-text rank in the fused score
+        semantic_weight: float = 1.0,  # weight of the semantic rank in the fused score
+        rrf_k: int = 1,  # smoothing constant for reciprocal rank fusion
         k: int = 40,
-        table: str = "match_vectors",
+        table: str = "hybrid_match_vectors",
         threshold: float = 0.5,
         **kwargs: Any,
     ) -> List[Document]:
         vectors = self._embedding.embed_documents([query])
         query_embedding = vectors[0]
+        query_lower = query.lower()
         res = self._client.rpc(
             table,
             {
+                "query_text": query_lower,
+                "match_count": 500,
                 "query_embedding": query_embedding,
-                "max_chunk_sum": self.max_input,
+                "full_text_weight": full_text_weight,
+                "semantic_weight": semantic_weight,
+                "rrf_k": rrf_k,
                 "p_brain_id": str(self.brain_id),
+                "max_chunk_sum": self.max_input,
             },
         ).execute()

@@ -96,5 +105,10 @@ def similarity_search(
             for search in res.data
             if search.get("content")
         ]
+        for search in res.data:
+            if search.get("content"):
+                logger.info("ft_rank: %s", search.get("ft_rank", 0.0))
+                logger.info("similarity: %s", search.get("similarity", 0.0))
+                logger.info("rank_ix: %s", search.get("rank_ix", 0))
 
         return match_result
diff --git a/supabase/migrations/20240316075202_hybrid.sql b/supabase/migrations/20240316075202_hybrid.sql
new file mode 100644
index 000000000000..c5dc3d9cdfa1
--- /dev/null
+++ b/supabase/migrations/20240316075202_hybrid.sql
@@ -0,0 +1,86 @@
+alter table "public"."vectors" add column "fts" tsvector generated always as (to_tsvector('english'::regconfig, content)) stored;
+
+CREATE INDEX vectors_fts_idx ON public.vectors USING gin (fts);
+
+set check_function_bodies = off;
+
+CREATE OR REPLACE FUNCTION public.hybrid_match_vectors(query_text text, query_embedding vector, p_brain_id uuid, match_count integer, max_chunk_sum integer, full_text_weight double precision DEFAULT 1.0, semantic_weight double precision DEFAULT 1.0, rrf_k integer DEFAULT 50)
+ RETURNS TABLE(id uuid, brain_id uuid, content text, metadata jsonb, embedding vector, similarity double precision, ft_rank double precision, rank_ix integer)
+ LANGUAGE plpgsql
+AS $function$
+BEGIN
+RETURN QUERY
+WITH full_text AS (
+  SELECT
+    v.id,
+    ts_rank_cd(v.fts, websearch_to_tsquery(query_text))::double precision AS ft_rank,
+    row_number() OVER (ORDER BY ts_rank_cd(v.fts, websearch_to_tsquery(query_text)) DESC)::integer AS rank_ix,
+    (v.metadata->>'chunk_size')::integer AS chunk_size
+  FROM
+    vectors v
+  INNER JOIN
+    brains_vectors bv ON v.id = bv.vector_id
+  WHERE
+    bv.brain_id = p_brain_id AND
+    v.fts @@ websearch_to_tsquery(query_text)
+  ORDER BY rank_ix
+  LIMIT LEAST(match_count, 30) * 2
+), semantic AS (
+  SELECT
+    v.id,
+    (1 - (v.embedding <#> query_embedding))::double precision AS semantic_similarity,
+    row_number() OVER (ORDER BY (v.embedding <#> query_embedding))::integer AS rank_ix
+  FROM
+    vectors v
+  INNER JOIN
+    brains_vectors bv ON v.id = bv.vector_id
+  WHERE
+    bv.brain_id = p_brain_id
+  ORDER BY rank_ix
+  LIMIT LEAST(match_count, 30) * 2
+), combined AS (
+  SELECT
+    coalesce(ft.id, st.id) AS id,
+    (coalesce(1.0 / (rrf_k + ft.rank_ix), 0)::double precision * full_text_weight + coalesce(1.0 / (rrf_k + st.rank_ix), 0)::double precision * semantic_weight)::double precision AS combined_score,
+    ft.ft_rank,
+    ft.rank_ix,
+    ft.chunk_size
+  FROM
+    full_text ft
+  FULL OUTER JOIN
+    semantic st ON ft.id = st.id
+), ranked_vectors AS (
+  SELECT
+    c.id,
+    c.combined_score,
+    sum(c.chunk_size) OVER (ORDER BY c.combined_score DESC, c.rank_ix)::integer AS running_total,
+    c.ft_rank,
+    c.rank_ix,
+    c.chunk_size
+  FROM
+    combined c
+)
+SELECT
+  v.id,
+  bv.brain_id,
+  v.content,
+  v.metadata,
+  v.embedding,
+  c.combined_score::double precision AS similarity,
+  c.ft_rank::double precision,
+  c.rank_ix::integer
+FROM
+  ranked_vectors c
+JOIN
+  vectors v ON v.id = c.id
+JOIN
+  brains_vectors bv ON v.id = bv.vector_id
+WHERE
+  c.running_total <= max_chunk_sum
+ORDER BY
+  c.combined_score DESC, c.rank_ix
+LIMIT
+  LEAST(match_count, 30);
+END;
+$function$
+;
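
Note: the similarity value returned by hybrid_match_vectors is a weighted
reciprocal rank fusion (RRF) of the two rank positions, not a cosine
similarity. A minimal Python sketch of the same scoring, with illustrative
rank positions (the rrf_score helper below is not part of the patch):

    from typing import Optional

    def rrf_score(
        ft_rank_ix: Optional[int],
        sem_rank_ix: Optional[int],
        full_text_weight: float = 2.0,
        semantic_weight: float = 1.0,
        rrf_k: int = 1,
    ) -> float:
        # Mirrors the SQL: coalesce(1.0 / (rrf_k + rank_ix), 0) * weight, summed.
        # A missing rank (the document only appears in one result list)
        # contributes 0, like the FULL OUTER JOIN plus coalesce in the migration.
        ft = full_text_weight / (rrf_k + ft_rank_ix) if ft_rank_ix is not None else 0.0
        sem = semantic_weight / (rrf_k + sem_rank_ix) if sem_rank_ix is not None else 0.0
        return ft + sem

    # A chunk ranked 1st by full text and 3rd semantically, with the defaults
    # the backend passes (full_text_weight=2.0, semantic_weight=1.0, rrf_k=1):
    print(rrf_score(1, 3))  # 2.0/(1+1) + 1.0/(1+3) = 1.25

With rrf_k=1 (the backend default) the score decays steeply with rank
position; larger values such as the SQL default of 50 flatten the curve so
lower-ranked chunks still contribute.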
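
To exercise the new RPC by hand after applying the migration, it can be
called the same way SupabaseVectorStore.similarity_search does. A sketch
assuming an already configured supabase-py client; query_embedding, brain_id,
and the max_chunk_sum value are placeholders you must supply:

    # client is a supabase.Client; query_embedding comes from the same
    # embedding model used at ingestion time; brain_id is an existing brain.
    res = client.rpc(
        "hybrid_match_vectors",
        {
            "query_text": "how do i reset my password",
            "match_count": 500,
            "query_embedding": query_embedding,
            "full_text_weight": 2.0,
            "semantic_weight": 1.0,
            "rrf_k": 1,
            "p_brain_id": str(brain_id),
            "max_chunk_sum": 4000,  # placeholder token budget (self.max_input in the backend)
        },
    ).execute()

    # Each row carries the fused score plus the full-text diagnostics
    # surfaced by the migration.
    for row in res.data:
        print(row["similarity"], row["ft_rank"], row["rank_ix"], row["content"][:80])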