Skip to content

Commit

Permalink
Implement max_marginal_relevance_search in VectorStore of Pinecone (
Browse files Browse the repository at this point in the history
langchain-ai#6056)

This adds an implementation of MMR search for Pinecone. I also have two
semi-related observations about this vector store class:
- Maybe we should also have a
`similarity_search_by_vector_returning_embeddings` method like the one in
Supabase, but it's not in the base `VectorStore` class, so I didn't implement it
- Speaking of the base class, it has
`similarity_search_with_relevance_scores`, but in Pinecone the method is called
`similarity_search_with_score`; maybe we should consider renaming it to
align with the `VectorStore` base class and its other subclasses (or add that
as an alias for backward compatibility)

#### Who can review?

Tag maintainers/contributors who might be interested:
 - VectorStores / Retrievers / Memory - @dev2049
  • Loading branch information
neo authored and Undertone0809 committed Jun 19, 2023
1 parent af9fa13 commit b7c9931
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 4 deletions.
46 changes: 42 additions & 4 deletions docs/modules/indexes/vectorstores/examples/pinecone.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
},
"outputs": [],
"source": [
"!pip install pinecone-client"
"!pip install pinecone-client openai tiktoken"
]
},
{
Expand Down Expand Up @@ -70,7 +70,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "aac9563e",
"metadata": {
"tags": []
Expand All @@ -85,7 +85,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "a3c3999a",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -135,13 +135,51 @@
"print(docs[0].page_content)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d46d1452",
"metadata": {},
"source": [
"### Maximal Marginal Relevance Searches\n",
"\n",
"In addition to using similarity search in the retriever object, you can also use `mmr` as retriever.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a359ed74",
"metadata": {},
"outputs": [],
"source": []
"source": [
"retriever = docsearch.as_retriever(search_type=\"mmr\")\n",
"matched_docs = retriever.get_relevant_documents(query)\n",
"for i, d in enumerate(matched_docs):\n",
" print(f\"\\n## Document {i}\\n\")\n",
" print(d.page_content)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "7c477287",
"metadata": {},
"source": [
"Or use `max_marginal_relevance_search` directly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ca82740",
"metadata": {},
"outputs": [],
"source": [
"found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)\n",
"for i, doc in enumerate(found_docs):\n",
" print(f\"{i + 1}.\", doc.page_content, \"\\n\")"
]
}
],
"metadata": {
Expand Down
82 changes: 82 additions & 0 deletions langchain/vectorstores/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
import uuid
from typing import Any, Callable, Iterable, List, Optional, Tuple

import numpy as np

from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore
from langchain.vectorstores.utils import maximal_marginal_relevance

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -157,6 +160,85 @@ def similarity_search(
)
return [doc for doc, _ in docs_and_scores]

def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    filter: Optional[dict] = None,
    namespace: Optional[str] = None,
    **kwargs: Any,
) -> List[Document]:
    """Pick documents for an embedding using maximal marginal relevance (MMR).

    MMR balances similarity to the query against diversity among the
    documents that are selected.

    Args:
        embedding: Query embedding to look up similar documents for.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of candidate matches requested from the index
            (sent as ``top_k``) and handed to the MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results, with 0 corresponding to
            maximum diversity and 1 to minimum diversity. Defaults to 0.5.
        filter: Optional dict forwarded unchanged as the ``filter``
            argument of the index query.
        namespace: Namespace to query; when ``None`` the store's own
            ``self._namespace`` is used instead.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    target_namespace = self._namespace if namespace is None else namespace

    # The stored vectors are needed for the diversity computation, hence
    # include_values=True in addition to the metadata.
    response = self._index.query(
        [embedding],
        top_k=fetch_k,
        include_values=True,
        include_metadata=True,
        namespace=target_namespace,
        filter=filter,
    )

    query_vector = np.array([embedding], dtype=np.float32)
    candidate_vectors = [match["values"] for match in response["matches"]]
    chosen_indices = maximal_marginal_relevance(
        query_vector,
        candidate_vectors,
        k=k,
        lambda_mult=lambda_mult,
    )

    chosen_metadata = [response["matches"][idx]["metadata"] for idx in chosen_indices]

    documents = []
    for meta in chosen_metadata:
        # The text is stored in the metadata under self._text_key; pop it so
        # the remaining entries become the Document's metadata.
        text = meta.pop(self._text_key)
        documents.append(Document(page_content=text, metadata=meta))
    return documents

def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    filter: Optional[dict] = None,
    namespace: Optional[str] = None,
    **kwargs: Any,
) -> List[Document]:
    """Pick documents for a text query using maximal marginal relevance (MMR).

    The query is embedded with ``self._embedding_function`` and the search
    itself is delegated to ``max_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to the MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results, with 0 corresponding to
            maximum diversity and 1 to minimum diversity. Defaults to 0.5.
        filter: Optional dict passed through to the by-vector search.
        namespace: Optional namespace passed through to the by-vector search.
        **kwargs: Accepted for interface compatibility; not forwarded.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    query_vector = self._embedding_function(query)
    search_by_vector = self.max_marginal_relevance_search_by_vector
    return search_by_vector(
        query_vector, k, fetch_k, lambda_mult, filter, namespace
    )

@classmethod
def from_texts(
cls,
Expand Down

0 comments on commit b7c9931

Please sign in to comment.