Skip to content

Commit

Permalink
rebase on refactored vs
Browse files Browse the repository at this point in the history
  • Loading branch information
duwenxin99 committed Sep 11, 2024
1 parent 2576315 commit fc7e02d
Show file tree
Hide file tree
Showing 7 changed files with 568 additions and 41 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ test = [
"mypy==1.11.2",
"pytest-asyncio==0.24.0",
"pytest==8.3.2",
"pytest-cov==5.0.0"
"pytest-cov==5.0.0",
"Pillow==10.4.0"
]

[build-system]
Expand Down
2 changes: 1 addition & 1 deletion samples/index_tuning_sample/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ We will calculate the average latency of multiple queries to have a better under
query_2 = "Aromas include tropical fruit, broom, brimstone and dried herb."
query_3 = "Wine from spain."
query_4 = "dry-farmed vineyard"
query_5 = "balanced elegance of some kind"
query_5 = "balanced elegance of some kind"
queries = [query_1, query_2, query_3, query_4, query_5]
```
Expand Down
161 changes: 144 additions & 17 deletions src/langchain_google_alloydb_pg/async_vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
# TODO: Remove below import when minimum supported Python version is 3.10
from __future__ import annotations

import base64
import json
import uuid
import warnings
from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Type, Union

import numpy as np
Expand Down Expand Up @@ -294,6 +296,45 @@ async def aadd_documents(
ids = await self.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs)
return ids

def _encode_image(self, uri: str) -> str:
"""Get base64 string from image URI."""
with open(uri, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")

async def aadd_images(
self,
uris: List[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Embed images and add to the table."""
encoded_images = []
if metadatas is None:
metadatas = [{"image_uri": uri} for uri in uris]

for uri in uris:
encoded_image = self._encode_image(uri)
encoded_images.append(encoded_image)

# check if `embed_image()` API is supported by the embedding service used
if hasattr(self.embedding_service, "embed_image"):
try:
embeddings = self.embedding_service.embed_image(uris)
except Exception as e:
raise Exception(
f"Make sure your selected embedding model supports list of image URIs as input. {str(e)}"
)
else:
raise ValueError(
"Please use an embedding model that supports image embedding."
)

ids = await self._aadd_embeddings(
encoded_images, embeddings, metadatas=metadatas, ids=ids, **kwargs
)
return ids

async def adelete(
self,
ids: Optional[List[str]] = None,
Expand Down Expand Up @@ -477,12 +518,13 @@ async def __query_collection(
async def asimilarity_search(
self,
query: str,
image_uri: Optional[str] = None,
k: Optional[int] = None,
filter: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected by similarity search on query."""
embedding = self.embedding_service.embed_query(text=query)
embedding = self._embedding_helper(query, image_uri)

return await self.asimilarity_search_by_vector(
embedding=embedding, k=k, filter=filter, **kwargs
Expand All @@ -499,15 +541,16 @@ def _select_relevance_score_fn(self) -> Callable[[float], float]:
elif self.distance_strategy == DistanceStrategy.EUCLIDEAN:
return self._euclidean_relevance_score_fn

async def asimilarity_search_with_score(
async def asimilarity_search_with_score( # type: ignore[override]
self,
query: str,
image_uri: Optional[str] = None,
k: Optional[int] = None,
filter: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs and distance scores selected by similarity search on query."""
embedding = self.embedding_service.embed_query(query)
embedding = self._embedding_helper(query, image_uri)
docs = await self.asimilarity_search_with_score_by_vector(
embedding=embedding, k=k, filter=filter, **kwargs
)
Expand Down Expand Up @@ -560,17 +603,88 @@ async def asimilarity_search_with_score_by_vector(

return documents_with_scores

async def amax_marginal_relevance_search(
async def _asimilarity_search_with_relevance_scores( # type: ignore[override]
self,
query: Optional[str] = None,
image_uri: Optional[str] = None,
k: int = 4,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""
Return docs and relevance scores in the range [0, 1].
0 is dissimilar, 1 is most similar.
Args:
query: Input text.
k: Number of Documents to return. Defaults to 4.
**kwargs: kwargs to be passed to similarity search. Should include:
score_threshold: Optional, a floating point value between 0 to 1 to
filter the resulting set of retrieved docs.
Returns:
List of Tuples of (doc, similarity_score)
"""
relevance_score_fn = self._select_relevance_score_fn()
docs_and_scores = await self.asimilarity_search_with_score(
query=query, image_uri=image_uri, k=k, **kwargs
)
return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores]

async def asimilarity_search_with_relevance_scores( # type: ignore[override]
self,
query: Optional[str] = None,
image_uri: Optional[str] = None,
k: int = 4,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Async return docs and relevance scores in the range [0, 1].
0 is dissimilar, 1 is most similar.
Args:
query: Input text.
k: Number of Documents to return. Defaults to 4.
**kwargs: kwargs to be passed to similarity search. Should include:
score_threshold: Optional, a floating point value between 0 to 1 to
filter the resulting set of retrieved docs
Returns:
List of Tuples of (doc, similarity_score)
"""
score_threshold = kwargs.pop("score_threshold", None)

docs_and_similarities = await self._asimilarity_search_with_relevance_scores(
query=query, image_uri=image_uri, k=k, **kwargs
)
if any(
similarity < 0.0 or similarity > 1.0
for _, similarity in docs_and_similarities
):
warnings.warn(
"Relevance scores must be between"
f" 0 and 1, got {docs_and_similarities}"
)

if score_threshold is not None:
docs_and_similarities = [
(doc, similarity)
for doc, similarity in docs_and_similarities
if similarity >= score_threshold
]
if len(docs_and_similarities) == 0:
warnings.warn(
"No relevant docs were retrieved using the relevance score"
f" threshold {score_threshold}"
)
return docs_and_similarities

async def amax_marginal_relevance_search( # type: ignore[override]
self,
query: str,
image_uri: Optional[str] = None,
k: Optional[int] = None,
fetch_k: Optional[int] = None,
lambda_mult: Optional[float] = None,
filter: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance."""
embedding = self.embedding_service.embed_query(text=query)
embedding = self._embedding_helper(query, image_uri)

return await self.amax_marginal_relevance_search_by_vector(
embedding=embedding,
Expand All @@ -581,7 +695,7 @@ async def amax_marginal_relevance_search(
**kwargs,
)

async def amax_marginal_relevance_search_by_vector(
async def amax_marginal_relevance_search_by_vector( # type: ignore[override]
self,
embedding: List[float],
k: Optional[int] = None,
Expand Down Expand Up @@ -734,31 +848,31 @@ async def is_valid_index(
results = result_map.fetchall()
return bool(len(results) == 1)

def similarity_search(
def add_texts(
self,
query: str,
k: Optional[int] = None,
filter: Optional[str] = None,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[Document]:
) -> List[str]:
raise NotImplementedError(
"Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead."
)

def add_texts(
def add_documents(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
documents: List[Document],
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
raise NotImplementedError(
"Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead."
)

def add_documents(
def add_images(
self,
documents: List[Document],
uris: List[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
Expand Down Expand Up @@ -816,9 +930,22 @@ def from_documents( # type: ignore[override]
"Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead."
)

def similarity_search_with_score(
def similarity_search(
self,
query: str,
image_uri: Optional[str] = None,
k: Optional[int] = None,
filter: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
raise NotImplementedError(
"Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead."
)

def similarity_search_with_score( # type: ignore[override]
self,
query: str,
image_uri: Optional[str] = None,
k: Optional[int] = None,
filter: Optional[str] = None,
**kwargs: Any,
Expand Down
Loading

0 comments on commit fc7e02d

Please sign in to comment.