rebase on refactored vs

googleapis · Sep 11, 2024 · fc7e02d · fc7e02d
1 parent 2576315
commit fc7e02d
Show file tree

Hide file tree

Showing 7 changed files with 568 additions and 41 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -44,7 +44,8 @@ test = [
     "mypy==1.11.2",
     "pytest-asyncio==0.24.0",
     "pytest==8.3.2",
-    "pytest-cov==5.0.0"
+    "pytest-cov==5.0.0",
+    "Pillow==10.4.0"
 ]
 
 [build-system]

diff --git a/samples/index_tuning_sample/README.md b/samples/index_tuning_sample/README.md
@@ -120,7 +120,7 @@ We will calculate the average latency of multiple queries to have a better under
     query_2 = "Aromas include tropical fruit, broom, brimstone and dried herb."
     query_3 = "Wine from spain."
     query_4 = "dry-farmed vineyard"
-    query_5 = "balanced elegance of some kind"
+    query_5 = "balanced elegance of some kind" 
     queries = [query_1, query_2, query_3, query_4, query_5]
     ```
 

diff --git a/src/langchain_google_alloydb_pg/async_vectorstore.py b/src/langchain_google_alloydb_pg/async_vectorstore.py
@@ -15,8 +15,10 @@
 # TODO: Remove below import when minimum supported Python version is 3.10
 from __future__ import annotations
 
+import base64
 import json
 import uuid
+import warnings
 from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Type, Union
 
 import numpy as np
@@ -294,6 +296,45 @@ async def aadd_documents(
         ids = await self.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs)
         return ids
 
+    def _encode_image(self, uri: str) -> str:
+        """Get base64 string from image URI."""
+        with open(uri, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    async def aadd_images(
+        self,
+        uris: List[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Embed images and add to the table."""
+        encoded_images = []
+        if metadatas is None:
+            metadatas = [{"image_uri": uri} for uri in uris]
+
+        for uri in uris:
+            encoded_image = self._encode_image(uri)
+            encoded_images.append(encoded_image)
+
+        # check if `embed_image()` API is supported by the embedding service used
+        if hasattr(self.embedding_service, "embed_image"):
+            try:
+                embeddings = self.embedding_service.embed_image(uris)
+            except Exception as e:
+                raise Exception(
+                    f"Make sure your selected embedding model supports list of image URIs as input. {str(e)}"
+                )
+        else:
+            raise ValueError(
+                "Please use an embedding model that supports image embedding."
+            )
+
+        ids = await self._aadd_embeddings(
+            encoded_images, embeddings, metadatas=metadatas, ids=ids, **kwargs
+        )
+        return ids
+
     async def adelete(
         self,
         ids: Optional[List[str]] = None,
@@ -477,12 +518,13 @@ async def __query_collection(
     async def asimilarity_search(
         self,
         query: str,
+        image_uri: Optional[str] = None,
         k: Optional[int] = None,
         filter: Optional[str] = None,
         **kwargs: Any,
     ) -> List[Document]:
         """Return docs selected by similarity search on query."""
-        embedding = self.embedding_service.embed_query(text=query)
+        embedding = self._embedding_helper(query, image_uri)
 
         return await self.asimilarity_search_by_vector(
             embedding=embedding, k=k, filter=filter, **kwargs
@@ -499,15 +541,16 @@ def _select_relevance_score_fn(self) -> Callable[[float], float]:
         elif self.distance_strategy == DistanceStrategy.EUCLIDEAN:
             return self._euclidean_relevance_score_fn
 
-    async def asimilarity_search_with_score(
+    async def asimilarity_search_with_score(  # type: ignore[override]
         self,
         query: str,
+        image_uri: Optional[str] = None,
         k: Optional[int] = None,
         filter: Optional[str] = None,
         **kwargs: Any,
     ) -> List[Tuple[Document, float]]:
         """Return docs and distance scores selected by similarity search on query."""
-        embedding = self.embedding_service.embed_query(query)
+        embedding = self._embedding_helper(query, image_uri)
         docs = await self.asimilarity_search_with_score_by_vector(
             embedding=embedding, k=k, filter=filter, **kwargs
         )
@@ -560,17 +603,88 @@ async def asimilarity_search_with_score_by_vector(
 
         return documents_with_scores
 
-    async def amax_marginal_relevance_search(
+    async def _asimilarity_search_with_relevance_scores(  # type: ignore[override]
+        self,
+        query: Optional[str] = None,
+        image_uri: Optional[str] = None,
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """
+        Return docs and relevance scores in the range [0, 1].
+        0 is dissimilar, 1 is most similar.
+        Args:
+            query: Input text.
+            k: Number of Documents to return. Defaults to 4.
+            **kwargs: kwargs to be passed to similarity search. Should include:
+                score_threshold: Optional, a floating point value between 0 to 1 to
+                filter the resulting set of retrieved docs.
+        Returns:
+            List of Tuples of (doc, similarity_score)
+        """
+        relevance_score_fn = self._select_relevance_score_fn()
+        docs_and_scores = await self.asimilarity_search_with_score(
+            query=query, image_uri=image_uri, k=k, **kwargs
+        )
+        return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores]
+
+    async def asimilarity_search_with_relevance_scores(  # type: ignore[override]
+        self,
+        query: Optional[str] = None,
+        image_uri: Optional[str] = None,
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Async return docs and relevance scores in the range [0, 1].
+        0 is dissimilar, 1 is most similar.
+        Args:
+            query: Input text.
+            k: Number of Documents to return. Defaults to 4.
+            **kwargs: kwargs to be passed to similarity search. Should include:
+                score_threshold: Optional, a floating point value between 0 to 1 to
+                filter the resulting set of retrieved docs
+        Returns:
+            List of Tuples of (doc, similarity_score)
+        """
+        score_threshold = kwargs.pop("score_threshold", None)
+
+        docs_and_similarities = await self._asimilarity_search_with_relevance_scores(
+            query=query, image_uri=image_uri, k=k, **kwargs
+        )
+        if any(
+            similarity < 0.0 or similarity > 1.0
+            for _, similarity in docs_and_similarities
+        ):
+            warnings.warn(
+                "Relevance scores must be between"
+                f" 0 and 1, got {docs_and_similarities}"
+            )
+
+        if score_threshold is not None:
+            docs_and_similarities = [
+                (doc, similarity)
+                for doc, similarity in docs_and_similarities
+                if similarity >= score_threshold
+            ]
+            if len(docs_and_similarities) == 0:
+                warnings.warn(
+                    "No relevant docs were retrieved using the relevance score"
+                    f" threshold {score_threshold}"
+                )
+        return docs_and_similarities
+
+    async def amax_marginal_relevance_search(  # type: ignore[override]
         self,
         query: str,
+        image_uri: Optional[str] = None,
         k: Optional[int] = None,
         fetch_k: Optional[int] = None,
         lambda_mult: Optional[float] = None,
         filter: Optional[str] = None,
         **kwargs: Any,
     ) -> List[Document]:
         """Return docs selected using the maximal marginal relevance."""
-        embedding = self.embedding_service.embed_query(text=query)
+        embedding = self._embedding_helper(query, image_uri)
 
         return await self.amax_marginal_relevance_search_by_vector(
             embedding=embedding,
@@ -581,7 +695,7 @@ async def amax_marginal_relevance_search(
             **kwargs,
         )
 
-    async def amax_marginal_relevance_search_by_vector(
+    async def amax_marginal_relevance_search_by_vector(  # type: ignore[override]
         self,
         embedding: List[float],
         k: Optional[int] = None,
@@ -734,31 +848,31 @@ async def is_valid_index(
             results = result_map.fetchall()
         return bool(len(results) == 1)
 
-    def similarity_search(
+    def add_texts(
         self,
-        query: str,
-        k: Optional[int] = None,
-        filter: Optional[str] = None,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
         **kwargs: Any,
-    ) -> List[Document]:
+    ) -> List[str]:
         raise NotImplementedError(
             "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead."
         )
 
-    def add_texts(
+    def add_documents(
         self,
-        texts: Iterable[str],
-        metadatas: Optional[List[dict]] = None,
+        documents: List[Document],
         ids: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> List[str]:
         raise NotImplementedError(
             "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead."
         )
 
-    def add_documents(
+    def add_images(
         self,
-        documents: List[Document],
+        uris: List[str],
+        metadatas: Optional[List[dict]] = None,
         ids: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> List[str]:
@@ -816,9 +930,22 @@ def from_documents(  # type: ignore[override]
             "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead."
         )
 
-    def similarity_search_with_score(
+    def similarity_search(
+        self,
+        query: str,
+        image_uri: Optional[str] = None,
+        k: Optional[int] = None,
+        filter: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        raise NotImplementedError(
+            "Sync methods are not implemented for AsyncAlloyDBVectorStore. Use AlloyDBVectorStore interface instead."
+        )
+
+    def similarity_search_with_score(  # type: ignore[override]
         self,
         query: str,
+        image_uri: Optional[str] = None,
         k: Optional[int] = None,
         filter: Optional[str] = None,
         **kwargs: Any,