Skip to content

Commit

Permalink
Add effective_search_ratio to vectorstore (langchain-ai#18)
Browse files Browse the repository at this point in the history
* Add effective_search_ratio to vectorstore

* Format

* Switch to query attribute

* add changelog

* fix test

---------

Co-authored-by: Alex Thomas <alexthomas93@users.noreply.github.com>
  • Loading branch information
tomasonjo and alexthomas93 authored Dec 9, 2024
1 parent 2ff67ea commit 2392837
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

- Enhanced Neo4j driver connection management with more robust error handling.
- Simplified connection state checking in Neo4jGraph.
- Introduced `effective_search_ratio` parameter in Neo4jVector to enhance query accuracy by adjusting the candidate pool size during similarity searches.

### Fixed

Expand Down
36 changes: 31 additions & 5 deletions libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,17 @@ def _get_search_index_query(
if index_type == IndexType.NODE:
if search_type == SearchType.VECTOR:
return (
"CALL db.index.vector.queryNodes($index, $k, $embedding) "
"CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
"YIELD node, score "
"WITH node, score LIMIT $k "
)
elif search_type == SearchType.HYBRID:
call_prefix = "CALL () { " if neo4j_version_is_5_23_or_above else "CALL { "

query_body = (
"CALL db.index.vector.queryNodes($index, $k, $embedding) "
"CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
"YIELD node, score "
"WITH node, score LIMIT $k "
"WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
"UNWIND nodes AS n "
"RETURN n.node AS node, (n.score / max) AS score UNION "
Expand All @@ -117,8 +119,9 @@ def _get_search_index_query(
raise ValueError(f"Unsupported SearchType: {search_type}")
else:
return (
"CALL db.index.vector.queryRelationships($index, $k, $embedding) "
"CALL db.index.vector.queryRelationships($index, $k * $ef, $embedding) "
"YIELD relationship, score "
"WITH relationship, score LIMIT $k "
)


Expand Down Expand Up @@ -461,6 +464,8 @@ class Neo4jVector(VectorStore):
'NODE' or 'RELATIONSHIP'
pre_delete_collection: If True, will delete existing data if it exists.
(default: False). Useful for testing.
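effective_search_ratio: Multiplier applied to k when querying the vector
index: k * effective_search_ratio candidates are retrieved and the top k
are kept, trading performance for accuracy. Passed per query to the
similarity search methods (default: 1).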
Example:
.. code-block:: python
Expand Down Expand Up @@ -587,6 +592,7 @@ def __init__(
self.retrieval_query = retrieval_query
self.search_type = search_type
self._index_type = index_type

# Calculate embedding dimension
self.embedding_dimension = len(embedding.embed_query("foo"))

Expand Down Expand Up @@ -984,6 +990,7 @@ def similarity_search(
k: int = 4,
params: Dict[str, Any] = {},
filter: Optional[Dict[str, Any]] = None,
effective_search_ratio: int = 1,
**kwargs: Any,
) -> List[Document]:
"""Run similarity search with Neo4jVector.
Expand All @@ -996,7 +1003,9 @@ def similarity_search(
filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
filter on metadata.
Defaults to None.
effective_search_ratio (int): Multiplier applied to k when querying the
vector index: k * effective_search_ratio candidates are retrieved and
the top k are kept. Higher values trade performance for accuracy.
Defaults to 1.
Returns:
List of Documents most similar to the query.
"""
Expand All @@ -1007,6 +1016,7 @@ def similarity_search(
query=query,
params=params,
filter=filter,
effective_search_ratio=effective_search_ratio,
**kwargs,
)

Expand All @@ -1016,6 +1026,7 @@ def similarity_search_with_score(
k: int = 4,
params: Dict[str, Any] = {},
filter: Optional[Dict[str, Any]] = None,
effective_search_ratio: int = 1,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Expand All @@ -1028,6 +1039,9 @@ def similarity_search_with_score(
filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
filter on metadata.
Defaults to None.
effective_search_ratio (int): Multiplier applied to k when querying the
vector index: k * effective_search_ratio candidates are retrieved and
the top k are kept. Higher values trade performance for accuracy.
Defaults to 1.
Returns:
List of Documents most similar to the query and score for each
Expand All @@ -1039,6 +1053,7 @@ def similarity_search_with_score(
query=query,
params=params,
filter=filter,
effective_search_ratio=effective_search_ratio,
**kwargs,
)
return docs
Expand All @@ -1049,6 +1064,7 @@ def similarity_search_with_score_by_vector(
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
params: Dict[str, Any] = {},
effective_search_ratio: int = 1,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""
Expand All @@ -1069,6 +1085,9 @@ def similarity_search_with_score_by_vector(
Defaults to None.
params (Dict[str, Any]): The search params for the index type.
Defaults to empty dict.
effective_search_ratio (int): Multiplier applied to k when querying the
vector index: k * effective_search_ratio candidates are retrieved and
the top k are kept. Higher values trade performance for accuracy.
Defaults to 1.
Returns:
List[Tuple[Document, float]]: A list of tuples, each containing
Expand Down Expand Up @@ -1154,6 +1173,7 @@ def similarity_search_with_score_by_vector(
"embedding": embedding,
"keyword_index": self.keyword_index_name,
"query": remove_lucene_chars(kwargs["query"]),
"ef": effective_search_ratio,
**params,
**filter_params,
}
Expand Down Expand Up @@ -1209,6 +1229,7 @@ def similarity_search_by_vector(
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
params: Dict[str, Any] = {},
effective_search_ratio: int = 1,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to embedding vector.
Expand All @@ -1226,7 +1247,12 @@ def similarity_search_by_vector(
List of Documents most similar to the query vector.
"""
docs_and_scores = self.similarity_search_with_score_by_vector(
embedding=embedding, k=k, filter=filter, params=params, **kwargs
embedding=embedding,
k=k,
filter=filter,
params=params,
effective_search_ratio=effective_search_ratio,
**kwargs,
)
return [doc for doc, _ in docs_and_scores]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ def test_hybrid_score_normalization() -> None:
"index": "vector",
"k": 1,
"embedding": FakeEmbeddingsWithOsDimension().embed_query("foo"),
"ef": 1,
"query": "foo",
"keyword_index": "keyword",
},
Expand Down Expand Up @@ -993,6 +994,28 @@ def test_neo4j_max_marginal_relevance_search() -> None:
drop_vector_indexes(docsearch)


def test_neo4jvector_effective_search_ratio() -> None:
    """Test similarity search with a non-default ``effective_search_ratio``.

    Builds a vector store from the module-level ``texts`` and checks that
    searching with ``k=2`` and ``effective_search_ratio=2`` still returns
    exactly ``k`` results, and that the scored variant returns them in
    strictly descending score order.
    """
    docsearch = Neo4jVector.from_texts(
        texts=texts,
        embedding=FakeEmbeddingsWithOsDimension(),
        url=url,
        username=username,
        password=password,
        pre_delete_collection=True,
    )
    # Clean up in ``finally`` so a failing assertion does not leave the
    # indexes behind for the tests that run afterwards.
    try:
        output = docsearch.similarity_search("foo", k=2, effective_search_ratio=2)
        assert len(output) == 2

        output1 = docsearch.similarity_search_with_score(
            "foo", k=2, effective_search_ratio=2
        )
        assert len(output1) == 2
        # Assert ordered by score
        assert output1[0][1] > output1[1][1]
    finally:
        drop_vector_indexes(docsearch)


def test_neo4jvector_passing_graph_object() -> None:
"""Test end to end construction and search with passing graph object."""
graph = Neo4jGraph(url=url, username=username, password=password)
Expand Down
6 changes: 4 additions & 2 deletions libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,9 @@ def test_converting_to_yaml() -> None:
def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None:
expected_query = (
"CALL () { "
"CALL db.index.vector.queryNodes($index, $k, $embedding) "
"CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
"YIELD node, score "
"WITH node, score LIMIT $k "
"WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
"UNWIND nodes AS n "
"RETURN n.node AS node, (n.score / max) AS score UNION "
Expand All @@ -225,8 +226,9 @@ def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None:
def test_get_search_index_query_hybrid_node_neo4j_5_23_below() -> None:
expected_query = (
"CALL { "
"CALL db.index.vector.queryNodes($index, $k, $embedding) "
"CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
"YIELD node, score "
"WITH node, score LIMIT $k "
"WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
"UNWIND nodes AS n "
"RETURN n.node AS node, (n.score / max) AS score UNION "
Expand Down

0 comments on commit 2392837

Please sign in to comment.