From 680810732eb5d636237cd916de73f065b7da2414 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 18 Jul 2024 11:01:49 +0200 Subject: [PATCH 01/17] Implement apply_filter_policy and FilterPolicy.MERGE for the new filters --- .../document_stores/types/filter_policy.py | 77 ++++++++++- test/utils/test_filters.py | 122 ++++++++++++++++++ 2 files changed, 195 insertions(+), 4 deletions(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index a2be576d20..66f849adaa 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -36,10 +36,41 @@ def from_str(filter_policy: str) -> "FilterPolicy": return policy +def is_legacy(filter_item: Dict[str, Any]) -> bool: + """ + Check if the given filter is a legacy filter. + + :param filter_item: The filter to check. + :returns: True if the filter is a legacy filter, False otherwise. + """ + return "operator" not in filter_item + + +def is_comparison_filter(filter_item: Dict[str, Any]) -> bool: + """ + Check if the given filter is a comparison filter. + + :param filter_item: The filter to check. + :returns: True if the filter is a comparison filter, False otherwise. + """ + return all(key in filter_item for key in ["field", "operator", "value"]) + + +def is_logical_filter(filter_item: Dict[str, Any]) -> bool: + """ + Check if the given filter is a logical filter. + + :param filter_item: The filter to check. + :returns: True if the filter is a logical filter, False otherwise. + """ + return "operator" in filter_item and "conditions" in filter_item + + def apply_filter_policy( filter_policy: FilterPolicy, init_filters: Optional[Dict[str, Any]] = None, runtime_filters: Optional[Dict[str, Any]] = None, + default_logical_operator: str = "AND", ) -> Optional[Dict[str, Any]]: """ Apply the filter policy to the given initial and runtime filters to determine the final set of filters used. @@ -52,10 +83,48 @@ def apply_filter_policy( values from the runtime filters will overwrite those from the initial filters. :param init_filters: The initial filters set during the initialization of the relevant retriever. :param runtime_filters: The filters provided at runtime, usually during a query operation execution. These filters - can change for each query/retreiver run invocation. + can change for each query/retriever run invocation. + :param default_logical_operator: The default logical operator to use when merging filters (non-legacy filters only). :returns: A dictionary containing the resulting filters based on the provided policy. """ if filter_policy == FilterPolicy.MERGE and runtime_filters: - return {**(init_filters or {}), **runtime_filters} - else: - return runtime_filters or init_filters + # legacy filters merge handling + if is_legacy(runtime_filters): + return {**(init_filters or {}), **runtime_filters} + elif init_filters is not None: + # here we merge new filters + def merge_comparison_filters( + filter1: Dict[str, Any], filter2: Dict[str, Any], logical_op: str + ) -> Dict[str, Any]: + if filter1["field"] == filter2["field"]: + # When fields are the same, use the runtime filter (filter2) + return filter2 + return {"operator": logical_op, "conditions": [filter1, filter2]} + + def merge_comparison_and_logical( + comparison: Dict[str, Any], logical: Dict[str, Any], logical_op: str + ) -> Dict[str, Any]: + if logical["operator"] == logical_op: + logical["conditions"].append(comparison) + return logical + else: + return {"operator": logical_op, "conditions": [comparison, logical]} + + def merge_logical_filters( + filter1: Dict[str, Any], filter2: Dict[str, Any], logical_op: str + ) -> Dict[str, Any]: + if filter1["operator"] == filter2["operator"] == logical_op: + return {"operator": logical_op, "conditions": filter1["conditions"] + filter2["conditions"]} + else: + return {"operator": logical_op, "conditions": [filter1, filter2]} + + if is_comparison_filter(init_filters) and is_comparison_filter(runtime_filters): + return merge_comparison_filters(init_filters, runtime_filters, default_logical_operator) + elif is_comparison_filter(init_filters) and is_logical_filter(runtime_filters): + return merge_comparison_and_logical(init_filters, runtime_filters, default_logical_operator) + elif is_logical_filter(init_filters) and is_comparison_filter(runtime_filters): + return merge_comparison_and_logical(runtime_filters, init_filters, default_logical_operator) + elif is_logical_filter(init_filters) and is_logical_filter(runtime_filters): + return merge_logical_filters(init_filters, runtime_filters, default_logical_operator) + + return runtime_filters or init_filters diff --git a/test/utils/test_filters.py b/test/utils/test_filters.py index eccf1ca1ec..edb8aacbd2 100644 --- a/test/utils/test_filters.py +++ b/test/utils/test_filters.py @@ -3,6 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import pytest import pandas as pd +from haystack.document_stores.types import FilterPolicy + +from haystack.document_stores.types.filter_policy import apply_filter_policy from haystack import Document from haystack.errors import FilterError @@ -726,3 +729,122 @@ def test_convert_with_incorrect_filter_nesting(): with pytest.raises(FilterError): convert({"number": {"page": {"chapter": "intro"}}}) + + +################################################################################ +# Tests for the apply_filter_policy function +################################################################################ +def test_no_filters(): + assert apply_filter_policy(FilterPolicy.MERGE, None, None) is None + + +def test_replace_policy(): + init_filter = {"field": "meta.name", "operator": "==", "value": "John"} + runtime_filter = {"field": "meta.year", "operator": "==", "value": "2022"} + assert apply_filter_policy(FilterPolicy.REPLACE, init_filter, runtime_filter) == runtime_filter + + +def test_merge_comparison_filters(): + init_filter = {"field": "meta.name", "operator": "==", "value": "John"} + runtime_filter = {"field": "meta.year", "operator": "==", "value": "2022"} + expected = {"operator": "AND", "conditions": [init_filter, runtime_filter]} + assert apply_filter_policy(FilterPolicy.MERGE, init_filter, runtime_filter) == expected + + +def test_merge_comparison_filters_same_field(): + init_filter = {"field": "meta.name", "operator": "==", "value": "John"} + runtime_filter = {"field": "meta.name", "operator": "==", "value": "Jane"} + assert apply_filter_policy(FilterPolicy.MERGE, init_filter, runtime_filter) == runtime_filter + + +def test_merge_comparison_and_logical(): + comparison_filter = {"field": "meta.type", "operator": "==", "value": "pdf"} + logical_filter = { + "operator": "AND", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "John"}, + {"field": "meta.year", "operator": "==", "value": "2022"}, + ], + } + expected = { + "operator": "AND", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "John"}, + {"field": "meta.year", "operator": "==", "value": "2022"}, + {"field": "meta.type", "operator": "==", "value": "pdf"}, + ], + } + assert apply_filter_policy(FilterPolicy.MERGE, comparison_filter, logical_filter) == expected + + +def test_merge_comparison_and_logical_different_operator(): + comparison_filter = {"field": "meta.type", "operator": "==", "value": "pdf"} + logical_filter = { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "John"}, + {"field": "meta.year", "operator": "==", "value": "2022"}, + ], + } + expected = {"operator": "AND", "conditions": [comparison_filter, logical_filter]} + assert apply_filter_policy(FilterPolicy.MERGE, comparison_filter, logical_filter) == expected + + +def test_merge_logical_filters_same_operator(): + logical_filter1 = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "pdf"}, + {"field": "meta.month", "operator": "==", "value": "April"}, + ], + } + logical_filter2 = { + "operator": "AND", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "John"}, + {"field": "meta.year", "operator": "==", "value": "2022"}, + ], + } + expected = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "pdf"}, + {"field": "meta.month", "operator": "==", "value": "April"}, + {"field": "meta.name", "operator": "==", "value": "John"}, + {"field": "meta.year", "operator": "==", "value": "2022"}, + ], + } + assert apply_filter_policy(FilterPolicy.MERGE, logical_filter1, logical_filter2) == expected + + +def test_merge_logical_filters_different_operator(): + logical_filter1 = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "pdf"}, + {"field": "meta.month", "operator": "==", "value": "April"}, + ], + } + logical_filter2 = { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "John"}, + {"field": "meta.year", "operator": "==", "value": "2022"}, + ], + } + expected = {"operator": "AND", "conditions": [logical_filter1, logical_filter2]} + assert apply_filter_policy(FilterPolicy.MERGE, logical_filter1, logical_filter2) == expected + + +def test_user_provided_logical_operator(): + init_filter = {"field": "meta.name", "operator": "==", "value": "John"} + runtime_filter = {"field": "meta.year", "operator": "==", "value": "2022"} + expected = {"operator": "OR", "conditions": [init_filter, runtime_filter]} + assert apply_filter_policy(FilterPolicy.MERGE, init_filter, runtime_filter, "OR") == expected + + +def test_default_logical_operator(): + init_filter = {"field": "meta.name", "operator": "==", "value": "John"} + runtime_filter = {"field": "meta.year", "operator": "==", "value": "2022"} + expected = {"operator": "AND", "conditions": [init_filter, runtime_filter]} + assert apply_filter_policy(FilterPolicy.MERGE, init_filter, runtime_filter) == expected From e82f9ac43465816fafffcee9d43f64e3eb281414 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 18 Jul 2024 11:13:15 +0200 Subject: [PATCH 02/17] Add reno note --- .../notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 releasenotes/notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml diff --git a/releasenotes/notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml b/releasenotes/notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml new file mode 100644 index 0000000000..32cbbca67e --- /dev/null +++ b/releasenotes/notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Enhanced filter application logic to support merging of filters. It facilitates more precise retrieval filtering, allowing for both init and runtime complex filter combinations with logical operators. From f04c80504b32933a95f9228cf22315067898e610 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Fri, 26 Jul 2024 09:58:44 +0100 Subject: [PATCH 03/17] Rewrite filters using exhisting internal version --- .../document_stores/types/filter_policy.py | 265 +++++++++++++++--- 1 file changed, 233 insertions(+), 32 deletions(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 66f849adaa..7e8a60e0be 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -3,7 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 from enum import Enum -from typing import Any, Dict, Optional +from typing import Any, Dict, Literal, Optional + +from haystack import logging + +logger = logging.getLogger(__name__) class FilterPolicy(Enum): @@ -66,11 +70,229 @@ def is_logical_filter(filter_item: Dict[str, Any]) -> bool: return "operator" in filter_item and "conditions" in filter_item +def combine_two_logical_filters( + init_logical_filter: Dict[str, Any], runtime_logical_filter: Dict[str, Any] +) -> Dict[str, Any]: + """ + Combine two logical filters, they must have the same operator. + + If `init_logical_filter["operator"]` and `runtime_logical_filter["operator"]` are the same, the conditions + of both filters are combined. Otherwise, the `init_logical_filter` is ignored and ` + runtime_logical_filter` is returned. + + __Example__: + + ```python + init_logical_filter = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ] + } + runtime_logical_filter = { + "operator": "AND", + "conditions": [ + {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ] + } + new_filters = combine_two_logical_filters( + init_logical_filter, runtime_logical_filter, "AND" + ) + # Output: + { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ] + } + ``` + """ + if init_logical_filter["operator"] == runtime_logical_filter["operator"]: + return { + "operator": init_logical_filter["operator"], + "conditions": init_logical_filter["conditions"] + runtime_logical_filter["conditions"], + } + + logger.warning( + "The provided logical operators, {parsed_operator} and {operator}, do not match so the parsed logical " + "filter, {init_logical_filter}, will be ignored and only the provided logical filter,{runtime_logical_filter}, " + "will be used. Update the logical operators to match to include the parsed filter.", + parsed_operator=init_logical_filter["operator"], + operator=runtime_logical_filter["operator"], + init_logical_filter=init_logical_filter, + runtime_logical_filter=runtime_logical_filter, + ) + return runtime_logical_filter + + +def combine_init_comparison_and_runtime_logical_filters( + init_comparison_filter: Dict[str, Any], + runtime_logical_filter: Dict[str, Any], + logical_operator: Literal["AND", "OR", "NOT"], +) -> Dict[str, Any]: + """ + Combine a runtime logical filter with the init comparison filter using the provided logical_operator. + + We only add the init_comparison_filter if logical_operator matches the existing + runtime_logical_filter["operator"]. Otherwise, we return the runtime_logical_filter unchanged. + + __Example__: + + ```python + runtime_logical_filter = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ] + } + init_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} + new_filters = combine_init_comparison_and_runtime_logical_filters( + init_comparison_filter, runtime_logical_filter, "AND" + ) + # Output: + { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + ] + } + ``` + """ + if runtime_logical_filter["operator"] == logical_operator: + conditions = runtime_logical_filter["conditions"] + fields = {c.get("field") for c in conditions} + if init_comparison_filter["field"] not in fields: + conditions.append(init_comparison_filter) + else: + logger.warning( + "The init filter, {init_filter}, is ignored as the field is already present in the existing " + "filters, {filters}.", + init_filter=init_comparison_filter, + filters=runtime_logical_filter, + ) + return {"operator": runtime_logical_filter["operator"], "conditions": conditions} + + logger.warning( + "The provided logical_operator, {logical_operator}, does not match the logical operator found in " + "the runtime filters, {filters_logical_operator}, so the init filter will be ignored.", + logical_operator=logical_operator, + filters_logical_operator=runtime_logical_filter["operator"], + ) + return runtime_logical_filter + + +def combine_runtime_comparison_and_init_logical_filters( + runtime_comparison_filter: Dict[str, Any], + init_logical_filter: Dict[str, Any], + logical_operator: Literal["AND", "OR", "NOT"], +) -> Dict[str, Any]: + """ + Combine an init logical filter with the runtime comparison filter using the provided logical_operator. + + We only add the runtime_comparison_filter if logical_operator matches the existing + init_logical_filter["operator"]. Otherwise, we return the runtime_comparison_filter unchanged. + + __Example__: + + ```python + init_logical_filter = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ] + } + runtime_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} + new_filters = combine_runtime_comparison_and_init_logical_filters( + runtime_comparison_filter, init_logical_filter, "AND" + ) + # Output: + { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + ] + } + ``` + """ + if init_logical_filter["operator"] == logical_operator: + conditions = init_logical_filter["conditions"] + fields = {c.get("field") for c in conditions} + if runtime_comparison_filter["field"] in fields: + logger.warning( + "The runtime filter, {runtime_filter}, will overwrite the existing filter with the same " + "field in the init logical filter.", + runtime_filter=runtime_comparison_filter, + ) + conditions = [c for c in conditions if c.get("field") != runtime_comparison_filter["field"]] + conditions.append(runtime_comparison_filter) + return {"operator": init_logical_filter["operator"], "conditions": conditions} + + logger.warning( + "The provided logical_operator, {logical_operator}, does not match the logical operator found in " + "the init logical filter, {filters_logical_operator}, so the init logical filter will be ignored.", + logical_operator=logical_operator, + filters_logical_operator=init_logical_filter["operator"], + ) + return runtime_comparison_filter + + +def combine_two_comparison_filters( + init_comparison_filter: Dict[str, Any], + runtime_comparison_filter: Dict[str, Any], + logical_operator: Literal["AND", "OR", "NOT"], +) -> Dict[str, Any]: + """ + Combine a comparison filter with the `init_comparison_filter` using the provided `logical_operator`. + + If `runtime_comparison_filter` and `init_comparison_filter` target the same field, `init_comparison_filter` + is ignored and `runtime_comparison_filter` is returned unchanged. + + __Example__: + + ```python + runtime_comparison_filter = {"field": "meta.type", "operator": "==", "value": "article"}, + init_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + new_filters = combine_two_comparison_filters( + init_comparison_filter, runtime_comparison_filter, "AND" + ) + # Output: + { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + ] + } + ``` + """ + if runtime_comparison_filter["field"] == init_comparison_filter["field"]: + logger.warning( + "The parsed filter, {parsed_filter}, is ignored as the field is already present in the existing " + "filters, {filters}.", + parsed_filter=init_comparison_filter, + filters=runtime_comparison_filter, + ) + return runtime_comparison_filter + + return {"operator": logical_operator, "conditions": [init_comparison_filter, runtime_comparison_filter]} + + def apply_filter_policy( filter_policy: FilterPolicy, init_filters: Optional[Dict[str, Any]] = None, runtime_filters: Optional[Dict[str, Any]] = None, - default_logical_operator: str = "AND", + logical_operator: Literal["AND", "OR", "NOT"] = "AND", ) -> Optional[Dict[str, Any]]: """ Apply the filter policy to the given initial and runtime filters to determine the final set of filters used. @@ -84,7 +306,7 @@ def apply_filter_policy( :param init_filters: The initial filters set during the initialization of the relevant retriever. :param runtime_filters: The filters provided at runtime, usually during a query operation execution. These filters can change for each query/retriever run invocation. - :param default_logical_operator: The default logical operator to use when merging filters (non-legacy filters only). + :param logical_operator: The default logical operator to use when merging filters (non-legacy filters only). :returns: A dictionary containing the resulting filters based on the provided policy. """ if filter_policy == FilterPolicy.MERGE and runtime_filters: @@ -93,38 +315,17 @@ def apply_filter_policy( return {**(init_filters or {}), **runtime_filters} elif init_filters is not None: # here we merge new filters - def merge_comparison_filters( - filter1: Dict[str, Any], filter2: Dict[str, Any], logical_op: str - ) -> Dict[str, Any]: - if filter1["field"] == filter2["field"]: - # When fields are the same, use the runtime filter (filter2) - return filter2 - return {"operator": logical_op, "conditions": [filter1, filter2]} - - def merge_comparison_and_logical( - comparison: Dict[str, Any], logical: Dict[str, Any], logical_op: str - ) -> Dict[str, Any]: - if logical["operator"] == logical_op: - logical["conditions"].append(comparison) - return logical - else: - return {"operator": logical_op, "conditions": [comparison, logical]} - - def merge_logical_filters( - filter1: Dict[str, Any], filter2: Dict[str, Any], logical_op: str - ) -> Dict[str, Any]: - if filter1["operator"] == filter2["operator"] == logical_op: - return {"operator": logical_op, "conditions": filter1["conditions"] + filter2["conditions"]} - else: - return {"operator": logical_op, "conditions": [filter1, filter2]} - if is_comparison_filter(init_filters) and is_comparison_filter(runtime_filters): - return merge_comparison_filters(init_filters, runtime_filters, default_logical_operator) + return combine_two_comparison_filters(init_filters, runtime_filters, logical_operator) elif is_comparison_filter(init_filters) and is_logical_filter(runtime_filters): - return merge_comparison_and_logical(init_filters, runtime_filters, default_logical_operator) + return combine_init_comparison_and_runtime_logical_filters( + init_filters, runtime_filters, logical_operator + ) elif is_logical_filter(init_filters) and is_comparison_filter(runtime_filters): - return merge_comparison_and_logical(runtime_filters, init_filters, default_logical_operator) + return combine_runtime_comparison_and_init_logical_filters( + runtime_filters, init_filters, logical_operator + ) elif is_logical_filter(init_filters) and is_logical_filter(runtime_filters): - return merge_logical_filters(init_filters, runtime_filters, default_logical_operator) + return combine_two_comparison_filters(init_filters, runtime_filters, logical_operator) return runtime_filters or init_filters From 57a1a1a48341dc2e82ac2472bbf88a2a64ecec9e Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Fri, 26 Jul 2024 11:40:41 +0100 Subject: [PATCH 04/17] Fixes --- .../document_stores/types/filter_policy.py | 2 +- ...cept-split-threshold-467abb9fcd1c316b.yaml | 4 ++-- test/utils/test_filters.py | 22 +++++++++++++++---- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 7e8a60e0be..b6c7103e86 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -326,6 +326,6 @@ def apply_filter_policy( runtime_filters, init_filters, logical_operator ) elif is_logical_filter(init_filters) and is_logical_filter(runtime_filters): - return combine_two_comparison_filters(init_filters, runtime_filters, logical_operator) + return combine_two_logical_filters(init_filters, runtime_filters) return runtime_filters or init_filters diff --git a/releasenotes/notes/document-splitter-accept-split-threshold-467abb9fcd1c316b.yaml b/releasenotes/notes/document-splitter-accept-split-threshold-467abb9fcd1c316b.yaml index 37f6bda3e7..a4488576e6 100644 --- a/releasenotes/notes/document-splitter-accept-split-threshold-467abb9fcd1c316b.yaml +++ b/releasenotes/notes/document-splitter-accept-split-threshold-467abb9fcd1c316b.yaml @@ -1,5 +1,5 @@ --- enhancements: - | - DocumentSplitter now has an optional split_threshold parameter. Use this parameter if you want to rather not split inputs that are only slightly longer than the allowed split_length. - If when chunking one of the chunks is smaller than the split_threshold, the chunk will be concatenated with the previous one. This avoids having too small chunks that are not meaningful. + DocumentSplitter now has an optional split_threshold parameter. Use this parameter if you want to rather not split inputs that are only slightly longer than the allowed split_length. + If when chunking one of the chunks is smaller than the split_threshold, the chunk will be concatenated with the previous one. This avoids having too small chunks that are not meaningful. diff --git a/test/utils/test_filters.py b/test/utils/test_filters.py index edb8aacbd2..223414dd02 100644 --- a/test/utils/test_filters.py +++ b/test/utils/test_filters.py @@ -780,13 +780,20 @@ def test_merge_comparison_and_logical(): def test_merge_comparison_and_logical_different_operator(): comparison_filter = {"field": "meta.type", "operator": "==", "value": "pdf"} logical_filter = { - "operator": "OR", + "operator": "AND", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "John"}, + {"field": "meta.year", "operator": "==", "value": "2022"}, + ], + } + expected = { + "operator": "AND", "conditions": [ {"field": "meta.name", "operator": "==", "value": "John"}, {"field": "meta.year", "operator": "==", "value": "2022"}, + {"field": "meta.type", "operator": "==", "value": "pdf"}, ], } - expected = {"operator": "AND", "conditions": [comparison_filter, logical_filter]} assert apply_filter_policy(FilterPolicy.MERGE, comparison_filter, logical_filter) == expected @@ -814,7 +821,8 @@ def test_merge_logical_filters_same_operator(): {"field": "meta.year", "operator": "==", "value": "2022"}, ], } - assert apply_filter_policy(FilterPolicy.MERGE, logical_filter1, logical_filter2) == expected + result = apply_filter_policy(FilterPolicy.MERGE, logical_filter1, logical_filter2) + assert result == expected def test_merge_logical_filters_different_operator(): @@ -832,7 +840,13 @@ def test_merge_logical_filters_different_operator(): {"field": "meta.year", "operator": "==", "value": "2022"}, ], } - expected = {"operator": "AND", "conditions": [logical_filter1, logical_filter2]} + expected = { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "John"}, + {"field": "meta.year", "operator": "==", "value": "2022"}, + ], + } assert apply_filter_policy(FilterPolicy.MERGE, logical_filter1, logical_filter2) == expected From 2b7bb678ee19f9dbd114f89ece2032f213b5adae Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Tue, 6 Aug 2024 15:36:27 +0200 Subject: [PATCH 05/17] More unit tests (new filters) --- test/document_stores/test_filter_policy.py | 159 +++++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/test/document_stores/test_filter_policy.py b/test/document_stores/test_filter_policy.py index b7efcd0672..d6f592e1a6 100644 --- a/test/document_stores/test_filter_policy.py +++ b/test/document_stores/test_filter_policy.py @@ -11,6 +11,10 @@ def test_replace_policy_with_both_filters(): + """ + Replacing legacy filters + Result: The runtime filters + """ init_filters = {"status": "active", "category": "news"} runtime_filters = {"author": "John Doe"} result = apply_filter_policy(FilterPolicy.REPLACE, init_filters, runtime_filters) @@ -18,6 +22,10 @@ def test_replace_policy_with_both_filters(): def test_merge_policy_with_both_filters(): + """ + Merging legacy filters + Result: The runtime filters + """ init_filters = {"status": "active", "category": "news"} runtime_filters = {"author": "John Doe"} result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) @@ -25,6 +33,10 @@ def test_merge_policy_with_both_filters(): def test_replace_policy_with_only_init_filters(): + """ + Replacing legacy filters, None runtime filters + Result: The init filters + """ init_filters = {"status": "active", "category": "news"} runtime_filters = None result = apply_filter_policy(FilterPolicy.REPLACE, init_filters, runtime_filters) @@ -32,6 +44,10 @@ def test_replace_policy_with_only_init_filters(): def test_merge_policy_with_only_init_filters(): + """ + Merging of legacy filters, None runtime filters + Result: The init filters + """ init_filters = {"status": "active", "category": "news"} runtime_filters = None result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) @@ -39,7 +55,150 @@ def test_merge_policy_with_only_init_filters(): def test_merge_policy_with_overlapping_keys(): + """ + Merging of legacy filters + Result: The runtime filters overwrite the init filters + """ init_filters = {"status": "active", "category": "news"} runtime_filters = {"category": "science", "author": "John Doe"} result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == {"status": "active", "category": "science", "author": "John Doe"} + + +def test_merge_two_comparison_filters(): + """ + Merging two comparison filters + Result: AND operator with both filters + """ + init_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} + runtime_filters = {"field": "meta.type", "operator": "==", "value": "article"} + result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) + assert result == { + "operator": "AND", + "conditions": [ + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + } + + +def test_merge_init_comparison_and_runtime_logical_filters(): + """ + Merging init comparison and runtime logical filters + Result: AND operator with both filters + """ + init_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} + runtime_filters = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ], + } + result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) + assert result == { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + ], + } + + +def test_merge_runtime_comparison_and_init_logical_filters(): + """ + Merging a runtime comparison filter with an init logical filter + Result: AND operator with both filters + """ + init_filters = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ], + } + runtime_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} + result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) + assert result == { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + ], + } + + +def test_merge_two_logical_filters(): + """ + Merging two logical filters + Result: AND operator with both filters + """ + init_filters = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ], + } + runtime_filters = { + "operator": "AND", + "conditions": [ + {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + } + result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) + assert result == { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + } + + +def test_merge_with_different_logical_operators(): + """ + Merging with a different logical operator + Result: warnings and runtime filters + """ + init_filters = {"operator": "AND", "conditions": [{"field": "meta.type", "operator": "==", "value": "article"}]} + runtime_filters = { + "operator": "OR", + "conditions": [{"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}], + } + result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) + assert result == runtime_filters + + +def test_merge_comparison_filters_with_same_field(): + """ + Merging comparison filters with the same field + Result: warnings and runtime filters + """ + init_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} + runtime_filters = {"field": "meta.date", "operator": "<=", "value": "2020-12-31"} + result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) + assert result == runtime_filters + + +@pytest.mark.parametrize("logical_operator", ["AND", "OR", "NOT"]) +def test_merge_with_custom_logical_operator(logical_operator): + """ + Merging with a custom logical operator + Result: The given logical operator with both filters + """ + init_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} + runtime_filters = {"field": "meta.type", "operator": "==", "value": "article"} + result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters, logical_operator=logical_operator) + assert result == { + "operator": logical_operator, + "conditions": [ + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + } From f03be57ac422b16d14d6a3f857e7f8929d281a43 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Tue, 6 Aug 2024 15:55:33 +0200 Subject: [PATCH 06/17] Revert test_filter tests --- test/utils/test_filters.py | 136 ------------------------------------- 1 file changed, 136 deletions(-) diff --git a/test/utils/test_filters.py b/test/utils/test_filters.py index 223414dd02..eccf1ca1ec 100644 --- a/test/utils/test_filters.py +++ b/test/utils/test_filters.py @@ -3,9 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import pytest import pandas as pd -from haystack.document_stores.types import FilterPolicy - -from haystack.document_stores.types.filter_policy import apply_filter_policy from haystack import Document from haystack.errors import FilterError @@ -729,136 +726,3 @@ def test_convert_with_incorrect_filter_nesting(): with pytest.raises(FilterError): convert({"number": {"page": {"chapter": "intro"}}}) - - -################################################################################ -# Tests for the apply_filter_policy function -################################################################################ -def test_no_filters(): - assert apply_filter_policy(FilterPolicy.MERGE, None, None) is None - - -def test_replace_policy(): - init_filter = {"field": "meta.name", "operator": "==", "value": "John"} - runtime_filter = {"field": "meta.year", "operator": "==", "value": "2022"} - assert apply_filter_policy(FilterPolicy.REPLACE, init_filter, runtime_filter) == runtime_filter - - -def test_merge_comparison_filters(): - init_filter = {"field": "meta.name", "operator": "==", "value": "John"} - runtime_filter = {"field": "meta.year", "operator": "==", "value": "2022"} - expected = {"operator": "AND", "conditions": [init_filter, runtime_filter]} - assert apply_filter_policy(FilterPolicy.MERGE, init_filter, runtime_filter) == expected - - -def test_merge_comparison_filters_same_field(): - init_filter = {"field": "meta.name", "operator": "==", "value": "John"} - runtime_filter = {"field": "meta.name", "operator": "==", "value": "Jane"} - assert apply_filter_policy(FilterPolicy.MERGE, init_filter, runtime_filter) == runtime_filter - - -def test_merge_comparison_and_logical(): - comparison_filter = {"field": "meta.type", "operator": "==", "value": "pdf"} - logical_filter = { - "operator": "AND", - "conditions": [ - {"field": "meta.name", "operator": "==", "value": "John"}, - {"field": "meta.year", "operator": "==", "value": "2022"}, - ], - } - expected = { - "operator": "AND", - "conditions": [ - {"field": "meta.name", "operator": "==", "value": "John"}, - {"field": "meta.year", "operator": "==", "value": "2022"}, - {"field": "meta.type", "operator": "==", "value": "pdf"}, - ], - } - assert apply_filter_policy(FilterPolicy.MERGE, comparison_filter, logical_filter) == expected - - -def test_merge_comparison_and_logical_different_operator(): - comparison_filter = {"field": "meta.type", "operator": "==", "value": "pdf"} - logical_filter = { - "operator": "AND", - "conditions": [ - {"field": "meta.name", "operator": "==", "value": "John"}, - {"field": "meta.year", "operator": "==", "value": "2022"}, - ], - } - expected = { - "operator": "AND", - "conditions": [ - {"field": "meta.name", "operator": "==", "value": "John"}, - {"field": "meta.year", "operator": "==", "value": "2022"}, - {"field": "meta.type", "operator": "==", "value": "pdf"}, - ], - } - assert apply_filter_policy(FilterPolicy.MERGE, comparison_filter, logical_filter) == expected - - -def test_merge_logical_filters_same_operator(): - logical_filter1 = { - "operator": "AND", - "conditions": [ - {"field": "meta.type", "operator": "==", "value": "pdf"}, - {"field": "meta.month", "operator": "==", "value": "April"}, - ], - } - logical_filter2 = { - "operator": "AND", - "conditions": [ - {"field": "meta.name", "operator": "==", "value": "John"}, - {"field": "meta.year", "operator": "==", "value": "2022"}, - ], - } - expected = { - "operator": "AND", - "conditions": [ - {"field": "meta.type", "operator": "==", "value": "pdf"}, - {"field": "meta.month", "operator": "==", "value": "April"}, - {"field": "meta.name", "operator": "==", "value": "John"}, - {"field": "meta.year", "operator": "==", "value": "2022"}, - ], - } - result = apply_filter_policy(FilterPolicy.MERGE, logical_filter1, logical_filter2) - assert result == expected - - -def test_merge_logical_filters_different_operator(): - logical_filter1 = { - "operator": "AND", - "conditions": [ - {"field": "meta.type", "operator": "==", "value": "pdf"}, - {"field": "meta.month", "operator": "==", "value": "April"}, - ], - } - logical_filter2 = { - "operator": "OR", - "conditions": [ - {"field": "meta.name", "operator": "==", "value": "John"}, - {"field": "meta.year", "operator": "==", "value": "2022"}, - ], - } - expected = { - "operator": "OR", - "conditions": [ - {"field": "meta.name", "operator": "==", "value": "John"}, - {"field": "meta.year", "operator": "==", "value": "2022"}, - ], - } - assert apply_filter_policy(FilterPolicy.MERGE, logical_filter1, logical_filter2) == expected - - -def test_user_provided_logical_operator(): - init_filter = {"field": "meta.name", "operator": "==", "value": "John"} - runtime_filter = {"field": "meta.year", "operator": "==", "value": "2022"} - expected = {"operator": "OR", "conditions": [init_filter, runtime_filter]} - assert apply_filter_policy(FilterPolicy.MERGE, init_filter, runtime_filter, "OR") == expected - - -def test_default_logical_operator(): - init_filter = {"field": "meta.name", "operator": "==", "value": "John"} - runtime_filter = {"field": "meta.year", "operator": "==", "value": "2022"} - expected = {"operator": "AND", "conditions": [init_filter, runtime_filter]} - assert apply_filter_policy(FilterPolicy.MERGE, init_filter, runtime_filter) == expected From c8a53f2884635cac70e7820da8eb34e1a6326aeb Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Aug 2024 11:47:35 +0200 Subject: [PATCH 07/17] PR feedaback --- haystack/document_stores/types/__init__.py | 4 +- .../document_stores/types/filter_policy.py | 108 +++++++++--------- test/document_stores/test_filter_policy.py | 87 +++----------- 3 files changed, 75 insertions(+), 124 deletions(-) diff --git a/haystack/document_stores/types/__init__.py b/haystack/document_stores/types/__init__.py index df2032f79c..1924b78a00 100644 --- a/haystack/document_stores/types/__init__.py +++ b/haystack/document_stores/types/__init__.py @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from .filter_policy import FilterPolicy +from .filter_policy import FilterPolicy, LogicalOperator, apply_filter_policy from .policy import DuplicatePolicy from .protocol import DocumentStore -__all__ = ["DocumentStore", "DuplicatePolicy", "FilterPolicy"] +__all__ = ["apply_filter_policy", "DocumentStore", "DuplicatePolicy", "FilterPolicy", "LogicalOperator"] diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index b6c7103e86..26910b2516 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -3,13 +3,37 @@ # SPDX-License-Identifier: Apache-2.0 from enum import Enum -from typing import Any, Dict, Literal, Optional +from typing import Any, Dict, Optional from haystack import logging logger = logging.getLogger(__name__) +class LogicalOperator(Enum): + AND = "AND" + OR = "OR" + NOT = "NOT" + + def __str__(self): + return self.value + + @staticmethod + def from_str(operator_label: str) -> "LogicalOperator": + """ + Convert a string to a LogicalOperator enum. + + :param operator_label: The string to convert. + :return: The corresponding FilterPolicy enum. + """ + enum_map = {e.value: e for e in LogicalOperator} + operator = enum_map.get(operator_label) + if operator is None: + msg = f"Unknown LogicalOperator type '{operator}'. Supported types are: {list(enum_map.keys())}" + raise ValueError(msg) + return operator + + class FilterPolicy(Enum): """ Policy to determine how filters are applied in retrievers interacting with document stores. @@ -40,16 +64,6 @@ def from_str(filter_policy: str) -> "FilterPolicy": return policy -def is_legacy(filter_item: Dict[str, Any]) -> bool: - """ - Check if the given filter is a legacy filter. - - :param filter_item: The filter to check. - :returns: True if the filter is a legacy filter, False otherwise. - """ - return "operator" not in filter_item - - def is_comparison_filter(filter_item: Dict[str, Any]) -> bool: """ Check if the given filter is a comparison filter. @@ -84,25 +98,25 @@ def combine_two_logical_filters( ```python init_logical_filter = { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, ] } runtime_logical_filter = { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, ] } new_filters = combine_two_logical_filters( - init_logical_filter, runtime_logical_filter, "AND" + init_logical_filter, runtime_logical_filter, LogicalOperator.AND ) # Output: { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -131,9 +145,7 @@ def combine_two_logical_filters( def combine_init_comparison_and_runtime_logical_filters( - init_comparison_filter: Dict[str, Any], - runtime_logical_filter: Dict[str, Any], - logical_operator: Literal["AND", "OR", "NOT"], + init_comparison_filter: Dict[str, Any], runtime_logical_filter: Dict[str, Any], logical_operator: LogicalOperator ) -> Dict[str, Any]: """ Combine a runtime logical filter with the init comparison filter using the provided logical_operator. @@ -145,7 +157,7 @@ def combine_init_comparison_and_runtime_logical_filters( ```python runtime_logical_filter = { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -153,11 +165,11 @@ def combine_init_comparison_and_runtime_logical_filters( } init_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} new_filters = combine_init_comparison_and_runtime_logical_filters( - init_comparison_filter, runtime_logical_filter, "AND" + init_comparison_filter, runtime_logical_filter, LogicalOperator.AND ) # Output: { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -190,9 +202,7 @@ def combine_init_comparison_and_runtime_logical_filters( def combine_runtime_comparison_and_init_logical_filters( - runtime_comparison_filter: Dict[str, Any], - init_logical_filter: Dict[str, Any], - logical_operator: Literal["AND", "OR", "NOT"], + runtime_comparison_filter: Dict[str, Any], init_logical_filter: Dict[str, Any], logical_operator: LogicalOperator ) -> Dict[str, Any]: """ Combine an init logical filter with the runtime comparison filter using the provided logical_operator. @@ -204,7 +214,7 @@ def combine_runtime_comparison_and_init_logical_filters( ```python init_logical_filter = { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -212,11 +222,11 @@ def combine_runtime_comparison_and_init_logical_filters( } runtime_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} new_filters = combine_runtime_comparison_and_init_logical_filters( - runtime_comparison_filter, init_logical_filter, "AND" + runtime_comparison_filter, init_logical_filter, LogicalOperator.AND ) # Output: { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -248,9 +258,7 @@ def combine_runtime_comparison_and_init_logical_filters( def combine_two_comparison_filters( - init_comparison_filter: Dict[str, Any], - runtime_comparison_filter: Dict[str, Any], - logical_operator: Literal["AND", "OR", "NOT"], + init_comparison_filter: Dict[str, Any], runtime_comparison_filter: Dict[str, Any], logical_operator: LogicalOperator ) -> Dict[str, Any]: """ Combine a comparison filter with the `init_comparison_filter` using the provided `logical_operator`. @@ -264,11 +272,11 @@ def combine_two_comparison_filters( runtime_comparison_filter = {"field": "meta.type", "operator": "==", "value": "article"}, init_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, new_filters = combine_two_comparison_filters( - init_comparison_filter, runtime_comparison_filter, "AND" + init_comparison_filter, runtime_comparison_filter, LogicalOperator.AND ) # Output: { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, @@ -292,7 +300,7 @@ def apply_filter_policy( filter_policy: FilterPolicy, init_filters: Optional[Dict[str, Any]] = None, runtime_filters: Optional[Dict[str, Any]] = None, - logical_operator: Literal["AND", "OR", "NOT"] = "AND", + default_logical_operator: LogicalOperator = LogicalOperator.AND, ) -> Optional[Dict[str, Any]]: """ Apply the filter policy to the given initial and runtime filters to determine the final set of filters used. @@ -306,26 +314,22 @@ def apply_filter_policy( :param init_filters: The initial filters set during the initialization of the relevant retriever. :param runtime_filters: The filters provided at runtime, usually during a query operation execution. These filters can change for each query/retriever run invocation. - :param logical_operator: The default logical operator to use when merging filters (non-legacy filters only). + :param default_logical_operator: The default logical operator to use when merging filters (non-legacy filters only). :returns: A dictionary containing the resulting filters based on the provided policy. """ - if filter_policy == FilterPolicy.MERGE and runtime_filters: - # legacy filters merge handling - if is_legacy(runtime_filters): - return {**(init_filters or {}), **runtime_filters} - elif init_filters is not None: - # here we merge new filters - if is_comparison_filter(init_filters) and is_comparison_filter(runtime_filters): - return combine_two_comparison_filters(init_filters, runtime_filters, logical_operator) - elif is_comparison_filter(init_filters) and is_logical_filter(runtime_filters): - return combine_init_comparison_and_runtime_logical_filters( - init_filters, runtime_filters, logical_operator - ) - elif is_logical_filter(init_filters) and is_comparison_filter(runtime_filters): - return combine_runtime_comparison_and_init_logical_filters( - runtime_filters, init_filters, logical_operator - ) - elif is_logical_filter(init_filters) and is_logical_filter(runtime_filters): - return combine_two_logical_filters(init_filters, runtime_filters) + if filter_policy == FilterPolicy.MERGE and runtime_filters and init_filters: + # here we merge new filters + if is_comparison_filter(init_filters) and is_comparison_filter(runtime_filters): + return combine_two_comparison_filters(init_filters, runtime_filters, default_logical_operator) + elif is_comparison_filter(init_filters) and is_logical_filter(runtime_filters): + return combine_init_comparison_and_runtime_logical_filters( + init_filters, runtime_filters, default_logical_operator + ) + elif is_logical_filter(init_filters) and is_comparison_filter(runtime_filters): + return combine_runtime_comparison_and_init_logical_filters( + runtime_filters, init_filters, default_logical_operator + ) + elif is_logical_filter(init_filters) and is_logical_filter(runtime_filters): + return combine_two_logical_filters(init_filters, runtime_filters) return runtime_filters or init_filters diff --git a/test/document_stores/test_filter_policy.py b/test/document_stores/test_filter_policy.py index d6f592e1a6..eb1155ae6e 100644 --- a/test/document_stores/test_filter_policy.py +++ b/test/document_stores/test_filter_policy.py @@ -3,66 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from typing import Any, Dict, Optional -from enum import Enum -from haystack.document_stores.types import FilterPolicy -from haystack.document_stores.types.filter_policy import apply_filter_policy - - -def test_replace_policy_with_both_filters(): - """ - Replacing legacy filters - Result: The runtime filters - """ - init_filters = {"status": "active", "category": "news"} - runtime_filters = {"author": "John Doe"} - result = apply_filter_policy(FilterPolicy.REPLACE, init_filters, runtime_filters) - assert result == runtime_filters - - -def test_merge_policy_with_both_filters(): - """ - Merging legacy filters - Result: The runtime filters - """ - init_filters = {"status": "active", "category": "news"} - runtime_filters = {"author": "John Doe"} - result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) - assert result == {"status": "active", "category": "news", "author": "John Doe"} - - -def test_replace_policy_with_only_init_filters(): - """ - Replacing legacy filters, None runtime filters - Result: The init filters - """ - init_filters = {"status": "active", "category": "news"} - runtime_filters = None - result = apply_filter_policy(FilterPolicy.REPLACE, init_filters, runtime_filters) - assert result == init_filters - - -def test_merge_policy_with_only_init_filters(): - """ - Merging of legacy filters, None runtime filters - Result: The init filters - """ - init_filters = {"status": "active", "category": "news"} - runtime_filters = None - result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) - assert result == init_filters - - -def test_merge_policy_with_overlapping_keys(): - """ - Merging of legacy filters - Result: The runtime filters overwrite the init filters - """ - init_filters = {"status": "active", "category": "news"} - runtime_filters = {"category": "science", "author": "John Doe"} - result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) - assert result == {"status": "active", "category": "science", "author": "John Doe"} +from haystack.document_stores.types import LogicalOperator, apply_filter_policy, FilterPolicy def test_merge_two_comparison_filters(): @@ -74,7 +16,7 @@ def test_merge_two_comparison_filters(): runtime_filters = {"field": "meta.type", "operator": "==", "value": "article"} result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, {"field": "meta.type", "operator": "==", "value": "article"}, @@ -89,7 +31,7 @@ def test_merge_init_comparison_and_runtime_logical_filters(): """ init_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} runtime_filters = { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -97,7 +39,7 @@ def test_merge_init_comparison_and_runtime_logical_filters(): } result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -112,7 +54,7 @@ def test_merge_runtime_comparison_and_init_logical_filters(): Result: AND operator with both filters """ init_filters = { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -121,7 +63,7 @@ def test_merge_runtime_comparison_and_init_logical_filters(): runtime_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -136,14 +78,14 @@ def test_merge_two_logical_filters(): Result: AND operator with both filters """ init_filters = { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, ], } runtime_filters = { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, @@ -151,7 +93,7 @@ def test_merge_two_logical_filters(): } result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": "AND", + "operator": LogicalOperator.AND, "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -168,7 +110,7 @@ def test_merge_with_different_logical_operators(): """ init_filters = {"operator": "AND", "conditions": [{"field": "meta.type", "operator": "==", "value": "article"}]} runtime_filters = { - "operator": "OR", + "operator": LogicalOperator.OR, "conditions": [{"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}], } result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) @@ -194,9 +136,14 @@ def test_merge_with_custom_logical_operator(logical_operator): """ init_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} runtime_filters = {"field": "meta.type", "operator": "==", "value": "article"} - result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters, logical_operator=logical_operator) + result = apply_filter_policy( + FilterPolicy.MERGE, + init_filters, + runtime_filters, + default_logical_operator=LogicalOperator.from_str(logical_operator), + ) assert result == { - "operator": logical_operator, + "operator": LogicalOperator.from_str(logical_operator), "conditions": [ {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, {"field": "meta.type", "operator": "==", "value": "article"}, From ce6002295082b261662888ef7f0ae8f24f9a57d4 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Aug 2024 12:12:20 +0200 Subject: [PATCH 08/17] Revert change in unrelated release note --- ...ment-splitter-accept-split-threshold-467abb9fcd1c316b.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/releasenotes/notes/document-splitter-accept-split-threshold-467abb9fcd1c316b.yaml b/releasenotes/notes/document-splitter-accept-split-threshold-467abb9fcd1c316b.yaml index a4488576e6..37f6bda3e7 100644 --- a/releasenotes/notes/document-splitter-accept-split-threshold-467abb9fcd1c316b.yaml +++ b/releasenotes/notes/document-splitter-accept-split-threshold-467abb9fcd1c316b.yaml @@ -1,5 +1,5 @@ --- enhancements: - | - DocumentSplitter now has an optional split_threshold parameter. Use this parameter if you want to rather not split inputs that are only slightly longer than the allowed split_length. - If when chunking one of the chunks is smaller than the split_threshold, the chunk will be concatenated with the previous one. This avoids having too small chunks that are not meaningful. + DocumentSplitter now has an optional split_threshold parameter. Use this parameter if you want to rather not split inputs that are only slightly longer than the allowed split_length. + If when chunking one of the chunks is smaller than the split_threshold, the chunk will be concatenated with the previous one. This avoids having too small chunks that are not meaningful. From ad47b0ac214edb16eed935ff725d8632354b43b5 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Aug 2024 12:18:39 +0200 Subject: [PATCH 09/17] Increase test coverage, add corner cases tests --- .../document_stores/types/filter_policy.py | 2 +- test/document_stores/test_filter_policy.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 26910b2516..9babbfaeab 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -24,7 +24,7 @@ def from_str(operator_label: str) -> "LogicalOperator": Convert a string to a LogicalOperator enum. :param operator_label: The string to convert. - :return: The corresponding FilterPolicy enum. + :return: The corresponding LogicalOperator enum. """ enum_map = {e.value: e for e in LogicalOperator} operator = enum_map.get(operator_label) diff --git a/test/document_stores/test_filter_policy.py b/test/document_stores/test_filter_policy.py index eb1155ae6e..5f84274919 100644 --- a/test/document_stores/test_filter_policy.py +++ b/test/document_stores/test_filter_policy.py @@ -7,6 +7,35 @@ from haystack.document_stores.types import LogicalOperator, apply_filter_policy, FilterPolicy +def test_logical_operator_from_str(): + """ + Test the conversion of a string to a LogicalOperator enum. + """ + assert LogicalOperator.from_str("AND") == LogicalOperator.AND + assert LogicalOperator.from_str("OR") == LogicalOperator.OR + assert LogicalOperator.from_str("NOT") == LogicalOperator.NOT + + with pytest.raises(ValueError): + LogicalOperator.from_str(None) + + with pytest.raises(ValueError): + LogicalOperator.from_str("INVALID") + + +def test_filter_policy_from_str(): + """ + Test the conversion of a string to a FilterPolicy enum. + """ + assert FilterPolicy.from_str("REPLACE") == FilterPolicy.REPLACE + assert FilterPolicy.from_str("MERGE") == FilterPolicy.MERGE + + with pytest.raises(ValueError): + FilterPolicy.from_str(None) + + with pytest.raises(ValueError): + FilterPolicy.from_str("INVALID") + + def test_merge_two_comparison_filters(): """ Merging two comparison filters From ad4b59c841b43f67fd649b4f5e088fa2e40ea64a Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Aug 2024 12:29:14 +0200 Subject: [PATCH 10/17] Make Enums case insensitive --- haystack/document_stores/types/filter_policy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 9babbfaeab..2f30c359b7 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -26,8 +26,8 @@ def from_str(operator_label: str) -> "LogicalOperator": :param operator_label: The string to convert. :return: The corresponding LogicalOperator enum. """ - enum_map = {e.value: e for e in LogicalOperator} - operator = enum_map.get(operator_label) + enum_map = {e.value.lower(): e for e in LogicalOperator} + operator = enum_map.get(operator_label.lower() if operator_label else "") if operator is None: msg = f"Unknown LogicalOperator type '{operator}'. Supported types are: {list(enum_map.keys())}" raise ValueError(msg) @@ -56,8 +56,8 @@ def from_str(filter_policy: str) -> "FilterPolicy": :param filter_policy: The string to convert. :return: The corresponding FilterPolicy enum. """ - enum_map = {e.value: e for e in FilterPolicy} - policy = enum_map.get(filter_policy) + enum_map = {e.value.lower(): e for e in FilterPolicy} + policy = enum_map.get(filter_policy.lower() if filter_policy else "") if policy is None: msg = f"Unknown FilterPolicy type '{filter_policy}'. Supported types are: {list(enum_map.keys())}" raise ValueError(msg) From b09826ce59a5ea0c389bd2326d9719573914c7dc Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Aug 2024 12:40:54 +0200 Subject: [PATCH 11/17] Handle string based logical operators for backward compatibility --- .../document_stores/types/filter_policy.py | 28 +++++++++++++++- test/document_stores/test_filter_policy.py | 32 +++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 2f30c359b7..1ac5a7c40f 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -64,6 +64,29 @@ def from_str(filter_policy: str) -> "FilterPolicy": return policy +def convert_logical_operators(filter_dict: Dict[str, Any]) -> Dict[str, Any]: + """ + Convert string-based logical operators in a filter dictionary to LogicalOperator enums. + + :param filter_dict: A dictionary representing a filter with potential string logical operators. + :return: A new dictionary with LogicalOperator enums in place of string operators. + """ + # If the dictionary represents a logical filter, update the 'operator' + if is_logical_filter(filter_dict) and isinstance(filter_dict["operator"], str): + try: + filter_dict["operator"] = LogicalOperator.from_str(filter_dict["operator"]) + except ValueError as e: + raise ValueError(f"Error converting logical operator: {e}") + + # Not sure if these filters can be nested, but just in case they are let's recursively convert them + filter_dict["conditions"] = [ + convert_logical_operators(condition) if isinstance(condition, dict) else condition + for condition in filter_dict["conditions"] + ] + + return filter_dict + + def is_comparison_filter(filter_item: Dict[str, Any]) -> bool: """ Check if the given filter is a comparison filter. @@ -318,7 +341,10 @@ def apply_filter_policy( :returns: A dictionary containing the resulting filters based on the provided policy. """ if filter_policy == FilterPolicy.MERGE and runtime_filters and init_filters: - # here we merge new filters + # first convert string-based logical operators to LogicalOperator enums + init_filters = convert_logical_operators(init_filters) + runtime_filters = convert_logical_operators(runtime_filters) + # now we merge filters if is_comparison_filter(init_filters) and is_comparison_filter(runtime_filters): return combine_two_comparison_filters(init_filters, runtime_filters, default_logical_operator) elif is_comparison_filter(init_filters) and is_logical_filter(runtime_filters): diff --git a/test/document_stores/test_filter_policy.py b/test/document_stores/test_filter_policy.py index 5f84274919..abe761ddac 100644 --- a/test/document_stores/test_filter_policy.py +++ b/test/document_stores/test_filter_policy.py @@ -77,6 +77,38 @@ def test_merge_init_comparison_and_runtime_logical_filters(): } +def test_merge_runtime_comparison_and_init_logical_filters_with_string_operators(): + """ + Merging a runtime comparison filter with an init logical filter, but with string-based logical operators + Result: AND operator with both filters + """ + # Test with string-based logical operators + init_filters = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ], + } + runtime_filters = { + "operator": "AND", + "conditions": [ + {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + } + result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) + assert result == { + "operator": LogicalOperator.AND, + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + } + + def test_merge_runtime_comparison_and_init_logical_filters(): """ Merging a runtime comparison filter with an init logical filter From fe45233642754eb74ca522a3c5bd085fcc22f7c4 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Aug 2024 12:47:05 +0200 Subject: [PATCH 12/17] Update release note with doc link --- .../notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/releasenotes/notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml b/releasenotes/notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml index 32cbbca67e..c90479c2c6 100644 --- a/releasenotes/notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml +++ b/releasenotes/notes/implement-merge-filter-logic-99e6785a78f80ae9.yaml @@ -1,4 +1,4 @@ --- enhancements: - | - Enhanced filter application logic to support merging of filters. It facilitates more precise retrieval filtering, allowing for both init and runtime complex filter combinations with logical operators. + Enhanced filter application logic to support merging of filters. It facilitates more precise retrieval filtering, allowing for both init and runtime complex filter combinations with logical operators. For more details see https://docs.haystack.deepset.ai/docs/metadata-filtering From 3c3ff75b474bc1a2ab85c0e2eceeed69be573e30 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Aug 2024 14:31:05 +0200 Subject: [PATCH 13/17] Assume logical filters are not nested --- haystack/document_stores/types/filter_policy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 1ac5a7c40f..e06f623634 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -78,12 +78,6 @@ def convert_logical_operators(filter_dict: Dict[str, Any]) -> Dict[str, Any]: except ValueError as e: raise ValueError(f"Error converting logical operator: {e}") - # Not sure if these filters can be nested, but just in case they are let's recursively convert them - filter_dict["conditions"] = [ - convert_logical_operators(condition) if isinstance(condition, dict) else condition - for condition in filter_dict["conditions"] - ] - return filter_dict From 519de1c5edd9f6ab3616ae477ac98d66e2975372 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Aug 2024 19:01:24 +0200 Subject: [PATCH 14/17] Output pydoc fix --- haystack/document_stores/types/filter_policy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index e06f623634..786c30767d 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -133,7 +133,7 @@ def combine_two_logical_filters( ) # Output: { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -186,7 +186,7 @@ def combine_init_comparison_and_runtime_logical_filters( ) # Output: { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -243,7 +243,7 @@ def combine_runtime_comparison_and_init_logical_filters( ) # Output: { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -293,7 +293,7 @@ def combine_two_comparison_filters( ) # Output: { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, From a2621ada9c6dd54e170b9268935ae751b9461890 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Aug 2024 12:36:22 +0200 Subject: [PATCH 15/17] Convert all apply_filter_policy logical operators to str representations --- .../document_stores/types/filter_policy.py | 30 +++++++++++++++---- test/document_stores/test_filter_policy.py | 22 +++++++------- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 786c30767d..4825ac31ec 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -81,6 +81,19 @@ def convert_logical_operators(filter_dict: Dict[str, Any]) -> Dict[str, Any]: return filter_dict +def logical_operators_to_str(filter_dict: Dict[str, Any]) -> Dict[str, Any]: + """ + Convert any logical operators found in a filter dictionary to string. + + :param filter_dict: A dictionary representing a filter with potential LogicalOperator enums. + :return: A new dictionary with LogicalOperator strings in place of enums. + """ + # If the dictionary represents a logical filter, update the 'operator' + if is_logical_filter(filter_dict) and isinstance(filter_dict["operator"], LogicalOperator): + filter_dict["operator"] = filter_dict["operator"].value + return filter_dict + + def is_comparison_filter(filter_item: Dict[str, Any]) -> bool: """ Check if the given filter is a comparison filter. @@ -340,16 +353,23 @@ def apply_filter_policy( runtime_filters = convert_logical_operators(runtime_filters) # now we merge filters if is_comparison_filter(init_filters) and is_comparison_filter(runtime_filters): - return combine_two_comparison_filters(init_filters, runtime_filters, default_logical_operator) + combined_filters = combine_two_comparison_filters(init_filters, runtime_filters, default_logical_operator) + return logical_operators_to_str(combined_filters) elif is_comparison_filter(init_filters) and is_logical_filter(runtime_filters): - return combine_init_comparison_and_runtime_logical_filters( + combined_filters = combine_init_comparison_and_runtime_logical_filters( init_filters, runtime_filters, default_logical_operator ) + return logical_operators_to_str(combined_filters) elif is_logical_filter(init_filters) and is_comparison_filter(runtime_filters): - return combine_runtime_comparison_and_init_logical_filters( + combined_filters = combine_runtime_comparison_and_init_logical_filters( runtime_filters, init_filters, default_logical_operator ) + return logical_operators_to_str(combined_filters) elif is_logical_filter(init_filters) and is_logical_filter(runtime_filters): - return combine_two_logical_filters(init_filters, runtime_filters) + combined_filters = combine_two_logical_filters(init_filters, runtime_filters) + return logical_operators_to_str(combined_filters) - return runtime_filters or init_filters + resulting_filter = runtime_filters or init_filters + if resulting_filter: + resulting_filter = logical_operators_to_str(resulting_filter) + return resulting_filter diff --git a/test/document_stores/test_filter_policy.py b/test/document_stores/test_filter_policy.py index abe761ddac..26ad193759 100644 --- a/test/document_stores/test_filter_policy.py +++ b/test/document_stores/test_filter_policy.py @@ -45,7 +45,7 @@ def test_merge_two_comparison_filters(): runtime_filters = {"field": "meta.type", "operator": "==", "value": "article"} result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, {"field": "meta.type", "operator": "==", "value": "article"}, @@ -60,7 +60,7 @@ def test_merge_init_comparison_and_runtime_logical_filters(): """ init_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} runtime_filters = { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -68,7 +68,7 @@ def test_merge_init_comparison_and_runtime_logical_filters(): } result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -99,7 +99,7 @@ def test_merge_runtime_comparison_and_init_logical_filters_with_string_operators } result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -115,7 +115,7 @@ def test_merge_runtime_comparison_and_init_logical_filters(): Result: AND operator with both filters """ init_filters = { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -124,7 +124,7 @@ def test_merge_runtime_comparison_and_init_logical_filters(): runtime_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -139,14 +139,14 @@ def test_merge_two_logical_filters(): Result: AND operator with both filters """ init_filters = { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, ], } runtime_filters = { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, @@ -154,7 +154,7 @@ def test_merge_two_logical_filters(): } result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) assert result == { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -171,7 +171,7 @@ def test_merge_with_different_logical_operators(): """ init_filters = {"operator": "AND", "conditions": [{"field": "meta.type", "operator": "==", "value": "article"}]} runtime_filters = { - "operator": LogicalOperator.OR, + "operator": "OR", "conditions": [{"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}], } result = apply_filter_policy(FilterPolicy.MERGE, init_filters, runtime_filters) @@ -204,7 +204,7 @@ def test_merge_with_custom_logical_operator(logical_operator): default_logical_operator=LogicalOperator.from_str(logical_operator), ) assert result == { - "operator": LogicalOperator.from_str(logical_operator), + "operator": logical_operator, "conditions": [ {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, {"field": "meta.type", "operator": "==", "value": "article"}, From aeac10dc4b26d1f0410b178f1a8ae8cdbaef2b41 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Aug 2024 13:59:30 +0200 Subject: [PATCH 16/17] Alternative impl to ensure str operators in logical filters --- .../document_stores/types/filter_policy.py | 39 ++++++------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 4825ac31ec..03e0114eb7 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -81,19 +81,6 @@ def convert_logical_operators(filter_dict: Dict[str, Any]) -> Dict[str, Any]: return filter_dict -def logical_operators_to_str(filter_dict: Dict[str, Any]) -> Dict[str, Any]: - """ - Convert any logical operators found in a filter dictionary to string. - - :param filter_dict: A dictionary representing a filter with potential LogicalOperator enums. - :return: A new dictionary with LogicalOperator strings in place of enums. - """ - # If the dictionary represents a logical filter, update the 'operator' - if is_logical_filter(filter_dict) and isinstance(filter_dict["operator"], LogicalOperator): - filter_dict["operator"] = filter_dict["operator"].value - return filter_dict - - def is_comparison_filter(filter_item: Dict[str, Any]) -> bool: """ Check if the given filter is a comparison filter. @@ -158,7 +145,7 @@ def combine_two_logical_filters( """ if init_logical_filter["operator"] == runtime_logical_filter["operator"]: return { - "operator": init_logical_filter["operator"], + "operator": str(init_logical_filter["operator"]), "conditions": init_logical_filter["conditions"] + runtime_logical_filter["conditions"], } @@ -171,6 +158,7 @@ def combine_two_logical_filters( init_logical_filter=init_logical_filter, runtime_logical_filter=runtime_logical_filter, ) + runtime_logical_filter["operator"] = str(runtime_logical_filter["operator"]) return runtime_logical_filter @@ -220,7 +208,7 @@ def combine_init_comparison_and_runtime_logical_filters( init_filter=init_comparison_filter, filters=runtime_logical_filter, ) - return {"operator": runtime_logical_filter["operator"], "conditions": conditions} + return {"operator": str(runtime_logical_filter["operator"]), "conditions": conditions} logger.warning( "The provided logical_operator, {logical_operator}, does not match the logical operator found in " @@ -228,6 +216,7 @@ def combine_init_comparison_and_runtime_logical_filters( logical_operator=logical_operator, filters_logical_operator=runtime_logical_filter["operator"], ) + runtime_logical_filter["operator"] = str(runtime_logical_filter["operator"]) return runtime_logical_filter @@ -276,7 +265,7 @@ def combine_runtime_comparison_and_init_logical_filters( ) conditions = [c for c in conditions if c.get("field") != runtime_comparison_filter["field"]] conditions.append(runtime_comparison_filter) - return {"operator": init_logical_filter["operator"], "conditions": conditions} + return {"operator": str(init_logical_filter["operator"]), "conditions": conditions} logger.warning( "The provided logical_operator, {logical_operator}, does not match the logical operator found in " @@ -323,7 +312,7 @@ def combine_two_comparison_filters( ) return runtime_comparison_filter - return {"operator": logical_operator, "conditions": [init_comparison_filter, runtime_comparison_filter]} + return {"operator": str(logical_operator), "conditions": [init_comparison_filter, runtime_comparison_filter]} def apply_filter_policy( @@ -353,23 +342,19 @@ def apply_filter_policy( runtime_filters = convert_logical_operators(runtime_filters) # now we merge filters if is_comparison_filter(init_filters) and is_comparison_filter(runtime_filters): - combined_filters = combine_two_comparison_filters(init_filters, runtime_filters, default_logical_operator) - return logical_operators_to_str(combined_filters) + return combine_two_comparison_filters(init_filters, runtime_filters, default_logical_operator) elif is_comparison_filter(init_filters) and is_logical_filter(runtime_filters): - combined_filters = combine_init_comparison_and_runtime_logical_filters( + return combine_init_comparison_and_runtime_logical_filters( init_filters, runtime_filters, default_logical_operator ) - return logical_operators_to_str(combined_filters) elif is_logical_filter(init_filters) and is_comparison_filter(runtime_filters): - combined_filters = combine_runtime_comparison_and_init_logical_filters( + return combine_runtime_comparison_and_init_logical_filters( runtime_filters, init_filters, default_logical_operator ) - return logical_operators_to_str(combined_filters) elif is_logical_filter(init_filters) and is_logical_filter(runtime_filters): - combined_filters = combine_two_logical_filters(init_filters, runtime_filters) - return logical_operators_to_str(combined_filters) + return combine_two_logical_filters(init_filters, runtime_filters) resulting_filter = runtime_filters or init_filters - if resulting_filter: - resulting_filter = logical_operators_to_str(resulting_filter) + if resulting_filter and is_logical_filter(resulting_filter): + resulting_filter["operator"] = str(resulting_filter["operator"]) return resulting_filter From a1cc6f942270682104fa850980a9204fefacea98 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Aug 2024 17:29:26 +0200 Subject: [PATCH 17/17] Revert back to Literal instead of LogicalOperator Enum --- haystack/document_stores/types/__init__.py | 4 +- .../document_stores/types/filter_policy.py | 81 +++++-------------- test/document_stores/test_filter_policy.py | 38 +-------- 3 files changed, 25 insertions(+), 98 deletions(-) diff --git a/haystack/document_stores/types/__init__.py b/haystack/document_stores/types/__init__.py index 1924b78a00..ed6becf8b4 100644 --- a/haystack/document_stores/types/__init__.py +++ b/haystack/document_stores/types/__init__.py @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from .filter_policy import FilterPolicy, LogicalOperator, apply_filter_policy +from .filter_policy import FilterPolicy, apply_filter_policy from .policy import DuplicatePolicy from .protocol import DocumentStore -__all__ = ["apply_filter_policy", "DocumentStore", "DuplicatePolicy", "FilterPolicy", "LogicalOperator"] +__all__ = ["apply_filter_policy", "DocumentStore", "DuplicatePolicy", "FilterPolicy"] diff --git a/haystack/document_stores/types/filter_policy.py b/haystack/document_stores/types/filter_policy.py index 03e0114eb7..b0dc58d895 100644 --- a/haystack/document_stores/types/filter_policy.py +++ b/haystack/document_stores/types/filter_policy.py @@ -3,37 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 from enum import Enum -from typing import Any, Dict, Optional +from typing import Any, Dict, Literal, Optional from haystack import logging logger = logging.getLogger(__name__) -class LogicalOperator(Enum): - AND = "AND" - OR = "OR" - NOT = "NOT" - - def __str__(self): - return self.value - - @staticmethod - def from_str(operator_label: str) -> "LogicalOperator": - """ - Convert a string to a LogicalOperator enum. - - :param operator_label: The string to convert. - :return: The corresponding LogicalOperator enum. - """ - enum_map = {e.value.lower(): e for e in LogicalOperator} - operator = enum_map.get(operator_label.lower() if operator_label else "") - if operator is None: - msg = f"Unknown LogicalOperator type '{operator}'. Supported types are: {list(enum_map.keys())}" - raise ValueError(msg) - return operator - - class FilterPolicy(Enum): """ Policy to determine how filters are applied in retrievers interacting with document stores. @@ -64,23 +40,6 @@ def from_str(filter_policy: str) -> "FilterPolicy": return policy -def convert_logical_operators(filter_dict: Dict[str, Any]) -> Dict[str, Any]: - """ - Convert string-based logical operators in a filter dictionary to LogicalOperator enums. - - :param filter_dict: A dictionary representing a filter with potential string logical operators. - :return: A new dictionary with LogicalOperator enums in place of string operators. - """ - # If the dictionary represents a logical filter, update the 'operator' - if is_logical_filter(filter_dict) and isinstance(filter_dict["operator"], str): - try: - filter_dict["operator"] = LogicalOperator.from_str(filter_dict["operator"]) - except ValueError as e: - raise ValueError(f"Error converting logical operator: {e}") - - return filter_dict - - def is_comparison_filter(filter_item: Dict[str, Any]) -> bool: """ Check if the given filter is a comparison filter. @@ -115,21 +74,21 @@ def combine_two_logical_filters( ```python init_logical_filter = { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, ] } runtime_logical_filter = { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.genre", "operator": "IN", "value": ["economy", "politics"]}, {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, ] } new_filters = combine_two_logical_filters( - init_logical_filter, runtime_logical_filter, LogicalOperator.AND + init_logical_filter, runtime_logical_filter, "AND" ) # Output: { @@ -163,7 +122,9 @@ def combine_two_logical_filters( def combine_init_comparison_and_runtime_logical_filters( - init_comparison_filter: Dict[str, Any], runtime_logical_filter: Dict[str, Any], logical_operator: LogicalOperator + init_comparison_filter: Dict[str, Any], + runtime_logical_filter: Dict[str, Any], + logical_operator: Literal["AND", "OR", "NOT"], ) -> Dict[str, Any]: """ Combine a runtime logical filter with the init comparison filter using the provided logical_operator. @@ -175,7 +136,7 @@ def combine_init_comparison_and_runtime_logical_filters( ```python runtime_logical_filter = { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -183,7 +144,7 @@ def combine_init_comparison_and_runtime_logical_filters( } init_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} new_filters = combine_init_comparison_and_runtime_logical_filters( - init_comparison_filter, runtime_logical_filter, LogicalOperator.AND + init_comparison_filter, runtime_logical_filter, "AND" ) # Output: { @@ -221,7 +182,9 @@ def combine_init_comparison_and_runtime_logical_filters( def combine_runtime_comparison_and_init_logical_filters( - runtime_comparison_filter: Dict[str, Any], init_logical_filter: Dict[str, Any], logical_operator: LogicalOperator + runtime_comparison_filter: Dict[str, Any], + init_logical_filter: Dict[str, Any], + logical_operator: Literal["AND", "OR", "NOT"], ) -> Dict[str, Any]: """ Combine an init logical filter with the runtime comparison filter using the provided logical_operator. @@ -233,7 +196,7 @@ def combine_runtime_comparison_and_init_logical_filters( ```python init_logical_filter = { - "operator": LogicalOperator.AND, + "operator": "AND", "conditions": [ {"field": "meta.type", "operator": "==", "value": "article"}, {"field": "meta.rating", "operator": ">=", "value": 3}, @@ -241,7 +204,7 @@ def combine_runtime_comparison_and_init_logical_filters( } runtime_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} new_filters = combine_runtime_comparison_and_init_logical_filters( - runtime_comparison_filter, init_logical_filter, LogicalOperator.AND + runtime_comparison_filter, init_logical_filter, "AND" ) # Output: { @@ -277,7 +240,9 @@ def combine_runtime_comparison_and_init_logical_filters( def combine_two_comparison_filters( - init_comparison_filter: Dict[str, Any], runtime_comparison_filter: Dict[str, Any], logical_operator: LogicalOperator + init_comparison_filter: Dict[str, Any], + runtime_comparison_filter: Dict[str, Any], + logical_operator: Literal["AND", "OR", "NOT"], ) -> Dict[str, Any]: """ Combine a comparison filter with the `init_comparison_filter` using the provided `logical_operator`. @@ -291,7 +256,7 @@ def combine_two_comparison_filters( runtime_comparison_filter = {"field": "meta.type", "operator": "==", "value": "article"}, init_comparison_filter = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, new_filters = combine_two_comparison_filters( - init_comparison_filter, runtime_comparison_filter, LogicalOperator.AND + init_comparison_filter, runtime_comparison_filter, "AND" ) # Output: { @@ -319,7 +284,7 @@ def apply_filter_policy( filter_policy: FilterPolicy, init_filters: Optional[Dict[str, Any]] = None, runtime_filters: Optional[Dict[str, Any]] = None, - default_logical_operator: LogicalOperator = LogicalOperator.AND, + default_logical_operator: Literal["AND", "OR", "NOT"] = "AND", ) -> Optional[Dict[str, Any]]: """ Apply the filter policy to the given initial and runtime filters to determine the final set of filters used. @@ -337,9 +302,6 @@ def apply_filter_policy( :returns: A dictionary containing the resulting filters based on the provided policy. """ if filter_policy == FilterPolicy.MERGE and runtime_filters and init_filters: - # first convert string-based logical operators to LogicalOperator enums - init_filters = convert_logical_operators(init_filters) - runtime_filters = convert_logical_operators(runtime_filters) # now we merge filters if is_comparison_filter(init_filters) and is_comparison_filter(runtime_filters): return combine_two_comparison_filters(init_filters, runtime_filters, default_logical_operator) @@ -354,7 +316,4 @@ def apply_filter_policy( elif is_logical_filter(init_filters) and is_logical_filter(runtime_filters): return combine_two_logical_filters(init_filters, runtime_filters) - resulting_filter = runtime_filters or init_filters - if resulting_filter and is_logical_filter(resulting_filter): - resulting_filter["operator"] = str(resulting_filter["operator"]) - return resulting_filter + return runtime_filters or init_filters diff --git a/test/document_stores/test_filter_policy.py b/test/document_stores/test_filter_policy.py index 26ad193759..d775ee356e 100644 --- a/test/document_stores/test_filter_policy.py +++ b/test/document_stores/test_filter_policy.py @@ -4,36 +4,7 @@ import pytest -from haystack.document_stores.types import LogicalOperator, apply_filter_policy, FilterPolicy - - -def test_logical_operator_from_str(): - """ - Test the conversion of a string to a LogicalOperator enum. - """ - assert LogicalOperator.from_str("AND") == LogicalOperator.AND - assert LogicalOperator.from_str("OR") == LogicalOperator.OR - assert LogicalOperator.from_str("NOT") == LogicalOperator.NOT - - with pytest.raises(ValueError): - LogicalOperator.from_str(None) - - with pytest.raises(ValueError): - LogicalOperator.from_str("INVALID") - - -def test_filter_policy_from_str(): - """ - Test the conversion of a string to a FilterPolicy enum. - """ - assert FilterPolicy.from_str("REPLACE") == FilterPolicy.REPLACE - assert FilterPolicy.from_str("MERGE") == FilterPolicy.MERGE - - with pytest.raises(ValueError): - FilterPolicy.from_str(None) - - with pytest.raises(ValueError): - FilterPolicy.from_str("INVALID") +from haystack.document_stores.types import apply_filter_policy, FilterPolicy def test_merge_two_comparison_filters(): @@ -190,7 +161,7 @@ def test_merge_comparison_filters_with_same_field(): @pytest.mark.parametrize("logical_operator", ["AND", "OR", "NOT"]) -def test_merge_with_custom_logical_operator(logical_operator): +def test_merge_with_custom_logical_operator(logical_operator: str): """ Merging with a custom logical operator Result: The given logical operator with both filters @@ -198,10 +169,7 @@ def test_merge_with_custom_logical_operator(logical_operator): init_filters = {"field": "meta.date", "operator": ">=", "value": "2015-01-01"} runtime_filters = {"field": "meta.type", "operator": "==", "value": "article"} result = apply_filter_policy( - FilterPolicy.MERGE, - init_filters, - runtime_filters, - default_logical_operator=LogicalOperator.from_str(logical_operator), + FilterPolicy.MERGE, init_filters, runtime_filters, default_logical_operator=logical_operator ) assert result == { "operator": logical_operator,