Skip to content

Commit

Permalink
Fix flaky test_filter_numeric_isnotin_unsigned (#496)
Browse files Browse the repository at this point in the history
The cause was type_arithmetic_promoted_type would return int64 as the common type for uint64 and any signed int. When we then do the static_cast<WideType>(*ptr++) before calling the Is[Not]InOperator, the uint64 is converted to int64 and the special overloads in the Operators are never used.

+ Ability to only_test_encoding_version_v1 in a test/class/module
  • Loading branch information
qc00 committed Jul 14, 2023
1 parent 93906d9 commit 447f90c
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 31 deletions.
10 changes: 8 additions & 2 deletions cpp/arcticdb/processing/operation_dispatch_binary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,14 @@ VariantData binary_membership(const ColumnWithStrings& column_with_strings, Valu
auto ptr = reinterpret_cast<const ColumnType*>(block.value().data());
const auto row_count = block.value().row_count();
for (auto i = 0u; i < row_count; ++i, ++pos) {
if(func(static_cast<WideType>(*ptr++), *typed_value_set))
inserter = pos;
if constexpr (MembershipOperator::needs_uint64_special_handling<ColumnType, ValueSetBaseType>) {
// Avoid narrowing conversion on *ptr:
if (func(*ptr++, *typed_value_set, UInt64SpecialHandlingTag{}))
inserter = pos;
} else {
if (func(static_cast<WideType>(*ptr++), *typed_value_set))
inserter = pos;
}
}
}
inserter.flush();
Expand Down
57 changes: 47 additions & 10 deletions cpp/arcticdb/processing/operation_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ struct PlusOperator;
struct MinusOperator;
struct TimesOperator;
struct DivideOperator;
struct MembershipOperator;

namespace arithmetic_promoted_type::details {
template <class VAL>
Expand Down Expand Up @@ -179,8 +180,15 @@ struct type_arithmetic_promoted_type {
std::conditional_t<(std::is_signed_v<LHS> && sizeof(LHS) > sizeof(RHS)) || (std::is_signed_v<RHS> && sizeof(RHS) > sizeof(LHS)),
// If the signed type is strictly larger than the unsigned type, then promote to the signed type
typename arithmetic_promoted_type::details::signed_width_t<max_width>,
// Otherwise, promote to a signed type wider than the unsigned type, so that it can be exactly represented
typename arithmetic_promoted_type::details::signed_width_t<2 * max_width>
// Otherwise, check if the unsigned one is the widest type we support
std::conditional_t<std::is_same_v<LHS, uint64_t> || std::is_same_v<LHS, uint64_t>,
// If so, there's no common type that can completely hold both arguments. We trigger operation-specific handling
std::conditional_t<std::is_base_of_v<MembershipOperator, Func>,
RHS, // Retains ValueSetBaseType in binary_membership()
int64_t>, // Retain the broken behaviour for Divide for now (https://github.com/man-group/ArcticDB/issues/594)
// There should be a signed type wider than the unsigned type, so both can be exactly represented
typename arithmetic_promoted_type::details::signed_width_t<2 * max_width>
>
>
>
>
Expand Down Expand Up @@ -373,18 +381,44 @@ bool operator()(int64_t t, uint64_t u) const {
}
};

struct IsInOperator {
struct MembershipOperator {
protected:
/** Returns the high bits beyond what can be held in the signed integer type I, including I's sign bit. */
template<typename I>
static constexpr uint64_t incomparable_bits() {
return std::numeric_limits<uint64_t>::max() - static_cast<uint64_t>(std::numeric_limits<I>::max());
}

template<typename U>
static constexpr bool is_signed_int = std::is_integral_v<U> && std::is_signed_v<U>;

public:
/** This is tighter than the signatures of the special handling operator()s below to reject argument types smaller
* than uint64 going down the special handling via type promotion. */
template<typename ColumnType, typename ValueSetBaseType>
static constexpr bool needs_uint64_special_handling =
(std::is_same_v<ColumnType, uint64_t> && is_signed_int<ValueSetBaseType>) ||
(std::is_same_v<ValueSetBaseType, uint64_t> && is_signed_int<ColumnType>);
};

/** Used as a dummy parameter to ensure we don't pick the non-special handling overloads by mistake. */
struct UInt64SpecialHandlingTag {};

struct IsInOperator: MembershipOperator {
template<typename T, typename U>
bool operator()(T t, const std::unordered_set<U>& u) const {
return u.count(t) > 0;
}
bool operator()(uint64_t t, const std::unordered_set<int64_t>& u) const {
if (comparison::msb_set(t))

template<typename U, typename=std::enable_if_t<is_signed_int<U>>>
bool operator()(uint64_t t, const std::unordered_set<U>& u, UInt64SpecialHandlingTag = {}) const {
auto incomparable = t & incomparable_bits<U>();
if (incomparable)
return false;
else
return u.count(t) > 0;
}
bool operator()(int64_t t, const std::unordered_set<uint64_t>& u) const {
bool operator()(int64_t t, const std::unordered_set<uint64_t>& u, UInt64SpecialHandlingTag = {}) const {
if (t < 0)
return false;
else
Expand All @@ -397,18 +431,21 @@ bool operator()(T t, const emilib::HashSet<U>& u) const {
}
};

struct IsNotInOperator {
struct IsNotInOperator: MembershipOperator {
template<typename T, typename U>
bool operator()(T t, const std::unordered_set<U>& u) const {
return u.count(t) == 0;
}
bool operator()(uint64_t t, const std::unordered_set<int64_t>& u) const {
if (comparison::msb_set(t))

template<typename U, typename = std::enable_if_t<is_signed_int<U>>>
bool operator()(uint64_t t, const std::unordered_set<U>& u, UInt64SpecialHandlingTag = {}) const {
auto incomparable = t & incomparable_bits<U>();
if (incomparable)
return true;
else
return u.count(t) == 0;
}
bool operator()(int64_t t, const std::unordered_set<uint64_t>& u) const {
bool operator()(int64_t t, const std::unordered_set<uint64_t>& u, UInt64SpecialHandlingTag = {}) const {
if (t < 0)
return true;
else
Expand Down
12 changes: 9 additions & 3 deletions python/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,15 @@ class EncodingVersion(enum.IntEnum):
V2 = 1


@pytest.fixture(params=list(EncodingVersion))
def encoding_version(request) -> EncodingVersion:
return request.param
@pytest.fixture(scope="session")
def only_test_encoding_version_v1():
"""Dummy fixture to reference at module/class level to reduce test cases"""


def pytest_generate_tests(metafunc):
if "encoding_version" in metafunc.fixturenames:
only_v1 = "only_test_encoding_version_v1" in metafunc.fixturenames
metafunc.parametrize("encoding_version", [EncodingVersion.V1] if only_v1 else list(EncodingVersion))


@pytest.fixture
Expand Down
51 changes: 35 additions & 16 deletions python/tests/unit/arcticdb/version_store/test_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
import pandas as pd
import pytest
from pytz import timezone
from packaging.version import Version
import random
import string

from arcticdb.exceptions import ArcticNativeException
from arcticdb.version_store.processing import QueryBuilder
from arcticdb_ext.exceptions import InternalException, UserInputException
from arcticdb.util.test import assert_frame_equal, IS_PANDAS_ZERO
from arcticdb.util.test import assert_frame_equal, PANDAS_VERSION
from arcticdb.util.hypothesis import (
use_of_function_scoped_fixtures_in_hypothesis_checked,
integral_type_strategies,
Expand All @@ -39,15 +40,20 @@
from arcticdb_ext import set_config_int


@pytest.fixture(scope="module", autouse=True)
def _restrict_to(only_test_encoding_version_v1):
pass


def generic_filter_test(version_store, symbol, df, arctic_query, pandas_query, dynamic_strings=True):
version_store.write(symbol, df, dynamic_strings=dynamic_strings)
expected = df.query(pandas_query)
received = version_store.read(symbol, query_builder=arctic_query).data
if not np.array_equal(expected, received):
print("Original dataframe\n{}".format(df))
print("Pandas query\n{}".format(pandas_query))
print("Expected\n{}".format(expected))
print("Received\n{}".format(received))
print(f"\nOriginal dataframe:\n{df}\ndtypes:\n{df.dtypes}")
print(f"\nPandas query: {pandas_query}")
print(f"\nPandas returns:\n{expected}")
print(f"\nQueryBuilder returns:\n{received}")
assert False
assert True

Expand Down Expand Up @@ -895,20 +901,14 @@ def test_filter_isin_clashing_sets(lmdb_version_store):
generic_filter_test(lmdb_version_store, "test_filter_isin_clashing_sets", df, q, pandas_query)


def numeric_isin_asumptions(df, vals):
assume(not df.empty)
# If df values need a uint64 to hold them then we only support unsigned vals
assume(df["a"].between(-(2**63), 2**63 - 1).all() or all(v >= 0 for v in vals))


@use_of_function_scoped_fixtures_in_hypothesis_checked
@settings(deadline=None)
@given(
df=dataframes_with_names_and_dtypes(["a"], integral_type_strategies()),
vals=st.frozensets(signed_integral_type_strategies(), min_size=1),
)
def test_filter_numeric_isin_signed(lmdb_version_store, df, vals):
numeric_isin_asumptions(df, vals)
assume(not df.empty)
q = QueryBuilder()
q = q[q["a"].isin(vals)]
pandas_query = "a in {}".format(list(vals))
Expand All @@ -922,7 +922,7 @@ def test_filter_numeric_isin_signed(lmdb_version_store, df, vals):
vals=st.frozensets(unsigned_integral_type_strategies(), min_size=1),
)
def test_filter_numeric_isin_unsigned(lmdb_version_store, df, vals):
numeric_isin_asumptions(df, vals)
assume(not df.empty)
q = QueryBuilder()
q = q[q["a"].isin(vals)]
pandas_query = "a in {}".format(list(vals))
Expand Down Expand Up @@ -968,9 +968,9 @@ def test_filter_numeric_isin_unsigned(lmdb_version_store):
df=dataframes_with_names_and_dtypes(["a"], integral_type_strategies()),
vals=st.frozensets(unsigned_integral_type_strategies(), min_size=1),
)
@pytest.mark.skipif(IS_PANDAS_ZERO, reason="Early Pandas filtering does not handle unsigned well")
@pytest.mark.skipif(PANDAS_VERSION < Version("1.2"), reason="Early Pandas filtering does not handle unsigned well")
def test_filter_numeric_isnotin_unsigned(lmdb_version_store, df, vals):
numeric_isin_asumptions(df, vals)
assume(not df.empty)
q = QueryBuilder()
q = q[q["a"].isnotin(vals)]
pandas_query = "a not in {}".format(list(vals))
Expand All @@ -984,7 +984,7 @@ def test_filter_numeric_isnotin_unsigned(lmdb_version_store, df, vals):
vals=st.frozensets(signed_integral_type_strategies(), min_size=1),
)
def test_filter_numeric_isnotin_signed(lmdb_version_store, df, vals):
numeric_isin_asumptions(df, vals)
assume(not df.empty)
q = QueryBuilder()
q = q[q["a"].isnotin(vals)]
pandas_query = "a not in {}".format(list(vals))
Expand All @@ -1010,6 +1010,25 @@ def test_filter_numeric_isnotin_hashing_overflow(lmdb_version_store):
assert_frame_equal(df, result)


_uint64_max = np.iinfo(np.uint64).max


@pytest.mark.parametrize("op", ("in", "not in"))
@pytest.mark.parametrize("signed_type", (np.int8, np.int16, np.int32, np.int64))
@pytest.mark.parametrize("uint64_in", ("df", "vals") if PANDAS_VERSION >= Version("1.2") else ("vals",))
def test_filter_numeric_membership_mixing_int64_and_uint64(lmdb_version_store, op, signed_type, uint64_in):
signed = signed_type(-1)
if uint64_in == "df":
df, vals = pd.DataFrame({"a": [_uint64_max]}), [signed]
else:
df, vals = pd.DataFrame({"a": [signed]}), [_uint64_max]

q = QueryBuilder()
q = q[q["a"].isin(vals) if op == "in" else q["a"].isnotin(vals)]
pandas_query = f"a {op} {vals}"
generic_filter_test(lmdb_version_store, "test_filter_numeric_mixing", df, q, pandas_query)


@use_of_function_scoped_fixtures_in_hypothesis_checked
@settings(deadline=None)
@given(df=dataframes_with_names_and_dtypes(["a"], integral_type_strategies()))
Expand Down

0 comments on commit 447f90c

Please sign in to comment.