From d2c85fc543cd841ec19887ab0e7788d7f9235876 Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 7 Oct 2024 17:45:32 +0200 Subject: [PATCH 1/3] chore: make sure we catch gil release errors from pybind11 --- packages/vaex-core/setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/vaex-core/setup.py b/packages/vaex-core/setup.py index 6f50c49f0..05baecdb3 100644 --- a/packages/vaex-core/setup.py +++ b/packages/vaex-core/setup.py @@ -121,10 +121,16 @@ def __str__(self): ] extra_compile_args.append("-g") extra_compile_args += extra_dev_options + if os.environ.get("CI"): + # this makes sure we catch error in using pybind11, see https://github.com/vaexio/vaex/issues/2439 + extra_compile_args += ["-UNDEBUG"] + else: + extra_compile_args += ["-DNDEBUG"] if sys.platform == "darwin": extra_compile_args.append("-mmacosx-version-min=10.9") + # on windows (Conda-forge builds), the dirname is an absolute path extension_vaexfast = Extension( "vaex.vaexfast", From 1023030fd42f47a4c1073aa2fa817080916e6e7f Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 7 Oct 2024 19:57:58 +0200 Subject: [PATCH 2/3] fix: hash object should properly handle null values as 'other' --- packages/vaex-core/src/hash_primitives.hpp | 147 ++++++++++++--------- packages/vaex-core/src/hash_string.hpp | 109 ++++++++------- 2 files changed, 146 insertions(+), 110 deletions(-) diff --git a/packages/vaex-core/src/hash_primitives.hpp b/packages/vaex-core/src/hash_primitives.hpp index f88cbbe14..85a614a62 100644 --- a/packages/vaex-core/src/hash_primitives.hpp +++ b/packages/vaex-core/src/hash_primitives.hpp @@ -303,24 +303,26 @@ class hash_base : public hash_common> { py::object key_array() { py::array_t output_array(this->length()); auto output = output_array.template mutable_unchecked<1>(); - py::gil_scoped_release gil; - auto offsets = this->offsets(); - size_t map_index = 0; - int64_t natural_order = 0; - // TODO: can be parallel due to non-overlapping maps - for (auto &map : this->maps) { - for (auto &el : map) { - key_type key = el.first; - int64_t index = static_cast(*this).key_offset(natural_order++, map_index, el, offsets[map_index]); - output(index) = key; + { + py::gil_scoped_release gil; + auto offsets = this->offsets(); + size_t map_index = 0; + int64_t natural_order = 0; + // TODO: can be parallel due to non-overlapping maps + for (auto &map : this->maps) { + for (auto &el : map) { + key_type key = el.first; + int64_t index = static_cast(*this).key_offset(natural_order++, map_index, el, offsets[map_index]); + output(index) = key; + } + map_index += 1; + } + if (this->nan_count) { + output(this->nan_index()) = NaNish::value; + } + if (this->null_count) { + output(this->null_index()) = -1; } - map_index += 1; - } - if (this->nan_count) { - output(this->nan_index()) = NaNish::value; - } - if (this->null_count) { - output(this->null_index()) = -1; } return output_array; } @@ -565,8 +567,11 @@ class ordered_set : public hash_base, T2, Hashmap2> { const key_type &value = values[i]; // the caller is responsible for finding masked values if (custom_isnan(value)) { - output[i - offset] = this->nan_value; - assert(this->nan_count > 0); + if(this->nan_count > 0) { + output[i - offset] = this->nan_value; + } else { + output[i - offset] = -1; + } } else { std::size_t hash = hasher_map_choice()(value); size_t map_index = (hash % nmaps); @@ -630,46 +635,50 @@ class ordered_set : public hash_base, T2, Hashmap2> { if (result.strides()[0] != result.itemsize()) { throw std::runtime_error("stride not equal to bytesize for output"); } - py::gil_scoped_release gil; + { + py::gil_scoped_release gil; - size_t nmaps = this->maps.size(); - auto offsets = this->offsets(); - if (nmaps == 1) { - auto &map0 = this->maps[0]; - for (int64_t i = 0; i < size; i++) { - const key_type &value = input[i]; - // the caller is responsible for finding masked values - if (custom_isnan(value)) { - output[i] = this->nan_value; - // TODO: the test fail here because we pass in NaN for None? - // but of course only in debug mode - assert(this->nan_count > 0); - } else { - auto search = map0.find(value); - if (search == map0.end()) { - output[i] = -1; + size_t nmaps = this->maps.size(); + auto offsets = this->offsets(); + if (nmaps == 1) { + auto &map0 = this->maps[0]; + for (int64_t i = 0; i < size; i++) { + const key_type &value = input[i]; + // the caller is responsible for finding masked values + if (custom_isnan(value)) { + if(this->null_count > 0) { + output[i] = this->nan_value; + } else { + output[i] = -1; + } } else { - output[i] = search->second; + auto search = map0.find(value); + if (search == map0.end()) { + output[i] = -1; + } else { + output[i] = search->second; + } } } - } - } else { - for (int64_t i = 0; i < size; i++) { - const key_type &value = input[i]; - // the caller is responsible for finding masked values - if (custom_isnan(value)) { - output[i] = this->nan_value; - // TODO: the test fail here because we pass in NaN for None? - // but of course only in debug mode - assert(this->nan_count > 0); - } else { - std::size_t hash = hasher_map_choice()(value); - size_t map_index = (hash % nmaps); - auto search = this->maps[map_index].find(value); - if (search == this->maps[map_index].end()) { - output[i] = -1; + } else { + for (int64_t i = 0; i < size; i++) { + const key_type &value = input[i]; + // the caller is responsible for finding masked values + if (custom_isnan(value)) { + if(this->nan_count > 0) { + output[i] = this->nan_value; + } else { + output[i] = -1; + } } else { - output[i] = search->second + offsets[map_index]; + std::size_t hash = hasher_map_choice()(value); + size_t map_index = (hash % nmaps); + auto search = this->maps[map_index].find(value); + if (search == this->maps[map_index].end()) { + output[i] = -1; + } else { + output[i] = search->second + offsets[map_index]; + } } } } @@ -790,9 +799,13 @@ class index_hash : public hash_base, T2, Hashmap2> { for (int64_t i = 0; i < size; i++) { const key_type &key = input(i); if (custom_isnan(key)) { - output(i) = nan_value; - assert(this->nan_count > 0); - if(nan_value == -1) { + if(this->nan_count > 0) { + output(i) = nan_value; + if(nan_value == -1) { + encountered_unknown = true; + } + } else { + output(i) = -1; encountered_unknown = true; } } else { @@ -830,15 +843,23 @@ class index_hash : public hash_base, T2, Hashmap2> { for (int64_t i = 0; i < size; i++) { const key_type &key = input(i); if (custom_isnan(key)) { - output(i) = nan_value; - assert(this->nan_count > 0); - if(nan_value == -1) { + if(this->nan_count) { + output(i) = nan_value; + if(nan_value == -1) { + encountered_unknown = true; + } + } else { + output(i) = -1; encountered_unknown = true; } } else if (input_mask(i) == 1) { - output(i) = null_value; - assert(this->null_count > 0); - if(null_value == -1) { + if(this->null_count) { + output(i) = null_value; + if(null_value == -1) { + encountered_unknown = true; + } + } else { + output(i) = -1; encountered_unknown = true; } } else { diff --git a/packages/vaex-core/src/hash_string.hpp b/packages/vaex-core/src/hash_string.hpp index 40b6b56ca..432bfe641 100644 --- a/packages/vaex-core/src/hash_string.hpp +++ b/packages/vaex-core/src/hash_string.hpp @@ -490,8 +490,11 @@ class ordered_set : public hash_base, T, T, V> { if (strings->has_null()) { for (int64_t i = offset; i < offset + length; i++) { if (strings->is_null(i)) { - output[i - offset] = this->null_value; - assert(this->null_count > 0); + if(this->null_count > 0) { + output[i - offset] = this->null_value; + } else { + output[i - offset] = -1; + } } else { const string_view &key = strings->view(i); size_t hash = hasher_map_choice()(key); @@ -555,19 +558,35 @@ class ordered_set : public hash_base, T, T, V> { return result; } auto output = result.template mutable_unchecked<1>(); - py::gil_scoped_release gil; size_t nmaps = this->maps.size(); auto offsets = this->offsets(); - if (nmaps == 1) { - auto &map0 = this->maps[0]; - // split slow and fast path - if (strings->has_null()) { - for (int64_t i = 0; i < size; i++) { - if (strings->is_null(i)) { - output(i) = this->null_value; - assert(this->null_count > 0); - } else { + { + py::gil_scoped_release gil; + if (nmaps == 1) { + auto &map0 = this->maps[0]; + // split slow and fast path + if (strings->has_null()) { + for (int64_t i = 0; i < size; i++) { + if (strings->is_null(i)) { + if(this->null_count > 0) { + output(i) = this->null_value; + } else { + output(i) = -1; + } + } else { + const string_view &key = strings->view(i); + auto search = map0.find(key); + auto end = map0.end(); + if (search == end) { + output(i) = -1; + } else { + output(i) = search->second; + } + } + } + } else { + for (int64_t i = 0; i < size; i++) { const string_view &key = strings->view(i); auto search = map0.find(key); auto end = map0.end(); @@ -579,27 +598,32 @@ class ordered_set : public hash_base, T, T, V> { } } } else { - for (int64_t i = 0; i < size; i++) { - const string_view &key = strings->view(i); - auto search = map0.find(key); - auto end = map0.end(); - if (search == end) { - output(i) = -1; - } else { - output(i) = search->second; + // split slow and fast path + if (strings->has_null()) { + for (int64_t i = 0; i < size; i++) { + if (strings->is_null(i)) { + if(this->null_count > 0) { + output(i) = this->null_value; + } else { + output(i) = -1; + } + } else { + const string_view &key = strings->view(i); + size_t hash = hasher_map_choice()(key); + size_t map_index = (hash % nmaps); + auto search = this->maps[map_index].find(key, hash); + auto end = this->maps[map_index].end(); + if (search == end) { + output(i) = -1; + } else { + output(i) = search->second + offsets[map_index]; + } + } } - } - } - } else { - // split slow and fast path - if (strings->has_null()) { - for (int64_t i = 0; i < size; i++) { - if (strings->is_null(i)) { - output(i) = this->null_value; - assert(this->null_count > 0); - } else { + } else { + for (int64_t i = 0; i < size; i++) { const string_view &key = strings->view(i); - size_t hash = hasher_map_choice()(key); + std::size_t hash = hasher_map_choice()(key); size_t map_index = (hash % nmaps); auto search = this->maps[map_index].find(key, hash); auto end = this->maps[map_index].end(); @@ -610,19 +634,6 @@ class ordered_set : public hash_base, T, T, V> { } } } - } else { - for (int64_t i = 0; i < size; i++) { - const string_view &key = strings->view(i); - std::size_t hash = hasher_map_choice()(key); - size_t map_index = (hash % nmaps); - auto search = this->maps[map_index].find(key, hash); - auto end = this->maps[map_index].end(); - if (search == end) { - output(i) = -1; - } else { - output(i) = search->second + offsets[map_index]; - } - } } } return result; @@ -751,9 +762,13 @@ class index_hash : public hash_base, T, T, V> { if (strings->has_null()) { for (int64_t i = 0; i < size; i++) { if (strings->is_null(i)) { - output(i) = null_value; - assert(this->null_count > 0); - if(null_value == -1) { + if(this->null_count > 0) { + output(i) = null_value; + if(null_value == -1) { + encountered_unknown = true; + } + } else { + output(i) = -1; encountered_unknown = true; } } else { From 5244572369e140305c5bcf5f63bed6d878bd5bb7 Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 7 Oct 2024 22:06:05 +0200 Subject: [PATCH 3/3] fix: do not release the gil before returning an object --- packages/vaex-core/src/hash_primitives.hpp | 84 ++++++++++--------- packages/vaex-core/src/hash_string.hpp | 96 ++++++++++++---------- 2 files changed, 96 insertions(+), 84 deletions(-) diff --git a/packages/vaex-core/src/hash_primitives.hpp b/packages/vaex-core/src/hash_primitives.hpp index 85a614a62..b9037ac84 100644 --- a/packages/vaex-core/src/hash_primitives.hpp +++ b/packages/vaex-core/src/hash_primitives.hpp @@ -385,25 +385,27 @@ class counter : public hash_base, U, Hashmap2>, public coun py::object counts() { py::array_t output_array(this->length()); auto output = output_array.template mutable_unchecked<1>(); - py::gil_scoped_release gil; - auto offsets = this->offsets(); size_t map_index = 0; int64_t natural_order = 0; // TODO: can be parallel due to non-overlapping maps - for (auto &map : this->maps) { - for (auto &el : map) { - // key_type key = el.first; - value_type value = el.second; - int64_t index = key_offset(natural_order++, map_index, el, offsets[map_index]); - output(index) = value; + { + py::gil_scoped_release gil; + auto offsets = this->offsets(); + for (auto &map : this->maps) { + for (auto &el : map) { + // key_type key = el.first; + value_type value = el.second; + int64_t index = key_offset(natural_order++, map_index, el, offsets[map_index]); + output(index) = value; + } + map_index += 1; + } + if (this->nan_count) { + output(this->nan_index()) = this->nan_count; + } + if (this->null_count) { + output(this->null_index()) = this->null_count; } - map_index += 1; - } - if (this->nan_count) { - output(this->nan_index()) = this->nan_count; - } - if (this->null_count) { - output(this->null_index()) = this->null_count; } return output_array; } @@ -540,20 +542,22 @@ class ordered_set : public hash_base, T2, Hashmap2> { auto input = values.template unchecked<1>(); auto output = result.template mutable_unchecked<1>(); size_t nmaps = this->maps.size(); - py::gil_scoped_release gil; - for (int64_t i = 0; i < size; i++) { - const key_type &value = input(i); - if (custom_isnan(value)) { - output(i) = this->nan_count > 0; - } else { - std::size_t hash = hasher_map_choice()(value); - size_t map_index = (hash % nmaps); - auto search = this->maps[map_index].find(value); - auto end = this->maps[map_index].end(); - if (search == end) { - output(i) = false; + { + py::gil_scoped_release gil; + for (int64_t i = 0; i < size; i++) { + const key_type &value = input(i); + if (custom_isnan(value)) { + output(i) = this->nan_count > 0; } else { - output(i) = true; + std::size_t hash = hasher_map_choice()(value); + size_t map_index = (hash % nmaps); + auto search = this->maps[map_index].find(value); + auto end = this->maps[map_index].end(); + if (search == end) { + output(i) = false; + } else { + output(i) = true; + } } } } @@ -646,7 +650,7 @@ class ordered_set : public hash_base, T2, Hashmap2> { const key_type &value = input[i]; // the caller is responsible for finding masked values if (custom_isnan(value)) { - if(this->null_count > 0) { + if(this->nan_count > 0) { output[i] = this->nan_value; } else { output[i] = -1; @@ -843,7 +847,7 @@ class index_hash : public hash_base, T2, Hashmap2> { for (int64_t i = 0; i < size; i++) { const key_type &key = input(i); if (custom_isnan(key)) { - if(this->nan_count) { + if(this->nan_count > 0) { output(i) = nan_value; if(nan_value == -1) { encountered_unknown = true; @@ -853,7 +857,7 @@ class index_hash : public hash_base, T2, Hashmap2> { encountered_unknown = true; } } else if (input_mask(i) == 1) { - if(this->null_count) { + if(this->null_count > 0) { output(i) = null_value; if(null_value == -1) { encountered_unknown = true; @@ -957,16 +961,18 @@ class index_hash : public hash_base, T2, Hashmap2> { py::array_t indices_array(size_output); auto output = result.template mutable_unchecked<1>(); auto output_indices = indices_array.template mutable_unchecked<1>(); - py::gil_scoped_release gil; - // int64_t offset = 0; - size_t index = 0; + { + py::gil_scoped_release gil; + // int64_t offset = 0; + size_t index = 0; - std::copy(indices.begin(), indices.end(), &output_indices(0)); + std::copy(indices.begin(), indices.end(), &output_indices(0)); - for (auto &el : found) { - std::vector &indices = el.second; - for (int64_t i : indices) { - output(index++) = i; + for (auto &el : found) { + std::vector &indices = el.second; + for (int64_t i : indices) { + output(index++) = i; + } } } return std::make_tuple(indices_array, result); diff --git a/packages/vaex-core/src/hash_string.hpp b/packages/vaex-core/src/hash_string.hpp index 432bfe641..d3b601997 100644 --- a/packages/vaex-core/src/hash_string.hpp +++ b/packages/vaex-core/src/hash_string.hpp @@ -280,25 +280,27 @@ class counter : public hash_base, T, A, V>, public counter_mixin output_array(this->length()); auto output = output_array.template mutable_unchecked<1>(); - py::gil_scoped_release gil; - auto offsets = this->offsets(); - size_t map_index = 0; - int64_t natural_order = 0; - // TODO: can be parallel due to non-overlapping maps - for (auto &map : this->maps) { - for (auto &el : map) { - key_type key = el.first; - value_type value = el.second; - output(key.index) = value; + { + py::gil_scoped_release gil; + auto offsets = this->offsets(); + size_t map_index = 0; + int64_t natural_order = 0; + // TODO: can be parallel due to non-overlapping maps + for (auto &map : this->maps) { + for (auto &el : map) { + key_type key = el.first; + value_type value = el.second; + output(key.index) = value; + } + // map_index += 1; + } + // no nans possible + // if (this->nan_count) { + // output(this->nan_index()) = this->nan_count; + // } + if (this->null_count) { + output(this->null_index()) = this->null_count; } - // map_index += 1; - } - // no nans possible - // if (this->nan_count) { - // output(this->nan_index()) = this->nan_count; - // } - if (this->null_count) { - output(this->null_index()) = this->null_count; } return output_array; } @@ -445,13 +447,28 @@ class ordered_set : public hash_base, T, T, V> { int64_t size = strings->length; py::array_t result(size); auto output = result.template mutable_unchecked<1>(); - py::gil_scoped_release gil; - size_t nmaps = this->maps.size(); - if (strings->has_null()) { - for (int64_t i = 0; i < size; i++) { - if (strings->is_null(i)) { - output(i) = this->null_count > 0; - } else { + { + py::gil_scoped_release gil; + size_t nmaps = this->maps.size(); + if (strings->has_null()) { + for (int64_t i = 0; i < size; i++) { + if (strings->is_null(i)) { + output(i) = this->null_count > 0; + } else { + const string_view &value = strings->view(i); + std::size_t hash = hasher_map_choice()(value); + size_t map_index = (hash % nmaps); + auto search = this->maps[map_index].find(value); + auto end = this->maps[map_index].end(); + if (search == end) { + output(i) = false; + } else { + output(i) = true; + } + } + } + } else { + for (int64_t i = 0; i < size; i++) { const string_view &value = strings->view(i); std::size_t hash = hasher_map_choice()(value); size_t map_index = (hash % nmaps); @@ -464,19 +481,6 @@ class ordered_set : public hash_base, T, T, V> { } } } - } else { - for (int64_t i = 0; i < size; i++) { - const string_view &value = strings->view(i); - std::size_t hash = hasher_map_choice()(value); - size_t map_index = (hash % nmaps); - auto search = this->maps[map_index].find(value); - auto end = this->maps[map_index].end(); - if (search == end) { - output(i) = false; - } else { - output(i) = true; - } - } } return result; } @@ -847,15 +851,17 @@ class index_hash : public hash_base, T, T, V> { py::array_t indices_array(size); auto output = result.template mutable_unchecked<1>(); auto output_indices = indices_array.template mutable_unchecked<1>(); - py::gil_scoped_release gil; - size_t index = 0; + { + py::gil_scoped_release gil; + size_t index = 0; - std::copy(indices.begin(), indices.end(), &output_indices(0)); + std::copy(indices.begin(), indices.end(), &output_indices(0)); - for (auto el : found) { - std::vector &indices = el.second; - for (int64_t i : indices) { - output(index++) = i; + for (auto el : found) { + std::vector &indices = el.second; + for (int64_t i : indices) { + output(index++) = i; + } } } return std::make_tuple(indices_array, result);