Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect internal handling of dimension size type #140

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 154 additions & 48 deletions mdio/acceptance_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <sys/wait.h>
#include <unistd.h>

#include <cstdlib>
#include <filesystem>
Expand All @@ -24,6 +26,58 @@

namespace {

constexpr char PYTHON_EXECUTABLE[] = "python3"; // Don't specify absolute path
constexpr char PROJECT_BASE_PATH_ENV[] = "PROJECT_BASE_PATH";
constexpr char DEFAULT_BASE_PATH[] = "../..";
constexpr char ZARR_SCRIPT_RELATIVE_PATH[] =
"/mdio/regression_tests/zarr_compatibility.py";
constexpr char XARRAY_SCRIPT_RELATIVE_PATH[] =
"/mdio/regression_tests/xarray_compatibility_test.py";
constexpr char FILE_PATH_BASE[] = "./zarrs/acceptance/";
constexpr int ERROR_CODE = EXIT_FAILURE;
constexpr int SUCCESS_CODE = EXIT_SUCCESS;

/**
* @brief Executes the Python regression tests
*
* @param scriptPath Which script to run, should be either
* ZARR_SCRIPT_RELATIVE_PATH or XARRAY_SCRIPT_RELATIVE_PATH
* @param arg The argument to pass to the script
* @return int The status code of the script execution
*/
int executePythonScript(const std::string& scriptPath,
const std::vector<std::string>& args) {
// Ensure scriptPath and args are sanitized
if (scriptPath.empty() || args.empty()) {
std::cerr << "Error: scriptPath or args are empty." << std::endl;
return ERROR_CODE;
}

// Ensure the scriptPath is an absolute path or default base path
if (scriptPath[0] != '/' && scriptPath.find(DEFAULT_BASE_PATH) != 0) {
std::cerr
<< "Error: PROJECT_BASE_PATH must be an absolute path or not be set."
<< std::endl;
return ERROR_CODE;
}

// Prepare arguments for execvp
std::vector<const char*> execArgs = {PYTHON_EXECUTABLE, scriptPath.c_str()};
for (const auto& arg : args) {
execArgs.push_back(arg.c_str());
}
execArgs.push_back(nullptr);

// Execute the Python script
if (execvp(execArgs[0], const_cast<char* const*>(execArgs.data())) == -1) {
perror("execvp failed");
return ERROR_CODE;
}

return SUCCESS_CODE; // This line will never be reached if execvp is
// successful
}

namespace VariableTesting {

using float16_t = mdio::dtypes::float_16_t;
Expand Down Expand Up @@ -567,41 +621,63 @@ TEST(Variable, sliceByDimIdx) {
}

TEST(Variable, zarrCompatibility) {
const char* basePath = std::getenv("PROJECT_BASE_PATH");
const char* basePath = std::getenv(PROJECT_BASE_PATH_ENV);
if (!basePath) {
std::cout << "PROJECT_BASE_PATH environment variable not set. Expecting to "
"be in the 'build/mdio' directory."
<< std::endl;
basePath = "../..";
basePath = DEFAULT_BASE_PATH;
}

std::string srcPath =
std::string(basePath) + "/mdio/regression_tests/zarr_compatibility.py";
std::string filePathBase = "./zarrs/acceptance/";
std::string command = "python3 " + srcPath + " " + filePathBase;

std::string i2 = command + "i2";
std::string i4 = command + "i4";
std::string i8 = command + "i8";
std::string f2 = command + "f2";
std::string f4 = command + "f4";
std::string f8 = command + "f8";
std::string voided = command + "voided";

EXPECT_TRUE(std::system(i2.c_str()) == 0)
<< "Failed to read i2\n\tMore detailed error expected above...";
EXPECT_TRUE(std::system(i4.c_str()) == 0)
<< "Failed to read i4\n\tMore detailed error expected above...";
EXPECT_TRUE(std::system(i8.c_str()) == 0)
<< "Failed to read i8\n\tMore detailed error expected above...";
EXPECT_TRUE(std::system(f2.c_str()) == 0)
<< "Failed to read f2\n\tMore detailed error expected above...";
EXPECT_TRUE(std::system(f4.c_str()) == 0)
<< "Failed to read f4\n\tMore detailed error expected above...";
EXPECT_TRUE(std::system(f8.c_str()) == 0)
<< "Failed to read f8\n\tMore detailed error expected above...";
EXPECT_TRUE(std::system(voided.c_str()) == 0)
<< "Failed to read voided\n\tMore detailed error expected above...";
// Resolve the absolute path for the script
std::string srcPath = std::string(basePath) + ZARR_SCRIPT_RELATIVE_PATH;

// Ensure that srcPath is valid and points to an existing file
if (access(srcPath.c_str(), F_OK) == -1) {
std::cerr << "Error: Python script not found at " << srcPath << std::endl;
FAIL() << "Script not found: " << srcPath;
}

std::vector<std::string> args = {"i2", "i4", "i8", "f2",
"f4", "f8", "voided"};
std::vector<pid_t> pids;

for (const auto& arg : args) {
pid_t pid = fork();
if (pid == 0) {
// Child process
int result = executePythonScript(srcPath, {FILE_PATH_BASE + arg});
if (result == 0xfd00) { // 0xfd from Python is 0xfd00 in C++
GTEST_SKIP() << "Zarr compatibility skipped due to import error for "
"required library";
exit(SUCCESS_CODE);
}
exit(result);
} else if (pid > 0) {
// Parent process
pids.push_back(pid);
} else {
// Fork failed
perror("fork failed");
FAIL() << "fork failed";
}
}

// Wait for all child processes
for (pid_t pid : pids) {
int status;
if (waitpid(pid, &status, 0) == -1) {
perror("waitpid failed");
FAIL() << "waitpid failed";
}
if (WIFEXITED(status) && WEXITSTATUS(status) == 0xfd00) {
GTEST_SKIP() << "Zarr compatibility skipped due to import error for "
"required library";
}
EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
<< "Failed to read one of the arguments\n\tMore detailed error "
"expected above...";
}
}

TEST(Variable, dimensionUnits) {
Expand Down Expand Up @@ -1650,32 +1726,62 @@ TEST(Dataset, isel) {
}

TEST(Dataset, xarrayCompatible) {
const char* basePath = std::getenv("PROJECT_BASE_PATH");
const char* basePath = std::getenv(PROJECT_BASE_PATH_ENV);
if (!basePath) {
std::cout << "PROJECT_BASE_PATH environment variable not set. Expecting to "
"be in the 'build/mdio' directory."
<< std::endl;
basePath = "../..";
basePath = DEFAULT_BASE_PATH;
}

// Resolve the absolute path for the script
std::string srcPath = std::string(basePath) + XARRAY_SCRIPT_RELATIVE_PATH;

// Ensure that srcPath is valid and points to an existing file
if (access(srcPath.c_str(), F_OK) == -1) {
std::cerr << "Error: Python script not found at " << srcPath << std::endl;
FAIL() << "Script not found: " << srcPath;
}

std::string srcPath = std::string(basePath) +
"/mdio/regression_tests/xarray_compatibility_test.py";
std::string datasetPath = "./zarrs/acceptance";
std::string command = "python3 " + srcPath + " " + datasetPath + " False";
int status = system(command.c_str());
if (status == 0xfd00) { // 0xfd from Python is 0xfd00 in C++
GTEST_SKIP()
<< "Xarray compatibility skipped due to import error for xarray";
std::vector<std::string> metadataOptions = {"False", "True"};
std::vector<pid_t> pids;

for (const auto& option : metadataOptions) {
pid_t pid = fork();
if (pid == 0) {
// Child process
int result = executePythonScript(srcPath, {FILE_PATH_BASE, option});
if (result == 0xfd00) { // 0xfd from Python is 0xfd00 in C++
GTEST_SKIP()
<< "Xarray compatibility skipped due to import error for xarray";
exit(SUCCESS_CODE);
}
exit(result);
} else if (pid > 0) {
// Parent process
pids.push_back(pid);
} else {
// Fork failed
perror("fork failed");
FAIL() << "fork failed";
}
}

// Wait for all child processes
for (pid_t pid : pids) {
int status;
if (waitpid(pid, &status, 0) == -1) {
perror("waitpid failed");
FAIL() << "waitpid failed";
}
if (WIFEXITED(status) && WEXITSTATUS(status) == 0xfd00) {
GTEST_SKIP()
<< "Xarray compatibility skipped due to import error for xarray";
}
ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
<< "xarray compatibility test failed with one of the metadata "
"options\n\tThere was some expected output above...";
}
ASSERT_TRUE(status == 0)
<< "xarray compatibility test failed without consolidated "
"metadata\n\tThere was some expected output above...";

command = "python3 " + srcPath + " " + datasetPath + " True";
status = system(command.c_str());
ASSERT_TRUE(status == 0)
<< "xarray compatibility test failed with consolidated metadata\n\tThere "
"was some expected output above...";
}

TEST(Dataset, listVars) {
Expand Down
17 changes: 12 additions & 5 deletions mdio/dataset_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <vector>

#include "mdio/dataset_validator.h"
#include "mdio/impl.h"
// #include "tensorstore/tensorstore.h"

#include "absl/strings/escaping.h"
Expand Down Expand Up @@ -183,7 +184,7 @@ absl::Status transform_compressor(nlohmann::json& input /*NOLINT*/,
*/
void transform_shape(
nlohmann::json& input /*NOLINT*/, nlohmann::json& variable /*NOLINT*/,
std::unordered_map<std::string, int>& dimensionMap /*NOLINT*/) {
std::unordered_map<std::string, uint64_t>& dimensionMap /*NOLINT*/) {
if (input["dimensions"][0].is_object()) {
nlohmann::json shape = nlohmann::json::array();
for (auto& dimension : input["dimensions"]) {
Expand Down Expand Up @@ -262,7 +263,7 @@ absl::Status transform_metadata(const std::string& path,
*/
tensorstore::Result<nlohmann::json> from_json_to_spec(
nlohmann::json& json /*NOLINT*/,
std::unordered_map<std::string, int>& dimensionMap /*NOLINT*/,
std::unordered_map<std::string, uint64_t>& dimensionMap /*NOLINT*/,
const std::string& path) {
nlohmann::json variableStub = R"(
{
Expand Down Expand Up @@ -403,12 +404,18 @@ tensorstore::Result<nlohmann::json> from_json_to_spec(
* @return A map of dimension names to sizes or error if the dimensions are not
* consistently sized
*/
tensorstore::Result<std::unordered_map<std::string, int>> get_dimensions(
tensorstore::Result<std::unordered_map<std::string, uint64_t>> get_dimensions(
nlohmann::json& spec /*NOLINT*/) {
std::unordered_map<std::string, int> dimensions;
std::unordered_map<std::string, uint64_t> dimensions;
for (auto& variable : spec["variables"]) {
if (variable["dimensions"][0].is_object()) {
for (auto& dimension : variable["dimensions"]) {
if (dimension["size"].get<uint64_t>() > mdio::constants::kMaxSize) {
return absl::InvalidArgumentError(
"Dimension " + dimension["name"].dump() +
" exceeds maximum size of " +
std::to_string(mdio::constants::kMaxSize));
}
if (dimensions.count(dimension["name"]) == 0) {
dimensions[dimension["name"]] = dimension["size"];
} else {
Expand Down Expand Up @@ -447,7 +454,7 @@ Construct(nlohmann::json& spec /*NOLINT*/, const std::string& path) {
return dimensions.status();
}

std::unordered_map<std::string, int> dimensionMap = dimensions.value();
std::unordered_map<std::string, uint64_t> dimensionMap = dimensions.value();

std::vector<nlohmann::json> datasetSpec;
for (auto& variable : spec["variables"]) {
Expand Down
25 changes: 25 additions & 0 deletions mdio/dataset_factory_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,31 @@ TEST(Variable, simple) {
}
}

TEST(Variable, maxSizeExceeded) {
nlohmann::json j = nlohmann::json::parse(manifest);
// Set all Variables to exceed the maximum size
j["variables"][0]["dimensions"][0]["size"] = 0x7fffffffffffffff;
j["variables"][0]["dimensions"][1]["size"] = 0x7fffffffffffffff;
j["variables"][1]["dimensions"][0]["size"] = 0x7fffffffffffffff;
j["variables"][2]["dimensions"][0]["size"] = 0x7fffffffffffffff;

auto res = Construct(j, "zarrs/simple_dataset");
ASSERT_FALSE(res.status().ok())
<< "Construction succeeded despite exceeding maximum size";
}

TEST(Variable, maxSizeReached) {
nlohmann::json j = nlohmann::json::parse(manifest);
// Set all Variables to reach the maximum size
j["variables"][0]["dimensions"][0]["size"] = mdio::constants::kMaxSize;
j["variables"][0]["dimensions"][1]["size"] = mdio::constants::kMaxSize;
j["variables"][1]["dimensions"][0]["size"] = mdio::constants::kMaxSize;
j["variables"][2]["dimensions"][0]["size"] = mdio::constants::kMaxSize;

auto res = Construct(j, "zarrs/simple_dataset");
ASSERT_TRUE(res.status().ok()) << res.status();
}

TEST(Xarray, open) {
nlohmann::json j = nlohmann::json::parse(manifest);
auto res = Construct(j, "zarrs/simple_dataset");
Expand Down
3 changes: 3 additions & 0 deletions mdio/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ constexpr auto kCreateClean =
/// Create a new file or error if it already exists.
constexpr auto kCreate = tensorstore::OpenMode::create;

// Tensorstore appears to be imposing a max size of 0x3fffffffffffffff
constexpr uint64_t kMaxSize = 4611686018427387903;

// Supported dtypes
constexpr auto kBool = tensorstore::dtype_v<mdio::dtypes::bool_t>;
constexpr auto kInt8 = tensorstore::dtype_v<mdio::dtypes::int8_t>;
Expand Down
Loading