diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile
index ff31930c06cf9..26ee62ebff143 100644
--- a/ci/docker/conda-cpp.dockerfile
+++ b/ci/docker/conda-cpp.dockerfile
@@ -19,6 +19,16 @@ ARG repo
 ARG arch
 FROM ${repo}:${arch}-conda
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 # install the required conda packages into the test environment
 COPY ci/conda_env_cpp.txt \
      ci/conda_env_gandiva.txt \
diff --git a/ci/docker/conda-python-pandas.dockerfile b/ci/docker/conda-python-pandas.dockerfile
index 303cc80e48a07..7295205593832 100644
--- a/ci/docker/conda-python-pandas.dockerfile
+++ b/ci/docker/conda-python-pandas.dockerfile
@@ -20,6 +20,16 @@ ARG arch=amd64
 ARG python=3.6
 FROM ${repo}:${arch}-conda-python-${python}
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 ARG pandas=latest
 ARG numpy=latest
 COPY ci/scripts/install_pandas.sh /arrow/ci/scripts/
diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile
index ab3f77be1b678..b743188746c7f 100644
--- a/ci/docker/conda-python.dockerfile
+++ b/ci/docker/conda-python.dockerfile
@@ -19,6 +19,16 @@ ARG repo
 ARG arch
 FROM ${repo}:${arch}-conda-cpp
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 # install python specific packages
 ARG python=3.6
 COPY ci/conda_env_python.txt /arrow/ci/
diff --git a/ci/docker/linux-apt-jni.dockerfile b/ci/docker/linux-apt-jni.dockerfile
index 1abbf05af3bca..d120b479a43aa 100644
--- a/ci/docker/linux-apt-jni.dockerfile
+++ b/ci/docker/linux-apt-jni.dockerfile
@@ -63,6 +63,16 @@ RUN apt-get update -y -q && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 ARG cmake=3.11.4
 RUN wget -nv -O - https://github.com/Kitware/CMake/releases/download/v${cmake}/cmake-${cmake}-Linux-x86_64.tar.gz | tar -xzf - -C /opt
 ENV PATH=/opt/cmake-${cmake}-Linux-x86_64/bin:$PATH
diff --git a/ci/docker/linux-apt-lint.dockerfile b/ci/docker/linux-apt-lint.dockerfile
index 04646585322af..11bcc54fc919b 100644
--- a/ci/docker/linux-apt-lint.dockerfile
+++ b/ci/docker/linux-apt-lint.dockerfile
@@ -40,6 +40,16 @@ RUN apt-get update && \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 ARG r=4.1
 RUN apt-key adv \
     --keyserver keyserver.ubuntu.com \
diff --git a/ci/docker/linux-apt-ruby.dockerfile b/ci/docker/linux-apt-ruby.dockerfile
index 58fd65bd57a78..21eee12d38e0b 100644
--- a/ci/docker/linux-apt-ruby.dockerfile
+++ b/ci/docker/linux-apt-ruby.dockerfile
@@ -19,6 +19,16 @@
 ARG base
 FROM ${base}
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 COPY ruby/ /arrow/ruby/
 RUN bundle install --gemfile /arrow/ruby/Gemfile
 RUN \
diff --git a/ci/docker/ubuntu-18.04-cpp.dockerfile b/ci/docker/ubuntu-18.04-cpp.dockerfile
index 0c05ac4ee6b77..a4404fb01f751 100644
--- a/ci/docker/ubuntu-18.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-18.04-cpp.dockerfile
@@ -87,6 +87,16 @@ RUN apt-get update -y -q && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists*
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 # Prioritize system packages and local installation
 # The following dependencies will be downloaded due to missing/invalid packages
 # provided by the distribution:
diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile
index c2a468d9e3543..6f5db81ece3b9 100644
--- a/ci/docker/ubuntu-20.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-20.04-cpp.dockerfile
@@ -94,6 +94,16 @@ RUN apt-get update -y -q && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists*
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 COPY ci/scripts/install_minio.sh \
      /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
diff --git a/ci/docker/ubuntu-20.10-cpp.dockerfile b/ci/docker/ubuntu-20.10-cpp.dockerfile
index 6cefecfd67819..1fcf0cafd5cb6 100644
--- a/ci/docker/ubuntu-20.10-cpp.dockerfile
+++ b/ci/docker/ubuntu-20.10-cpp.dockerfile
@@ -96,6 +96,16 @@ RUN apt-get update -y -q && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists*
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 COPY ci/scripts/install_minio.sh \
      /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
diff --git a/ci/docker/ubuntu-21.04-cpp.dockerfile b/ci/docker/ubuntu-21.04-cpp.dockerfile
index 18c377811bcf5..6d4b8e22f33b6 100644
--- a/ci/docker/ubuntu-21.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-21.04-cpp.dockerfile
@@ -94,6 +94,16 @@ RUN apt-get update -y -q && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists*
 
+# jsoncons (header-only) is used by Gandiva's get_json_object function.
+ARG jsoncons=master
+RUN git clone --depth 1 --branch ${jsoncons} https://github.com/danielaparker/jsoncons.git && \
+    mkdir -p jsoncons/build && \
+    cd jsoncons/build && \
+    cmake .. && \
+    make && \
+    make install && \
+    cd ../.. && rm -rf jsoncons
+
 COPY ci/scripts/install_minio.sh \
      /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh
index 46845d0e62352..f8853915c7d21 100755
--- a/ci/scripts/cpp_build.sh
+++ b/ci/scripts/cpp_build.sh
@@ -49,6 +49,15 @@ fi
 mkdir -p ${build_dir}
 pushd ${build_dir}
 
+# Install jsoncons. The build runs in a subshell so the working directory is
+# still ${build_dir} when Arrow itself is configured below.
+[ -d jsoncons ] || git clone --depth 1 https://github.com/danielaparker/jsoncons.git
+(mkdir -p jsoncons/build && \
+  cd jsoncons/build && \
+  cmake .. && \
+  make && \
+  make install)
+
 cmake -G "${CMAKE_GENERATOR:-Ninja}" \
       -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \
       -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \
diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt
index 654a4a40be151..c9663858e5cef 100644
--- a/cpp/src/gandiva/CMakeLists.txt
+++ b/cpp/src/gandiva/CMakeLists.txt
@@ -137,6 +137,7 @@ add_arrow_lib(gandiva
               $<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
               ${GANDIVA_OPENSSL_INCLUDE_DIR}
               ${UTF8PROC_INCLUDE_DIR}
+              ${jsoncons_INCLUDE_DIRS}
               SHARED_LINK_FLAGS
               ${GANDIVA_SHARED_LINK_FLAGS}
               SHARED_LINK_LIBS
@@ -243,7 +244,8 @@ add_gandiva_test(internals-test
                  $<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
                  ${GANDIVA_INTERNALS_TEST_ARGUMENTS}
                  ${GANDIVA_OPENSSL_INCLUDE_DIR}
-                 ${UTF8PROC_INCLUDE_DIR})
+                 ${UTF8PROC_INCLUDE_DIR}
+                 ${jsoncons_INCLUDE_DIRS})
 
 if(ARROW_GANDIVA_JAVA)
   add_subdirectory(jni)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index 3ea426c85f489..49423309f06eb 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -406,6 +406,11 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
 
       NativeFunction("split_part", {}, DataTypeVector{utf8(), utf8(), int32()}, utf8(),
                      kResultNullIfNull, "split_part",
+                     NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
+      // get_json_object(jsonpath_query, json_document): the query comes first.
+      NativeFunction("get_json_object", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+                     kResultNullIfNull, "gdv_fn_get_json_object",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};
 
   return string_fn_registry_;
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
index 2cac036abd577..ae8dcd28a52f6 100644
--- a/cpp/src/gandiva/gdv_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -18,6 +18,8 @@
 #include "gandiva/gdv_function_stubs.h"
 
 #include <utf8proc.h>
+#include <jsoncons/json.hpp>
+#include <jsoncons_ext/jsonpath/jsonpath.hpp>
 
 #include <string>
 #include <vector>
@@ -792,6 +794,64 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+// Evaluates a JSONPath query (Stefan Goessner's syntax, see
+// http://goessner.net/articles/JsonPath/) against a JSON document.
+//
+// Returns the matches serialized as JSON text, copied into memory obtained
+// from gdv_fn_context_arena_malloc. Returns an empty string when either input
+// is null or empty. On a malformed document or query, or when the output
+// cannot be allocated, an error is set on the context and "" is returned.
+GANDIVA_EXPORT
+const char* gdv_fn_get_json_object(gdv_int64 context, const char* search_text,
+                                   gdv_int32 search_len, const char* json_text,
+                                   gdv_int32 json_len, gdv_int32* out_len) {
+  *out_len = 0;
+
+  // Check the inputs before building std::string from them: a null pointer or
+  // a negative length must never reach the std::string constructor.
+  if (json_text == nullptr || json_len <= 0) {
+    return "";
+  }
+  if (search_text == nullptr || search_len <= 0) {
+    return "";
+  }
+
+  // The stub is invoked through the engine's global function mapping, so no
+  // exception may escape it; catch (...) is deliberate.
+  jsoncons::json document;
+  try {
+    document = jsoncons::json::parse(std::string(json_text, json_len));
+  } catch (...) {
+    gdv_fn_context_set_error_msg(context, "Invalid json text");
+    return "";
+  }
+
+  std::string serialized;
+  try {
+    const std::string query(search_text, search_len);
+    serialized = jsoncons::jsonpath::json_query(document, query).to_string();
+  } catch (...) {
+    gdv_fn_context_set_error_msg(context, "Invalid jsonpath search query");
+    return "";
+  }
+
+  // size() is O(1), whereas strlen() would rescan the whole buffer. Results
+  // whose length does not fit in gdv_int32 are treated as allocation failures.
+  const size_t result_len = serialized.size();
+  char* ret = nullptr;
+  if (result_len <= static_cast<size_t>(INT32_MAX)) {
+    ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(
+        context, static_cast<gdv_int32>(result_len)));
+  }
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+                                 "Could not allocate memory for output string.");
+    return "";
+  }
+  memcpy(ret, serialized.data(), result_len);
+  *out_len = static_cast<gdv_int32>(result_len);
+  return ret;
+}
 }
 
 namespace gandiva {
@@ -1597,5 +1657,19 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
   engine->AddGlobalMappingForFunc("gdv_fn_initcap_utf8",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_initcap_utf8));
+
+  // gdv_fn_get_json_object
+  args = {
+      types->i64_type(),      // context
+      types->i8_ptr_type(),   // search_data
+      types->i32_type(),      // search_length
+      types->i8_ptr_type(),   // json_data
+      types->i32_type(),      // json_length
+      types->i32_ptr_type(),  // out_len
+  };
+
+  engine->AddGlobalMappingForFunc("gdv_fn_get_json_object",
+                                  types->i8_ptr_type() /*return_type*/, args,
+                                  reinterpret_cast<void*>(gdv_fn_get_json_object));
 }
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h
index 670ac94df1b89..321f0fd36891f 100644
--- a/cpp/src/gandiva/gdv_function_stubs.h
+++ b/cpp/src/gandiva/gdv_function_stubs.h
@@ -170,4 +170,9 @@ float gdv_fn_castFLOAT4_varbinary(gdv_int64 context, const char* in, int32_t in_
 
 GANDIVA_EXPORT
 double gdv_fn_castFLOAT8_varbinary(gdv_int64 context, const char* in, int32_t in_len);
+
+GANDIVA_EXPORT
+const char* gdv_fn_get_json_object(gdv_int64 context, const char* search_text,
+                                   int32_t search_len, const char* json_text,
+                                   int32_t json_len, int32_t* out_len);
 }
diff --git a/cpp/src/gandiva/gdv_function_stubs_test.cc
b/cpp/src/gandiva/gdv_function_stubs_test.cc
index f7c21981cbc4d..35c758126c5ea 100644
--- a/cpp/src/gandiva/gdv_function_stubs_test.cc
+++ b/cpp/src/gandiva/gdv_function_stubs_test.cc
@@ -766,4 +766,56 @@ TEST(TestGdvFnStubs, TestCastVarbinaryFloat8) {
   ctx.Reset();
 }
+
+TEST(TestGdvFnStubs, TestGetJsonObject) {
+  gandiva::ExecutionContext ctx;
+  gdv_int64 ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  const std::string json =
+      "[\n"
+      "  {\n"
+      "    \"id\": 1,\n"
+      "    \"name\": \"John Doe\",\n"
+      "    \"favorite_fruits\": [\"mango\", \"banana\"]\n"
+      "  },\n"
+      "  {\n"
+      "    \"id\": 2,\n"
+      "    \"name\": \"Mary Doe\",\n"
+      "    \"favorite_fruits\": [\"grapefruit\", \"pineapple\"]\n"
+      "  }\n"
+      "]";
+
+  // Runs the JSONPath `path` against `doc` and returns the produced string.
+  auto query = [&](const std::string& path, const std::string& doc) {
+    const char* out = gdv_fn_get_json_object(
+        ctx_ptr, path.data(), static_cast<gdv_int32>(path.size()), doc.data(),
+        static_cast<gdv_int32>(doc.size()), &out_len);
+    return std::string(out, out_len);
+  };
+
+  // Successful lookups: the matches come back wrapped in a JSON array.
+  EXPECT_EQ(query("$.*.id", json), "[1,2]");
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(query("$.1.name", json), "[\"Mary Doe\"]");
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(query("$.1.favorite_fruits[0]", json), "[\"grapefruit\"]");
+  EXPECT_FALSE(ctx.has_error());
+
+  // An empty document or an empty query yields an empty string, not an error.
+  EXPECT_EQ(query("$.1.favorite_fruits[0]", ""), "");
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(query("", json), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  // A malformed query is reported through the context.
+  EXPECT_EQ(query("$.", json), "");
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Invalid jsonpath search query"));
+  ctx.Reset();
+
+  // So is a malformed document.
+  EXPECT_EQ(query("$.*.id", "{\"id\": "), "");
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Invalid json text"));
+  ctx.Reset();
+}
 
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 6221dffb30224..5aec3e1360f8b 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -1754,5 +1754,4 @@ TEST(TestStringOps, TestConvertToBigEndian) {
   }
 #endif
 }
-
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc
index 12020777309a4..6857a644943b7 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -1606,4 +1606,81 @@ TEST_F(TestProjector, TestCastNullableIntYearInterval) {
   EXPECT_ARROW_ARRAY_EQUALS(out_int64, outputs.at(1));
 }
 
+TEST_F(TestProjector, TestGetJsonObject) {
+  // schema for input fields: f0 holds the JSONPath query, f1 the JSON document
+  auto field0 = field("f0", arrow::utf8());
+  auto field1 = field("f1", arrow::utf8());
+  auto schema = arrow::schema({field0, field1});
+
+  // output fields
+  auto field_get_json_object = field("get_json_object", arrow::utf8());
+
+  auto get_json_object_expr = TreeExprBuilder::MakeExpression(
+      "get_json_object", {field0, field1}, field_get_json_object);
+
+  std::shared_ptr<Projector> projector;
+  auto status =
+      Projector::Make(schema, {get_json_object_expr}, TestConfiguration(), &projector);
+  // A failed Make leaves nothing to evaluate, so stop the test here.
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 3;
+
+  const char* json_str1 =
+      "{\n"
+      "  \"firstName\": \"John\",\n"
+      "  \"lastName\" : \"doe\",\n"
+      "  \"age\" : 26\n"
+      "}";
+
+  const char* json_str2 =
+      "{\n"
+      "  \"phoneNumbers\": [\n"
+      "    {\n"
+      "      \"type\" : \"iPhone\",\n"
+      "      \"number\": \"0123-4567-8888\"\n"
+      "    },\n"
+      "    {\n"
+      "      \"type\" : \"home\",\n"
+      "      \"number\": \"0123-4567-8910\"\n"
+      "    }\n"
+      "  ]"
+      "}";
+
+  const char* json_str3 =
+      "[\n"
+      "  {\n"
+      "    \"id\": 1,\n"
+      "    \"name\": \"John Doe\",\n"
+      "    \"favorite_fruits\": [\"mango\", \"banana\"]\n"
+      "  },\n"
+      "  {\n"
+      "    \"id\": 2,\n"
+      "    \"name\": \"Mary Doe\",\n"
+      "    \"favorite_fruits\": [\"grapefruit\", \"pineapple\"]\n"
+      "  }\n"
+      "]";
+
+  auto array0 = MakeArrowArrayUtf8(
+      {"$.firstName", "$.phoneNumbers[1].type", "$.0.favorite_fruits[0]"},
+      {true, true, true});
+  auto array1 = MakeArrowArrayUtf8({json_str1, json_str2, json_str3}, {true, true, true});
+
+  // expected output
+  auto exp_get_json_object = MakeArrowArrayUtf8(
+      {"[\"John\"]", "[\"home\"]", "[\"mango\"]"}, {true, true, true});
+
+  // prepare input record batch
+  auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in, pool_, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp_get_json_object, outputs.at(0));
+}
+
 }  // namespace gandiva
diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json
index 723f3a46e7819..677e5d2a0b39f 100644
--- a/cpp/vcpkg.json
+++ b/cpp/vcpkg.json
@@ -27,6 +27,7 @@
     "glog",
     "grpc",
     "gtest",
+    "jsoncons",
     "lz4",
     "openssl",
     "orc",