Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DS-172] Implement GET_JSON_OBJECT Hive function #9

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions ci/docker/conda-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ ARG repo
ARG arch
FROM ${repo}:${arch}-conda

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

# install the required conda packages into the test environment
COPY ci/conda_env_cpp.txt \
ci/conda_env_gandiva.txt \
Expand Down
9 changes: 9 additions & 0 deletions ci/docker/conda-python-pandas.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ ARG arch=amd64
ARG python=3.6
FROM ${repo}:${arch}-conda-python-${python}

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

ARG pandas=latest
ARG numpy=latest
COPY ci/scripts/install_pandas.sh /arrow/ci/scripts/
Expand Down
9 changes: 9 additions & 0 deletions ci/docker/conda-python.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ ARG repo
ARG arch
FROM ${repo}:${arch}-conda-cpp

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

# install python specific packages
ARG python=3.6
COPY ci/conda_env_python.txt /arrow/ci/
Expand Down
9 changes: 9 additions & 0 deletions ci/docker/linux-apt-jni.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ RUN apt-get update -y -q && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
# NOTE(review): this step invokes cmake before the pinned CMake is installed
# further down in this file -- confirm a system cmake is available at this point.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

ARG cmake=3.11.4
RUN wget -nv -O - https://github.com/Kitware/CMake/releases/download/v${cmake}/cmake-${cmake}-Linux-x86_64.tar.gz | tar -xzf - -C /opt
ENV PATH=/opt/cmake-${cmake}-Linux-x86_64/bin:$PATH
Expand Down
10 changes: 10 additions & 0 deletions ci/docker/linux-apt-lint.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,16 @@ RUN apt-get update && \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Build and install jsoncons from source; it is used by Gandiva's
# get_json_object function.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

ARG r=4.1
RUN apt-key adv \
--keyserver keyserver.ubuntu.com \
Expand Down
9 changes: 9 additions & 0 deletions ci/docker/linux-apt-ruby.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@
ARG base
FROM ${base}

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

COPY ruby/ /arrow/ruby/
RUN bundle install --gemfile /arrow/ruby/Gemfile
RUN \
Expand Down
9 changes: 9 additions & 0 deletions ci/docker/ubuntu-18.04-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ RUN apt-get update -y -q && \
apt-get clean && \
rm -rf /var/lib/apt/lists*

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

# Prioritize system packages and local installation
# The following dependencies will be downloaded due to missing/invalid packages
# provided by the distribution:
Expand Down
9 changes: 9 additions & 0 deletions ci/docker/ubuntu-20.04-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ RUN apt-get update -y -q && \
apt-get clean && \
rm -rf /var/lib/apt/lists*

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

COPY ci/scripts/install_minio.sh \
/arrow/ci/scripts/
RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
Expand Down
9 changes: 9 additions & 0 deletions ci/docker/ubuntu-20.10-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,15 @@ RUN apt-get update -y -q && \
apt-get clean && \
rm -rf /var/lib/apt/lists*

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

COPY ci/scripts/install_minio.sh \
/arrow/ci/scripts/
RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
Expand Down
9 changes: 9 additions & 0 deletions ci/docker/ubuntu-21.04-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ RUN apt-get update -y -q && \
apt-get clean && \
rm -rf /var/lib/apt/lists*

# Build and install jsoncons from source; Gandiva's get_json_object function
# includes its headers.
# NOTE(review): the `jsoncons` build arg is not referenced by the RUN step below,
# and the clone is not pinned to a tag or commit -- confirm this is intended.
ARG jsoncons
RUN git clone https://github.com/danielaparker/jsoncons.git && \
cd jsoncons && \
mkdir build && \
cd build && \
cmake .. && \
make && \
make install

COPY ci/scripts/install_minio.sh \
/arrow/ci/scripts/
RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
Expand Down
9 changes: 9 additions & 0 deletions ci/scripts/cpp_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ fi
mkdir -p ${build_dir}
pushd ${build_dir}

# Install jsoncons, which Gandiva's get_json_object function depends on.
#
# The steps run inside a subshell so that the `cd` calls do not leak: the
# Arrow cmake invocation that follows must still execute from ${build_dir},
# not from jsoncons/build.
(
  git clone https://github.com/danielaparker/jsoncons.git && \
  cd jsoncons && \
  mkdir build && \
  cd build && \
  cmake .. && \
  make && \
  make install
)

cmake -G "${CMAKE_GENERATOR:-Ninja}" \
-DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \
-DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/gandiva/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ add_arrow_lib(gandiva
$<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
${GANDIVA_OPENSSL_INCLUDE_DIR}
${UTF8PROC_INCLUDE_DIR}
${jsoncons_INCLUDE_DIRS}
SHARED_LINK_FLAGS
${GANDIVA_SHARED_LINK_FLAGS}
SHARED_LINK_LIBS
Expand Down Expand Up @@ -243,7 +244,8 @@ add_gandiva_test(internals-test
$<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
${GANDIVA_INTERNALS_TEST_ARGUMENTS}
${GANDIVA_OPENSSL_INCLUDE_DIR}
${UTF8PROC_INCLUDE_DIR})
${UTF8PROC_INCLUDE_DIR}
${jsoncons_INCLUDE_DIRS})

if(ARROW_GANDIVA_JAVA)
add_subdirectory(jni)
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {

NativeFunction("split_part", {}, DataTypeVector{utf8(), utf8(), int32()}, utf8(),
kResultNullIfNull, "split_part",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("get_json_object", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "gdv_fn_get_json_object",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};

return string_fn_registry_;
Expand Down
74 changes: 74 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "gandiva/gdv_function_stubs.h"

#include <utf8proc.h>
#include <jsoncons/json.hpp>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure whether it is okay to include .hpp headers directly in Gandiva files.
Normally, Gandiva creates a wrapper .h file to avoid including .hpp headers directly (this is related to compatibility problems when using LLVM).

You may take a look at the decimal_xlarge.cc file which is a wrapper to use the boost .hpp file to handle some decimal operations

#include <jsoncons_ext/jsonpath/jsonpath.hpp>

#include <string>
#include <vector>
Expand Down Expand Up @@ -792,6 +794,64 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
*out_len = out_idx;
return out;
}
// Evaluates a JSONPath expression against a JSON document and returns the
// query result serialized as JSON text.
//
// The path syntax follows Stefan Goessner's JSONPath proposal:
// http://goessner.net/articles/JsonPath/
//
// Parameters:
//   context      execution context handle, used for error reporting and for
//                allocating the returned buffer.
//   search_text  JSONPath expression, given as pointer + length (search_len).
//   json_text    JSON document, given as pointer + length (json_len).
//   out_len      receives the length in bytes of the returned buffer.
//
// Returns a buffer obtained from gdv_fn_context_arena_malloc that holds the
// serialized query result. Returns "" with *out_len == 0 when:
//   - either input is null or empty (no error is reported), or
//   - the JSON text cannot be parsed, the query is invalid, or the output
//     buffer cannot be allocated (an error message is set on the context).
GANDIVA_EXPORT
const char* gdv_fn_get_json_object(gdv_int64 context, const char* search_text,
                                   gdv_int32 search_len, const char* json_text,
                                   gdv_int32 json_len, gdv_int32* out_len) {
  // Default to an empty result so that every early return below is consistent.
  *out_len = 0;

  // Validate the inputs before building std::string objects from them: a null
  // pointer or a non-positive length must never reach the string constructor.

  // No JSON document: return an empty result.
  if (json_text == nullptr || json_len <= 0) {
    return "";
  }

  // No search expression: return an empty result.
  if (search_text == nullptr || search_len <= 0) {
    return "";
  }

  const std::string search_string(search_text, search_len);
  const std::string json_string(json_text, json_len);

  jsoncons::json json;
  try {
    json = jsoncons::json::parse(json_string);
  } catch (...) {
    gdv_fn_context_set_error_msg(context, "Invalid json text");
    return "";
  }

  jsoncons::json result;
  try {
    result = jsoncons::jsonpath::json_query(json, search_string);
  } catch (...) {
    gdv_fn_context_set_error_msg(context, "Invalid jsonpath search query");
    return "";
  }

  // Keep the serialized text in a named local so that its buffer stays alive
  // until it has been copied into the arena.
  const std::string json_result = result.to_string();

  // Use the string's own size rather than strlen() so the length is exact and
  // no extra scan of the buffer is needed.
  const gdv_int32 result_len = static_cast<gdv_int32>(json_result.size());

  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, result_len));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string.");
    return "";
  }

  memcpy(ret, json_result.data(), result_len);
  // Publish the length only once the output buffer is fully populated.
  *out_len = result_len;
  return ret;
}
}

namespace gandiva {
Expand Down Expand Up @@ -1597,5 +1657,19 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("gdv_fn_initcap_utf8",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_initcap_utf8));

// gdv_fn_get_json_object
args = {
types->i64_type(), // context
types->i8_ptr_type(), // search_data
types->i32_type(), // search_length
types->i8_ptr_type(), // json_data
types->i32_type(), // json_length
types->i32_ptr_type(), // out_len
};

engine->AddGlobalMappingForFunc("gdv_fn_get_json_object",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_get_json_object));
}
} // namespace gandiva
5 changes: 5 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,9 @@ float gdv_fn_castFLOAT4_varbinary(gdv_int64 context, const char* in, int32_t in_

GANDIVA_EXPORT
double gdv_fn_castFLOAT8_varbinary(gdv_int64 context, const char* in, int32_t in_len);

// Runs the JSONPath expression `search_text` (search_len bytes) against the
// JSON document `json_text` (json_len bytes) and returns the serialized
// result; the result length is written to `out_len`.
// Note the argument order: the search expression comes before the JSON text.
GANDIVA_EXPORT
const char* gdv_fn_get_json_object(int64_t context, const char* search_text,
int32_t search_len, const char* json_text,
int32_t json_len, int32_t* out_len);
}
72 changes: 72 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -766,4 +766,76 @@ TEST(TestGdvFnStubs, TestCastVarbinaryFloat8) {
ctx.Reset();
}


// Exercises gdv_fn_get_json_object with valid queries, empty inputs, and an
// invalid JSONPath expression.
TEST(TestGdvFnStubs, TestGetJsonObject) {
  gandiva::ExecutionContext ctx;
  // The stub takes the context as a signed 64-bit handle.
  const gdv_int64 ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);

  const char* json_str =
      "[\n"
      " {\n"
      " \"id\": 1,\n"
      " \"name\": \"John Doe\",\n"
      " \"favorite_fruits\": [\"mango\", \"banana\"]\n"
      " },\n"
      " {\n"
      " \"id\": 2,\n"
      " \"name\": \"Mary Doe\",\n"
      " \"favorite_fruits\": [\"grapefruit\", \"pineapple\"]\n"
      " }\n"
      "]";

  // Runs `path` against `json` and returns the output as a std::string.
  // Lengths are converted explicitly to the 32-bit type the stub expects.
  auto query = [ctx_ptr](const char* path, const char* json) {
    gdv_int32 out_len = 0;
    const char* out_str = gdv_fn_get_json_object(
        ctx_ptr, path, static_cast<gdv_int32>(strlen(path)), json,
        static_cast<gdv_int32>(strlen(json)), &out_len);
    return std::string(out_str, out_len);
  };

  // Wildcard over the top-level array.
  EXPECT_EQ(query("$.*.id", json_str), "[1,2]");
  EXPECT_FALSE(ctx.has_error());

  // Field of a specific array element.
  EXPECT_EQ(query("$.1.name", json_str), "[\"Mary Doe\"]");
  EXPECT_FALSE(ctx.has_error());

  // Element of a nested array.
  EXPECT_EQ(query("$.1.favorite_fruits[0]", json_str), "[\"grapefruit\"]");
  EXPECT_FALSE(ctx.has_error());

  // Empty JSON text yields an empty result without raising an error.
  EXPECT_EQ(query("$.1.favorite_fruits[0]", ""), "");
  EXPECT_FALSE(ctx.has_error());

  // Empty search expression yields an empty result without raising an error.
  EXPECT_EQ(query("", json_str), "");
  EXPECT_FALSE(ctx.has_error());

  // A malformed JSONPath expression reports an error on the context.
  EXPECT_EQ(query("$.", json_str), "");
  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Invalid jsonpath search query"));
  ctx.Reset();
}
} // namespace gandiva
1 change: 0 additions & 1 deletion cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1754,5 +1754,4 @@ TEST(TestStringOps, TestConvertToBigEndian) {
}
#endif
}

} // namespace gandiva
Loading