Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADIOS2 User Side KVCache to accelerate the remote query access. #4210

Merged
merged 28 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ce2d56b
KVCache V1.0
Change72 Jan 3, 2024
1e6e129
adjust some details for KVCache 1.0
Change72 Jun 24, 2024
d7f5e27
formatting and remove REQUIRED in AUTO setting
Change72 Jun 24, 2024
9812eef
update build-adios2-kvcache.sh for formatting and replace git clone w…
Change72 Jun 24, 2024
6d1bf17
fix link issue and remove rpath setting in CMakeLists.txt
Change72 Jun 24, 2024
e07c30b
solve unused warnings
Change72 Jun 25, 2024
398a0fb
formatting
Change72 Jun 25, 2024
2afdca6
move DoRemote and useKVCache declaration
Change72 Jun 25, 2024
5790fcd
remove all template in cache part & remove using of KWSYS Base64
Change72 Jun 26, 2024
22267e9
store all partial cache requests and execuate together while waiting …
Change72 Jun 26, 2024
8079eac
use Redis Pipeline to accelerate the get and set operations
Change72 Jun 26, 2024
ced5680
rename all functions follow Upper Camel Case
Change72 Jun 26, 2024
5f5c249
open redis connection during the m_Remote initialization
Change72 Jun 26, 2024
a4f572a
fix free invalid pointer
Change72 Jun 26, 2024
9414a33
separate kvcache into PerformRemoteGetsWithKVCache
Change72 Jun 27, 2024
8e3de8c
fix building issues without kvcache
Change72 Jun 27, 2024
b336025
convert Dims in QueryBox.h to DimsArray
Change72 Jul 1, 2024
fbf1cf2
fix copy issues in DimArrays
Change72 Jul 1, 2024
7c93c8a
Merge branch 'master' into main
Change72 Jul 1, 2024
4218e96
update QueryBox to be compatible with PR 4221
Change72 Jul 3, 2024
738e47e
revert changes in source/adios2/helper/adiosType.h
Change72 Jul 4, 2024
52da2e2
Merge branch 'ornladios:master' into main
Change72 Jul 4, 2024
27e0aab
Merge branch 'ornladios:master' into main
Change72 Jul 4, 2024
49d4000
Merge branch 'ornladios:master' into main
Change72 Jul 5, 2024
ac40ef6
update format
Change72 Jul 5, 2024
1b67484
add static_cast for typeSize from size_t to int
Change72 Jul 5, 2024
4168d6c
Merge branch 'master' into main
Change72 Jul 8, 2024
4352dcd
Merge branch 'ornladios:master' into main
Change72 Jul 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,11 @@ CMakeSettings.json
# Python wheels stuff

*.egg-info/

# CMake generated files
build/
.idea/
.vscode/

# redis dump files
*.rdb
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ adios_option(AWSSDK "Enable support for S3 compatible storage using AWS SDK'
adios_option(Derived_Variable "Enable support for derived variables" OFF)
adios_option(PIP "Enable support for pip packaging" OFF)
adios_option(XRootD "Enable support for XRootD" AUTO)
adios_option(KVCACHE "Enable support for KVCache" AUTO)

option(ADIOS2_LIBADIOS_MODE "Install only C/C++ library components" OFF)
mark_as_advanced(ADIOS2_LIBADIOS_MODE)
Expand Down Expand Up @@ -265,7 +266,7 @@ set(ADIOS2_CONFIG_OPTS
DataMan DataSpaces HDF5 HDF5_VOL MHS SST Fortran MPI Python PIP Blosc2 BZip2
LIBPRESSIO MGARD MGARD_MDR PNG SZ ZFP DAOS IME O_DIRECT Sodium Catalyst SysVShMem UCX
ZeroMQ Profiling Endian_Reverse Derived_Variable AWSSDK XRootD GPU_Support CUDA Kokkos
Kokkos_CUDA Kokkos_HIP Kokkos_SYCL Campaign
Kokkos_CUDA Kokkos_HIP Kokkos_SYCL Campaign KVCACHE
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need to add as at adios2_option in this file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

)

GenerateADIOSHeaderConfig(${ADIOS2_CONFIG_OPTS})
Expand Down
15 changes: 15 additions & 0 deletions cmake/DetectOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,21 @@ elseif(ADIOS2_USE_Campaign)
endif()
endif()

# KVCache
if(ADIOS2_USE_Cache STREQUAL AUTO)
find_package(hiredis)
if (hiredis_FOUND)
message(STATUS "hiredis found. Turn on KVCache")
set(ADIOS2_HAVE_KVCACHE TRUE)
endif()
elseif(ADIOS2_USE_Cache)
find_package(hiredis REQUIRED)
if (hiredis_FOUND)
message(STATUS "hiredis found. Turn on KVCache")
set(ADIOS2_HAVE_KVCACHE TRUE)
endif()
endif()

# Multithreading
find_package(Threads REQUIRED)

Expand Down
113 changes: 113 additions & 0 deletions scripts/build_scripts/build-adios2-kvcache.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/bin/bash
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A future idea is that this could be a docker-compose file


# This script is to build the remote server with cache enabled.
# Attention: hiredis cannot be installed by apt-get, since the default version is too old (libhiredis0.14 (= 0.14.1-2)).
# We need to build it from source code. You can also use the following scripts to install hiredis (v1.2.0).

# sample usage: in project home directory:
# source scripts/build_scripts/build-adios2-kvcache.sh --build
# source scripts/build_scripts/build-adios2-kvcache.sh --start
# source scripts/build_scripts/build-adios2-kvcache.sh --stop

if [ -z "${BUILD_DIR}" ]
then
BUILD_DIR="${PWD}"/build
fi

if [ ! -d "${BUILD_DIR}" ]
then
mkdir -p "${BUILD_DIR}"
fi

SW_DIR="${BUILD_DIR}"/sw
if [ ! -d "${SW_DIR}" ]
then
mkdir -p "${SW_DIR}"
fi

build_cache() {
# redis - in-memory data structure store
redis_dir="${SW_DIR}"/redis-7.2.3
if [ ! -d "${redis_dir}" ]
then
cd "${SW_DIR}" || exit
curl -L -o redis-7.2.3.tar.gz https://github.com/redis/redis/archive/refs/tags/7.2.3.tar.gz
tar -xzf redis-7.2.3.tar.gz
rm redis-7.2.3.tar.gz
cd "${redis_dir}" || exit
# cannot accleerate by 'make -j8'. It will cause error.
make

# hiredis - C client library to connect Redis server
cd "${redis_dir}"/deps/hiredis || exit
mkdir build && cd build || exit
cmake .. -DCMAKE_INSTALL_PREFIX="${SW_DIR}"/hiredis
make -j32
make install
fi

cd "${BUILD_DIR}" || exit
cmake .. -DADIOS2_USE_Cache=ON \
-DADIOS2_USE_Python=ON \
-DCMAKE_PREFIX_PATH="${SW_DIR}/hiredis" \
-DCMAKE_INSTALL_PREFIX="${SW_DIR}"/adios2

make -j32
make install
cd "${BUILD_DIR}"/../ || exit
echo "Build completed."
unset BUILD_DIR

export DoRemote=1
export useKVCache=1
}

start_services() {
eisenhauer marked this conversation as resolved.
Show resolved Hide resolved
echo "Starting redis server and setting environment variables..."
export PYTHONPATH=${SW_DIR}/adios2/local/lib/python3.10/dist-packages/
nohup "${SW_DIR}"/redis-7.2.3/src/redis-server > "${SW_DIR}"/redis_server.log 2>&1 &
nohup "${SW_DIR}"/adios2/bin/adios2_remote_server -v > "${SW_DIR}"/remote_server.log 2>&1 &
sleep 5
nohup "${SW_DIR}"/redis-7.2.3/src/redis-cli monitor > "${SW_DIR}"/redis_monitor.log 2>&1 &
echo "Services started and environment variables set."
}

# Function to stop services (optional, example purpose)
stop_services() {
echo "Stopping services..."
pkill -f redis-server
pkill -f redis-cli
pkill -f remote_server
unset DoRemote
unset useKVCache
unset PYTHONPATH
echo "Services stopped."
}

# Parse command line options
while [[ "$1" != "" ]]; do
case "$1" in
-b | --build )
build_cache
;;
-c | --start )
start_services
;;
-s | --stop )
stop_services
;;
-h | --help )
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " -b, --build Build the software with cache enabled"
echo " -c, --start Start redis server and set environment variables"
echo " -s, --stop Stop the services"
echo " -h, --help Display this help message"
;;
* )
echo "Invalid option: $1"
echo "Use -h or --help for usage information."
;;
esac
shift
done
8 changes: 7 additions & 1 deletion source/adios2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,13 @@ if (ADIOS2_HAVE_SST)
target_link_libraries(adios2_core PRIVATE adios2::thirdparty::EVPath)
add_subdirectory(toolkit/remote)
endif()


if (ADIOS2_HAVE_KVCACHE)
target_sources(adios2_core PRIVATE toolkit/kvcache/KVCacheCommon.h
toolkit/kvcache/KVCacheCommon.cpp)
target_link_libraries(adios2_core PRIVATE hiredis::hiredis)
endif ()

if(ADIOS2_HAVE_Campaign)
target_sources(adios2_core PRIVATE
engine/campaign/CampaignReader.cpp
Expand Down
127 changes: 125 additions & 2 deletions source/adios2/engine/bp5/BP5Reader.cpp
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My lurking worry here is that we have expanded a simple loop with a 2 line body to 100+ lines. Maybe we can break out the KVCache case to a different subroutine to keep the non-cache case simple to understand? Also you might use CoreDims/DimsArray rather than Dims. It's probably not as critical here, but Dims uses std::vector, which means lots of tiny mallocs whenever these are created, destroyed, passed by value, etc. This caused serious performance problems in metadata manipulation with BP4 at scale. As a result, BP5 tries to avoid Dims.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. I want to add a subroutine in:

    if (m_Remote)
    {
    PerformRemoteGets();
    }
    else
    {
    PerformLocalGets();
    }

However, what I can do is like:

#ifdef ADIOS2_HAVE_KVCACHE
if (getenv("useKVCache"))
{
PerformLocalGetsWithKVCache();
}
else
{
PerformRemoteGets();
}
#else
PerformRemoteGets();
#endif

do you think this is fine?

  1. It's hard to initialize DimsArray when I declare ReqInfo struct, since I don't know the dimension of each Req, and DimsArray has no default construction method.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about?

#ifdef ADIOS2_HAVE_KVCACHE
if (getenv("useKVCache"))
{
PerformLocalGetsWithKVCache();
}
else
#endif
PerformRemoteGets();

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer this version...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I could get rid of getenv("useKVCache") ->

if (m_Remote)
{
  #ifdef ADIOS2_HAVE_KVCACHE
    PerformRemoteGetsWithKVCache();
  #else
    PerformRemoteGets();
  #endif
}
else
{
  PerformLocalGets();
}

If the user builds with KVCache and wants to use remote Gets (DoRemote=1), KVCache will be used automatically. However, if the user prefers to use only the remote service without KVCache, they will need to rebuild (possibly in a separate folder).

We could also consider checking whether the cache connection is active, and if it is not, default to using the pure remote service.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2. It's hard to initialize DimsArray when I declare ReqInfo struct, since I don't know the dimension of each Req, and DimsArray has no default construction method.

So, let me tell you my worry. Your code in your cache stuff is in a lot of ways similar to the code in NdCopy (which looks at whether or not two blocks intersect, the extent of the overlap, etc.). We used to have some major performance problems in this when metadata was large (many thousand blocks per variable) because every time it did something like try to determine an intersection it ended up creating a destroying a dozen or more Dims vectors (doing things like converting a taking a StartEnd Box and creating a StartCount box from it, having all those temporary values get free'd when the intersection test returned, only to recreate them later, etc.). We traced the issue back to lots and lots of small malloc/free calls when those Dims were created, initialized and destroyed. It took a ton of effort to rewrite the worst parts of that code to use ArrayDims (stack allocated dimension array, so cheap to create/destroy, own their own data) or CoreDims (the equivalent of a "span", don't own their data but let us pass it in places and operate on it like a vector without a copy). With a lot of careful work kill all the object creation/destruction overhead in NdCopy so its only overhead was necessary control and data movement. (And if you go look at its source in helper/adiosMemory.cpp you'll see extensive use of ArrayDims and CoreDims.).

The storm of vector construction and destruction in ADIOS metadata handling wasn't noticeable at small scales and it was easy to ignore the implicit overhead of casual use of Dims, but as we ran at higher scales it became seriously problematic. You're probably not seeing an issue at the scales you're testing caching at either, but we hope to use this in large-metadata situations, so it'll become an issue if it's not done well. There's just too much similarity in your processing and the NdCopy case for me not to be concerned. We obviously didn't eliminate Dims from all the ADIOS code, but we killed it where it was used most intensively. You might do the same.

(And of course you do know the dimensionality of the variable in the place where you declare ReqInfo variable, it's in the Req variable that you're iterating over in that loop. Or it would be easy to add a default constructor and initialize the dimcount later. My concern is about places where there might be intensive vector usage.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test script we talked about just now is: https://github.com/Change72/adios-test/blob/master/TestCoreDims.cpp

// will make a value copy of dimsPoint2
DimsArray dimsPointer3(reinterpret_cast<CoreDims&>(dimsPointer2));

// will only copy the pointer
DimsArray dimsPointer3(dimsPointer2);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, the following code will also give wrong results, but vector works fine.

helper::NdCopy(reinterpret_cast<char *>(ReqInfo.Data), reinterpret_cast<helper::CoreDims&>(ReqInfo.ReqBox.Start), reinterpret_cast<helper::CoreDims&>(ReqInfo.ReqBox.Count), true, false, reinterpret_cast<char *>(Req.Data), Req.Start, Req.Count, true, false, ReqInfo.TypeSize);

Vector code:

        std::vector<size_t> start;
        std::vector<size_t> count;
        ReqInfo.ReqBox.StartToVector(start);
        ReqInfo.ReqBox.CountToVector(count);
        helper::NdCopy(reinterpret_cast<char *>(ReqInfo.Data), start, count, true, false, reinterpret_cast<char *>(Req.Data), Req.Start, Req.Count, true, false, ReqInfo.TypeSize);

Original file line number Diff line number Diff line change
Expand Up @@ -349,16 +349,139 @@ void BP5Reader::PerformRemoteGets()
// TP startGenerate = NOW();
auto GetRequests = m_BP5Deserializer->PendingGetRequests;
std::vector<Remote::GetHandle> handles;
for (auto &Req : GetRequests)

#ifdef ADIOS2_HAVE_KVCACHE // open kv cache connection
struct RequestInfo
{
size_t ReqSeq;
DataType varType;
size_t ReqCount;
std::string CacheKey;
size_t TypeSize;
Dims Count;
Dims Start;
void *Data;
};
std::vector<RequestInfo> getRequestsInfo;

if (getenv("useKVCache"))
{
m_KVCacheCommon.openConnection();
}
#endif

for (size_t req_seq = 0; req_seq < GetRequests.size(); req_seq++)
{
auto &Req = GetRequests[req_seq];
#ifdef ADIOS2_HAVE_KVCACHE // get data from cache
if (getenv("useKVCache"))
{
const DataType varType = m_IO.InquireVariableType(Req.VarName);
QueryBox targetBox(Req.Start, Req.Count);
size_t numOfElements = targetBox.size();
std::string keyPrefix =
m_KVCacheCommon.keyPrefix(Req.VarName, Req.RelStep, Req.BlockID);
std::string targetKey = m_KVCacheCommon.keyComposition(keyPrefix, Req.Start, Req.Count);
size_t varSize = helper::GetDataTypeSize(varType);

// Exact Match: check if targetKey exists
if (m_KVCacheCommon.exists(targetKey))
{
m_KVCacheCommon.get(targetKey.c_str(), numOfElements * varSize, Req.Data);
}
else
{
int max_depth = 999;
std::set<std::string> samePrefixKeys;
m_KVCacheCommon.keyPrefixExistence(keyPrefix, samePrefixKeys);
std::vector<QueryBox> regularBoxes;
std::vector<std::string> cachedKeys;

if (getenv("maxDepth"))
{
max_depth = std::stoi(getenv("maxDepth"));
}

if (samePrefixKeys.size() > 0)
{
targetBox.getMaxInteractBox(samePrefixKeys, max_depth, 0, regularBoxes,
cachedKeys);
}
else
{
regularBoxes.push_back(targetBox);
}

std::cout << "Going to retrieve " << regularBoxes.size()
<< " boxes from remote server, and " << cachedKeys.size()
<< " boxes from cache" << std::endl;

// Get data from remote server
for (auto &box : regularBoxes)
{
RequestInfo ReqInfo;
ReqInfo.ReqSeq = req_seq;
ReqInfo.varType = varType;
ReqInfo.ReqCount = box.size();
ReqInfo.CacheKey =
m_KVCacheCommon.keyComposition(keyPrefix, box.start, box.count);
ReqInfo.TypeSize = varSize;
ReqInfo.Count = box.count;
ReqInfo.Start = box.start;
ReqInfo.Data = malloc(box.size() * varSize);
auto handle = m_Remote->Get(Req.VarName, Req.RelStep, Req.BlockID, box.count,
box.start, ReqInfo.Data);
handles.push_back(handle);
getRequestsInfo.push_back(ReqInfo);
}

// Get data from cache
for (auto &boxKey : cachedKeys)
{
QueryBox box(boxKey);
void *data = malloc(box.size() * varSize);
m_KVCacheCommon.get(boxKey.c_str(), box.size() * varSize, data);
helper::NdCopy(reinterpret_cast<char *>(data), box.start, box.count, true,
false, reinterpret_cast<char *>(Req.Data), Req.Start, Req.Count,
true, false, varSize);
free(data);
}
}

continue;
}
#endif

auto handle =
m_Remote->Get(Req.VarName, Req.RelStep, Req.BlockID, Req.Count, Req.Start, Req.Data);
handles.push_back(handle);
}
for (auto &handle : handles)

for (size_t handle_seq = 0; handle_seq < handles.size(); handle_seq++)
{
auto handle = handles[handle_seq];
m_Remote->WaitForGet(handle);
#ifdef ADIOS2_HAVE_KVCACHE // close cache connection
if (getenv("useKVCache"))
{
auto &ReqInfo = getRequestsInfo[handle_seq];
auto &Req = GetRequests[ReqInfo.ReqSeq];
helper::NdCopy(reinterpret_cast<char *>(ReqInfo.Data), ReqInfo.Start, ReqInfo.Count,
true, false, reinterpret_cast<char *>(Req.Data), Req.Start, Req.Count,
true, false, ReqInfo.TypeSize);
m_KVCacheCommon.set(ReqInfo.CacheKey.c_str(), ReqInfo.ReqCount * ReqInfo.TypeSize,
ReqInfo.Data);
free(ReqInfo.Data);
}
#endif
}

#ifdef ADIOS2_HAVE_KVCACHE // close cache connection
if (getenv("useKVCache"))
{
m_KVCacheCommon.closeConnection();
}
#endif
}

void BP5Reader::PerformLocalGets()
Expand Down
8 changes: 7 additions & 1 deletion source/adios2/engine/bp5/BP5Reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
#include "adios2/toolkit/remote/Remote.h"
#include "adios2/toolkit/transportman/TransportMan.h"

#ifdef ADIOS2_HAVE_KVCACHE
#include "adios2/toolkit/kvcache/KVCacheCommon.h"
#endif

#include <chrono>
#include <map>
#include <vector>
Expand Down Expand Up @@ -98,7 +102,9 @@ class BP5Reader : public BP5Engine, public Engine
std::unique_ptr<Remote> m_Remote;
bool m_WriterIsActive = true;
adios2::profiling::JSONProfiler m_JSONProfiler;

#ifdef ADIOS2_HAVE_KVCACHE
KVCacheCommon m_KVCacheCommon;
#endif
/** used for per-step reads, TODO: to be moved to BP5Deserializer */
size_t m_CurrentStep = 0;
size_t m_StepsCount = 0;
Expand Down
Loading
Loading