-
Notifications
You must be signed in to change notification settings - Fork 420
[Store]feat: Add 3fs native api plugin for KVCache storage persistence #610
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
190cf83
1d4c992
20faa75
f110847
63c16f2
73c4060
09b7be7
049aa8d
9eb931a
4c78b01
04023b8
9603f60
10aa916
9cd7365
6221283
3c98e3f
014495b
3846033
15435a8
11803ee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,162 @@ | ||
| #pragma once | ||
|
|
||
#include <sys/file.h>
#include <sys/uio.h>

#include <atomic>
#include <cerrno>
#include <cstdio>
#include <string>
#include <thread>
#include <unordered_map>

#include "types.h"
|
|
||
| namespace mooncake { | ||
/**
 * @class FileLockRAII
 * @brief Scoped advisory lock on a file descriptor, implemented with flock(2).
 *
 * The constructor requests a shared (READ) or exclusive (WRITE) lock without
 * LOCK_NB, so the call waits until flock() returns. Acquisition can still
 * fail (for example on an invalid descriptor); callers should check
 * is_locked() before relying on the lock. When the lock was acquired, the
 * destructor releases it with LOCK_UN. The descriptor itself is never closed
 * by this class.
 */
class FileLockRAII {
public:
    enum class LockType { READ, WRITE };

    /**
     * @brief Tries to lock @p fd in the requested mode.
     * @param fd   File descriptor to lock (not owned).
     * @param type READ maps to LOCK_SH, WRITE maps to LOCK_EX.
     */
    FileLockRAII(int fd, LockType type) : fd_(fd), locked_(false) {
        const int operation = (type == LockType::READ) ? LOCK_SH : LOCK_EX;
        int rc = 0;
        // A signal handler may interrupt the wait; that is not a real
        // failure, so retry instead of silently returning an unlocked guard.
        do {
            rc = flock(fd_, operation);
        } while (rc != 0 && errno == EINTR);
        locked_ = (rc == 0);
    }

    /// Releases the lock if (and only if) this object currently holds it.
    ~FileLockRAII() {
        if (locked_) {
            flock(fd_, LOCK_UN);
        }
    }

    // A lock has exactly one holder object: copying is forbidden.
    FileLockRAII(const FileLockRAII&) = delete;
    FileLockRAII& operator=(const FileLockRAII&) = delete;

    /// Transfers responsibility for unlocking; @p other will no longer unlock.
    FileLockRAII(FileLockRAII&& other) noexcept
        : fd_(other.fd_), locked_(other.locked_) {
        other.locked_ = false;
    }

    /// @return true when this object holds the lock.
    bool is_locked() const { return locked_; }

private:
    int fd_;       // Descriptor the lock applies to (borrowed, never closed here)
    bool locked_;  // Whether the destructor must issue LOCK_UN
};
|
|
||
| /** | ||
| * @class LocalFile | ||
| * @brief RAII wrapper for file operations with thread-safe locking support | ||
| * | ||
| * Provides thread-safe file I/O operations including read/write and vectorized I/O. | ||
| * Implements proper resource management through RAII pattern. | ||
| */ | ||
| class StorageFile { | ||
| public: | ||
|
|
||
| StorageFile(const std::string &filename, int fd) | ||
| : filename_(filename), fd_(fd), error_code_(ErrorCode::OK), is_locked_(false) {} | ||
| /** | ||
| * @brief Destructor | ||
| * @note Automatically closes the file and releases resources | ||
| */ | ||
| virtual ~StorageFile() = default; | ||
|
|
||
| /** | ||
| * @brief Writes data from buffer to file | ||
| * @param buffer Input buffer containing data to write | ||
| * @param length Number of bytes to write | ||
| * @return tl::expected<size_t, ErrorCode> containing number of bytes written on success, or ErrorCode on failure | ||
| * @note Thread-safe operation with write locking | ||
| */ | ||
| virtual tl::expected<size_t, ErrorCode> write(const std::string &buffer, size_t length) = 0; | ||
|
|
||
| /** | ||
| * @brief Writes data from buffer to file | ||
| * @param data Input span containing data to write | ||
| * @param length Number of bytes to write | ||
| * @return tl::expected<size_t, ErrorCode> containing number of bytes written on success, or ErrorCode on failure | ||
| * @note Thread-safe operation with write locking | ||
| */ | ||
| virtual tl::expected<size_t, ErrorCode> write(std::span<const char> data, size_t length) = 0; | ||
|
|
||
| /** | ||
| * @brief Reads data from file into buffer | ||
| * @param buffer Output buffer for read data | ||
| * @param length Maximum number of bytes to read | ||
| * @return tl::expected<size_t, ErrorCode> containing number of bytes read on success, or ErrorCode on failure | ||
| * @note Thread-safe operation with read locking | ||
| */ | ||
| virtual tl::expected<size_t, ErrorCode> read(std::string &buffer, size_t length) = 0; | ||
|
|
||
| /** | ||
| * @brief Scattered write at specified file offset | ||
| * @param iov Array of I/O vectors | ||
| * @param iovcnt Number of elements in iov array | ||
| * @param offset File offset to write at | ||
| * @return tl::expected<size_t, ErrorCode> containing total bytes written on success, or ErrorCode on failure | ||
| * @note Thread-safe operation with write locking | ||
| */ | ||
| virtual tl::expected<size_t, ErrorCode> vector_write(const iovec *iov, int iovcnt, off_t offset) = 0; | ||
|
|
||
| /** | ||
| * @brief Scattered read from specified file offset | ||
| * @param iov Array of I/O vectors | ||
| * @param iovcnt Number of elements in iov array | ||
| * @param offset File offset to read from | ||
| * @return tl::expected<size_t, ErrorCode> containing total bytes read on success, or ErrorCode on failure | ||
| * @note Thread-safe operation with read locking | ||
| */ | ||
| virtual tl::expected<size_t, ErrorCode> vector_read(const iovec *iov, int iovcnt, off_t offset) = 0; | ||
|
|
||
| template<typename T> | ||
| tl::expected<T, ErrorCode> make_error(ErrorCode code) { | ||
| error_code_ = code; | ||
| return tl::make_unexpected(code); | ||
| } | ||
|
|
||
| /** | ||
| * @brief file locking mechanism | ||
| */ | ||
| FileLockRAII acquire_write_lock() { | ||
| return FileLockRAII(fd_, FileLockRAII::LockType::WRITE); | ||
| } | ||
|
|
||
| FileLockRAII acquire_read_lock() { | ||
| return FileLockRAII(fd_, FileLockRAII::LockType::READ); | ||
| } | ||
|
|
||
| /** | ||
| * @brief Gets the current error code | ||
| * @return Current error code | ||
| */ | ||
| ErrorCode get_error_code(){ | ||
| return error_code_; | ||
| } | ||
|
|
||
| protected: | ||
| std::string filename_; | ||
| int fd_; | ||
| ErrorCode error_code_{ErrorCode::OK}; | ||
| std::atomic<bool> is_locked_{false}; | ||
| }; | ||
|
|
||
| class PosixFile : public StorageFile { | ||
| public: | ||
| PosixFile(const std::string &filename, int fd); | ||
| ~PosixFile() override; | ||
|
|
||
| tl::expected<size_t, ErrorCode> write(const std::string &buffer, size_t length) override; | ||
| tl::expected<size_t, ErrorCode> write(std::span<const char> data, size_t length) override; | ||
| tl::expected<size_t, ErrorCode> read(std::string &buffer, size_t length) override; | ||
| tl::expected<size_t, ErrorCode> vector_write(const iovec *iov, int iovcnt, off_t offset) override; | ||
| tl::expected<size_t, ErrorCode> vector_read(const iovec *iov, int iovcnt, off_t offset) override; | ||
| }; | ||
|
|
||
| } // namespace mooncake | ||
|
|
||
| #ifdef USE_3FS | ||
| #include <hf3fs/hf3fs.h> | ||
| #endif | ||
|
|
||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| #pragma once | ||
|
|
||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <mutex> | ||
| #include <thread> | ||
| #include <hf3fs_usrbio.h> | ||
| #include "types.h" | ||
|
|
||
| namespace mooncake { | ||
|
|
||
| class StorageFile; | ||
|
|
||
/**
 * @struct Hf3fsConfig
 * @brief Parameters used when creating the 3FS USRBIO resources.
 */
struct Hf3fsConfig {
    // 3FS cluster related parameters

    // USRBIO related parameters
    std::string mount_root = "/";  // Mount point root directory
    size_t iov_size = 32 << 20;    // Shared memory size (32MB)

    // Maximum number of requests in IO ring.
    // Design note from the review discussion: a batch larger than this value
    // does not overflow the ring. Each thread has its own USRBIO resources
    // (iov, ior, etc.), and the current implementation submits only one I/O
    // request to the ior at a time, waiting for completion before submitting
    // the next. Splitting 32MB into 4*8MB I/O requests showed no significant
    // performance gain in local tests, so that approach was not adopted.
    size_t ior_entries = 16;

    // IO batch processing depth.
    // If greater than 0, then only when `io_depth` I/O requests are in queue,
    // they will be issued to server as a batch.
    // If smaller than 0, then USRBIO will wait for at most `-io_depth` I/O
    // requests to be in queue and issue them in one batch.
    // If io_depth is 0 (no control over I/O depth), then USRBIO will issue
    // all the prepared I/O requests to server ASAP.
    // Signed on purpose: the negative range documented above cannot be
    // represented by an unsigned type.
    int io_depth = 0;

    int ior_timeout = 0;  // IO timeout (milliseconds)
};
|
|
||
| class USRBIOResourceManager { | ||
| public: | ||
|
|
||
| USRBIOResourceManager() {} | ||
|
|
||
| void setDefaultParams(const Hf3fsConfig& config) { | ||
| default_config_ = config; | ||
| } | ||
|
|
||
| struct ThreadUSRBIOResource* getThreadResource( | ||
| const Hf3fsConfig &config); | ||
|
|
||
| struct ThreadUSRBIOResource* getThreadResource() { | ||
| return getThreadResource(default_config_); | ||
| } | ||
|
|
||
| ~USRBIOResourceManager(); | ||
|
|
||
|
|
||
| private: | ||
| USRBIOResourceManager(const USRBIOResourceManager &) = delete; | ||
| USRBIOResourceManager &operator=(const USRBIOResourceManager &) = delete; | ||
| Hf3fsConfig default_config_; | ||
|
|
||
| // Thread resources map protection lock | ||
| std::mutex resource_map_mutex; | ||
|
|
||
| // ThreadID to resource mapping | ||
| std::unordered_map<std::thread::id, struct ThreadUSRBIOResource *> | ||
| thread_resources; | ||
| }; | ||
|
|
||
| // Thread level USRBIO resource structure | ||
| struct ThreadUSRBIOResource { | ||
| // USRBIO resources | ||
| struct hf3fs_iov iov_; | ||
| struct hf3fs_ior ior_read_; | ||
| struct hf3fs_ior ior_write_; | ||
|
|
||
| // Resource initialization status | ||
| bool initialized; | ||
|
|
||
| // Resource belongs to parameters | ||
| Hf3fsConfig config_; | ||
|
|
||
| ThreadUSRBIOResource() : initialized(false) {} | ||
|
|
||
| // Initialize resource | ||
| bool Initialize(const Hf3fsConfig &config); | ||
|
|
||
| // Cleanup resource | ||
| void Cleanup(); | ||
|
|
||
| ~ThreadUSRBIOResource() { Cleanup(); } | ||
| }; | ||
|
|
||
| class ThreeFSFile : public StorageFile { | ||
| public: | ||
| ThreeFSFile(const std::string &filename, int fd, USRBIOResourceManager* resource_manager); | ||
| ~ThreeFSFile() override; | ||
|
|
||
| tl::expected<size_t, ErrorCode> write(const std::string &buffer, size_t length) override; | ||
| tl::expected<size_t, ErrorCode> write(std::span<const char> data, size_t length) override; | ||
| tl::expected<size_t, ErrorCode> read(std::string &buffer, size_t length) override; | ||
| tl::expected<size_t, ErrorCode> vector_write(const iovec *iov, int iovcnt, off_t offset) override; | ||
| tl::expected<size_t, ErrorCode> vector_read(const iovec *iov, int iovcnt, off_t offset) override; | ||
|
|
||
| private: | ||
| USRBIOResourceManager* resource_manager_; | ||
| }; | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same question here: if the value size is bigger than 32MB, what will happen?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The current implementation handles values exceeding
`iov_size` by splitting the operation into multiple read-and-copy iterations within a loop (e.g., for 64MB data, it performs two passes to read into the `iov` and copy to slices).