From b340bbfc1d0fd52d94799fa8d3b016178b46d6ba Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Wed, 16 Sep 2020 15:07:13 +0200 Subject: [PATCH 01/50] std: add io_uring library This brings io_uring helper methods to Zig for kernels >= 5.4. We follow liburing's design decisions so that anyone who is comfortable with liburing (https://unixism.net/loti/ref-liburing/index.html) will feel at home. Thanks to @daurnimator for the first draft. Refs: https://github.com/ziglang/zig/pull/3083 Signed-off-by: Joran Dirk Greef --- lib/std/io_uring.zig | 826 +++++++++++++++++++++++++++++++++++++++++++ lib/std/std.zig | 1 + 2 files changed, 827 insertions(+) create mode 100644 lib/std/io_uring.zig diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig new file mode 100644 index 000000000000..f4881ea9079f --- /dev/null +++ b/lib/std/io_uring.zig @@ -0,0 +1,826 @@ +const builtin = @import("builtin"); +const std = @import("std"); +const assert = std.debug.assert; +const os = std.os; +const linux = os.linux; +const mem = std.mem; +const net = std.net; +const testing = std.testing; + +pub const io_uring_params = linux.io_uring_params; +pub const io_uring_cqe = linux.io_uring_cqe; + +// TODO Update linux.zig's definition of linux.io_uring_sqe: +// linux.io_uring_sqe uses numbered unions, i.e. `union1` etc. that are not future-proof and need to +// be re-numbered whenever new unions are interposed by the kernel. Furthermore, Zig's unions do not +// support assignment by any union member directly as in C, without going through the union, so the +// kernel adding new unions would also break existing Zig code. +// We therefore use a flat struct without unions to avoid these two issues. +// Pending https://github.com/ziglang/zig/issues/6349. +pub const io_uring_sqe = extern struct { + opcode: linux.IORING_OP, + flags: u8 = 0, + ioprio: u16 = 0, + fd: i32 = 0, + off: u64 = 0, + addr: u64 = 0, + len: u32 = 0, + opflags: u32 = 0, + user_data: u64 = 0, + buffer: u16 = 0, + personality: u16 = 0, + splice_fd_in: i32 = 0, + options: [2]u64 = [2]u64{ 0, 0 } +}; + +// TODO Add to zig/std/os/bits/linux.zig: +const IORING_SQ_CQ_OVERFLOW = 1 << 1; + +comptime { + assert(@sizeOf(io_uring_params) == 120); + assert(@sizeOf(io_uring_sqe) == 64); + assert(@sizeOf(io_uring_cqe) == 16); + + assert(linux.IORING_OFF_SQ_RING == 0); + assert(linux.IORING_OFF_CQ_RING == 0x8000000); + assert(linux.IORING_OFF_SQES == 0x10000000); +} + +pub const IO_Uring = struct { + fd: i32 = -1, + sq: SubmissionQueue, + cq: CompletionQueue, + flags: u32, + + /// A friendly way to setup an io_uring, with default io_uring_params. + /// `entries` must be a power of two between 1 and 4096, although the kernel will make the final + /// call on how many entries the submission and completion queues will ultimately have, + /// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. + /// Matches the interface of io_uring_queue_init() in liburing. + pub fn init(entries: u32, flags: u32) !IO_Uring { + var params = io_uring_params { + .sq_entries = 0, + .cq_entries = 0, + .flags = flags, + .sq_thread_cpu = 0, + .sq_thread_idle = 1000, + .features = 0, + .wq_fd = 0, + .resv = [_]u32{0} ** 3, + .sq_off = undefined, + .cq_off = undefined, + }; + // The kernel will zero the memory of the sq_off and cq_off structs in io_uring_create(), + // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7986-L8002. 
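+        // (This is also why `sq_off` and `cq_off` can be left `undefined` above: the kernel
+        // overwrites both structs before io_uring_setup() returns.)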
+        return try IO_Uring.init_params(entries, &params);
+    }
+
+    /// A powerful way to setup an io_uring, if you want to tweak io_uring_params such as submission
+    /// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second).
+    /// `params` is passed by reference because the kernel needs to modify the parameters.
+    /// You may only set the `flags`, `sq_thread_cpu` and `sq_thread_idle` parameters.
+    /// Every other parameter belongs to the kernel and must be zeroed.
+    /// Matches the interface of io_uring_queue_init_params() in liburing.
+    pub fn init_params(entries: u32, p: *io_uring_params) !IO_Uring {
+        assert(entries >= 1 and entries <= 4096 and std.math.isPowerOfTwo(entries));
+        assert(p.*.sq_entries == 0);
+        assert(p.*.cq_entries == 0);
+        assert(p.*.features == 0);
+        assert(p.*.wq_fd == 0);
+        assert(p.*.resv[0] == 0);
+        assert(p.*.resv[1] == 0);
+        assert(p.*.resv[2] == 0);
+
+        const res = linux.io_uring_setup(entries, p);
+        try check_errno(res);
+        const fd = @intCast(i32, res);
+        assert(fd >= 0);
+        errdefer os.close(fd);
+
+        // Kernel versions 5.4 and up use only one mmap() for the submission and completion queues.
+        // This is not an optional feature for us... if the kernel does it, we have to do it.
+        // The thinking on this by the kernel developers was that both the submission and the
+        // completion queue rings have sizes just over a power of two, but the submission queue ring
+        // is significantly smaller with u32 slots. By bundling both in a single mmap, the kernel
+        // gets the submission queue ring for free.
+        // See https://patchwork.kernel.org/patch/11115257 for the kernel patch.
+        // We do not support the double mmap() done before 5.4, because we want to keep the
+        // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4.
+        if ((p.*.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) {
+            return error.IO_UringKernelNotSupported;
+        }
+
+        // Check that the kernel has actually set params and that "impossible is nothing".
+        assert(p.*.sq_entries != 0);
+        assert(p.*.cq_entries != 0);
+        assert(p.*.cq_entries >= p.*.sq_entries);
+
+        // From here on, we only need to read from params, so pass `p` by value for convenience.
+        // The completion queue shares the mmap with the submission queue, so pass `sq` there too.
+        var sq = try SubmissionQueue.init(fd, p.*);
+        errdefer sq.deinit();
+        var cq = try CompletionQueue.init(fd, p.*, sq);
+        errdefer cq.deinit();
+
+        // Check that our starting state is as we expect.
+        assert(sq.head.* == 0);
+        assert(sq.tail.* == 0);
+        assert(sq.mask.* == p.*.sq_entries - 1);
+        // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time.
+        assert(sq.dropped.* == 0);
+        assert(sq.array.len == p.*.sq_entries);
+        assert(sq.sqes.len == p.*.sq_entries);
+        assert(sq.sqe_head == 0);
+        assert(sq.sqe_tail == 0);
+
+        assert(cq.head.* == 0);
+        assert(cq.tail.* == 0);
+        assert(cq.mask.* == p.*.cq_entries - 1);
+        assert(cq.overflow.* == 0);
+        assert(cq.cqes.len == p.*.cq_entries);
+
+        // Alles in Ordnung!
+        return IO_Uring {
+            .fd = fd,
+            .sq = sq,
+            .cq = cq,
+            .flags = p.*.flags
+        };
+    }
+
+    pub fn deinit(self: *IO_Uring) void {
+        assert(self.fd >= 0);
+        // The mmaps depend on the fd, so the order of these calls is important:
+        self.cq.deinit();
+        self.sq.deinit();
+        os.close(self.fd);
+        self.fd = -1;
+    }
+
+    /// Returns a vacant SQE, or an error if the submission queue is full.
+    /// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly.
+ /// However, instead of a null we return an error to force safe handling. + /// Any situation where the submission queue is full tends more towards a control flow error, + /// and the null return in liburing is more a C idiom than anything else, for lack of a better + /// alternative. In Zig, we have first-class error handling... so let's use it. + /// Matches the implementation of io_uring_get_sqe() in liburing. + pub fn get_sqe(self: *IO_Uring) !*io_uring_sqe { + const head = @atomicLoad(u32, self.sq.head, .Acquire); + // Remember that these head and tail offsets wrap around every four billion operations. + // We must therefore use wrapping addition and subtraction to avoid a runtime crash. + const next = self.sq.sqe_tail +% 1; + if (next -% head > self.sq.sqes.len) return error.IO_UringSubmissionQueueFull; + var sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask.*]; + self.sq.sqe_tail = next; + return sqe; + } + + /// Submits the SQEs acquired via get_sqe() to the kernel. You can call this once after you have + /// called get_sqe() multiple times to setup multiple I/O requests. + /// Returns the number of SQEs submitted. + /// Matches the implementation of io_uring_submit() in liburing. + pub fn submit(self: *IO_Uring) !u32 { + return self.submit_and_wait(0); + } + + /// Like submit(), but allows waiting for events as well. + /// Returns the number of SQEs submitted. + /// Matches the implementation of io_uring_submit_and_wait() in liburing. + pub fn submit_and_wait(self: *IO_Uring, wait_nr: u32) !u32 { + var submitted = self.flush_sq(); + var flags: u32 = 0; + if (self.sq_ring_needs_enter(submitted, &flags) or wait_nr > 0) { + if (wait_nr > 0 or (self.flags & linux.IORING_SETUP_IOPOLL) > 0) { + flags |= linux.IORING_ENTER_GETEVENTS; + } + return try self.enter(submitted, wait_nr, flags); + } + return submitted; + } + + // Tell the kernel we have submitted SQEs and/or want to wait for CQEs. + // Returns the number of SQEs submitted. + fn enter(self: *IO_Uring, to_submit: u32, min_complete: u32, flags: u32) !u32 { + assert(self.fd >= 0); + const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); + try check_errno(res); + return @truncate(u32, res); + } + + // Sync internal state with kernel ring state on the SQ side. + // Returns the number of all pending events in the SQ ring, for the shared ring. + // This return value includes previously flushed SQEs, as per liburing. + // The reasoning for this is to suggest that an io_uring_enter() call is needed rather than not. + // Matches the implementation of __io_uring_flush_sq() in liburing. + fn flush_sq(self: *IO_Uring) u32 { + if (self.sq.sqe_head != self.sq.sqe_tail) { + // Fill in SQEs that we have queued up, adding them to the kernel ring. + const to_submit = self.sq.sqe_tail -% self.sq.sqe_head; + const mask = self.sq.mask.*; + var tail = self.sq.tail.*; + var i: usize = 0; + while (i < to_submit) : (i += 1) { + self.sq.array[tail & mask] = self.sq.sqe_head & mask; + tail +%= 1; + self.sq.sqe_head +%= 1; + } + // Ensure that the kernel can actually see the SQE updates when it sees the tail update. + @atomicStore(u32, self.sq.tail, tail, .Release); + } + return self.sq_ready(); + } + + /// Returns true if we are not using an SQ thread (thus nobody submits but us), + /// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened. + /// For the latter case, we set the SQ thread wakeup flag. + /// Matches the implementation of sq_ring_needs_enter() in liburing. 
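+    /// For example, under IORING_SETUP_SQPOLL the kernel's submission thread parks itself after
+    /// `sq_thread_idle` milliseconds of inactivity and sets IORING_SQ_NEED_WAKEUP, so the next
+    /// submission must enter the kernel once to wake the thread, where otherwise no syscall at
+    /// all would be needed to submit.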
+ fn sq_ring_needs_enter(self: *IO_Uring, submitted: u32, flags: *u32) bool { + assert(flags.* == 0); + if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0 and submitted > 0) return true; + if ((@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_NEED_WAKEUP) > 0) { + flags.* |= linux.IORING_ENTER_SQ_WAKEUP; + return true; + } + return false; + } + + /// Returns the number of flushed and unflushed SQEs pending in the submission queue. + /// In other words, this is the number of SQEs in the submission queue, i.e. its length. + /// These are SQEs that the kernel is yet to consume. + /// Matches the implementation of io_uring_sq_ready in liburing. + pub fn sq_ready(self: *IO_Uring) u32 { + // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync, + // see https://github.com/axboe/liburing/issues/92. + return self.sq.sqe_tail -% @atomicLoad(u32, self.sq.head, .Acquire); + } + + /// Returns the number of CQEs in the completion queue, i.e. its length. + /// These are CQEs that the application is yet to consume. + /// Matches the implementation of io_uring_cq_ready in liburing. + pub fn cq_ready(self: *IO_Uring) u32 { + return @atomicLoad(u32, self.cq.tail, .Acquire) -% self.cq.head.*; + } + + /// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice. + /// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs. + /// Returns the number of CQEs copied, advancing the CQ ring. + /// Provides all the wait/peek methods found in liburing, but with batching and a single method. + /// The rationale for copying CQEs rather than copying pointers is that pointers are 8 bytes + /// whereas CQEs are not much more at only 16 bytes, and this provides a safer faster interface. + /// Safer, because you no longer need to call cqe_seen(), avoiding idempotency bugs. + /// Faster, because we can now amortize the atomic store release to `cq.head` across the batch. + /// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007. + /// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting. + pub fn copy_cqes(self: *IO_Uring, cqes: []io_uring_cqe, wait_nr: u32) !u32 { + const count = self.copy_cqes_ready(cqes, wait_nr); + if (count > 0) return count; + if (self.cq_ring_needs_flush() or wait_nr > 0) { + _ = try self.enter(0, wait_nr, linux.IORING_ENTER_GETEVENTS); + return self.copy_cqes_ready(cqes, wait_nr); + } + return 0; + } + + fn copy_cqes_ready(self: *IO_Uring, cqes: []io_uring_cqe, wait_nr: u32) u32 { + const ready = self.cq_ready(); + const count = std.math.min(cqes.len, ready); + const mask = self.cq.mask.*; + var head = self.cq.head.*; + var tail = head +% count; + // TODO Optimize this by using 1 or 2 memcpy's (if the tail wraps) rather than a loop. + var i: usize = 0; + // Do not use "less-than" operator since head and tail may wrap: + while (head != tail) { + cqes[i] = self.cq.cqes[head & mask]; // Copy struct by value. + head +%= 1; + i += 1; + } + self.cq_advance(count); + return count; + } + + /// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring. + /// A convenience method for `copy_cqes()` for when you don't need to batch or peek. + pub fn copy_cqe(ring: *IO_Uring) !io_uring_cqe { + var cqes: [1]io_uring_cqe = undefined; + const count = try ring.copy_cqes(&cqes, 1); + assert(count == 1); + return cqes[0]; + } + + // Matches the implementation of cq_ring_needs_flush() in liburing. 
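+    // Note that the kernel reports IORING_SQ_CQ_OVERFLOW in the *submission* queue's flags field,
+    // which is why this loads `self.sq.flags` even though the condition concerns the CQ ring.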
+ fn cq_ring_needs_flush(self: *IO_Uring) bool { + return (@atomicLoad(u32, self.sq.flags, .Unordered) & IORING_SQ_CQ_OVERFLOW) > 0; + } + + /// For advanced use cases only that implement custom completion queue methods. + /// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance(). + /// Must be called exactly once after a zero-copy CQE has been processed by your application. + /// Not idempotent, calling more than once will result in other CQEs being lost. + /// Matches the implementation of cqe_seen() in liburing. + pub fn cqe_seen(self: *IO_Uring, cqe: *io_uring_cqe) void { + self.cq_advance(1); + } + + /// For advanced use cases only that implement custom completion queue methods. + /// Matches the implementation of cq_advance() in liburing. + pub fn cq_advance(self: *IO_Uring, count: u32) void { + if (count > 0) { + // Ensure the kernel only sees the new head value after the CQEs have been read. + @atomicStore(u32, self.cq.head, self.cq.head.* +% count, .Release); + } + } + + /// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. + /// Returns a pointer to the SQE. + pub fn queue_accept( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + addr: *os.sockaddr, + addrlen: *os.socklen_t, + accept_flags: u32 + ) !*io_uring_sqe { + // "sqe->fd is the file descriptor, sqe->addr holds a pointer to struct sockaddr, + // sqe->addr2 holds a pointer to socklen_t, and finally sqe->accept_flags holds the flags + // for accept(4)." - https://lwn.net/ml/linux-block/20191025173037.13486-1-axboe@kernel.dk/ + const sqe = try self.get_sqe(); + sqe.* = .{ + .opcode = .ACCEPT, + .fd = fd, + .off = @ptrToInt(addrlen), // `addr2` is a newer union member that maps to `off`. + .addr = @ptrToInt(addr), + .user_data = user_data, + .opflags = accept_flags + }; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform an `fsync(2)`. + /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. + /// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `opflags`. + /// N.B. While SQEs are initiated in the order in which they appear in the submission queue, + /// operations execute in parallel and completions are unordered. Therefore, an application that + /// submits a write followed by an fsync in the submission queue cannot expect the fsync to + /// apply to the write, since the fsync may complete before the write is issued to the disk. + /// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync, + /// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. + pub fn queue_fsync(self: *IO_Uring, user_data: u64, fd: os.fd_t) !*io_uring_sqe { + const sqe = try self.get_sqe(); + sqe.* = .{ + .opcode = .FSYNC, + .fd = fd, + .user_data = user_data + }; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform a no-op. + /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. + /// A no-op is more useful than may appear at first glance. + /// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to + /// know when the ring is idle before acting on a kill signal. + pub fn queue_nop(self: *IO_Uring, user_data: u64) !*io_uring_sqe { + const sqe = try self.get_sqe(); + sqe.* = .{ + .opcode = .NOP, + .user_data = user_data + }; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform a `read(2)`. 
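+    /// The buffer must remain valid until the operation's CQE has been received, since the
+    /// kernel may pick up the SQE and start the read at any point after submission.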
+ /// Returns a pointer to the SQE. + pub fn queue_read( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + buffer: []u8, + offset: u64 + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + sqe.* = .{ + .opcode = .READ, + .fd = fd, + .off = offset, + .addr = @ptrToInt(buffer.ptr), + .len = @truncate(u32, buffer.len), + .user_data = user_data + }; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform a `write(2)`. + /// Returns a pointer to the SQE. + pub fn queue_write( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + buffer: []const u8, + offset: u64 + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + sqe.* = .{ + .opcode = .WRITE, + .fd = fd, + .off = offset, + .addr = @ptrToInt(buffer.ptr), + .len = @truncate(u32, buffer.len), + .user_data = user_data + }; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform a `preadv()`. + /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. + /// For example, if you want to do a `preadv2()` then set `opflags` on the returned SQE. + /// See https://linux.die.net/man/2/preadv. + pub fn queue_readv( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + iovecs: []const os.iovec, + offset: u64 + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + sqe.* = .{ + .opcode = .READV, + .fd = fd, + .off = offset, + .addr = @ptrToInt(iovecs.ptr), + .len = @truncate(u32, iovecs.len), + .user_data = user_data + }; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform a `pwritev()`. + /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. + /// For example, if you want to do a `pwritev2()` then set `opflags` on the returned SQE. + /// See https://linux.die.net/man/2/pwritev. + pub fn queue_writev( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + iovecs: []const os.iovec_const, + offset: u64 + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + sqe.* = .{ + .opcode = .WRITEV, + .fd = fd, + .off = offset, + .addr = @ptrToInt(iovecs.ptr), + .len = @truncate(u32, iovecs.len), + .user_data = user_data + }; + return sqe; + } + + /// The next SQE will not be started until this one completes. + /// This can be used to chain causally dependent SQEs, and the chain can be arbitrarily long. + /// The tail of the chain is denoted by the first SQE that does not have this flag set. + /// This flag has no effect on previous SQEs, nor does it impact SQEs outside the chain. + /// This means that multiple chains can be executing in parallel, along with individual SQEs. + /// Only members inside the chain are serialized. + /// A chain will be broken if any SQE in the chain ends in error, where any unexpected result is + /// considered an error. For example, a short read will terminate the remainder of the chain. + pub fn link_with_next_sqe(self: *IO_Uring, sqe: *io_uring_sqe) void { + sqe.*.flags |= linux.IOSQE_IO_LINK; + } + + /// Like `link_with_next_sqe()` but stronger. + /// For when you don't want the chain to fail in the event of a completion result error. + /// For example, you may know that some commands will fail and may want the chain to continue. + /// Hard links are resilient to completion results, but are not resilient to submission errors. + pub fn hardlink_with_next_sqe(self: *IO_Uring, sqe: *io_uring_sqe) void { + sqe.*.flags |= linux.IOSQE_IO_HARDLINK; + } + + /// This creates a full pipeline barrier in the submission queue. + /// This SQE will not be started until previous SQEs complete. 
+ /// Subsequent SQEs will not be started until this SQE completes. + /// In other words, this stalls the entire submission queue. + /// You should first consider using link_with_next_sqe() for more granular SQE sequence control. + pub fn drain_previous_sqes(self: *IO_Uring, sqe: *io_uring_sqe) void { + sqe.*.flags |= linux.IOSQE_IO_DRAIN; + } + + /// Registers an array of file descriptors. + /// Every time a file descriptor is put in an SQE and submitted to the kernel, the kernel must + /// retrieve a reference to the file, and once I/O has completed the file reference must be + /// dropped. The atomic nature of this file reference can be a slowdown for high IOPS workloads. + /// This slowdown can be avoided by pre-registering file descriptors. + /// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in the SQE's flags, + /// and the SQE's fd must be set to the index of the file descriptor in the registered array. + /// Registering file descriptors will wait for the ring to idle. + /// Files are automatically unregistered by the kernel when the ring is torn down. + /// An application need unregister only if it wants to register a new array of file descriptors. + pub fn register_files(self: *IO_Uring, fds: []const i32) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .REGISTER_FILES, + fds.ptr, + @truncate(u32, fds.len) + ); + try check_errno(res); + } + + /// Changes the semantics of the SQE's `fd` to refer to a pre-registered file descriptor. + pub fn use_registered_fd(self: *IO_Uring, sqe: *io_uring_sqe) void { + sqe.*.flags |= linux.IOSQE_FIXED_FILE; + } + + /// Unregisters all registered file descriptors previously associated with the ring. + pub fn unregister_files(self: *IO_Uring) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0); + try check_errno(res); + } +}; + + +pub const SubmissionQueue = struct { + head: *u32, + tail: *u32, + mask: *u32, + flags: *u32, + dropped: *u32, + array: []u32, + sqes: []io_uring_sqe, + mmap: []align(std.mem.page_size) u8, + mmap_sqes: []align(std.mem.page_size) u8, + + // We use `sqe_head` and `sqe_tail` in the same way as liburing: + // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`. + // We then set `tail` to `sqe_tail` once, only when these events are actually submitted. + // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs. + sqe_head: u32 = 0, + sqe_tail: u32 = 0, + + pub fn init(fd: i32, p: io_uring_params) !SubmissionQueue { + assert(fd >= 0); + assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) > 0); + const size = std.math.max( + p.sq_off.array + p.sq_entries * @sizeOf(u32), + p.cq_off.cqes + p.cq_entries * @sizeOf(io_uring_cqe) + ); + const mmap = try os.mmap( + null, + size, + os.PROT_READ | os.PROT_WRITE, + os.MAP_SHARED | os.MAP_POPULATE, + fd, + linux.IORING_OFF_SQ_RING, + ); + errdefer os.munmap(mmap); + assert(mmap.len == size); + + // The motivation for the `sqes` and `array` indirection is to make it possible for the + // application to preallocate static io_uring_sqe entries and then replay them when needed. 
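+        // That is, `array` holds indices into `sqes`: the kernel consumes `array[head & mask]` to
+        // locate each SQE, so a preallocated SQE slot can be submitted again and again.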
+ const size_sqes = p.sq_entries * @sizeOf(io_uring_sqe); + const mmap_sqes = try os.mmap( + null, + size_sqes, + os.PROT_READ | os.PROT_WRITE, + os.MAP_SHARED | os.MAP_POPULATE, + fd, + linux.IORING_OFF_SQES, + ); + errdefer os.munmap(mmap_sqes); + assert(mmap_sqes.len == size_sqes); + + const array = @ptrCast([*]u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.array])); + const sqes = @ptrCast([*]io_uring_sqe, @alignCast(@alignOf(io_uring_sqe), &mmap_sqes[0])); + // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, + // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. + assert( + p.sq_entries == + @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.ring_entries])).* + ); + return SubmissionQueue { + .head = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.head])), + .tail = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.tail])), + .mask = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.ring_mask])), + .flags = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.flags])), + .dropped = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.dropped])), + .array = array[0..p.sq_entries], + .sqes = sqes[0..p.sq_entries], + .mmap = mmap, + .mmap_sqes = mmap_sqes + }; + } + + pub fn deinit(self: *SubmissionQueue) void { + os.munmap(self.mmap_sqes); + os.munmap(self.mmap); + } +}; + +pub const CompletionQueue = struct { + head: *u32, + tail: *u32, + mask: *u32, + overflow: *u32, + cqes: []io_uring_cqe, + + pub fn init(fd: i32, p: io_uring_params, sq: SubmissionQueue) !CompletionQueue { + assert(fd >= 0); + assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) > 0); + const mmap = sq.mmap; + const cqes = @ptrCast( + [*]io_uring_cqe, + @alignCast(@alignOf(io_uring_cqe), &mmap[p.cq_off.cqes]) + ); + assert( + p.cq_entries == + @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.ring_entries])).* + ); + return CompletionQueue { + .head = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.head])), + .tail = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.tail])), + .mask = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.ring_mask])), + .overflow = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.overflow])), + .cqes = cqes[0..p.cq_entries] + }; + } + + pub fn deinit(self: *CompletionQueue) void { + // A no-op since we now share the mmap with the submission queue. + // Here for symmetry with the submission queue, and for any future feature support. 
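+        // (The shared ring mmap itself is released by SubmissionQueue.deinit().)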
+ } +}; + +inline fn check_errno(res: usize) !void { + const errno = linux.getErrno(res); + if (errno != 0) return os.unexpectedErrno(errno); +} + +test "queue_nop" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = try IO_Uring.init(1, 0); + defer { + ring.deinit(); + testing.expectEqual(@as(i32, -1), ring.fd); + } + + var sqe = try ring.queue_nop(@intCast(u64, 0xaaaaaaaa)); + testing.expectEqual(io_uring_sqe { + .opcode = .NOP, + .flags = 0, + .ioprio = 0, + .fd = 0, + .off = 0, + .addr = 0, + .len = 0, + .opflags = 0, + .user_data = @intCast(u64, 0xaaaaaaaa), + .buffer = 0, + .personality = 0, + .splice_fd_in = 0, + .options = [2]u64{ 0, 0 } + }, sqe.*); + + testing.expectEqual(@as(u32, 0), ring.sq.sqe_head); + testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); + testing.expectEqual(@as(u32, 0), ring.sq.tail.*); + testing.expectEqual(@as(u32, 0), ring.cq.head.*); + testing.expectEqual(@as(u32, 1), ring.sq_ready()); + testing.expectEqual(@as(u32, 0), ring.cq_ready()); + + testing.expectEqual(@as(u32, 1), try ring.submit()); + testing.expectEqual(@as(u32, 1), ring.sq.sqe_head); + testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); + testing.expectEqual(@as(u32, 1), ring.sq.tail.*); + testing.expectEqual(@as(u32, 0), ring.cq.head.*); + testing.expectEqual(@as(u32, 0), ring.sq_ready()); + + testing.expectEqual(io_uring_cqe { + .user_data = 0xaaaaaaaa, + .res = 0, + .flags = 0 + }, try ring.copy_cqe()); + testing.expectEqual(@as(u32, 1), ring.cq.head.*); + testing.expectEqual(@as(u32, 0), ring.cq_ready()); + + var sqe_barrier = try ring.queue_nop(@intCast(u64, 0xbbbbbbbb)); + ring.drain_previous_sqes(sqe_barrier); + testing.expectEqual(@as(u8, linux.IOSQE_IO_DRAIN), sqe_barrier.*.flags); + testing.expectEqual(@as(u32, 1), try ring.submit()); + testing.expectEqual(io_uring_cqe { + .user_data = 0xbbbbbbbb, + .res = 0, + .flags = 0 + }, try ring.copy_cqe()); + testing.expectEqual(@as(u32, 2), ring.sq.sqe_head); + testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail); + testing.expectEqual(@as(u32, 2), ring.sq.tail.*); + testing.expectEqual(@as(u32, 2), ring.cq.head.*); +} + +test "queue_readv" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = try IO_Uring.init(1, 0); + defer ring.deinit(); + + const fd = try os.openZ("/dev/zero", os.O_RDONLY | os.O_CLOEXEC, 0); + defer os.close(fd); + + var registered_fds = [_]i32{-1} ** 10; + const fd_index = 9; + registered_fds[fd_index] = fd; + try ring.register_files(registered_fds[0..]); + + var buffer = [_]u8{42} ** 128; + var iovecs = [_]os.iovec{ os.iovec { .iov_base = &buffer, .iov_len = buffer.len } }; + var sqe = try ring.queue_readv(0xcccccccc, fd_index, iovecs[0..], 0); + ring.use_registered_fd(sqe); + testing.expectEqual(@as(u8, linux.IOSQE_FIXED_FILE), sqe.*.flags); + + testing.expectError(error.IO_UringSubmissionQueueFull, ring.queue_nop(0)); + testing.expectEqual(@as(u32, 1), try ring.submit()); + testing.expectEqual(linux.io_uring_cqe { + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = 0, + }, try ring.copy_cqe()); + testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + + try ring.unregister_files(); +} + +test "queue_writev/queue_fsync" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = try IO_Uring.init(2, 0); + defer ring.deinit(); + + const path = "test_io_uring_queue_writev"; + const file = try std.fs.cwd().createFile(path, .{ .truncate = true }); + defer file.close(); + defer std.fs.cwd().deleteFile(path) catch {}; + const fd = file.handle; 
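+    // The writev SQE below is chained to the fsync SQE with IOSQE_IO_LINK (via
+    // link_with_next_sqe()), so the fsync is started only once the writev has completed.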
+ + var buffer = [_]u8{42} ** 128; + var iovecs = [_]os.iovec_const { + os.iovec_const { .iov_base = &buffer, .iov_len = buffer.len } + }; + var sqe_writev = try ring.queue_writev(0xdddddddd, fd, iovecs[0..], 0); + ring.link_with_next_sqe(sqe_writev); + testing.expectEqual(@as(u8, linux.IOSQE_IO_LINK), sqe_writev.*.flags); + + var sqe_fsync = try ring.queue_fsync(0xeeeeeeee, fd); + testing.expectEqual(fd, sqe_fsync.*.fd); + + testing.expectEqual(@as(u32, 2), ring.sq_ready()); + testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); + testing.expectEqual(@as(u32, 0), ring.sq_ready()); + testing.expectEqual(@as(u32, 2), ring.cq_ready()); + testing.expectEqual(linux.io_uring_cqe { + .user_data = 0xdddddddd, + .res = buffer.len, + .flags = 0, + }, try ring.copy_cqe()); + testing.expectEqual(@as(u32, 1), ring.cq_ready()); + testing.expectEqual(linux.io_uring_cqe { + .user_data = 0xeeeeeeee, + .res = 0, + .flags = 0, + }, try ring.copy_cqe()); + testing.expectEqual(@as(u32, 0), ring.cq_ready()); +} + +test "queue_write/queue_read" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + // This test may require newer kernel versions. + + var ring = try IO_Uring.init(2, 0); + defer ring.deinit(); + + const path = "test_io_uring_queue_write"; + const file = try std.fs.cwd().createFile(path, .{ .read = true, .truncate = true }); + defer file.close(); + defer std.fs.cwd().deleteFile(path) catch {}; + const fd = file.handle; + + var buffer_write = [_]u8{97} ** 20; + var buffer_read = [_]u8{98} ** 20; + var sqe_write = try ring.queue_write(123, fd, buffer_write[0..], 10); + ring.link_with_next_sqe(sqe_write); + var sqe_read = try ring.queue_read(456, fd, buffer_read[0..], 10); + testing.expectEqual(@as(u32, 2), try ring.submit()); + testing.expectEqual(linux.io_uring_cqe { + .user_data = 123, + .res = buffer_write.len, + .flags = 0, + }, try ring.copy_cqe()); + testing.expectEqual(linux.io_uring_cqe { + .user_data = 456, + .res = buffer_read.len, + .flags = 0, + }, try ring.copy_cqe()); + testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); +} diff --git a/lib/std/std.zig b/lib/std/std.zig index 4236b292983b..c706c469b73c 100644 --- a/lib/std/std.zig +++ b/lib/std/std.zig @@ -65,6 +65,7 @@ pub const hash_map = @import("hash_map.zig"); pub const heap = @import("heap.zig"); pub const http = @import("http.zig"); pub const io = @import("io.zig"); +pub const io_uring = @import("io_uring.zig"); pub const json = @import("json.zig"); pub const log = @import("log.zig"); pub const macho = @import("macho.zig"); From 6f09796ff28f0c73d4ea0b63b02d58215c4450f1 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Wed, 16 Sep 2020 18:51:21 +0200 Subject: [PATCH 02/50] Add short license and copyright notice --- lib/std/io_uring.zig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index f4881ea9079f..9b8abfd98737 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -1,3 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2015-2020 Zig Contributors +// This file is part of [zig](https://ziglang.org/), which is MIT licensed. +// The MIT license requires this copyright notice to be included in all copies +// and substantial portions of the software. 
const builtin = @import("builtin"); const std = @import("std"); const assert = std.debug.assert; From 491a434b0184bc517cda4f57ef14b4e528e23065 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Wed, 16 Sep 2020 19:36:29 +0200 Subject: [PATCH 03/50] Check kernel support for single_mmap, accept, and read/write --- lib/std/io_uring.zig | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index 9b8abfd98737..d09ae250e396 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -95,6 +95,7 @@ pub const IO_Uring = struct { assert(p.*.resv[0] == 0); assert(p.*.resv[1] == 0); assert(p.*.resv[2] == 0); + if (!supported) return error.IO_UringKernelNotSupported; const res = linux.io_uring_setup(entries, p); try check_errno(res); @@ -346,6 +347,7 @@ pub const IO_Uring = struct { addrlen: *os.socklen_t, accept_flags: u32 ) !*io_uring_sqe { + if (!can_accept) return error.IO_UringKernelCannotAccept; // "sqe->fd is the file descriptor, sqe->addr holds a pointer to struct sockaddr, // sqe->addr2 holds a pointer to socklen_t, and finally sqe->accept_flags holds the flags // for accept(4)." - https://lwn.net/ml/linux-block/20191025173037.13486-1-axboe@kernel.dk/ @@ -403,6 +405,7 @@ pub const IO_Uring = struct { buffer: []u8, offset: u64 ) !*io_uring_sqe { + if (!can_read) return error.IO_UringKernelCannotRead; const sqe = try self.get_sqe(); sqe.* = .{ .opcode = .READ, @@ -424,6 +427,7 @@ pub const IO_Uring = struct { buffer: []const u8, offset: u64 ) !*io_uring_sqe { + if (!can_write) return error.IO_UringKernelCannotWrite; const sqe = try self.get_sqe(); sqe.* = .{ .opcode = .WRITE, @@ -662,8 +666,15 @@ inline fn check_errno(res: usize) !void { if (errno != 0) return os.unexpectedErrno(errno); } +const minimum = std.Target.current.os.isAtLeast; +pub const can_single_mmap = comptime minimum(.linux, .{ .major = 5, .minor = 4 }) == true; +pub const can_accept = comptime minimum(.linux, .{ .major = 5, .minor = 5 }) == true; +pub const can_read = comptime minimum(.linux, .{ .major = 5, .minor = 6 }) == true; +pub const can_write = comptime minimum(.linux, .{ .major = 5, .minor = 6 }) == true; +pub const supported = can_single_mmap; + test "queue_nop" { - if (builtin.os.tag != .linux) return error.SkipZigTest; + if (!supported) return error.SkipZigTest; var ring = try IO_Uring.init(1, 0); defer { @@ -726,7 +737,7 @@ test "queue_nop" { } test "queue_readv" { - if (builtin.os.tag != .linux) return error.SkipZigTest; + if (!supported) return error.SkipZigTest; var ring = try IO_Uring.init(1, 0); defer ring.deinit(); @@ -758,7 +769,7 @@ test "queue_readv" { } test "queue_writev/queue_fsync" { - if (builtin.os.tag != .linux) return error.SkipZigTest; + if (!supported) return error.SkipZigTest; var ring = try IO_Uring.init(2, 0); defer ring.deinit(); @@ -799,8 +810,7 @@ test "queue_writev/queue_fsync" { } test "queue_write/queue_read" { - if (builtin.os.tag != .linux) return error.SkipZigTest; - // This test may require newer kernel versions. 
+ if (!can_read or !can_write) return error.SkipZigTest; var ring = try IO_Uring.init(2, 0); defer ring.deinit(); From ac1d9f716ac07786bd5e77bb51b93caf06bddd9e Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Thu, 17 Sep 2020 19:37:17 +0200 Subject: [PATCH 04/50] Use != 0 for bitwise flag conditions --- lib/std/io_uring.zig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index d09ae250e396..cfc8f5a754db 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -196,7 +196,7 @@ pub const IO_Uring = struct { var submitted = self.flush_sq(); var flags: u32 = 0; if (self.sq_ring_needs_enter(submitted, &flags) or wait_nr > 0) { - if (wait_nr > 0 or (self.flags & linux.IORING_SETUP_IOPOLL) > 0) { + if (wait_nr > 0 or (self.flags & linux.IORING_SETUP_IOPOLL) != 0) { flags |= linux.IORING_ENTER_GETEVENTS; } return try self.enter(submitted, wait_nr, flags); @@ -243,7 +243,7 @@ pub const IO_Uring = struct { fn sq_ring_needs_enter(self: *IO_Uring, submitted: u32, flags: *u32) bool { assert(flags.* == 0); if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0 and submitted > 0) return true; - if ((@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_NEED_WAKEUP) > 0) { + if ((@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) { flags.* |= linux.IORING_ENTER_SQ_WAKEUP; return true; } @@ -316,7 +316,7 @@ pub const IO_Uring = struct { // Matches the implementation of cq_ring_needs_flush() in liburing. fn cq_ring_needs_flush(self: *IO_Uring) bool { - return (@atomicLoad(u32, self.sq.flags, .Unordered) & IORING_SQ_CQ_OVERFLOW) > 0; + return (@atomicLoad(u32, self.sq.flags, .Unordered) & IORING_SQ_CQ_OVERFLOW) != 0; } /// For advanced use cases only that implement custom completion queue methods. @@ -570,7 +570,7 @@ pub const SubmissionQueue = struct { pub fn init(fd: i32, p: io_uring_params) !SubmissionQueue { assert(fd >= 0); - assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) > 0); + assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); const size = std.math.max( p.sq_off.array + p.sq_entries * @sizeOf(u32), p.cq_off.cqes + p.cq_entries * @sizeOf(io_uring_cqe) @@ -636,7 +636,7 @@ pub const CompletionQueue = struct { pub fn init(fd: i32, p: io_uring_params, sq: SubmissionQueue) !CompletionQueue { assert(fd >= 0); - assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) > 0); + assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); const mmap = sq.mmap; const cqes = @ptrCast( [*]io_uring_cqe, From 21c81360ba40d645768d061723522116b6408d48 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Thu, 17 Sep 2020 19:44:53 +0200 Subject: [PATCH 05/50] Remove comment --- lib/std/io_uring.zig | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index cfc8f5a754db..af9c545e098c 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -145,7 +145,6 @@ pub const IO_Uring = struct { assert(cq.overflow.* == 0); assert(cq.cqes.len == p.*.cq_entries); - // Alles in Ordnung! 
return IO_Uring { .fd = fd, .sq = sq, From d966fe63190de2992d6a7985bc492b9d9699d253 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Thu, 17 Sep 2020 19:53:34 +0200 Subject: [PATCH 06/50] Add IORING_SQ_CQ_OVERFLOW to std/os/bits/linux.zig --- lib/std/io_uring.zig | 5 +---- lib/std/os/bits/linux.zig | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index af9c545e098c..40ebb9017c24 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -38,9 +38,6 @@ pub const io_uring_sqe = extern struct { options: [2]u64 = [2]u64{ 0, 0 } }; -// TODO Add to zig/std/os/bits/linux.zig: -const IORING_SQ_CQ_OVERFLOW = 1 << 1; - comptime { assert(@sizeOf(io_uring_params) == 120); assert(@sizeOf(io_uring_sqe) == 64); @@ -315,7 +312,7 @@ pub const IO_Uring = struct { // Matches the implementation of cq_ring_needs_flush() in liburing. fn cq_ring_needs_flush(self: *IO_Uring) bool { - return (@atomicLoad(u32, self.sq.flags, .Unordered) & IORING_SQ_CQ_OVERFLOW) != 0; + return (@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_CQ_OVERFLOW) != 0; } /// For advanced use cases only that implement custom completion queue methods. diff --git a/lib/std/os/bits/linux.zig b/lib/std/os/bits/linux.zig index df31bc32fde6..c5f272c0428a 100644 --- a/lib/std/os/bits/linux.zig +++ b/lib/std/os/bits/linux.zig @@ -1251,6 +1251,9 @@ pub const io_sqring_offsets = extern struct { /// needs io_uring_enter wakeup pub const IORING_SQ_NEED_WAKEUP = 1 << 0; +/// kernel has cqes waiting beyond the cq ring +pub const IORING_SQ_CQ_OVERFLOW = 1 << 1; + pub const io_cqring_offsets = extern struct { head: u32, tail: u32, From e33c466dafce1d19014e5b9b9606f42b93d8f408 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Thu, 17 Sep 2020 19:56:57 +0200 Subject: [PATCH 07/50] Use std.builtin --- lib/std/io_uring.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index 40ebb9017c24..1f30c6f1c651 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -3,9 +3,9 @@ // This file is part of [zig](https://ziglang.org/), which is MIT licensed. // The MIT license requires this copyright notice to be included in all copies // and substantial portions of the software. -const builtin = @import("builtin"); const std = @import("std"); const assert = std.debug.assert; +const builtin = std.builtin; const os = std.os; const linux = os.linux; const mem = std.mem; From 8b030a65994f7f71a71474c27636744a5aab3e08 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Thu, 17 Sep 2020 20:29:56 +0200 Subject: [PATCH 08/50] Use x.y for C-style x->y instead of x.*.y --- lib/std/io_uring.zig | 52 ++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index 1f30c6f1c651..1a4f24767619 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -85,13 +85,13 @@ pub const IO_Uring = struct { /// Matches the interface of io_uring_queue_init_params() in liburing. 
pub fn init_params(entries: u32, p: *io_uring_params) !IO_Uring { assert(entries >= 1 and entries <= 4096 and std.math.isPowerOfTwo(entries)); - assert(p.*.sq_entries == 0); - assert(p.*.cq_entries == 0); - assert(p.*.features == 0); - assert(p.*.wq_fd == 0); - assert(p.*.resv[0] == 0); - assert(p.*.resv[1] == 0); - assert(p.*.resv[2] == 0); + assert(p.sq_entries == 0); + assert(p.cq_entries == 0); + assert(p.features == 0); + assert(p.wq_fd == 0); + assert(p.resv[0] == 0); + assert(p.resv[1] == 0); + assert(p.resv[2] == 0); if (!supported) return error.IO_UringKernelNotSupported; const res = linux.io_uring_setup(entries, p); @@ -109,16 +109,16 @@ pub const IO_Uring = struct { // See https://patchwork.kernel.org/patch/11115257 for the kernel patch. // We do not support the double mmap() done before 5.4, because we want to keep the // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4. - if ((p.*.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) { + if ((p.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) { return error.IO_UringKernelNotSupported; } // Check that the kernel has actually set params and that "impossible is nothing". - assert(p.*.sq_entries != 0); - assert(p.*.cq_entries != 0); - assert(p.*.cq_entries >= p.*.sq_entries); + assert(p.sq_entries != 0); + assert(p.cq_entries != 0); + assert(p.cq_entries >= p.sq_entries); - // From here on, we only need to read from params, so pass `p` by value for convenience. + // From here on, we only need to read from params, so pass `p` by value as immutable. // The completion queue shares the mmap with the submission queue, so pass `sq` there too. var sq = try SubmissionQueue.init(fd, p.*); errdefer sq.deinit(); @@ -128,25 +128,25 @@ pub const IO_Uring = struct { // Check that our starting state is as we expect. assert(sq.head.* == 0); assert(sq.tail.* == 0); - assert(sq.mask.* == p.*.sq_entries - 1); + assert(sq.mask.* == p.sq_entries - 1); // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time. assert(sq.dropped.* == 0); - assert(sq.array.len == p.*.sq_entries); - assert(sq.sqes.len == p.*.sq_entries); + assert(sq.array.len == p.sq_entries); + assert(sq.sqes.len == p.sq_entries); assert(sq.sqe_head == 0); assert(sq.sqe_tail == 0); assert(cq.head.* == 0); assert(cq.tail.* == 0); - assert(cq.mask.* == p.*.cq_entries - 1); + assert(cq.mask.* == p.cq_entries - 1); assert(cq.overflow.* == 0); - assert(cq.cqes.len == p.*.cq_entries); + assert(cq.cqes.len == p.cq_entries); return IO_Uring { .fd = fd, .sq = sq, .cq = cq, - .flags = p.*.flags + .flags = p.flags }; } @@ -491,7 +491,7 @@ pub const IO_Uring = struct { /// A chain will be broken if any SQE in the chain ends in error, where any unexpected result is /// considered an error. For example, a short read will terminate the remainder of the chain. pub fn link_with_next_sqe(self: *IO_Uring, sqe: *io_uring_sqe) void { - sqe.*.flags |= linux.IOSQE_IO_LINK; + sqe.flags |= linux.IOSQE_IO_LINK; } /// Like `link_with_next_sqe()` but stronger. @@ -499,7 +499,7 @@ pub const IO_Uring = struct { /// For example, you may know that some commands will fail and may want the chain to continue. /// Hard links are resilient to completion results, but are not resilient to submission errors. pub fn hardlink_with_next_sqe(self: *IO_Uring, sqe: *io_uring_sqe) void { - sqe.*.flags |= linux.IOSQE_IO_HARDLINK; + sqe.flags |= linux.IOSQE_IO_HARDLINK; } /// This creates a full pipeline barrier in the submission queue. 
@@ -508,7 +508,7 @@ pub const IO_Uring = struct { /// In other words, this stalls the entire submission queue. /// You should first consider using link_with_next_sqe() for more granular SQE sequence control. pub fn drain_previous_sqes(self: *IO_Uring, sqe: *io_uring_sqe) void { - sqe.*.flags |= linux.IOSQE_IO_DRAIN; + sqe.flags |= linux.IOSQE_IO_DRAIN; } /// Registers an array of file descriptors. @@ -534,7 +534,7 @@ pub const IO_Uring = struct { /// Changes the semantics of the SQE's `fd` to refer to a pre-registered file descriptor. pub fn use_registered_fd(self: *IO_Uring, sqe: *io_uring_sqe) void { - sqe.*.flags |= linux.IOSQE_FIXED_FILE; + sqe.flags |= linux.IOSQE_FIXED_FILE; } /// Unregisters all registered file descriptors previously associated with the ring. @@ -719,7 +719,7 @@ test "queue_nop" { var sqe_barrier = try ring.queue_nop(@intCast(u64, 0xbbbbbbbb)); ring.drain_previous_sqes(sqe_barrier); - testing.expectEqual(@as(u8, linux.IOSQE_IO_DRAIN), sqe_barrier.*.flags); + testing.expectEqual(@as(u8, linux.IOSQE_IO_DRAIN), sqe_barrier.flags); testing.expectEqual(@as(u32, 1), try ring.submit()); testing.expectEqual(io_uring_cqe { .user_data = 0xbbbbbbbb, @@ -750,7 +750,7 @@ test "queue_readv" { var iovecs = [_]os.iovec{ os.iovec { .iov_base = &buffer, .iov_len = buffer.len } }; var sqe = try ring.queue_readv(0xcccccccc, fd_index, iovecs[0..], 0); ring.use_registered_fd(sqe); - testing.expectEqual(@as(u8, linux.IOSQE_FIXED_FILE), sqe.*.flags); + testing.expectEqual(@as(u8, linux.IOSQE_FIXED_FILE), sqe.flags); testing.expectError(error.IO_UringSubmissionQueueFull, ring.queue_nop(0)); testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -782,10 +782,10 @@ test "queue_writev/queue_fsync" { }; var sqe_writev = try ring.queue_writev(0xdddddddd, fd, iovecs[0..], 0); ring.link_with_next_sqe(sqe_writev); - testing.expectEqual(@as(u8, linux.IOSQE_IO_LINK), sqe_writev.*.flags); + testing.expectEqual(@as(u8, linux.IOSQE_IO_LINK), sqe_writev.flags); var sqe_fsync = try ring.queue_fsync(0xeeeeeeee, fd); - testing.expectEqual(fd, sqe_fsync.*.fd); + testing.expectEqual(fd, sqe_fsync.fd); testing.expectEqual(@as(u32, 2), ring.sq_ready()); testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); From ee5931908e9c23403146b18dd3020d57fd3ea10e Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 13:25:43 +0200 Subject: [PATCH 09/50] @ptrCast fds.ptr to *const c_void for io_uring_register() --- lib/std/io_uring.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index 1a4f24767619..39ac8fe8e1f5 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -526,7 +526,7 @@ pub const IO_Uring = struct { const res = linux.io_uring_register( self.fd, .REGISTER_FILES, - fds.ptr, + @ptrCast(*const c_void, fds.ptr), @truncate(u32, fds.len) ); try check_errno(res); From 5df0d284d05c5aeb4b721d124b55f40031d93f88 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 14:29:23 +0200 Subject: [PATCH 10/50] Do not register /dev/zero as an fd when testing queue_readv() --- lib/std/io_uring.zig | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index 39ac8fe8e1f5..e1db6d141cee 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -740,17 +740,10 @@ test "queue_readv" { const fd = try os.openZ("/dev/zero", os.O_RDONLY | os.O_CLOEXEC, 0); defer os.close(fd); - - var registered_fds = [_]i32{-1} ** 10; - const fd_index = 9; - 
registered_fds[fd_index] = fd; - try ring.register_files(registered_fds[0..]); var buffer = [_]u8{42} ** 128; var iovecs = [_]os.iovec{ os.iovec { .iov_base = &buffer, .iov_len = buffer.len } }; - var sqe = try ring.queue_readv(0xcccccccc, fd_index, iovecs[0..], 0); - ring.use_registered_fd(sqe); - testing.expectEqual(@as(u8, linux.IOSQE_FIXED_FILE), sqe.flags); + var sqe = try ring.queue_readv(0xcccccccc, fd, iovecs[0..], 0); testing.expectError(error.IO_UringSubmissionQueueFull, ring.queue_nop(0)); testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -760,8 +753,6 @@ test "queue_readv" { .flags = 0, }, try ring.copy_cqe()); testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); - - try ring.unregister_files(); } test "queue_writev/queue_fsync" { From b2a54b95814eeeb657ca953cb0547e3169893d2d Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 15:07:10 +0200 Subject: [PATCH 11/50] Test IORING_REGISTER_FILES but avoid sparse fd sets --- lib/std/io_uring.zig | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index e1db6d141cee..099b368abd4d 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -741,9 +741,22 @@ test "queue_readv" { const fd = try os.openZ("/dev/zero", os.O_RDONLY | os.O_CLOEXEC, 0); defer os.close(fd); + // Linux Kernel 5.4 supports IORING_REGISTER_FILES but not sparse fd sets (i.e. an fd of -1). + // Linux Kernel 5.5 adds support for sparse fd sets. + // Compare: + // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs + // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691 + // We therefore avoid stressing sparse fd sets here: + var registered_fds = [_]i32{0} ** 1; + const fd_index = 0; + registered_fds[fd_index] = fd; + try ring.register_files(registered_fds[0..]); + var buffer = [_]u8{42} ** 128; var iovecs = [_]os.iovec{ os.iovec { .iov_base = &buffer, .iov_len = buffer.len } }; - var sqe = try ring.queue_readv(0xcccccccc, fd, iovecs[0..], 0); + var sqe = try ring.queue_readv(0xcccccccc, fd_index, iovecs[0..], 0); + ring.use_registered_fd(sqe); + testing.expectEqual(@as(u8, linux.IOSQE_FIXED_FILE), sqe.flags); testing.expectError(error.IO_UringSubmissionQueueFull, ring.queue_nop(0)); testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -753,6 +766,8 @@ test "queue_readv" { .flags = 0, }, try ring.copy_cqe()); testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + + try ring.unregister_files(); } test "queue_writev/queue_fsync" { From 09f2f4aeb366279f30256a5968a246369fafcc27 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 15:14:47 +0200 Subject: [PATCH 12/50] Fix std @import --- lib/std/io_uring.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/io_uring.zig b/lib/std/io_uring.zig index 099b368abd4d..ee6618d59f05 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/io_uring.zig @@ -3,7 +3,7 @@ // This file is part of [zig](https://ziglang.org/), which is MIT licensed. // The MIT license requires this copyright notice to be included in all copies // and substantial portions of the software. 
-const std = @import("std"); +const std = @import("std.zig"); const assert = std.debug.assert; const builtin = std.builtin; const os = std.os; From 873d1c80b3a34dc610946fb31de9dd88dd311d35 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 16:15:17 +0200 Subject: [PATCH 13/50] Add splice_fd_in to io_uring_sqe and future-proof for anonymous unions --- lib/std/os/bits/linux.zig | 53 +++++++++------------------------------ 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/lib/std/os/bits/linux.zig b/lib/std/os/bits/linux.zig index c5f272c0428a..0008ce6cda22 100644 --- a/lib/std/os/bits/linux.zig +++ b/lib/std/os/bits/linux.zig @@ -1265,48 +1265,19 @@ pub const io_cqring_offsets = extern struct { }; pub const io_uring_sqe = extern struct { - pub const union1 = extern union { - off: u64, - addr2: u64, - }; - - pub const union2 = extern union { - rw_flags: kernel_rwf, - fsync_flags: u32, - poll_events: u16, - sync_range_flags: u32, - msg_flags: u32, - timeout_flags: u32, - accept_flags: u32, - cancel_flags: u32, - open_flags: u32, - statx_flags: u32, - fadvise_flags: u32, - }; - - pub const union3 = extern union { - struct1: extern struct { - /// index into fixed buffers, if used - buf_index: u16, - - /// personality to use, if used - personality: u16, - }, - __pad2: [3]u64, - }; opcode: IORING_OP, - flags: u8, - ioprio: u16, - fd: i32, - - union1: union1, - addr: u64, - len: u32, - - union2: union2, - user_data: u64, - - union3: union3, + flags: u8 = 0, + ioprio: u16 = 0, + fd: i32 = 0, + off: u64 = 0, + addr: u64 = 0, + len: u32 = 0, + opflags: u32 = 0, + user_data: u64 = 0, + buf_index: u16 = 0, + personality: u16 = 0, + splice_fd_in: i32 = 0, + __pad2: [2]u64 = [2]u64{ 0, 0 } }; pub const IOSQE_BIT = extern enum(u8) { From 31533eb74300ad934d3fca11ffebd86fe67a31ba Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 16:18:04 +0200 Subject: [PATCH 14/50] Move to std/os/linux --- lib/std/os/linux.zig | 1 + lib/std/{ => os/linux}/io_uring.zig | 68 ++++++----------------------- 2 files changed, 14 insertions(+), 55 deletions(-) rename lib/std/{ => os/linux}/io_uring.zig (93%) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 50d1e4ae7867..0fe55528f729 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -31,6 +31,7 @@ pub usingnamespace switch (builtin.arch) { pub usingnamespace @import("bits.zig"); pub const tls = @import("linux/tls.zig"); pub const BPF = @import("linux/bpf.zig"); +pub usingnamespace @import("linux/io_uring.zig"); /// Set by startup code, used by `getauxval`. pub var elf_aux_maybe: ?[*]std.elf.Auxv = null; diff --git a/lib/std/io_uring.zig b/lib/std/os/linux/io_uring.zig similarity index 93% rename from lib/std/io_uring.zig rename to lib/std/os/linux/io_uring.zig index ee6618d59f05..409c954d6f27 100644 --- a/lib/std/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -3,7 +3,7 @@ // This file is part of [zig](https://ziglang.org/), which is MIT licensed. // The MIT license requires this copyright notice to be included in all copies // and substantial portions of the software. 
-const std = @import("std.zig"); +const std = @import("../../std.zig"); const assert = std.debug.assert; const builtin = std.builtin; const os = std.os; @@ -12,41 +12,9 @@ const mem = std.mem; const net = std.net; const testing = std.testing; -pub const io_uring_params = linux.io_uring_params; -pub const io_uring_cqe = linux.io_uring_cqe; - -// TODO Update linux.zig's definition of linux.io_uring_sqe: -// linux.io_uring_sqe uses numbered unions, i.e. `union1` etc. that are not future-proof and need to -// be re-numbered whenever new unions are interposed by the kernel. Furthermore, Zig's unions do not -// support assignment by any union member directly as in C, without going through the union, so the -// kernel adding new unions would also break existing Zig code. -// We therefore use a flat struct without unions to avoid these two issues. -// Pending https://github.com/ziglang/zig/issues/6349. -pub const io_uring_sqe = extern struct { - opcode: linux.IORING_OP, - flags: u8 = 0, - ioprio: u16 = 0, - fd: i32 = 0, - off: u64 = 0, - addr: u64 = 0, - len: u32 = 0, - opflags: u32 = 0, - user_data: u64 = 0, - buffer: u16 = 0, - personality: u16 = 0, - splice_fd_in: i32 = 0, - options: [2]u64 = [2]u64{ 0, 0 } -}; - -comptime { - assert(@sizeOf(io_uring_params) == 120); - assert(@sizeOf(io_uring_sqe) == 64); - assert(@sizeOf(io_uring_cqe) == 16); - - assert(linux.IORING_OFF_SQ_RING == 0); - assert(linux.IORING_OFF_CQ_RING == 0x8000000); - assert(linux.IORING_OFF_SQES == 0x10000000); -} +const io_uring_params = linux.io_uring_params; +const io_uring_sqe = linux.io_uring_sqe; +const io_uring_cqe = linux.io_uring_cqe; pub const IO_Uring = struct { fd: i32 = -1, @@ -92,7 +60,6 @@ pub const IO_Uring = struct { assert(p.resv[0] == 0); assert(p.resv[1] == 0); assert(p.resv[2] == 0); - if (!supported) return error.IO_UringKernelNotSupported; const res = linux.io_uring_setup(entries, p); try check_errno(res); @@ -110,7 +77,7 @@ pub const IO_Uring = struct { // We do not support the double mmap() done before 5.4, because we want to keep the // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4. if ((p.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) { - return error.IO_UringKernelNotSupported; + return error.UnsupportedKernel; } // Check that the kernel has actually set params and that "impossible is nothing". @@ -312,7 +279,7 @@ pub const IO_Uring = struct { // Matches the implementation of cq_ring_needs_flush() in liburing. fn cq_ring_needs_flush(self: *IO_Uring) bool { - return (@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_CQ_OVERFLOW) != 0; + return (@atomicLoad(u32, self.sq.flags, .Unordered) & IORING_SQ_CQ_OVERFLOW) != 0; } /// For advanced use cases only that implement custom completion queue methods. @@ -343,7 +310,6 @@ pub const IO_Uring = struct { addrlen: *os.socklen_t, accept_flags: u32 ) !*io_uring_sqe { - if (!can_accept) return error.IO_UringKernelCannotAccept; // "sqe->fd is the file descriptor, sqe->addr holds a pointer to struct sockaddr, // sqe->addr2 holds a pointer to socklen_t, and finally sqe->accept_flags holds the flags // for accept(4)." 
- https://lwn.net/ml/linux-block/20191025173037.13486-1-axboe@kernel.dk/ @@ -401,7 +367,6 @@ pub const IO_Uring = struct { buffer: []u8, offset: u64 ) !*io_uring_sqe { - if (!can_read) return error.IO_UringKernelCannotRead; const sqe = try self.get_sqe(); sqe.* = .{ .opcode = .READ, @@ -423,7 +388,6 @@ pub const IO_Uring = struct { buffer: []const u8, offset: u64 ) !*io_uring_sqe { - if (!can_write) return error.IO_UringKernelCannotWrite; const sqe = try self.get_sqe(); sqe.* = .{ .opcode = .WRITE, @@ -662,15 +626,8 @@ inline fn check_errno(res: usize) !void { if (errno != 0) return os.unexpectedErrno(errno); } -const minimum = std.Target.current.os.isAtLeast; -pub const can_single_mmap = comptime minimum(.linux, .{ .major = 5, .minor = 4 }) == true; -pub const can_accept = comptime minimum(.linux, .{ .major = 5, .minor = 5 }) == true; -pub const can_read = comptime minimum(.linux, .{ .major = 5, .minor = 6 }) == true; -pub const can_write = comptime minimum(.linux, .{ .major = 5, .minor = 6 }) == true; -pub const supported = can_single_mmap; - test "queue_nop" { - if (!supported) return error.SkipZigTest; + if (builtin.os.tag != .linux) return error.SkipZigTest; var ring = try IO_Uring.init(1, 0); defer { @@ -689,10 +646,10 @@ test "queue_nop" { .len = 0, .opflags = 0, .user_data = @intCast(u64, 0xaaaaaaaa), - .buffer = 0, + .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .options = [2]u64{ 0, 0 } + .__pad2 = [2]u64{ 0, 0 } }, sqe.*); testing.expectEqual(@as(u32, 0), ring.sq.sqe_head); @@ -733,7 +690,7 @@ test "queue_nop" { } test "queue_readv" { - if (!supported) return error.SkipZigTest; + if (builtin.os.tag != .linux) return error.SkipZigTest; var ring = try IO_Uring.init(1, 0); defer ring.deinit(); @@ -771,7 +728,7 @@ test "queue_readv" { } test "queue_writev/queue_fsync" { - if (!supported) return error.SkipZigTest; + if (builtin.os.tag != .linux) return error.SkipZigTest; var ring = try IO_Uring.init(2, 0); defer ring.deinit(); @@ -812,7 +769,8 @@ test "queue_writev/queue_fsync" { } test "queue_write/queue_read" { - if (!can_read or !can_write) return error.SkipZigTest; + // TODO + if (builtin.os.tag != .linux or true) return error.SkipZigTest; var ring = try IO_Uring.init(2, 0); defer ring.deinit(); From cb591285d74d9333576b746d41f8c6288701fb96 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 16:20:21 +0200 Subject: [PATCH 15/50] Use linux.IORING_SQ_CQ_OVERFLOW --- lib/std/os/linux/io_uring.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 409c954d6f27..6f6c254b38db 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -279,7 +279,7 @@ pub const IO_Uring = struct { // Matches the implementation of cq_ring_needs_flush() in liburing. fn cq_ring_needs_flush(self: *IO_Uring) bool { - return (@atomicLoad(u32, self.sq.flags, .Unordered) & IORING_SQ_CQ_OVERFLOW) != 0; + return (@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_CQ_OVERFLOW) != 0; } /// For advanced use cases only that implement custom completion queue methods. 
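Between these patches, a smoke-test sketch of the API as it now stands (illustrative only, not part of any patch; assumes `IO_Uring.init`, `queue_nop`, `submit` and `copy_cqe` as defined in the file at this point):

```zig
const std = @import("std");
const linux = std.os.linux;
const testing = std.testing;

test "sketch: nop round trip" {
    var ring = linux.IO_Uring.init(1, 0) catch |err| switch (err) {
        // Kernels before 5.4 lack IORING_FEAT_SINGLE_MMAP and are rejected:
        error.UnsupportedKernel => return error.SkipZigTest,
        else => return err,
    };
    defer ring.deinit();

    _ = try ring.queue_nop(0xdeadbeef); // user_data round-trips via the CQE
    testing.expectEqual(@as(u32, 1), try ring.submit());

    const cqe = try ring.copy_cqe(); // waits for at least one completion
    testing.expectEqual(@as(u64, 0xdeadbeef), cqe.user_data);
    testing.expectEqual(@as(i32, 0), cqe.res);
}
```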
From 0d8c6a960f32627738d61c8168f11fa85b8100b8 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 16:45:52 +0200 Subject: [PATCH 16/50] Remove dangling @import from std --- lib/std/std.zig | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/std/std.zig b/lib/std/std.zig index c706c469b73c..4236b292983b 100644 --- a/lib/std/std.zig +++ b/lib/std/std.zig @@ -65,7 +65,6 @@ pub const hash_map = @import("hash_map.zig"); pub const heap = @import("heap.zig"); pub const http = @import("http.zig"); pub const io = @import("io.zig"); -pub const io_uring = @import("io_uring.zig"); pub const json = @import("json.zig"); pub const log = @import("log.zig"); pub const macho = @import("macho.zig"); From 9fabae2a28d6579de6d13dfc46c75f5df8a67335 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 16:47:05 +0200 Subject: [PATCH 17/50] Return error.UnsupportedKernel for ENOSYS --- lib/std/os/linux/io_uring.zig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 6f6c254b38db..1e0f9fe183af 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -623,7 +623,10 @@ pub const CompletionQueue = struct { inline fn check_errno(res: usize) !void { const errno = linux.getErrno(res); - if (errno != 0) return os.unexpectedErrno(errno); + if (errno != 0) { + if (errno == linux.ENOSYS) return error.UnsupportedKernel; + return os.unexpectedErrno(errno); + } } test "queue_nop" { From c1f9d10b6a0cb530ee1088a24193c322e255c23b Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 16:51:35 +0200 Subject: [PATCH 18/50] Remove unused import aliases --- lib/std/os/linux/io_uring.zig | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 1e0f9fe183af..30b5981dbba9 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -8,8 +8,6 @@ const assert = std.debug.assert; const builtin = std.builtin; const os = std.os; const linux = os.linux; -const mem = std.mem; -const net = std.net; const testing = std.testing; const io_uring_params = linux.io_uring_params; From 64ae9a6a870fc6d8295118f2d255ee661942dc78 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 16:54:44 +0200 Subject: [PATCH 19/50] Rename to error.SubmissionQueueFull --- lib/std/os/linux/io_uring.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 30b5981dbba9..805646b3e971 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -136,7 +136,7 @@ pub const IO_Uring = struct { // Remember that these head and tail offsets wrap around every four billion operations. // We must therefore use wrapping addition and subtraction to avoid a runtime crash. 
const next = self.sq.sqe_tail +% 1; - if (next -% head > self.sq.sqes.len) return error.IO_UringSubmissionQueueFull; + if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull; var sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask.*]; self.sq.sqe_tail = next; return sqe; @@ -716,7 +716,7 @@ test "queue_readv" { ring.use_registered_fd(sqe); testing.expectEqual(@as(u8, linux.IOSQE_FIXED_FILE), sqe.flags); - testing.expectError(error.IO_UringSubmissionQueueFull, ring.queue_nop(0)); + testing.expectError(error.SubmissionQueueFull, ring.queue_nop(0)); testing.expectEqual(@as(u32, 1), try ring.submit()); testing.expectEqual(linux.io_uring_cqe { .user_data = 0xcccccccc, From f4df2f091ab528dc7e1e91d8d60ebf3fd1bfa377 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 17:31:39 +0200 Subject: [PATCH 20/50] Allow the kernel to drive feature detection --- lib/std/os/linux/io_uring.zig | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 805646b3e971..4499400677a4 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -630,7 +630,10 @@ inline fn check_errno(res: usize) !void { test "queue_nop" { if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = try IO_Uring.init(1, 0); + var ring = IO_Uring.init(1, 0) catch |err| { + if (err == error.UnsupportedKernel) return error.SkipZigTest; + return err; + }; defer { ring.deinit(); testing.expectEqual(@as(i32, -1), ring.fd); @@ -693,7 +696,10 @@ test "queue_nop" { test "queue_readv" { if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = try IO_Uring.init(1, 0); + var ring = IO_Uring.init(1, 0) catch |err| { + if (err == error.UnsupportedKernel) return error.SkipZigTest; + return err; + }; defer ring.deinit(); const fd = try os.openZ("/dev/zero", os.O_RDONLY | os.O_CLOEXEC, 0); @@ -731,7 +737,10 @@ test "queue_readv" { test "queue_writev/queue_fsync" { if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = try IO_Uring.init(2, 0); + var ring = IO_Uring.init(2, 0) catch |err| { + if (err == error.UnsupportedKernel) return error.SkipZigTest; + return err; + }; defer ring.deinit(); const path = "test_io_uring_queue_writev"; @@ -770,10 +779,12 @@ test "queue_writev/queue_fsync" { } test "queue_write/queue_read" { - // TODO - if (builtin.os.tag != .linux or true) return error.SkipZigTest; + if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = try IO_Uring.init(2, 0); + var ring = IO_Uring.init(2, 0) catch |err| { + if (err == error.UnsupportedKernel) return error.SkipZigTest; + return err; + }; defer ring.deinit(); const path = "test_io_uring_queue_write"; @@ -788,15 +799,20 @@ test "queue_write/queue_read" { ring.link_with_next_sqe(sqe_write); var sqe_read = try ring.queue_read(456, fd, buffer_read[0..], 10); testing.expectEqual(@as(u32, 2), try ring.submit()); + + var cqe1 = try ring.copy_cqe(); + var cqe2 = try ring.copy_cqe(); + if (cqe1.res == -linux.EOPNOTSUPP) return error.SkipZigTest; + if (cqe2.res == -linux.EOPNOTSUPP) return error.SkipZigTest; testing.expectEqual(linux.io_uring_cqe { .user_data = 123, .res = buffer_write.len, .flags = 0, - }, try ring.copy_cqe()); + }, cqe1); testing.expectEqual(linux.io_uring_cqe { .user_data = 456, .res = buffer_read.len, .flags = 0, - }, try ring.copy_cqe()); + }, cqe2); testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } From e7ae6f2fadcafc7a4f0eea96815b7d22049dd2c5 
Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 18:14:41 +0200 Subject: [PATCH 21/50] Remove default values from io_uring_sqe struct --- lib/std/os/bits/linux.zig | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/std/os/bits/linux.zig b/lib/std/os/bits/linux.zig index 0008ce6cda22..703c08fc4acc 100644 --- a/lib/std/os/bits/linux.zig +++ b/lib/std/os/bits/linux.zig @@ -1266,18 +1266,18 @@ pub const io_cqring_offsets = extern struct { pub const io_uring_sqe = extern struct { opcode: IORING_OP, - flags: u8 = 0, - ioprio: u16 = 0, - fd: i32 = 0, - off: u64 = 0, - addr: u64 = 0, - len: u32 = 0, - opflags: u32 = 0, - user_data: u64 = 0, - buf_index: u16 = 0, - personality: u16 = 0, - splice_fd_in: i32 = 0, - __pad2: [2]u64 = [2]u64{ 0, 0 } + flags: u8, + ioprio: u16, + fd: i32, + off: u64, + addr: u64, + len: u32, + opflags: u32, + user_data: u64, + buf_index: u16, + personality: u16, + splice_fd_in: i32, + __pad2: [2]u64 }; pub const IOSQE_BIT = extern enum(u8) { From ba18420b277a8de21a760387767dcf2c9227cc5a Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 18:17:06 +0200 Subject: [PATCH 22/50] Zero the SQE slot and assign, instead of initializing with default values --- lib/std/os/linux/io_uring.zig | 90 +++++++++++++++-------------------- 1 file changed, 39 insertions(+), 51 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 4499400677a4..82c7dd7cc42c 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -124,13 +124,13 @@ pub const IO_Uring = struct { self.fd = -1; } - /// Returns a vacant SQE, or an error if the submission queue is full. + /// Returns a pointer to a zeroed SQE, or an error if the submission queue is full. /// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly. /// However, instead of a null we return an error to force safe handling. /// Any situation where the submission queue is full tends more towards a control flow error, /// and the null return in liburing is more a C idiom than anything else, for lack of a better /// alternative. In Zig, we have first-class error handling... so let's use it. - /// Matches the implementation of io_uring_get_sqe() in liburing. + /// Matches the implementation of io_uring_get_sqe() in liburing, except zeroes for safety. pub fn get_sqe(self: *IO_Uring) !*io_uring_sqe { const head = @atomicLoad(u32, self.sq.head, .Acquire); // Remember that these head and tail offsets wrap around every four billion operations. @@ -139,6 +139,8 @@ pub const IO_Uring = struct { if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull; var sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask.*]; self.sq.sqe_tail = next; + // We zero the SQE slot here in a single place, rather than in many `queue_` methods. + @memset(@ptrCast([*]u8, sqe), 0, @sizeOf(io_uring_sqe)); return sqe; } @@ -312,14 +314,12 @@ pub const IO_Uring = struct { // sqe->addr2 holds a pointer to socklen_t, and finally sqe->accept_flags holds the flags // for accept(4)." - https://lwn.net/ml/linux-block/20191025173037.13486-1-axboe@kernel.dk/ const sqe = try self.get_sqe(); - sqe.* = .{ - .opcode = .ACCEPT, - .fd = fd, - .off = @ptrToInt(addrlen), // `addr2` is a newer union member that maps to `off`. 
- .addr = @ptrToInt(addr), - .user_data = user_data, - .opflags = accept_flags - }; + sqe.opcode = .ACCEPT; + sqe.fd = fd; + sqe.off = @ptrToInt(addrlen); // `addr2` is a newer union member that maps to `off`. + sqe.addr = @ptrToInt(addr); + sqe.user_data = user_data; + sqe.opflags = accept_flags; return sqe; } @@ -334,11 +334,9 @@ pub const IO_Uring = struct { /// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. pub fn queue_fsync(self: *IO_Uring, user_data: u64, fd: os.fd_t) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.* = .{ - .opcode = .FSYNC, - .fd = fd, - .user_data = user_data - }; + sqe.opcode = .FSYNC; + sqe.fd = fd; + sqe.user_data = user_data; return sqe; } @@ -349,10 +347,8 @@ pub const IO_Uring = struct { /// know when the ring is idle before acting on a kill signal. pub fn queue_nop(self: *IO_Uring, user_data: u64) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.* = .{ - .opcode = .NOP, - .user_data = user_data - }; + sqe.opcode = .NOP; + sqe.user_data = user_data; return sqe; } @@ -366,14 +362,12 @@ pub const IO_Uring = struct { offset: u64 ) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.* = .{ - .opcode = .READ, - .fd = fd, - .off = offset, - .addr = @ptrToInt(buffer.ptr), - .len = @truncate(u32, buffer.len), - .user_data = user_data - }; + sqe.opcode = .READ; + sqe.fd = fd; + sqe.off = offset; + sqe.addr = @ptrToInt(buffer.ptr); + sqe.len = @truncate(u32, buffer.len); + sqe.user_data = user_data; return sqe; } @@ -387,14 +381,12 @@ pub const IO_Uring = struct { offset: u64 ) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.* = .{ - .opcode = .WRITE, - .fd = fd, - .off = offset, - .addr = @ptrToInt(buffer.ptr), - .len = @truncate(u32, buffer.len), - .user_data = user_data - }; + sqe.opcode = .WRITE; + sqe.fd = fd; + sqe.off = offset; + sqe.addr = @ptrToInt(buffer.ptr); + sqe.len = @truncate(u32, buffer.len); + sqe.user_data = user_data; return sqe; } @@ -410,14 +402,12 @@ pub const IO_Uring = struct { offset: u64 ) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.* = .{ - .opcode = .READV, - .fd = fd, - .off = offset, - .addr = @ptrToInt(iovecs.ptr), - .len = @truncate(u32, iovecs.len), - .user_data = user_data - }; + sqe.opcode = .READV; + sqe.fd = fd; + sqe.off = offset; + sqe.addr = @ptrToInt(iovecs.ptr); + sqe.len = @truncate(u32, iovecs.len); + sqe.user_data = user_data; return sqe; } @@ -433,14 +423,12 @@ pub const IO_Uring = struct { offset: u64 ) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.* = .{ - .opcode = .WRITEV, - .fd = fd, - .off = offset, - .addr = @ptrToInt(iovecs.ptr), - .len = @truncate(u32, iovecs.len), - .user_data = user_data - }; + sqe.opcode = .WRITEV; + sqe.fd = fd; + sqe.off = offset; + sqe.addr = @ptrToInt(iovecs.ptr); + sqe.len = @truncate(u32, iovecs.len); + sqe.user_data = user_data; return sqe; } From 92407bfcd7b65f153b7523d97ac5f69193561d27 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 18:29:50 +0200 Subject: [PATCH 23/50] Upgrade check_errno() to an exhaustive switch (safer) --- lib/std/os/linux/io_uring.zig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 82c7dd7cc42c..2c17614b04d1 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -608,10 +608,10 @@ pub const CompletionQueue = struct { }; inline fn check_errno(res: usize) !void { - const errno = linux.getErrno(res); - if (errno != 0) { - if (errno == 
linux.ENOSYS) return error.UnsupportedKernel; - return os.unexpectedErrno(errno); + switch (linux.getErrno(res)) { + 0 => return, + linux.ENOSYS => return error.UnsupportedKernel, + else => |errno| return os.unexpectedErrno(errno) } } From 4bc1b7a7ac99d57619b4f9a84e159310820e83ff Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 19 Sep 2020 18:50:24 +0200 Subject: [PATCH 24/50] Fix io_uring_sqe to use the names of the first member of each union Now we're really future-proof... no more `opflags` creeping in. When anonymous unions land, we can start using `accept_flags` etc. Until then, code using this struct won't break when the kernel adds features. Refs: https://github.com/ziglang/zig/issues/6349 Refs: https://github.com/ziglang/zig/issues/985 --- lib/std/os/bits/linux.zig | 2 +- lib/std/os/linux/io_uring.zig | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/std/os/bits/linux.zig b/lib/std/os/bits/linux.zig index 703c08fc4acc..9c0a29a652e2 100644 --- a/lib/std/os/bits/linux.zig +++ b/lib/std/os/bits/linux.zig @@ -1272,7 +1272,7 @@ pub const io_uring_sqe = extern struct { off: u64, addr: u64, len: u32, - opflags: u32, + rw_flags: u32, user_data: u64, buf_index: u16, personality: u16, diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 2c17614b04d1..9a07654c6d17 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -319,13 +319,13 @@ pub const IO_Uring = struct { sqe.off = @ptrToInt(addrlen); // `addr2` is a newer union member that maps to `off`. sqe.addr = @ptrToInt(addr); sqe.user_data = user_data; - sqe.opflags = accept_flags; + sqe.rw_flags = accept_flags; return sqe; } /// Queues (but does not submit) an SQE to perform an `fsync(2)`. /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - /// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `opflags`. + /// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`. /// N.B. While SQEs are initiated in the order in which they appear in the submission queue, /// operations execute in parallel and completions are unordered. Therefore, an application that /// submits a write followed by an fsync in the submission queue cannot expect the fsync to @@ -392,7 +392,7 @@ pub const IO_Uring = struct { /// Queues (but does not submit) an SQE to perform a `preadv()`. /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - /// For example, if you want to do a `preadv2()` then set `opflags` on the returned SQE. + /// For example, if you want to do a `preadv2()` then set `rw_flags` on the returned SQE. /// See https://linux.die.net/man/2/preadv. pub fn queue_readv( self: *IO_Uring, @@ -413,7 +413,7 @@ pub const IO_Uring = struct { /// Queues (but does not submit) an SQE to perform a `pwritev()`. /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. - /// For example, if you want to do a `pwritev2()` then set `opflags` on the returned SQE. + /// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE. /// See https://linux.die.net/man/2/pwritev. 
pub fn queue_writev( self: *IO_Uring, @@ -636,7 +636,7 @@ test "queue_nop" { .off = 0, .addr = 0, .len = 0, - .opflags = 0, + .rw_flags = 0, .user_data = @intCast(u64, 0xaaaaaaaa), .buf_index = 0, .personality = 0, From abebacda322074c040778aaca5347c8cd714362e Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 20 Sep 2020 14:21:44 +0200 Subject: [PATCH 25/50] Handle all possible syscall errors and bring errors in line with os.zig --- lib/std/os/linux/io_uring.zig | 107 ++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 25 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 9a07654c6d17..ff06a097fda2 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -60,7 +60,22 @@ pub const IO_Uring = struct { assert(p.resv[2] == 0); const res = linux.io_uring_setup(entries, p); - try check_errno(res); + switch (linux.getErrno(res)) { + 0 => {}, + linux.EFAULT => return error.ParamsOutsideAccessibleAddressSpace, + // The resv array contains non-zero data, p.flags contains an unsupported flag, + // entries out of bounds, IORING_SETUP_SQ_AFF was specified without IORING_SETUP_SQPOLL, + // or IORING_SETUP_CQSIZE was specified but io_uring_params.cq_entries was invalid: + linux.EINVAL => return error.ArgumentsInvalid, + linux.EMFILE => return error.ProcessFdQuotaExceeded, + linux.ENFILE => return error.SystemFdQuotaExceeded, + linux.ENOMEM => return error.SystemResources, + // IORING_SETUP_SQPOLL was specified but effective user ID lacks sufficient privileges, + // or a container seccomp policy prohibits io_uring syscalls: + linux.EPERM => return error.PermissionDenied, + linux.ENOSYS => return error.SystemOutdated, + else => |errno| return os.unexpectedErrno(errno) + } const fd = @intCast(i32, res); assert(fd >= 0); errdefer os.close(fd); @@ -75,7 +90,7 @@ pub const IO_Uring = struct { // We do not support the double mmap() done before 5.4, because we want to keep the // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4. if ((p.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) { - return error.UnsupportedKernel; + return error.SystemOutdated; } // Check that the kernel has actually set params and that "impossible is nothing". @@ -172,7 +187,31 @@ pub const IO_Uring = struct { fn enter(self: *IO_Uring, to_submit: u32, min_complete: u32, flags: u32) !u32 { assert(self.fd >= 0); const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); - try check_errno(res); + switch (linux.getErrno(res)) { + 0 => {}, + // The kernel was unable to allocate memory or ran out of resources for the request. + // The application should wait for some completions and try again: + linux.EAGAIN => return error.SystemResources, + // The application attempted to overcommit the number of requests it can have pending. 
+ // The application should wait for some completions and try again: + linux.EBUSY => return error.CompletionQueueOvercommitted, + // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered: + linux.EBADF => return error.FileDescriptorInvalid, + // The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED + // or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range + // described by `addr` and `len` is not within the buffer registered at `buf_index`: + linux.EFAULT => return error.BufferInvalid, + // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL: + linux.EINVAL => return error.SubmissionQueueEntryInvalid, + linux.ENXIO => return error.RingShuttingDown, + // The kernel believes our `self.fd` does not refer to an io_uring instance, + // or the opcode is valid but not supported by this kernel (more likely): + linux.EOPNOTSUPP => return error.OpcodeNotSupported, + // The operation was interrupted by a delivery of a signal before it could complete. + // This can happen while waiting for events with IORING_ENTER_GETEVENTS: + linux.EINTR => return error.SignalInterrupt, + else => |errno| return os.unexpectedErrno(errno) + } return @truncate(u32, res); } @@ -479,7 +518,25 @@ pub const IO_Uring = struct { @ptrCast(*const c_void, fds.ptr), @truncate(u32, fds.len) ); - try check_errno(res); + switch (linux.getErrno(res)) { + 0 => {}, + // One or more fds in the array are invalid, or the kernel does not support sparse sets: + linux.EBADF => return error.FileDescriptorInvalid, + linux.EBUSY => return error.FilesAlreadyRegistered, + linux.EINVAL => return error.FilesEmpty, + // Adding `nr_args` file references would exceed the maximum allowed number of files the + // user is allowed to have according to the per-user RLIMIT_NOFILE resource limit and + // the CAP_SYS_RESOURCE capability is not set, or `nr_args` exceeds the maximum allowed + // for a fixed file set (older kernels have a limit of 1024 files vs 64K files): + linux.EMFILE => return error.UserFdQuotaExceeded, + // Insufficient kernel resources, or the caller had a non-zero RLIMIT_MEMLOCK soft + // resource limit but tried to lock more memory than the limit permitted (not enforced + // when the process is privileged with CAP_IPC_LOCK): + linux.ENOMEM => return error.SystemResources, + // Attempt to register files on a ring already registering files or being torn down: + linux.ENXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles, + else => |errno| return os.unexpectedErrno(errno) + } } /// Changes the semantics of the SQE's `fd` to refer to a pre-registered file descriptor. 
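As a usage sketch of the fixed-file path this hunk guards (illustrative only; it mirrors the registered-fd test elsewhere in this file, and the next patch widens `[]const i32` to `[]const os.fd_t`):

```zig
/// Illustrative only: read via a registered ("fixed") file descriptor.
fn read_fixed(ring: *IO_Uring, fd: i32, iovecs: []const os.iovec) !void {
    var registered_fds = [_]i32{fd};
    try ring.register_files(registered_fds[0..]);

    // With IOSQE_FIXED_FILE set, the SQE's `fd` is an index into the
    // registered set rather than a raw file descriptor:
    const fd_index = 0;
    var sqe = try ring.queue_readv(0xcccccccc, fd_index, iovecs, 0);
    ring.use_registered_fd(sqe); // sets linux.IOSQE_FIXED_FILE

    _ = try ring.submit();
    try ring.unregister_files();
}
```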
@@ -491,7 +548,11 @@ pub const IO_Uring = struct { pub fn unregister_files(self: *IO_Uring) !void { assert(self.fd >= 0); const res = linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0); - try check_errno(res); + switch (linux.getErrno(res)) { + 0 => {}, + linux.ENXIO => return error.FilesNotRegistered, + else => |errno| return os.unexpectedErrno(errno) + } } }; @@ -607,20 +668,13 @@ pub const CompletionQueue = struct { } }; -inline fn check_errno(res: usize) !void { - switch (linux.getErrno(res)) { - 0 => return, - linux.ENOSYS => return error.UnsupportedKernel, - else => |errno| return os.unexpectedErrno(errno) - } -} - test "queue_nop" { if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = IO_Uring.init(1, 0) catch |err| { - if (err == error.UnsupportedKernel) return error.SkipZigTest; - return err; + var ring = IO_Uring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err }; defer { ring.deinit(); @@ -684,9 +738,10 @@ test "queue_nop" { test "queue_readv" { if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = IO_Uring.init(1, 0) catch |err| { - if (err == error.UnsupportedKernel) return error.SkipZigTest; - return err; + var ring = IO_Uring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err }; defer ring.deinit(); @@ -725,9 +780,10 @@ test "queue_readv" { test "queue_writev/queue_fsync" { if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = IO_Uring.init(2, 0) catch |err| { - if (err == error.UnsupportedKernel) return error.SkipZigTest; - return err; + var ring = IO_Uring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err }; defer ring.deinit(); @@ -769,9 +825,10 @@ test "queue_writev/queue_fsync" { test "queue_write/queue_read" { if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = IO_Uring.init(2, 0) catch |err| { - if (err == error.UnsupportedKernel) return error.SkipZigTest; - return err; + var ring = IO_Uring.init(2, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err }; defer ring.deinit(); From b672dc7abfe318623e88afd2e9ccaffbf38eb401 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 20 Sep 2020 14:59:40 +0200 Subject: [PATCH 26/50] Use os.fd_t instead of i32 and assert against c_int for syscall safety --- lib/std/os/linux/io_uring.zig | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index ff06a097fda2..940270316a93 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -15,7 +15,7 @@ const io_uring_sqe = linux.io_uring_sqe; const io_uring_cqe = linux.io_uring_cqe; pub const IO_Uring = struct { - fd: i32 = -1, + fd: os.fd_t = -1, sq: SubmissionQueue, cq: CompletionQueue, flags: u32, @@ -76,7 +76,7 @@ pub const IO_Uring = struct { linux.ENOSYS => return error.SystemOutdated, else => |errno| return os.unexpectedErrno(errno) } - const fd = @intCast(i32, res); + const fd = @intCast(os.fd_t, res); assert(fd >= 0); errdefer os.close(fd); @@ -510,8 +510,9 @@ pub const IO_Uring = struct { /// Registering file descriptors will wait for the ring to 
idle. /// Files are automatically unregistered by the kernel when the ring is torn down. /// An application need unregister only if it wants to register a new array of file descriptors. - pub fn register_files(self: *IO_Uring, fds: []const i32) !void { + pub fn register_files(self: *IO_Uring, fds: []const os.fd_t) !void { assert(self.fd >= 0); + comptime assert(@sizeOf(os.fd_t) == @sizeOf(c_int)); const res = linux.io_uring_register( self.fd, .REGISTER_FILES, @@ -575,7 +576,7 @@ pub const SubmissionQueue = struct { sqe_head: u32 = 0, sqe_tail: u32 = 0, - pub fn init(fd: i32, p: io_uring_params) !SubmissionQueue { + pub fn init(fd: os.fd_t, p: io_uring_params) !SubmissionQueue { assert(fd >= 0); assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); const size = std.math.max( @@ -641,7 +642,7 @@ pub const CompletionQueue = struct { overflow: *u32, cqes: []io_uring_cqe, - pub fn init(fd: i32, p: io_uring_params, sq: SubmissionQueue) !CompletionQueue { + pub fn init(fd: os.fd_t, p: io_uring_params, sq: SubmissionQueue) !CompletionQueue { assert(fd >= 0); assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); const mmap = sq.mmap; @@ -678,7 +679,7 @@ test "queue_nop" { }; defer { ring.deinit(); - testing.expectEqual(@as(i32, -1), ring.fd); + testing.expectEqual(@as(os.fd_t, -1), ring.fd); } var sqe = try ring.queue_nop(@intCast(u64, 0xaaaaaaaa)); @@ -754,7 +755,7 @@ test "queue_readv" { // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691 // We therefore avoid stressing sparse fd sets here: - var registered_fds = [_]i32{0} ** 1; + var registered_fds = [_]os.fd_t{0} ** 1; const fd_index = 0; registered_fds[fd_index] = fd; try ring.register_files(registered_fds[0..]); From f22eea82c4563545844c2f5840e9cc1c9bd31fe0 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 20 Sep 2020 15:33:48 +0200 Subject: [PATCH 27/50] Fix opcode support detection for read/write test --- lib/std/os/linux/io_uring.zig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 940270316a93..d03ccbe2a403 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -848,8 +848,10 @@ test "queue_write/queue_read" { var cqe1 = try ring.copy_cqe(); var cqe2 = try ring.copy_cqe(); - if (cqe1.res == -linux.EOPNOTSUPP) return error.SkipZigTest; - if (cqe2.res == -linux.EOPNOTSUPP) return error.SkipZigTest; + // Prior to Linux Kernel 5.6 this is the only way to test for read/write support: + // https://lwn.net/Articles/809820/ + if (cqe1.res == -linux.EINVAL) return error.SkipZigTest; + if (cqe2.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(linux.io_uring_cqe { .user_data = 123, .res = buffer_write.len, From 40293a0643419b7b52438861215c57b32669656f Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 20 Sep 2020 15:41:22 +0200 Subject: [PATCH 28/50] Add safety checks --- lib/std/os/linux/io_uring.zig | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index d03ccbe2a403..b638442fd929 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -13,6 +13,16 @@ const testing = std.testing; const io_uring_params = linux.io_uring_params; const io_uring_sqe = linux.io_uring_sqe; const io_uring_cqe = linux.io_uring_cqe; + +comptime { + assert(@sizeOf(io_uring_params) == 120); + assert(@sizeOf(io_uring_sqe) 
== 64); + assert(@sizeOf(io_uring_cqe) == 16); + + assert(linux.IORING_OFF_SQ_RING == 0); + assert(linux.IORING_OFF_CQ_RING == 0x8000000); + assert(linux.IORING_OFF_SQES == 0x10000000); +} pub const IO_Uring = struct { fd: os.fd_t = -1, From e51728a1b4b68cbfbd5aae47330a7a43563fce0e Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 20 Sep 2020 15:54:31 +0200 Subject: [PATCH 29/50] Make enter(), flush_sq(), sq_ring_needs_enter(), cq_ring_needs_flush() public These will also be needed by any custom helpers --- lib/std/os/linux/io_uring.zig | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index b638442fd929..6550647aacaf 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -18,7 +18,7 @@ comptime { assert(@sizeOf(io_uring_params) == 120); assert(@sizeOf(io_uring_sqe) == 64); assert(@sizeOf(io_uring_cqe) == 16); - + assert(linux.IORING_OFF_SQ_RING == 0); assert(linux.IORING_OFF_CQ_RING == 0x8000000); assert(linux.IORING_OFF_SQES == 0x10000000); @@ -192,9 +192,9 @@ pub const IO_Uring = struct { return submitted; } - // Tell the kernel we have submitted SQEs and/or want to wait for CQEs. - // Returns the number of SQEs submitted. - fn enter(self: *IO_Uring, to_submit: u32, min_complete: u32, flags: u32) !u32 { + /// Tell the kernel we have submitted SQEs and/or want to wait for CQEs. + /// Returns the number of SQEs submitted. + pub fn enter(self: *IO_Uring, to_submit: u32, min_complete: u32, flags: u32) !u32 { assert(self.fd >= 0); const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); switch (linux.getErrno(res)) { @@ -225,12 +225,12 @@ pub const IO_Uring = struct { return @truncate(u32, res); } - // Sync internal state with kernel ring state on the SQ side. - // Returns the number of all pending events in the SQ ring, for the shared ring. - // This return value includes previously flushed SQEs, as per liburing. - // The reasoning for this is to suggest that an io_uring_enter() call is needed rather than not. - // Matches the implementation of __io_uring_flush_sq() in liburing. - fn flush_sq(self: *IO_Uring) u32 { + /// Sync internal state with kernel ring state on the SQ side. + /// Returns the number of all pending events in the SQ ring, for the shared ring. + /// This return value includes previously flushed SQEs, as per liburing. + /// The rationale is to suggest that an io_uring_enter() call is needed rather than not. + /// Matches the implementation of __io_uring_flush_sq() in liburing. + pub fn flush_sq(self: *IO_Uring) u32 { if (self.sq.sqe_head != self.sq.sqe_tail) { // Fill in SQEs that we have queued up, adding them to the kernel ring. const to_submit = self.sq.sqe_tail -% self.sq.sqe_head; @@ -252,7 +252,7 @@ pub const IO_Uring = struct { /// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened. /// For the latter case, we set the SQ thread wakeup flag. /// Matches the implementation of sq_ring_needs_enter() in liburing. 
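As motivation for making these public, a sketch of the kind of custom helper they enable (an assumption about caller-side composition; it mirrors what `submit()` itself does internally):

```zig
/// Illustrative only: flush queued SQEs to the kernel without waiting for
/// any completions, waking the SQPOLL thread only when needed.
fn submit_no_wait(ring: *IO_Uring) !u32 {
    const submitted = ring.flush_sq();
    var flags: u32 = 0;
    if (ring.sq_ring_needs_enter(submitted, &flags)) {
        return try ring.enter(submitted, 0, flags);
    }
    return submitted;
}
```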
- fn sq_ring_needs_enter(self: *IO_Uring, submitted: u32, flags: *u32) bool { + pub fn sq_ring_needs_enter(self: *IO_Uring, submitted: u32, flags: *u32) bool { assert(flags.* == 0); if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0 and submitted > 0) return true; if ((@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) { @@ -326,8 +326,8 @@ pub const IO_Uring = struct { return cqes[0]; } - // Matches the implementation of cq_ring_needs_flush() in liburing. - fn cq_ring_needs_flush(self: *IO_Uring) bool { + /// Matches the implementation of cq_ring_needs_flush() in liburing. + pub fn cq_ring_needs_flush(self: *IO_Uring) bool { return (@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_CQ_OVERFLOW) != 0; } @@ -567,7 +567,6 @@ pub const IO_Uring = struct { } }; - pub const SubmissionQueue = struct { head: *u32, tail: *u32, From 77903f8d4ec229acfa67f8b55ad0bfb8cd9d111e Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 20 Sep 2020 18:45:44 +0200 Subject: [PATCH 30/50] Test structs and offsets --- lib/std/os/linux/io_uring.zig | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 6550647aacaf..169f061a41d3 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -13,16 +13,6 @@ const testing = std.testing; const io_uring_params = linux.io_uring_params; const io_uring_sqe = linux.io_uring_sqe; const io_uring_cqe = linux.io_uring_cqe; - -comptime { - assert(@sizeOf(io_uring_params) == 120); - assert(@sizeOf(io_uring_sqe) == 64); - assert(@sizeOf(io_uring_cqe) == 16); - - assert(linux.IORING_OFF_SQ_RING == 0); - assert(linux.IORING_OFF_CQ_RING == 0x8000000); - assert(linux.IORING_OFF_SQES == 0x10000000); -} pub const IO_Uring = struct { fd: os.fd_t = -1, @@ -678,6 +668,18 @@ pub const CompletionQueue = struct { } }; +test "structs and offsets" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + testing.expectEqual(@as(usize, 120), @sizeOf(io_uring_params)); + testing.expectEqual(@as(usize, 64), @sizeOf(io_uring_sqe)); + testing.expectEqual(@as(usize, 16), @sizeOf(io_uring_cqe)); + + testing.expectEqual(0, linux.IORING_OFF_SQ_RING); + testing.expectEqual(0x8000000, linux.IORING_OFF_CQ_RING); + testing.expectEqual(0x10000000, linux.IORING_OFF_SQES); +} + test "queue_nop" { if (builtin.os.tag != .linux) return error.SkipZigTest; From a853f004101f22e94e2a18660bfb017a489e7b10 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 20 Sep 2020 19:51:23 +0200 Subject: [PATCH 31/50] Add IORING_FEAT_FAST_POLL --- lib/std/os/bits/linux.zig | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/std/os/bits/linux.zig b/lib/std/os/bits/linux.zig index 9c0a29a652e2..2529abc2c81e 100644 --- a/lib/std/os/bits/linux.zig +++ b/lib/std/os/bits/linux.zig @@ -1199,6 +1199,7 @@ pub const IORING_FEAT_NODROP = 1 << 1; pub const IORING_FEAT_SUBMIT_STABLE = 1 << 2; pub const IORING_FEAT_RW_CUR_POS = 1 << 3; pub const IORING_FEAT_CUR_PERSONALITY = 1 << 4; +pub const IORING_FEAT_FAST_POLL = 1 << 5; // io_uring_params.flags From 843c104fc9cb544367fda2168c6ad45a625cb979 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Mon, 21 Sep 2020 10:39:58 +0200 Subject: [PATCH 32/50] Add io_uring syscalls to os.bits.linux.mips.SYS As per lib/libc/musl/arch/mips/bits/syscall.h.in: ```c ``` --- lib/std/os/bits/linux/mips.zig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/std/os/bits/linux/mips.zig b/lib/std/os/bits/linux/mips.zig index 
4b81c6e622ac..f3e590fcea7a 100644 --- a/lib/std/os/bits/linux/mips.zig +++ b/lib/std/os/bits/linux/mips.zig @@ -383,6 +383,9 @@ pub const SYS = extern enum(usize) { statx = Linux + 366, rseq = Linux + 367, io_pgetevents = Linux + 368, + io_uring_setup = Linux + 425, + io_uring_enter = Linux + 426, + io_uring_register = Linux + 427, openat2 = Linux + 437, pidfd_getfd = Linux + 438, From 575ed941d7b53e4643ab803e5500c96e0f4698b3 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Mon, 21 Sep 2020 11:07:00 +0200 Subject: [PATCH 33/50] Cache mask instead of dereferencing mask pointer --- lib/std/os/linux/io_uring.zig | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 169f061a41d3..be9fec1bece5 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -108,7 +108,7 @@ pub const IO_Uring = struct { // Check that our starting state is as we expect. assert(sq.head.* == 0); assert(sq.tail.* == 0); - assert(sq.mask.* == p.sq_entries - 1); + assert(sq.mask == p.sq_entries - 1); // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time. assert(sq.dropped.* == 0); assert(sq.array.len == p.sq_entries); @@ -118,7 +118,7 @@ pub const IO_Uring = struct { assert(cq.head.* == 0); assert(cq.tail.* == 0); - assert(cq.mask.* == p.cq_entries - 1); + assert(cq.mask == p.cq_entries - 1); assert(cq.overflow.* == 0); assert(cq.cqes.len == p.cq_entries); @@ -152,7 +152,7 @@ pub const IO_Uring = struct { // We must therefore use wrapping addition and subtraction to avoid a runtime crash. const next = self.sq.sqe_tail +% 1; if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull; - var sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask.*]; + var sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask]; self.sq.sqe_tail = next; // We zero the SQE slot here in a single place, rather than in many `queue_` methods. @memset(@ptrCast([*]u8, sqe), 0, @sizeOf(io_uring_sqe)); @@ -224,11 +224,10 @@ pub const IO_Uring = struct { if (self.sq.sqe_head != self.sq.sqe_tail) { // Fill in SQEs that we have queued up, adding them to the kernel ring. const to_submit = self.sq.sqe_tail -% self.sq.sqe_head; - const mask = self.sq.mask.*; var tail = self.sq.tail.*; var i: usize = 0; while (i < to_submit) : (i += 1) { - self.sq.array[tail & mask] = self.sq.sqe_head & mask; + self.sq.array[tail & self.sq.mask] = self.sq.sqe_head & self.sq.mask; tail +%= 1; self.sq.sqe_head +%= 1; } @@ -292,14 +291,13 @@ pub const IO_Uring = struct { fn copy_cqes_ready(self: *IO_Uring, cqes: []io_uring_cqe, wait_nr: u32) u32 { const ready = self.cq_ready(); const count = std.math.min(cqes.len, ready); - const mask = self.cq.mask.*; var head = self.cq.head.*; var tail = head +% count; // TODO Optimize this by using 1 or 2 memcpy's (if the tail wraps) rather than a loop. var i: usize = 0; // Do not use "less-than" operator since head and tail may wrap: while (head != tail) { - cqes[i] = self.cq.cqes[head & mask]; // Copy struct by value. + cqes[i] = self.cq.cqes[head & self.cq.mask]; // Copy struct by value. 
head +%= 1; i += 1; } @@ -560,7 +558,7 @@ pub const IO_Uring = struct { pub const SubmissionQueue = struct { head: *u32, tail: *u32, - mask: *u32, + mask: u32, flags: *u32, dropped: *u32, array: []u32, @@ -618,7 +616,7 @@ pub const SubmissionQueue = struct { return SubmissionQueue { .head = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.head])), .tail = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.tail])), - .mask = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.ring_mask])), + .mask = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.ring_mask])).*, .flags = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.flags])), .dropped = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.sq_off.dropped])), .array = array[0..p.sq_entries], @@ -637,7 +635,7 @@ pub const SubmissionQueue = struct { pub const CompletionQueue = struct { head: *u32, tail: *u32, - mask: *u32, + mask: u32, overflow: *u32, cqes: []io_uring_cqe, @@ -656,7 +654,7 @@ pub const CompletionQueue = struct { return CompletionQueue { .head = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.head])), .tail = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.tail])), - .mask = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.ring_mask])), + .mask = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.ring_mask])).*, .overflow = @ptrCast(*u32, @alignCast(@alignOf(u32), &mmap[p.cq_off.overflow])), .cqes = cqes[0..p.cq_entries] }; @@ -670,7 +668,7 @@ pub const CompletionQueue = struct { test "structs and offsets" { if (builtin.os.tag != .linux) return error.SkipZigTest; - + testing.expectEqual(@as(usize, 120), @sizeOf(io_uring_params)); testing.expectEqual(@as(usize, 64), @sizeOf(io_uring_sqe)); testing.expectEqual(@as(usize, 16), @sizeOf(io_uring_cqe)); From 57603fd26d2902d82a7fd5e9ec2529b38bd51344 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Mon, 21 Sep 2020 11:09:09 +0200 Subject: [PATCH 34/50] Use @intCast instead of @truncate on io_uring_enter() result --- lib/std/os/linux/io_uring.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index be9fec1bece5..f1cf32aeaa5b 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -212,7 +212,7 @@ pub const IO_Uring = struct { linux.EINTR => return error.SignalInterrupt, else => |errno| return os.unexpectedErrno(errno) } - return @truncate(u32, res); + return @intCast(u32, res); } /// Sync internal state with kernel ring state on the SQ side. From 7719abbf5460c320b63f7fe10ee27b9e9b66a069 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Mon, 21 Sep 2020 11:09:37 +0200 Subject: [PATCH 35/50] Add flags to queue_fsync() signature as per liburing --- lib/std/os/linux/io_uring.zig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index f1cf32aeaa5b..3d2d863f5c13 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -369,10 +369,11 @@ pub const IO_Uring = struct { /// apply to the write, since the fsync may complete before the write is issued to the disk. /// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync, /// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. 
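Concretely, a sketch of the linking this comment describes, using the flags parameter added just below (illustrative only; assumes `IORING_FSYNC_DATASYNC` is defined in bits/linux.zig):

```zig
/// Illustrative only: a write linked to an fdatasync-style flush.
fn writev_then_datasync(ring: *IO_Uring, fd: os.fd_t, iovecs: []const os.iovec) !void {
    const sqe_writev = try ring.queue_writev(0xdddddddd, fd, iovecs, 0);
    ring.link_with_next_sqe(sqe_writev); // sets IOSQE_IO_LINK: the fsync waits

    // The new flags parameter gives fdatasync(2) semantics:
    _ = try ring.queue_fsync(0xeeeeeeee, fd, linux.IORING_FSYNC_DATASYNC);
    _ = try ring.submit();
}
```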
- pub fn queue_fsync(self: *IO_Uring, user_data: u64, fd: os.fd_t) !*io_uring_sqe { + pub fn queue_fsync(self: *IO_Uring, user_data: u64, fd: os.fd_t, flags: u32) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.opcode = .FSYNC; sqe.fd = fd; + sqe.rw_flags = flags; sqe.user_data = user_data; return sqe; } @@ -811,7 +812,7 @@ test "queue_writev/queue_fsync" { ring.link_with_next_sqe(sqe_writev); testing.expectEqual(@as(u8, linux.IOSQE_IO_LINK), sqe_writev.flags); - var sqe_fsync = try ring.queue_fsync(0xeeeeeeee, fd); + var sqe_fsync = try ring.queue_fsync(0xeeeeeeee, fd, 0); testing.expectEqual(fd, sqe_fsync.fd); testing.expectEqual(@as(u32, 2), ring.sq_ready()); From 2d8df2b745d4004a790375deb654065224edd6b0 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Mon, 21 Sep 2020 12:03:52 +0200 Subject: [PATCH 36/50] Use @intCast instead of @truncate --- lib/std/os/linux/io_uring.zig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 3d2d863f5c13..648d56fad906 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -404,7 +404,7 @@ pub const IO_Uring = struct { sqe.fd = fd; sqe.off = offset; sqe.addr = @ptrToInt(buffer.ptr); - sqe.len = @truncate(u32, buffer.len); + sqe.len = @intCast(u32, buffer.len); sqe.user_data = user_data; return sqe; } @@ -423,7 +423,7 @@ pub const IO_Uring = struct { sqe.fd = fd; sqe.off = offset; sqe.addr = @ptrToInt(buffer.ptr); - sqe.len = @truncate(u32, buffer.len); + sqe.len = @intCast(u32, buffer.len); sqe.user_data = user_data; return sqe; } @@ -444,7 +444,7 @@ pub const IO_Uring = struct { sqe.fd = fd; sqe.off = offset; sqe.addr = @ptrToInt(iovecs.ptr); - sqe.len = @truncate(u32, iovecs.len); + sqe.len = @intCast(u32, iovecs.len); sqe.user_data = user_data; return sqe; } @@ -465,7 +465,7 @@ pub const IO_Uring = struct { sqe.fd = fd; sqe.off = offset; sqe.addr = @ptrToInt(iovecs.ptr); - sqe.len = @truncate(u32, iovecs.len); + sqe.len = @intCast(u32, iovecs.len); sqe.user_data = user_data; return sqe; } @@ -516,7 +516,7 @@ pub const IO_Uring = struct { self.fd, .REGISTER_FILES, @ptrCast(*const c_void, fds.ptr), - @truncate(u32, fds.len) + @intCast(u32, fds.len) ); switch (linux.getErrno(res)) { 0 => {}, From 5f99d2c2407a057e0b239c19189c9e1a36fb1c0f Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Mon, 21 Sep 2020 20:15:14 +0200 Subject: [PATCH 37/50] Define SPLICE, PROVIDE_BUFFERS, REMOVE_BUFFERS and TEE opcodes and flags --- lib/std/os/bits/linux.zig | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/std/os/bits/linux.zig b/lib/std/os/bits/linux.zig index 2529abc2c81e..cdc3e291a95e 100644 --- a/lib/std/os/bits/linux.zig +++ b/lib/std/os/bits/linux.zig @@ -1287,7 +1287,8 @@ pub const IOSQE_BIT = extern enum(u8) { IO_LINK, IO_HARDLINK, ASYNC, - + BUFFER_SELECT, + _, }; @@ -1306,7 +1307,10 @@ pub const IOSQE_IO_LINK = 1 << @enumToInt(IOSQE_BIT.IO_LINK); pub const IOSQE_IO_HARDLINK = 1 << @enumToInt(IOSQE_BIT.IO_HARDLINK); /// always go async -pub const IOSQE_ASYNC = 1 << IOSQE_BIT.ASYNC; +pub const IOSQE_ASYNC = 1 << @enumToInt(IOSQE_BIT.ASYNC); + +/// select buffer from buf_group +pub const IOSQE_BUFFER_SELECT = 1 << @enumToInt(IOSQE_BIT.BUFFER_SELECT); pub const IORING_OP = extern enum(u8) { NOP, @@ -1339,6 +1343,10 @@ pub const IORING_OP = extern enum(u8) { RECV, OPENAT2, EPOLL_CTL, + SPLICE, + PROVIDE_BUFFERS, + REMOVE_BUFFERS, + TEE, _, }; From 95def89c232acc53c926731fe5143ea093128d73 Mon Sep 17 
00:00:00 2001 From: Joran Dirk Greef Date: Sat, 3 Oct 2020 14:34:01 +0200 Subject: [PATCH 38/50] Handle EBADFD (ring fd in bad state) in enter() --- lib/std/os/linux/io_uring.zig | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 648d56fad906..8dc59eb82a0f 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -192,17 +192,20 @@ pub const IO_Uring = struct { // The kernel was unable to allocate memory or ran out of resources for the request. // The application should wait for some completions and try again: linux.EAGAIN => return error.SystemResources, + // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered: + linux.EBADF => return error.FileDescriptorInvalid, + // The file descriptor is valid, but the ring is not in the right state. + // See io_uring_register(2) for how to enable the ring. + linux.EBADFD => return error.FileDescriptorInBadState, // The application attempted to overcommit the number of requests it can have pending. // The application should wait for some completions and try again: linux.EBUSY => return error.CompletionQueueOvercommitted, - // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered: - linux.EBADF => return error.FileDescriptorInvalid, + // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL: + linux.EINVAL => return error.SubmissionQueueEntryInvalid, // The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED // or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range // described by `addr` and `len` is not within the buffer registered at `buf_index`: linux.EFAULT => return error.BufferInvalid, - // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL: - linux.EINVAL => return error.SubmissionQueueEntryInvalid, linux.ENXIO => return error.RingShuttingDown, // The kernel believes our `self.fd` does not refer to an io_uring instance, // or the opcode is valid but not supported by this kernel (more likely): From a9b107045fb0592f813ffd9f5fef3e2cbfd2ac89 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 3 Oct 2020 14:34:42 +0200 Subject: [PATCH 39/50] Use load acquire semantics when reading the SQPOLL wakeup flag Ensures that the wakeup flag is read after the tail pointer has been written. It's important to use memory load acquire semantics for the flags read, otherwise the application and the kernel might not agree on the consistency of the wakeup flag, leading to I/O starvation. 
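As a sketch of the required pairing (assuming `flush_sq()` publishes the new tail with a release store, as liburing does):

```zig
// Publish new SQEs: the tail store must become visible to the kernel first...
@atomicStore(u32, self.sq.tail, tail, .Release);
// ...and only then may the wakeup flag be read. With .Unordered the load
// could be reordered before the store, and a sleeping SQPOLL thread would
// never be woken for the SQEs just published:
if ((@atomicLoad(u32, self.sq.flags, .Acquire) & linux.IORING_SQ_NEED_WAKEUP) != 0) {
    flags.* |= linux.IORING_ENTER_SQ_WAKEUP;
}
```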
Refs: https://github.com/axboe/liburing/commit/6768ddcc562adb6ea141cf508bccecb6be8ce666 Refs: https://github.com/axboe/liburing/issues/219 --- lib/std/os/linux/io_uring.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 8dc59eb82a0f..b3519c674fea 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -247,7 +247,7 @@ pub const IO_Uring = struct { pub fn sq_ring_needs_enter(self: *IO_Uring, submitted: u32, flags: *u32) bool { assert(flags.* == 0); if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0 and submitted > 0) return true; - if ((@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) { + if ((@atomicLoad(u32, self.sq.flags, .Acquire) & linux.IORING_SQ_NEED_WAKEUP) != 0) { flags.* |= linux.IORING_ENTER_SQ_WAKEUP; return true; } From c5b4fcaa1cf4eaa721b5b2abfa0805cc3bbb29e2 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sat, 3 Oct 2020 17:43:08 +0200 Subject: [PATCH 40/50] Add IORING_FEAT_POLL_32BITS --- lib/std/os/bits/linux.zig | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/std/os/bits/linux.zig b/lib/std/os/bits/linux.zig index cdc3e291a95e..c70977184ce8 100644 --- a/lib/std/os/bits/linux.zig +++ b/lib/std/os/bits/linux.zig @@ -1200,6 +1200,7 @@ pub const IORING_FEAT_SUBMIT_STABLE = 1 << 2; pub const IORING_FEAT_RW_CUR_POS = 1 << 3; pub const IORING_FEAT_CUR_PERSONALITY = 1 << 4; pub const IORING_FEAT_FAST_POLL = 1 << 5; +pub const IORING_FEAT_POLL_32BITS = 1 << 6; // io_uring_params.flags From 61ec6cb6d375ff896d892102d8ee7b7d4536b3a5 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 12:48:08 +0200 Subject: [PATCH 41/50] Expose available kernel features --- lib/std/os/linux/io_uring.zig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index b3519c674fea..64b4ed550ba9 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -6,6 +6,8 @@ const std = @import("../../std.zig"); const assert = std.debug.assert; const builtin = std.builtin; +const mem = std.mem; +const net = std.net; const os = std.os; const linux = os.linux; const testing = std.testing; @@ -19,6 +21,7 @@ pub const IO_Uring = struct { sq: SubmissionQueue, cq: CompletionQueue, flags: u32, + features: u32, /// A friendly way to setup an io_uring, with default io_uring_params. /// `entries` must be a power of two between 1 and 4096, although the kernel will make the final @@ -126,7 +129,8 @@ pub const IO_Uring = struct { .fd = fd, .sq = sq, .cq = cq, - .flags = p.flags + .flags = p.flags, + .features = p.features }; } From e32c7d06e5c51ff88856c8f48f6fb4fcdf564d17 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 12:49:48 +0200 Subject: [PATCH 42/50] Limit entries to u12, add errors for invalid entries, use mem.zeroInit --- lib/std/os/linux/io_uring.zig | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 64b4ed550ba9..49a1eab556f1 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -28,21 +28,11 @@ pub const IO_Uring = struct { /// call on how many entries the submission and completion queues will ultimately have, /// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. /// Matches the interface of io_uring_queue_init() in liburing. 
- pub fn init(entries: u32, flags: u32) !IO_Uring { - var params = io_uring_params { - .sq_entries = 0, - .cq_entries = 0, + pub fn init(entries: u12, flags: u32) !IO_Uring { + var params = mem.zeroInit(io_uring_params, .{ .flags = flags, - .sq_thread_cpu = 0, - .sq_thread_idle = 1000, - .features = 0, - .wq_fd = 0, - .resv = [_]u32{0} ** 3, - .sq_off = undefined, - .cq_off = undefined, - }; - // The kernel will zero the memory of the sq_off and cq_off structs in io_uring_create(), - // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7986-L8002. + .sq_thread_idle = 1000 + }); return try IO_Uring.init_params(entries, ¶ms); } @@ -52,8 +42,10 @@ pub const IO_Uring = struct { /// You may only set the `flags`, `sq_thread_cpu` and `sq_thread_idle` parameters. /// Every other parameter belongs to the kernel and must be zeroed. /// Matches the interface of io_uring_queue_init_params() in liburing. - pub fn init_params(entries: u32, p: *io_uring_params) !IO_Uring { - assert(entries >= 1 and entries <= 4096 and std.math.isPowerOfTwo(entries)); + pub fn init_params(entries: u12, p: *io_uring_params) !IO_Uring { + if (entries == 0) return error.EntriesZero; + if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; + assert(p.sq_entries == 0); assert(p.cq_entries == 0); assert(p.features == 0); @@ -684,6 +676,9 @@ test "structs and offsets" { testing.expectEqual(0, linux.IORING_OFF_SQ_RING); testing.expectEqual(0x8000000, linux.IORING_OFF_CQ_RING); testing.expectEqual(0x10000000, linux.IORING_OFF_SQES); + + testing.expectError(error.EntriesZero, IO_Uring.init(0, 0)); + testing.expectError(error.EntriesNotPowerOfTwo, IO_Uring.init(3, 0)); } test "queue_nop" { From 69a55fc560dab477222f2ad104050b443647faa5 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 13:01:41 +0200 Subject: [PATCH 43/50] Allow for advanced non-sequential SQE allocation schemes Decouples SQE queueing and SQE prepping methods to allow for non-sequential SQE allocation schemes as suggested by @daurnimator. Adds essential SQE prepping methods from liburing to reduce boilerplate. Removes non-essential .link_with_next_sqe() and .use_registered_fd(). --- lib/std/os/linux/io_uring.zig | 357 +++++++++++++++++++++++----------- 1 file changed, 244 insertions(+), 113 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 49a1eab556f1..c142fa3f73f8 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -135,13 +135,13 @@ pub const IO_Uring = struct { self.fd = -1; } - /// Returns a pointer to a zeroed SQE, or an error if the submission queue is full. + /// Returns a pointer to a vacant SQE, or an error if the submission queue is full. /// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly. /// However, instead of a null we return an error to force safe handling. /// Any situation where the submission queue is full tends more towards a control flow error, /// and the null return in liburing is more a C idiom than anything else, for lack of a better /// alternative. In Zig, we have first-class error handling... so let's use it. - /// Matches the implementation of io_uring_get_sqe() in liburing, except zeroes for safety. + /// Matches the implementation of io_uring_get_sqe() in liburing. pub fn get_sqe(self: *IO_Uring) !*io_uring_sqe { const head = @atomicLoad(u32, self.sq.head, .Acquire); // Remember that these head and tail offsets wrap around every four billion operations. 
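The decoupling in practice, as a sketch (illustrative only; assumes the `io_uring_prep_*` helpers introduced later in this patch are public at file scope):

```zig
/// Illustrative only: SQE allocation and prepping as separate steps.
fn queue_linked_readv(ring: *IO_Uring, fd: os.fd_t, iovecs: []const os.iovec, offset: u64) !void {
    // Acquire a vacant slot first, prep and tag it later (or elsewhere):
    const sqe = try ring.get_sqe();
    io_uring_prep_readv(sqe, fd, iovecs, offset);
    sqe.user_data = 0xaaaaaaaa;
    sqe.flags |= linux.IOSQE_IO_LINK; // chaining is now a plain flag write
}
```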
@@ -150,8 +150,6 @@ pub const IO_Uring = struct { if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull; var sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask]; self.sq.sqe_tail = next; - // We zero the SQE slot here in a single place, rather than in many `queue_` methods. - @memset(@ptrCast([*]u8, sqe), 0, @sizeOf(io_uring_sqe)); return sqe; } @@ -336,29 +334,6 @@ pub const IO_Uring = struct { } } - /// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. - /// Returns a pointer to the SQE. - pub fn queue_accept( - self: *IO_Uring, - user_data: u64, - fd: os.fd_t, - addr: *os.sockaddr, - addrlen: *os.socklen_t, - accept_flags: u32 - ) !*io_uring_sqe { - // "sqe->fd is the file descriptor, sqe->addr holds a pointer to struct sockaddr, - // sqe->addr2 holds a pointer to socklen_t, and finally sqe->accept_flags holds the flags - // for accept(4)." - https://lwn.net/ml/linux-block/20191025173037.13486-1-axboe@kernel.dk/ - const sqe = try self.get_sqe(); - sqe.opcode = .ACCEPT; - sqe.fd = fd; - sqe.off = @ptrToInt(addrlen); // `addr2` is a newer union member that maps to `off`. - sqe.addr = @ptrToInt(addr); - sqe.user_data = user_data; - sqe.rw_flags = accept_flags; - return sqe; - } - /// Queues (but does not submit) an SQE to perform an `fsync(2)`. /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. /// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`. @@ -368,11 +343,9 @@ pub const IO_Uring = struct { /// apply to the write, since the fsync may complete before the write is issued to the disk. /// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync, /// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. - pub fn queue_fsync(self: *IO_Uring, user_data: u64, fd: os.fd_t, flags: u32) !*io_uring_sqe { + pub fn fsync(self: *IO_Uring, user_data: u64, fd: os.fd_t, flags: u32) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.opcode = .FSYNC; - sqe.fd = fd; - sqe.rw_flags = flags; + io_uring_prep_fsync(sqe, fd, flags); sqe.user_data = user_data; return sqe; } @@ -382,16 +355,16 @@ pub const IO_Uring = struct { /// A no-op is more useful than may appear at first glance. /// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to /// know when the ring is idle before acting on a kill signal. - pub fn queue_nop(self: *IO_Uring, user_data: u64) !*io_uring_sqe { + pub fn nop(self: *IO_Uring, user_data: u64) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.opcode = .NOP; + io_uring_prep_nop(sqe); sqe.user_data = user_data; return sqe; } /// Queues (but does not submit) an SQE to perform a `read(2)`. /// Returns a pointer to the SQE. - pub fn queue_read( + pub fn read( self: *IO_Uring, user_data: u64, fd: os.fd_t, @@ -399,18 +372,14 @@ pub const IO_Uring = struct { offset: u64 ) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.opcode = .READ; - sqe.fd = fd; - sqe.off = offset; - sqe.addr = @ptrToInt(buffer.ptr); - sqe.len = @intCast(u32, buffer.len); + io_uring_prep_read(sqe, fd, buffer, offset); sqe.user_data = user_data; return sqe; } /// Queues (but does not submit) an SQE to perform a `write(2)`. /// Returns a pointer to the SQE. 
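// NOTE (illustrative sketch, not part of the patch): decoupling prepping from
// queueing means an SQE obtained from `get_sqe()` can be prepped directly,
// which is what enables non-sequential allocation schemes. For example, per
// the fsync doc comment above (`my_op_id` is a hypothetical identifier):
//
//     const sqe = try ring.get_sqe();
//     io_uring_prep_fsync(sqe, fd, linux.IORING_FSYNC_DATASYNC);
//     sqe.user_data = my_op_id;
//
// while wrapper methods such as `write()` below remain the sequential path.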
- pub fn queue_write( + pub fn write( self: *IO_Uring, user_data: u64, fd: os.fd_t, @@ -418,11 +387,7 @@ pub const IO_Uring = struct { offset: u64 ) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.opcode = .WRITE; - sqe.fd = fd; - sqe.off = offset; - sqe.addr = @ptrToInt(buffer.ptr); - sqe.len = @intCast(u32, buffer.len); + io_uring_prep_write(sqe, fd, buffer, offset); sqe.user_data = user_data; return sqe; } @@ -431,7 +396,7 @@ pub const IO_Uring = struct { /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. /// For example, if you want to do a `preadv2()` then set `rw_flags` on the returned SQE. /// See https://linux.die.net/man/2/preadv. - pub fn queue_readv( + pub fn readv( self: *IO_Uring, user_data: u64, fd: os.fd_t, @@ -439,11 +404,7 @@ pub const IO_Uring = struct { offset: u64 ) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.opcode = .READV; - sqe.fd = fd; - sqe.off = offset; - sqe.addr = @ptrToInt(iovecs.ptr); - sqe.len = @intCast(u32, iovecs.len); + io_uring_prep_readv(sqe, fd, iovecs, offset); sqe.user_data = user_data; return sqe; } @@ -452,7 +413,7 @@ pub const IO_Uring = struct { /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. /// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE. /// See https://linux.die.net/man/2/pwritev. - pub fn queue_writev( + pub fn writev( self: *IO_Uring, user_data: u64, fd: os.fd_t, @@ -460,25 +421,25 @@ pub const IO_Uring = struct { offset: u64 ) !*io_uring_sqe { const sqe = try self.get_sqe(); - sqe.opcode = .WRITEV; - sqe.fd = fd; - sqe.off = offset; - sqe.addr = @ptrToInt(iovecs.ptr); - sqe.len = @intCast(u32, iovecs.len); + io_uring_prep_writev(sqe, fd, iovecs, offset); sqe.user_data = user_data; return sqe; } - /// The next SQE will not be started until this one completes. - /// This can be used to chain causally dependent SQEs, and the chain can be arbitrarily long. - /// The tail of the chain is denoted by the first SQE that does not have this flag set. - /// This flag has no effect on previous SQEs, nor does it impact SQEs outside the chain. - /// This means that multiple chains can be executing in parallel, along with individual SQEs. - /// Only members inside the chain are serialized. - /// A chain will be broken if any SQE in the chain ends in error, where any unexpected result is - /// considered an error. For example, a short read will terminate the remainder of the chain. - pub fn link_with_next_sqe(self: *IO_Uring, sqe: *io_uring_sqe) void { - sqe.flags |= linux.IOSQE_IO_LINK; + /// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. + /// Returns a pointer to the SQE. + pub fn accept( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + addr: *os.sockaddr, + addrlen: *os.socklen_t, + flags: u32 + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + io_uring_prep_accept(sqe, fd, addr, addrlen, flags); + sqe.user_data = user_data; + return sqe; } /// Like `link_with_next_sqe()` but stronger. @@ -538,11 +499,6 @@ pub const IO_Uring = struct { } } - /// Changes the semantics of the SQE's `fd` to refer to a pre-registered file descriptor. - pub fn use_registered_fd(self: *IO_Uring, sqe: *io_uring_sqe) void { - sqe.flags |= linux.IOSQE_FIXED_FILE; - } - /// Unregisters all registered file descriptors previously associated with the ring. 
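// NOTE (review annotation, not part of the patch): with use_registered_fd()
// removed above, callers target a registered file by setting the flag on the
// returned SQE directly, as the readv test now does:
//
//     var sqe = try ring.readv(0xcccccccc, fd_index, iovecs[0..], 0);
//     sqe.flags |= linux.IOSQE_FIXED_FILE;
//
// where `fd_index` is the index into the registered fd array, not the fd.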
pub fn unregister_files(self: *IO_Uring) !void { assert(self.fd >= 0); @@ -563,8 +519,8 @@ pub const SubmissionQueue = struct { dropped: *u32, array: []u32, sqes: []io_uring_sqe, - mmap: []align(std.mem.page_size) u8, - mmap_sqes: []align(std.mem.page_size) u8, + mmap: []align(mem.page_size) u8, + mmap_sqes: []align(mem.page_size) u8, // We use `sqe_head` and `sqe_tail` in the same way as liburing: // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`. @@ -666,7 +622,156 @@ pub const CompletionQueue = struct { } }; -test "structs and offsets" { +pub fn io_uring_prep_nop(sqe: *io_uring_sqe) void { + sqe.* = .{ + .opcode = .NOP, + .flags = 0, + .ioprio = 0, + .fd = 0, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .__pad2 = [2]u64{ 0, 0 } + }; +} + +pub fn io_uring_prep_fsync(sqe: *io_uring_sqe, fd: os.fd_t, flags: u32) void { + sqe.* = .{ + .opcode = .FSYNC, + .flags = 0, + .ioprio = 0, + .fd = fd, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = flags, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .__pad2 = [2]u64{ 0, 0 } + }; +} + +pub fn io_uring_prep_rw( + op: linux.IORING_OP, + sqe: *io_uring_sqe, + fd: os.fd_t, + addr: anytype, + len: usize, + offset: u64 +) void { + sqe.* = .{ + .opcode = op, + .flags = 0, + .ioprio = 0, + .fd = fd, + .off = offset, + .addr = @ptrToInt(addr), + .len = @intCast(u32, len), + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .__pad2 = [2]u64{ 0, 0 } + }; +} + +pub fn io_uring_prep_read(sqe: *io_uring_sqe, fd: os.fd_t, buffer: []u8, offset: u64) void { + io_uring_prep_rw(.READ, sqe, fd, buffer.ptr, buffer.len, offset); +} + +pub fn io_uring_prep_write(sqe: *io_uring_sqe, fd: os.fd_t, buffer: []const u8, offset: u64) void { + io_uring_prep_rw(.WRITE, sqe, fd, buffer.ptr, buffer.len, offset); +} + +pub fn io_uring_prep_readv( + sqe: *io_uring_sqe, + fd: os.fd_t, + iovecs: []const os.iovec, + offset: u64 +) void { + io_uring_prep_rw(.READV, sqe, fd, iovecs.ptr, iovecs.len, offset); +} + +pub fn io_uring_prep_writev( + sqe: *io_uring_sqe, + fd: os.fd_t, + iovecs: []const os.iovec_const, + offset: u64 +) void { + io_uring_prep_rw(.WRITEV, sqe, fd, iovecs.ptr, iovecs.len, offset); +} + +pub fn io_uring_prep_accept( + sqe: *io_uring_sqe, + fd: os.fd_t, + addr: *os.sockaddr, + addrlen: *os.socklen_t, + flags: u32 +) void { + // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. + // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). + io_uring_prep_rw(.ACCEPT, sqe, fd, addr, 0, @ptrToInt(addrlen)); + sqe.rw_flags = flags; +} + +pub fn io_uring_prep_connect( + sqe: *io_uring_sqe, + fd: os.fd_t, + addr: *const os.sockaddr, + addrlen: os.socklen_t +) void { + // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
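// NOTE (review annotation, not part of the patch): as with accept above, the
// kernel reads connect's addrlen from the field liburing calls `addr2`, which
// maps to `sqe.off`; passing it as the `offset` argument of io_uring_prep_rw()
// below is what lands it there.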
+ io_uring_prep_rw(.CONNECT, sqe, fd, addr, 0, addrlen); +} + +pub fn io_uring_prep_recv(sqe: *io_uring_sqe, fd: os.fd_t, buffer: []u8, flags: u32) void { + io_uring_prep_rw(.RECV, sqe, fd, buffer.ptr, buffer.len, 0); + sqe.rw_flags = flags; +} + +pub fn io_uring_prep_send(sqe: *io_uring_sqe, fd: os.fd_t, buffer: []const u8, flags: u32) void { + io_uring_prep_rw(.SEND, sqe, fd, buffer.ptr, buffer.len, 0); + sqe.rw_flags = flags; +} + +pub fn io_uring_prep_openat( + sqe: *io_uring_sqe, + fd: os.fd_t, + path: [*:0]const u8, + flags: u32, + mode: os.mode_t +) void { + io_uring_prep_rw(.OPENAT, sqe, fd, path, mode, 0); + sqe.rw_flags = flags; +} + +pub fn io_uring_prep_close(sqe: *io_uring_sqe, fd: os.fd_t) void { + sqe.* = .{ + .opcode = .CLOSE, + .flags = 0, + .ioprio = 0, + .fd = fd, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .__pad2 = [2]u64{ 0, 0 } + }; +} + +test "structs/offsets/entries" { if (builtin.os.tag != .linux) return error.SkipZigTest; testing.expectEqual(@as(usize, 120), @sizeOf(io_uring_params)); @@ -681,7 +786,7 @@ test "structs and offsets" { testing.expectError(error.EntriesNotPowerOfTwo, IO_Uring.init(3, 0)); } -test "queue_nop" { +test "nop" { if (builtin.os.tag != .linux) return error.SkipZigTest; var ring = IO_Uring.init(1, 0) catch |err| switch (err) { @@ -694,7 +799,7 @@ test "queue_nop" { testing.expectEqual(@as(os.fd_t, -1), ring.fd); } - var sqe = try ring.queue_nop(@intCast(u64, 0xaaaaaaaa)); + var sqe = try ring.nop(0xaaaaaaaa); testing.expectEqual(io_uring_sqe { .opcode = .NOP, .flags = 0, @@ -704,7 +809,7 @@ test "queue_nop" { .addr = 0, .len = 0, .rw_flags = 0, - .user_data = @intCast(u64, 0xaaaaaaaa), + .user_data = 0xaaaaaaaa, .buf_index = 0, .personality = 0, .splice_fd_in = 0, @@ -733,9 +838,8 @@ test "queue_nop" { testing.expectEqual(@as(u32, 1), ring.cq.head.*); testing.expectEqual(@as(u32, 0), ring.cq_ready()); - var sqe_barrier = try ring.queue_nop(@intCast(u64, 0xbbbbbbbb)); - ring.drain_previous_sqes(sqe_barrier); - testing.expectEqual(@as(u8, linux.IOSQE_IO_DRAIN), sqe_barrier.flags); + var sqe_barrier = try ring.nop(0xbbbbbbbb); + sqe_barrier.flags |= linux.IOSQE_IO_DRAIN; testing.expectEqual(@as(u32, 1), try ring.submit()); testing.expectEqual(io_uring_cqe { .user_data = 0xbbbbbbbb, @@ -748,7 +852,7 @@ test "queue_nop" { testing.expectEqual(@as(u32, 2), ring.cq.head.*); } -test "queue_readv" { +test "readv" { if (builtin.os.tag != .linux) return error.SkipZigTest; var ring = IO_Uring.init(1, 0) catch |err| switch (err) { @@ -774,11 +878,11 @@ test "queue_readv" { var buffer = [_]u8{42} ** 128; var iovecs = [_]os.iovec{ os.iovec { .iov_base = &buffer, .iov_len = buffer.len } }; - var sqe = try ring.queue_readv(0xcccccccc, fd_index, iovecs[0..], 0); - ring.use_registered_fd(sqe); - testing.expectEqual(@as(u8, linux.IOSQE_FIXED_FILE), sqe.flags); + var sqe = try ring.readv(0xcccccccc, fd_index, iovecs[0..], 0); + testing.expectEqual(linux.IORING_OP.READV, sqe.opcode); + sqe.flags |= linux.IOSQE_FIXED_FILE; - testing.expectError(error.SubmissionQueueFull, ring.queue_nop(0)); + testing.expectError(error.SubmissionQueueFull, ring.nop(0)); testing.expectEqual(@as(u32, 1), try ring.submit()); testing.expectEqual(linux.io_uring_cqe { .user_data = 0xcccccccc, @@ -790,52 +894,75 @@ test "queue_readv" { try ring.unregister_files(); } -test "queue_writev/queue_fsync" { +test "writev/fsync/readv" { if (builtin.os.tag != .linux) return error.SkipZigTest; - var ring = 
IO_Uring.init(2, 0) catch |err| switch (err) { + var ring = IO_Uring.init(4, 0) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err }; defer ring.deinit(); - const path = "test_io_uring_queue_writev"; - const file = try std.fs.cwd().createFile(path, .{ .truncate = true }); + const path = "test_io_uring_writev_fsync_readv"; + const file = try std.fs.cwd().createFile(path, .{ .read = true, .truncate = true }); defer file.close(); defer std.fs.cwd().deleteFile(path) catch {}; const fd = file.handle; - var buffer = [_]u8{42} ** 128; - var iovecs = [_]os.iovec_const { - os.iovec_const { .iov_base = &buffer, .iov_len = buffer.len } + var buffer_write = [_]u8{42} ** 128; + var iovecs_write = [_]os.iovec_const { + os.iovec_const { .iov_base = &buffer_write, .iov_len = buffer_write.len } + }; + var buffer_read = [_]u8{0} ** 128; + var iovecs_read = [_]os.iovec { + os.iovec { .iov_base = &buffer_read, .iov_len = buffer_read.len } }; - var sqe_writev = try ring.queue_writev(0xdddddddd, fd, iovecs[0..], 0); - ring.link_with_next_sqe(sqe_writev); - testing.expectEqual(@as(u8, linux.IOSQE_IO_LINK), sqe_writev.flags); + + var sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); + testing.expectEqual(linux.IORING_OP.WRITEV, sqe_writev.opcode); + testing.expectEqual(@as(u64, 17), sqe_writev.off); + sqe_writev.flags |= linux.IOSQE_IO_LINK; - var sqe_fsync = try ring.queue_fsync(0xeeeeeeee, fd, 0); + var sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); + testing.expectEqual(linux.IORING_OP.FSYNC, sqe_fsync.opcode); testing.expectEqual(fd, sqe_fsync.fd); + sqe_fsync.flags |= linux.IOSQE_IO_LINK; + + var sqe_readv = try ring.readv(0xffffffff, fd, iovecs_read[0..], 17); + testing.expectEqual(linux.IORING_OP.READV, sqe_readv.opcode); + testing.expectEqual(@as(u64, 17), sqe_readv.off); - testing.expectEqual(@as(u32, 2), ring.sq_ready()); - testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); + testing.expectEqual(@as(u32, 3), ring.sq_ready()); + testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3)); testing.expectEqual(@as(u32, 0), ring.sq_ready()); - testing.expectEqual(@as(u32, 2), ring.cq_ready()); + testing.expectEqual(@as(u32, 3), ring.cq_ready()); + testing.expectEqual(linux.io_uring_cqe { .user_data = 0xdddddddd, - .res = buffer.len, + .res = buffer_write.len, .flags = 0, }, try ring.copy_cqe()); - testing.expectEqual(@as(u32, 1), ring.cq_ready()); + testing.expectEqual(@as(u32, 2), ring.cq_ready()); + testing.expectEqual(linux.io_uring_cqe { .user_data = 0xeeeeeeee, .res = 0, .flags = 0, }, try ring.copy_cqe()); + testing.expectEqual(@as(u32, 1), ring.cq_ready()); + + testing.expectEqual(linux.io_uring_cqe { + .user_data = 0xffffffff, + .res = buffer_read.len, + .flags = 0, + }, try ring.copy_cqe()); testing.expectEqual(@as(u32, 0), ring.cq_ready()); + + testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } -test "queue_write/queue_read" { +test "write/read" { if (builtin.os.tag != .linux) return error.SkipZigTest; var ring = IO_Uring.init(2, 0) catch |err| switch (err) { @@ -845,7 +972,7 @@ test "queue_write/queue_read" { }; defer ring.deinit(); - const path = "test_io_uring_queue_write"; + const path = "test_io_uring_write_read"; const file = try std.fs.cwd().createFile(path, .{ .read = true, .truncate = true }); defer file.close(); defer std.fs.cwd().deleteFile(path) catch {}; @@ -853,26 +980,30 @@ test "queue_write/queue_read" { var buffer_write = [_]u8{97} ** 
20; var buffer_read = [_]u8{98} ** 20; - var sqe_write = try ring.queue_write(123, fd, buffer_write[0..], 10); - ring.link_with_next_sqe(sqe_write); - var sqe_read = try ring.queue_read(456, fd, buffer_read[0..], 10); + var sqe_write = try ring.write(123, fd, buffer_write[0..], 10); + testing.expectEqual(linux.IORING_OP.WRITE, sqe_write.opcode); + testing.expectEqual(@as(u64, 10), sqe_write.off); + sqe_write.flags |= linux.IOSQE_IO_LINK; + var sqe_read = try ring.read(456, fd, buffer_read[0..], 10); + testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); + testing.expectEqual(@as(u64, 10), sqe_read.off); testing.expectEqual(@as(u32, 2), try ring.submit()); - var cqe1 = try ring.copy_cqe(); - var cqe2 = try ring.copy_cqe(); + var cqe_write = try ring.copy_cqe(); + var cqe_read = try ring.copy_cqe(); // Prior to Linux Kernel 5.6 this is the only way to test for read/write support: // https://lwn.net/Articles/809820/ - if (cqe1.res == -linux.EINVAL) return error.SkipZigTest; - if (cqe2.res == -linux.EINVAL) return error.SkipZigTest; + if (cqe_write.res == -linux.EINVAL) return error.SkipZigTest; + if (cqe_read.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(linux.io_uring_cqe { .user_data = 123, .res = buffer_write.len, .flags = 0, - }, cqe1); + }, cqe_write); testing.expectEqual(linux.io_uring_cqe { .user_data = 456, .res = buffer_read.len, .flags = 0, - }, cqe2); + }, cqe_read); testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } From 3d2de6cfbacb139ee101202e786b9355be92f960 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 13:14:57 +0200 Subject: [PATCH 44/50] Use load relaxed semantics when reading the SQPOLL wakeup flag --- lib/std/os/linux/io_uring.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index c142fa3f73f8..50e3453bb84e 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -241,7 +241,7 @@ pub const IO_Uring = struct { pub fn sq_ring_needs_enter(self: *IO_Uring, submitted: u32, flags: *u32) bool { assert(flags.* == 0); if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0 and submitted > 0) return true; - if ((@atomicLoad(u32, self.sq.flags, .Acquire) & linux.IORING_SQ_NEED_WAKEUP) != 0) { + if ((@atomicLoad(u32, self.sq.flags, .Unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) { flags.* |= linux.IORING_ENTER_SQ_WAKEUP; return true; } From 6a53f4be4b349ea8ebef6f7df3ecb333142d70d7 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 13:15:39 +0200 Subject: [PATCH 45/50] Add openat(), close(), connect(), send(), recv(), as well as tests Removes non-essential .hardlink_with_next_sqe() and .drain_previous_sqes(). --- lib/std/os/linux/io_uring.zig | 213 +++++++++++++++++++++++++++++++--- 1 file changed, 198 insertions(+), 15 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 50e3453bb84e..ae8cb1518ae4 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -441,22 +441,75 @@ pub const IO_Uring = struct { sqe.user_data = user_data; return sqe; } - - /// Like `link_with_next_sqe()` but stronger. - /// For when you don't want the chain to fail in the event of a completion result error. - /// For example, you may know that some commands will fail and may want the chain to continue. - /// Hard links are resilient to completion results, but are not resilient to submission errors. 
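// NOTE (review annotation, not part of the patch): this helper and
// drain_previous_sqes() below are removed as non-essential; the equivalent is
// to set the flags directly on a queued SQE, as the updated tests do:
//
//     sqe.flags |= linux.IOSQE_IO_LINK;     // chain with the next SQE
//     sqe.flags |= linux.IOSQE_IO_HARDLINK; // chain that survives CQE errors
//     sqe.flags |= linux.IOSQE_IO_DRAIN;    // full submission-queue barrier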
- pub fn hardlink_with_next_sqe(self: *IO_Uring, sqe: *io_uring_sqe) void { - sqe.flags |= linux.IOSQE_IO_HARDLINK; + + /// Queue (but does not submit) an SQE to perform a `connect(2)` on a socket. + /// Returns a pointer to the SQE. + pub fn connect( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + addr: *const os.sockaddr, + addrlen: os.socklen_t + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + io_uring_prep_connect(sqe, fd, addr, addrlen); + sqe.user_data = user_data; + return sqe; } - - /// This creates a full pipeline barrier in the submission queue. - /// This SQE will not be started until previous SQEs complete. - /// Subsequent SQEs will not be started until this SQE completes. - /// In other words, this stalls the entire submission queue. - /// You should first consider using link_with_next_sqe() for more granular SQE sequence control. - pub fn drain_previous_sqes(self: *IO_Uring, sqe: *io_uring_sqe) void { - sqe.flags |= linux.IOSQE_IO_DRAIN; + + /// Queues (but does not submit) an SQE to perform a `recv(2)`. + /// Returns a pointer to the SQE. + pub fn recv( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + buffer: []u8, + flags: u32 + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + io_uring_prep_recv(sqe, fd, buffer, flags); + sqe.user_data = user_data; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform a `send(2)`. + /// Returns a pointer to the SQE. + pub fn send( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + buffer: []u8, + flags: u32 + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + io_uring_prep_send(sqe, fd, buffer, flags); + sqe.user_data = user_data; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform an `openat(2)`. + /// Returns a pointer to the SQE. + pub fn openat( + self: *IO_Uring, + user_data: u64, + fd: os.fd_t, + path: [*:0]const u8, + flags: u32, + mode: os.mode_t + ) !*io_uring_sqe { + const sqe = try self.get_sqe(); + io_uring_prep_openat(sqe, fd, path, flags, mode); + sqe.user_data = user_data; + return sqe; + } + + /// Queues (but does not submit) an SQE to perform a `close(2)`. + /// Returns a pointer to the SQE. + pub fn close(self: *IO_Uring, user_data: u64, fd: os.fd_t) !*io_uring_sqe { + const sqe = try self.get_sqe(); + io_uring_prep_close(sqe, fd); + sqe.user_data = user_data; + return sqe; } /// Registers an array of file descriptors. 
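// NOTE (illustrative sketch, not part of the patch): the new socket wrappers
// compose with SQE links in the same way. A minimal client-side round trip,
// assuming an initialized `ring` and a connected `client` socket fd:
//
//     var request = [_]u8{ 1, 2, 3, 4 };
//     var reply = [_]u8{ 0, 0, 0, 0 };
//     const sqe_send = try ring.send(0x01, client, request[0..], 0);
//     sqe_send.flags |= linux.IOSQE_IO_LINK; // recv only once the send completes
//     _ = try ring.recv(0x02, client, reply[0..], 0);
//     testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2));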
@@ -1007,3 +1060,133 @@ test "write/read" { }, cqe_read); testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } + +test "openat/close" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IO_Uring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err + }; + defer ring.deinit(); + + const path = "test_io_uring_openat_close"; + defer std.fs.cwd().deleteFile(path) catch {}; + + const flags: u32 = os.O_CLOEXEC | os.O_RDWR | os.O_CREAT; + const mode: os.mode_t = 0o666; + var sqe_openat = try ring.openat(789, linux.AT_FDCWD, path, flags, mode); + testing.expectEqual(io_uring_sqe { + .opcode = .OPENAT, + .flags = 0, + .ioprio = 0, + .fd = linux.AT_FDCWD, + .off = 0, + .addr = @ptrToInt(path), + .len = mode, + .rw_flags = flags, + .user_data = 789, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .__pad2 = [2]u64{ 0, 0 } + }, sqe_openat.*); + testing.expectEqual(@as(u32, 1), try ring.submit()); + + var cqe_openat = try ring.copy_cqe(); + if (cqe_openat.res == -linux.EINVAL) return error.SkipZigTest; + testing.expectEqual(@as(u64, 789), cqe_openat.user_data); + testing.expect(cqe_openat.res > 0); + testing.expectEqual(@as(u32, 0), cqe_openat.flags); + + var sqe_close = try ring.close(1011, cqe_openat.res); + testing.expectEqual(linux.IORING_OP.CLOSE, sqe_close.opcode); + testing.expectEqual(cqe_openat.res, sqe_close.fd); + testing.expectEqual(@as(u32, 1), try ring.submit()); + + var cqe_close = try ring.copy_cqe(); + if (cqe_close.res == -linux.EINVAL) return error.SkipZigTest; + testing.expectEqual(linux.io_uring_cqe { + .user_data = 1011, + .res = 0, + .flags = 0, + }, cqe_close); +} + +test "accept/connect/send/recv" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IO_Uring.init(16, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err + }; + defer ring.deinit(); + + var address = try net.Address.parseIp4("127.0.0.1", 3131); + const kernel_backlog = 1; + const server = try os.socket(address.any.family, os.SOCK_STREAM | os.SOCK_CLOEXEC, 0); + defer os.close(server); + try os.setsockopt(server, os.SOL_SOCKET, os.SO_REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try os.bind(server, &address.any, address.getOsSockLen()); + try os.listen(server, kernel_backlog); + + var buffer_send = [_]u8{1,0,1,0,1,0,1,0,1,0}; + var buffer_recv = [_]u8{0,1,0,1,0}; + + var accept_addr: os.sockaddr = undefined; + var accept_addr_len: os.socklen_t = @sizeOf(@TypeOf(accept_addr)); + var accept = try ring.accept(0xaaaaaaaa, server, &accept_addr, &accept_addr_len, 0); + testing.expectEqual(@as(u32, 1), try ring.submit()); + + const client = try os.socket(address.any.family, os.SOCK_STREAM | os.SOCK_CLOEXEC, 0); + defer os.close(client); + var connect = try ring.connect(0xcccccccc, client, &address.any, address.getOsSockLen()); + testing.expectEqual(@as(u32, 1), try ring.submit()); + + var cqe_accept = try ring.copy_cqe(); + if (cqe_accept.res == -linux.EINVAL) return error.SkipZigTest; + var cqe_connect = try ring.copy_cqe(); + if (cqe_connect.res == -linux.EINVAL) return error.SkipZigTest; + + // The accept/connect CQEs may arrive in any order, the connect CQE will sometimes come first: + if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) { + var a = cqe_accept; + var b = cqe_connect; + cqe_accept = b; + 
cqe_connect = a; + } + + testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); + testing.expect(cqe_accept.res > 0); + testing.expectEqual(@as(u32, 0), cqe_accept.flags); + testing.expectEqual(linux.io_uring_cqe { + .user_data = 0xcccccccc, + .res = 0, + .flags = 0, + }, cqe_connect); + + var send = try ring.send(0xeeeeeeee, client, buffer_send[0..], 0); + send.flags |= linux.IOSQE_IO_LINK; + var recv = try ring.recv(0xffffffff, cqe_accept.res, buffer_recv[0..], 0); + testing.expectEqual(@as(u32, 2), try ring.submit()); + + var cqe_send = try ring.copy_cqe(); + if (cqe_send.res == -linux.EINVAL) return error.SkipZigTest; + testing.expectEqual(linux.io_uring_cqe { + .user_data = 0xeeeeeeee, + .res = buffer_send.len, + .flags = 0, + }, cqe_send); + + var cqe_recv = try ring.copy_cqe(); + if (cqe_recv.res == -linux.EINVAL) return error.SkipZigTest; + testing.expectEqual(linux.io_uring_cqe { + .user_data = 0xffffffff, + .res = buffer_recv.len, + .flags = 0, + }, cqe_recv); + + testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); +} From 9091fcbe9d9161b12bb2229eb9b30b1b14153a0c Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 15:11:09 +0200 Subject: [PATCH 46/50] Improve openat/accept test debugging --- lib/std/os/linux/io_uring.zig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index ae8cb1518ae4..518601bd573a 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -1097,6 +1097,7 @@ test "openat/close" { var cqe_openat = try ring.copy_cqe(); if (cqe_openat.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(@as(u64, 789), cqe_openat.user_data); + if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{ cqe_openat.res }); testing.expect(cqe_openat.res > 0); testing.expectEqual(@as(u32, 0), cqe_openat.flags); @@ -1159,6 +1160,7 @@ test "accept/connect/send/recv" { } testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); + if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{ cqe_accept.res }); testing.expect(cqe_accept.res > 0); testing.expectEqual(@as(u32, 0), cqe_accept.flags); testing.expectEqual(linux.io_uring_cqe { From 72bdfa5bdda561e640206082f1deb5f80ebebfa7 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 16:05:52 +0200 Subject: [PATCH 47/50] Skip openat test only for older kernels that do not fully support AT_FDCWD --- lib/std/os/linux/io_uring.zig | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 518601bd573a..b39b48661f2a 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -1095,8 +1095,14 @@ test "openat/close" { testing.expectEqual(@as(u32, 1), try ring.submit()); var cqe_openat = try ring.copy_cqe(); - if (cqe_openat.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(@as(u64, 789), cqe_openat.user_data); + if (cqe_openat.res == -linux.EINVAL) return error.SkipZigTest; + // AT_FDCWD is not fully supported before kernel 5.6: + // See https://lore.kernel.org/io-uring/20200207155039.12819-1-axboe@kernel.dk/T/ + // We use IORING_FEAT_RW_CUR_POS to know if we are pre-5.6 since that feature was added in 5.6. 
+ if (cqe_openat.res == -linux.EBADF and (ring.features & linux.IORING_FEAT_RW_CUR_POS) == 0) { + return error.SkipZigTest; + } if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{ cqe_openat.res }); testing.expect(cqe_openat.res > 0); testing.expectEqual(@as(u32, 0), cqe_openat.flags); From 958ff087f263a98986abfbc2a471d7e974813b91 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 16:57:04 +0200 Subject: [PATCH 48/50] Use const wherever possible --- lib/std/os/linux/io_uring.zig | 58 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index b39b48661f2a..b7d3c8a1276e 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -478,7 +478,7 @@ pub const IO_Uring = struct { self: *IO_Uring, user_data: u64, fd: os.fd_t, - buffer: []u8, + buffer: []const u8, flags: u32 ) !*io_uring_sqe { const sqe = try self.get_sqe(); @@ -852,7 +852,7 @@ test "nop" { testing.expectEqual(@as(os.fd_t, -1), ring.fd); } - var sqe = try ring.nop(0xaaaaaaaa); + const sqe = try ring.nop(0xaaaaaaaa); testing.expectEqual(io_uring_sqe { .opcode = .NOP, .flags = 0, @@ -891,7 +891,7 @@ test "nop" { testing.expectEqual(@as(u32, 1), ring.cq.head.*); testing.expectEqual(@as(u32, 0), ring.cq_ready()); - var sqe_barrier = try ring.nop(0xbbbbbbbb); + const sqe_barrier = try ring.nop(0xbbbbbbbb); sqe_barrier.flags |= linux.IOSQE_IO_DRAIN; testing.expectEqual(@as(u32, 1), try ring.submit()); testing.expectEqual(io_uring_cqe { @@ -931,7 +931,7 @@ test "readv" { var buffer = [_]u8{42} ** 128; var iovecs = [_]os.iovec{ os.iovec { .iov_base = &buffer, .iov_len = buffer.len } }; - var sqe = try ring.readv(0xcccccccc, fd_index, iovecs[0..], 0); + const sqe = try ring.readv(0xcccccccc, fd_index, iovecs[0..], 0); testing.expectEqual(linux.IORING_OP.READV, sqe.opcode); sqe.flags |= linux.IOSQE_FIXED_FILE; @@ -963,8 +963,8 @@ test "writev/fsync/readv" { defer std.fs.cwd().deleteFile(path) catch {}; const fd = file.handle; - var buffer_write = [_]u8{42} ** 128; - var iovecs_write = [_]os.iovec_const { + const buffer_write = [_]u8{42} ** 128; + const iovecs_write = [_]os.iovec_const { os.iovec_const { .iov_base = &buffer_write, .iov_len = buffer_write.len } }; var buffer_read = [_]u8{0} ** 128; @@ -972,17 +972,17 @@ test "writev/fsync/readv" { os.iovec { .iov_base = &buffer_read, .iov_len = buffer_read.len } }; - var sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); + const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); testing.expectEqual(linux.IORING_OP.WRITEV, sqe_writev.opcode); testing.expectEqual(@as(u64, 17), sqe_writev.off); sqe_writev.flags |= linux.IOSQE_IO_LINK; - var sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); + const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); testing.expectEqual(linux.IORING_OP.FSYNC, sqe_fsync.opcode); testing.expectEqual(fd, sqe_fsync.fd); sqe_fsync.flags |= linux.IOSQE_IO_LINK; - var sqe_readv = try ring.readv(0xffffffff, fd, iovecs_read[0..], 17); + const sqe_readv = try ring.readv(0xffffffff, fd, iovecs_read[0..], 17); testing.expectEqual(linux.IORING_OP.READV, sqe_readv.opcode); testing.expectEqual(@as(u64, 17), sqe_readv.off); @@ -1031,19 +1031,19 @@ test "write/read" { defer std.fs.cwd().deleteFile(path) catch {}; const fd = file.handle; - var buffer_write = [_]u8{97} ** 20; + const buffer_write = [_]u8{97} ** 20; var buffer_read = [_]u8{98} ** 20; - var sqe_write = try ring.write(123, fd, 
buffer_write[0..], 10); + const sqe_write = try ring.write(123, fd, buffer_write[0..], 10); testing.expectEqual(linux.IORING_OP.WRITE, sqe_write.opcode); testing.expectEqual(@as(u64, 10), sqe_write.off); sqe_write.flags |= linux.IOSQE_IO_LINK; - var sqe_read = try ring.read(456, fd, buffer_read[0..], 10); + const sqe_read = try ring.read(456, fd, buffer_read[0..], 10); testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); testing.expectEqual(@as(u64, 10), sqe_read.off); testing.expectEqual(@as(u32, 2), try ring.submit()); - var cqe_write = try ring.copy_cqe(); - var cqe_read = try ring.copy_cqe(); + const cqe_write = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); // Prior to Linux Kernel 5.6 this is the only way to test for read/write support: // https://lwn.net/Articles/809820/ if (cqe_write.res == -linux.EINVAL) return error.SkipZigTest; @@ -1076,7 +1076,7 @@ test "openat/close" { const flags: u32 = os.O_CLOEXEC | os.O_RDWR | os.O_CREAT; const mode: os.mode_t = 0o666; - var sqe_openat = try ring.openat(789, linux.AT_FDCWD, path, flags, mode); + const sqe_openat = try ring.openat(789, linux.AT_FDCWD, path, flags, mode); testing.expectEqual(io_uring_sqe { .opcode = .OPENAT, .flags = 0, @@ -1094,7 +1094,7 @@ test "openat/close" { }, sqe_openat.*); testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_openat = try ring.copy_cqe(); + const cqe_openat = try ring.copy_cqe(); testing.expectEqual(@as(u64, 789), cqe_openat.user_data); if (cqe_openat.res == -linux.EINVAL) return error.SkipZigTest; // AT_FDCWD is not fully supported before kernel 5.6: @@ -1107,12 +1107,12 @@ test "openat/close" { testing.expect(cqe_openat.res > 0); testing.expectEqual(@as(u32, 0), cqe_openat.flags); - var sqe_close = try ring.close(1011, cqe_openat.res); + const sqe_close = try ring.close(1011, cqe_openat.res); testing.expectEqual(linux.IORING_OP.CLOSE, sqe_close.opcode); testing.expectEqual(cqe_openat.res, sqe_close.fd); testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_close = try ring.copy_cqe(); + const cqe_close = try ring.copy_cqe(); if (cqe_close.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(linux.io_uring_cqe { .user_data = 1011, @@ -1131,7 +1131,7 @@ test "accept/connect/send/recv" { }; defer ring.deinit(); - var address = try net.Address.parseIp4("127.0.0.1", 3131); + const address = try net.Address.parseIp4("127.0.0.1", 3131); const kernel_backlog = 1; const server = try os.socket(address.any.family, os.SOCK_STREAM | os.SOCK_CLOEXEC, 0); defer os.close(server); @@ -1139,17 +1139,17 @@ test "accept/connect/send/recv" { try os.bind(server, &address.any, address.getOsSockLen()); try os.listen(server, kernel_backlog); - var buffer_send = [_]u8{1,0,1,0,1,0,1,0,1,0}; - var buffer_recv = [_]u8{0,1,0,1,0}; + const buffer_send = [_]u8{ 1,0,1,0,1,0,1,0,1,0 }; + var buffer_recv = [_]u8{ 0,1,0,1,0 }; var accept_addr: os.sockaddr = undefined; var accept_addr_len: os.socklen_t = @sizeOf(@TypeOf(accept_addr)); - var accept = try ring.accept(0xaaaaaaaa, server, &accept_addr, &accept_addr_len, 0); + const accept = try ring.accept(0xaaaaaaaa, server, &accept_addr, &accept_addr_len, 0); testing.expectEqual(@as(u32, 1), try ring.submit()); const client = try os.socket(address.any.family, os.SOCK_STREAM | os.SOCK_CLOEXEC, 0); defer os.close(client); - var connect = try ring.connect(0xcccccccc, client, &address.any, address.getOsSockLen()); + const connect = try ring.connect(0xcccccccc, client, &address.any, address.getOsSockLen()); testing.expectEqual(@as(u32, 
1), try ring.submit()); var cqe_accept = try ring.copy_cqe(); @@ -1159,8 +1159,8 @@ test "accept/connect/send/recv" { // The accept/connect CQEs may arrive in any order, the connect CQE will sometimes come first: if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) { - var a = cqe_accept; - var b = cqe_connect; + const a = cqe_accept; + const b = cqe_connect; cqe_accept = b; cqe_connect = a; } @@ -1175,12 +1175,12 @@ test "accept/connect/send/recv" { .flags = 0, }, cqe_connect); - var send = try ring.send(0xeeeeeeee, client, buffer_send[0..], 0); + const send = try ring.send(0xeeeeeeee, client, buffer_send[0..], 0); send.flags |= linux.IOSQE_IO_LINK; - var recv = try ring.recv(0xffffffff, cqe_accept.res, buffer_recv[0..], 0); + const recv = try ring.recv(0xffffffff, cqe_accept.res, buffer_recv[0..], 0); testing.expectEqual(@as(u32, 2), try ring.submit()); - var cqe_send = try ring.copy_cqe(); + const cqe_send = try ring.copy_cqe(); if (cqe_send.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(linux.io_uring_cqe { .user_data = 0xeeeeeeee, @@ -1188,7 +1188,7 @@ test "accept/connect/send/recv" { .flags = 0, }, cqe_send); - var cqe_recv = try ring.copy_cqe(); + const cqe_recv = try ring.copy_cqe(); if (cqe_recv.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(linux.io_uring_cqe { .user_data = 0xffffffff, From 9be29410914e9d10171536fd275739dc8d1f437e Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Sun, 4 Oct 2020 17:48:05 +0200 Subject: [PATCH 49/50] Split openat/close test into two separate tests If an older kernel fails the `openat` test because of `AT_FDCWD` then we don't want to skip the `close` test. --- lib/std/os/linux/io_uring.zig | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index b7d3c8a1276e..13a44375ed05 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -1061,7 +1061,7 @@ test "write/read" { testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } -test "openat/close" { +test "openat" { if (builtin.os.tag != .linux) return error.SkipZigTest; var ring = IO_Uring.init(1, 0) catch |err| switch (err) { @@ -1071,7 +1071,7 @@ test "openat/close" { }; defer ring.deinit(); - const path = "test_io_uring_openat_close"; + const path = "test_io_uring_openat"; defer std.fs.cwd().deleteFile(path) catch {}; const flags: u32 = os.O_CLOEXEC | os.O_RDWR | os.O_CREAT; @@ -1107,9 +1107,27 @@ test "openat/close" { testing.expect(cqe_openat.res > 0); testing.expectEqual(@as(u32, 0), cqe_openat.flags); - const sqe_close = try ring.close(1011, cqe_openat.res); + os.close(cqe_openat.res); +} + +test "close" { + if (builtin.os.tag != .linux) return error.SkipZigTest; + + var ring = IO_Uring.init(1, 0) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err + }; + defer ring.deinit(); + + const path = "test_io_uring_close"; + const file = try std.fs.cwd().createFile(path, .{}); + errdefer file.close(); + defer std.fs.cwd().deleteFile(path) catch {}; + + const sqe_close = try ring.close(1011, file.handle); testing.expectEqual(linux.IORING_OP.CLOSE, sqe_close.opcode); - testing.expectEqual(cqe_openat.res, sqe_close.fd); + testing.expectEqual(file.handle, sqe_close.fd); testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_close = try ring.copy_cqe(); From 
e9ba12f456f37ec1f59a680cd9bf107e8e84e7b7 Mon Sep 17 00:00:00 2001 From: Joran Dirk Greef Date: Mon, 5 Oct 2020 09:36:07 +0200 Subject: [PATCH 50/50] Test the range of user_data bits --- lib/std/os/linux/io_uring.zig | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 13a44375ed05..b2d42bab938f 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -1033,11 +1033,11 @@ test "write/read" { const buffer_write = [_]u8{97} ** 20; var buffer_read = [_]u8{98} ** 20; - const sqe_write = try ring.write(123, fd, buffer_write[0..], 10); + const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); testing.expectEqual(linux.IORING_OP.WRITE, sqe_write.opcode); testing.expectEqual(@as(u64, 10), sqe_write.off); sqe_write.flags |= linux.IOSQE_IO_LINK; - const sqe_read = try ring.read(456, fd, buffer_read[0..], 10); + const sqe_read = try ring.read(0x22222222, fd, buffer_read[0..], 10); testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); testing.expectEqual(@as(u64, 10), sqe_read.off); testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -1049,12 +1049,12 @@ test "write/read" { if (cqe_write.res == -linux.EINVAL) return error.SkipZigTest; if (cqe_read.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(linux.io_uring_cqe { - .user_data = 123, + .user_data = 0x11111111, .res = buffer_write.len, .flags = 0, }, cqe_write); testing.expectEqual(linux.io_uring_cqe { - .user_data = 456, + .user_data = 0x22222222, .res = buffer_read.len, .flags = 0, }, cqe_read); @@ -1076,7 +1076,7 @@ test "openat" { const flags: u32 = os.O_CLOEXEC | os.O_RDWR | os.O_CREAT; const mode: os.mode_t = 0o666; - const sqe_openat = try ring.openat(789, linux.AT_FDCWD, path, flags, mode); + const sqe_openat = try ring.openat(0x33333333, linux.AT_FDCWD, path, flags, mode); testing.expectEqual(io_uring_sqe { .opcode = .OPENAT, .flags = 0, @@ -1086,7 +1086,7 @@ test "openat" { .addr = @ptrToInt(path), .len = mode, .rw_flags = flags, - .user_data = 789, + .user_data = 0x33333333, .buf_index = 0, .personality = 0, .splice_fd_in = 0, @@ -1095,7 +1095,7 @@ test "openat" { testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_openat = try ring.copy_cqe(); - testing.expectEqual(@as(u64, 789), cqe_openat.user_data); + testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); if (cqe_openat.res == -linux.EINVAL) return error.SkipZigTest; // AT_FDCWD is not fully supported before kernel 5.6: // See https://lore.kernel.org/io-uring/20200207155039.12819-1-axboe@kernel.dk/T/ @@ -1125,7 +1125,7 @@ test "close" { errdefer file.close(); defer std.fs.cwd().deleteFile(path) catch {}; - const sqe_close = try ring.close(1011, file.handle); + const sqe_close = try ring.close(0x44444444, file.handle); testing.expectEqual(linux.IORING_OP.CLOSE, sqe_close.opcode); testing.expectEqual(file.handle, sqe_close.fd); testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -1133,7 +1133,7 @@ test "close" { const cqe_close = try ring.copy_cqe(); if (cqe_close.res == -linux.EINVAL) return error.SkipZigTest; testing.expectEqual(linux.io_uring_cqe { - .user_data = 1011, + .user_data = 0x44444444, .res = 0, .flags = 0, }, cqe_close);
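Note (illustrative, beyond the patches above): `user_data` is an opaque u64 that
the kernel copies verbatim from the SQE to the CQE, which is why PATCH 50 widens
the test constants to exercise more bits. A common pattern this enables is
round-tripping a pointer to per-operation state; a minimal sketch in the style
of the tests above, with a hypothetical `Op` type:

    test "user_data round-trips a pointer" {
        if (builtin.os.tag != .linux) return error.SkipZigTest;

        var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
            error.SystemOutdated => return error.SkipZigTest,
            error.PermissionDenied => return error.SkipZigTest,
            else => return err
        };
        defer ring.deinit();

        const Op = struct { id: u32 };
        var op = Op{ .id = 42 };

        // Store the pointer in user_data; the kernel returns it untouched.
        _ = try ring.nop(@ptrToInt(&op));
        testing.expectEqual(@as(u32, 1), try ring.submit());

        const cqe = try ring.copy_cqe();
        const completed = @intToPtr(*Op, @intCast(usize, cqe.user_data));
        testing.expectEqual(@as(u32, 42), completed.id);
    }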