diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs
index 71c97699fd07e7..0ce5d006a059ae 100644
--- a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs
+++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs
@@ -22,6 +22,9 @@ internal static partial class Fcntl
[LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlSetFD", SetLastError = true)]
internal static partial int SetFD(SafeHandle fd, int flags);
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlSetFD", SetLastError = true)]
+ internal static partial int SetFD(IntPtr fd, int flags);
+
[LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlGetFD", SetLastError = true)]
internal static partial int GetFD(SafeHandle fd);
diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs
new file mode 100644
index 00000000000000..1a2216d8d6723c
--- /dev/null
+++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs
@@ -0,0 +1,58 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.InteropServices;
+
+internal static partial class Interop
+{
+ internal static partial class Sys
+ {
+ /// Wraps io_uring_setup(2): creates an io_uring instance.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimSetup")]
+ internal static unsafe partial Error IoUringShimSetup(
+ uint entries, void* parms, int* ringFd);
+
+ /// Wraps io_uring_enter(2): submits SQEs and/or waits for CQEs.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnter")]
+ internal static unsafe partial Error IoUringShimEnter(
+ int ringFd, uint toSubmit, uint minComplete, uint flags, int* result);
+
+ /// Wraps io_uring_enter2(2) with IORING_ENTER_EXT_ARG for bounded waits.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnterExt")]
+ internal static unsafe partial Error IoUringShimEnterExt(
+ int ringFd, uint toSubmit, uint minComplete, uint flags, void* arg, int* result);
+
+ /// Wraps io_uring_register(2): registers resources (files, buffers, ring fds).
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimRegister")]
+ internal static unsafe partial Error IoUringShimRegister(
+ int ringFd, uint opcode, void* arg, uint nrArgs, int* result);
+
+ /// Wraps mmap(2): maps io_uring SQ/CQ ring memory.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMmap")]
+ internal static unsafe partial Error IoUringShimMmap(
+ int ringFd, ulong size, ulong offset, void** mappedPtr);
+
+ /// Wraps munmap(2): unmaps io_uring ring memory.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMunmap")]
+ internal static unsafe partial Error IoUringShimMunmap(
+ void* addr, ulong size);
+
+ /// Creates an eventfd for io_uring wakeup signaling.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCreateEventFd")]
+ internal static unsafe partial Error IoUringShimCreateEventFd(
+ int* eventFd);
+
+ /// Writes to an eventfd to wake the io_uring event loop.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimWriteEventFd")]
+ internal static partial Error IoUringShimWriteEventFd(int eventFd);
+
+ /// Reads from an eventfd to consume a wakeup signal.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimReadEventFd")]
+ internal static unsafe partial Error IoUringShimReadEventFd(
+ int eventFd, ulong* value);
+
+ /// Wraps close(2): closes a file descriptor.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCloseFd")]
+ internal static partial Error IoUringShimCloseFd(int fd);
+ }
+}
diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs
new file mode 100644
index 00000000000000..1472d04c8b676a
--- /dev/null
+++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs
@@ -0,0 +1,150 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Net.Sockets;
+using System.Runtime.InteropServices;
+
+internal static partial class Interop
+{
+ internal static partial class Sys
+ {
+ /// Derived SQ ring state computed after mmap, used by the managed submission path.
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringSqRingInfo
+ {
+ public IntPtr SqeBase;
+ public IntPtr SqTailPtr;
+ public IntPtr SqHeadPtr;
+ public uint SqMask;
+ public uint SqEntries;
+ public uint SqeSize;
+ public byte UsesNoSqArray;
+ public int RingFd;
+ public int RegisteredRingFd;
+ public byte UsesEnterExtArg;
+ public byte UsesRegisteredFiles;
+ }
+
+ /// Mirrors kernel struct io_sqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused.
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringSqOffsets
+ {
+ [FieldOffset(0)] public uint Head;
+ [FieldOffset(4)] public uint Tail;
+ [FieldOffset(8)] public uint RingMask;
+ [FieldOffset(12)] public uint RingEntries;
+ [FieldOffset(16)] public uint Flags;
+ [FieldOffset(20)] public uint Dropped;
+ [FieldOffset(24)] public uint Array;
+ // resv1 at 28, user_addr at 32 - not needed by managed code
+ }
+
+ /// Mirrors kernel struct io_cqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused.
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringCqOffsets
+ {
+ [FieldOffset(0)] public uint Head;
+ [FieldOffset(4)] public uint Tail;
+ [FieldOffset(8)] public uint RingMask;
+ [FieldOffset(12)] public uint RingEntries;
+ [FieldOffset(16)] public uint Overflow;
+ [FieldOffset(20)] public uint Cqes;
+ [FieldOffset(24)] public uint Flags;
+ // resv1 at 28, user_addr at 32 - not needed by managed code
+ }
+
+ /// Mirrors kernel struct io_uring_params (120 bytes), passed to io_uring_setup.
+ [StructLayout(LayoutKind.Explicit, Size = 120)]
+ internal struct IoUringParams
+ {
+ [FieldOffset(0)] public uint SqEntries;
+ [FieldOffset(4)] public uint CqEntries;
+ [FieldOffset(8)] public uint Flags;
+ [FieldOffset(12)] public uint SqThreadCpu;
+ [FieldOffset(16)] public uint SqThreadIdle;
+ [FieldOffset(20)] public uint Features;
+ [FieldOffset(24)] public uint WqFd;
+ // resv[3] at 28-39
+ [FieldOffset(40)] public IoUringSqOffsets SqOff;
+ [FieldOffset(80)] public IoUringCqOffsets CqOff;
+ }
+
+ /// Mirrors kernel struct io_uring_cqe (16 bytes), read from the CQ ring.
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringCqe
+ {
+ [FieldOffset(0)] public ulong UserData;
+ [FieldOffset(8)] public int Result;
+ [FieldOffset(12)] public uint Flags;
+ }
+
+ /// Mirrors kernel struct io_uring_buf (16 bytes), used by provided-buffer rings.
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringBuf
+ {
+ [FieldOffset(0)] public ulong Address;
+ [FieldOffset(8)] public uint Length;
+ [FieldOffset(12)] public ushort BufferId;
+ [FieldOffset(14)] public ushort Reserved;
+ }
+
+ ///
+ /// Mirrors the header overlay of kernel struct io_uring_buf_ring (16 bytes).
+ /// In UAPI this shares offset 0 with the first io_uring_buf entry via a union.
+ ///
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringBufRingHeader
+ {
+ [FieldOffset(0)] public ulong Reserved1;
+ [FieldOffset(8)] public uint Reserved2;
+ [FieldOffset(12)] public ushort Reserved3;
+ [FieldOffset(14)] public ushort Tail;
+ }
+
+ /// Mirrors kernel struct io_uring_buf_reg (40 bytes), used for pbuf ring registration.
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringBufReg
+ {
+ [FieldOffset(0)] public ulong RingAddress;
+ [FieldOffset(8)] public uint RingEntries;
+ [FieldOffset(12)] public ushort BufferGroupId;
+ [FieldOffset(14)] public ushort Padding;
+ [FieldOffset(16)] public ulong Reserved0;
+ [FieldOffset(24)] public ulong Reserved1;
+ [FieldOffset(32)] public ulong Reserved2;
+ }
+
+ /// Derived CQ ring state computed after mmap, used by the managed completion drain path.
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringCqRingInfo
+ {
+ public IntPtr CqeBase; // io_uring_cqe* base of CQE array
+ public IntPtr CqTailPtr; // uint32_t* kernel writes CQ tail
+ public IntPtr CqHeadPtr; // uint32_t* managed advances CQ head
+ public uint CqMask; // CqEntries - 1
+ public uint CqEntries; // number of CQ slots
+ public uint CqeSize; // sizeof(io_uring_cqe) = 16
+ public IntPtr CqOverflowPtr; // uint32_t* kernel CQ overflow counter
+ }
+
+ /// Mirrors kernel struct io_uring_getevents_arg, used with IORING_ENTER_EXT_ARG.
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringGeteventsArg
+ {
+ public ulong Sigmask;
+ public uint SigmaskSize;
+ public uint MinWaitUsec;
+ public ulong Ts;
+ }
+
+ /// Mirrors kernel struct __kernel_timespec, used for io_uring timeout arguments.
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringKernelTimespec
+ {
+ public long TvSec;
+ public long TvNsec;
+ }
+
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj b/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
index bdb03b5a7b5548..4dab060410324e 100644
--- a/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
+++ b/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
@@ -7,6 +7,8 @@
System.Net.Internals namespace. -->
$(DefineConstants);SYSTEM_NET_SOCKETS_DLL
false
+
+ false
@@ -197,9 +199,34 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -291,6 +318,8 @@
Link="Common\Interop\Unix\System.Native\Interop.SendMessage.cs" />
+
+ /// Initializes a provided-buffer ring and registers it with the kernel when supported.
+ /// Failures are non-fatal and leave completion mode enabled without provided buffers.
+ ///
+ private void InitializeIoUringProvidedBufferRingIfSupported(int ringFd)
+ {
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: false,
+ hasRegisteredBuffers: false);
+ _adaptiveBufferSizingEnabled = false;
+ _ioUringProvidedBufferGroupId = 0;
+ _ioUringProvidedBufferRing = null;
+ ushort initialGroupId = AllocateProvidedBufferGroupId();
+
+ if (!IoUringProvidedBufferRing.TryCreate(
+ initialGroupId,
+ IoUringProvidedBufferRingEntries,
+ s_ioUringProvidedBufferSize,
+ s_ioUringAdaptiveBufferSizingEnabled,
+ out IoUringProvidedBufferRing? bufferRing) ||
+ bufferRing is null)
+ {
+ return;
+ }
+
+ Interop.Error registerError = bufferRing.Register(ringFd);
+ if (registerError != Interop.Error.SUCCESS)
+ {
+ bufferRing.Dispose();
+ return;
+ }
+
+ _ioUringProvidedBufferRing = bufferRing;
+ _ioUringProvidedBufferGroupId = bufferRing.BufferGroupId;
+ _adaptiveBufferSizingEnabled = s_ioUringAdaptiveBufferSizingEnabled;
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: true,
+ hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry(bufferRing, ringFd));
+
+ }
+
+ ///
+ /// Evaluates adaptive buffer-sizing recommendations and hot-swaps the provided-buffer ring when safe.
+ /// Must run on the event-loop thread.
+ ///
+ private void EvaluateProvidedBufferRingResize()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Provided-buffer resize evaluation must run on the io_uring event-loop thread.");
+ if (!_adaptiveBufferSizingEnabled || _ringState.RingFd < 0)
+ {
+ return;
+ }
+
+ IoUringProvidedBufferRing? currentRing = _ioUringProvidedBufferRing;
+ if (currentRing is null)
+ {
+ return;
+ }
+
+ int currentBufferSize = currentRing.BufferSize;
+ int recommendedBufferSize = currentRing.RecommendedBufferSize;
+ if (recommendedBufferSize == 0 || recommendedBufferSize == currentBufferSize)
+ {
+ return;
+ }
+
+ if (!IsProvidedBufferResizeQuiescent(currentRing))
+ {
+ return;
+ }
+
+ ushort newGroupId = AllocateProvidedBufferGroupId(_ioUringProvidedBufferGroupId);
+ if (!IoUringProvidedBufferRing.TryCreate(
+ newGroupId,
+ IoUringProvidedBufferRingEntries,
+ recommendedBufferSize,
+ adaptiveSizingEnabled: true,
+ out IoUringProvidedBufferRing? replacementRing) ||
+ replacementRing is null)
+ {
+ return;
+ }
+
+ AssertProvidedBufferResizeQuiescent(currentRing);
+
+ bool restorePreviousBufferRegistration = _ioUringCapabilities.HasRegisteredBuffers;
+ TryUnregisterProvidedBuffersIfRegistered(currentRing, _ringState.RingFd, restorePreviousBufferRegistration);
+
+ if (replacementRing.Register(_ringState.RingFd) != Interop.Error.SUCCESS)
+ {
+ replacementRing.Dispose();
+ if (restorePreviousBufferRegistration)
+ {
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: true,
+ hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry(
+ currentRing,
+ _ringState.RingFd));
+ }
+
+ return;
+ }
+
+ currentRing.Unregister(_ringState.RingFd);
+ currentRing.Dispose();
+
+ _ioUringProvidedBufferRing = replacementRing;
+ _ioUringProvidedBufferGroupId = replacementRing.BufferGroupId;
+ RefreshIoUringMultishotRecvSupport();
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: true,
+ hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry(
+ replacementRing,
+ _ringState.RingFd));
+
+ }
+
+ private bool IsProvidedBufferResizeQuiescent(IoUringProvidedBufferRing currentRing)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Provided-buffer resize quiescence must be evaluated on the io_uring event-loop thread.");
+
+ if (currentRing.InUseCount != 0)
+ {
+ return false;
+ }
+
+ if (_cqOverflowRecoveryActive)
+ {
+ return false;
+ }
+
+ // Ring swap frees/replaces native buffer-ring memory. Delay swap until all tracked
+ // io_uring operations have drained so no in-flight SQE can still reference the old ring.
+ return Volatile.Read(ref _trackedIoUringOperationCount) == 0;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private ushort AllocateProvidedBufferGroupId(ushort avoidGroupId = 0)
+ {
+ ushort candidate = _nextIoUringProvidedBufferGroupId;
+ for (int attempts = 0; attempts < ushort.MaxValue; attempts++)
+ {
+ if (candidate != 0 &&
+ candidate != ushort.MaxValue &&
+ candidate != avoidGroupId)
+ {
+ _nextIoUringProvidedBufferGroupId = GetNextProvidedBufferGroupId(candidate);
+ return candidate;
+ }
+
+ candidate = GetNextProvidedBufferGroupId(candidate);
+ }
+
+ Debug.Fail("Unable to allocate an io_uring provided-buffer group id.");
+ _nextIoUringProvidedBufferGroupId = IoUringProvidedBufferGroupIdStart;
+ return IoUringProvidedBufferGroupIdStart;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ushort GetNextProvidedBufferGroupId(ushort currentGroupId)
+ {
+ ushort nextGroupId = unchecked((ushort)(currentGroupId + 1));
+ if (nextGroupId < IoUringProvidedBufferGroupIdStart || nextGroupId == ushort.MaxValue)
+ {
+ nextGroupId = IoUringProvidedBufferGroupIdStart;
+ }
+
+ return nextGroupId;
+ }
+
+ [Conditional("DEBUG")]
+ private void AssertProvidedBufferResizeQuiescent(IoUringProvidedBufferRing currentRing)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Provided-buffer resize assertions must run on the io_uring event-loop thread.");
+ Debug.Assert(
+ currentRing.InUseCount == 0,
+ "Provided-buffer resize requires no checked-out buffers before ring swap.");
+ Debug.Assert(
+ !_cqOverflowRecoveryActive,
+ "Provided-buffer resize must not run during CQ overflow recovery.");
+ Debug.Assert(
+ Volatile.Read(ref _trackedIoUringOperationCount) == 0,
+ "Provided-buffer resize requires no tracked io_uring operations before old ring disposal.");
+ }
+
+ private static int GetConfiguredIoUringProvidedBufferSize()
+ {
+#if DEBUG
+ string? configuredValue = Environment.GetEnvironmentVariable(
+ IoUringTestEnvironmentVariables.ProvidedBufferSize);
+
+ if (!string.IsNullOrWhiteSpace(configuredValue))
+ {
+ return int.TryParse(configuredValue, out int parsedSize) && parsedSize > 0
+ ? parsedSize
+ : IoUringProvidedBufferSizeDefault;
+ }
+#endif
+
+ return IoUringProvidedBufferSizeDefault;
+ }
+
+ private static bool IsAdaptiveIoUringProvidedBufferSizingEnabled()
+ {
+ bool enabled = AppContext.TryGetSwitch(IoUringAdaptiveBufferSizingSwitchName, out bool configured) && configured;
+#if DEBUG
+ bool? parsed = TryParseBoolSwitch(
+ Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.AdaptiveBufferSizing));
+ if (parsed.HasValue) return parsed.Value;
+#endif
+ return enabled;
+ }
+
+ private static bool IsIoUringRegisterBuffersEnabled()
+ {
+#if DEBUG
+ bool? parsed = TryParseBoolSwitch(
+ Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.RegisterBuffers));
+ if (parsed.HasValue) return parsed.Value;
+#endif
+ return true;
+ }
+
+ private static bool TryRegisterProvidedBuffersWithTelemetry(
+ IoUringProvidedBufferRing bufferRing,
+ int ringFd)
+ {
+ if (!s_ioUringRegisterBuffersEnabled || ringFd < 0)
+ {
+ return false;
+ }
+
+ // REGISTER_BUFFERS is orthogonal to provided-buffer selection (RECV + IOSQE_BUFFER_SELECT).
+ // Any performance benefit for this path is kernel-dependent and must be validated empirically.
+ return bufferRing.TryRegisterBuffersWithKernel(ringFd);
+ }
+
+ private void TryUnregisterProvidedBuffersIfRegistered(
+ IoUringProvidedBufferRing bufferRing,
+ int ringFd,
+ bool hasRegisteredBuffers)
+ {
+ if (!hasRegisteredBuffers || ringFd < 0)
+ {
+ return;
+ }
+
+ bufferRing.TryUnregisterBuffersFromKernel(ringFd);
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: _ioUringCapabilities.SupportsProvidedBufferRings,
+ hasRegisteredBuffers: false);
+ }
+
+ /// Unregisters and disposes the provided-buffer ring.
+ private void FreeIoUringProvidedBufferRing()
+ {
+ IoUringProvidedBufferRing? bufferRing = _ioUringProvidedBufferRing;
+ bool hadRegisteredBuffers = _ioUringCapabilities.HasRegisteredBuffers;
+ _ioUringProvidedBufferRing = null;
+ // Teardown invariant: clear provided-buffer capabilities immediately so no
+ // subsequent receive-prepare path can select provided/fixed-buffer strategies.
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: false,
+ hasRegisteredBuffers: false);
+ _adaptiveBufferSizingEnabled = false;
+ _ioUringProvidedBufferGroupId = 0;
+
+ if (bufferRing is null)
+ {
+ return;
+ }
+
+ bufferRing.RecycleCheckedOutBuffersForTeardown();
+
+ // Unregister the IORING_REGISTER_BUFFERS iovec array (registered-buffer acceleration).
+ TryUnregisterProvidedBuffersIfRegistered(bufferRing, _ringState.RingFd, hadRegisteredBuffers);
+
+ // Unregister the IORING_REGISTER_PBUF_RING provided-buffer ring itself.
+ if (_ringState.RingFd >= 0)
+ {
+ bufferRing.Unregister(_ringState.RingFd);
+ }
+
+ bufferRing.Dispose();
+ }
+
+ ///
+ /// Owns a managed provided-buffer ring registration: native ring memory, pinned managed
+ /// buffers, buffer-id lifecycle, and recycle counters.
+ /// Lifetime is process-engine managed and deterministic via <see cref="Dispose"/>; no finalizer is used.
+ ///
+ private sealed unsafe class IoUringProvidedBufferRing : IDisposable
+ {
+ private const int AdaptiveWindowCompletionCount = 256;
+ private const int AdaptiveMinBufferSize = 128;
+ private const int AdaptiveMaxBufferSize = 65536;
+ private const int PreparedReceiveMinimumReserve = 8;
+ private const int PreparedReceiveMaximumReserve = 64;
+ private const byte BufferStatePosted = 1;
+ private const byte BufferStateCheckedOut = 2;
+#if DEBUG
+ private static int s_testForceCreateOomOnce = -1;
+#endif
+
+ private readonly ushort _bufferGroupId;
+ private readonly int _bufferSize;
+ private readonly uint _ringEntries;
+ private readonly uint _ringMask;
+ private readonly bool _adaptiveSizingEnabled;
+ private readonly byte[][] _buffers;
+ private readonly nint[] _bufferAddresses;
+ private readonly byte[] _bufferStates;
+ private readonly ulong[] _postedBufferStateBits;
+ private Interop.Sys.IoUringBuf* _ringBuffers;
+ private Interop.Sys.IoUringBufRingHeader* _ringHeader;
+ private readonly void* _ringMemory;
+ private bool _registered;
+ private bool _disposed;
+ private int _availableCount;
+ private int _inUseCount;
+ private long _recycledCount;
+ private long _allocationFailureCount;
+ private long _totalCompletionBytes;
+ private long _totalCompletionCount;
+ private long _completionsAboveHighWatermark;
+ private long _completionsBelowLowWatermark;
+ private int _recommendedBufferSize;
+ private uint _nextPreparedReceiveBufferHint;
+ private uint _nextPreparedReceivePostedWordHint;
+ private bool _deferTailPublish;
+ private bool _deferredTailDirty;
+ private ushort _deferredTailValue;
+ private int _debugOwningThreadId;
+
+ internal ushort BufferGroupId => _bufferGroupId;
+ internal int BufferSize => _bufferSize;
+ internal int AvailableCount => Volatile.Read(ref _availableCount);
+ // Writers are single-threaded via AssertSingleThreadAccess; Volatile.Read keeps
+ // diagnostics/resize sampling conservative when observed outside mutation sites.
+ internal int InUseCount => Volatile.Read(ref _inUseCount);
+ internal long RecycledCount => Volatile.Read(ref _recycledCount);
+ internal long AllocationFailureCount => Volatile.Read(ref _allocationFailureCount);
+ internal int RecommendedBufferSize => Volatile.Read(ref _recommendedBufferSize);
+ internal int TotalBufferCountForTest => _bufferStates.Length;
+
+ private IoUringProvidedBufferRing(ushort bufferGroupId, int ringEntries, int bufferSize, bool adaptiveSizingEnabled)
+ {
+ ArgumentOutOfRangeException.ThrowIfNegativeOrZero(ringEntries);
+ if (!BitOperations.IsPow2((uint)ringEntries) || ringEntries > ushort.MaxValue)
+ {
+ throw new ArgumentOutOfRangeException(nameof(ringEntries));
+ }
+
+ ArgumentOutOfRangeException.ThrowIfNegativeOrZero(bufferSize);
+
+ _bufferGroupId = bufferGroupId;
+ _bufferSize = bufferSize;
+ _adaptiveSizingEnabled = adaptiveSizingEnabled;
+ _ringEntries = (uint)ringEntries;
+ _ringMask = (uint)ringEntries - 1;
+ _availableCount = ringEntries;
+ _recommendedBufferSize = bufferSize;
+ _buffers = new byte[ringEntries][];
+ _bufferAddresses = new nint[ringEntries];
+ _bufferStates = GC.AllocateUninitializedArray<byte>(ringEntries);
+ _postedBufferStateBits = new ulong[(ringEntries + 63) / 64];
+
+ nuint ringByteCount = checked((nuint)ringEntries * (nuint)sizeof(Interop.Sys.IoUringBuf));
+ _ringMemory = NativeMemory.AlignedAlloc(ringByteCount, (nuint)Environment.SystemPageSize);
+ if (_ringMemory is null)
+ {
+ throw new OutOfMemoryException();
+ }
+
+ NativeMemory.Clear(_ringMemory, ringByteCount);
+ _ringBuffers = (Interop.Sys.IoUringBuf*)_ringMemory;
+ _ringHeader = (Interop.Sys.IoUringBufRingHeader*)_ringMemory;
+
+ int initializedCount = 0;
+ try
+ {
+ for (int i = 0; i < ringEntries; i++)
+ {
+ byte[] buffer = GC.AllocateUninitializedArray<byte>(bufferSize, pinned: true);
+ _buffers[i] = buffer;
+ _bufferAddresses[i] = (nint)Unsafe.AsPointer(ref MemoryMarshal.GetArrayDataReference(buffer));
+ _bufferStates[i] = BufferStatePosted;
+ SetPostedBufferBit((ushort)i, isPosted: true);
+
+ WriteBufferDescriptor((uint)i, (ushort)i);
+ initializedCount++;
+ }
+
+ PublishTail((ushort)initializedCount);
+ }
+ catch
+ {
+ _allocationFailureCount++;
+ Array.Clear(_buffers, 0, initializedCount);
+ Array.Clear(_bufferAddresses, 0, initializedCount);
+ NativeMemory.AlignedFree(_ringMemory);
+ throw;
+ }
+ }
+
+#if DEBUG
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool TryConsumeForcedCreateOutOfMemoryForTest()
+ {
+ int configured = Volatile.Read(ref s_testForceCreateOomOnce);
+ if (configured < 0)
+ {
+ configured = string.Equals(
+ Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ForceProvidedBufferRingOomOnce),
+ "1",
+ StringComparison.Ordinal) ? 1 : 0;
+ Volatile.Write(ref s_testForceCreateOomOnce, configured);
+ }
+
+ if (configured == 0)
+ {
+ return false;
+ }
+
+ return Interlocked.Exchange(ref s_testForceCreateOomOnce, 0) != 0;
+ }
+#endif
+
+ internal static bool TryCreate(
+ ushort bufferGroupId,
+ int ringEntries,
+ int bufferSize,
+ bool adaptiveSizingEnabled,
+ out IoUringProvidedBufferRing? bufferRing)
+ {
+#if DEBUG
+ if (TryConsumeForcedCreateOutOfMemoryForTest())
+ {
+ bufferRing = null;
+ return false;
+ }
+#endif
+
+ try
+ {
+ bufferRing = new IoUringProvidedBufferRing(bufferGroupId, ringEntries, bufferSize, adaptiveSizingEnabled);
+ return true;
+ }
+ catch (ArgumentOutOfRangeException)
+ {
+ }
+ catch (OutOfMemoryException)
+ {
+ }
+
+ bufferRing = null;
+ return false;
+ }
+
+ /// Records a completion's bytes-transferred for adaptive sizing decisions.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void RecordCompletionUtilization(int bytesTransferred)
+ {
+ AssertSingleThreadAccess();
+ if (!_adaptiveSizingEnabled || bytesTransferred <= 0)
+ {
+ return;
+ }
+
+ int clampedBytes = Math.Min(bytesTransferred, _bufferSize);
+ _totalCompletionBytes += clampedBytes;
+ long count = ++_totalCompletionCount;
+
+ int highWatermark = (_bufferSize * 3) / 4;
+ int lowWatermark = _bufferSize / 4;
+ if (clampedBytes > highWatermark)
+ {
+ _completionsAboveHighWatermark++;
+ }
+ else if (clampedBytes < lowWatermark)
+ {
+ _completionsBelowLowWatermark++;
+ }
+
+ if ((count & (AdaptiveWindowCompletionCount - 1)) == 0)
+ {
+ EvaluateAdaptiveResize();
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void EvaluateAdaptiveResize()
+ {
+ AssertSingleThreadAccess();
+ if (!_adaptiveSizingEnabled)
+ {
+ return;
+ }
+
+ long windowBytes = _totalCompletionBytes;
+ long aboveHigh = _completionsAboveHighWatermark;
+ long belowLow = _completionsBelowLowWatermark;
+ _totalCompletionBytes = 0;
+ _completionsAboveHighWatermark = 0;
+ _completionsBelowLowWatermark = 0;
+
+ int currentSize = _bufferSize;
+ int recommendedSize = currentSize;
+ if (aboveHigh > AdaptiveWindowCompletionCount / 2 ||
+ windowBytes > (long)AdaptiveWindowCompletionCount * ((long)currentSize * 3 / 4))
+ {
+ recommendedSize = Math.Min(currentSize * 2, AdaptiveMaxBufferSize);
+ }
+ else if (belowLow > AdaptiveWindowCompletionCount / 2 ||
+ windowBytes < (long)AdaptiveWindowCompletionCount * ((long)currentSize / 4))
+ {
+ recommendedSize = Math.Max(currentSize / 2, AdaptiveMinBufferSize);
+ }
+
+ Volatile.Write(ref _recommendedBufferSize, recommendedSize);
+ }
+
+ internal Interop.Error Register(int ringFd)
+ {
+ Debug.Assert(!_disposed);
+
+ if (_registered)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Interop.Sys.IoUringBufReg registration = default;
+ registration.RingAddress = (ulong)(nuint)_ringMemory;
+ registration.RingEntries = _ringEntries;
+ registration.BufferGroupId = _bufferGroupId;
+
+ int result;
+ Interop.Error registerError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.RegisterPbufRing,
+ &registration,
+ 1u,
+ &result);
+ if (registerError == Interop.Error.SUCCESS)
+ {
+ _registered = true;
+ }
+
+ return registerError;
+ }
+
+ internal Interop.Error Unregister(int ringFd)
+ {
+ if (!_registered)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Interop.Sys.IoUringBufReg registration = default;
+ registration.BufferGroupId = _bufferGroupId;
+ int result;
+ Interop.Error unregisterError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.UnregisterPbufRing,
+ &registration,
+ 1u,
+ &result);
+ if (unregisterError == Interop.Error.SUCCESS)
+ {
+ _registered = false;
+ }
+
+ return unregisterError;
+ }
+
+ ///
+ /// Attempts to register pinned buffer payload pages with the kernel via IORING_REGISTER_BUFFERS.
+ /// Failure is non-fatal and callers should gracefully continue with unregistered buffers.
+ /// This does not switch recv SQEs to fixed-buffer opcodes; provided-buffer recv stays on
+ /// IORING_OP_RECV + IOSQE_BUFFER_SELECT.
+ ///
+ internal bool TryRegisterBuffersWithKernel(int ringFd)
+ {
+ if (_disposed || ringFd < 0 || _buffers.Length == 0)
+ {
+ return false;
+ }
+
+ nuint allocationSize = checked((nuint)_buffers.Length * (nuint)sizeof(Interop.Sys.IOVector));
+ Interop.Sys.IOVector* iovecArray;
+ try
+ {
+ iovecArray = (Interop.Sys.IOVector*)NativeMemory.Alloc(allocationSize);
+ }
+ catch (OutOfMemoryException)
+ {
+ return false;
+ }
+
+ try
+ {
+ for (int i = 0; i < _buffers.Length; i++)
+ {
+ nint bufferAddress = _bufferAddresses[i];
+ if (bufferAddress == 0)
+ {
+ return false;
+ }
+
+ iovecArray[i].Base = (byte*)bufferAddress;
+ iovecArray[i].Count = (UIntPtr)_bufferSize;
+ }
+
+ int result;
+ Interop.Error registerError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.RegisterBuffers,
+ iovecArray,
+ (uint)_buffers.Length,
+ &result);
+ return registerError == Interop.Error.SUCCESS;
+ }
+ finally
+ {
+ NativeMemory.Free(iovecArray);
+ }
+ }
+
+ /// Unregisters previously registered pinned buffers via IORING_UNREGISTER_BUFFERS.
+ internal bool TryUnregisterBuffersFromKernel(int ringFd)
+ {
+ if (_disposed || ringFd < 0)
+ {
+ return false;
+ }
+
+ int result;
+ Interop.Error unregisterError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.UnregisterBuffers,
+ null,
+ 0u,
+ &result);
+ return unregisterError == Interop.Error.SUCCESS;
+ }
+
+ /// Acquires a kernel-selected buffer id for completion processing.
+ internal bool TryAcquireBufferForCompletion(ushort bufferId, out byte* buffer, out int bufferLength)
+ {
+ AssertSingleThreadAccess();
+ buffer = null;
+ bufferLength = 0;
+
+ if (bufferId >= _ringEntries)
+ {
+ _allocationFailureCount++;
+ return false;
+ }
+
+ byte state = _bufferStates[bufferId];
+ if (state != BufferStatePosted)
+ {
+ Debug.Assert(
+ state == BufferStateCheckedOut,
+ $"Unexpected provided-buffer state during acquire: id={bufferId}, state={state}");
+ _allocationFailureCount++;
+ return false;
+ }
+
+ _bufferStates[bufferId] = BufferStateCheckedOut;
+ SetPostedBufferBit(bufferId, isPosted: false);
+ Debug.Assert(_availableCount > 0, "Provided-buffer available count underflow.");
+ _availableCount--;
+ _inUseCount++;
+
+ nint bufferAddress = _bufferAddresses[bufferId];
+ if (bufferAddress == 0)
+ {
+ _bufferStates[bufferId] = BufferStatePosted;
+ SetPostedBufferBit(bufferId, isPosted: true);
+ _availableCount++;
+ _inUseCount--;
+ _allocationFailureCount++;
+ return false;
+ }
+
+ buffer = (byte*)bufferAddress;
+ bufferLength = _bufferSize;
+ return true;
+ }
+
+ ///
+ /// Acquires any currently posted provided buffer for fixed-recv submission.
+ /// The acquired buffer remains checked out until completion recycles it.
+ ///
+ internal bool TryAcquireBufferForPreparedReceive(out ushort bufferId, out byte* buffer, out int bufferLength)
+ {
+ AssertSingleThreadAccess();
+ bufferId = 0;
+ buffer = null;
+ bufferLength = 0;
+
+ // Keep a reserve for kernel-selected (IOSQE_BUFFER_SELECT) receive completions so
+ // fixed-recv one-shots don't deplete the provided-buffer pool under sustained load.
+ int reserveCount = GetPreparedReceiveReserveCount();
+ if (Volatile.Read(ref _availableCount) <= reserveCount)
+ {
+ return false;
+ }
+
+ uint searchStart = _nextPreparedReceiveBufferHint;
+ int maxAttempts = _postedBufferStateBits.Length + 1;
+ for (int attempt = 0; attempt < maxAttempts && TryFindPostedBufferId(searchStart, out ushort candidateId); attempt++)
+ {
+ if (TryAcquireBufferForCompletion(candidateId, out buffer, out bufferLength))
+ {
+ bufferId = candidateId;
+ uint nextSearchStart = ((uint)candidateId + 1) & _ringMask;
+ _nextPreparedReceiveBufferHint = nextSearchStart;
+ _nextPreparedReceivePostedWordHint = nextSearchStart >> 6;
+ return true;
+ }
+
+ searchStart = ((uint)candidateId + 1) & _ringMask;
+ _nextPreparedReceiveBufferHint = searchStart;
+ _nextPreparedReceivePostedWordHint = searchStart >> 6;
+ }
+
+ return false;
+ }
+
/// <summary>
/// Computes how many provided buffers to hold in reserve for kernel-selected
/// receive completions: 1/16th of the ring, clamped to the configured min/max.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int GetPreparedReceiveReserveCount() =>
    Math.Clamp((int)_ringEntries / 16, PreparedReceiveMinimumReserve, PreparedReceiveMaximumReserve);
+
/// <summary>Returns the pointer/length for a buffer that is already checked out.</summary>
/// <returns>
/// false if the id is out of range, not in the checked-out state, or its backing
/// allocation is missing.
/// </returns>
internal bool TryGetCheckedOutBuffer(ushort bufferId, out byte* buffer, out int bufferLength)
{
    buffer = null;
    bufferLength = 0;

    if (bufferId >= _ringEntries || _bufferStates[bufferId] != BufferStateCheckedOut)
    {
        return false;
    }

    nint bufferAddress = _bufferAddresses[bufferId];
    if (bufferAddress == 0)
    {
        // A checked-out id with no backing address indicates a failed allocation;
        // record it for diagnostics rather than handing out a null pointer.
        _allocationFailureCount++;
        return false;
    }

    buffer = (byte*)bufferAddress;
    bufferLength = _bufferSize;
    return true;
}
+
/// <summary>Returns a previously acquired buffer id back to the provided-buffer ring.</summary>
/// <returns>true if the id was checked out and has been re-posted; false otherwise.</returns>
internal bool TryRecycleBufferFromCompletion(ushort bufferId)
{
    AssertSingleThreadAccess();

    if (bufferId < _ringEntries)
    {
        byte state = _bufferStates[bufferId];
        if (state == BufferStateCheckedOut)
        {
            RecycleCheckedOutBuffer(bufferId);
            return true;
        }

        Debug.Assert(
            state == BufferStatePosted,
            $"Unexpected provided-buffer state during recycle: id={bufferId}, state={state}");
    }

    return false;
}
+
/// <summary>
/// Recycles any still-checked-out ids back into the ring during teardown.
/// </summary>
/// <returns>The number of ids recycled.</returns>
internal int RecycleCheckedOutBuffersForTeardown()
{
    AssertSingleThreadAccess();
    int recycledCount = 0;
    for (ushort bufferId = 0; bufferId < _ringEntries; bufferId++)
    {
        if (_bufferStates[bufferId] == BufferStateCheckedOut)
        {
            RecycleCheckedOutBuffer(bufferId);
            recycledCount++;
        }
    }

    return recycledCount;
}
+
/// <summary>
/// Begins batching tail publications: subsequent recycles update a local tail
/// shadow instead of publishing to the shared ring header each time.
/// No-op if batching is already active.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void BeginDeferredRecyclePublish()
{
    AssertSingleThreadAccess();
    if (_deferTailPublish)
    {
        return;
    }

    _deferTailPublish = true;
    _deferredTailDirty = false;
    _deferredTailValue = ReadTail();
}

/// <summary>
/// Ends batching and publishes the accumulated tail once, if any recycle
/// occurred while deferral was active. No-op if batching was not active.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void EndDeferredRecyclePublish()
{
    AssertSingleThreadAccess();
    if (!_deferTailPublish)
    {
        return;
    }

    _deferTailPublish = false;
    if (_deferredTailDirty)
    {
        PublishTail(_deferredTailValue);
        _deferredTailDirty = false;
    }
}
+
/// <summary>
/// Marks every provided buffer as checked out for deterministic test-only depletion setup.
/// Clears the posted bitmap and resets the available/in-use counters to match.
/// </summary>
internal void ForceAllBuffersCheckedOutForTest()
{
    AssertSingleThreadAccess();
    for (int i = 0; i < _bufferStates.Length; i++)
    {
        _bufferStates[i] = BufferStateCheckedOut;
    }

    Array.Clear(_postedBufferStateBits);
    _nextPreparedReceivePostedWordHint = 0;
    // Volatile writes keep the counters coherent for readers that use Volatile.Read.
    Volatile.Write(ref _availableCount, 0);
    Volatile.Write(ref _inUseCount, _bufferStates.Length);
}
+
/// <summary>
/// Posts a checked-out buffer back into the ring: writes its descriptor at the
/// current tail slot, flips bookkeeping to "posted", and advances the tail —
/// immediately, or into the deferred shadow when batching is active.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void RecycleCheckedOutBuffer(ushort bufferId)
{
    ushort tail = _deferTailPublish ? _deferredTailValue : ReadTail();
    uint ringIndex = (uint)tail & _ringMask;
    // The descriptor must be fully written before the tail store makes it visible
    // to the kernel; PublishTail performs a volatile (release) write.
    WriteBufferDescriptor(ringIndex, bufferId);
    _bufferStates[bufferId] = BufferStatePosted;
    SetPostedBufferBit(bufferId, isPosted: true);
    _availableCount++;
    Debug.Assert(_inUseCount > 0, "Provided-buffer in-use count underflow.");
    _inUseCount--;
    // Tail wraps naturally at 16 bits; the kernel masks with the ring size.
    ushort nextTail = unchecked((ushort)(tail + 1));
    if (_deferTailPublish)
    {
        _deferredTailValue = nextTail;
        _deferredTailDirty = true;
    }
    else
    {
        PublishTail(nextTail);
    }
    _recycledCount++;
}
+
/// <summary>
/// Sets or clears a buffer id's bit in the posted bitmap. When a word transitions
/// from empty to non-empty, the search hint is pointed at it so the next
/// prepared-receive scan finds the fresh buffer quickly.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void SetPostedBufferBit(ushort bufferId, bool isPosted)
{
    int wordIndex = bufferId >> 6;
    ulong bit = 1UL << (bufferId & 63);
    if (isPosted)
    {
        bool wordWasEmpty = _postedBufferStateBits[wordIndex] == 0;
        _postedBufferStateBits[wordIndex] |= bit;
        if (wordWasEmpty)
        {
            _nextPreparedReceivePostedWordHint = (uint)wordIndex;
        }
    }
    else
    {
        _postedBufferStateBits[wordIndex] &= ~bit;
    }
}
+
/// <summary>
/// Finds any posted buffer id, preferring the cached word hint, then scanning
/// from <paramref name="startIndex"/> with wrap-around.
/// </summary>
/// <remarks>
/// NOTE(review): the hint-word probe ignores <paramref name="startIndex"/>, so the
/// returned id can precede it — presumably acceptable since callers treat the start
/// index as a fairness hint only; confirm against callers.
/// </remarks>
private bool TryFindPostedBufferId(uint startIndex, out ushort bufferId)
{
    int wordCount = _postedBufferStateBits.Length;
    if (wordCount == 0)
    {
        bufferId = 0;
        return false;
    }

    // Fast path: probe the word most recently known to contain a posted bit.
    int hintWord = (int)(_nextPreparedReceivePostedWordHint % (uint)wordCount);
    if (TryFindBitInWord(hintWord, _postedBufferStateBits[hintWord], out bufferId))
    {
        _nextPreparedReceivePostedWordHint = (uint)hintWord;
        return true;
    }

    uint startWord = startIndex >> 6;
    int bitOffset = (int)(startIndex & 63);
    if (startWord >= (uint)wordCount)
    {
        bufferId = 0;
        return false;
    }

    // Scan the start word, masking off bits below the start offset.
    if (TryFindBitInWord((int)startWord, _postedBufferStateBits[startWord] & (~0UL << bitOffset), out bufferId))
    {
        _nextPreparedReceivePostedWordHint = startWord;
        return true;
    }

    // Scan forward to the end of the bitmap...
    for (int word = (int)startWord + 1; word < wordCount; word++)
    {
        if (TryFindBitInWord(word, _postedBufferStateBits[word], out bufferId))
        {
            _nextPreparedReceivePostedWordHint = (uint)word;
            return true;
        }
    }

    // ...then wrap around to the words before the start word.
    for (int word = 0; word < (int)startWord; word++)
    {
        if (TryFindBitInWord(word, _postedBufferStateBits[word], out bufferId))
        {
            _nextPreparedReceivePostedWordHint = (uint)word;
            return true;
        }
    }

    bufferId = 0;
    return false;
}
+
/// <summary>
/// Finds the lowest set bit in <paramref name="wordBits"/> whose absolute buffer id
/// is within the ring, skipping padding bits beyond _ringEntries.
/// </summary>
private bool TryFindBitInWord(int wordIndex, ulong wordBits, out ushort bufferId)
{
    int wordBase = wordIndex << 6;
    for (ulong remaining = wordBits; remaining != 0; remaining &= remaining - 1)
    {
        int candidate = wordBase + BitOperations.TrailingZeroCount(remaining);
        if ((uint)candidate < _ringEntries)
        {
            bufferId = (ushort)candidate;
            return true;
        }
    }

    bufferId = 0;
    return false;
}
+
/// <summary>
/// Debug-only: asserts all mutable-state access happens on a single thread.
/// The first caller claims ownership via CAS; every later caller must match it.
/// </summary>
[Conditional("DEBUG")]
private void AssertSingleThreadAccess()
{
    int currentThreadId = Environment.CurrentManagedThreadId;
    int ownerThreadId = Volatile.Read(ref _debugOwningThreadId);
    if (ownerThreadId == 0)
    {
        // Two threads can race to claim ownership; CAS ensures exactly one wins
        // and the loser trips the assert below.
        int prior = Interlocked.CompareExchange(ref _debugOwningThreadId, currentThreadId, comparand: 0);
        ownerThreadId = prior == 0 ? currentThreadId : prior;
    }

    Debug.Assert(
        ownerThreadId == currentThreadId,
        $"IoUringProvidedBufferRing mutable state must be accessed from one thread. Owner={ownerThreadId}, current={currentThreadId}");
}
+
/// <summary>
/// Frees the native ring memory. Asserts that no buffers are still checked out and
/// that the ring has been unregistered; if it is still registered, the memory is
/// deliberately leaked rather than freed out from under the kernel.
/// </summary>
public void Dispose()
{
    if (_disposed)
    {
        return;
    }

#if DEBUG
    int checkedOutBufferCount = 0;
    for (int i = 0; i < _bufferStates.Length; i++)
    {
        if (_bufferStates[i] == BufferStateCheckedOut)
        {
            checkedOutBufferCount++;
        }
    }

    Debug.Assert(
        checkedOutBufferCount == 0,
        $"Disposing provided-buffer ring with outstanding checked-out buffers: {checkedOutBufferCount}");
#endif

    Debug.Assert(
        !_registered,
        "Provided-buffer ring must be unregistered before disposing native ring memory.");
    // Intentional leak: freeing while registered would leave the kernel holding
    // dangling pointers into the ring memory.
    if (_registered)
    {
        return;
    }

    _ringBuffers = null;
    _ringHeader = null;
    NativeMemory.AlignedFree(_ringMemory);
    _disposed = true;
}
+
/// <summary>Reads the shared ring tail with acquire semantics.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private ushort ReadTail() =>
    Volatile.Read(ref Unsafe.AsRef(&_ringHeader->Tail));

/// <summary>Publishes a new ring tail with release semantics, making prior descriptor writes visible to the kernel.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void PublishTail(ushort tail) =>
    Volatile.Write(ref Unsafe.AsRef(&_ringHeader->Tail), tail);

/// <summary>Writes the descriptor (address/length/id) for <paramref name="bufferId"/> into the given ring slot.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void WriteBufferDescriptor(uint ringIndex, ushort bufferId)
{
    Debug.Assert(ringIndex < _ringEntries);
    Debug.Assert(bufferId < _ringEntries);
    Debug.Assert(_bufferAddresses[bufferId] != 0);

    Interop.Sys.IoUringBuf* bufferSlot = _ringBuffers + ringIndex;
    bufferSlot->Address = (ulong)(nuint)_bufferAddresses[bufferId];
    bufferSlot->Length = (uint)_bufferSize;
    bufferSlot->BufferId = bufferId;
    // Do NOT write Reserved: at bufs[0] it overlays the ring tail field
    // in the kernel's io_uring_buf_ring union. Writing 0 would corrupt the
    // tail, causing the kernel to miscompute available buffer count.
}
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs
new file mode 100644
index 00000000000000..2546430d353762
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs
@@ -0,0 +1,417 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
/// <summary>
/// Lock-free multi-producer, single-consumer queue optimized for the io_uring
/// event loop pattern where many threads enqueue work items but exactly one
/// thread drains them.
/// </summary>
/// <remarks>
/// Liveness contract:
/// TryDequeue/IsEmpty may observe a producer between index claim and publish
/// (Interlocked.Increment followed by Volatile.Write), and can transiently report
/// no available item even though an enqueue is in progress. Callers must provide
/// their own wakeup/progress mechanism after Enqueue.
/// </remarks>
+ ///
internal sealed class MpscQueue<T>
{
    private const int DefaultSegmentSize = 256;
    private const int UnlinkedSegmentCacheCapacity = 4;
    private const int MaxEnqueueSlowAttempts = 2048;
#if DEBUG
    private static int s_testSegmentAllocationFailuresRemaining;
#endif

    private readonly int _segmentSize;
    private PaddedSegment _head;
    private PaddedSegment _tail;
    // Segment cache is shared by:
    // - unlinked segments that lost tail->next publication races, and
    // - drained head segments returned only after producer quiescence checks.
    // Cache bookkeeping is protected by a tiny lock because this path is already slow-path only.
    private readonly Lock _cachedUnlinkedSegmentGate = new Lock();
    private readonly Segment?[] _cachedUnlinkedSegments = new Segment?[UnlinkedSegmentCacheCapacity];
    private int _cachedUnlinkedSegmentCount;
    private int _activeEnqueueOperations;

    /// <summary>Creates a queue whose segments each hold <paramref name="segmentSize"/> items.</summary>
    /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="segmentSize"/> is not positive.</exception>
    internal MpscQueue(int segmentSize = DefaultSegmentSize)
    {
        ArgumentOutOfRangeException.ThrowIfNegativeOrZero(segmentSize);
        _segmentSize = segmentSize;
        Segment initial = new Segment(segmentSize);
        _head.Value = initial;
        _tail.Value = initial;
    }

    /// <summary>
    /// Attempts to enqueue an item. Returns false only when the slow path fails
    /// (segment allocation failure or attempt budget exhaustion).
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal bool TryEnqueue(T item)
    {
        if (TryEnqueueFast(item))
        {
            return true;
        }

        return TryEnqueueSlowWithProducerTracking(item);
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    private bool TryEnqueueSlowWithProducerTracking(T item)
    {
        // Only slow-path producers can retain stale segment references long enough to race with
        // drained-segment recycling. Fast-path success doesn't need this accounting.
        Interlocked.Increment(ref _activeEnqueueOperations);
        try
        {
            return TryEnqueueSlow(item);
        }
        finally
        {
            Interlocked.Decrement(ref _activeEnqueueOperations);
        }
    }

    /// <summary>Enqueues an item, retrying until an enqueue slot is observed.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal void Enqueue(T item)
    {
        SpinWait spinner = default;
        while (!TryEnqueue(item))
        {
            spinner.SpinOnce();
        }
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private bool TryEnqueueFast(T item)
    {
        Segment tail = Volatile.Read(ref _tail.Value)!;
        T[] items = tail.Items;
        int[] states = tail.States;
        // Snapshot incarnation before claiming a slot. If the segment is recycled
        // between this read and the Interlocked.Increment, the incarnation will differ.
        int incarnation = Volatile.Read(ref tail.Incarnation);
        int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1;
        // A stale claim can over-increment the old segment index before incarnation
        // mismatch is detected; this is safe because ResetForReuse resets EnqueueIndex.
        // NOTE(review): if the increment instead lands AFTER ResetForReuse (stale
        // reference to an already-recycled segment), it claims an in-bounds slot of the
        // fresh incarnation and then abandons it below without ever publishing state=1,
        // which would permanently stall the consumer at that slot — confirm the recycle
        // gating (tail/quiescence checks) makes this window unreachable.
        if ((uint)index < (uint)states.Length)
        {
            // Verify segment was not recycled while we were claiming the slot.
            // A recycled segment has a different incarnation because ResetForReuse
            // increments it. Without this check, TryReturnDrainedSegmentToCache can
            // recycle the segment (since fast-path producers are not tracked by
            // _activeEnqueueOperations) and we would write into reused memory.
            if (Volatile.Read(ref tail.Incarnation) == incarnation)
            {
                items[index] = item;
                Volatile.Write(ref states[index], 1);
                return true;
            }
        }

        return false;
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    private bool TryEnqueueSlow(T item)
    {
        // No incarnation check is needed here: slow-path producers are tracked by
        // _activeEnqueueOperations, which blocks drained-segment recycling.
        SpinWait spinner = default;
        for (int attempt = 0; attempt < MaxEnqueueSlowAttempts; attempt++)
        {
            Segment tail = Volatile.Read(ref _tail.Value)!;
            T[] items = tail.Items;
            int[] states = tail.States;
            int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1;
            if ((uint)index < (uint)states.Length)
            {
                items[index] = item;
                Volatile.Write(ref states[index], 1);
                return true;
            }

            Segment? next = Volatile.Read(ref tail.Next);
            if (next is null)
            {
                Segment newSegment;
                try
                {
                    newSegment = RentUnlinkedSegment();
                }
                catch (OutOfMemoryException)
                {
                    return false;
                }

                if (Interlocked.CompareExchange(ref tail.Next, newSegment, null) is null)
                {
                    next = newSegment;
                }
                else
                {
                    // Another producer linked its own segment first. Reuse ours later.
                    ReturnUnlinkedSegment(newSegment);
                    next = Volatile.Read(ref tail.Next);
                }
            }

            if (next is not null)
            {
                Interlocked.CompareExchange(ref _tail.Value, next, tail);
            }

            spinner.SpinOnce();
        }

        return false;
    }

    /// <summary>
    /// Attempts to dequeue an item. Must only be called by the single consumer thread.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal bool TryDequeue(out T item)
    {
        if (TryDequeueFast(out item))
        {
            return true;
        }

        return TryDequeueSlow(out item);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static bool TryDequeueFromSegment(Segment head, out T item)
    {
        int[] states = head.States;
        int index = head.DequeueIndex;
        if ((uint)index >= (uint)states.Length)
        {
            item = default!;
            return false;
        }

        // Acquire published slot before reading the item value.
        if (Volatile.Read(ref states[index]) != 1)
        {
            item = default!;
            return false;
        }

        T[] items = head.Items;
        item = items[index];
        if (RuntimeHelpers.IsReferenceOrContainsReferences<T>())
        {
            // Clear the slot so the queue doesn't root the dequeued object.
            items[index] = default!;
        }

        head.DequeueIndex = index + 1;
        return true;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private bool TryDequeueFast(out T item)
    {
        Segment head = Volatile.Read(ref _head.Value)!;
        return TryDequeueFromSegment(head, out item);
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    private bool TryDequeueSlow(out T item)
    {
        Segment head = Volatile.Read(ref _head.Value)!;
        while ((uint)head.DequeueIndex >= (uint)head.States.Length)
        {
            Segment? next = Volatile.Read(ref head.Next);
            if (next is null)
            {
                item = default!;
                return false;
            }

            // Consumer publishes head advance; producers read _head when resolving slow-path
            // enqueue progress, so this store must be visible across cores.
            Volatile.Write(ref _head.Value, next);
            TryReturnDrainedSegmentToCache(head);
            head = next;
        }

        return TryDequeueFromSegment(head, out item);
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    private void TryReturnDrainedSegmentToCache(Segment drainedSegment)
    {
        // Safe reuse requires producer quiescence and tail advancement away from this segment.
        // Without these checks, a producer that captured a stale segment pointer could publish
        // into a reset segment after it has been recycled.
        if (Volatile.Read(ref _activeEnqueueOperations) != 0 ||
            ReferenceEquals(Volatile.Read(ref _tail.Value), drainedSegment))
        {
            return;
        }

        ReturnUnlinkedSegment(drainedSegment);
    }

    /// <summary>
    /// Returns whether the queue currently appears empty (snapshot, not linearizable).
    /// A return value of <see langword="true"/> can also mean an enqueue is mid-flight.
    /// </summary>
    internal bool IsEmpty
    {
        get
        {
            Segment head = Volatile.Read(ref _head.Value)!;
            while (true)
            {
                int[] states = head.States;
                int index = head.DequeueIndex;
                if ((uint)index >= (uint)states.Length)
                {
                    Segment? next = Volatile.Read(ref head.Next);
                    if (next is null)
                    {
                        return true;
                    }

                    head = next;
                    continue;
                }

                return Volatile.Read(ref states[index]) != 1;
            }
        }
    }

    /// <summary>Rents a segment from the cache, or allocates a fresh one.</summary>
    private Segment RentUnlinkedSegment()
    {
        lock (_cachedUnlinkedSegmentGate)
        {
            if (_cachedUnlinkedSegmentCount != 0)
            {
                int nextIndex = _cachedUnlinkedSegmentCount - 1;
                Segment segment = _cachedUnlinkedSegments[nextIndex]!;
                _cachedUnlinkedSegments[nextIndex] = null;
                _cachedUnlinkedSegmentCount = nextIndex;
                segment.ResetForReuse();
                return segment;
            }
        }

#if DEBUG
        if (TryConsumeSegmentAllocationFailureForTest())
        {
            throw new OutOfMemoryException("Injected MpscQueue segment allocation failure for test.");
        }
#endif

        return new Segment(_segmentSize);
    }

#if DEBUG
    /// <summary>Test-only: injects the next <paramref name="failureCount"/> segment allocations as OOM.</summary>
    internal static void SetSegmentAllocationFailuresForTest(int failureCount)
    {
        ArgumentOutOfRangeException.ThrowIfNegative(failureCount);

        Volatile.Write(ref s_testSegmentAllocationFailuresRemaining, failureCount);
    }

    private static bool TryConsumeSegmentAllocationFailureForTest()
    {
        while (true)
        {
            int remainingFailures = Volatile.Read(ref s_testSegmentAllocationFailuresRemaining);
            if (remainingFailures <= 0)
            {
                return false;
            }

            if (Interlocked.CompareExchange(
                ref s_testSegmentAllocationFailuresRemaining,
                remainingFailures - 1,
                remainingFailures) == remainingFailures)
            {
                return true;
            }
        }
    }
#endif

    /// <summary>Resets a segment and returns it to the bounded cache (dropped if the cache is full).</summary>
    private void ReturnUnlinkedSegment(Segment segment)
    {
        segment.ResetForReuse();
        lock (_cachedUnlinkedSegmentGate)
        {
            if (_cachedUnlinkedSegmentCount < _cachedUnlinkedSegments.Length)
            {
                _cachedUnlinkedSegments[_cachedUnlinkedSegmentCount++] = segment;
            }
        }
    }

    private sealed class Segment
    {
        // SoA layout keeps producer-published states compact so consumer scans avoid
        // touching adjacent item payload cache lines.
        internal readonly T[] Items;
        internal readonly int[] States;
        internal int Incarnation;
        internal PaddedInt32 EnqueueIndex;
        internal int DequeueIndex;
        internal Segment? Next;

        internal Segment(int size)
        {
            Items = new T[size];
            States = new int[size];
            ResetForReuse();
        }

        internal void ResetForReuse()
        {
            // Bump the incarnation first so stale fast-path claims are detectable.
            Interlocked.Increment(ref Incarnation);
            EnqueueIndex.Value = 0;
            DequeueIndex = 0;
            Next = null;
            if (RuntimeHelpers.IsReferenceOrContainsReferences<T>())
            {
                Array.Clear(Items);
            }
            Array.Clear(States);
        }
    }

#if TARGET_ARM64 || TARGET_LOONGARCH64
    private const int CacheLineWordCount = 16; // 128-byte cache line / sizeof(nint)
#else
    private const int CacheLineWordCount = 8; // 64-byte cache line / sizeof(nint)
#endif

    [InlineArray(CacheLineWordCount - 1)]
    private struct CacheLinePadding
    {
        internal nint _element0;
    }

    private struct PaddedSegment
    {
        internal Segment? Value;
        internal CacheLinePadding _padding;
    }

    private struct PaddedInt32
    {
        internal int Value;
        internal CacheLinePadding _padding;
    }
}
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SafeSocketHandle.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SafeSocketHandle.Unix.cs
index 0993a0088b4041..c874f5cea27c81 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SafeSocketHandle.Unix.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SafeSocketHandle.Unix.cs
@@ -224,6 +224,11 @@ private unsafe bool TryUnblockSocket(bool abortive)
return true;
}
/// <summary>Wakes this socket's io_uring event loop, if any, so pending deferred completions are processed.</summary>
partial void TryWakeIoUringEventLoop()
{
    _asyncContext?.WakeIoUringEventLoopIfNeeded();
}
+
private unsafe SocketError DoCloseHandle(bool abortive)
{
Interop.Error errorCode = Interop.Error.SUCCESS;
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SafeSocketHandle.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SafeSocketHandle.cs
index 7f653dab759b80..fb69ce4c7ce834 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SafeSocketHandle.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SafeSocketHandle.cs
@@ -108,6 +108,7 @@ internal void CloseAsIs(bool abortive)
// Wait until it's safe.
SpinWait sw = default;
+ bool ioUringWakeSent = false;
while (!_released)
{
// The socket was not released due to the SafeHandle being used.
@@ -115,6 +116,16 @@ internal void CloseAsIs(bool abortive)
// On Linux, TryUnblockSocket will unblock current operations but it doesn't prevent
// a new one from starting. So we must call TryUnblockSocket multiple times.
canceledOperations |= TryUnblockSocket(abortive);
+
+ // With io_uring DEFER_TASKRUN, shutdown/disconnect queues cancel CQEs
+ // as deferred task work. Wake the event loop once so it calls
+ // io_uring_enter to process them, rather than waiting for the 50ms timeout.
+ if (!ioUringWakeSent)
+ {
+ ioUringWakeSent = true;
+ TryWakeIoUringEventLoop();
+ }
+
sw.SpinOnce();
}
@@ -130,6 +141,12 @@ internal void CloseAsIs(bool abortive)
#endif
}
/// <summary>
/// Wakes the io_uring event loop if the socket is registered with one.
/// No-op on platforms without io_uring.
/// </summary>
partial void TryWakeIoUringEventLoop();
+
private bool CloseHandle(bool abortive, bool canceledOperations)
{
bool ret = false;
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs
new file mode 100644
index 00000000000000..3734139753078e
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs
@@ -0,0 +1,3128 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Buffers;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed partial class SocketAsyncContext
+ {
private const int MultishotAcceptQueueMaxSize = 4096;
private const int PersistentMultishotRecvDataQueueMaxSize = 64;
private const int SolSocket = 1;       // Linux SOL_SOCKET
private const int SoIncomingCpu = 49;  // Linux SO_INCOMING_CPU
private const int SoReusePort = 15;    // Linux SO_REUSEPORT
private const int IoUringUserDataTagShift = 56;
private const byte IoUringReservedCompletionTag = 2;
private const long MultishotAcceptStateDisarmed = 0;
private const long MultishotAcceptStateArming = 1;
// NOTE(review): element types below were restored from usage; confirm against the callers.
private Queue<PreAcceptedConnection>? _multishotAcceptQueue;
private int _migrationState; // 0=unchecked, 1=checking, 2=done
private long _multishotAcceptState; // 0=disarmed, 1=arming, otherwise encoded reserved-completion user_data
private ulong _persistentMultishotRecvUserData; // user_data of armed multishot recv SQE
private int _persistentMultishotRecvArmed; // 0=not armed, 1=armed
private Queue<BufferedPersistentMultishotRecvData>? _persistentMultishotRecvDataQueue;
private BufferedPersistentMultishotRecvData _persistentMultishotRecvDataHead;
private bool _hasPersistentMultishotRecvDataHead;
private int _persistentMultishotRecvDataHeadOffset;
private Lock? _multishotAcceptQueueGate;
private Lock? _persistentMultishotRecvDataGate;
private Lock? _reusePortShadowListenersGate;
private ReusePortShadowListenerState[]? _reusePortShadowListeners;
+
/// <summary>Tracks a SO_REUSEPORT shadow listener socket armed on a non-primary engine.</summary>
private struct ReusePortShadowListenerState
{
    internal SafeSocketHandle Handle;
    internal int EngineIndex;
    internal ulong ArmedUserData;
}

/// <summary>A received payload buffered until a caller consumes it (presumably from a persistent multishot recv CQE — confirm against callers).</summary>
private readonly struct BufferedPersistentMultishotRecvData
{
    internal readonly byte[] Data;
    internal readonly int Length;
    // When true, Data came from a pool and must be returned after consumption.
    internal readonly bool UsesPooledBuffer;

    internal BufferedPersistentMultishotRecvData(byte[] data, int length, bool usesPooledBuffer)
    {
        Data = data;
        Length = length;
        UsesPooledBuffer = usesPooledBuffer;
    }
}

/// <summary>Holds a pre-accepted connection's fd and socket address from a multishot accept CQE.</summary>
private readonly struct PreAcceptedConnection
{
    internal readonly IntPtr FileDescriptor;
    internal readonly byte[] SocketAddressData;
    internal readonly int SocketAddressLength;
    // When true, SocketAddressData came from a pool and must be returned after consumption.
    internal readonly bool UsesPooledBuffer;

    internal PreAcceptedConnection(IntPtr fileDescriptor, byte[] socketAddressData, int socketAddressLength, bool usesPooledBuffer)
    {
        FileDescriptor = fileDescriptor;
        SocketAddressData = socketAddressData;
        SocketAddressLength = socketAddressLength;
        UsesPooledBuffer = usesPooledBuffer;
    }
}
+
// Each gate is created lazily on first use so sockets that never touch these
// io_uring features pay no allocation.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private Lock EnsureMultishotAcceptQueueGate() => EnsureLockInitialized(ref _multishotAcceptQueueGate);

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private Lock EnsurePersistentMultishotRecvDataGate() => EnsureLockInitialized(ref _persistentMultishotRecvDataGate);

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private Lock EnsureReusePortShadowListenersGate() => EnsureLockInitialized(ref _reusePortShadowListenersGate);
+
/// <summary>
/// Reads SO_REUSEPORT on the primary socket via getsockopt.
/// Returns false on any getsockopt failure or short read.
/// </summary>
private bool IsPrimarySocketReusePortEnabled()
{
    // Restored stripped generic argument: the raw sockopt value is an int read as bytes.
    Span<byte> value = stackalloc byte[sizeof(int)];
    int valueLength = sizeof(int);
    SocketError error = SocketPal.GetRawSockOpt(_socket, SolSocket, SoReusePort, value, ref valueLength);
    if (error != SocketError.Success || valueLength < sizeof(int))
    {
        return false;
    }

    return BitConverter.ToInt32(value) != 0;
}
+
/// <summary>
/// Appends a shadow-listener record. The array is copy-on-write under the gate so
/// readers can snapshot <c>_reusePortShadowListeners</c> without locking.
/// </summary>
private void AddReusePortShadowListener(ref ReusePortShadowListenerState state)
{
    Lock gate = EnsureReusePortShadowListenersGate();
    lock (gate)
    {
        ReusePortShadowListenerState[]? existing = _reusePortShadowListeners;
        if (existing is null)
        {
            _reusePortShadowListeners = [state];
            return;
        }

        var updated = new ReusePortShadowListenerState[existing.Length + 1];
        Array.Copy(existing, updated, existing.Length);
        updated[^1] = state;
        _reusePortShadowListeners = updated;
    }
}
+
/// <summary>
/// Removes the first shadow-listener record matching <paramref name="engineIndex"/>,
/// rebuilding the copy-on-write array under the gate. No-op when no record matches.
/// </summary>
private void RemoveReusePortShadowListenerByEngineIndex(int engineIndex)
{
    Lock gate = EnsureReusePortShadowListenersGate();
    lock (gate)
    {
        ReusePortShadowListenerState[]? listeners = _reusePortShadowListeners;
        if (listeners is null || listeners.Length == 0)
        {
            return;
        }

        int matchIndex = -1;
        for (int i = 0; i < listeners.Length; i++)
        {
            if (listeners[i].EngineIndex == engineIndex)
            {
                matchIndex = i;
                break;
            }
        }

        if (matchIndex < 0)
        {
            return;
        }

        if (listeners.Length == 1)
        {
            _reusePortShadowListeners = null;
            return;
        }

        // Rebuild without the matched entry; zero-length copies are harmless.
        var remaining = new ReusePortShadowListenerState[listeners.Length - 1];
        Array.Copy(listeners, 0, remaining, 0, matchIndex);
        Array.Copy(listeners, matchIndex + 1, remaining, matchIndex, listeners.Length - matchIndex - 1);
        _reusePortShadowListeners = remaining;
    }
}
+
/// <summary>
/// Reads SO_INCOMING_CPU for the given socket via getsockopt.
/// </summary>
/// <returns>true when the option read succeeds and yields a non-negative CPU index.</returns>
private static bool TryGetIncomingCpu(SafeSocketHandle socket, out int cpu)
{
    cpu = -1;
    // Restored stripped generic argument: the raw sockopt value is an int read as bytes.
    Span<byte> value = stackalloc byte[sizeof(int)];
    int valueLength = sizeof(int);
    SocketError error = SocketPal.GetRawSockOpt(socket, SolSocket, SoIncomingCpu, value, ref valueLength);
    if (error != SocketError.Success || valueLength < sizeof(int))
    {
        return false;
    }

    cpu = BitConverter.ToInt32(value);
    return cpu >= 0;
}
+
/// <summary>
/// One-time (per context) attempt to migrate this socket to the engine associated
/// with the NIC's SO_INCOMING_CPU. Best effort: bails if the engine is gone,
/// a persistent multishot recv is already armed, or the CPU can't be read.
/// </summary>
private void TryMigrateIoUringEngineOnFirstReceiveCompletion()
{
    // _migrationState: 0=unchecked, 1=checking, 2=done; CAS lets exactly one caller in.
    if (Interlocked.CompareExchange(ref _migrationState, 1, 0) != 0)
    {
        return;
    }

    try
    {
        SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
        if (engine is null || !engine.IsIoUringCompletionModeEnabled || IsPersistentMultishotRecvArmed())
        {
            return;
        }

        if (!TryGetIncomingCpu(_socket, out int incomingCpu))
        {
            return;
        }

        int targetEngineIndex = SocketAsyncEngine.GetEngineIndexForCpu(incomingCpu);
        if (targetEngineIndex < 0 || targetEngineIndex == engine.EngineIndex)
        {
            return;
        }

        // Result deliberately ignored: staying on the current engine is acceptable.
        _ = TryMigrateToEngine(targetEngineIndex);
    }
    finally
    {
        // Mark done even on failure so migration is never re-attempted.
        Volatile.Write(ref _migrationState, 2);
    }
}
+
// Total buffered payload count: queued entries plus the partially-consumed head, if any.
// Callers hold the persistent-multishot-recv data gate when reading this.
private int PersistentMultishotRecvBufferedCount =>
    (_persistentMultishotRecvDataQueue?.Count ?? 0) + (_hasPersistentMultishotRecvDataHead ? 1 : 0);
+
/// <summary>
/// Returns the gate, lazily creating it on first use. Racing initializers are
/// resolved with a CAS so all callers observe the same Lock instance.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Lock EnsureLockInitialized(ref Lock? gate)
{
    Lock? gateSnapshot = Volatile.Read(ref gate);
    if (gateSnapshot is null)
    {
        Lock freshGate = new Lock();
        gateSnapshot = Interlocked.CompareExchange(ref gate, freshGate, null) ?? freshGate;
    }

    return gateSnapshot;
}
+
/// <summary>Returns whether this context's engine is using io_uring completion mode.</summary>
private bool IsIoUringCompletionModeEnabled()
{
    SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
    return engine is not null && engine.IsIoUringCompletionModeEnabled;
}

/// <summary>Returns the total count of non-pinnable buffer prepare fallbacks across active engines.</summary>
internal static long GetIoUringNonPinnablePrepareFallbackCount() =>
    SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackCount();

/// <summary>Test-only setter for the non-pinnable fallback counter.</summary>
internal static void SetIoUringNonPinnablePrepareFallbackCountForTest(long value) =>
    SocketAsyncEngine.SetIoUringNonPinnablePrepareFallbackCountForTest(value);
+
/// <summary>Test helper: fetches the socket's async context, or null if the socket was disposed concurrently.</summary>
private static SocketAsyncContext? GetContextForTest(Socket socket)
{
    try { return socket.SafeHandle.AsyncContext; }
    catch (ObjectDisposedException) { return null; }
}

internal static bool TryGetSocketAsyncContextForTest(Socket socket, out SocketAsyncContext? context)
{
    context = GetContextForTest(socket);
    return context is not null;
}

/// <summary>Test helper: number of SO_REUSEPORT shadow listeners currently tracked (0 when none or disposed).</summary>
internal static int GetReusePortShadowListenerCountForTest(Socket socket) =>
    GetContextForTest(socket)?._reusePortShadowListeners?.Length ?? 0;

/// <summary>Test helper: whether a multishot accept SQE is currently armed.</summary>
internal static bool IsMultishotAcceptArmedForTest(Socket socket) =>
    GetContextForTest(socket)?.IsMultishotAcceptArmed ?? false;

/// <summary>Test helper: count of pre-accepted connections queued by multishot accept.</summary>
internal static int GetMultishotAcceptQueueCountForTest(Socket socket)
{
    SocketAsyncContext? context = GetContextForTest(socket);
    if (context is null) return 0;
    Lock gate = context.EnsureMultishotAcceptQueueGate();
    lock (gate) { return context._multishotAcceptQueue?.Count ?? 0; }
}
+
/// <summary>Test helper: reads SO_INCOMING_CPU for the socket's underlying handle.</summary>
internal static bool TryGetIncomingCpuForTest(Socket socket, out int cpu)
{
    cpu = -1;
    SocketAsyncContext? context = GetContextForTest(socket);
    return context is not null && TryGetIncomingCpu(context._socket, out cpu);
}

/// <summary>Test helper: whether a persistent multishot recv SQE is currently armed.</summary>
internal static bool IsPersistentMultishotRecvArmedForTest(Socket socket) =>
    GetContextForTest(socket)?.IsPersistentMultishotRecvArmed() ?? false;

/// <summary>Test helper: user_data of the armed persistent multishot recv SQE, or 0.</summary>
internal static ulong GetPersistentMultishotRecvUserDataForTest(Socket socket)
{
    SocketAsyncContext? context = GetContextForTest(socket);
    return context is not null && context.IsPersistentMultishotRecvArmed()
        ? context.PersistentMultishotRecvUserData : 0;
}

/// <summary>Test helper: count of buffered persistent multishot recv payloads.</summary>
internal static int GetPersistentMultishotRecvBufferedCountForTest(Socket socket)
{
    SocketAsyncContext? context = GetContextForTest(socket);
    if (context is null) return 0;
    Lock gate = context.EnsurePersistentMultishotRecvDataGate();
    lock (gate) { return context.PersistentMultishotRecvBufferedCount; }
}

/// <summary>Diagnostics: count of buffered persistent multishot recv payloads, taken under the gate.</summary>
internal int GetPersistentMultishotRecvBufferedCountForDiagnostics()
{
    Lock gate = EnsurePersistentMultishotRecvDataGate();
    lock (gate)
    {
        return PersistentMultishotRecvBufferedCount;
    }
}
+
+ /// Test-only wrapper accepting byte[] to avoid Span reflection limitations.
+ internal bool TryBufferEarlyPersistentMultishotRecvDataForTest(byte[] payload) =>
+ TryBufferEarlyPersistentMultishotRecvData(payload);
+
+ /// <summary>Returns whether a multishot accept SQE is currently armed for this context.</summary>
+ internal bool IsMultishotAcceptArmed => Volatile.Read(ref _multishotAcceptState) != MultishotAcceptStateDisarmed;
+
+ /// <summary>Returns the user_data payload for the armed multishot accept SQE, or 0 if none.</summary>
+ internal ulong MultishotAcceptUserData => DecodeMultishotAcceptUserData(Volatile.Read(ref _multishotAcceptState));
+
+ /// <summary>Clears multishot accept armed-state for this context.</summary>
+ internal void DisarmMultishotAccept()
+ {
+ Volatile.Write(ref _multishotAcceptState, MultishotAcceptStateDisarmed);
+ }
+
+ /// <summary>Extracts the user_data from packed multishot-accept state; returns 0 unless the reserved completion tag is present in the top byte.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ulong DecodeMultishotAcceptUserData(long packedState)
+ {
+ ulong raw = (ulong)packedState;
+ byte tag = (byte)(raw >> IoUringUserDataTagShift);
+ if (tag != IoUringReservedCompletionTag)
+ {
+ return 0;
+ }
+
+ return raw;
+ }
+
+ /// <summary>Returns whether a persistent multishot recv SQE is currently armed for this context.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool IsPersistentMultishotRecvArmed() =>
+ Volatile.Read(ref _persistentMultishotRecvArmed) != 0;
+
+ /// <summary>Records that a persistent multishot recv SQE has been armed for this context.</summary>
+ /// <remarks>user_data is published before the armed flag so readers that observe armed==1 see valid user_data.</remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void SetPersistentMultishotRecvArmed(ulong userData)
+ {
+ Volatile.Write(ref _persistentMultishotRecvUserData, userData);
+ Volatile.Write(ref _persistentMultishotRecvArmed, 1);
+ }
+
+ /// <summary>Clears this context's armed persistent multishot recv state.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void ClearPersistentMultishotRecvArmed()
+ {
+ Volatile.Write(ref _persistentMultishotRecvUserData, 0);
+ Volatile.Write(ref _persistentMultishotRecvArmed, 0);
+ }
+
+ /// <summary>Gets the user_data of the armed persistent multishot recv SQE, or 0 if none is armed.</summary>
+ internal ulong PersistentMultishotRecvUserData =>
+ Volatile.Read(ref _persistentMultishotRecvUserData);
+
+ /// <summary>
+ /// Clears persistent multishot recv armed-state and requests ASYNC_CANCEL for
+ /// the armed user_data when available.
+ /// </summary>
+ internal void RequestPersistentMultishotRecvCancel()
+ {
+ // Snapshot user_data before clearing; the clear zeroes the field.
+ ulong recvUserData = Volatile.Read(ref _persistentMultishotRecvUserData);
+ ClearPersistentMultishotRecvArmed();
+ if (recvUserData != 0)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ engine?.TryRequestIoUringCancellation(recvUserData);
+ }
+ }
+
+ /// <summary>Copies an early multishot-recv payload into the per-socket replay queue.</summary>
+ /// <returns>false when the queue is unavailable or at capacity (payload dropped); true otherwise.</returns>
+ internal bool TryBufferEarlyPersistentMultishotRecvData(ReadOnlySpan<byte> payload)
+ {
+ if (payload.Length == 0)
+ {
+ // Nothing to replay; report success without touching the queue.
+ return true;
+ }
+
+ EnsurePersistentMultishotRecvDataQueueInitialized();
+ Queue<BufferedPersistentMultishotRecvData>? queue = _persistentMultishotRecvDataQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ // Copy outside the lock; the pooled buffer is returned on the capacity-reject path below.
+ byte[] copy = ArrayPool<byte>.Shared.Rent(payload.Length);
+ payload.CopyTo(copy);
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ lock (gate)
+ {
+ if (PersistentMultishotRecvBufferedCount >= PersistentMultishotRecvDataQueueMaxSize)
+ {
+ ArrayPool<byte>.Shared.Return(copy);
+ return false;
+ }
+
+ // Publish queue count only after enqueue to avoid teardown observing phantom items.
+ queue.Enqueue(new BufferedPersistentMultishotRecvData(copy, payload.Length, usesPooledBuffer: true));
+ }
+
+ return true;
+ }
+
+ /// <summary>Attempts to drain buffered multishot-recv payload into the caller destination.</summary>
+ /// <returns>true when bytes were copied; false when the destination is empty or no data is buffered.</returns>
+ internal bool TryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, out int bytesTransferred)
+ {
+ bytesTransferred = 0;
+ if (destination.Length == 0)
+ {
+ return false;
+ }
+
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ byte[] sourceBuffer;
+ int sourceOffset;
+ int toCopy;
+ bool releaseHeadAfterCopy;
+ BufferedPersistentMultishotRecvData sourceHead;
+ lock (gate)
+ {
+ if (!TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered))
+ {
+ return false;
+ }
+
+ int headOffset = _persistentMultishotRecvDataHeadOffset;
+ int remaining = buffered.Length - headOffset;
+ Debug.Assert(remaining > 0);
+ if (remaining <= 0)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+ return false;
+ }
+
+ toCopy = Math.Min(destination.Length, remaining);
+ sourceBuffer = buffered.Data;
+ sourceOffset = headOffset;
+ sourceHead = buffered;
+ _persistentMultishotRecvDataHeadOffset = headOffset + toCopy;
+ releaseHeadAfterCopy = _persistentMultishotRecvDataHeadOffset >= buffered.Length;
+ }
+
+ // Copy outside the lock; the head entry remains reserved for this context until released below.
+ sourceBuffer.AsSpan(sourceOffset, toCopy).CopyTo(destination.Span);
+ bytesTransferred = toCopy;
+
+ if (releaseHeadAfterCopy)
+ {
+ lock (gate)
+ {
+ // Re-validate the head is still the same fully-consumed entry before releasing;
+ // another drain may have raced and replaced it while the copy ran unlocked.
+ if (_hasPersistentMultishotRecvDataHead &&
+ _persistentMultishotRecvDataHead.Length == sourceHead.Length &&
+ ReferenceEquals(_persistentMultishotRecvDataHead.Data, sourceHead.Data) &&
+ _persistentMultishotRecvDataHeadOffset >= sourceHead.Length)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+ }
+ }
+ }
+
+ return true;
+ }
+
+ /// <summary>Ensures the pre-accepted connection queue exists (lazily allocated under the queue gate).</summary>
+ private void EnsureMultishotAcceptQueueInitialized()
+ {
+ if (_multishotAcceptQueue is null)
+ {
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ _multishotAcceptQueue ??= new Queue<PreAcceptedConnection>();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Attempts to enqueue a pre-accepted connection from a multishot accept CQE.
+ /// Caller is responsible for closing the descriptor when this returns false.
+ /// </summary>
+ internal bool TryEnqueuePreAcceptedConnection(IntPtr acceptedFd, ReadOnlySpan<byte> socketAddressData, int socketAddressLen)
+ {
+ EnsureMultishotAcceptQueueInitialized();
+ Queue<PreAcceptedConnection>? queue = _multishotAcceptQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ // Clamp the reported address length into [0, socketAddressData.Length].
+ int length = socketAddressLen;
+ if (length < 0)
+ {
+ length = 0;
+ }
+
+ if ((uint)length > (uint)socketAddressData.Length)
+ {
+ length = socketAddressData.Length;
+ }
+
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ if (queue.Count >= MultishotAcceptQueueMaxSize)
+ {
+ return false;
+ }
+
+ byte[] copy;
+ if (length != 0)
+ {
+ copy = ArrayPool<byte>.Shared.Rent(length);
+ socketAddressData.Slice(0, length).CopyTo(copy);
+ }
+ else
+ {
+ copy = Array.Empty<byte>();
+ }
+
+ queue.Enqueue(new PreAcceptedConnection(acceptedFd, copy, length, usesPooledBuffer: length != 0));
+ }
+
+ return true;
+ }
+
+ /// <summary>
+ /// Attempts to dequeue a pre-accepted connection from the multishot accept queue.
+ /// Returns true if a connection was available, populating the operation fields.
+ /// </summary>
+ internal bool TryDequeuePreAcceptedConnection(AcceptOperation operation)
+ {
+ EnsureMultishotAcceptQueueInitialized();
+ Queue<PreAcceptedConnection>? queue = _multishotAcceptQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ PreAcceptedConnection accepted;
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ if (queue.Count == 0)
+ {
+ return false;
+ }
+
+ accepted = queue.Dequeue();
+ }
+
+ try
+ {
+ operation.AcceptedFileDescriptor = accepted.FileDescriptor;
+ // Clamp the stored address to the operation's buffer size before copying.
+ int socketAddressLen = accepted.SocketAddressLength;
+ if ((uint)socketAddressLen > (uint)operation.SocketAddress.Length)
+ {
+ socketAddressLen = operation.SocketAddress.Length;
+ }
+
+ if (socketAddressLen != 0)
+ {
+ accepted.SocketAddressData.AsSpan(0, socketAddressLen).CopyTo(operation.SocketAddress.Span);
+ }
+
+ operation.AcceptSocketAddressLength = socketAddressLen;
+ operation.SocketAddress = operation.SocketAddress.Slice(0, socketAddressLen);
+ operation.ErrorCode = SocketError.Success;
+ return true;
+ }
+ finally
+ {
+ // The pooled address copy is always returned, success or not.
+ ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
+ }
+ }
+
+ /// <summary>Records that a shadow listener's multishot accept SQE was armed on the specified engine.</summary>
+ internal void RecordReusePortShadowArmed(ulong userData, int engineIndex)
+ {
+ Lock gate = EnsureReusePortShadowListenersGate();
+ lock (gate)
+ {
+ ReusePortShadowListenerState[]? shadows = _reusePortShadowListeners;
+ if (shadows is null)
+ {
+ // Listener set was already torn down; nothing to record.
+ return;
+ }
+
+ // Linear scan is fine: the array is bounded by engine count.
+ for (int i = 0; i < shadows.Length; i++)
+ {
+ if (shadows[i].EngineIndex == engineIndex)
+ {
+ shadows[i].ArmedUserData = userData;
+ return;
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Creates SO_REUSEPORT shadow listener sockets on non-primary engines to distribute
+ /// incoming connections across all io_uring engines. Called after primary multishot accept
+ /// is successfully armed. Failures are best-effort: each step bails out silently.
+ /// </summary>
+ internal unsafe void TryCreateReusePortShadowListeners(SocketAsyncEngine primaryEngine)
+ {
+ if (SocketAsyncEngine.IsReusePortAcceptDisabled() || SocketAsyncEngine.EngineCount <= 1)
+ {
+ return;
+ }
+
+ // Get the primary socket's bound address via getsockname.
+ byte* sockAddrBuffer = stackalloc byte[128]; // large enough for sockaddr_storage
+ int sockAddrLen = 128;
+ Interop.Error getSockNameErr = Interop.Sys.GetSockName(_socket, sockAddrBuffer, &sockAddrLen);
+ if (getSockNameErr != Interop.Error.SUCCESS || sockAddrLen <= 0)
+ {
+ return;
+ }
+
+ ReadOnlySpan<byte> boundAddress = new ReadOnlySpan<byte>(sockAddrBuffer, sockAddrLen);
+
+ // Determine socket family, type, protocol from the primary socket.
+ Interop.Error getTypeErr = Interop.Sys.GetSocketType(
+ _socket,
+ out AddressFamily addressFamily,
+ out SocketType socketType,
+ out ProtocolType protocolType,
+ out bool _);
+ if (getTypeErr != Interop.Error.SUCCESS)
+ {
+ return;
+ }
+
+ // SO_REUSEPORT must be enabled before the primary bind/listen sequence.
+ // If the primary wasn't created with REUSEPORT, shadow binds to the same endpoint
+ // won't join the reuseport group and will fail with EADDRINUSE.
+ if (!IsPrimarySocketReusePortEnabled())
+ {
+ return;
+ }
+
+ SocketAsyncEngine.EnsureFdEngineAffinityTable();
+
+ int engineCount = SocketAsyncEngine.EngineCount;
+ for (int i = 0; i < engineCount; i++)
+ {
+ SocketAsyncEngine targetEngine = SocketAsyncEngine.GetEngineByIndex(i);
+ if (targetEngine == primaryEngine)
+ {
+ continue;
+ }
+
+ // Create shadow socket.
+ IntPtr shadowFd;
+ Interop.Error socketErr = Interop.Sys.Socket(
+ (int)addressFamily, (int)socketType, (int)protocolType, &shadowFd);
+ if (socketErr != Interop.Error.SUCCESS)
+ {
+ continue;
+ }
+
+ SafeSocketHandle shadowHandle = new SafeSocketHandle();
+ Marshal.InitHandle(shadowHandle, shadowFd);
+
+ bool shadowCreated = false;
+ try
+ {
+ // Set SO_REUSEPORT.
+ int reusePort = 1;
+ Interop.Error setOptErr = Interop.Sys.SetRawSockOpt(
+ shadowHandle, SolSocket, SoReusePort, (byte*)&reusePort, sizeof(int));
+ if (setOptErr != Interop.Error.SUCCESS)
+ {
+ continue;
+ }
+
+ // Bind to same address.
+ Interop.Error bindErr = Interop.Sys.Bind(shadowHandle, protocolType, boundAddress);
+ if (bindErr != Interop.Error.SUCCESS)
+ {
+ continue;
+ }
+
+ // Listen.
+ Interop.Error listenErr = Interop.Sys.Listen(shadowHandle, 512);
+ if (listenErr != Interop.Error.SUCCESS)
+ {
+ continue;
+ }
+
+ // Enqueue setup request to target engine (SQE arming happens on its event loop).
+ ReusePortShadowListenerState state = new ReusePortShadowListenerState
+ {
+ Handle = shadowHandle,
+ EngineIndex = i,
+ ArmedUserData = 0
+ };
+
+ // Publish the shadow state before enqueuing setup so RecordReusePortShadowArmed
+ // can always resolve and persist armed user_data from the target event loop.
+ AddReusePortShadowListener(ref state);
+ if (targetEngine.TryEnqueueReusePortShadowSetup(shadowHandle, this, primaryEngine))
+ {
+ shadowCreated = true;
+ }
+ else
+ {
+ RemoveReusePortShadowListenerByEngineIndex(i);
+ }
+ }
+ finally
+ {
+ // Any bail-out above disposes the handle; only a successfully enqueued
+ // shadow keeps its handle alive (owned by the listener state).
+ if (!shadowCreated)
+ {
+ shadowHandle.Dispose();
+ }
+ }
+ }
+ }
+
+ /// <summary>Removes a completed io_uring operation from its queue and signals or dispatches its callback.</summary>
+ /// <returns>false when the operation was not found in its queue (e.g. already removed); true otherwise.</returns>
+ internal bool TryCompleteIoUringOperation(AsyncOperation operation)
+ {
+ bool removed =
+ operation is ReadOperation readOperation ? _receiveQueue.TryRemoveCompletedOperation(this, readOperation) :
+ operation is WriteOperation writeOperation ? _sendQueue.TryRemoveCompletedOperation(this, writeOperation) :
+ false;
+ if (!removed)
+ {
+ return false;
+ }
+
+ // Synchronous waiters block on the event instead of a callback.
+ ManualResetEventSlim? e = operation.Event;
+ if (e is not null)
+ {
+ e.Set();
+ return true;
+ }
+
+ operation.CancellationRegistration.Dispose();
+ if (ShouldDispatchCompletionCallback(operation))
+ {
+ if (PreferInlineCompletions)
+ {
+ // Inline completion: invoke directly on the event-loop thread,
+ // matching the epoll path (HandleEventsInline). This avoids the
+ // ThreadPool hop for latency-sensitive workloads that opted in
+ // via DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS=1.
+ operation.InvokeCallback(allowPooling: true);
+ }
+ else
+ {
+ operation.QueueIoUringCompletionCallback();
+ }
+ }
+
+ return true;
+ }
+
+ /// <summary>Enqueues an operation for deferred SQE preparation on the event loop thread; false when no engine is attached or the engine rejects it.</summary>
+ private bool TryEnqueueIoUringPreparation(AsyncOperation operation, long prepareSequence)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ if (engine is null)
+ {
+ return false;
+ }
+
+ return engine.TryEnqueueIoUringPreparation(operation, prepareSequence);
+ }
+
+ /// <summary>Applies cancellation and/or untracking to an operation's io_uring state.</summary>
+ /// <param name="requestKernelCancellation">When true, asks the engine to submit ASYNC_CANCEL for the user_data.</param>
+ /// <param name="untrackAndClear">When true, removes the operation from the engine registry and clears its user_data (only if the engine permits).</param>
+ private void HandleIoUringCancellationTransition(
+ AsyncOperation operation,
+ bool requestKernelCancellation,
+ bool untrackAndClear)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ ulong userData = operation.IoUringUserData;
+ if (userData == 0)
+ {
+ // Nothing in flight: no SQE was published for this operation.
+ return;
+ }
+
+ if (requestKernelCancellation)
+ {
+ engine?.TryRequestIoUringCancellation(userData);
+ }
+
+ if (untrackAndClear)
+ {
+ // With no engine, default to clearing locally (nothing tracks the user_data).
+ bool clearAllowed = engine?.TryUntrackIoUringOperation(userData, operation) ?? true;
+ if (clearAllowed)
+ {
+ operation.ClearIoUringUserData();
+ }
+ }
+ }
+
+ /// <summary>Requests kernel-level ASYNC_CANCEL for an in-flight operation (does not untrack it).</summary>
+ private void TryRequestIoUringCancellation(AsyncOperation operation)
+ {
+ HandleIoUringCancellationTransition(
+ operation,
+ requestKernelCancellation: true,
+ untrackAndClear: false);
+ }
+
+ /// <summary>Removes an operation from the registry and clears its user_data (no kernel cancel).</summary>
+ internal void TryUntrackIoUringOperation(AsyncOperation operation)
+ {
+ HandleIoUringCancellationTransition(
+ operation,
+ requestKernelCancellation: false,
+ untrackAndClear: true);
+ }
+
+ /// <summary>Stages an operation for io_uring preparation if completion mode is active.</summary>
+ /// <remarks>
+ /// Only applies to async waits (no Event), with completion mode enabled, no SQE already
+ /// published (user_data == 0), and the operation still in the Waiting state.
+ /// </remarks>
+ static partial void LinuxTryStageIoUringOperation(AsyncOperation operation)
+ {
+ if (operation.Event is null &&
+ operation.AssociatedContext.IsIoUringCompletionModeEnabled() &&
+ operation.IoUringUserData == 0 &&
+ operation.IsInWaitingState())
+ {
+ if (!operation.TryQueueIoUringPreparation())
+ {
+ // Prepare queue overflow: fall back to readiness-style notification.
+ operation.EmitReadinessFallbackForQueueOverflow();
+ }
+ }
+ }
+
+ /// <summary>Platform hook: dequeues a pre-accepted connection into the operation, if one is buffered.</summary>
+ partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued)
+ {
+ dequeued = TryDequeuePreAcceptedConnection(operation);
+ }
+
+ /// <summary>Platform hook: reports whether any multishot-recv payload is buffered, read under the data gate.</summary>
+ partial void LinuxHasBufferedPersistentMultishotRecvData(ref bool hasBuffered)
+ {
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ lock (gate)
+ {
+ hasBuffered = PersistentMultishotRecvBufferedCount > 0;
+ }
+ }
+
+ /// <summary>Platform hook: drains buffered multishot-recv payload into the destination.</summary>
+ partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, ref bool consumed, ref int bytesTransferred)
+ {
+ consumed = TryConsumeBufferedPersistentMultishotRecvData(destination, out bytesTransferred);
+ }
+
+ /// <summary>Cleans up multishot-accept state and queued pre-accepted descriptors during abort.</summary>
+ partial void LinuxOnStopAndAbort()
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ if (IsPersistentMultishotRecvArmed())
+ {
+ RequestPersistentMultishotRecvCancel();
+ }
+
+ // Cancel any armed multishot accept before disarming local state.
+ ulong armedUserData = GetArmedMultishotAcceptUserDataForCancellation();
+ if (engine is not null && armedUserData != 0)
+ {
+ engine.TryRequestIoUringCancellation(armedUserData);
+ }
+
+ DisarmMultishotAccept();
+
+ // Clean up SO_REUSEPORT shadow listeners.
+ ReusePortShadowListenerState[]? shadows;
+ Lock shadowGate = EnsureReusePortShadowListenersGate();
+ lock (shadowGate)
+ {
+ // Detach under the gate so concurrent RecordReusePortShadowArmed sees null.
+ shadows = _reusePortShadowListeners;
+ _reusePortShadowListeners = null;
+ }
+
+ if (shadows is not null)
+ {
+ for (int i = 0; i < shadows.Length; i++)
+ {
+ ref ReusePortShadowListenerState shadow = ref shadows[i];
+ if (shadow.ArmedUserData != 0)
+ {
+ SocketAsyncEngine targetEngine = SocketAsyncEngine.GetEngineByIndex(shadow.EngineIndex);
+ targetEngine.TryRequestIoUringCancellation(shadow.ArmedUserData);
+ }
+
+ shadow.Handle?.Dispose();
+ }
+ }
+
+ // Close queued pre-accepted descriptors; each is dequeued under the gate but
+ // closed outside it to keep the lock hold time short.
+ Queue<PreAcceptedConnection>? multishotAcceptQueue = _multishotAcceptQueue;
+ if (multishotAcceptQueue is not null)
+ {
+ while (true)
+ {
+ PreAcceptedConnection accepted;
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ if (multishotAcceptQueue.Count == 0)
+ {
+ break;
+ }
+
+ accepted = multishotAcceptQueue.Dequeue();
+ }
+
+ Interop.Sys.Close(accepted.FileDescriptor);
+ ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
+ }
+ }
+
+ // Return all buffered recv payloads (head and queued) to the pool.
+ Lock persistentGate = EnsurePersistentMultishotRecvDataGate();
+ lock (persistentGate)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+
+ Queue<BufferedPersistentMultishotRecvData>? bufferedQueue = _persistentMultishotRecvDataQueue;
+ if (bufferedQueue is not null)
+ {
+ while (bufferedQueue.Count != 0)
+ {
+ BufferedPersistentMultishotRecvData buffered = bufferedQueue.Dequeue();
+ ReturnPooledBufferIfNeeded(buffered.Data, buffered.UsesPooledBuffer);
+ }
+ }
+ }
+ }
+
+
+ /// <summary>Ensures the buffered multishot-recv replay queue exists (lazily allocated under the data gate).</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void EnsurePersistentMultishotRecvDataQueueInitialized()
+ {
+ if (_persistentMultishotRecvDataQueue is null)
+ {
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ lock (gate)
+ {
+ _persistentMultishotRecvDataQueue ??= new Queue<BufferedPersistentMultishotRecvData>();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Returns the current head replay entry, dequeuing a new one when none is cached.
+ /// Must be called under the persistent-recv data gate.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered)
+ {
+ if (_hasPersistentMultishotRecvDataHead)
+ {
+ buffered = _persistentMultishotRecvDataHead;
+ return true;
+ }
+
+ Queue<BufferedPersistentMultishotRecvData>? queue = _persistentMultishotRecvDataQueue;
+ if (queue is null || queue.Count == 0)
+ {
+ buffered = default;
+ return false;
+ }
+
+ BufferedPersistentMultishotRecvData dequeued = queue.Dequeue();
+ _persistentMultishotRecvDataHead = dequeued;
+ _hasPersistentMultishotRecvDataHead = true;
+ _persistentMultishotRecvDataHeadOffset = 0;
+ buffered = dequeued;
+ return true;
+ }
+
+ /// <summary>
+ /// Releases the cached head replay entry, returning its pooled buffer if needed.
+ /// Must be called under the persistent-recv data gate.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ReleasePersistentMultishotRecvDataHead()
+ {
+ if (!_hasPersistentMultishotRecvDataHead)
+ {
+ return;
+ }
+
+ // Clear state before returning the buffer so no path can observe a released head.
+ BufferedPersistentMultishotRecvData head = _persistentMultishotRecvDataHead;
+ _persistentMultishotRecvDataHead = default;
+ _hasPersistentMultishotRecvDataHead = false;
+ _persistentMultishotRecvDataHeadOffset = 0;
+ ReturnPooledBufferIfNeeded(head.Data, head.UsesPooledBuffer);
+ }
+
+ /// <summary>Returns a buffer to the shared pool only when it was rented from it.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ReturnPooledBufferIfNeeded(byte[] buffer, bool usesPooledBuffer)
+ {
+ if (usesPooledBuffer)
+ {
+ ArrayPool<byte>.Shared.Return(buffer);
+ }
+ }
+
+ /// <summary>
+ /// Reads the armed multishot-accept user_data for cancellation, briefly spinning through
+ /// the transient "arming but user_data not yet published" state.
+ /// </summary>
+ private ulong GetArmedMultishotAcceptUserDataForCancellation()
+ {
+ long packedState = Volatile.Read(ref _multishotAcceptState);
+ ulong userData = DecodeMultishotAcceptUserData(packedState);
+ if (userData != 0 || packedState == MultishotAcceptStateDisarmed)
+ {
+ // Fast path: either a valid user_data or definitively disarmed.
+ return userData;
+ }
+
+ // A transient "arming without published user_data" state can race this read.
+ // Bounded spin is best-effort; a miss is benign because later cancellation
+ // and teardown paths still unarm/cleanup safely.
+ SpinWait spinner = default;
+ do
+ {
+ spinner.SpinOnce();
+ packedState = Volatile.Read(ref _multishotAcceptState);
+ userData = DecodeMultishotAcceptUserData(packedState);
+ if (userData != 0 || packedState == MultishotAcceptStateDisarmed)
+ {
+ break;
+ }
+ } while (!spinner.NextSpinWillYield);
+
+ return userData;
+ }
+
+ internal abstract partial class AsyncOperation
+ {
+ /// <summary>Outcome of processing an io_uring CQE, determining the dispatch action.</summary>
+ internal enum IoUringCompletionResult
+ {
+ Completed = 0, // Operation finished; dispatch its completion.
+ Pending = 1,   // Not complete (e.g. transient retry); remains waiting.
+ Canceled = 2,  // Cancellation won the race; run cancellation processing.
+ Ignored = 3    // Operation was not in a state that accepts this CQE.
+ }
+
+ /// <summary>Tri-state result from direct (managed) SQE preparation.</summary>
+ internal enum IoUringDirectPrepareResult
+ {
+ Unsupported = 0, // Direct path unavailable for this shape; caller keeps operation pending.
+ Prepared = 1, // SQE written
+ PrepareFailed = 2, // Direct preparation failed; caller handles retry/fallback without native prepare.
+ CompletedFromBuffer = 3 // Operation completed synchronously from early-buffer data; no SQE needed.
+ }
+
+ /// <summary>Tracks whether a receive operation prepared as one-shot or multishot.</summary>
+ internal enum IoUringReceiveSubmissionMode : byte
+ {
+ None = 0,
+ OneShot = 1,
+ Multishot = 2
+ }
+
+ // Monotonic sequence bumped on each reset; queued prepares carry the sequence they were staged with.
+ private long _ioUringPrepareSequence;
+ // 1 while a prepare work item is queued for this operation, 0 otherwise.
+ private int _ioUringPrepareQueued;
+ // 1 when pinned/prepared resources can be reused by the next prepare without re-pinning.
+ private int _ioUringPreparationReusable;
+ // Handle for the pinned I/O buffer; valid only while _ioUringPinnedBufferActive != 0.
+ private MemoryHandle _ioUringPinnedBuffer;
+ private int _ioUringPinnedBufferActive;
+ // Lengths reported by the kernel in the CQE for recvmsg-style completions.
+ private int _ioUringCompletionSocketAddressLen;
+ private int _ioUringCompletionControlBufferLen;
+ // Stored as int for Volatile access; values come from IoUringReceiveSubmissionMode.
+ private int _ioUringReceiveSubmissionMode;
+ private int _ioUringSlotExhaustionRetryCount;
+ // Nonzero while an SQE for this operation is (or may be) in flight with the kernel.
+ internal ulong IoUringUserData;
+
+ /// <summary>Platform hook: requests kernel cancellation when the flag is set.</summary>
+ partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation)
+ {
+ if (requestIoUringCancellation)
+ {
+ AssociatedContext.TryRequestIoUringCancellation(this);
+ }
+ }
+
+ /// <summary>Platform hook: untracks this operation unless it is in the Canceled state awaiting a terminal CQE.</summary>
+ partial void LinuxUntrackIoUringOperation()
+ {
+ // Canceled operations remain tracked until the terminal CQE arrives so that
+ // pinned/user-owned resources are not released while the kernel may still
+ // reference them. Dispatch will clear resources on that terminal completion.
+ if (_state == State.Canceled)
+ {
+ return;
+ }
+
+ AssociatedContext.TryUntrackIoUringOperation(this);
+ }
+
+ /// <summary>Resets all io_uring preparation state and advances the prepare sequence.</summary>
+ partial void ResetIoUringState()
+ {
+ ReleaseIoUringPreparationResources();
+ IoUringUserData = 0;
+ Volatile.Write(ref _ioUringPreparationReusable, 0);
+ _ioUringCompletionSocketAddressLen = 0;
+ _ioUringCompletionControlBufferLen = 0;
+ _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
+ _ioUringSlotExhaustionRetryCount = 0;
+ long nextPrepareSequence = unchecked(_ioUringPrepareSequence + 1);
+ // Keep sequence strictly positive so stale queued work from previous resets never matches.
+ if (nextPrepareSequence <= 0)
+ {
+ nextPrepareSequence = 1;
+ }
+
+ // Publish the new sequence before clearing the queued flag.
+ Volatile.Write(ref _ioUringPrepareSequence, nextPrepareSequence);
+ Volatile.Write(ref _ioUringPrepareQueued, 0);
+ }
+
+ /// <summary>Marks this operation as ready for SQE preparation and returns its sequence number.</summary>
+ internal long MarkReadyForIoUringPreparation()
+ {
+ long prepareSequence = Volatile.Read(ref _ioUringPrepareSequence);
+ Debug.Assert(prepareSequence > 0);
+ Volatile.Write(ref _ioUringPrepareQueued, 1);
+ return prepareSequence;
+ }
+
+ /// <summary>Cancels a pending preparation, but only if the sequence number still matches (a reset invalidates it).</summary>
+ internal void CancelPendingIoUringPreparation(long prepareSequence)
+ {
+ if (Volatile.Read(ref _ioUringPrepareSequence) == prepareSequence)
+ {
+ Volatile.Write(ref _ioUringPrepareQueued, 0);
+ }
+ }
+
+ /// <summary>Attempts to prepare an SQE for this operation via the managed direct path.</summary>
+ /// <param name="prepareSequence">Sequence captured at staging; a mismatch means the operation was reset since.</param>
+ /// <returns>true only when an SQE was written (Prepared); false for every stale/fallback/failed path.</returns>
+ internal bool TryPrepareIoUring(SocketAsyncContext context, long prepareSequence)
+ {
+ // Reject stale work: sequence must match and the operation must still be waiting.
+ long observedPrepareSequence = Volatile.Read(ref _ioUringPrepareSequence);
+ bool waiting = _state == State.Waiting;
+ if (prepareSequence <= 0 ||
+ observedPrepareSequence != prepareSequence ||
+ !waiting)
+ {
+ return false;
+ }
+
+ // Consume the queued flag only for a currently valid sequence/state pair.
+ // Stale work items must not clear a newer queued prepare request.
+ if (Interlocked.CompareExchange(ref _ioUringPrepareQueued, 0, 1) == 0)
+ {
+ return false;
+ }
+
+ // Unless the previous preparation was marked reusable, drop its resources now.
+ if (Interlocked.Exchange(ref _ioUringPreparationReusable, 0) == 0)
+ {
+ ReleaseIoUringPreparationResources();
+ }
+
+ SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine);
+ if (engine is null || !engine.IsIoUringDirectSqeEnabled)
+ {
+ // Managed completion mode assumes direct SQE submission.
+ // If direct submission is unavailable, keep operation pending for fallback handling.
+ ErrorCode = SocketError.Success;
+ IoUringUserData = 0;
+ return false;
+ }
+
+ IoUringDirectPrepareResult directResult = IoUringPrepareDirect(context, engine, out ulong directUserData);
+ if (directResult == IoUringDirectPrepareResult.CompletedFromBuffer)
+ {
+ // Operation completed synchronously from early-buffer data during prepare.
+ // Transition to Complete; caller will dispatch the completion callback.
+ _state = State.Complete;
+ IoUringUserData = 0;
+ return false;
+ }
+
+ if (directResult == IoUringDirectPrepareResult.Prepared)
+ {
+ _ioUringSlotExhaustionRetryCount = 0;
+ // Only track user_data for a cleanly prepared SQE; an error code means no tracking.
+ IoUringUserData = ErrorCode == SocketError.Success ? directUserData : 0;
+ return true;
+ }
+
+ if (directResult == IoUringDirectPrepareResult.PrepareFailed)
+ {
+ IoUringUserData = 0;
+ return false;
+ }
+
+ // Direct preparation unsupported for this operation shape.
+ // Leave operation pending so caller can use completion-path fallback semantics.
+ ErrorCode = SocketError.Success;
+ IoUringUserData = 0;
+ return false;
+ }
+
+ /// <summary>Queues this operation for deferred preparation on the event loop thread.</summary>
+ /// <returns>false when completion mode is off or the engine's prepare queue rejected the item.</returns>
+ internal bool TryQueueIoUringPreparation()
+ {
+ if (!AssociatedContext.IsIoUringCompletionModeEnabled())
+ {
+ return false;
+ }
+
+ long prepareSequence = MarkReadyForIoUringPreparation();
+ if (AssociatedContext.TryEnqueueIoUringPreparation(this, prepareSequence))
+ {
+ return true;
+ }
+
+ // Enqueue failed: undo the queued flag so the operation is not left half-staged.
+ CancelPendingIoUringPreparation(prepareSequence);
+ return false;
+ }
+
+ /// <summary>Returns whether this operation is currently in the waiting state.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool IsInWaitingState() => _state == State.Waiting;
+ /// <summary>Returns whether this operation has reached the complete state.</summary>
+ internal bool IsInCompletedState() => _state == State.Complete;
+
+ /// <summary>Increments and returns the slot-exhaustion retry count for this operation.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal int IncrementIoUringSlotExhaustionRetryCount() => ++_ioUringSlotExhaustionRetryCount;
+
+ /// <summary>Resets slot-exhaustion retry tracking for this operation.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void ResetIoUringSlotExhaustionRetryCount() => _ioUringSlotExhaustionRetryCount = 0;
+
+ /// <summary>
+ /// Emits a readiness fallback event when io_uring prepare-queue staging fails.
+ /// </summary>
+ internal void EmitReadinessFallbackForQueueOverflow()
+ {
+ Interop.Sys.SocketEvents fallbackEvents = GetIoUringFallbackSocketEvents();
+ if (fallbackEvents == Interop.Sys.SocketEvents.None)
+ {
+ // No readiness events map to this operation shape; nothing to emit.
+ return;
+ }
+
+ SocketAsyncContext context = AssociatedContext;
+ SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine);
+ if (engine is null)
+ {
+ return;
+ }
+
+ // Queue-overflow fallback still needs completion-mode re-prepare semantics:
+ // mark the operation so the next readiness-driven EAGAIN path restages an SQE.
+ RequestIoUringFallbackReprepare();
+
+ engine.EnqueueReadinessFallbackEvent(
+ context,
+ fallbackEvents,
+ countAsPrepareQueueOverflowFallback: true);
+ }
+
+ /// <summary>Processes a CQE result and returns the dispatch action for the completion handler.</summary>
+ /// <param name="result">Raw CQE res value.</param>
+ /// <param name="flags">CQE flags.</param>
+ /// <param name="auxiliaryData">Extra per-operation payload forwarded to the discriminator.</param>
+ internal IoUringCompletionResult ProcessIoUringCompletionResult(int result, uint flags, uint auxiliaryData)
+ {
+ Trace($"Enter, result={result}, flags={flags}, auxiliaryData={auxiliaryData}");
+
+ // Claim ownership of completion processing; if cancellation already won, do not publish completion.
+ State oldState = Interlocked.CompareExchange(ref _state, State.Running, State.Waiting);
+ if (oldState == State.Canceled)
+ {
+ Trace("Exit, previously canceled");
+ return IoUringCompletionResult.Canceled;
+ }
+
+ if (oldState != State.Waiting)
+ {
+ Trace("Exit, ignored");
+ return IoUringCompletionResult.Ignored;
+ }
+
+ if (ProcessIoUringCompletionViaDiscriminator(AssociatedContext, result, auxiliaryData))
+ {
+ _state = State.Complete;
+ Trace("Exit, completed");
+ return IoUringCompletionResult.Completed;
+ }
+
+ // Incomplete path (e.g. transient retry): mirror TryComplete state transition handling.
+ State newState;
+ while (true)
+ {
+ State state = _state;
+ Debug.Assert(state is State.Running or State.RunningWithPendingCancellation, $"Unexpected operation state: {(State)state}");
+
+ // Running -> Waiting (retry later); RunningWithPendingCancellation -> Canceled.
+ newState = (state == State.Running ? State.Waiting : State.Canceled);
+ if (state == Interlocked.CompareExchange(ref _state, newState, state))
+ {
+ break;
+ }
+ }
+
+ if (newState == State.Canceled)
+ {
+ ProcessCancellation();
+ Trace("Exit, canceled while pending");
+ return IoUringCompletionResult.Canceled;
+ }
+
+ Trace("Exit, pending");
+ return IoUringCompletionResult.Pending;
+ }
+
+ /// <summary>Stores recvmsg output lengths from the CQE for post-completion processing.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void SetIoUringCompletionMessageMetadata(int socketAddressLen, int controlBufferLen)
+ {
+ _ioUringCompletionSocketAddressLen = socketAddressLen;
+ _ioUringCompletionControlBufferLen = controlBufferLen;
+ }
+
+ /// <summary>Releases preparation resources and resets all per-submission io_uring state to zero.</summary>
+ internal void ClearIoUringUserData()
+ {
+ ReleaseIoUringPreparationResources();
+ IoUringUserData = 0;
+ Volatile.Write(ref _ioUringPreparationReusable, 0);
+ _ioUringCompletionSocketAddressLen = 0;
+ _ioUringCompletionControlBufferLen = 0;
+ _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
+ _ioUringSlotExhaustionRetryCount = 0;
+ }
+
+ /// <summary>Clears user_data without releasing preparation resources, so a pending requeue can reuse them.</summary>
+ internal void ResetIoUringUserDataForRequeue()
+ {
+ IoUringUserData = 0;
+ _ioUringCompletionSocketAddressLen = 0;
+ _ioUringCompletionControlBufferLen = 0;
+ }
+
+ /// <summary>Records whether the current receive preparation uses one-shot or multishot mode.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected void SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode mode)
+ {
+ Volatile.Write(ref _ioUringReceiveSubmissionMode, (int)mode);
+ }
+
+ /// <summary>Whether an I/O buffer is currently pinned for this operation.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected bool IsIoUringBufferPinned() =>
+ Volatile.Read(ref _ioUringPinnedBufferActive) != 0;
+
+ /// <summary>Marks preparation resources as reusable so the next prepare skips re-pinning.</summary>
+ internal void MarkIoUringPreparationReusable()
+ {
+ Volatile.Write(ref _ioUringPreparationReusable, 1);
+ }
+
+ /// <summary>Socket address length reported by the kernel in the CQE.</summary>
+ protected int IoUringCompletionSocketAddressLen => _ioUringCompletionSocketAddressLen;
+ /// <summary>Control buffer length reported by the kernel in the CQE.</summary>
+ protected int IoUringCompletionControlBufferLen => _ioUringCompletionControlBufferLen;
+
+ /// <summary>Pins a buffer and returns the raw pointer, recording the handle for later release.</summary>
+ /// <returns>null for an empty buffer; otherwise the pinned base pointer.</returns>
+ protected unsafe byte* PinIoUringBuffer(Memory<byte> buffer)
+ {
+ // Drop any previously pinned buffer before pinning a new one.
+ ReleasePinnedIoUringBuffer();
+ if (buffer.Length == 0)
+ {
+ return null;
+ }
+
+ _ioUringPinnedBuffer = buffer.Pin();
+ Volatile.Write(ref _ioUringPinnedBufferActive, 1);
+ return (byte*)_ioUringPinnedBuffer.Pointer;
+ }
+
+ /// <summary>Attempts to pin a buffer, falling back to the readiness path if not pinnable.</summary>
+ /// <returns>false means "use the readiness fallback" (ErrorCode is set to Success, not a failure).</returns>
+ protected unsafe bool TryPinIoUringBuffer(Memory<byte> buffer, out byte* pinnedBuffer)
+ {
+ // Reuse an already-active pin when present.
+ if (Volatile.Read(ref _ioUringPinnedBufferActive) != 0)
+ {
+ pinnedBuffer = (byte*)_ioUringPinnedBuffer.Pointer;
+ if (buffer.Length > 0 && pinnedBuffer is null)
+ {
+ // A null pointer for a non-empty buffer means the memory isn't pinnable.
+ ReleasePinnedIoUringBuffer();
+ RecordIoUringNonPinnablePrepareFallback();
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ return true;
+ }
+
+ try
+ {
+ pinnedBuffer = PinIoUringBuffer(buffer);
+ if (buffer.Length > 0 && pinnedBuffer is null)
+ {
+ ReleasePinnedIoUringBuffer();
+ RecordIoUringNonPinnablePrepareFallback();
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ return true;
+ }
+ catch (NotSupportedException)
+ {
+ // Memory<byte>.Pin can throw for non-pinnable owners; treat as fallback, not failure.
+ pinnedBuffer = null;
+ RecordIoUringNonPinnablePrepareFallback();
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+ }
+
+ /// <summary>Transfers ownership of the active pinned buffer to the caller; default when none is active.</summary>
+ internal MemoryHandle TransferPinnedBuffer()
+ {
+ // Atomically claim the active flag so only one caller can take the handle.
+ if (Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) == 0)
+ {
+ return default;
+ }
+
+ MemoryHandle pinnedBuffer = _ioUringPinnedBuffer;
+ _ioUringPinnedBuffer = default;
+ return pinnedBuffer;
+ }
+
+ /// <summary>
+ /// Attempts to pin a socket address buffer, reusing an existing pin when possible.
+ /// Caller is responsible for setting operation ErrorCode on failure if needed.
+ /// </summary>
+ protected static unsafe bool TryPinIoUringSocketAddress(
+ Memory<byte> socketAddress,
+ ref MemoryHandle pinnedSocketAddress,
+ ref int pinnedSocketAddressActive,
+ out byte* rawSocketAddress)
+ {
+ rawSocketAddress = null;
+ if (socketAddress.Length == 0)
+ {
+ // Empty address: nothing to pin, succeed with a null pointer.
+ return true;
+ }
+
+ // Reuse an existing pin when its pointer is still valid.
+ if (Volatile.Read(ref pinnedSocketAddressActive) != 0)
+ {
+ rawSocketAddress = (byte*)pinnedSocketAddress.Pointer;
+ if (rawSocketAddress is null)
+ {
+ pinnedSocketAddress.Dispose();
+ pinnedSocketAddress = default;
+ Volatile.Write(ref pinnedSocketAddressActive, 0);
+ return false;
+ }
+
+ return true;
+ }
+
+ try
+ {
+ pinnedSocketAddress = socketAddress.Pin();
+ Volatile.Write(ref pinnedSocketAddressActive, 1);
+ }
+ catch (NotSupportedException)
+ {
+ // Non-pinnable memory owner; report failure without an active pin.
+ rawSocketAddress = null;
+ return false;
+ }
+
+ rawSocketAddress = (byte*)pinnedSocketAddress.Pointer;
+ if (rawSocketAddress is null)
+ {
+ pinnedSocketAddress.Dispose();
+ pinnedSocketAddress = default;
+ Volatile.Write(ref pinnedSocketAddressActive, 0);
+ return false;
+ }
+
+ return true;
+ }
+
/// <summary>
/// Pins a socket address buffer and normalizes pinning failures to a non-terminal fallback signal.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected unsafe bool TryPinIoUringSocketAddressForPrepare(
    Memory<byte> socketAddress,
    ref MemoryHandle pinnedSocketAddress,
    ref int pinnedSocketAddressActive,
    out byte* rawSocketAddress)
{
    if (TryPinIoUringSocketAddress(
        socketAddress,
        ref pinnedSocketAddress,
        ref pinnedSocketAddressActive,
        out rawSocketAddress))
    {
        return true;
    }

    // Success + false tells the caller to fall back to the readiness path rather than fail the operation.
    ErrorCode = SocketError.Success;
    return false;
}
+
/// <summary>Releases an operation-owned pinned socket-address buffer and message-header allocation.</summary>
/// <remarks>
/// Both releases use Interlocked.Exchange so the method is safe against concurrent/duplicate calls:
/// each resource is released by exactly one caller.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected static unsafe void ReleaseIoUringSocketAddressAndMessageHeader(
    ref MemoryHandle pinnedSocketAddress,
    ref int pinnedSocketAddressActive,
    ref IntPtr messageHeader)
{
    // Claim the active flag; only the winner disposes the pin.
    if (Interlocked.Exchange(ref pinnedSocketAddressActive, 0) != 0)
    {
        pinnedSocketAddress.Dispose();
        pinnedSocketAddress = default;
    }

    // Claim the native msghdr pointer; only the winner frees it.
    IntPtr header = Interlocked.Exchange(ref messageHeader, IntPtr.Zero);
    if (header != IntPtr.Zero)
    {
        NativeMemory.Free((void*)header);
    }
}
+
/// <summary>Records a telemetry counter for a non-pinnable buffer fallback.</summary>
private void RecordIoUringNonPinnablePrepareFallback()
{
    SocketAsyncEngine? engine = Volatile.Read(ref AssociatedContext._asyncEngine);
    // Only count when an engine exists and is actually running in io_uring completion mode.
    if (engine is not null && engine.IsIoUringCompletionModeEnabled)
    {
        engine.RecordIoUringNonPinnablePrepareFallback();
    }
}
+
/// <summary>Releases the currently pinned buffer handle if active.</summary>
/// <remarks>Interlocked claim of the flag makes duplicate release calls harmless.</remarks>
private void ReleasePinnedIoUringBuffer()
{
    if (Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) != 0)
    {
        _ioUringPinnedBuffer.Dispose();
        _ioUringPinnedBuffer = default;
    }
}
+
/// <summary>Releases the pinned buffer when the operation shape (single vs list) changes.</summary>
protected void ReleaseIoUringPinnedBufferForShapeTransition() =>
    ReleasePinnedIoUringBuffer();

/// <summary>Releases all preparation resources: the pinned buffer plus subclass-owned resources.</summary>
private void ReleaseIoUringPreparationResources()
{
    ReleasePinnedIoUringBuffer();
    ReleaseIoUringPreparationResourcesCore();
}

/// <summary>Subclass hook to release operation-specific preparation resources (default: none).</summary>
protected virtual void ReleaseIoUringPreparationResourcesCore()
{
}
+
/// <summary>Frees a set of GCHandles used for buffer list pinning.</summary>
/// <param name="pinnedHandles">Handle array; entries past <paramref name="count"/> are untouched.</param>
/// <param name="count">Number of valid entries; values beyond the array length are clamped.</param>
protected static void ReleasePinnedHandles(GCHandle[] pinnedHandles, int count)
{
    if (count <= 0)
    {
        return;
    }

    // Clamp defensively in case the recorded count ever exceeds the rented array length.
    int limit = Math.Min(count, pinnedHandles.Length);
    for (int index = 0; index < limit; index++)
    {
        GCHandle handle = pinnedHandles[index];
        if (handle.IsAllocated)
        {
            handle.Free();
        }
    }
}
+
/// <summary>Rents an array from the shared pool for temporary io_uring preparation use.</summary>
/// <remarks>A zero-length request avoids the pool entirely and returns the shared empty array.</remarks>
private static T[] RentIoUringArray<T>(int minimumLength) =>
    minimumLength == 0 ? Array.Empty<T>() : ArrayPool<T>.Shared.Rent(minimumLength);

/// <summary>Returns a rented array to the shared pool; empty arrays (never rented) are ignored.</summary>
private static void ReturnIoUringArray<T>(T[] array, bool clearArray = false)
{
    if (array.Length != 0)
    {
        ArrayPool<T>.Shared.Return(array, clearArray);
    }
}
+
/// <summary>Releases pinned handles and returns the iovec array to the pool.</summary>
/// <remarks>
/// Ownership of both arrays and the handle count is claimed via Interlocked.Exchange, so
/// concurrent or repeated release calls free each resource exactly once.
/// </remarks>
protected static void ReleaseIoUringPinnedHandlesAndIovecs(
    ref GCHandle[]? pinnedHandles,
    ref Interop.Sys.IOVector[]? iovecs,
    ref int pinnedHandleCount)
{
    GCHandle[]? handles = Interlocked.Exchange(ref pinnedHandles, null);
    int handleCount = Interlocked.Exchange(ref pinnedHandleCount, 0);
    if (handles is not null)
    {
        ReleasePinnedHandles(handles, handleCount);
        // Clear on return so stale GCHandle values cannot leak to the next renter.
        ReturnIoUringArray(handles, clearArray: true);
    }

    Interop.Sys.IOVector[]? vectors = Interlocked.Exchange(ref iovecs, null);
    if (vectors is not null)
    {
        ReturnIoUringArray(vectors, clearArray: true);
    }
}
+
/// <summary>Pins a list of buffer segments and builds an iovec array for scatter/gather I/O.</summary>
/// <param name="buffers">The segment list to pin.</param>
/// <param name="startIndex">Index of the first segment to include.</param>
/// <param name="startOffset">Byte offset into the first segment (subsequent segments start at 0).</param>
/// <param name="pinnedHandles">Rented handle array; only the first <paramref name="pinnedHandleCount"/> entries are valid.</param>
/// <param name="iovecs">Rented iovec array; only the first <paramref name="iovCount"/> entries are valid.</param>
/// <param name="iovCount">Number of populated iovec entries.</param>
/// <param name="pinnedHandleCount">Number of allocated GCHandles.</param>
/// <param name="errorCode">Success, or InvalidArgument for a bad index/offset.</param>
/// <returns>true on success; false with all rented resources already released on validation failure.</returns>
protected static unsafe bool TryPinBufferListForIoUring(
    IList<ArraySegment<byte>> buffers,
    int startIndex,
    int startOffset,
    out GCHandle[] pinnedHandles,
    out Interop.Sys.IOVector[] iovecs,
    out int iovCount,
    out int pinnedHandleCount,
    out SocketError errorCode)
{
    iovCount = 0;
    pinnedHandleCount = 0;
    // Unsigned compare also rejects negative startIndex; startIndex == Count yields zero iovecs.
    if ((uint)startIndex > (uint)buffers.Count)
    {
        errorCode = SocketError.InvalidArgument;
        pinnedHandles = Array.Empty<GCHandle>();
        iovecs = Array.Empty<Interop.Sys.IOVector>();
        return false;
    }

    int remainingBufferCount = buffers.Count - startIndex;
    pinnedHandles = RentIoUringArray<GCHandle>(remainingBufferCount);
    iovecs = RentIoUringArray<Interop.Sys.IOVector>(remainingBufferCount);

    int currentOffset = startOffset;
    // Adjacent segments frequently share one backing array; reuse the previous pin
    // rather than allocating a second GCHandle for the same object.
    byte[]? lastPinnedArray = null;
    GCHandle lastPinnedHandle = default;
    try
    {
        for (int i = 0; i < remainingBufferCount; i++, currentOffset = 0)
        {
            ArraySegment<byte> buffer = buffers[startIndex + i];
            RangeValidationHelpers.ValidateSegment(buffer);

            if ((uint)currentOffset > (uint)buffer.Count)
            {
                // Offset applies only to the first segment; out of range means a caller bug.
                ReleasePinnedHandles(pinnedHandles, pinnedHandleCount);
                ReturnIoUringArray(pinnedHandles, clearArray: true);
                ReturnIoUringArray(iovecs, clearArray: true);
                errorCode = SocketError.InvalidArgument;
                return false;
            }

            int bufferCount = buffer.Count - currentOffset;
            byte* basePtr = null;
            if (bufferCount != 0)
            {
                byte[] array = buffer.Array!;
                GCHandle handle;
                if (ReferenceEquals(array, lastPinnedArray))
                {
                    handle = lastPinnedHandle;
                }
                else
                {
                    handle = GCHandle.Alloc(array, GCHandleType.Pinned);
                    pinnedHandles[pinnedHandleCount] = handle;
                    pinnedHandleCount++;
                    lastPinnedArray = array;
                    lastPinnedHandle = handle;
                }

                basePtr = &((byte*)handle.AddrOfPinnedObject())[buffer.Offset + currentOffset];
            }

            // Zero-length segments still get an iovec entry (null base, zero count).
            iovecs[i].Base = basePtr;
            iovecs[i].Count = (UIntPtr)bufferCount;
            iovCount++;
        }
    }
    catch
    {
        // Undo partial pinning and return rented arrays before propagating
        // (e.g. a segment validation exception mid-loop).
        ReleasePinnedHandles(pinnedHandles, pinnedHandleCount);
        ReturnIoUringArray(pinnedHandles, clearArray: true);
        ReturnIoUringArray(iovecs, clearArray: true);
        throw;
    }

    errorCode = SocketError.Success;
    return true;
}
+
/// <summary>Prepares an SQE via the managed direct path. Override in subclasses for direct submission.</summary>
/// <param name="context">The owning socket context.</param>
/// <param name="engine">The engine that would receive the SQE.</param>
/// <param name="userData">Receives the SQE user_data token; 0 when nothing was prepared.</param>
/// <returns>Unsupported in the base implementation: this operation shape has no direct path.</returns>
protected virtual IoUringDirectPrepareResult IoUringPrepareDirect(
    SocketAsyncContext context,
    SocketAsyncEngine engine,
    out ulong userData)
{
    userData = 0;
    return IoUringDirectPrepareResult.Unsupported;
}
+
/// <summary>
/// Routes a CQE using an operation-kind discriminator to avoid virtual completion dispatch
/// on this hot path.
/// </summary>
/// <param name="result">Raw CQE result: byte count / fd when non-negative, -errno when negative.</param>
/// <param name="auxiliaryData">Operation-specific CQE payload (e.g. received flags).</param>
/// <returns>true when the operation is complete; false when it must be retried/requeued.</returns>
private bool ProcessIoUringCompletionViaDiscriminator(SocketAsyncContext context, int result, uint auxiliaryData)
{
    IoUringCompletionDispatchKind kind = GetIoUringCompletionDispatchKind();
    if (result >= 0)
    {
        // The casts are safe: the dispatch kind is only ever set to match the concrete operation type.
        return kind switch
        {
            IoUringCompletionDispatchKind.BufferListSendOperation => ((BufferListSendOperation)this).ProcessIoUringCompletionSuccessBufferListSend(result),
            IoUringCompletionDispatchKind.BufferMemoryReceiveOperation => ((BufferMemoryReceiveOperation)this).ProcessIoUringCompletionSuccessBufferMemoryReceive(result, auxiliaryData),
            IoUringCompletionDispatchKind.BufferListReceiveOperation => ((BufferListReceiveOperation)this).ProcessIoUringCompletionSuccessBufferListReceive(result, auxiliaryData),
            IoUringCompletionDispatchKind.ReceiveMessageFromOperation => ((ReceiveMessageFromOperation)this).ProcessIoUringCompletionSuccessReceiveMessageFrom(result, auxiliaryData),
            IoUringCompletionDispatchKind.AcceptOperation => ((AcceptOperation)this).ProcessIoUringCompletionSuccessAccept(result, auxiliaryData),
            IoUringCompletionDispatchKind.ConnectOperation => ((ConnectOperation)this).ProcessIoUringCompletionSuccessConnect(context),
            IoUringCompletionDispatchKind.SendOperation => ((SendOperation)this).ProcessIoUringCompletionSuccessSend(result),
            _ => ProcessIoUringCompletionSuccessDefault(result)
        };
    }

    // Error path: generic read/write handlers cover the receive/send families that need no
    // operation-specific error handling.
    return kind switch
    {
        IoUringCompletionDispatchKind.ReceiveMessageFromOperation => ((ReceiveMessageFromOperation)this).ProcessIoUringCompletionErrorReceiveMessageFrom(result),
        IoUringCompletionDispatchKind.AcceptOperation => ((AcceptOperation)this).ProcessIoUringCompletionErrorAccept(result),
        IoUringCompletionDispatchKind.ConnectOperation => ((ConnectOperation)this).ProcessIoUringCompletionErrorConnect(context, result),
        IoUringCompletionDispatchKind.ReadOperation or
        IoUringCompletionDispatchKind.BufferMemoryReceiveOperation or
        IoUringCompletionDispatchKind.BufferListReceiveOperation => ((ReadOperation)this).ProcessIoUringCompletionErrorRead(result),
        IoUringCompletionDispatchKind.WriteOperation or
        IoUringCompletionDispatchKind.SendOperation or
        IoUringCompletionDispatchKind.BufferListSendOperation => ((WriteOperation)this).ProcessIoUringCompletionErrorWrite(result),
        _ => ProcessIoUringCompletionErrorDefault(result)
    };
}
+
/// <summary>Processes a successful (non-negative) io_uring completion result.</summary>
private bool ProcessIoUringCompletionSuccessDefault(int result)
{
    Debug.Assert(result >= 0, $"Expected non-negative io_uring result, got {result}");
    ErrorCode = SocketError.Success;
    return true;
}

/// <summary>Processes a failed (negative) io_uring completion result by mapping -errno to a SocketError.</summary>
private bool ProcessIoUringCompletionErrorDefault(int result)
{
    Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
    ErrorCode = SocketPal.GetSocketErrorForErrorCode(GetIoUringPalError(result));
    return true;
}

/// <summary>Reads the stored dispatch discriminator; 0 (unset) maps to the default kind.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private IoUringCompletionDispatchKind GetIoUringCompletionDispatchKind()
{
    int dispatchKind = _ioUringCompletionDispatchKind;
    return dispatchKind != 0 ?
        (IoUringCompletionDispatchKind)dispatchKind :
        IoUringCompletionDispatchKind.Default;
}

/// <summary>Whether preparation resources should be preserved when the operation is requeued.</summary>
internal virtual bool ShouldReuseIoUringPreparationResourcesOnPending => false;
+
/// <summary>Returns whether the negative result represents EAGAIN/EWOULDBLOCK.</summary>
protected static bool IsIoUringRetryableError(int result)
{
    if (result >= 0)
    {
        return false;
    }

    Interop.Error palError = GetIoUringPalError(result);
    return palError is Interop.Error.EAGAIN or Interop.Error.EWOULDBLOCK;
}

/// <summary>Converts a negative io_uring result to a SocketError, returning false for retryable errors.</summary>
/// <returns>true when the error is terminal (errorCode set); false when the caller should retry.</returns>
protected static bool ProcessIoUringErrorResult(int result, out SocketError errorCode)
{
    Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");

    bool retryable = IsIoUringRetryableError(result);
    errorCode = retryable
        ? SocketError.Success
        : SocketPal.GetSocketErrorForErrorCode(GetIoUringPalError(result));
    return !retryable;
}
+
/// <summary>Converts a negative io_uring CQE result (raw -errno) to PAL error space.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected static Interop.Error GetIoUringPalError(int result)
{
    Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
    // CQE res carries -errno on failure; negate to recover the platform errno before mapping.
    int platformErrno = -result;
    return Interop.Sys.ConvertErrorPlatformToPal(platformErrno);
}

/// <summary>Returns the epoll event mask to use when falling back from io_uring to readiness notification.</summary>
internal virtual Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
    Interop.Sys.SocketEvents.None;

/// <summary>
/// Copies payload bytes from a provided-buffer ring selection into the operation's target memory.
/// Returns false when this operation shape does not support provided-buffer payload materialization.
/// </summary>
internal virtual unsafe bool TryProcessIoUringProvidedBufferCompletion(
    byte* providedBuffer,
    int providedBufferLength,
    int bytesTransferred,
    ref uint auxiliaryData)
{
    // Base shape has no provided-buffer target; eligible receive shapes override this.
    _ = providedBuffer;
    _ = providedBufferLength;
    _ = bytesTransferred;
    _ = auxiliaryData;
    return false;
}
+ }
+
internal abstract partial class ReadOperation
{
    /// <summary>Maps a failed read CQE to ErrorCode; returns false when retryable (EAGAIN/EWOULDBLOCK).</summary>
    internal bool ProcessIoUringCompletionErrorRead(int result) =>
        ProcessIoUringErrorResult(result, out ErrorCode);

    /// <inheritdoc/>
    // Retained only for defensive fallback paths; regular completion mode avoids readiness fallback.
    internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
        Interop.Sys.SocketEvents.Read;
}
+
private abstract partial class WriteOperation
{
    /// <summary>Maps a failed write CQE to ErrorCode; returns false when retryable (EAGAIN/EWOULDBLOCK).</summary>
    internal bool ProcessIoUringCompletionErrorWrite(int result) =>
        ProcessIoUringErrorResult(result, out ErrorCode);

    /// <inheritdoc/>
    // Retained only for defensive fallback paths; regular completion mode avoids readiness fallback.
    internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
        Interop.Sys.SocketEvents.Write;
}
+
private abstract partial class SendOperation
{
    /// <summary>Applies a successful send CQE; returns true when the whole payload has been sent.</summary>
    internal bool ProcessIoUringCompletionSuccessSend(int result)
    {
        if (result == 0)
        {
            // A zero-byte completion for a non-empty send payload indicates peer close
            // on stream sockets; report reset instead of a spurious success/0-byte write.
            if (Count > 0)
            {
                ErrorCode = SocketError.ConnectionReset;
                return true;
            }

            // Legitimate zero-byte send: complete successfully.
            ErrorCode = SocketError.Success;
            return true;
        }

        Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}");
        Debug.Assert(result <= Count, $"Unexpected io_uring send completion size: result={result}, count={Count}");

        // Min() is defensive for release builds where the asserts above do not run.
        int sent = Math.Min(result, Count);
        BytesTransferred += sent;
        Offset += sent;
        Count -= sent;
        ErrorCode = SocketError.Success;
        // Partial send (Count > 0) returns false so the operation is resubmitted for the remainder.
        return Count == 0;
    }
}
+
private partial class BufferMemorySendOperation
{
    // Native msghdr allocation, reused across retries; freed in ReleaseIoUringPreparationResourcesCore.
    private IntPtr _ioUringMessageHeader;
    // Pin over SocketAddress while an SQE referencing it is in flight.
    private MemoryHandle _ioUringPinnedSocketAddress;
    // 1 while _ioUringPinnedSocketAddress holds a live pin.
    private int _ioUringPinnedSocketAddressActive;

    /// <inheritdoc/>
    internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;

    /// <inheritdoc/>
    protected override unsafe void ReleaseIoUringPreparationResourcesCore()
    {
        ReleaseIoUringSocketAddressAndMessageHeader(
            ref _ioUringPinnedSocketAddress,
            ref _ioUringPinnedSocketAddressActive,
            ref _ioUringMessageHeader);
    }

    /// <summary>Gets (allocating on first use) the native message header and sets the common sendmsg fields.</summary>
    private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringSendMessageHeader(byte* rawSocketAddress)
    {
        Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
        if (messageHeader is null)
        {
            // Native allocation: the header must stay at a fixed address while the SQE is in flight.
            messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
            _ioUringMessageHeader = (IntPtr)messageHeader;
        }

        messageHeader->SocketAddress = rawSocketAddress;
        messageHeader->SocketAddressLen = SocketAddress.Length;
        messageHeader->ControlBuffer = null;
        messageHeader->ControlBufferLen = 0;
        messageHeader->Flags = SocketFlags.None;
        return messageHeader;
    }

    /// <summary>Configures a message header with zero or one iovec entry.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static unsafe void ConfigureSingleIov(
        Interop.Sys.MessageHeader* messageHeader,
        byte* rawBuffer,
        int bufferLength,
        Interop.Sys.IOVector* iov)
    {
        if (bufferLength == 0)
        {
            // Empty payload: no iovec entry at all.
            messageHeader->IOVectors = null;
            messageHeader->IOVectorCount = 0;
            return;
        }

        iov->Base = rawBuffer;
        iov->Count = (UIntPtr)bufferLength;
        messageHeader->IOVectors = iov;
        messageHeader->IOVectorCount = 1;
    }

    /// <summary>Builds a connected send or sendmsg preparation request.</summary>
    private unsafe IoUringDirectPrepareResult IoUringPrepareDirectSendMessage(
        SocketAsyncContext context,
        SocketAsyncEngine engine,
        out ulong userData)
    {
        userData = 0;
        if (!TryPinIoUringSocketAddressForPrepare(
            SocketAddress,
            ref _ioUringPinnedSocketAddress,
            ref _ioUringPinnedSocketAddressActive,
            out byte* rawSocketAddress))
        {
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
        {
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        if (rawBuffer is not null)
        {
            // Advance to the caller-requested start position within Buffer.
            rawBuffer += Offset;
        }

        Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringSendMessageHeader(rawSocketAddress);
        // NOTE(review): sendIov is stack memory referenced by the native header; this assumes the
        // engine fully consumes the iovec during prepare, before this frame returns — confirm.
        Interop.Sys.IOVector sendIov;
        ConfigureSingleIov(messageHeader, rawBuffer, Count, &sendIov);

        IoUringDirectPrepareResult sendMessagePrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
            context._socket,
            messageHeader,
            Count,
            Flags,
            out userData,
            out SocketError sendMessageErrorCode);
        ErrorCode = sendMessageErrorCode;
        return sendMessagePrepareResult;
    }

    /// <inheritdoc/>
    protected override unsafe IoUringPrepareResultAlias IoUringPrepareDirect(
        SocketAsyncContext context,
        SocketAsyncEngine engine,
        out ulong userData)
    {
        userData = 0;
        if (SocketAddress.Length == 0)
        {
            // Connected-socket fast path: plain send, no msghdr required.
            if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
            {
                return IoUringDirectPrepareResult.PrepareFailed;
            }

            if (rawBuffer is not null)
            {
                rawBuffer += Offset;
            }

            IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendWithZeroCopyFallback(
                context._socket,
                rawBuffer,
                Count,
                Flags,
                out bool usedZeroCopy,
                out userData,
                out SocketError errorCode);
            ErrorCode = errorCode;
            if (usedZeroCopy && prepareResult == IoUringDirectPrepareResult.Prepared)
            {
                // Zero-copy sends read user memory after submission completes; hand the pin
                // to the engine so it stays alive until the kernel's notification CQE.
                engine.TransferIoUringZeroCopyPinHold(userData, TransferPinnedBuffer());
            }

            return prepareResult;
        }

        // Unconnected/addressed send: use the sendmsg path.
        return IoUringPrepareDirectSendMessage(context, engine, out userData);
    }
}
+
private sealed partial class BufferListSendOperation
{
    // Pinned-buffer-list preparation state, reused when the operation is requeued unchanged.
    private GCHandle[]? _ioUringPinnedBufferHandles;
    private Interop.Sys.IOVector[]? _ioUringIovecs;
    private int _ioUringPinnedHandleCount;
    // Snapshot of the position the iovecs were built for; -1 means "not prepared".
    private int _ioUringPreparedBufferCount = -1;
    private int _ioUringPreparedStartIndex = -1;
    private int _ioUringPreparedStartOffset = -1;
    private int _ioUringPreparedIovCount;

    /// <inheritdoc/>
    internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;

    /// <inheritdoc/>
    protected override void ReleaseIoUringPreparationResourcesCore()
    {
        ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
        _ioUringPreparedBufferCount = -1;
        _ioUringPreparedStartIndex = -1;
        _ioUringPreparedStartOffset = -1;
        _ioUringPreparedIovCount = 0;
    }

    /// <summary>Pins buffer segments starting at BufferIndex/Offset and builds the iovec array.</summary>
    /// <returns>true on success; false with ErrorCode set when validation fails.</returns>
    private bool TryPinIoUringBuffers(
        IList<ArraySegment<byte>> buffers,
        int startIndex,
        int startOffset,
        out int iovCount)
    {
        // Fast path: reuse the existing preparation when the position snapshot still matches.
        if (_ioUringPinnedBufferHandles is not null &&
            _ioUringIovecs is not null &&
            _ioUringPreparedBufferCount == buffers.Count &&
            _ioUringPreparedStartIndex == startIndex &&
            _ioUringPreparedStartOffset == startOffset &&
            _ioUringPreparedIovCount <= _ioUringIovecs.Length)
        {
            iovCount = _ioUringPreparedIovCount;
            return true;
        }

        // Release any existing pinned handles and rented arrays before creating new ones.
        // This handles the partial-send case where BufferIndex/Offset advanced, causing the
        // reuse check above to fail while old resources are still held.
        ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);

        if (!TryPinBufferListForIoUring(
            buffers,
            startIndex,
            startOffset,
            out GCHandle[] pinnedHandles,
            out Interop.Sys.IOVector[] iovecs,
            out iovCount,
            out int pinnedHandleCount,
            out SocketError errorCode))
        {
            ErrorCode = errorCode;
            return false;
        }

        _ioUringPinnedBufferHandles = pinnedHandles;
        _ioUringIovecs = iovecs;
        _ioUringPinnedHandleCount = pinnedHandleCount;
        _ioUringPreparedBufferCount = buffers.Count;
        _ioUringPreparedStartIndex = startIndex;
        _ioUringPreparedStartOffset = startOffset;
        _ioUringPreparedIovCount = iovCount;
        return true;
    }

    /// <summary>Advances the buffer position after a partial send, returning true when all data is sent.</summary>
    private bool AdvanceSendBufferPosition(int bytesSent)
    {
        IList<ArraySegment<byte>>? buffers = Buffers;
        if (buffers is null || bytesSent <= 0)
        {
            // Nothing to advance; complete only if there is nothing left to walk.
            return buffers is null || BufferIndex >= buffers.Count;
        }

        int remaining = bytesSent;
        int index = BufferIndex;
        int offset = Offset;

        while (remaining > 0 && index < buffers.Count)
        {
            int available = buffers[index].Count - offset;
            Debug.Assert(available >= 0, "Unexpected negative buffer availability during io_uring send completion.");

            if (available > remaining)
            {
                // Partial consumption of the current segment.
                offset += remaining;
                break;
            }

            // Max() guards release builds against a negative `available`.
            remaining -= Math.Max(available, 0);
            index++;
            offset = 0;
        }

        BufferIndex = index;
        Offset = offset;
        return index >= buffers.Count;
    }

    /// <inheritdoc/>
    protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
        SocketAsyncContext context,
        SocketAsyncEngine engine,
        out ulong userData)
    {
        userData = 0;
        if (context.IsPersistentMultishotRecvArmed())
        {
            context.RequestPersistentMultishotRecvCancel();
        }

        IList<ArraySegment<byte>>? buffers = Buffers;
        if (buffers is null)
        {
            // Success + PrepareFailed signals a benign fall back to the readiness path.
            ErrorCode = SocketError.Success;
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        if ((uint)BufferIndex > (uint)buffers.Count)
        {
            ErrorCode = SocketError.Success;
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        if (!TryPinIoUringBuffers(buffers, BufferIndex, Offset, out int iovCount))
        {
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        byte* rawSocketAddress = null;
        if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress))
        {
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        Interop.Sys.MessageHeader messageHeader;
        messageHeader.SocketAddress = rawSocketAddress;
        messageHeader.SocketAddressLen = SocketAddress.Length;
        messageHeader.ControlBuffer = null;
        messageHeader.ControlBufferLen = 0;
        messageHeader.Flags = SocketFlags.None;

        Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
        if (iovCount != 0)
        {
            fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
            {
                messageHeader.IOVectors = iovecsPtr;
                messageHeader.IOVectorCount = iovCount;
                // Buffer-list sends can be many small segments (e.g. 4KB chunks). Use
                // aggregate payload size for zero-copy eligibility, not per-segment size.
                long totalPayloadBytes = 0;
                for (int i = 0; i < iovCount; i++)
                {
                    totalPayloadBytes += (long)(nuint)iovecs[i].Count;
                    if (totalPayloadBytes >= int.MaxValue)
                    {
                        // Clamp: the size only feeds the zero-copy heuristic.
                        totalPayloadBytes = int.MaxValue;
                        break;
                    }
                }

                IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
                    context._socket,
                    &messageHeader,
                    (int)totalPayloadBytes,
                    Flags,
                    out userData,
                    out SocketError errorCode);
                ErrorCode = errorCode;
                return prepareResult;
            }
        }

        // Degenerate empty payload: submit a zero-iovec sendmsg.
        messageHeader.IOVectors = null;
        messageHeader.IOVectorCount = 0;
        IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
            context._socket,
            &messageHeader,
            payloadLength: 0,
            Flags,
            out userData,
            out SocketError zeroIovErrorCode);
        ErrorCode = zeroIovErrorCode;
        return zeroIovPrepareResult;
    }

    /// <summary>Applies a successful send CQE; returns true when every segment is fully sent.</summary>
    internal bool ProcessIoUringCompletionSuccessBufferListSend(int result)
    {
        if (result == 0)
        {
            // Buffer-list sends can represent empty payloads; only treat result=0 as
            // reset when there are still bytes pending across remaining segments.
            if (HasPendingBufferListSendBytes())
            {
                ErrorCode = SocketError.ConnectionReset;
                return true;
            }

            ErrorCode = SocketError.Success;
            return true;
        }

        Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}");
        BytesTransferred += result;
        bool complete = AdvanceSendBufferPosition(result);
        ErrorCode = SocketError.Success;
        // Incomplete (partial) sends return false so the remainder is resubmitted.
        return complete;
    }

    /// <summary>Returns whether any unsent payload bytes remain at/after the current position.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private bool HasPendingBufferListSendBytes()
    {
        IList<ArraySegment<byte>>? buffers = Buffers;
        if (buffers is null || BufferIndex >= buffers.Count)
        {
            return false;
        }

        int index = BufferIndex;
        int offset = Offset;
        while (index < buffers.Count)
        {
            int available = buffers[index].Count - offset;
            if (available > 0)
            {
                return true;
            }

            // Offset only applies to the first inspected segment.
            index++;
            offset = 0;
        }

        return false;
    }
}
+
+ private sealed partial class BufferMemoryReceiveOperation
+ {
+ private IntPtr _ioUringMessageHeader;
+ private MemoryHandle _ioUringPinnedSocketAddress;
+ private int _ioUringPinnedSocketAddressActive;
+
/// <inheritdoc/>
internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;

/// <inheritdoc/>
protected override unsafe void ReleaseIoUringPreparationResourcesCore()
{
    ReleaseIoUringSocketAddressAndMessageHeader(
        ref _ioUringPinnedSocketAddress,
        ref _ioUringPinnedSocketAddressActive,
        ref _ioUringMessageHeader);
}
+
/// <summary>Gets (allocating on first use) the native message header and sets the common recvmsg fields.</summary>
private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringReceiveMessageHeader(byte* rawSocketAddress)
{
    Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
    if (messageHeader is null)
    {
        // Native allocation: the header must stay at a fixed address while the SQE is in flight.
        messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
        _ioUringMessageHeader = (IntPtr)messageHeader;
    }

    InitializeReceiveMessageHeader(messageHeader, rawSocketAddress);
    return messageHeader;
}

/// <summary>Initializes recvmsg header fields shared by direct preparation variants.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void InitializeReceiveMessageHeader(Interop.Sys.MessageHeader* messageHeader, byte* rawSocketAddress)
{
    messageHeader->SocketAddress = rawSocketAddress;
    messageHeader->SocketAddressLen = SocketAddress.Length;
    messageHeader->ControlBuffer = null;
    messageHeader->ControlBufferLen = 0;
    messageHeader->Flags = SocketFlags.None;
}

/// <summary>Configures a message header with a single iovec entry.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void ConfigureSingleIov(
    Interop.Sys.MessageHeader* messageHeader,
    byte* rawBuffer,
    int bufferLength,
    Interop.Sys.IOVector* iov)
{
    // Keep a single iovec even for zero-length receives so recvmsg preserves
    // completion-mode readiness probe behavior for zero-byte operations.
    iov->Base = rawBuffer;
    iov->Count = (UIntPtr)bufferLength;
    messageHeader->IOVectors = iov;
    messageHeader->IOVectorCount = 1;
}
+
/// <summary>Builds a connected or receive-from recvmsg operation.</summary>
/// <param name="userData">Receives the SQE user_data token; 0 when preparation failed.</param>
private unsafe IoUringDirectPrepareResult IoUringPrepareDirectReceiveMessage(
    SocketAsyncContext context,
    SocketAsyncEngine engine,
    out ulong userData)
{
    userData = 0;
    if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
    {
        return IoUringDirectPrepareResult.PrepareFailed;
    }

    if (!TryPinIoUringSocketAddressForPrepare(
        SocketAddress,
        ref _ioUringPinnedSocketAddress,
        ref _ioUringPinnedSocketAddressActive,
        out byte* rawSocketAddress))
    {
        return IoUringDirectPrepareResult.PrepareFailed;
    }

    Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringReceiveMessageHeader(rawSocketAddress);
    // NOTE(review): receiveIov is stack memory referenced by the native header; this assumes the
    // engine fully consumes the iovec during prepare, before this frame returns — confirm.
    Interop.Sys.IOVector receiveIov;
    ConfigureSingleIov(messageHeader, rawBuffer, Buffer.Length, &receiveIov);

    IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
        context._socket,
        messageHeader,
        Flags,
        out userData,
        out SocketError errorCode);
    ErrorCode = errorCode;
    return prepareResult;
}
+
/// <summary>
/// Returns whether this operation shape is eligible for multishot recv submission.
/// Eligible: connected TCP receive (no socket address, no recvmsg flags) with non-empty buffer.
/// Ineligible: zero-byte probes, recvmsg-based receive paths (SetReceivedFlags/socket address).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool IsEligibleForIoUringMultishotRecv()
{
    if (SetReceivedFlags || SocketAddress.Length != 0)
    {
        return false;
    }

    // Multishot recv uses IORING_OP_RECV (no msg_flags). Message-oriented sockets
    // rely on MSG_TRUNC to report truncation, which is not observable in this path.
    // NOTE(review): this queries SO_TYPE on every eligibility check; consider caching
    // the socket type on the context — confirm how hot this path is.
    if (SocketPal.GetSockOpt(
        AssociatedContext._socket,
        SocketOptionLevel.Socket,
        SocketOptionName.Type,
        out int socketTypeValue) != SocketError.Success)
    {
        // If type probing fails, keep completion correctness by disabling multishot recv.
        return false;
    }

    SocketType socketType = (SocketType)socketTypeValue;
    if (socketType == SocketType.Dgram ||
        socketType == SocketType.Raw ||
        socketType == SocketType.Seqpacket)
    {
        return false;
    }

    // Zero-byte probes must go through the one-shot path.
    return Buffer.Length != 0;
}
+
/// <inheritdoc/>
protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
    SocketAsyncContext context,
    SocketAsyncEngine engine,
    out ulong userData)
{
    userData = 0;
    // recvmsg-shaped receives (flags requested or address capture) cannot share the
    // multishot recv path; cancel any armed multishot and submit a one-shot recvmsg.
    if (SetReceivedFlags || SocketAddress.Length != 0)
    {
        if (context.IsPersistentMultishotRecvArmed())
        {
            context.RequestPersistentMultishotRecvCancel();
        }

        SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.OneShot);
        IoUringDirectPrepareResult receiveMessagePrepareResult =
            IoUringPrepareDirectReceiveMessage(context, engine, out userData);
        if (receiveMessagePrepareResult != IoUringDirectPrepareResult.Prepared || ErrorCode != SocketError.Success)
        {
            // Roll back the mode marker when nothing was actually submitted.
            SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
        }

        return receiveMessagePrepareResult;
    }

    bool allowMultishotRecv = IsEligibleForIoUringMultishotRecv() && engine.SupportsMultishotRecv;
    if (!allowMultishotRecv && context.IsPersistentMultishotRecvArmed())
    {
        context.RequestPersistentMultishotRecvCancel();
    }

    SetIoUringReceiveSubmissionMode(
        allowMultishotRecv ? IoUringReceiveSubmissionMode.Multishot : IoUringReceiveSubmissionMode.OneShot);

    // Before piggybacking, check the early-buffer for data that may have arrived
    // between DoTryComplete's check (on ThreadPool) and this prepare (on event loop).
    // Without this check, piggyback would wait for a CQE that never comes while the
    // buffer has unconsumed data—a race between ThreadPool buffer consumption and
    // event loop CQE-driven buffer fill.
    // (The !SetReceivedFlags / Length==0 conditions are already guaranteed here; kept defensively.)
    if (allowMultishotRecv && !SetReceivedFlags && SocketAddress.Length == 0 &&
        context.TryConsumeBufferedPersistentMultishotRecvData(Buffer, out int earlyBufferedBytes))
    {
        BytesTransferred = earlyBufferedBytes;
        ReceivedFlags = SocketFlags.None;
        ErrorCode = SocketError.Success;
        userData = 0;
        return IoUringDirectPrepareResult.CompletedFromBuffer;
    }

    // Persistent multishot receive: if one is already armed, attach this operation to
    // that existing user_data instead of submitting a new recv SQE.
    if (allowMultishotRecv && context.IsPersistentMultishotRecvArmed())
    {
        ulong armedUserData = context.PersistentMultishotRecvUserData;
        bool replaced = armedUserData != 0 &&
            engine.TryReplaceIoUringTrackedOperation(armedUserData, this);
        if (replaced)
        {
            userData = armedUserData;
            ErrorCode = SocketError.Success;
            return IoUringDirectPrepareResult.Prepared;
        }

        // Stale armed-state; clear and submit a fresh SQE below.
        context.ClearPersistentMultishotRecvArmed();
    }

    // Capture before pinning: tells the engine whether this buffer was already pinned
    // by an earlier prepare attempt.
    bool bufferAlreadyPinned = IsIoUringBufferPinned();
    if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
    {
        ErrorCode = SocketError.Success;
        SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
        return IoUringDirectPrepareResult.PrepareFailed;
    }

    IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectRecv(
        context._socket,
        rawBuffer,
        Buffer.Length,
        Flags,
        allowMultishotRecv,
        bufferAlreadyPinned,
        out userData,
        out SocketError errorCode);
    ErrorCode = errorCode;
    if (allowMultishotRecv &&
        prepareResult == IoUringDirectPrepareResult.Prepared &&
        errorCode == SocketError.Success)
    {
        // Remember the armed SQE so subsequent receives can piggyback on it.
        context.SetPersistentMultishotRecvArmed(userData);
    }

    if (prepareResult != IoUringDirectPrepareResult.Prepared || errorCode != SocketError.Success)
    {
        SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
    }

    return prepareResult;
}
+
+ // Applies a successful io_uring recv/recvmsg CQE to this operation: records
+ // bytes transferred and (for recvmsg submissions) the returned flags, then
+ // truncates SocketAddress to the kernel-reported source-address length.
+ internal bool ProcessIoUringCompletionSuccessBufferMemoryReceive(int result, uint auxiliaryData)
+ {
+ BytesTransferred = result;
+ // auxiliaryData carries msg_flags only when the submission used recvmsg.
+ ReceivedFlags = SetReceivedFlags ? (SocketFlags)(int)auxiliaryData : SocketFlags.None;
+ if (result >= 0)
+ {
+ AssociatedContext.TryMigrateIoUringEngineOnFirstReceiveCompletion();
+ }
+
+ if (SocketAddress.Length != 0)
+ {
+ // Clamp the reported address length into [0, SocketAddress.Length].
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)SocketAddress.Length)
+ {
+ socketAddressLen = SocketAddress.Length;
+ }
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ }
+ ErrorCode = SocketError.Success;
+ // Completion fully consumed; no retry needed.
+ return true;
+ }
+
+ /// <inheritdoc/>
+ // Copies a provided-buffer completion into the user's buffer when this
+ // operation's shape allows it; returns false to request the fallback path.
+ internal override unsafe bool TryProcessIoUringProvidedBufferCompletion(
+ byte* providedBuffer,
+ int providedBufferLength,
+ int bytesTransferred,
+ ref uint auxiliaryData)
+ {
+ _ = auxiliaryData;
+
+ // EOF/zero-byte completions carry no payload to copy.
+ if (bytesTransferred <= 0)
+ {
+ return true;
+ }
+
+ // Flags- or address-bearing receives can't be satisfied from a provided
+ // buffer; ask the caller to fall back.
+ if (SetReceivedFlags || SocketAddress.Length != 0)
+ {
+ return false;
+ }
+
+ // Reject inconsistent lengths rather than over-read the provided buffer
+ // or over-write the user buffer.
+ if ((uint)bytesTransferred > (uint)providedBufferLength ||
+ (uint)bytesTransferred > (uint)Buffer.Length)
+ {
+ return false;
+ }
+
+ // NOTE(review): original text read "new ReadOnlySpan(" — the generic
+ // argument was stripped in transit; restored as ReadOnlySpan<byte>.
+ new ReadOnlySpan<byte>(providedBuffer, bytesTransferred).CopyTo(Buffer.Span);
+ return true;
+ }
+ }
+
+ // io_uring completion-mode support for vectored (buffer-list) receives via recvmsg.
+ private sealed partial class BufferListReceiveOperation
+ {
+ // Pinning handles for each buffer segment while an SQE is in flight.
+ private GCHandle[]? _ioUringPinnedBufferHandles;
+ private Interop.Sys.IOVector[]? _ioUringIovecs;
+ private int _ioUringPinnedHandleCount;
+ // Lazily-allocated native msghdr; freed in ReleaseIoUringPreparationResourcesCore.
+ private IntPtr _ioUringMessageHeader;
+ private int _ioUringPreparedIovCount;
+ // Buffers.Count the cached preparation was built for; -1 means no cached preparation.
+ private int _ioUringPreparedBufferCount = -1;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferCount = -1;
+
+ // Interlocked.Exchange guards against double-free of the native header.
+ IntPtr messageHeader = Interlocked.Exchange(ref _ioUringMessageHeader, IntPtr.Zero);
+ if (messageHeader != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)messageHeader);
+ }
+ }
+
+ /// <summary>Pins all buffer segments and builds the iovec array.</summary>
+ // NOTE(review): original text read "IList> buffers" — the generic argument was
+ // stripped in transit; restored as IList<ArraySegment<byte>> to match Buffers.
+ private bool TryPinIoUringBuffers(IList<ArraySegment<byte>> buffers, out int iovCount)
+ {
+ // Reuse a previous preparation only when it is non-empty and was built
+ // for the same number of segments.
+ if (_ioUringPinnedBufferHandles is not null &&
+ _ioUringIovecs is not null &&
+ _ioUringPreparedIovCount != 0 &&
+ _ioUringPreparedIovCount <= _ioUringIovecs.Length &&
+ _ioUringPreparedBufferCount == buffers.Count)
+ {
+ iovCount = _ioUringPreparedIovCount;
+ return true;
+ }
+
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+
+ if (!TryPinBufferListForIoUring(
+ buffers,
+ startIndex: 0,
+ startOffset: 0,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode))
+ {
+ ErrorCode = errorCode;
+ return false;
+ }
+
+ _ioUringPinnedBufferHandles = pinnedHandles;
+ _ioUringIovecs = iovecs;
+ _ioUringPinnedHandleCount = pinnedHandleCount;
+ _ioUringPreparedIovCount = iovCount;
+ _ioUringPreparedBufferCount = buffers.Count;
+ return true;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ // NOTE(review): restored IList<ArraySegment<byte>>? (generic argument
+ // stripped in transit).
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ if (buffers is null)
+ {
+ // Success + PrepareFailed signals "fall back to the readiness path".
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringBuffers(buffers, out int iovCount))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ byte* rawSocketAddress = null;
+ if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ messageHeader->Flags = SocketFlags.None;
+
+ Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
+ if (iovCount != 0)
+ {
+ // NOTE(review): iovecsPtr is pinned only for the duration of this fixed
+ // block, yet it is stored into the native msghdr consumed by the SQE.
+ // Safe only if the kernel imports the iovecs before the managed array
+ // can move (or the array stays pinned) — confirm submission timing.
+ fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
+ {
+ messageHeader->IOVectors = iovecsPtr;
+ messageHeader->IOVectorCount = iovCount;
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+ }
+
+ // Zero segments: submit a recvmsg with an empty iovec list.
+ messageHeader->IOVectors = null;
+ messageHeader->IOVectorCount = 0;
+ IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError zeroIovErrorCode);
+ ErrorCode = zeroIovErrorCode;
+ return zeroIovPrepareResult;
+ }
+
+ // Applies a successful recvmsg CQE: bytes, msg_flags, and source-address length.
+ internal unsafe bool ProcessIoUringCompletionSuccessBufferListReceive(int result, uint auxiliaryData)
+ {
+ BytesTransferred = result;
+ ReceivedFlags = (SocketFlags)(int)auxiliaryData;
+ ErrorCode = SocketError.Success;
+ if (result >= 0)
+ {
+ AssociatedContext.TryMigrateIoUringEngineOnFirstReceiveCompletion();
+ }
+
+ if (_ioUringMessageHeader != IntPtr.Zero && SocketAddress.Length != 0)
+ {
+ // Clamp kernel-reported address length into [0, SocketAddress.Length].
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)SocketAddress.Length)
+ {
+ socketAddressLen = SocketAddress.Length;
+ }
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ }
+
+ return true;
+ }
+ }
+
+ // io_uring completion-mode support for ReceiveMessageFrom (recvmsg with
+ // source address and ancillary/control data, e.g. IP packet information).
+ private sealed partial class ReceiveMessageFromOperation
+ {
+ private GCHandle[]? _ioUringPinnedBufferHandles;
+ private Interop.Sys.IOVector[]? _ioUringIovecs;
+ private int _ioUringPinnedHandleCount;
+ private int _ioUringPreparedIovCount;
+ // Buffers.Count the cached pins were built for; -1 means no cached preparation.
+ private int _ioUringPreparedBufferListCount = -1;
+ // Lazily-allocated native msghdr and cmsg buffer; freed on release.
+ private IntPtr _ioUringMessageHeader;
+ private IntPtr _ioUringControlBuffer;
+ private int _ioUringControlBufferLength;
+ private MemoryHandle _ioUringPinnedSocketAddress;
+ private int _ioUringPinnedSocketAddressActive;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferListCount = -1;
+
+ // Exchange guards against double-free of the native control buffer.
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+ _ioUringControlBufferLength = 0;
+
+ ReleaseIoUringSocketAddressAndMessageHeader(
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ ref _ioUringMessageHeader);
+ }
+
+ /// <summary>Pins buffer segments and builds the iovec array for recvmsg.</summary>
+ // NOTE(review): restored IList<ArraySegment<byte>> (generic argument stripped
+ // in transit). Also note this reuse check, unlike BufferListReceiveOperation's,
+ // does not require _ioUringPreparedIovCount != 0 — confirm whether a cached
+ // zero-iov preparation is intentionally reusable here.
+ private bool TryPinIoUringBuffers(IList<ArraySegment<byte>> buffers, out int iovCount)
+ {
+ if (_ioUringPinnedBufferHandles is not null &&
+ _ioUringIovecs is not null &&
+ _ioUringPreparedIovCount <= _ioUringIovecs.Length &&
+ _ioUringPreparedBufferListCount == buffers.Count)
+ {
+ iovCount = _ioUringPreparedIovCount;
+ return true;
+ }
+
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+
+ if (!TryPinBufferListForIoUring(
+ buffers,
+ startIndex: 0,
+ startOffset: 0,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode))
+ {
+ ErrorCode = errorCode;
+ return false;
+ }
+
+ _ioUringPinnedBufferHandles = pinnedHandles;
+ _ioUringIovecs = iovecs;
+ _ioUringPinnedHandleCount = pinnedHandleCount;
+ _ioUringPreparedIovCount = iovCount;
+ _ioUringPreparedBufferListCount = buffers.Count;
+ return true;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ // recvmsg can't share the persistent multishot recv; cancel it first.
+ if (context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ // NOTE(review): restored IList<ArraySegment<byte>>? (generic stripped in transit).
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ byte* rawBuffer = null;
+ int iovCount;
+ if (buffers is not null)
+ {
+ // Operation switched from single-buffer to buffer-list shape; drop the
+ // single-buffer pin before pinning the list.
+ ReleaseIoUringPinnedBufferForShapeTransition();
+ if (!TryPinIoUringBuffers(buffers, out iovCount))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+ }
+ else
+ {
+ if (!TryPinIoUringBuffer(Buffer, out rawBuffer))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ // Shape switched list -> single buffer: release stale list resources.
+ if (_ioUringPinnedBufferHandles is not null || _ioUringIovecs is not null)
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferListCount = -1;
+ }
+
+ iovCount = 1;
+ }
+
+ if (!TryPinIoUringSocketAddressForPrepare(
+ SocketAddress,
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->Flags = SocketFlags.None;
+
+ // Size the cmsg buffer for the address families in play; negative means
+ // the platform can't tell us, so fall back to the readiness path.
+ int controlBufferLen = Interop.Sys.GetControlMessageBufferSize(Convert.ToInt32(IsIPv4), Convert.ToInt32(IsIPv6));
+ if (controlBufferLen < 0)
+ {
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (controlBufferLen != 0)
+ {
+ // (Re)allocate only when absent or sized differently.
+ if (_ioUringControlBuffer == IntPtr.Zero || _ioUringControlBufferLength != controlBufferLen)
+ {
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+
+ void* rawControlBuffer = NativeMemory.Alloc((nuint)controlBufferLen);
+ _ioUringControlBuffer = (IntPtr)rawControlBuffer;
+ _ioUringControlBufferLength = controlBufferLen;
+ }
+
+ messageHeader->ControlBuffer = (byte*)_ioUringControlBuffer;
+ messageHeader->ControlBufferLen = controlBufferLen;
+ }
+ else
+ {
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+
+ _ioUringControlBufferLength = 0;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ }
+
+ if (buffers is not null)
+ {
+ Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
+ if (iovCount != 0)
+ {
+ // NOTE(review): iovecsPtr escapes the fixed scope via the native msghdr;
+ // safe only if the kernel imports the iovecs before the managed array
+ // can move — confirm submission timing in the engine.
+ fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
+ {
+ messageHeader->IOVectors = iovecsPtr;
+ messageHeader->IOVectorCount = iovCount;
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+ }
+
+ messageHeader->IOVectors = null;
+ messageHeader->IOVectorCount = 0;
+ IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError zeroIovErrorCode);
+ ErrorCode = zeroIovErrorCode;
+ return zeroIovPrepareResult;
+ }
+
+ // Single-buffer shape: a one-entry iovec on the stack.
+ // NOTE(review): iov is a stack local referenced from the native msghdr;
+ // as above, this relies on the msghdr/iovec being consumed before this
+ // frame returns — confirm.
+ Interop.Sys.IOVector iov;
+ iov.Base = rawBuffer;
+ iov.Count = (UIntPtr)Buffer.Length;
+ messageHeader->IOVectors = &iov;
+ messageHeader->IOVectorCount = 1;
+ IoUringDirectPrepareResult singleBufferPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError singleBufferErrorCode);
+ ErrorCode = singleBufferErrorCode;
+ return singleBufferPrepareResult;
+ }
+
+ // Applies a successful recvmsg CQE: bytes, msg_flags, source address, and
+ // IP packet information parsed from the control buffer.
+ internal unsafe bool ProcessIoUringCompletionSuccessReceiveMessageFrom(int result, uint auxiliaryData)
+ {
+ BytesTransferred = result;
+ ReceivedFlags = (SocketFlags)(int)auxiliaryData;
+ ErrorCode = SocketError.Success;
+ IPPacketInformation = default;
+ if (result >= 0)
+ {
+ AssociatedContext.TryMigrateIoUringEngineOnFirstReceiveCompletion();
+ }
+
+ if (_ioUringMessageHeader != IntPtr.Zero)
+ {
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ int socketAddressCapacity = SocketAddress.Length;
+ // Clamp reported address length into [0, capacity].
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)socketAddressCapacity)
+ {
+ socketAddressLen = socketAddressCapacity;
+ }
+
+ // A zero reported length with nonzero capacity is normalized to a
+ // zeroed, full-capacity address.
+ if (socketAddressLen == 0 && socketAddressCapacity != 0)
+ {
+ socketAddressLen = socketAddressCapacity;
+ SocketAddress.Span.Clear();
+ }
+
+ // Clamp reported control length into [0, allocated cmsg capacity].
+ int controlBufferCapacity = messageHeader->ControlBufferLen;
+ int controlBufferLen = IoUringCompletionControlBufferLen;
+ if (controlBufferLen < 0)
+ {
+ controlBufferLen = 0;
+ }
+
+ if ((uint)controlBufferLen > (uint)controlBufferCapacity)
+ {
+ controlBufferLen = controlBufferCapacity;
+ }
+
+ messageHeader->SocketAddressLen = socketAddressLen;
+ messageHeader->ControlBufferLen = controlBufferLen;
+ messageHeader->Flags = ReceivedFlags;
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+
+ IPPacketInformation = SocketPal.GetIoUringIPPacketInformation(messageHeader, IsIPv4, IsIPv6);
+ }
+
+ return true;
+ }
+
+ // Maps a failed recvmsg CQE to a SocketError; false means the completion
+ // was not terminal for this operation.
+ internal bool ProcessIoUringCompletionErrorReceiveMessageFrom(int result)
+ {
+ if (!ProcessIoUringErrorResult(result, out ErrorCode))
+ {
+ return false;
+ }
+
+ IPPacketInformation = default;
+ return true;
+ }
+ }
+
+ // io_uring completion-mode support for accept, including multishot accept arming.
+ internal sealed partial class AcceptOperation
+ {
+ /// <inheritdoc/>
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.Read;
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ AcceptSocketAddressLength = SocketAddress.Length;
+ if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ // Try to arm multishot accept exactly once per context: CAS the state
+ // Disarmed -> Arming so concurrent accepts can't double-arm.
+ if (engine.SupportsMultishotAccept &&
+ Interlocked.CompareExchange(
+ ref context._multishotAcceptState,
+ MultishotAcceptStateArming,
+ MultishotAcceptStateDisarmed) == MultishotAcceptStateDisarmed)
+ {
+ context.EnsureMultishotAcceptQueueInitialized();
+ IoUringDirectPrepareResult multishotPrepareResult = engine.TryPrepareIoUringDirectMultishotAccept(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError multishotErrorCode);
+ if (multishotPrepareResult == IoUringDirectPrepareResult.Prepared)
+ {
+ Debug.Assert(
+ (byte)(userData >> IoUringUserDataTagShift) == IoUringReservedCompletionTag,
+ "Multishot accept user_data must be a reserved-completion token.");
+ // Publish the armed user_data as the new multishot state.
+ Volatile.Write(ref context._multishotAcceptState, unchecked((long)userData));
+ context.TryCreateReusePortShadowListeners(engine);
+ ErrorCode = multishotErrorCode;
+ return multishotPrepareResult;
+ }
+
+ // Arming failed; roll back to Disarmed and fall through to one-shot accept.
+ context.DisarmMultishotAccept();
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectAccept(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ // Applies a successful accept CQE: result is the accepted fd,
+ // auxiliaryData is the kernel-reported peer-address length.
+ internal bool ProcessIoUringCompletionSuccessAccept(int result, uint auxiliaryData)
+ {
+ AcceptedFileDescriptor = (IntPtr)result;
+ ErrorCode = SocketError.Success;
+ // Keep parity with readiness path: always honor reported address length, including 0.
+ AcceptSocketAddressLength = auxiliaryData > (uint)SocketAddress.Length ? SocketAddress.Length : (int)auxiliaryData;
+ SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength);
+ return true;
+ }
+
+ // Failed accept: invalidate the fd, then map the errno via the shared read path.
+ internal bool ProcessIoUringCompletionErrorAccept(int result)
+ {
+ AcceptedFileDescriptor = (IntPtr)(-1);
+ return ProcessIoUringCompletionErrorRead(result);
+ }
+ }
+
+ // io_uring completion-mode support for connect (including connect-with-data).
+ private sealed partial class ConnectOperation
+ {
+ /// <inheritdoc/>
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.Write;
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectConnect(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ // Maps a failed connect CQE. EINPROGRESS is not terminal: the connect is
+ // still pending, so report "not complete" without surfacing an error.
+ internal bool ProcessIoUringCompletionErrorConnect(SocketAsyncContext context, int result)
+ {
+ Interop.Error error = GetIoUringPalError(result);
+ if (error == Interop.Error.EINPROGRESS)
+ {
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ if (!ProcessIoUringCompletionErrorWrite(result))
+ {
+ return false;
+ }
+
+ context._socket.RegisterConnectResult(ErrorCode);
+ return true;
+ }
+
+ // Successful connect: record the result and, when initial data was supplied,
+ // issue the follow-up send.
+ internal bool ProcessIoUringCompletionSuccessConnect(SocketAsyncContext context)
+ {
+ ErrorCode = SocketError.Success;
+ context._socket.RegisterConnectResult(ErrorCode);
+
+ if (Buffer.Length > 0)
+ {
+ // NOTE(review): original text read "Action, SocketFlags, SocketError>?" —
+ // generic arguments stripped in transit; restored to match the
+ // (BytesTransferred, SocketAddress, flags, error) callback shape.
+ Action<int, Memory<byte>, SocketFlags, SocketError>? callback = Callback;
+ Debug.Assert(callback is not null);
+ SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, default, ref BytesTransferred, callback!, default);
+ if (error == SocketError.IOPending)
+ {
+ // Callback ownership moved to the async send operation.
+ Callback = null;
+ Buffer = default;
+ }
+ else
+ {
+ if (error != SocketError.Success)
+ {
+ ErrorCode = error;
+ context._socket.RegisterConnectResult(ErrorCode);
+ }
+
+ // Follow-up send completed synchronously (success/error), so invoke
+ // Connect callback from this operation path.
+ Buffer = default;
+ }
+ }
+
+ return true;
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
index 4e2e117984084c..c5d89ba39ea434 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
@@ -43,10 +44,10 @@ internal sealed partial class SocketAsyncContext
private BufferListReceiveOperation? _cachedBufferListReceiveOperation;
private BufferMemorySendOperation? _cachedBufferMemorySendOperation;
private BufferListSendOperation? _cachedBufferListSendOperation;
-
private void ReturnOperation(AcceptOperation operation)
{
operation.Reset();
+ operation.AcceptSocketAddressLength = 0;
operation.Callback = null;
operation.SocketAddress = default;
Volatile.Write(ref _cachedAcceptOperation, operation); // benign race condition
@@ -83,6 +84,7 @@ private void ReturnOperation(BufferListSendOperation operation)
{
operation.Reset();
operation.Buffers = null;
+ operation.SetBufferPosition(bufferIndex: 0, offset: 0);
operation.Callback = null;
operation.SocketAddress = default;
Volatile.Write(ref _cachedBufferListSendOperation, operation); // benign race condition
@@ -108,8 +110,21 @@ private BufferListSendOperation RentBufferListSendOperation() =>
Interlocked.Exchange(ref _cachedBufferListSendOperation, null) ??
new BufferListSendOperation(this);
- private abstract class AsyncOperation : IThreadPoolWorkItem
+ // Partial method hooks for io_uring completion-mode staging (Linux-only).
+ // No-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs.
+ static partial void LinuxTryStageIoUringOperation(AsyncOperation operation);
+ partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued);
+ partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, ref bool consumed, ref int bytesTransferred);
+ partial void LinuxOnStopAndAbort();
+ partial void LinuxHasBufferedPersistentMultishotRecvData(ref bool hasBuffered);
+
+ internal abstract partial class AsyncOperation : IThreadPoolWorkItem
{
+ private const int CancellationCallbackBatchSize = 64;
+ private static readonly ConcurrentQueue<AsyncOperation> s_cancellationCallbackQueue = new ConcurrentQueue<AsyncOperation>();
+ private static readonly IThreadPoolWorkItem s_processCancellationCallbacks = new CancellationCallbackWorker();
+ private static int s_cancellationCallbackWorkerQueued;
+
private enum State
{
Waiting = 0,
@@ -120,6 +135,11 @@ private enum State
}
private volatile AsyncOperation.State _state;
+ private int _ioUringCompletionCallbackQueued;
+ private int _ioUringFallbackReprepareRequested;
+ // Defined in the shared Unix partial so operation constructors can compile
+ // for both linux and non-linux unix TFMs; only linux consumes the value.
+ private int _ioUringCompletionDispatchKind;
#if DEBUG
private bool _callbackQueued; // When true, the callback has been queued.
@@ -133,6 +153,24 @@ private enum State
public ManualResetEventSlim? Event { get; set; }
+ protected enum IoUringCompletionDispatchKind : byte
+ {
+ Default = 0,
+ ReadOperation = 1,
+ WriteOperation = 2,
+ SendOperation = 3,
+ BufferListSendOperation = 4,
+ BufferMemoryReceiveOperation = 5,
+ BufferListReceiveOperation = 6,
+ ReceiveMessageFromOperation = 7,
+ AcceptOperation = 8,
+ ConnectOperation = 9
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected void SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind kind) =>
+ _ioUringCompletionDispatchKind = (int)kind;
+
public AsyncOperation(SocketAsyncContext context)
{
AssociatedContext = context;
@@ -141,7 +179,10 @@ public AsyncOperation(SocketAsyncContext context)
public void Reset()
{
+ ResetIoUringState();
_state = State.Waiting;
+ _ioUringCompletionCallbackQueued = 0;
+ _ioUringFallbackReprepareRequested = 0;
Event = null;
Next = this;
#if DEBUG
@@ -202,6 +243,16 @@ public OperationResult TryComplete(SocketAsyncContext context)
}
public bool TryCancel()
+ {
+ return TryCancelCore(requestIoUringCancellation: true);
+ }
+
+ internal bool TryCancelForTeardown()
+ {
+ return TryCancelCore(requestIoUringCancellation: false);
+ }
+
+ private bool TryCancelCore(bool requestIoUringCancellation)
{
Trace("Enter");
@@ -228,10 +279,16 @@ public bool TryCancel()
if (newState == State.RunningWithPendingCancellation)
{
+ // For in-flight io_uring operations, request best-effort kernel cancellation now.
+ // If completion has already won, the request is benign and will be ignored.
+ LinuxRequestIoUringCancellationIfNeeded(requestIoUringCancellation);
// TryComplete will either succeed, or it will see the pending cancellation and deal with it.
return false;
}
+ // Best effort: if completion-mode io_uring work was already submitted, request kernel-side cancellation now.
+ // Partial method: no-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs.
+ LinuxRequestIoUringCancellationIfNeeded(requestIoUringCancellation);
ProcessCancellation();
// Note, we leave the operation in the OperationQueue.
@@ -245,6 +302,7 @@ public void ProcessCancellation()
Debug.Assert(_state == State.Canceled);
+ LinuxUntrackIoUringOperation();
ErrorCode = SocketError.OperationAborted;
ManualResetEventSlim? e = Event;
@@ -261,10 +319,53 @@ public void ProcessCancellation()
// we can't pool the object, as ProcessQueue may still have a reference to it, due to
// using a pattern whereby it takes the lock to grab an item, but then releases the lock
// to do further processing on the item that's still in the list.
- ThreadPool.UnsafeQueueUserWorkItem(o => ((AsyncOperation)o!).InvokeCallback(allowPooling: false), this);
+ QueueCancellationCallback(this);
+ }
+ }
+
+ // Enqueues a canceled operation's callback and ensures exactly one drain
+ // worker is queued to the thread pool (flag CAS 0 -> 1).
+ private static void QueueCancellationCallback(AsyncOperation operation)
+ {
+ s_cancellationCallbackQueue.Enqueue(operation);
+ if (Interlocked.CompareExchange(ref s_cancellationCallbackWorkerQueued, 1, 0) == 0)
+ {
+ ThreadPool.UnsafeQueueUserWorkItem(s_processCancellationCallbacks, preferLocal: false);
+ }
+ }
+
+ // Drains queued cancellation callbacks in batches of CancellationCallbackBatchSize,
+ // re-queuing a fresh work item when more remains so one pool thread isn't hogged.
+ private static void ProcessQueuedCancellationCallbacks()
+ {
+ while (true)
+ {
+ int processed = 0;
+ while (processed < CancellationCallbackBatchSize &&
+ s_cancellationCallbackQueue.TryDequeue(out AsyncOperation? operation))
+ {
+ operation.InvokeCallback(allowPooling: false);
+ processed++;
+ }
+
+ if (s_cancellationCallbackQueue.IsEmpty)
+ {
+ // Release the worker flag, then re-check: an enqueuer may have raced
+ // in between the emptiness check and the flag clear.
+ Volatile.Write(ref s_cancellationCallbackWorkerQueued, 0);
+ if (s_cancellationCallbackQueue.IsEmpty ||
+ Interlocked.CompareExchange(ref s_cancellationCallbackWorkerQueued, 1, 0) != 0)
+ {
+ // Either truly empty, or another worker claimed the flag; done here.
+ return;
+ }
+
+ // We reclaimed the flag; keep draining on this thread.
+ continue;
+ }
+
+ // Batch limit hit with work remaining: hand off to a fresh work item.
+ ThreadPool.UnsafeQueueUserWorkItem(s_processCancellationCallbacks, preferLocal: false);
+ return;
}
}
+ // Thread-pool work item that drains the shared cancellation-callback queue.
+ private sealed class CancellationCallbackWorker : IThreadPoolWorkItem
+ {
+ void IThreadPoolWorkItem.Execute() => ProcessQueuedCancellationCallbacks();
+ }
+
public void Dispatch()
{
ManualResetEventSlim? e = Event;
@@ -288,8 +389,40 @@ public void Schedule()
ThreadPool.UnsafeQueueUserWorkItem(this, preferLocal: false);
}
+ // Queues this operation to the pool to run its user callback for an io_uring
+ // completion. The flag guards against double-queueing; Execute() consumes it
+ // via TryExecuteIoUringCompletionCallback.
+ internal void QueueIoUringCompletionCallback()
+ {
+ Debug.Assert(Event == null);
+ if (Interlocked.Exchange(ref _ioUringCompletionCallbackQueued, 1) != 0)
+ {
+ Debug.Fail("io_uring completion callback was already queued for this operation.");
+ return;
+ }
+
+ ThreadPool.UnsafeQueueUserWorkItem(this, preferLocal: false);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ // Consumes the queued-callback flag; when it was set, invokes the user
+ // callback (pooling allowed) and reports the work item was a completion dispatch.
+ internal bool TryExecuteIoUringCompletionCallback()
+ {
+ if (Interlocked.Exchange(ref _ioUringCompletionCallbackQueued, 0) == 0)
+ {
+ return false;
+ }
+
+ InvokeCallback(allowPooling: true);
+ return true;
+ }
+
public void Process() => ((IThreadPoolWorkItem)this).Execute();
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void RequestIoUringFallbackReprepare() =>
+ Volatile.Write(ref _ioUringFallbackReprepareRequested, 1);
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool TryConsumeIoUringFallbackReprepareRequested() =>
+ Interlocked.Exchange(ref _ioUringFallbackReprepareRequested, 0) != 0;
+
void IThreadPoolWorkItem.Execute()
{
// ReadOperation and WriteOperation, the only two types derived from
@@ -305,17 +438,27 @@ void IThreadPoolWorkItem.Execute()
// We could also add an abstract method that the base interface implementation
// invokes, but that adds an extra virtual dispatch.
Debug.Fail("Expected derived type to implement IThreadPoolWorkItem");
- throw new InvalidOperationException();
+ ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem();
}
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem() =>
+ throw new InvalidOperationException();
+
// Called when op is not in the queue yet, so can't be otherwise executing
public void DoAbort()
{
+ LinuxUntrackIoUringOperation();
ErrorCode = SocketError.OperationAborted;
}
protected abstract bool DoTryComplete(SocketAsyncContext context);
+ partial void ResetIoUringState();
+ partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation);
+ partial void LinuxUntrackIoUringOperation();
+
public abstract void InvokeCallback(bool allowPooling);
[Conditional("SOCKETASYNCCONTEXT_TRACE")]
@@ -333,36 +476,74 @@ public void TraceWithContext(SocketAsyncContext context, string message, [Caller
// These two abstract classes differentiate the operations that go in the
// read queue vs the ones that go in the write queue.
- private abstract class ReadOperation : AsyncOperation, IThreadPoolWorkItem
+ internal abstract partial class ReadOperation : AsyncOperation, IThreadPoolWorkItem
{
- public ReadOperation(SocketAsyncContext context) : base(context) { }
+ public ReadOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ReadOperation);
+ }
+
+ void IThreadPoolWorkItem.Execute()
+ {
+ if (TryExecuteIoUringCompletionCallback())
+ {
+ return;
+ }
+
+ AssociatedContext.ProcessAsyncReadOperation(this);
+ }
+ }
+
+ private static bool ShouldDispatchCompletionCallback(AsyncOperation operation)
+ {
+ if (operation is ConnectOperation connectOperation)
+ {
+ // Connect can hand callback ownership to a follow-up send operation;
+ // dispatch here only when connect still owns the callback.
+ return connectOperation.Buffer.Length == 0 && connectOperation.Callback is not null;
+ }
- void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncReadOperation(this);
+ return true;
}
- private abstract class WriteOperation : AsyncOperation, IThreadPoolWorkItem
+ private abstract partial class WriteOperation : AsyncOperation, IThreadPoolWorkItem
{
- public WriteOperation(SocketAsyncContext context) : base(context) { }
+ public WriteOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.WriteOperation);
+ }
- void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncWriteOperation(this);
+ void IThreadPoolWorkItem.Execute()
+ {
+ if (TryExecuteIoUringCompletionCallback())
+ {
+ return;
+ }
+
+ AssociatedContext.ProcessAsyncWriteOperation(this);
+ }
}
- private abstract class SendOperation : WriteOperation
+ private abstract partial class SendOperation : WriteOperation
{
public SocketFlags Flags;
public int BytesTransferred;
public int Offset;
public int Count;
- public SendOperation(SocketAsyncContext context) : base(context) { }
+ public SendOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.SendOperation);
+ }
public Action<int, Memory<byte>, SocketFlags, SocketError>? Callback { get; set; }
public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, SocketFlags.None, ErrorCode);
+
}
- private class BufferMemorySendOperation : SendOperation
+ private partial class BufferMemorySendOperation : SendOperation
{
public Memory<byte> Buffer;
@@ -390,18 +571,27 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class BufferListSendOperation : SendOperation
+ private sealed partial class BufferListSendOperation : SendOperation
{
public IList<ArraySegment<byte>>? Buffers;
public int BufferIndex;
- public BufferListSendOperation(SocketAsyncContext context) : base(context) { }
+ public BufferListSendOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferListSendOperation);
+ }
protected override bool DoTryComplete(SocketAsyncContext context)
{
 return SocketPal.TryCompleteSendTo(context._socket, default(ReadOnlySpan<byte>), Buffers, ref BufferIndex, ref Offset, ref Count, Flags, SocketAddress.Span, ref BytesTransferred, out ErrorCode);
}
+ internal void SetBufferPosition(int bufferIndex, int offset)
+ {
+ BufferIndex = bufferIndex;
+ Offset = offset;
+ }
+
public override void InvokeCallback(bool allowPooling)
{
var cb = Callback!;
@@ -446,15 +636,31 @@ public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, ReceivedFlags, ErrorCode);
}
- private sealed class BufferMemoryReceiveOperation : ReceiveOperation
+ private sealed partial class BufferMemoryReceiveOperation : ReceiveOperation
{
 public Memory<byte> Buffer;
public bool SetReceivedFlags;
- public BufferMemoryReceiveOperation(SocketAsyncContext context) : base(context) { }
+ public BufferMemoryReceiveOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferMemoryReceiveOperation);
+ }
protected override bool DoTryComplete(SocketAsyncContext context)
{
+ bool consumedBufferedData = false;
+ int bufferedBytes = 0;
+ context.LinuxTryConsumeBufferedPersistentMultishotRecvData(Buffer, ref consumedBufferedData, ref bufferedBytes);
+ if (!SetReceivedFlags &&
+ SocketAddress.Length == 0 &&
+ consumedBufferedData)
+ {
+ BytesTransferred = bufferedBytes;
+ ReceivedFlags = SocketFlags.None;
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
// Zero byte read is performed to know when data is available.
// We don't have to call receive, our caller is interested in the event.
if (Buffer.Length == 0 && Flags == SocketFlags.None && SocketAddress.Length == 0)
@@ -502,11 +708,14 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class BufferListReceiveOperation : ReceiveOperation
+ private sealed partial class BufferListReceiveOperation : ReceiveOperation
{
 public IList<ArraySegment<byte>>? Buffers;
- public BufferListReceiveOperation(SocketAsyncContext context) : base(context) { }
+ public BufferListReceiveOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferListReceiveOperation);
+ }
protected override bool DoTryComplete(SocketAsyncContext context)
{
@@ -553,7 +762,7 @@ protected override bool DoTryComplete(SocketAsyncContext context)
}
}
- private sealed class ReceiveMessageFromOperation : ReadOperation
+ private sealed partial class ReceiveMessageFromOperation : ReadOperation
{
 public Memory<byte> Buffer;
public SocketFlags Flags;
@@ -565,7 +774,10 @@ private sealed class ReceiveMessageFromOperation : ReadOperation
public bool IsIPv6;
public IPPacketInformation IPPacketInformation;
- public ReceiveMessageFromOperation(SocketAsyncContext context) : base(context) { }
+ public ReceiveMessageFromOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ReceiveMessageFromOperation);
+ }
 public Action<int, Memory<byte>, SocketFlags, IPPacketInformation, SocketError>? Callback { get; set; }
@@ -613,21 +825,33 @@ public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, ReceivedFlags, IPPacketInformation, ErrorCode);
}
- private sealed class AcceptOperation : ReadOperation
+ internal sealed partial class AcceptOperation : ReadOperation
{
public IntPtr AcceptedFileDescriptor;
+ public int AcceptSocketAddressLength;
- public AcceptOperation(SocketAsyncContext context) : base(context) { }
+ public AcceptOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.AcceptOperation);
+ }
 public Action<IntPtr, Memory<byte>, SocketError>? Callback { get; set; }
protected override bool DoTryComplete(SocketAsyncContext context)
{
+ bool dequeuedPreAcceptedConnection = false;
+ context.LinuxTryDequeuePreAcceptedConnection(this, ref dequeuedPreAcceptedConnection);
+ if (dequeuedPreAcceptedConnection)
+ {
+ return true;
+ }
+
bool completed = SocketPal.TryCompleteAccept(context._socket, SocketAddress, out int socketAddressLen, out AcceptedFileDescriptor, out ErrorCode);
+ AcceptSocketAddressLength = socketAddressLen;
Debug.Assert(ErrorCode == SocketError.Success || AcceptedFileDescriptor == (IntPtr)(-1), $"Unexpected values: ErrorCode={ErrorCode}, AcceptedFileDescriptor={AcceptedFileDescriptor}");
if (ErrorCode == SocketError.Success)
{
- SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength);
}
return completed;
}
@@ -648,21 +872,49 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class ConnectOperation : BufferMemorySendOperation
+ private sealed partial class ConnectOperation : BufferMemorySendOperation
{
- public ConnectOperation(SocketAsyncContext context) : base(context) { }
+ public ConnectOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ConnectOperation);
+ }
protected override bool DoTryComplete(SocketAsyncContext context)
{
bool result = SocketPal.TryCompleteConnect(context._socket, out ErrorCode);
context._socket.RegisterConnectResult(ErrorCode);
- if (result && ErrorCode == SocketError.Success && Buffer.Length > 0)
+ if (result && Buffer.Length > 0)
{
- SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory<byte>.Empty, ref BytesTransferred, Callback!, default);
- if (error != SocketError.Success && error != SocketError.IOPending)
+ if (ErrorCode == SocketError.Success)
+ {
+ Action<int, Memory<byte>, SocketFlags, SocketError>? callback = Callback;
+ Debug.Assert(callback != null);
+ SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory<byte>.Empty, ref BytesTransferred, callback!, default);
+ if (error == SocketError.IOPending)
+ {
+ // Callback ownership moved to the async send operation.
+ Callback = null;
+ Buffer = default;
+ }
+ else
+ {
+ if (error != SocketError.Success)
+ {
+ ErrorCode = error;
+ context._socket.RegisterConnectResult(ErrorCode);
+ }
+
+ // Follow-up send completed synchronously (success/error), so invoke
+ // Connect callback from this operation path.
+ Buffer = default;
+ }
+ }
+ else
{
- context._socket.RegisterConnectResult(ErrorCode);
+ // Connect failed — no follow-up send will occur.
+ // Clear buffer so callback dispatch is not suppressed.
+ Buffer = default;
}
}
return result;
@@ -670,17 +922,18 @@ protected override bool DoTryComplete(SocketAsyncContext context)
public override void InvokeCallback(bool allowPooling)
{
- var cb = Callback!;
+ Action<int, Memory<byte>, SocketFlags, SocketError>? cb = Callback;
int bt = BytesTransferred;
 Memory<byte> sa = SocketAddress;
SocketError ec = ErrorCode;
 Memory<byte> buffer = Buffer;
- if (buffer.Length == 0)
+ if (cb != null && (buffer.Length == 0 || ec == SocketError.OperationAborted))
{
// Invoke callback only when we are completely done.
// In case data were provided for Connect we may or may not send them all.
- // If we did not we will need follow-up with Send operation
+ // If we did not we will need follow-up with Send operation.
+ // On cancellation, always invoke — the send was never started.
cb(bt, sa, SocketFlags.None, ec);
}
}
@@ -890,6 +1143,9 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation
operation.CancellationRegistration = cancellationToken.UnsafeRegister(s => ((TOperation)s!).TryCancel(), operation);
}
+ // Completion-mode staging: partial method is no-op on non-Linux.
+ LinuxTryStageIoUringOperation(operation);
+
return true;
case QueueState.Stopped:
@@ -898,7 +1154,7 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation
break;
default:
- Environment.FailFast("unexpected queue state");
+ FailFastUnexpectedQueueState(_state);
break;
}
}
@@ -939,7 +1195,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper
}
else
{
- throw new InternalException(error);
+ ThrowInternalException(error);
}
}
}
@@ -986,7 +1242,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper
return null;
default:
- Environment.FailFast("unexpected queue state");
+ FailFastUnexpectedQueueState(_state);
return null;
}
}
@@ -1022,7 +1278,10 @@ internal void ProcessAsyncOperation(TOperation op)
// request for a previous operation could affect a subsequent one)
// and here we know the operation has completed.
op.CancellationRegistration.Dispose();
- op.InvokeCallback(allowPooling: true);
+ if (ShouldDispatchCompletionCallback(op))
+ {
+ op.InvokeCallback(allowPooling: true);
+ }
}
}
@@ -1041,9 +1300,29 @@ public OperationResult ProcessQueuedOperation(TOperation op)
Trace(context, $"Exit (stopped)");
return OperationResult.Cancelled;
}
+ else if (_state != QueueState.Processing)
+ {
+ Debug.Assert(_tail != null);
+ bool isHead = ReferenceEquals(op, _tail.Next);
+ if (_state == QueueState.Waiting && isHead)
+ {
+ // A previously scheduled worker can race queue-state transitions and
+ // arrive after the queue fell back to Waiting. Reclaim processing for
+ // the current head operation rather than dropping it.
+ _state = QueueState.Processing;
+ observedSequenceNumber = _sequenceNumber;
+ }
+ else
+ {
+ // io_uring completion can remove this operation concurrently and transition
+ // the queue to Ready before a previously scheduled worker starts.
+ // In that case, completion ownership has already moved elsewhere.
+ Trace(context, $"Exit (state changed before processing): {_state}");
+ return OperationResult.Cancelled;
+ }
+ }
else
{
- Debug.Assert(_state == QueueState.Processing, $"_state={_state} while processing queue!");
Debug.Assert(_tail != null, "Unexpected empty queue while processing I/O");
Debug.Assert(op == _tail.Next, "Operation is not at head of queue???");
observedSequenceNumber = _sequenceNumber;
@@ -1069,10 +1348,17 @@ public OperationResult ProcessQueuedOperation(TOperation op)
Trace(context, $"Exit (stopped)");
return OperationResult.Cancelled;
}
+ else if (_state != QueueState.Processing)
+ {
+ Debug.Assert(
+ _state == QueueState.Ready || _state == QueueState.Waiting,
+ $"Unexpected queue state while pending retry: {_state}");
+ // Completion may have raced and detached this operation from the queue.
+ Trace(context, $"Exit (state changed while pending): {_state}");
+ return OperationResult.Cancelled;
+ }
else
{
- Debug.Assert(_state == QueueState.Processing, $"_state={_state} while processing queue!");
-
if (observedSequenceNumber != _sequenceNumber)
{
// We received another epoll notification since we previously checked it.
@@ -1083,6 +1369,18 @@ public OperationResult ProcessQueuedOperation(TOperation op)
else
{
_state = QueueState.Waiting;
+ // In io_uring completion mode there may be no native readiness edge to
+ // re-drive pending operations. Re-stage non-connect operations so
+ // completion-mode retries keep making forward progress.
+ // Connect operations can legitimately remain EINPROGRESS; forcing
+ // immediate restaging there can surface spurious EALREADY.
+ if (op is not ConnectOperation)
+ {
+ if (op.TryConsumeIoUringFallbackReprepareRequested())
+ {
+ LinuxTryStageIoUringOperation(op);
+ }
+ }
Trace(context, $"Exit (received EAGAIN)");
return OperationResult.Pending;
}
@@ -1129,6 +1427,59 @@ public OperationResult ProcessQueuedOperation(TOperation op)
return result;
}
+ public bool TryRemoveCompletedOperation(SocketAsyncContext context, TOperation operation)
+ {
+ using (Lock())
+ {
+ if (_tail == null || _state == QueueState.Stopped)
+ {
+ return false;
+ }
+
+ AsyncOperation? previous = _tail;
+ AsyncOperation? current = _tail.Next;
+ while (!ReferenceEquals(current, operation))
+ {
+ if (ReferenceEquals(current, _tail))
+ {
+ return false;
+ }
+
+ previous = current;
+ current = current!.Next;
+ }
+
+ Debug.Assert(previous != null && current != null);
+ bool removedHead = ReferenceEquals(current, _tail.Next);
+ bool removedTail = ReferenceEquals(current, _tail);
+
+ if (removedHead && removedTail)
+ {
+ _tail = null;
+ _isNextOperationSynchronous = false;
+ _state = QueueState.Ready;
+ _sequenceNumber++;
+ Trace(context, $"Removed completed {IdOf(operation)} (queue empty)");
+ return true;
+ }
+
+ previous!.Next = current!.Next;
+ if (removedTail)
+ {
+ _tail = (TOperation)previous;
+ }
+
+ if (removedHead)
+ {
+ Debug.Assert(_tail != null);
+ _isNextOperationSynchronous = _tail.Next.Event != null;
+ }
+
+ Trace(context, $"Removed completed {IdOf(operation)}");
+ return true;
+ }
+ }
+
public void CancelAndContinueProcessing(TOperation op)
{
// Note, only sync operations use this method.
@@ -1244,6 +1595,17 @@ public bool StopAndAbort(SocketAsyncContext context)
return aborted;
}
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowInternalException(Interop.Error error) =>
+ throw new InternalException(error);
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void FailFastUnexpectedQueueState(QueueState state) =>
+ Environment.FailFast($"unexpected queue state: {state}");
+
[Conditional("SOCKETASYNCCONTEXT_TRACE")]
public void Trace(SocketAsyncContext context, string message, [CallerMemberName] string? memberName = null)
{
@@ -1265,6 +1627,13 @@ public void Trace(SocketAsyncContext context, string message, [CallerMemberName]
 /// <summary>An index into <see cref="SocketAsyncEngine"/>'s table of all contexts that are currently registered.</summary>
internal int GlobalContextIndex = -1;
+ ///
+ /// Wakes the io_uring event loop if this context is registered with an io_uring engine.
+ /// Called from SafeSocketHandle.TryUnblockSocket to ensure deferred cancel CQEs
+ /// (produced by shutdown/disconnect under DEFER_TASKRUN) are processed promptly.
+ ///
+ internal void WakeIoUringEventLoopIfNeeded() => _asyncEngine?.WakeIoUringEventLoopForSocketClose();
+
private readonly object _registerLock = new object();
public SocketAsyncContext(SafeSocketHandle socket)
@@ -1321,6 +1690,65 @@ private bool TryRegister(out Interop.Error error)
}
}
+ internal bool TryMigrateToEngine(int targetEngineIndex)
+ {
+ if ((uint)targetEngineIndex >= (uint)SocketAsyncEngine.EngineCount)
+ {
+ return false;
+ }
+
+ lock (_registerLock)
+ {
+ SocketAsyncEngine? currentEngine = Volatile.Read(ref _asyncEngine);
+ if (currentEngine is null)
+ {
+ return false;
+ }
+
+ if (currentEngine.EngineIndex == targetEngineIndex)
+ {
+ return true;
+ }
+
+ SocketAsyncEngine targetEngine = SocketAsyncEngine.GetEngineByIndex(targetEngineIndex);
+ bool addedRef = false;
+ Interop.Error error;
+ try
+ {
+ _socket.DangerousAddRef(ref addedRef);
+ IntPtr handle = _socket.DangerousGetHandle();
+
+ SocketAsyncEngine.UnregisterSocket(this);
+ if (SocketAsyncEngine.TryRegisterSocketWithEngine(handle, this, targetEngine, out error))
+ {
+ Volatile.Write(ref _asyncEngine, targetEngine);
+ return true;
+ }
+
+ // Best-effort rollback to the previous engine if target registration fails.
+ if (SocketAsyncEngine.TryRegisterSocketWithEngine(handle, this, currentEngine, out error))
+ {
+ Volatile.Write(ref _asyncEngine, currentEngine);
+ }
+ else
+ {
+ // Fail fast: socket is no longer registered with any engine.
+ // Clear the engine reference so subsequent operations don't target stale state.
+ Volatile.Write(ref _asyncEngine, null);
+ }
+
+ return false;
+ }
+ finally
+ {
+ if (addedRef)
+ {
+ _socket.DangerousRelease();
+ }
+ }
+ }
+ }
+
public bool StopAndAbort()
{
bool aborted = false;
@@ -1328,6 +1756,7 @@ public bool StopAndAbort()
// Drain queues
aborted |= _sendQueue.StopAndAbort(this);
aborted |= _receiveQueue.StopAndAbort(this);
+ LinuxOnStopAndAbort();
// We don't need to synchronize with Register.
// This method is called when the handle gets released.
@@ -1360,7 +1789,7 @@ public void SetHandleNonBlocking()
{
if (Interop.Sys.Fcntl.SetIsNonBlocking(_socket, 1) != 0)
{
- throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError()));
+ ThrowSocketExceptionFromLastError();
}
_isHandleNonBlocking = true;
@@ -1369,11 +1798,36 @@ public void SetHandleNonBlocking()
public bool IsHandleNonBlocking => _isHandleNonBlocking;
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ThrowIfThreadsAreNotSupported()
+ {
+ if (!Socket.OSSupportsThreads)
+ {
+ ThrowPlatformNotSupportedForMissingThreadSupport();
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ValidateSyncOperationPreconditions(int timeout)
+ {
+ ThrowIfThreadsAreNotSupported();
+ Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ }
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowPlatformNotSupportedForMissingThreadSupport() =>
+ throw new PlatformNotSupportedException();
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowSocketExceptionFromLastError() =>
+ throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError()));
+
 private void PerformSyncOperation<TOperation>(ref OperationQueue<TOperation> queue, TOperation operation, int timeout, int observedSequenceNumber)
where TOperation : AsyncOperation
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
using (var e = new ManualResetEventSlim(false, 0))
{
@@ -1509,7 +1963,7 @@ public SocketError AcceptAsync(Memory socketAddress, out int socketAddress
public SocketError Connect(Memory socketAddress)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
+ ThrowIfThreadsAreNotSupported();
Debug.Assert(socketAddress.Length > 0, $"Unexpected socketAddressLen: {socketAddress.Length}");
// Connect is different than the usual "readiness" pattern of other operations.
@@ -1603,9 +2057,7 @@ public SocketError ReceiveAsync(Memory buffer, SocketFlags flags, out int
 public SocketError ReceiveFrom(Memory<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1636,7 +2088,7 @@ public SocketError ReceiveFrom(Memory buffer, ref SocketFlags flags, Memor
 public unsafe SocketError ReceiveFrom(Span<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1674,7 +2126,15 @@ public SocketError ReceiveAsync(Memory buffer, SocketFlags flags, out int
SocketError errorCode;
int observedSequenceNumber;
+ // When there is early-buffered multishot recv data pending, skip the direct recv() syscall.
+ // The io_uring multishot recv and direct recv() compete for the same kernel socket buffer;
+ // if we let recv() succeed here, it returns data that is NEWER than the early-buffered data,
+ // causing out-of-order delivery. Instead, fall through to StartAsyncOperation which will
+ // consume the early buffer via DoTryComplete or CompletedFromBuffer on the event loop.
+ bool hasEarlyBuffered = false;
+ LinuxHasBufferedPersistentMultishotRecvData(ref hasEarlyBuffered);
if (_receiveQueue.IsReady(this, out observedSequenceNumber) &&
+ !hasEarlyBuffered &&
SocketPal.TryCompleteReceive(_socket, buffer.Span, flags, out bytesReceived, out errorCode))
{
return errorCode;
@@ -1748,9 +2208,7 @@ public SocketError ReceiveAsync(IList> buffers, SocketFlags f
 public SocketError ReceiveFrom(IList<ArraySegment<byte>> buffers, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1817,9 +2275,7 @@ public SocketError ReceiveFromAsync(IList> buffers, SocketFla
public SocketError ReceiveMessageFrom(
 Memory<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1854,9 +2310,7 @@ public SocketError ReceiveMessageFrom(
public unsafe SocketError ReceiveMessageFrom(
 Span<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1946,9 +2400,7 @@ public SocketError SendAsync(Memory buffer, int offset, int count, SocketF
 public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
@@ -1978,9 +2430,7 @@ public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flag
 public unsafe SocketError SendTo(ReadOnlySpan<byte> buffer, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
@@ -2057,9 +2507,7 @@ public SocketError SendAsync(IList> buffers, SocketFlags flag
 public SocketError SendTo(IList<ArraySegment<byte>> buffers, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
int bufferIndex = 0;
@@ -2127,9 +2575,7 @@ public SocketError SendToAsync(IList> buffers, SocketFlags fl
public SocketError SendFile(SafeFileHandle fileHandle, long offset, long count, int timeout, out long bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs
new file mode 100644
index 00000000000000..94591bd0b3f725
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs
@@ -0,0 +1,828 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ private readonly partial struct SocketEventHandler
+ {
+ private enum EarlyBufferFailureReason : byte
+ {
+ None = 0,
+ MissingBufferFlag,
+ ProvidedRingUnavailable,
+ AcquireBufferFailed,
+ BufferQueueRejected,
+ ResultExceedsBuffer,
+ RecycleFailed,
+ }
+
+ /// Delivers a completed operation to its owning socket context.
+ private void DispatchCompletedIoUringOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+ operation.AssociatedContext.TryCompleteIoUringOperation(operation);
+ }
+
+ /// Completes a deferred SEND_ZC operation when its NOTIF CQE arrives.
+ public void DispatchZeroCopyIoUringNotification(ulong payload)
+ {
+ ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ if (!_engine.TryTakeTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null)
+ {
+ return;
+ }
+
+ Debug.Assert(
+ !_engine.IsZeroCopyNotificationPending(userData),
+ "NOTIF CQE dispatch must occur only after clearing SEND_ZC pending slot state.");
+ Debug.Assert(
+ operation.IoUringUserData == userData,
+ "Deferred SEND_ZC operation must still be tracked with its original user_data at NOTIF dispatch.");
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Completed);
+ operation.ClearIoUringUserData();
+ DispatchCompletedIoUringOperation(operation);
+ }
+
+ /// Processes a single completion and dispatches it to its owning operation.
+ public void DispatchSingleIoUringCompletion(
+ ulong userData,
+ int result,
+ uint flags,
+ int socketAddressLen,
+ int controlBufferLen,
+ uint auxiliaryData,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId,
+ ref bool enqueuedFallbackEvent)
+ {
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "DispatchSingleIoUringCompletion must only run on the event-loop thread.");
+ if (userData == 0)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+ return;
+ }
+
+ // Benign race: cancellation/abort paths may have already removed this tracked entry.
+ if (!_engine.TryTakeTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation))
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+ return;
+ }
+
+ if (operation is null)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+ return;
+ }
+
+ SocketAsyncContext receiveContext = operation.AssociatedContext;
+ if (receiveContext.IsPersistentMultishotRecvArmed() &&
+ receiveContext.PersistentMultishotRecvUserData == userData)
+ {
+ // Terminal CQE for persistent multishot recv: clear armed-state so the
+ // next receive can re-arm.
+ receiveContext.ClearPersistentMultishotRecvArmed();
+
+ // If a new operation piggybacked on this multishot via TryReplace, its
+ // IoUringUserData was set to the multishot's userData. Let it through so
+ // ProcessIoUringCompletionResult delivers the terminal error and the
+ // operation can retry with a new SQE. If the operation was recycled
+ // (IoUringUserData cleared in DispatchMultishotIoUringCompletion),
+ // discard to prevent corrupting the recycled operation's state.
+ if (operation.IoUringUserData != userData)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+ return;
+ }
+ }
+
+ if (operation is SocketAsyncContext.AcceptOperation acceptOperation)
+ {
+ SocketAsyncContext acceptContext = acceptOperation.AssociatedContext;
+ if (acceptContext.MultishotAcceptUserData == userData)
+ {
+ acceptContext.DisarmMultishotAccept();
+ }
+ else if (operation.IoUringUserData != userData)
+ {
+ // The multishot accept was already disarmed and completed by
+ // DispatchMultishotAcceptIoUringCompletion (which cleared IoUringUserData).
+ // This is the terminal ECANCELED CQE for a recycled operation — discard.
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+ return;
+ }
+ }
+
+ uint completionAuxiliaryData = auxiliaryData;
+ int completionResultCode = result;
+ if (!TryMaterializeIoUringReceiveCompletion(
+ operation!,
+ completionResultCode,
+ flags,
+ hasFixedRecvBuffer,
+ fixedRecvBufferId,
+ ref completionAuxiliaryData))
+ {
+ completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS);
+ completionAuxiliaryData = 0;
+ }
+
+ // Process completion metadata before processing result to allow message post-processing.
+ operation!.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen);
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+ operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData);
+ if (completionDispatchResult == SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed &&
+ _engine.IsZeroCopyNotificationPending(userData))
+ {
+ // SEND_ZC API contract: complete managed operation only once NOTIF confirms
+ // the kernel/NIC no longer references the caller buffer.
+ _engine.AssertZeroCopyDeferredCompletionState(userData, operation);
+ if (!_engine.TryReattachTrackedIoUringOperation(userData, operation))
+ {
+ Debug.Fail("SEND_ZC deferred completion reattach failed; completing operation with EINVAL and releasing deferred slot.");
+ bool cleanedDeferredSlot = _engine.TryCleanupDeferredZeroCopyCompletionSlot(userData);
+ Debug.Assert(
+ cleanedDeferredSlot,
+ "SEND_ZC deferred completion reattach failure should release the deferred completion slot.");
+ operation.ErrorCode = SocketPal.GetSocketErrorForErrorCode(Interop.Error.EINVAL);
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Completed);
+ operation.ClearIoUringUserData();
+ DispatchCompletedIoUringOperation(operation);
+ return;
+ }
+
+ return;
+ }
+
+ DispatchIoUringCompletionResult(
+ operation,
+ completionDispatchResult,
+ ref enqueuedFallbackEvent);
+ }
+
+ ///
+ /// Processes a multishot completion by completing the current operation and
+ /// preserving persistent multishot ownership unless terminal completion requires disarm.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public void DispatchMultishotIoUringCompletion(
+ ulong userData,
+ int result,
+ uint flags,
+ int socketAddressLen,
+ int controlBufferLen,
+ uint auxiliaryData,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId,
+ ref bool enqueuedFallbackEvent)
+ {
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "DispatchMultishotIoUringCompletion must only run on the event-loop thread.");
+ _ = enqueuedFallbackEvent; // Transitional path never requeues via readiness fallback.
+ _ = hasFixedRecvBuffer;
+ _ = fixedRecvBufferId;
+ Debug.Assert((flags & IoUringConstants.CqeFMore) != 0,
+ "Multishot dispatch must only be used for non-terminal CQEs (IORING_CQE_F_MORE).");
+
+ if (userData == 0)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+ return;
+ }
+
+ if (!_engine.TryGetTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+ return;
+ }
+
+ if (operation is SocketAsyncContext.AcceptOperation acceptOperation)
+ {
+ DispatchMultishotAcceptIoUringCompletion(
+ acceptOperation,
+ userData,
+ result,
+ flags,
+ socketAddressLen,
+ auxiliaryData);
+ return;
+ }
+
+ // Guard against ThreadPool recycling: when a persistent multishot recv
+ // completion is dispatched (QueueIoUringCompletionCallback), the ThreadPool
+ // may recycle the operation (reset state to Waiting) before the event loop
+ // finishes draining the CQE batch. IoUringUserData is zeroed on the event-loop
+ // thread at completion (line 339) and only restored during prepare-queue drain
+ // (after CQE processing), so IoUringUserData==0 reliably detects a completed-
+ // but-not-yet-retracked operation regardless of ThreadPool-driven state changes.
+ if (operation.IoUringUserData == 0 || !operation.IsInWaitingState())
+ {
+ if (result <= 0)
+ {
+ // Terminal/error shots observed without a waiting managed receiver must
+ // still drive cancel/disarm so the tracked multishot slot cannot stall.
+ _engine.TryRequestIoUringCancellation(userData);
+ return;
+ }
+
+ SocketAsyncContext opContext = operation.AssociatedContext;
+ if (!TryBufferEarlyPersistentMultishotRecvCompletion(opContext, result, flags, out EarlyBufferFailureReason bufferFailureReason))
+ {
+ if (ShouldCancelPersistentMultishotAfterEarlyBufferFailure(bufferFailureReason))
+ {
+ _engine.TryRequestIoUringCancellation(userData);
+ }
+ }
+
+ return;
+ }
+
+ SocketAsyncContext context = operation.AssociatedContext;
+ bool isPersistentMultishotRecv =
+ context.IsPersistentMultishotRecvArmed() &&
+ context.PersistentMultishotRecvUserData == userData;
+ uint completionAuxiliaryData = auxiliaryData;
+ int completionResultCode = result;
+ if (!TryMaterializeIoUringReceiveCompletion(
+ operation,
+ completionResultCode,
+ flags,
+ hasFixedRecvBuffer: false,
+ fixedRecvBufferId: 0,
+ ref completionAuxiliaryData))
+ {
+ if (isPersistentMultishotRecv && completionResultCode > 0)
+ {
+ // Under transient provided-buffer pressure, drop this shot and keep the
+ // persistent multishot request armed instead of surfacing ENOBUFS.
+ return;
+ }
+
+ completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS);
+ completionAuxiliaryData = 0;
+ }
+
+ operation.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen);
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+ operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData);
+ bool shouldCancelPersistentMultishotRecv =
+ isPersistentMultishotRecv &&
+ completionDispatchResult == SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed &&
+ completionResultCode <= 0;
+
+ if (!isPersistentMultishotRecv || shouldCancelPersistentMultishotRecv)
+ {
+ _engine.TryRequestIoUringCancellation(userData);
+ }
+
+ switch (completionDispatchResult)
+ {
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+ if (isPersistentMultishotRecv)
+ {
+ // Zero only IoUringUserData (not the full ClearIoUringUserData which
+ // wipes completion metadata needed by the callback). The terminal CQE
+ // uses this to distinguish recycled (userData=0) from piggybacked
+ // (userData=armedUserData) operations.
+ operation.IoUringUserData = 0;
+ }
+
+ DispatchCompletedIoUringOperation(operation);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+ // Persistent multishot receives stay armed; intermediate shots are
+ // delivered through completion-mode dispatch without readiness fallback.
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+ break;
+
+ default:
+ Debug.Fail($"Unexpected io_uring multishot completion result: {completionDispatchResult}");
+ break;
+ }
+ }
+
+ ///
+ /// Handles transitional multishot-accept CQEs by completing one waiting operation and
+ /// canceling the multishot request. Extra successful shots are queued for dequeue on
+ /// the accept operation queue when possible.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void DispatchMultishotAcceptIoUringCompletion(
+ SocketAsyncContext.AcceptOperation operation,
+ ulong userData,
+ int result,
+ uint flags,
+ int socketAddressLen,
+ uint auxiliaryData)
+ {
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "DispatchMultishotAcceptIoUringCompletion must only run on the event-loop thread.");
+ operation.SetIoUringCompletionMessageMetadata(socketAddressLen, 0);
+ SocketAsyncContext context = operation.AssociatedContext;
+
+ if (result >= 0 && s_fdEngineAffinity is not null)
+ SetFdEngineAffinity(result, _engine.EngineIndex);
+
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+ operation.ProcessIoUringCompletionResult(result, flags, auxiliaryData);
+
+ bool hasMoreShots = (flags & IoUringConstants.CqeFMore) != 0;
+ bool shouldCancelMultishotAccept =
+ completionDispatchResult == SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed ||
+ result < 0 ||
+ !hasMoreShots;
+ if (shouldCancelMultishotAccept)
+ {
+ _engine.TryRequestIoUringCancellation(userData);
+ }
+
+ switch (completionDispatchResult)
+ {
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+ // Disarm multishot state and tag the operation before dispatching.
+ // Zero only IoUringUserData (not the full ClearIoUringUserData which
+ // wipes completion metadata needed by the callback). The terminal
+ // ECANCELED CQE uses IoUringUserData to distinguish recycled (userData=0)
+ // from piggybacked (userData=armedUserData) operations.
+ context.DisarmMultishotAccept();
+ operation.IoUringUserData = 0;
+ DispatchCompletedIoUringOperation(operation);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+ if (result >= 0)
+ {
+ int addressLength = auxiliaryData > (uint)operation.SocketAddress.Length ?
+ operation.SocketAddress.Length :
+ (int)auxiliaryData;
+ if (context.TryEnqueuePreAcceptedConnection((IntPtr)result, operation.SocketAddress.Span, addressLength))
+ {
+ _engine.EnqueueReadinessFallbackEvent(context, Interop.Sys.SocketEvents.Read);
+ }
+ else
+ {
+ CloseAcceptedFd(result);
+ }
+ }
+ break;
+
+ default:
+ Debug.Fail($"Unexpected io_uring multishot accept completion result: {completionDispatchResult}");
+ break;
+ }
+ }
+
+ ///
+ /// Dispatches a CQE from a SO_REUSEPORT shadow listener's multishot accept.
+ /// Shadow listeners have no pending AcceptAsync operations — accepted fds are
+ /// forwarded directly to the primary listener's pre-accept queue.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public void DispatchReusePortAcceptIoUringCompletion(
+ ulong userData,
+ int result,
+ uint flags,
+ int socketAddressLen,
+ uint auxiliaryData)
+ {
+ _ = flags;
+ // Reuse-port multishot accept SQEs are intentionally armed without sockaddr writeback
+ // (shared-address writeback is race-prone for multishot batches). Enqueued accepted
+ // sockets carry no peer address metadata here; endpoint resolution is deferred.
+ _ = socketAddressLen;
+ _ = auxiliaryData;
+ ulong payload = userData & IoUringUserDataPayloadMask;
+ int slotIndex = DecodeCompletionSlotIndex(payload);
+ IoUringCompletionSlotStorage[]? storageArray = _engine._completionSlotStorage;
+ if (storageArray is null || (uint)slotIndex >= (uint)storageArray.Length)
+ {
+ // Stale or invalid slot; close any accepted fd to prevent leak.
+ if (result >= 0)
+ {
+ CloseAcceptedFd(result);
+ }
+ return;
+ }
+
+ ref IoUringCompletionSlotStorage slotStorage = ref storageArray[slotIndex];
+ SocketAsyncContext? primaryContext = slotStorage.ReusePortPrimaryContext;
+ SocketAsyncEngine? primaryEngine = slotStorage.ReusePortPrimaryEngine;
+
+ if (result < 0)
+ {
+ // Error CQE — nothing to enqueue. If this is a terminal CQE (no MORE flag),
+ // the slot will be freed by the caller after we return.
+ return;
+ }
+
+ // Successful accept: forward the fd to the primary listener's pre-accept queue.
+ SetFdEngineAffinity(result, _engine.EngineIndex);
+ if (primaryContext is not null && primaryEngine is not null)
+ {
+ if (primaryContext.TryEnqueuePreAcceptedConnection((IntPtr)result, ReadOnlySpan.Empty, 0))
+ {
+ primaryEngine.EnqueueReadinessFallbackEvent(primaryContext, Interop.Sys.SocketEvents.Read);
+ }
+ else
+ {
+ CloseAcceptedFd(result);
+ }
+ }
+ else
+ {
+ // Primary context/engine not set — orphaned slot; close the fd.
+ CloseAcceptedFd(result);
+ }
+ }
+
+ ///
+ /// For receive completions that used provided buffers (buffer-select or fixed receive),
+ /// materializes payload bytes into the operation target and recycles checked-out buffers.
+ ///
+ private unsafe bool TryMaterializeIoUringReceiveCompletion(
+ SocketAsyncContext.AsyncOperation operation,
+ int result,
+ uint flags,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId,
+ ref uint auxiliaryData)
+ {
+ bool hasSelectedBuffer = (flags & IoUringConstants.CqeFBuffer) != 0;
+ if (!hasFixedRecvBuffer && !hasSelectedBuffer)
+ {
+ return true;
+ }
+
+ IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ return false;
+ }
+
+ ushort bufferId;
+ bool reportRecycleFailureAsDepletion;
+ byte* providedBuffer = null;
+ int providedBufferLength = 0;
+ if (hasFixedRecvBuffer)
+ {
+ bufferId = fixedRecvBufferId;
+ reportRecycleFailureAsDepletion = true;
+
+ if (result > 0 &&
+ !providedBufferRing.TryGetCheckedOutBuffer(
+ bufferId,
+ out providedBuffer,
+ out providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ return false;
+ }
+ }
+ else
+ {
+ bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ reportRecycleFailureAsDepletion = false;
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out providedBuffer,
+ out providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ return false;
+ }
+ }
+
+ bool handled = result <= 0;
+ try
+ {
+ if (result > 0)
+ {
+ handled =
+ operation.TryProcessIoUringProvidedBufferCompletion(
+ providedBuffer,
+ providedBufferLength,
+ result,
+ ref auxiliaryData);
+ }
+
+ RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+ }
+ finally
+ {
+ handled &= TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: reportRecycleFailureAsDepletion);
+ }
+
+ return handled;
+ }
+
+ ///
+ /// For persistent multishot recv, buffers payload bytes that arrive while no
+ /// managed receive operation is in the Waiting state.
+ ///
+ private unsafe bool TryBufferEarlyPersistentMultishotRecvCompletion(
+ SocketAsyncContext context,
+ int result,
+ uint flags,
+ out EarlyBufferFailureReason failureReason)
+ {
+ failureReason = EarlyBufferFailureReason.None;
+ Debug.Assert(result > 0, $"Expected positive result for early-buffered multishot recv, got {result}");
+
+ if ((flags & IoUringConstants.CqeFBuffer) == 0)
+ {
+ failureReason = EarlyBufferFailureReason.MissingBufferFlag;
+ return false;
+ }
+
+ IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ failureReason = EarlyBufferFailureReason.ProvidedRingUnavailable;
+ return false;
+ }
+
+ ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out byte* providedBuffer,
+ out int providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ failureReason = EarlyBufferFailureReason.AcquireBufferFailed;
+ return false;
+ }
+
+ bool buffered = false;
+ try
+ {
+ if ((uint)result <= (uint)providedBufferLength)
+ {
+ buffered = context.TryBufferEarlyPersistentMultishotRecvData(
+ new ReadOnlySpan(providedBuffer, result));
+ if (buffered)
+ {
+ RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+ }
+ else
+ {
+ failureReason = EarlyBufferFailureReason.BufferQueueRejected;
+ }
+ }
+ else
+ {
+ failureReason = EarlyBufferFailureReason.ResultExceedsBuffer;
+ }
+ }
+ finally
+ {
+ buffered &= TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: false);
+ }
+
+ if (!buffered && failureReason == EarlyBufferFailureReason.None)
+ {
+ failureReason = EarlyBufferFailureReason.RecycleFailed;
+ }
+
+ return buffered;
+ }
+
+ /// <summary>
+ /// Maps an early-buffer failure reason to whether the armed persistent multishot
+ /// recv should be cancelled (true) or kept armed with the shot dropped (false).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool ShouldCancelPersistentMultishotAfterEarlyBufferFailure(EarlyBufferFailureReason failureReason) =>
+ failureReason switch
+ {
+ // Transient pressure: keep multishot armed and drop this shot.
+ EarlyBufferFailureReason.AcquireBufferFailed => false,
+ // Back-pressure: cancel multishot when the early-buffer queue is full to prevent
+ // further data loss. The multishot will be re-armed when the next recv operation
+ // drains the buffer and submits a fresh SQE.
+ EarlyBufferFailureReason.BufferQueueRejected => true,
+ // All remaining reasons are treated as unrecoverable for this armed request.
+ _ => true,
+ };
+
+ /// <summary>
+ /// Recycles a provided-buffer selection for completions that can no longer be
+ /// dispatched to a tracked operation (e.g., late multishot CQEs after cancel).
+ /// </summary>
+ private unsafe void RecycleUntrackedReceiveCompletionBuffers(
+ uint flags,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId)
+ {
+ IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ return;
+ }
+
+ // No buffer-select flag: only a fixed-receive checkout (if any) needs recycling.
+ if ((flags & IoUringConstants.CqeFBuffer) == 0)
+ {
+ if (hasFixedRecvBuffer)
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ fixedRecvBufferId,
+ reportFailureAsDepletion: true);
+ }
+
+ return;
+ }
+
+ // Buffer-select flag present: acquire-then-recycle the kernel-selected buffer so
+ // it returns to the ring; acquisition failure is recorded as depletion.
+ ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out _,
+ out _))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ }
+ else
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: false);
+ }
+
+ // A fixed-receive checkout is independent of the buffer-select recycle above.
+ if (hasFixedRecvBuffer)
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ fixedRecvBufferId,
+ reportFailureAsDepletion: true);
+ }
+ }
+
+ /// <summary>
+ /// Feeds a completion's payload size into adaptive provided-buffer sizing.
+ /// No-op for non-positive transfers or when adaptive sizing is disabled.
+ /// </summary>
+ private void RecordProvidedBufferUtilizationIfEnabled(
+ IoUringProvidedBufferRing providedBufferRing,
+ int bytesTransferred)
+ {
+ if (bytesTransferred <= 0 || !_engine._adaptiveBufferSizingEnabled)
+ {
+ return;
+ }
+
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "Adaptive provided-buffer utilization tracking must run on the event-loop thread.");
+ providedBufferRing.RecordCompletionUtilization(bytesTransferred);
+ }
+
+ /// <summary>
+ /// Returns a checked-out provided buffer to the ring; optionally records a
+ /// recycle failure as buffer depletion for the current drain batch.
+ /// </summary>
+ private bool TryRecycleProvidedBufferFromCheckedOutState(
+ IoUringProvidedBufferRing providedBufferRing,
+ ushort bufferId,
+ bool reportFailureAsDepletion)
+ {
+ bool recycled = providedBufferRing.TryRecycleBufferFromCompletion(bufferId);
+ if (!recycled && reportFailureAsDepletion)
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ }
+
+ return recycled;
+ }
+
+ /// <summary>
+ /// Requeues a pending operation or falls back to readiness notification.
+ /// Returns true only when a readiness fallback event was enqueued.
+ /// </summary>
+ private bool DispatchPendingIoUringOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+ // Fast path: re-prepare inline on the event-loop thread (completion mode only).
+ PendingIoUringReprepareResult inlineReprepareResult = TryDispatchPendingIoUringOperationInline(operation);
+ if (inlineReprepareResult == PendingIoUringReprepareResult.Prepared)
+ {
+ return false;
+ }
+
+ // Next: hand the operation to the prepare queue for a later SQE.
+ if (inlineReprepareResult == PendingIoUringReprepareResult.NotAttempted &&
+ operation.TryQueueIoUringPreparation())
+ {
+ _engine._ioUringPendingRetryQueuedToPrepareQueueCount++;
+ return false;
+ }
+
+ Debug.Assert(
+ inlineReprepareResult == PendingIoUringReprepareResult.Failed ||
+ !_engine._ioUringCapabilities.IsCompletionMode,
+ "Requeue should not fail in pure io_uring completion mode when inline re-prepare was not attempted.");
+
+ // Last resort: fall back to readiness-based (epoll-style) notification.
+ operation.ClearIoUringUserData();
+ Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents();
+ if (fallbackEvents == Interop.Sys.SocketEvents.None)
+ {
+ return false;
+ }
+
+ _eventQueue.Enqueue(new SocketIOEvent(operation.AssociatedContext, fallbackEvents));
+ return true;
+ }
+
+ /// <summary>
+ /// Outcome of an inline re-prepare attempt for a pending operation
+ /// (see TryDispatchPendingIoUringOperationInline).
+ /// </summary>
+ private enum PendingIoUringReprepareResult : byte
+ {
+ NotAttempted = 0,
+ Prepared = 1,
+ Failed = 2
+ }
+
+ ///
+ /// Attempts to re-prepare a pending operation inline.
+ /// Returns whether inline re-prepare was prepared, skipped, or failed without producing an SQE.
+ ///
+ private PendingIoUringReprepareResult TryDispatchPendingIoUringOperationInline(SocketAsyncContext.AsyncOperation operation)
+ {
+ if (!_engine._ioUringCapabilities.IsCompletionMode || !_engine.IsCurrentThreadEventLoopThread())
+ {
+ return PendingIoUringReprepareResult.NotAttempted;
+ }
+
+ long prepareSequence = operation.MarkReadyForIoUringPreparation();
+ Interop.Error prepareError = _engine.TryPrepareAndTrackIoUringOperation(
+ operation,
+ prepareSequence,
+ out bool preparedSqe);
+ if (prepareError != Interop.Error.SUCCESS)
+ {
+ Debug.Fail($"io_uring inline re-prepare failed: {prepareError}");
+
+ return PendingIoUringReprepareResult.Failed;
+ }
+
+ return preparedSqe ? PendingIoUringReprepareResult.Prepared : PendingIoUringReprepareResult.Failed;
+ }
+
+ /// <summary>
+ /// Routes a CQE completion result to the appropriate dispatch behavior
+ /// (complete, requeue as pending, or drop for canceled/ignored).
+ /// </summary>
+ private void DispatchIoUringCompletionResult(
+ SocketAsyncContext.AsyncOperation operation,
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionResult,
+ ref bool enqueuedFallbackEvent)
+ {
+ switch (completionResult)
+ {
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Completed);
+ operation.ClearIoUringUserData();
+ DispatchCompletedIoUringOperation(operation);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Queued);
+ // Operations that can reuse their prepared resources keep them across the
+ // requeue; otherwise the user-data slot is fully cleared before retry.
+ if (operation.ShouldReuseIoUringPreparationResourcesOnPending)
+ {
+ operation.MarkIoUringPreparationReusable();
+ operation.ResetIoUringUserDataForRequeue();
+ }
+ else
+ {
+ operation.ClearIoUringUserData();
+ }
+
+ enqueuedFallbackEvent |= DispatchPendingIoUringOperation(operation);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Canceled);
+ operation.ClearIoUringUserData();
+ break;
+
+ default:
+ Debug.Fail($"Unexpected io_uring completion result: {completionResult}");
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Detached);
+ operation.ClearIoUringUserData();
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs
new file mode 100644
index 00000000000000..e3945cf06720e8
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs
@@ -0,0 +1,384 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics.CodeAnalysis;
+using System.Diagnostics;
+using System.Collections.Generic;
+using System.IO;
+using System.Runtime.CompilerServices;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ /// <summary>
+ /// Parses an environment variable as a "0"/"1" boolean switch. Returns null if unset or unrecognized.
+ /// </summary>
+ private static bool? TryParseBoolSwitch(string? value)
+ {
+ if (string.Equals(value, "1", StringComparison.Ordinal)) return true;
+ if (string.Equals(value, "0", StringComparison.Ordinal)) return false;
+ return null;
+ }
+
+ /// <summary>
+ /// Immutable snapshot of the raw configuration sources (environment variables and
+ /// AppContext feature switches) read once per process.
+ /// </summary>
+ private readonly struct IoUringConfigurationInputs
+ {
+ internal readonly string? IoUringEnvironmentValue;
+ internal readonly bool IoUringFeatureSwitchEnabled;
+ internal readonly string? SqPollEnvironmentValue;
+ internal readonly bool SqPollFeatureSwitchEnabled;
+ internal readonly string? DirectSqeEnvironmentValue;
+ internal readonly string? ZeroCopySendEnvironmentValue;
+
+ internal IoUringConfigurationInputs(
+ string? ioUringEnvironmentValue,
+ bool ioUringFeatureSwitchEnabled,
+ string? sqPollEnvironmentValue,
+ bool sqPollFeatureSwitchEnabled,
+ string? directSqeEnvironmentValue,
+ string? zeroCopySendEnvironmentValue)
+ {
+ IoUringEnvironmentValue = ioUringEnvironmentValue;
+ IoUringFeatureSwitchEnabled = ioUringFeatureSwitchEnabled;
+ SqPollEnvironmentValue = sqPollEnvironmentValue;
+ SqPollFeatureSwitchEnabled = sqPollFeatureSwitchEnabled;
+ DirectSqeEnvironmentValue = directSqeEnvironmentValue;
+ ZeroCopySendEnvironmentValue = zeroCopySendEnvironmentValue;
+ }
+ }
+
+ // One-time static lookup per process, following the standard .NET pattern
+ // (e.g. GlobalizationMode). Configuration is not expected to change mid-process.
+ private static readonly IoUringConfigurationInputs s_cachedConfigInputs = ReadIoUringConfigurationInputs();
+
+ /// <summary>
+ /// Resolves the effective io_uring configuration from the cached inputs plus the
+ /// remaining static tuning knobs (buffer sizes and queue capacities).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static IoUringResolvedConfiguration ResolveIoUringResolvedConfiguration()
+ {
+ IoUringConfigurationInputs inputs = s_cachedConfigInputs;
+ return new IoUringResolvedConfiguration(
+ ioUringEnabled: ResolveIoUringEnabled(inputs),
+ sqPollRequested: ResolveSqPollRequested(inputs),
+ directSqeDisabled: ResolveIoUringDirectSqeDisabled(inputs),
+ zeroCopySendOptedIn: ResolveZeroCopySendOptedIn(inputs),
+ registerBuffersEnabled: s_ioUringRegisterBuffersEnabled,
+ adaptiveProvidedBufferSizingEnabled: s_ioUringAdaptiveBufferSizingEnabled,
+ providedBufferSize: s_ioUringProvidedBufferSize,
+ prepareQueueCapacity: s_ioUringPrepareQueueCapacity,
+ cancellationQueueCapacity: s_ioUringCancellationQueueCapacity);
+ }
+
+ /// <summary>
+ /// Reads all raw configuration sources once. The DEBUG-only test overrides
+ /// (direct SQE, zero-copy send) are compiled out of release builds.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static IoUringConfigurationInputs ReadIoUringConfigurationInputs()
+ {
+#if DEBUG
+ string? directSqeValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.DirectSqe);
+ string? zeroCopySendValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ZeroCopySend);
+#else
+ string? directSqeValue = null;
+ string? zeroCopySendValue = null;
+#endif
+
+ return new IoUringConfigurationInputs(
+ ioUringEnvironmentValue: Environment.GetEnvironmentVariable(IoUringEnvironmentVariable),
+ ioUringFeatureSwitchEnabled: IsIoUringFeatureEnabled,
+ sqPollEnvironmentValue: Environment.GetEnvironmentVariable(IoUringSqPollEnvironmentVariable),
+ sqPollFeatureSwitchEnabled: IsSqPollFeatureEnabled,
+ directSqeEnvironmentValue: directSqeValue,
+ zeroCopySendEnvironmentValue: zeroCopySendValue);
+ }
+
+ /// <summary>Checks whether io_uring is enabled (env var overrides AppContext switch).</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool IsIoUringEnabled()
+ {
+ return ResolveIoUringEnabled(s_cachedConfigInputs);
+ }
+
+ /// <summary>AppContext feature switch for io_uring; defaults to false when unset.</summary>
+ [FeatureSwitchDefinition(UseIoUringAppContextSwitch)]
+ private static bool IsIoUringFeatureEnabled
+ {
+ get
+ {
+ if (AppContext.TryGetSwitch(UseIoUringAppContextSwitch, out bool enabled))
+ {
+ return enabled;
+ }
+
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Returns whether SEND_ZC should be enabled.
+ /// Defaults to enabled; test-only env var can disable for deterministic tests.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool IsZeroCopySendOptedIn()
+ {
+ return ResolveZeroCopySendOptedIn(s_cachedConfigInputs);
+ }
+
+ /// <summary>Resolves the DEBUG-only direct-SQE kill switch; always false in release builds.</summary>
+ private static bool ResolveIoUringDirectSqeDisabled(in IoUringConfigurationInputs inputs)
+ {
+#if DEBUG
+ // Test-only override for deterministic coverage.
+ // Inverted: "0" disables direct SQE (returns true), "1" enables (returns false).
+ bool? parsed = TryParseBoolSwitch(inputs.DirectSqeEnvironmentValue);
+ if (parsed.HasValue) return !parsed.Value;
+#endif
+ return false;
+ }
+
+ /// <summary>Env var takes precedence; otherwise the AppContext feature switch decides.</summary>
+ private static bool ResolveIoUringEnabled(in IoUringConfigurationInputs inputs) =>
+ TryParseBoolSwitch(inputs.IoUringEnvironmentValue) ?? inputs.IoUringFeatureSwitchEnabled;
+
+ /// <summary>Zero-copy send defaults to true; DEBUG builds honor the test env var override.</summary>
+ private static bool ResolveZeroCopySendOptedIn(in IoUringConfigurationInputs inputs)
+ {
+#if DEBUG
+ bool? parsed = TryParseBoolSwitch(inputs.ZeroCopySendEnvironmentValue);
+ if (parsed.HasValue) return parsed.Value;
+#endif
+ return true;
+ }
+
+ /// <summary>AppContext feature switch for SQPOLL; defaults to false when unset.</summary>
+ [FeatureSwitchDefinition(UseIoUringSqPollAppContextSwitch)]
+ private static bool IsSqPollFeatureEnabled
+ {
+ get
+ {
+ if (AppContext.TryGetSwitch(UseIoUringSqPollAppContextSwitch, out bool enabled))
+ {
+ return enabled;
+ }
+
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Returns whether SQPOLL mode has been explicitly requested.
+ /// Follows the standard .NET configuration pattern: environment variable
+ /// overrides AppContext switch; either source alone is sufficient.
+ /// </summary>
+ private static bool IsSqPollRequested()
+ {
+ return ResolveSqPollRequested(s_cachedConfigInputs);
+ }
+
+ /// <summary>Env var takes precedence; otherwise the SQPOLL feature switch decides.</summary>
+ private static bool ResolveSqPollRequested(in IoUringConfigurationInputs inputs) =>
+ TryParseBoolSwitch(inputs.SqPollEnvironmentValue) ?? inputs.SqPollFeatureSwitchEnabled;
+
+ /// <summary>
+ /// Returns whether multishot accept should be force-disabled.
+ /// This is an emergency kill-switch to isolate multishot-accept issues
+ /// while keeping other io_uring features enabled.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool IsMultishotAcceptDisabled() =>
+ string.Equals(
+ Environment.GetEnvironmentVariable(IoUringDisableMultishotAcceptEnvironmentVariable),
+ "1",
+ StringComparison.Ordinal);
+
+ /// <summary>
+ /// Returns whether SO_REUSEPORT accept distribution across io_uring engines is disabled.
+ /// This is an emergency kill-switch; REUSEPORT accept is on by default when multiple
+ /// engines are active. Setting the env var to "1" disables shadow listener creation.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ internal static bool IsReusePortAcceptDisabled() =>
+ string.Equals(
+ Environment.GetEnvironmentVariable(IoUringDisableReusePortAcceptEnvironmentVariable),
+ "1",
+ StringComparison.Ordinal);
+
+ /// <summary>
+ /// One physical core: the lowest-numbered logical CPU (representative) plus all
+ /// SMT siblings sharing that (package, core) pair.
+ /// </summary>
+ private readonly struct PhysicalCoreGroup
+ {
+ internal PhysicalCoreGroup(int representativeCpu, int[] logicalCpus)
+ {
+ RepresentativeCpu = representativeCpu;
+ LogicalCpus = logicalCpus;
+ }
+
+ internal int RepresentativeCpu { get; }
+ internal int[] LogicalCpus { get; }
+ }
+
+ /// <summary>
+ /// Caps the engine count at one engine per physical core, pins each engine to a
+ /// core's representative CPU, and builds a logical-CPU → engine-index map.
+ /// No-op unless running on Linux with io_uring enabled.
+ /// </summary>
+ static partial void LinuxInitializeEngineAffinityTopology(ref int engineCount, ref int[]? pinnedCpuIndices, ref int[]? cpuToEngineIndex)
+ {
+ if (!OperatingSystem.IsLinux() || !IsIoUringEnabled())
+ {
+ return;
+ }
+
+ if (!TryDetectPhysicalCoreTopology(out PhysicalCoreGroup[]? groups) || groups is null || groups.Length == 0)
+ {
+ // Topology unavailable: keep one engine per logical CPU up to a defensive cap.
+ int fallbackCount = Math.Max(1, Math.Min(Environment.ProcessorCount, 32));
+ engineCount = Math.Min(engineCount, fallbackCount);
+ return;
+ }
+
+ engineCount = Math.Min(engineCount, groups.Length);
+ if (engineCount <= 0)
+ {
+ engineCount = 1;
+ }
+
+ // First pass: record each engine's pinned CPU and find the highest CPU index
+ // referenced anywhere, so the map below is sized to cover it.
+ pinnedCpuIndices = new int[engineCount];
+ int maxCpuIndex = -1;
+ for (int i = 0; i < engineCount; i++)
+ {
+ int representativeCpu = groups[i].RepresentativeCpu;
+ pinnedCpuIndices[i] = representativeCpu;
+ if (representativeCpu > maxCpuIndex)
+ {
+ maxCpuIndex = representativeCpu;
+ }
+
+ int[] logicalCpus = groups[i].LogicalCpus;
+ for (int j = 0; j < logicalCpus.Length; j++)
+ {
+ if (logicalCpus[j] > maxCpuIndex)
+ {
+ maxCpuIndex = logicalCpus[j];
+ }
+ }
+ }
+
+ // Second pass: map every logical CPU of an engine's core to that engine;
+ // unmapped CPUs stay -1.
+ int mapLength = Math.Max(Environment.ProcessorCount, maxCpuIndex + 1);
+ cpuToEngineIndex = new int[mapLength];
+ Array.Fill(cpuToEngineIndex, -1);
+
+ for (int engineIndex = 0; engineIndex < engineCount; engineIndex++)
+ {
+ foreach (int cpu in groups[engineIndex].LogicalCpus)
+ {
+ if ((uint)cpu < (uint)cpuToEngineIndex.Length)
+ {
+ cpuToEngineIndex[cpu] = engineIndex;
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Best-effort: pins the current (event-loop) thread to the engine's configured CPU.
+ /// CPUs beyond the native-word bit width cannot be expressed in the IntPtr mask and
+ /// are skipped; sched_setaffinity failures are intentionally ignored.
+ /// </summary>
+ partial void LinuxPinEventLoopThreadIfConfigured()
+ {
+ if (_pinnedCpuIndex < 0 || _pinnedCpuIndex >= IntPtr.Size * 8 || !IsIoUringEnabled())
+ {
+ return;
+ }
+
+ IntPtr mask = (IntPtr)unchecked((nint)(1UL << _pinnedCpuIndex));
+ if (Interop.Sys.SchedSetAffinity(0, ref mask) != 0)
+ {
+ return;
+ }
+
+ // NOTE(review): the success path currently does nothing beyond the syscall;
+ // the early return above exists only to mirror a removed success-side action —
+ // confirm whether follow-up work (e.g. telemetry) was intended here.
+ }
+
+ /// <summary>
+ /// Reads /sys/devices/system/cpu topology to group logical CPUs by
+ /// (physical_package_id, core_id), restricted to CPUs in this thread's affinity
+ /// mask and representable in a native-word bitmask. Groups are ordered by their
+ /// lowest logical CPU. Returns false when the topology cannot be determined.
+ /// </summary>
+ private static bool TryDetectPhysicalCoreTopology([NotNullWhen(true)] out PhysicalCoreGroup[]? groups)
+ {
+ groups = null;
+ const string cpuRoot = "/sys/devices/system/cpu";
+ if (!Directory.Exists(cpuRoot))
+ {
+ return false;
+ }
+
+ // If the affinity query fails, assume all CPUs are usable (all bits set).
+ IntPtr affinityMask = IntPtr.Zero;
+ if (Interop.Sys.SchedGetAffinity(0, out affinityMask) != 0)
+ {
+ affinityMask = (IntPtr)(-1);
+ }
+
+ int affinityBitCount = IntPtr.Size * 8;
+ var cpuDirectories = new List<int>();
+ foreach (string cpuPath in Directory.EnumerateDirectories(cpuRoot, "cpu*"))
+ {
+ // "cpu*" also matches e.g. "cpufreq"/"cpuidle"; only numeric "cpuN" count.
+ ReadOnlySpan<char> fileName = Path.GetFileName(cpuPath);
+ if (!fileName.StartsWith("cpu", StringComparison.Ordinal))
+ {
+ continue;
+ }
+
+ if (int.TryParse(fileName.Slice(3), out int cpuIndex) && cpuIndex >= 0)
+ {
+ cpuDirectories.Add(cpuIndex);
+ }
+ }
+
+ if (cpuDirectories.Count == 0)
+ {
+ return false;
+ }
+
+ cpuDirectories.Sort();
+ var coreGroups = new Dictionary<(int PackageId, int CoreId), List<int>>();
+ foreach (int cpuIndex in cpuDirectories)
+ {
+ if (cpuIndex >= affinityBitCount)
+ {
+ // IntPtr affinity mask cannot represent CPUs above native pointer width.
+ continue;
+ }
+
+ nint cpuBit = (nint)(1UL << cpuIndex);
+ if ((((nint)affinityMask) & cpuBit) == 0)
+ {
+ continue;
+ }
+
+ if (!TryReadTopologyId(cpuIndex, "physical_package_id", out int packageId) ||
+ !TryReadTopologyId(cpuIndex, "core_id", out int coreId))
+ {
+ continue;
+ }
+
+ var key = (packageId, coreId);
+ if (!coreGroups.TryGetValue(key, out List<int>? logicalCpus))
+ {
+ logicalCpus = new List<int>();
+ coreGroups.Add(key, logicalCpus);
+ }
+
+ logicalCpus.Add(cpuIndex);
+ }
+
+ if (coreGroups.Count == 0)
+ {
+ return false;
+ }
+
+ // The lowest logical CPU in each group is its representative (pin target).
+ var orderedGroups = new List<PhysicalCoreGroup>(coreGroups.Count);
+ foreach (KeyValuePair<(int PackageId, int CoreId), List<int>> entry in coreGroups)
+ {
+ List<int> logicalCpus = entry.Value;
+ logicalCpus.Sort();
+ orderedGroups.Add(new PhysicalCoreGroup(logicalCpus[0], logicalCpus.ToArray()));
+ }
+
+ orderedGroups.Sort(static (a, b) => a.RepresentativeCpu.CompareTo(b.RepresentativeCpu));
+ groups = orderedGroups.ToArray();
+ return true;
+ }
+
+ /// <summary>
+ /// Reads one integer topology attribute (e.g. "core_id") for a logical CPU from
+ /// sysfs. Returns false (value 0) when the file is missing, unreadable, or not
+ /// a parsable integer; any I/O exception is treated as "not available".
+ /// </summary>
+ private static bool TryReadTopologyId(int cpuIndex, string fileName, out int value)
+ {
+ string path = $"/sys/devices/system/cpu/cpu{cpuIndex}/topology/{fileName}";
+ value = 0;
+ try
+ {
+ if (!File.Exists(path))
+ {
+ return false;
+ }
+
+ string raw = File.ReadAllText(path).Trim();
+ return int.TryParse(raw, out value);
+ }
+ catch
+ {
+ // Best-effort probe: sysfs may disappear or deny access; report unavailable.
+ return false;
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs
new file mode 100644
index 00000000000000..fc14cd2b76fc64
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs
@@ -0,0 +1,97 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
        /// <summary>Resets the native diagnostics poll countdown to its full interval.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private void InitializeLinuxIoUringDiagnosticsState() =>
            _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval;
+
        /// <summary>
        /// Periodically polls native counters and publishes deltas to telemetry.
        /// When <paramref name="force"/> is true (teardown path) the poll runs immediately;
        /// otherwise it runs only once the per-call countdown reaches zero.
        /// </summary>
        private void PollIoUringDiagnosticsIfNeeded(bool force)
        {
            if (!_ioUringCapabilities.IsIoUringPort)
            {
                return;
            }

            if (!force)
            {
                // NOTE(review): countdown uses plain (non-interlocked) reads/writes — this
                // assumes single-threaded (event-loop) access; confirm against callers.
                int countdown = _ioUringDiagnosticsPollCountdown - 1;
                _ioUringDiagnosticsPollCountdown = countdown;
                if (countdown > 0)
                {
                    return;
                }
            }

            _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval;
            PublishIoUringManagedDiagnosticsDelta();

            if (!force)
            {
                // Buffer-ring resize heuristics are skipped on forced (teardown) polls.
                EvaluateProvidedBufferRingResize();
            }
        }
+
+ /// Returns the non-negative delta between two counter snapshots.
+ private static long ComputeManagedCounterDelta(long previous, long current) =>
+ current >= previous ? current - previous : current;
+
+ /// Publishes a managed counter delta from source to published baseline.
+ private static bool TryPublishManagedCounterDelta(
+ ref long sourceCounter,
+ ref long publishedCounter,
+ out long delta,
+ bool monotonic = true)
+ {
+ long current = Interlocked.Read(ref sourceCounter);
+ long previous = Interlocked.Exchange(ref publishedCounter, current);
+ delta = monotonic ? ComputeManagedCounterDelta(previous, current) : current - previous;
+ return delta != 0;
+ }
+
        /// <summary>
        /// Publishes all managed diagnostic counter deltas to telemetry. Each counter pair
        /// is (live source, last-published baseline); only non-zero deltas are emitted.
        /// </summary>
        private void PublishIoUringManagedDiagnosticsDelta()
        {
            if (TryPublishManagedCounterDelta(
                ref _ioUringNonPinnablePrepareFallbackCount,
                ref _ioUringPublishedNonPinnablePrepareFallbackCount,
                out long nonPinnableFallbackDelta))
            {
                SocketsTelemetry.Log.IoUringPrepareNonPinnableFallback(nonPinnableFallbackDelta);
            }

            if (TryPublishManagedCounterDelta(
                ref _ioUringPrepareQueueOverflowCount,
                ref _ioUringPublishedPrepareQueueOverflowCount,
                out long prepareQueueOverflowDelta))
            {
                SocketsTelemetry.Log.IoUringPrepareQueueOverflow(prepareQueueOverflowDelta);
            }

            if (TryPublishManagedCounterDelta(
                ref _ioUringPrepareQueueOverflowFallbackCount,
                ref _ioUringPublishedPrepareQueueOverflowFallbackCount,
                out long prepareQueueOverflowFallbackDelta))
            {
                SocketsTelemetry.Log.IoUringPrepareQueueOverflowFallback(prepareQueueOverflowFallbackDelta);
            }

            if (TryPublishManagedCounterDelta(
                ref _ioUringCompletionSlotExhaustionCount,
                ref _ioUringPublishedCompletionSlotExhaustionCount,
                out long completionSlotExhaustionDelta))
            {
                SocketsTelemetry.Log.IoUringCompletionSlotExhaustion(completionSlotExhaustionDelta);
            }
        }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs
new file mode 100644
index 00000000000000..5df874a0329bf3
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs
@@ -0,0 +1,362 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
        /// <summary>
        /// Maps the SQ ring, CQ ring, and SQE array into managed address space and derives
        /// all ring pointers from the kernel-reported offsets. On failure, unmaps any
        /// partially-mapped regions and closes the ring fd.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private unsafe bool TryMmapRings(ref IoUringSetupResult setup)
        {
            // True when [offset, offset + size) lies entirely inside a mapping of
            // mappedSize bytes; written this way to avoid overflow in offset + size.
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            static bool IsOffsetInRange(ulong offset, ulong size, ulong mappedSize) =>
                offset <= mappedSize && size <= mappedSize - offset;

            ref Interop.Sys.IoUringParams p = ref setup.Params;
            bool usesNoSqArray = (setup.NegotiatedFlags & IoUringConstants.SetupNoSqArray) != 0;
            bool usesSqe128 = (setup.NegotiatedFlags & IoUringConstants.SetupSqe128) != 0;
            uint negotiatedSqeSize = usesSqe128 ? 128u : (uint)sizeof(IoUringSqe);
            if (negotiatedSqeSize != (uint)sizeof(IoUringSqe))
            {
                // Managed SQE writers currently mirror the 64-byte io_uring_sqe layout.
                Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                return false;
            }

            // Compute ring sizes. The SQ ring region ends at the SQ index array offset;
            // when the array is present (no NO_SQARRAY) it extends the region by one
            // uint per SQ entry.
            ulong sqRingSize = p.SqOff.Array;
            if (!usesNoSqArray)
            {
                sqRingSize += p.SqEntries * (uint)sizeof(uint);
            }
            ulong cqRingSize = p.CqOff.Cqes + p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe);
            ulong sqesSize = p.SqEntries * negotiatedSqeSize;

            // mmap SQ ring (and possibly CQ ring if SINGLE_MMAP).
            bool usesSingleMmap = (p.Features & IoUringConstants.FeatureSingleMmap) != 0;

            byte* sqRingPtr;
            byte* cqRingPtr;

            if (usesSingleMmap)
            {
                // SINGLE_MMAP: one mapping (sized to the larger ring) backs both rings.
                ulong ringSize = sqRingSize > cqRingSize ? sqRingSize : cqRingSize;
                void* ptr;
                Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, ringSize, IoUringConstants.OffSqRing, &ptr);
                if (err != Interop.Error.SUCCESS)
                {
                    Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                    return false;
                }
                sqRingPtr = (byte*)ptr;
                cqRingPtr = (byte*)ptr;
                sqRingSize = ringSize;
                cqRingSize = ringSize;
            }
            else
            {
                void* sqPtr;
                Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqRingSize, IoUringConstants.OffSqRing, &sqPtr);
                if (err != Interop.Error.SUCCESS)
                {
                    Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                    return false;
                }
                sqRingPtr = (byte*)sqPtr;

                void* cqPtr;
                err = Interop.Sys.IoUringShimMmap(setup.RingFd, cqRingSize, IoUringConstants.OffCqRing, &cqPtr);
                if (err != Interop.Error.SUCCESS)
                {
                    Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
                    Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                    return false;
                }
                cqRingPtr = (byte*)cqPtr;
            }

            // Validate every kernel-reported offset against the mapped region sizes
            // before deriving any pointer from it.
            if (!IsOffsetInRange(p.SqOff.Head, sizeof(uint), sqRingSize) ||
                !IsOffsetInRange(p.SqOff.Tail, sizeof(uint), sqRingSize) ||
                !IsOffsetInRange(p.SqOff.RingMask, sizeof(uint), sqRingSize) ||
                !IsOffsetInRange(p.SqOff.RingEntries, sizeof(uint), sqRingSize) ||
                !IsOffsetInRange(p.SqOff.Flags, sizeof(uint), sqRingSize) ||
                (!usesNoSqArray && !IsOffsetInRange(p.SqOff.Array, p.SqEntries * (uint)sizeof(uint), sqRingSize)) ||
                !IsOffsetInRange(p.CqOff.Head, sizeof(uint), cqRingSize) ||
                !IsOffsetInRange(p.CqOff.Tail, sizeof(uint), cqRingSize) ||
                !IsOffsetInRange(p.CqOff.RingMask, sizeof(uint), cqRingSize) ||
                !IsOffsetInRange(p.CqOff.RingEntries, sizeof(uint), cqRingSize) ||
                !IsOffsetInRange(p.CqOff.Overflow, sizeof(uint), cqRingSize) ||
                !IsOffsetInRange(p.CqOff.Cqes, p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe), cqRingSize))
            {
                if (!usesSingleMmap)
                {
                    Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize);
                }

                Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
                Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                return false;
            }

            // mmap SQE array.
            void* sqePtr;
            {
                Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqesSize, IoUringConstants.OffSqes, &sqePtr);
                if (err != Interop.Error.SUCCESS)
                {
                    if (!usesSingleMmap)
                        Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize);
                    Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
                    Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                    return false;
                }
            }

            // Derive SQ pointers and populate existing _ioUringSqRingInfo for compatibility.
            _ioUringSqRingInfo.SqeBase = (IntPtr)sqePtr;
            _ioUringSqRingInfo.SqTailPtr = (IntPtr)(sqRingPtr + p.SqOff.Tail);
            _ioUringSqRingInfo.SqHeadPtr = (IntPtr)(sqRingPtr + p.SqOff.Head);
            _ioUringSqRingInfo.SqMask = *(uint*)(sqRingPtr + p.SqOff.RingMask);
            _ioUringSqRingInfo.SqEntries = *(uint*)(sqRingPtr + p.SqOff.RingEntries);
            _ioUringSqRingInfo.SqeSize = negotiatedSqeSize;
            _ioUringSqRingInfo.UsesNoSqArray = usesNoSqArray ? (byte)1 : (byte)0;
            _ioUringSqRingInfo.RingFd = setup.RingFd;
            _ioUringSqRingInfo.UsesEnterExtArg = setup.UsesExtArg ? (byte)1 : (byte)0;
            _ringState.SqFlagsPtr = (uint*)(sqRingPtr + p.SqOff.Flags);

            // Initialize SQ array identity mapping if NO_SQARRAY is not active.
            if (!usesNoSqArray)
            {
                uint* sqArray = (uint*)(sqRingPtr + p.SqOff.Array);
                for (uint i = 0; i < p.SqEntries; i++)
                {
                    sqArray[i] = i;
                }
            }

            // Derive CQ pointers.
            _ringState.CqeBase = (Interop.Sys.IoUringCqe*)(cqRingPtr + p.CqOff.Cqes);
            _ringState.CqTailPtr = (uint*)(cqRingPtr + p.CqOff.Tail);
            _ringState.CqHeadPtr = (uint*)(cqRingPtr + p.CqOff.Head);
            _ringState.CqMask = *(uint*)(cqRingPtr + p.CqOff.RingMask);
            _ringState.CqEntries = *(uint*)(cqRingPtr + p.CqOff.RingEntries);
            _ringState.CqOverflowPtr = (uint*)(cqRingPtr + p.CqOff.Overflow);

            // The mask == entries - 1 contract is what makes the bitwise-AND indexing
            // used elsewhere valid; assert it rather than re-checking on hot paths.
            Debug.Assert(
                BitOperations.IsPow2(_ioUringSqRingInfo.SqEntries),
                $"Kernel-reported SQ entries must be power-of-two. sq_entries={_ioUringSqRingInfo.SqEntries}");
            Debug.Assert(
                BitOperations.IsPow2(_ringState.CqEntries),
                $"Kernel-reported CQ entries must be power-of-two. cq_entries={_ringState.CqEntries}");
            Debug.Assert(
                _ioUringSqRingInfo.SqMask == _ioUringSqRingInfo.SqEntries - 1,
                $"Unexpected SQ mask/entries contract: sq_mask={_ioUringSqRingInfo.SqMask}, sq_entries={_ioUringSqRingInfo.SqEntries}");
            Debug.Assert(
                _ringState.CqMask == _ringState.CqEntries - 1,
                $"Unexpected CQ mask/entries contract: cq_mask={_ringState.CqMask}, cq_entries={_ringState.CqEntries}");

            // Seed the overflow baseline from the live ring counter and clear any
            // stale overflow-recovery state from a previous ring.
            _ringState.ObservedCqOverflow = Volatile.Read(ref *_ringState.CqOverflowPtr);
            _cqOverflowRecoveryActive = false;
            _cqOverflowRecoveryBranch = default;

            // Store ring region info for teardown.
            _ringState.SqRingPtr = sqRingPtr;
            _ringState.CqRingPtr = cqRingPtr;
            _ringState.SqRingSize = sqRingSize;
            _ringState.CqRingSize = cqRingSize;
            _ringState.SqesSize = sqesSize;
            _ringState.UsesSingleMmap = usesSingleMmap;
            _ringState.RingFd = setup.RingFd;
            _ringState.UsesExtArg = setup.UsesExtArg;
            _ringState.UsesNoSqArray = usesNoSqArray;
            _ringState.NegotiatedFlags = setup.NegotiatedFlags;
            _managedSqeInvariantsValidated = ValidateManagedSqeInitializationInvariants();
            if (!_managedSqeInvariantsValidated)
            {
                // _ringState is fully populated at this point, so CleanupManagedRings can
                // unmap everything and close the fd.
                CleanupManagedRings();
                return false;
            }

            return true;
        }
+
        /// <summary>
        /// Unmaps the SQ/CQ rings and SQE array, clears all mmap-derived pointers, and
        /// closes the ring fd. Safe to call with partially-initialized ring state.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private unsafe void CleanupManagedRings()
        {
            _ringState.CqDrainEnabled = false;

            // Snapshot everything needed for unmapping before the fields are cleared.
            byte* sqRingPtr = _ringState.SqRingPtr;
            byte* cqRingPtr = _ringState.CqRingPtr;
            ulong sqRingSize = _ringState.SqRingSize;
            ulong cqRingSize = _ringState.CqRingSize;
            ulong sqesSize = _ringState.SqesSize;
            bool usesSingleMmap = _ringState.UsesSingleMmap;
            void* sqeBase = _ioUringSqRingInfo.SqeBase.ToPointer();

            // Clear all mmap-derived pointers before unmapping so any late reads fail safely.
            _ringState.SqFlagsPtr = null;
            _ringState.CqeBase = null;
            _ringState.CqTailPtr = null;
            _ringState.CqHeadPtr = null;
            _ringState.CqOverflowPtr = null;
            _ringState.SqRingPtr = null;
            _ringState.CqRingPtr = null;
            _ringState.SqRingSize = 0;
            _ringState.CqRingSize = 0;
            _ringState.SqesSize = 0;
            _ringState.CqMask = 0;
            _ringState.CqEntries = 0;
            _ringState.CachedCqHead = 0;
            _ringState.ObservedCqOverflow = 0;
            _ioUringSqRingInfo = default;
            _managedSqeInvariantsValidated = false;

            if (sqRingPtr != null)
            {
                // Unmap SQEs first
                if (sqesSize > 0 && sqeBase != null)
                {
                    Interop.Sys.IoUringShimMunmap(sqeBase, sqesSize);
                }
                // Unmap CQ ring (only if separate from SQ ring)
                if (!usesSingleMmap && cqRingPtr != null && cqRingPtr != sqRingPtr)
                {
                    Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize);
                }
                // Unmap SQ ring
                Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
            }
            if (_ringState.RingFd >= 0)
            {
                Interop.Sys.IoUringShimCloseFd(_ringState.RingFd);
                _ringState.RingFd = -1;
            }
        }
+
        /// <summary>
        /// Unmaps rings, closes the ring fd, and releases all completion-slot resources.
        /// The numbered steps are order-sensitive; see the comments below.
        /// </summary>
        partial void LinuxFreeIoUringResources()
        {
            // Managed io_uring teardown: release resources allocated during TryInitializeManagedIoUring.
            // This must run BEFORE the common slot/buffer cleanup below because kernel
            // unregister operations need the ring fd to still be open.
            if (_ioUringInitialized)
            {
                // 0. Unregister/dispose provided buffer ring while the main ring fd is still open.
                FreeIoUringProvidedBufferRing();

                // 1. The registered ring fd is implicitly released when the ring fd is closed.
                //    Just mark it as inactive so no subsequent code attempts to use it.
                _ioUringSqRingInfo.RegisteredRingFd = -1;

                // 2. Close the wakeup eventfd.
                if (_ringState.WakeupEventFd >= 0)
                {
                    Interop.Sys.IoUringShimCloseFd(_ringState.WakeupEventFd);
                    _ringState.WakeupEventFd = -1;
                }

                // 3. Unmap SQ/CQ rings, SQEs and close the ring fd.
                //    Closing the ring fd also terminates any kernel SQPOLL thread for this ring.
                CleanupManagedRings();

                // 4. Disable managed flags to prevent any late operations.
                _ioUringInitialized = false;
                _ringState.CqDrainEnabled = false;
            }

            bool portClosedForTeardown = Volatile.Read(ref _ioUringPortClosedForTeardown) != 0;
            if (!portClosedForTeardown)
            {
                // Forced poll flushes counter deltas accumulated up to this point.
                PollIoUringDiagnosticsIfNeeded(force: true);
            }

            // Second drain intentionally catches any items enqueued after LinuxBeforeFreeNativeResources
            // published teardown but before native port closure became globally visible.
            DrainQueuedIoUringOperationsForTeardown();

            if (_completionSlots is not null)
            {
                DrainTrackedIoUringOperationsForTeardown(portClosedForTeardown);
                Debug.Assert(IsIoUringTrackingEmpty(), $"Leaked tracked io_uring operations: {Volatile.Read(ref _trackedIoUringOperationCount)}");

                // Free any native memory still held by completion slots
                for (int i = 0; i < _completionSlots.Length; i++)
                {
                    ref IoUringCompletionSlot slot = ref _completionSlots[i];
                    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![i];
                    if (slot.IsZeroCopySend && slot.ZeroCopyNotificationPending)
                    {
                        // Ring teardown can drop in-flight NOTIF CQEs; clear pending SEND_ZC state
                        // so teardown cannot leave slots/pin-holds logically waiting forever.
                        slot.ClearZeroCopyState();
                    }

                    ReleaseZeroCopyPinHold(i);
                    if (slot.Kind == IoUringCompletionOperationKind.Message)
                    {
                        FreeMessageStorage(i);
                    }
                    else if (slot.Kind == IoUringCompletionOperationKind.Accept && slotStorage.NativeSocketAddressLengthPtr != null)
                    {
                        *slotStorage.NativeSocketAddressLengthPtr = 0;
                    }

                    // Clear all pointers that alias _completionSlotNativeStorage before freeing it.
                    slotStorage.NativeInlineStorage = null;
                    slotStorage.NativeSocketAddressLengthPtr = null;
                    slotStorage.NativeMsgHdrPtr = IntPtr.Zero;
                    slotStorage.MessageIsReceive = false;
                    slotStorage.NativeIOVectors = null;
                    slotStorage.NativeSocketAddress = null;
                    slotStorage.NativeControlBuffer = null;
                    slotStorage.ReceiveOutputSocketAddress = null;
                    slotStorage.ReceiveOutputControlBuffer = null;
                    slotStorage.ReceiveSocketAddressCapacity = 0;
                    slotStorage.ReceiveControlBufferCapacity = 0;
                }

                // Drop all slot/tracking arrays and reset ring bookkeeping to defaults.
                _completionSlots = null;
                _trackedOperations = null;
                _completionSlotStorage = null;
                _trackedIoUringOperationCount = 0;
                _zeroCopyPinHolds = null;
                _completionSlotFreeListHead = -1;
                _completionSlotsInUse = 0;
                _liveAcceptCompletionSlotCount = 0;

                _ioUringSlotCapacity = 0;
                _cqOverflowRecoveryActive = false;
                _cqOverflowRecoveryBranch = default;
                _ioUringManagedPendingSubmissions = 0;
                _ioUringManagedSqTail = 0;
                _ioUringManagedSqTailLoaded = false;
                _ioUringSqRingInfo = default;
                _ioUringDirectSqeEnabled = false;
                _sqPollEnabled = false;

            }

            // The native slab is freed only after every aliasing pointer above was cleared.
            if (_completionSlotNativeStorage != null)
            {
                NativeMemory.Free(_completionSlotNativeStorage);
                _completionSlotNativeStorage = null;
                _completionSlotNativeStorageStride = 0;
            }

            // Final flush of managed io_uring deltas in case teardown modified counters
            // after the forced diagnostics poll and no further event-loop iteration runs.
            PublishIoUringManagedDiagnosticsDelta();
        }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs
new file mode 100644
index 00000000000000..66e3bf8a67f187
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs
@@ -0,0 +1,468 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.ExceptionServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+using Microsoft.Win32.SafeHandles;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe nuint GetCompletionSlotNativeStorageStride()
+ {
+ nuint iovSize = (nuint)IoUringConstants.MessageInlineIovCount * (nuint)sizeof(Interop.Sys.IOVector);
+ return (nuint)sizeof(NativeMsghdr) +
+ iovSize +
+ (nuint)IoUringConstants.MessageInlineSocketAddressCapacity +
+ (nuint)IoUringConstants.MessageInlineControlBufferCapacity +
+ (nuint)sizeof(int);
+ }
+
        /// <summary>
        /// Carves one slot's region of the native storage slab into its sub-areas and
        /// wires the pointers into <paramref name="slotStorage"/>. The cursor layout here
        /// must exactly mirror the size computation in GetCompletionSlotNativeStorageStride.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void InitializeCompletionSlotNativeStorage(
            ref IoUringCompletionSlotStorage slotStorage,
            byte* slotStorageBase)
        {
            // The msghdr sits at the start of the slot's region.
            slotStorage.NativeInlineStorage = slotStorageBase;
            slotStorage.NativeMsgHdrPtr = (IntPtr)slotStorageBase;

            byte* cursor = slotStorageBase + sizeof(NativeMsghdr);
            slotStorage.NativeIOVectors = (Interop.Sys.IOVector*)cursor;
            cursor += IoUringConstants.MessageInlineIovCount * sizeof(Interop.Sys.IOVector);
            slotStorage.NativeSocketAddress = cursor;
            cursor += IoUringConstants.MessageInlineSocketAddressCapacity;
            slotStorage.NativeControlBuffer = cursor;
            cursor += IoUringConstants.MessageInlineControlBufferCapacity;
            slotStorage.NativeSocketAddressLengthPtr = (int*)cursor;

            // Reset per-operation bookkeeping to a clean state.
            slotStorage.MessageIsReceive = false;
            slotStorage.ReceiveOutputSocketAddress = null;
            slotStorage.ReceiveOutputControlBuffer = null;
            slotStorage.ReceiveSocketAddressCapacity = 0;
            slotStorage.ReceiveControlBufferCapacity = 0;
        }
+
+ /// Allocates SoA completion slot arrays and initializes the free list.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void InitializeCompletionSlotPool(int capacity)
+ {
+ Debug.Assert(
+ (ulong)capacity <= IoUringConstants.SlotIndexMask + 1UL,
+ $"Completion slot capacity {capacity} exceeds encodable slot index range {IoUringConstants.SlotIndexMask + 1UL}.");
+ Debug.Assert(
+ Unsafe.SizeOf() == 24,
+ $"IoUringCompletionSlot size drifted: expected 24, got {Unsafe.SizeOf()}.");
+ _completionSlots = new IoUringCompletionSlot[capacity];
+ _trackedOperations = new IoUringTrackedOperationState[capacity];
+ _completionSlotStorage = new IoUringCompletionSlotStorage[capacity];
+ _zeroCopyPinHolds = new System.Buffers.MemoryHandle[capacity];
+ _completionSlotNativeStorageStride = GetCompletionSlotNativeStorageStride();
+ Debug.Assert(
+ _completionSlotNativeStorageStride <= int.MaxValue,
+ $"Completion slot native storage stride overflow: {_completionSlotNativeStorageStride}.");
+ if (_completionSlotNativeStorageStride > int.MaxValue)
+ {
+ // FailFast-adjacent site: impossible stride overflow indicates corrupted
+ // layout assumptions during engine initialization, so keep the hard failure.
+ ThrowInternalException(Interop.Error.EOVERFLOW);
+ }
+
+ _completionSlotNativeStorage = (byte*)NativeMemory.AllocZeroed((nuint)capacity * _completionSlotNativeStorageStride);
+ // Build free list linking all slots
+ for (int i = 0; i < capacity - 1; i++)
+ {
+ _completionSlots[i].Generation = 1;
+ _completionSlots[i].FreeListNext = i + 1;
+ InitializeCompletionSlotNativeStorage(
+ ref _completionSlotStorage[i],
+ _completionSlotNativeStorage + ((nuint)i * _completionSlotNativeStorageStride));
+ }
+ _completionSlots[capacity - 1].Generation = 1;
+ _completionSlots[capacity - 1].FreeListNext = -1;
+ InitializeCompletionSlotNativeStorage(
+ ref _completionSlotStorage[capacity - 1],
+ _completionSlotNativeStorage + ((nuint)(capacity - 1) * _completionSlotNativeStorageStride));
+ _completionSlotFreeListHead = 0;
+ _completionSlotsInUse = 0;
+ _completionSlotsHighWaterMark = 0;
+ _liveAcceptCompletionSlotCount = 0;
+ _trackedIoUringOperationCount = 0;
+ }
+
        /// <summary>
        /// Updates a slot's operation kind and keeps the live accept-like slot count in
        /// sync when the transition enters or leaves an accept-like kind.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private void SetCompletionSlotKind(ref IoUringCompletionSlot slot, IoUringCompletionOperationKind kind)
        {
            Debug.Assert(IsCurrentThreadEventLoopThread(),
                "SetCompletionSlotKind must run on the event-loop thread.");
            IoUringCompletionOperationKind previousKind = slot.Kind;
            if (previousKind == kind)
            {
                return;
            }

            slot.Kind = kind;
            bool previousIsAcceptLike = previousKind == IoUringCompletionOperationKind.Accept ||
                previousKind == IoUringCompletionOperationKind.ReusePortAccept;
            bool currentIsAcceptLike = kind == IoUringCompletionOperationKind.Accept ||
                kind == IoUringCompletionOperationKind.ReusePortAccept;
            if (previousIsAcceptLike || currentIsAcceptLike)
            {
                int liveAcceptCount = _liveAcceptCompletionSlotCount;
                if (previousIsAcceptLike)
                {
                    liveAcceptCount--;
                }

                if (currentIsAcceptLike)
                {
                    liveAcceptCount++;
                }

                Debug.Assert(liveAcceptCount >= 0);
                // Published with Volatile.Write; only this (event-loop) thread writes it —
                // NOTE(review): confirm which thread(s) read this counter.
                Volatile.Write(ref _liveAcceptCompletionSlotCount, liveAcceptCount);
            }
        }
+
        /// <summary>
        /// Allocates a completion slot from the free list. Returns the slot index,
        /// or -1 if the pool is exhausted (backpressure signal).
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private int AllocateCompletionSlot()
        {
            Debug.Assert(IsCurrentThreadEventLoopThread(),
                "AllocateCompletionSlot must run on the event-loop thread.");
            Debug.Assert(_completionSlots is not null);
            int index = _completionSlotFreeListHead;
            if (index < 0)
                return -1; // Pool exhausted

            ref IoUringCompletionSlot slot = ref _completionSlots![index];
            // Slot state is reset in FreeCompletionSlot; keep allocation to free-list bookkeeping only.
            _completionSlotFreeListHead = slot.FreeListNext;
            slot.FreeListNext = -1;
            int inUse = ++_completionSlotsInUse;
            if (inUse > _completionSlotsHighWaterMark)
            {
                // Telemetry only fires on a new high-water mark, not on every allocation.
                _completionSlotsHighWaterMark = inUse;
                SocketsTelemetry.Log.IoUringCompletionSlotHighWaterMark(inUse);
            }
            return index;
        }
+
        /// <summary>
        /// Returns a completion slot to the free list, incrementing its generation
        /// to invalidate any stale user_data references. The finally block guarantees
        /// the slot is recycled even if releasing the socket handle ref throws.
        /// </summary>
        private unsafe void FreeCompletionSlot(int index)
        {
            Debug.Assert(IsCurrentThreadEventLoopThread(),
                "FreeCompletionSlot must run on the event-loop thread.");
            Debug.Assert(index >= 0 && index < _completionSlots!.Length);

            ReleaseZeroCopyPinHold(index);
            ref IoUringCompletionSlot slot = ref _completionSlots![index];
            ref IoUringTrackedOperationState trackedState = ref _trackedOperations![index];
            ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![index];
            Debug.Assert(
                Volatile.Read(ref trackedState.TrackedOperation) is null,
                "Completion slot should not be freed while a tracked io_uring operation is still attached.");

            // DangerousRelease can throw; capture the exception so cleanup still runs,
            // then rethrow after the slot has been returned to the free list.
            SafeSocketHandle? dangerousRefSocketHandle = slotStorage.DangerousRefSocketHandle;
            ExceptionDispatchInfo? dangerousReleaseException = null;
            try
            {
                if (dangerousRefSocketHandle is not null)
                {
                    slotStorage.DangerousRefSocketHandle = null;
                    dangerousRefSocketHandle.DangerousRelease();
                }
            }
            catch (Exception ex)
            {
                dangerousReleaseException = ExceptionDispatchInfo.Capture(ex);
            }
            finally
            {
                if (slot.UsesFixedRecvBuffer)
                {
                    IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
                    if (providedBufferRing is not null)
                    {
                        providedBufferRing.TryRecycleBufferFromCompletion(slot.FixedRecvBufferId);
                    }
                }

                // Free any native message storage
                if (slot.Kind == IoUringCompletionOperationKind.Message)
                {
                    FreeMessageStorage(index);
                }
                else if (slot.Kind == IoUringCompletionOperationKind.Accept)
                {
                    if (slotStorage.NativeSocketAddressLengthPtr != null)
                    {
                        *slotStorage.NativeSocketAddressLengthPtr = 0;
                    }
                }
                else if (slot.Kind == IoUringCompletionOperationKind.ReusePortAccept)
                {
                    slotStorage.ReusePortPrimaryContext = null;
                    slotStorage.ReusePortPrimaryEngine = null;
                }

                // Bump the generation (wrapping within GenerationMask, skipping 0) so any
                // stale CQE carrying the old user_data no longer matches this slot.
                slot.Generation = (slot.Generation + 1UL) & IoUringConstants.GenerationMask;
                if (slot.Generation == 0)
                {
                    slot.Generation = 1;
                }
                SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.None);
                ResetDebugTestForcedResult(ref slot);
                slot.ClearZeroCopyState();
                slot.UsesFixedRecvBuffer = false;
                slot.FixedRecvBufferId = 0;
                Volatile.Write(ref trackedState.TrackedOperation, null);
                trackedState.TrackedOperationGeneration = 0;
                slot.FreeListNext = _completionSlotFreeListHead;
                _completionSlotFreeListHead = index;
                _completionSlotsInUse--;
            }

            dangerousReleaseException?.Throw();
        }
+
+ /// Disposes a retained zero-copy pin-hold for the specified completion slot.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ReleaseZeroCopyPinHold(int slotIndex)
+ {
+ System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds;
+ if (pinHolds is null || (uint)slotIndex >= (uint)pinHolds.Length)
+ {
+ return;
+ }
+
+ pinHolds[slotIndex].Dispose();
+ pinHolds[slotIndex] = default;
+ }
+
        /// <summary>
        /// Transfers operation-owned pin state into the engine's zero-copy pin-hold table.
        /// On any validation failure the incoming pin-hold is disposed so the buffer pin
        /// is never leaked.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal void TransferIoUringZeroCopyPinHold(ulong userData, System.Buffers.MemoryHandle pinHold)
        {
            System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds;
            if (pinHolds is null)
            {
                pinHold.Dispose();
                Debug.Fail("Zero-copy pin-hold table is unavailable while transferring pin ownership.");
                return;
            }

            int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
            if ((uint)slotIndex >= (uint)pinHolds.Length)
            {
                pinHold.Dispose();
                Debug.Fail($"Invalid completion slot index while transferring zero-copy pin hold: {slotIndex}.");
                return;
            }

            Debug.Assert(_completionSlots is not null);
            ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
            if (!slot.IsZeroCopySend)
            {
                pinHold.Dispose();
                Debug.Fail("Zero-copy pin hold transfer requested for a non-zero-copy completion slot.");
                return;
            }

            // Dispose any previously-held pin for this slot before taking ownership of the new one.
            pinHolds[slotIndex].Dispose();
            pinHolds[slotIndex] = pinHold;
        }
+
+ ///
+ /// Prepares pre-allocated per-slot native message storage for sendmsg/recvmsg.
+ /// Returns false when header shape exceeds inline capacities so callers can fall back.
+ ///
+ private unsafe bool TryPrepareInlineMessageStorage(int slotIndex, Interop.Sys.MessageHeader* messageHeader, bool isReceive)
+ {
+ Debug.Assert(sizeof(NativeMsghdr) == NativeMsghdr.ExpectedSize, $"NativeMsghdr size mismatch with kernel struct msghdr: expected {NativeMsghdr.ExpectedSize}, got {sizeof(NativeMsghdr)}");
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+
+ int iovCount = messageHeader->IOVectorCount;
+ int sockAddrLen = messageHeader->SocketAddressLen;
+ int controlBufLen = messageHeader->ControlBufferLen;
+ Debug.Assert(iovCount >= 0, $"Expected non-negative iovCount, got {iovCount}");
+ Debug.Assert(sockAddrLen >= 0, $"Expected non-negative socket address length, got {sockAddrLen}");
+ Debug.Assert(controlBufLen >= 0, $"Expected non-negative control buffer length, got {controlBufLen}");
+
+ if ((uint)iovCount > IoUringConstants.MessageInlineIovCount ||
+ (uint)sockAddrLen > IoUringConstants.MessageInlineSocketAddressCapacity ||
+ (uint)controlBufLen > IoUringConstants.MessageInlineControlBufferCapacity)
+ {
+ return false;
+ }
+
+ if (slotStorage.NativeInlineStorage == null)
+ {
+ return false;
+ }
+
+ if ((iovCount > 0 && messageHeader->IOVectors == null) ||
+ (sockAddrLen > 0 && messageHeader->SocketAddress == null) ||
+ (controlBufLen > 0 && messageHeader->ControlBuffer == null))
+ {
+ return false;
+ }
+
+ // Most of the inline slab is overwritten immediately; clear only msghdr header state.
+ new Span(slotStorage.NativeMsgHdrPtr.ToPointer(), sizeof(NativeMsghdr)).Clear();
+
+ NativeMsghdr* hdr = (NativeMsghdr*)slotStorage.NativeMsgHdrPtr;
+ Interop.Sys.IOVector* iovDst = slotStorage.NativeIOVectors;
+ byte* sockAddrDst = slotStorage.NativeSocketAddress;
+ byte* controlBufDst = slotStorage.NativeControlBuffer;
+
+ if (iovCount > 0)
+ {
+ nuint iovBytes = (nuint)iovCount * (nuint)sizeof(Interop.Sys.IOVector);
+ Buffer.MemoryCopy(
+ messageHeader->IOVectors,
+ iovDst,
+ (nuint)IoUringConstants.MessageInlineIovCount * (nuint)sizeof(Interop.Sys.IOVector),
+ iovBytes);
+ }
+
+ if (!isReceive)
+ {
+ if (sockAddrLen > 0)
+ {
+ Buffer.MemoryCopy(
+ messageHeader->SocketAddress,
+ sockAddrDst,
+ (nuint)IoUringConstants.MessageInlineSocketAddressCapacity,
+ (nuint)sockAddrLen);
+ }
+
+ if (controlBufLen > 0)
+ {
+ Buffer.MemoryCopy(
+ messageHeader->ControlBuffer,
+ controlBufDst,
+ (nuint)IoUringConstants.MessageInlineControlBufferCapacity,
+ (nuint)controlBufLen);
+ }
+ }
+
+ hdr->MsgName = sockAddrLen > 0 ? sockAddrDst : null;
+ hdr->MsgNameLen = (uint)sockAddrLen;
+ hdr->MsgIov = iovCount > 0 ? iovDst : null;
+ hdr->MsgIovLen = (nuint)iovCount;
+ hdr->MsgControl = controlBufLen > 0 ? controlBufDst : null;
+ hdr->MsgControlLen = (nuint)controlBufLen;
+ hdr->MsgFlags = 0;
+
+ if (isReceive)
+ {
+ slotStorage.ReceiveOutputSocketAddress = messageHeader->SocketAddress;
+ slotStorage.ReceiveOutputControlBuffer = messageHeader->ControlBuffer;
+ slotStorage.ReceiveSocketAddressCapacity = sockAddrLen;
+ slotStorage.ReceiveControlBufferCapacity = controlBufLen;
+ }
+ else
+ {
+ slotStorage.ReceiveOutputSocketAddress = null;
+ slotStorage.ReceiveOutputControlBuffer = null;
+ slotStorage.ReceiveSocketAddressCapacity = 0;
+ slotStorage.ReceiveControlBufferCapacity = 0;
+ }
+
+ slotStorage.MessageIsReceive = isReceive;
+ return true;
+ }
+
+ ///
+ /// Resets inline message metadata on the completion slot.
+ ///
+ private unsafe void FreeMessageStorage(int slotIndex)
+ {
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+ // Slot inline storage is cleared on prepare before each reuse; avoid a second full memset on free.
+
+ slotStorage.ReceiveOutputSocketAddress = null;
+ slotStorage.ReceiveOutputControlBuffer = null;
+ slotStorage.ReceiveSocketAddressCapacity = 0;
+ slotStorage.ReceiveControlBufferCapacity = 0;
+ slotStorage.MessageIsReceive = false;
+ }
+
        /// <summary>
        /// After a recvmsg CQE completes, copies the kernel-written socket address and
        /// control buffer data from the native msghdr back to the managed MessageHeader's
        /// output buffers. For sendmsg completions this is a no-op.
        /// Returns the actual socket address length, control buffer length, and msg_flags written by the kernel.
        /// </summary>
        private unsafe void CopyMessageCompletionOutputs(
            int slotIndex,
            out int socketAddressLen,
            out int controlBufferLen,
            out uint messageFlags)
        {
            ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
            socketAddressLen = 0;
            controlBufferLen = 0;
            messageFlags = 0;

            if (!slotStorage.MessageIsReceive)
                return;

            NativeMsghdr* hdr = (NativeMsghdr*)slotStorage.NativeMsgHdrPtr;
            if (hdr == null)
                return;

            socketAddressLen = (int)hdr->MsgNameLen;
            controlBufferLen = (int)hdr->MsgControlLen;
            messageFlags = (uint)hdr->MsgFlags;

            // Copy socket address from native buffer back to managed output buffer.
            // copyLen is clamped to the managed buffer's capacity so a kernel-reported
            // length larger than the capacity cannot overrun the destination.
            if (slotStorage.ReceiveOutputSocketAddress != null && slotStorage.NativeSocketAddress != null &&
                slotStorage.ReceiveSocketAddressCapacity > 0 && socketAddressLen > 0)
            {
                int copyLen = Math.Min(slotStorage.ReceiveSocketAddressCapacity, socketAddressLen);
                Buffer.MemoryCopy(slotStorage.NativeSocketAddress, slotStorage.ReceiveOutputSocketAddress, copyLen, copyLen);
            }

            // Copy control buffer from native buffer back to managed output buffer,
            // clamped the same way.
            if (slotStorage.ReceiveOutputControlBuffer != null && slotStorage.NativeControlBuffer != null &&
                slotStorage.ReceiveControlBufferCapacity > 0 && controlBufferLen > 0)
            {
                int copyLen = Math.Min(slotStorage.ReceiveControlBufferCapacity, controlBufferLen);
                Buffer.MemoryCopy(slotStorage.NativeControlBuffer, slotStorage.ReceiveOutputControlBuffer, copyLen, copyLen);
            }
        }
+
+ ///
+ /// Decodes a completion slot index from a user_data payload value.
+ /// The slot index is encoded in the lower bits of the payload.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static int DecodeCompletionSlotIndex(ulong payload)
+ {
+ return (int)(payload & IoUringConstants.SlotIndexMask);
+ }
+
+ ///
+ /// Encodes a completion slot index and generation into a user_data value
+ /// with the ReservedCompletion tag.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ulong EncodeCompletionSlotUserData(int slotIndex, ulong generation)
+ {
+ ulong payload = ((ulong)(generation & IoUringConstants.GenerationMask) << IoUringConstants.SlotIndexBits) | ((ulong)slotIndex & IoUringConstants.SlotIndexMask);
+ return EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs
new file mode 100644
index 00000000000000..27649e2b22b606
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs
@@ -0,0 +1,193 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ /// Converts SocketFlags to the kernel msg_flags representation for io_uring.
+ private static bool TryConvertIoUringPrepareSocketFlags(SocketFlags flags, out uint rwFlags)
+ {
+ const SocketFlags SupportedIoUringFlags =
+ SocketFlags.OutOfBand |
+ SocketFlags.Peek |
+ SocketFlags.DontRoute;
+
+ if ((flags & ~SupportedIoUringFlags) != 0)
+ {
+ rwFlags = 0;
+ return false;
+ }
+
+ rwFlags = (uint)(int)flags;
+ return true;
+ }
+
+ /// Writes a send/recv-like SQE (send, send_zc, recv) with a user-supplied buffer.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void WriteSendLikeSqe(
+ IoUringSqe* sqe,
+ byte opcode,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* buffer,
+ uint length,
+ uint rwFlags)
+ {
+ sqe->Opcode = opcode;
+ sqe->Flags = sqeFlags;
+ sqe->Ioprio = 0; // Not used by send/recv opcodes.
+ sqe->Fd = sqeFd;
+ sqe->Off = 0; // Not used by send/recv opcodes.
+ sqe->Addr = (ulong)(nuint)buffer;
+ sqe->Len = length;
+ sqe->RwFlags = rwFlags;
+ sqe->UserData = userData;
+ // BufIndex, Personality, SpliceFdIn, Addr3: zeroed by TryGetNextManagedSqe.
+ }
+
+ /// Writes a read-fixed SQE for registered-buffer receive.
+ private static unsafe void WriteReadFixedSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* buffer,
+ uint length,
+ ushort bufferIndex)
+ {
+ sqe->Opcode = IoUringOpcodes.ReadFixed;
+ sqe->Flags = sqeFlags;
+ sqe->Ioprio = 0; // Not used by READ_FIXED.
+ sqe->Fd = sqeFd;
+ sqe->Addr = (ulong)(nuint)buffer;
+ sqe->Len = length;
+ sqe->RwFlags = 0; // No special read flags.
+ // For non-seekable sockets, offset is ignored; -1 matches "current position" semantics.
+ sqe->Off = ulong.MaxValue;
+ sqe->BufIndex = bufferIndex;
+ sqe->UserData = userData;
+ // Personality, SpliceFdIn, Addr3: zeroed by TryGetNextManagedSqe.
+ }
+
+ /// <summary>
+ /// Writes a recv SQE using provided-buffer selection (one-shot or multishot).
+ /// The kernel chooses a buffer from the specified buffer group.
+ /// For multishot, set <paramref name="ioprio"/> to <see cref="IoUringConstants.RecvMultishot"/>.
+ /// </summary>
+ private static void WriteProvidedBufferRecvSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ uint requestedLength,
+ uint rwFlags,
+ ushort bufferGroupId,
+ ushort ioprio = 0)
+ {
+ sqe->Opcode = IoUringOpcodes.Recv;
+ sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect);
+ sqe->Fd = sqeFd;
+ sqe->Ioprio = ioprio;
+ sqe->Off = 0; // Not used by provided-buffer recv.
+ sqe->Addr = 0; // No user buffer; kernel selects from buffer group.
+ sqe->Len = requestedLength;
+ sqe->RwFlags = rwFlags;
+ sqe->BufIndex = bufferGroupId;
+ sqe->UserData = userData;
+ // Personality, SpliceFdIn, Addr3: zeroed by TryGetNextManagedSqe.
+ }
+
+ /// Writes an accept SQE (one-shot or multishot) to the submission ring entry.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void WriteAcceptSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* socketAddress,
+ IntPtr socketAddressLengthPtr,
+ bool multishot = false)
+ {
+ sqe->Opcode = IoUringOpcodes.Accept;
+ sqe->Flags = sqeFlags;
+ sqe->Fd = sqeFd;
+ // Explicit write for defensive clarity; multishot and one-shot accept must not
+ // inherit stale ioprio bits from previous SQE occupants.
+ sqe->Ioprio = multishot ? IoUringConstants.AcceptMultishot : (ushort)0;
+ sqe->Addr = (ulong)(nuint)socketAddress;
+ // Kernel accept prep aliases addr2 at sqe->off.
+ sqe->Off = (ulong)(nuint)socketAddressLengthPtr;
+ sqe->Len = 0; // Not used by accept.
+ sqe->RwFlags = IoUringConstants.AcceptFlags;
+ sqe->UserData = userData;
+ // BufIndex, Personality, SpliceFdIn, Addr3: zeroed by TryGetNextManagedSqe.
+ }
+
+ /// Writes a sendmsg/sendmsg_zc/recvmsg SQE to the submission ring entry.
+ private static void WriteSendMsgLikeSqe(
+ IoUringSqe* sqe,
+ byte opcode,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ IntPtr messageHeader,
+ uint rwFlags)
+ {
+ sqe->Opcode = opcode;
+ sqe->Flags = sqeFlags;
+ sqe->Ioprio = 0; // Not used by sendmsg/recvmsg.
+ sqe->Fd = sqeFd;
+ sqe->Off = 0; // Not used by sendmsg/recvmsg.
+ sqe->Addr = (ulong)(nuint)messageHeader;
+ sqe->Len = 1;
+ sqe->RwFlags = rwFlags;
+ sqe->UserData = userData;
+ // BufIndex, Personality, SpliceFdIn, Addr3: zeroed by TryGetNextManagedSqe.
+ }
+
+ /// Writes a connect SQE to the submission ring entry.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void WriteConnectSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* socketAddress,
+ int socketAddressLen)
+ {
+ sqe->Opcode = IoUringOpcodes.Connect;
+ sqe->Flags = sqeFlags;
+ sqe->Ioprio = 0; // Not used by connect.
+ sqe->Fd = sqeFd;
+ sqe->Addr = (ulong)(nuint)socketAddress;
+ // Kernel connect prep aliases addrlen at sqe->off and requires len=0.
+ sqe->Off = (uint)socketAddressLen;
+ sqe->Len = 0; // Kernel requires len=0 for connect.
+ sqe->RwFlags = 0; // No special flags for connect.
+ sqe->UserData = userData;
+ // BufIndex, Personality, SpliceFdIn, Addr3: zeroed by TryGetNextManagedSqe.
+ }
+
+ /// Writes an ASYNC_CANCEL SQE targeting the specified user_data.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void WriteAsyncCancelSqe(IoUringSqe* sqe, ulong userData)
+ {
+ sqe->Opcode = IoUringOpcodes.AsyncCancel;
+ sqe->Flags = 0; // No SQE flags for cancel.
+ sqe->Ioprio = 0; // Not used by ASYNC_CANCEL.
+ sqe->Fd = -1;
+ sqe->Off = 0; // Not used by ASYNC_CANCEL.
+ Debug.Assert((byte)(userData >> IoUringUserDataTagShift) == IoUringConstants.TagReservedCompletion);
+ sqe->Addr = userData;
+ sqe->Len = 0; // Not used by ASYNC_CANCEL.
+ sqe->RwFlags = 0; // Not used by ASYNC_CANCEL.
+ sqe->UserData = 0;
+ // BufIndex, Personality, SpliceFdIn, Addr3: zeroed by TryGetNextManagedSqe.
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs
new file mode 100644
index 00000000000000..2b3e4e52b5cb39
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs
@@ -0,0 +1,15 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ private bool TryConsumeDebugForcedSubmitError(out Interop.Error forcedError)
+ {
+ _ = _ioUringInitialized;
+ forcedError = Interop.Error.SUCCESS;
+ return false;
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs
new file mode 100644
index 00000000000000..86a830a17efbd4
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs
@@ -0,0 +1,4545 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ /// <summary>Linux socket engine coordinating epoll and io_uring work for process sockets.</summary>
+ /// <remarks>
+ /// Multiple engine instances are created at startup (one per physical core by default),
+ /// each with its own io_uring ring, completion slots, and pinned event-loop thread.
+ /// Sockets are routed to engines via s_fdEngineAffinity or round-robin fallback.
+ /// Server sockets use SO_REUSEPORT shadow listeners so the kernel distributes accepts
+ /// across all engine event loops.
+ /// </remarks>
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ /// Indicates which io_uring dispatch mode is active for this engine instance.
+ private enum IoUringMode : byte
+ {
+ Disabled = 0,
+ Completion = 1
+ }
+
+ /// Distinguishes cancellation requests issued during normal runtime from those during engine teardown.
+ private enum IoUringCancellationOrigin : byte
+ {
+ Runtime = 0,
+ Teardown = 1
+ }
+
+ /// Identifies which CQ-overflow recovery branch is active for logging/telemetry correlation.
+ private enum IoUringCqOverflowRecoveryBranch : byte
+ {
+ MultishotAcceptArming = 0,
+ Teardown = 1,
+ // Steady-state branch: normal runtime overflow recovery outside teardown/accept-arm handoff.
+ DualWave = 2
+ }
+
+ /// Tracks the lifecycle of an io_uring operation for debug assertions on valid state transitions.
+ private enum IoUringOperationLifecycleState : byte
+ {
+ Queued = 0,
+ Prepared = 1,
+ Submitted = 2,
+ Completed = 3,
+ Canceled = 4,
+ Detached = 5
+ }
+
+ /// Precomputed default receive strategy derived from immutable io_uring capabilities.
+ private enum IoUringRecvStrategy : byte
+ {
+ FixedRecv = 0,
+ MultishotProvidedBuffer = 1,
+ OneshotProvidedBuffer = 2,
+ PlainUserBuffer = 3
+ }
+
+ /// Result of attempting to remove a tracked operation by user_data.
+ private enum IoUringTrackedOperationRemoveResult : byte
+ {
+ Removed = 0,
+ NotFound = 1,
+ Mismatch = 2
+ }
+
+ private enum IoUringCancellationEnqueueResult : byte
+ {
+ Failed = 0,
+ Enqueued = 1,
+ EnqueuedAndWoke = 2
+ }
+
+ /// Immutable snapshot of negotiated io_uring capabilities for this engine instance.
+ private readonly struct LinuxIoUringCapabilities
+ {
+ private const uint FlagIsIoUringPort = 1u << 0;
+ private const uint FlagSupportsMultishotRecv = 1u << 1;
+ private const uint FlagSupportsMultishotAccept = 1u << 2;
+ private const uint FlagSupportsZeroCopySend = 1u << 3;
+ private const uint FlagSqPollEnabled = 1u << 4;
+ private const uint FlagSupportsProvidedBufferRings = 1u << 5;
+ private const uint FlagHasRegisteredBuffers = 1u << 6;
+
+ private readonly uint _flags;
+
+ /// The active io_uring dispatch mode.
+ internal IoUringMode Mode { get; }
+
+ /// Whether the engine's port was created as an io_uring instance.
+ internal bool IsIoUringPort => (_flags & FlagIsIoUringPort) != 0;
+ /// Whether multishot recv can be used by this engine instance.
+ internal bool SupportsMultishotRecv => (_flags & FlagSupportsMultishotRecv) != 0;
+ /// Whether multishot accept can be used by this engine instance.
+ internal bool SupportsMultishotAccept => (_flags & FlagSupportsMultishotAccept) != 0;
+ /// Whether zero-copy send is enabled for this engine instance.
+ internal bool SupportsZeroCopySend => (_flags & FlagSupportsZeroCopySend) != 0;
+ /// Whether SQPOLL mode is enabled for this engine instance.
+ internal bool SqPollEnabled => (_flags & FlagSqPollEnabled) != 0;
+ /// Whether provided-buffer rings are active for this engine instance.
+ internal bool SupportsProvidedBufferRings => (_flags & FlagSupportsProvidedBufferRings) != 0;
+ /// Whether provided buffers are currently registered with the kernel.
+ internal bool HasRegisteredBuffers => (_flags & FlagHasRegisteredBuffers) != 0;
+
+ /// Whether the engine is operating in full completion mode.
+ internal bool IsCompletionMode =>
+ Mode == IoUringMode.Completion;
+
+ private LinuxIoUringCapabilities(IoUringMode mode, uint flags)
+ {
+ Mode = mode;
+ _flags = flags;
+ }
+
+ internal LinuxIoUringCapabilities WithMode(IoUringMode mode) =>
+ new LinuxIoUringCapabilities(mode, _flags);
+
+ internal LinuxIoUringCapabilities WithIsIoUringPort(bool value) =>
+ WithFlag(FlagIsIoUringPort, value);
+
+ internal LinuxIoUringCapabilities WithSupportsMultishotRecv(bool value) =>
+ WithFlag(FlagSupportsMultishotRecv, value);
+
+ internal LinuxIoUringCapabilities WithSupportsMultishotAccept(bool value) =>
+ WithFlag(FlagSupportsMultishotAccept, value);
+
+ internal LinuxIoUringCapabilities WithSupportsZeroCopySend(bool value) =>
+ WithFlag(FlagSupportsZeroCopySend, value);
+
+ internal LinuxIoUringCapabilities WithSqPollEnabled(bool value) =>
+ WithFlag(FlagSqPollEnabled, value);
+
+ internal LinuxIoUringCapabilities WithSupportsProvidedBufferRings(bool value) =>
+ WithFlag(FlagSupportsProvidedBufferRings, value);
+
+ internal LinuxIoUringCapabilities WithHasRegisteredBuffers(bool value) =>
+ WithFlag(FlagHasRegisteredBuffers, value);
+
+ private LinuxIoUringCapabilities WithFlag(uint flag, bool value)
+ {
+ uint flags = value ? (_flags | flag) : (_flags & ~flag);
+ return new LinuxIoUringCapabilities(Mode, flags);
+ }
+ }
+
+ [Flags]
+ private enum IoUringConfigurationWarningFlags : byte
+ {
+ None = 0,
+ SqPollRequestedWithoutIoUring = 1 << 0,
+ DirectSqeDisabledWithoutIoUring = 1 << 1,
+ ZeroCopyOptInWithoutIoUring = 1 << 2
+ }
+
+ /// Immutable process-wide snapshot of resolved io_uring configuration inputs.
+ private readonly struct IoUringResolvedConfiguration
+ {
+ internal bool IoUringEnabled { get; }
+ internal bool SqPollRequested { get; }
+ internal bool DirectSqeDisabled { get; }
+ internal bool ZeroCopySendOptedIn { get; }
+ internal bool RegisterBuffersEnabled { get; }
+ internal bool AdaptiveProvidedBufferSizingEnabled { get; }
+ internal int ProvidedBufferSize { get; }
+ internal int PrepareQueueCapacity { get; }
+ internal int CancellationQueueCapacity { get; }
+ private readonly IoUringConfigurationWarningFlags _warningFlags;
+
+ internal IoUringResolvedConfiguration(
+ bool ioUringEnabled,
+ bool sqPollRequested,
+ bool directSqeDisabled,
+ bool zeroCopySendOptedIn,
+ bool registerBuffersEnabled,
+ bool adaptiveProvidedBufferSizingEnabled,
+ int providedBufferSize,
+ int prepareQueueCapacity,
+ int cancellationQueueCapacity)
+ {
+ IoUringEnabled = ioUringEnabled;
+ SqPollRequested = sqPollRequested;
+ DirectSqeDisabled = directSqeDisabled;
+ ZeroCopySendOptedIn = zeroCopySendOptedIn;
+ RegisterBuffersEnabled = registerBuffersEnabled;
+ AdaptiveProvidedBufferSizingEnabled = adaptiveProvidedBufferSizingEnabled;
+ ProvidedBufferSize = providedBufferSize;
+ PrepareQueueCapacity = prepareQueueCapacity;
+ CancellationQueueCapacity = cancellationQueueCapacity;
+ _warningFlags = ComputeWarningFlags(
+ ioUringEnabled,
+ sqPollRequested,
+ directSqeDisabled,
+ zeroCopySendOptedIn);
+ }
+
+ internal string ToLogString() =>
+ $"enabled={IoUringEnabled}, sqpollRequested={SqPollRequested}, directSqeDisabled={DirectSqeDisabled}, zeroCopySendOptedIn={ZeroCopySendOptedIn}, registerBuffersEnabled={RegisterBuffersEnabled}, adaptiveProvidedBufferSizingEnabled={AdaptiveProvidedBufferSizingEnabled}, providedBufferSize={ProvidedBufferSize}, prepareQueueCapacity={PrepareQueueCapacity}, cancellationQueueCapacity={CancellationQueueCapacity}";
+
+ internal bool TryGetValidationWarnings([NotNullWhen(true)] out string? warnings)
+ {
+ if (_warningFlags == IoUringConfigurationWarningFlags.None)
+ {
+ warnings = null;
+ return false;
+ }
+
+ warnings = BuildWarningMessage(_warningFlags);
+ return true;
+ }
+
+ private static IoUringConfigurationWarningFlags ComputeWarningFlags(
+ bool ioUringEnabled, bool sqPollRequested, bool directSqeDisabled, bool zeroCopySendOptedIn)
+ {
+ if (ioUringEnabled)
+ {
+ return IoUringConfigurationWarningFlags.None;
+ }
+
+ IoUringConfigurationWarningFlags warnings = IoUringConfigurationWarningFlags.None;
+ if (sqPollRequested) warnings |= IoUringConfigurationWarningFlags.SqPollRequestedWithoutIoUring;
+ if (directSqeDisabled) warnings |= IoUringConfigurationWarningFlags.DirectSqeDisabledWithoutIoUring;
+ if (zeroCopySendOptedIn) warnings |= IoUringConfigurationWarningFlags.ZeroCopyOptInWithoutIoUring;
+ return warnings;
+ }
+
+ private static string BuildWarningMessage(IoUringConfigurationWarningFlags warnings)
+ {
+ var parts = new List<string>(3);
+ if ((warnings & IoUringConfigurationWarningFlags.SqPollRequestedWithoutIoUring) != 0)
+ {
+ parts.Add("SQPOLL requested while io_uring is disabled");
+ }
+
+ if ((warnings & IoUringConfigurationWarningFlags.DirectSqeDisabledWithoutIoUring) != 0)
+ {
+ parts.Add("direct SQE disabled while io_uring is disabled");
+ }
+
+ if ((warnings & IoUringConfigurationWarningFlags.ZeroCopyOptInWithoutIoUring) != 0)
+ {
+ parts.Add("zero-copy send opted-in while io_uring is disabled");
+ }
+
+ return string.Join("; ", parts);
+ }
+ }
+
+ /// Mirrors kernel struct io_uring_sqe (64 bytes), written to the SQ ring for submission.
+ [StructLayout(LayoutKind.Explicit, Size = 64)]
+ internal struct IoUringSqe
+ {
+ [FieldOffset(0)]
+ internal byte Opcode;
+ [FieldOffset(1)]
+ internal byte Flags;
+ [FieldOffset(2)]
+ internal ushort Ioprio;
+ [FieldOffset(4)]
+ internal int Fd;
+ [FieldOffset(8)]
+ internal ulong Off;
+ [FieldOffset(16)]
+ internal ulong Addr;
+ [FieldOffset(24)]
+ internal uint Len;
+ [FieldOffset(28)]
+ internal uint RwFlags;
+ [FieldOffset(32)]
+ internal ulong UserData;
+ [FieldOffset(40)]
+ internal ushort BufIndex;
+ [FieldOffset(42)]
+ internal ushort Personality;
+ [FieldOffset(44)]
+ internal int SpliceFdIn;
+ [FieldOffset(48)]
+ internal ulong Addr3;
+ }
+
+ /// Mirrors kernel struct io_uring_probe_op (8 bytes per entry in the probe ops array).
+ [StructLayout(LayoutKind.Explicit, Size = 8)]
+ private struct IoUringProbeOp
+ {
+ [FieldOffset(0)] internal byte Op;
+ [FieldOffset(1)] internal byte Resv;
+ [FieldOffset(2)] internal ushort Flags;
+ // 4 bytes reserved at offset 4
+ }
+
+ /// Mirrors kernel struct io_uring_probe (16-byte header preceding the variable-length ops array).
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ private struct IoUringProbeHeader
+ {
+ [FieldOffset(0)] internal byte LastOp;
+ [FieldOffset(1)] internal byte OpsLen;
+ // 14 bytes reserved at offset 2
+ }
+
+ /// <summary>
+ /// Kernel ABI opcode constants as a static class (not an enum) to avoid byte-cast noise
+ /// at every SQE write site, since the SQE Opcode field is typed as byte.
+ /// </summary>
+ private static class IoUringOpcodes
+ {
+ internal const byte ReadFixed = 4;
+ internal const byte Send = 26;
+ internal const byte Recv = 27;
+ internal const byte SendMsg = 9;
+ internal const byte RecvMsg = 10;
+ internal const byte Accept = 13;
+ internal const byte Connect = 16;
+ internal const byte SendZc = 53;
+ internal const byte SendMsgZc = 54;
+ internal const byte AsyncCancel = 14;
+ internal const byte PollAdd = 6;
+ }
+
+ /// <summary>
+ /// Centralizes io_uring ABI constants that mirror the native definitions in pal_io_uring.c.
+ /// These are used by managed code that directly interacts with the io_uring submission
+ /// and completion rings (e.g., direct SQE writes via mmap'd ring access).
+ /// </summary>
+ private static class IoUringConstants
+ {
+ // Setup flags (io_uring_setup params.flags)
+ internal const uint SetupCqSize = 1u << 3;
+ internal const uint SetupSqPoll = 1u << 5;
+ internal const uint SetupSubmitAll = 1u << 7;
+ internal const uint SetupCoopTaskrun = 1u << 8;
+ internal const uint SetupSqe128 = 1u << 10;
+ internal const uint SetupSingleIssuer = 1u << 12;
+ internal const uint SetupDeferTaskrun = 1u << 13;
+ internal const uint SetupRDisabled = 1u << 6;
+ internal const uint SetupNoSqArray = 1u << 16;
+ internal const uint SetupCloexec = 1u << 19;
+
+ // Feature flags (io_uring_params.features)
+ internal const uint FeatureSingleMmap = 1u << 0;
+ internal const uint FeatureExtArg = 1u << 8;
+
+ // Enter flags (io_uring_enter flags parameter)
+ internal const uint EnterGetevents = 1u << 0;
+ internal const uint EnterSqWakeup = 1u << 1;
+ internal const uint EnterExtArg = 1u << 3;
+ internal const uint EnterRegisteredRing = 1u << 4;
+
+ // SQ ring flags (sq_ring->flags)
+ internal const uint SqNeedWakeup = 1u << 0;
+
+ // Register opcodes
+ internal const uint RegisterEnableRings = 17;
+ internal const uint RegisterBuffers = 0;
+ internal const uint UnregisterBuffers = 1;
+ internal const uint RegisterProbe = 8;
+ internal const uint RegisterRingFds = 20;
+ internal const uint UnregisterRingFds = 21;
+ internal const uint RegisterPbufRing = 22;
+ internal const uint UnregisterPbufRing = 23;
+
+ // Register helper values
+ internal const uint RegisterOffsetAuto = 0xFFFFFFFFU;
+
+ // Probe op flags
+ internal const uint ProbeOpFlagSupported = 1u << 0;
+
+ // Poll flags
+ internal const uint PollAddFlagMulti = 1u << 0;
+ internal const uint PollIn = 0x0001;
+
+ // CQE flags
+ internal const uint CqeFBuffer = 1u << 0; // IORING_CQE_F_BUFFER (buffer id in upper bits)
+ internal const uint CqeFMore = 1u << 1; // IORING_CQE_F_MORE (multishot)
+ internal const uint CqeFSockNonEmpty = 1u << 2; // IORING_CQE_F_SOCK_NONEMPTY (more data pending after recv)
+ internal const uint CqeFNotif = 1u << 3; // IORING_CQE_F_NOTIF (zero-copy notification)
+ internal const int CqeBufferShift = 16; // IORING_CQE_BUFFER_SHIFT
+
+ // Recv ioprio flags
+ internal const ushort RecvMultishot = 1 << 1; // IORING_RECV_MULTISHOT
+ // Accept ioprio flags
+ internal const ushort AcceptMultishot = 1 << 0; // IORING_ACCEPT_MULTISHOT
+
+ // SQE flags
+ internal const byte SqeBufferSelect = 1 << 5; // IOSQE_BUFFER_SELECT
+
+ // Sizing
+ internal const uint QueueEntries = 1024;
+ // Keep CQ capacity at 4x SQ entries to absorb completion bursts during short GC pauses
+ // without immediately tripping overflow recovery on busy rings.
+ internal const uint CqEntriesFactor = 4;
+ internal const uint MaxCqeDrainBatch = 512;
+ internal const int CqePrefetchThreshold = 4;
+ // Bounded wait trades wake latency for starvation resilience:
+ // if an eventfd wake is missed or deferred, the event loop still polls at least once
+ // every 50ms (worst-case deferred wake latency).
+ internal const long BoundedWaitTimeoutNanos = 50L * 1000 * 1000; // 50ms
+ // Circuit-breaker bounded wait used after repeated eventfd wake failures.
+ internal const long WakeFailureFallbackWaitTimeoutNanos = 1L * 1000 * 1000; // 1ms
+
+ // Completion operation pool sizing
+ internal const int CompletionOperationPoolCapacityFactor = 2;
+
+ // mmap offsets (from kernel UAPI: IORING_OFF_SQ_RING, IORING_OFF_CQ_RING, IORING_OFF_SQES)
+ internal const ulong OffSqRing = 0;
+ internal const ulong OffCqRing = 0x8000000;
+ internal const ulong OffSqes = 0x10000000;
+
+ // Minimum kernel version for io_uring engine.
+ // SEND_ZC deferred-completion logic relies on NOTIF CQE sequencing behavior stabilized in Linux 6.1.0.
+ internal const int MinKernelMajor = 6;
+ internal const int MinKernelMinor = 1;
+
+ // Zero-copy send size threshold (payloads below this use regular send).
+ internal const int ZeroCopySendThreshold = 16384; // 16KB
+
+ // User data tag values (encoded in upper bits of user_data)
+ internal const byte TagNone = 0;
+ internal const byte TagReservedCompletion = 2;
+ internal const byte TagWakeupSignal = 3;
+
+ // Accept-time flags for accepted socket descriptors: SOCK_CLOEXEC | SOCK_NONBLOCK.
+ internal const uint AcceptFlags = 0x80800;
+
+ // Message inline capacities (avoid heap allocation on common small payloads)
+ internal const int MessageInlineIovCount = 4;
+ internal const int MessageInlineSocketAddressCapacity = 128; // sizeof(sockaddr_storage)
+ internal const int MessageInlineControlBufferCapacity = 128;
+
+ // Internal discriminator for io_uring vs epoll fallback detection
+ internal const int NotSocketEventPort = int.MinValue + 1;
+
+ // Completion slot encoding
+ // Slot index is encoded into 16 bits of user_data payload => max 65536 slot IDs per engine.
+ internal const int SlotIndexBits = 16;
+ internal const ulong SlotIndexMask = (1UL << SlotIndexBits) - 1UL;
+ internal const int GenerationBits = 56 - SlotIndexBits;
+ // 40-bit generation space gives each slot ~1.1 trillion incarnations before wrap.
+ // Generation zero remains reserved as "uninitialized", so wrap remaps 2^40-1 -> 1.
+ internal const ulong GenerationMask = (1UL << GenerationBits) - 1UL;
+
+ // Test hook opcode masks (mirrors IoUringTestOpcodeMask in pal_io_uring.c)
+ internal const byte TestOpcodeMaskNone = 0;
+ internal const byte TestOpcodeMaskSend = 1 << 0;
+ internal const byte TestOpcodeMaskRecv = 1 << 1;
+ internal const byte TestOpcodeMaskSendMsg = 1 << 2;
+ internal const byte TestOpcodeMaskRecvMsg = 1 << 3;
+ internal const byte TestOpcodeMaskAccept = 1 << 4;
+ internal const byte TestOpcodeMaskConnect = 1 << 5;
+ internal const byte TestOpcodeMaskSendZc = 1 << 6;
+ internal const byte TestOpcodeMaskSendMsgZc = 1 << 7;
+ }
+
+ /// Captures the results of io_uring_setup(2) including ring fd, negotiated params, and feature flags.
+ private struct IoUringSetupResult
+ {
+ internal int RingFd;
+ internal Interop.Sys.IoUringParams Params;
+ internal uint NegotiatedFlags;
+ internal bool UsesExtArg;
+ internal bool SqPollNegotiated;
+ }
+
+ /// Discriminates completion slot metadata shape for operation-specific post-completion processing.
+ private enum IoUringCompletionOperationKind : byte
+ {
+ None = 0,
+ Accept = 1,
+ Message = 2,
+ ReusePortAccept = 3,
+ }
+
+ /// <summary>
+ /// Hot per-slot metadata used on every CQE dispatch.
+ /// Keep this minimal; native pointer-heavy state is kept in <see cref="IoUringCompletionSlotStorage"/>.
+ /// Explicit 24-byte layout keeps generation/free-list state and hot flags in one compact block.
+ /// </summary>
+ [StructLayout(LayoutKind.Explicit, Size = 24)]
+ private struct IoUringCompletionSlot
+ {
+ // 0..7
+ [FieldOffset(0)]
+ public ulong Generation;
+ // 8..11 (-1 = end of free list)
+ [FieldOffset(8)]
+ public int FreeListNext;
+ // 12..15 (operation kind + hot state flags)
+ [FieldOffset(12)]
+ private uint _packedState;
+ // 16..17
+ [FieldOffset(16)]
+ public ushort FixedRecvBufferId;
+#if DEBUG
+ // 20..23 debug-only forced completion result payload.
+ [FieldOffset(20)]
+ public int TestForcedResult;
+#endif
+
+ private const uint KindMask = 0xFFu;
+ private const uint FlagIsZeroCopySend = 1u << 8;
+ private const uint FlagZeroCopyNotificationPending = 1u << 9;
+ private const uint FlagUsesFixedRecvBuffer = 1u << 10;
+#if DEBUG
+ private const uint FlagHasTestForcedResult = 1u << 11;
+#endif
+
+ public IoUringCompletionOperationKind Kind
+ {
+ get => (IoUringCompletionOperationKind)(_packedState & KindMask);
+ set => _packedState = (_packedState & ~KindMask) | ((uint)value & KindMask);
+ }
+
+ public bool IsZeroCopySend
+ {
+ get => (_packedState & FlagIsZeroCopySend) != 0;
+ set => SetFlag(FlagIsZeroCopySend, value);
+ }
+
+ public bool ZeroCopyNotificationPending
+ {
+ get => (_packedState & FlagZeroCopyNotificationPending) != 0;
+ set => SetFlag(FlagZeroCopyNotificationPending, value);
+ }
+
+ public bool UsesFixedRecvBuffer
+ {
+ get => (_packedState & FlagUsesFixedRecvBuffer) != 0;
+ set => SetFlag(FlagUsesFixedRecvBuffer, value);
+ }
+
+#if DEBUG
+ public bool HasTestForcedResult
+ {
+ get => (_packedState & FlagHasTestForcedResult) != 0;
+ set => SetFlag(FlagHasTestForcedResult, value);
+ }
+#endif
+
+ private void SetFlag(uint mask, bool value)
+ {
+ if (value)
+ {
+ _packedState |= mask;
+ }
+ else
+ {
+ _packedState &= ~mask;
+ }
+ }
+
+ /// Clears both zero-copy flags (single bitmask operation).
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public void ClearZeroCopyState() =>
+ _packedState &= ~(FlagIsZeroCopySend | FlagZeroCopyNotificationPending);
+
+ /// Arms the slot for a SEND_ZC operation: sets IsZeroCopySend, clears NotificationPending.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public void ArmZeroCopySend() =>
+ _packedState = (_packedState | FlagIsZeroCopySend) & ~FlagZeroCopyNotificationPending;
+ }
+
+ /// <summary>
+ /// Hot tracked-operation ownership state used on completion and cancellation paths.
+ /// Kept separate from native slot storage to improve cache locality in CQE dispatch.
+ /// </summary>
+ private struct IoUringTrackedOperationState
+ {
+ public SocketAsyncContext.AsyncOperation? TrackedOperation;
+ public ulong TrackedOperationGeneration;
+ }
+
+ ///
+ /// <summary>
+ /// Cold per-slot native metadata: pointers and message writeback state needed only for
+ /// operation-specific completion processing.
+ /// </summary>
+ {
+ // Hold a DangerousAddRef lease for the socket fd until this slot is fully retired.
+ public SafeSocketHandle? DangerousRefSocketHandle;
+ // Per-slot pre-allocated native slab backing accept socklen_t and message inline storage.
+ public unsafe byte* NativeInlineStorage;
+ // Accept metadata
+ public unsafe int* NativeSocketAddressLengthPtr; // socklen_t* in NativeInlineStorage
+ // Message metadata (pointers to native-alloc'd msghdr/iovec)
+ public IntPtr NativeMsgHdrPtr;
+ public bool MessageIsReceive;
+ // Message metadata - deep-copied native msghdr constituents (point into NativeInlineStorage).
+ public unsafe Interop.Sys.IOVector* NativeIOVectors;
+ public unsafe byte* NativeSocketAddress;
+ public unsafe byte* NativeControlBuffer;
+ // RecvMsg output capture - pointers back to managed MessageHeader buffers for writeback
+ public unsafe byte* ReceiveOutputSocketAddress;
+ public unsafe byte* ReceiveOutputControlBuffer;
+ public int ReceiveSocketAddressCapacity;
+ public int ReceiveControlBufferCapacity;
+ // ReusePortAccept metadata - cross-engine references for shadow listener accept forwarding
+ public SocketAsyncContext? ReusePortPrimaryContext;
+ public SocketAsyncEngine? ReusePortPrimaryEngine;
+ }
+
+ /// <summary>
+ /// Mirrors the kernel's struct msghdr layout for direct SQE submission.
+ /// Used by the engine's message-header marshaling code to build a native msghdr that
+ /// io_uring sendmsg/recvmsg opcodes can consume directly.
+ /// Must only be used on 64-bit Linux where sizeof(msghdr) == 56.
+ /// </summary>
+ [StructLayout(LayoutKind.Explicit)]
+ private unsafe struct NativeMsghdr
+ {
+ /// Expected size of the kernel's struct msghdr on 64-bit Linux.
+ public const int ExpectedSize = 56;
+
+ [FieldOffset(0)]
+ public void* MsgName;
+ [FieldOffset(8)]
+ public uint MsgNameLen;
+ [FieldOffset(16)]
+ public Interop.Sys.IOVector* MsgIov;
+ [FieldOffset(24)]
+ public nuint MsgIovLen;
+ [FieldOffset(32)]
+ public void* MsgControl;
+ [FieldOffset(40)]
+ public nuint MsgControlLen;
+ [FieldOffset(48)]
+ public int MsgFlags;
+ }
+
+ /// <summary>
+ /// Managed ring mmap state. Accessed directly as _ringState.* throughout the engine.
+ /// </summary>
+ private unsafe struct ManagedRingState
+ {
+ public Interop.Sys.IoUringCqe* CqeBase;
+ public uint* CqTailPtr;
+ public uint* CqHeadPtr;
+ public uint CqMask;
+ public uint CqEntries;
+ public uint* CqOverflowPtr;
+ public uint ObservedCqOverflow;
+ public byte* SqRingPtr;
+ public byte* CqRingPtr;
+ public uint* SqFlagsPtr;
+ public ulong SqRingSize;
+ public ulong CqRingSize;
+ public ulong SqesSize;
+ public bool UsesSingleMmap;
+ public int RingFd;
+ public bool UsesExtArg;
+ public bool UsesNoSqArray;
+ public uint NegotiatedFlags;
+ public uint CachedCqHead;
+ public bool CqDrainEnabled;
+ public int WakeupEventFd;
+
+ public static ManagedRingState CreateDefault()
+ {
+ ManagedRingState state = default;
+ state.RingFd = -1;
+ state.WakeupEventFd = -1;
+ return state;
+ }
+ }
+
// Diagnostics are sampled every N event-loop iterations rather than every pass.
private const int IoUringDiagnosticsPollInterval = 64;
// Bounds for how many prepare/cancel queue items a single submit pass may drain.
private const int MinIoUringPrepareQueueDrainPerSubmit = 256;
private const int MaxIoUringPrepareQueueDrainPerSubmit = 8192;
private const int MinIoUringCancelQueueDrainPerSubmit = 256;
private const int MaxIoUringCancelQueueDrainPerSubmit = 2048;
// Retry cap when acquiring an SQE requires submitting to make room.
private const int MaxIoUringSqeAcquireSubmitAttempts = 16;
private const int CqOverflowTrackedSweepDelayMilliseconds = 250;
private const int CqOverflowTrackedSweepMaxRearms = 8;
// Consecutive wake failures before the wake path trips its circuit breaker.
private const int IoUringWakeFailureCircuitBreakerThreshold = 8;
private const string IoUringEnvironmentVariable = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING";
private const string IoUringSqPollEnvironmentVariable = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL";
private const string IoUringDisableMultishotAcceptEnvironmentVariable = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_DISABLE_MULTISHOT_ACCEPT";
private const string IoUringDisableReusePortAcceptEnvironmentVariable = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_DISABLE_REUSEPORT_ACCEPT";
private const string UseIoUringAppContextSwitch = "System.Net.Sockets.UseIoUring";
private const string UseIoUringSqPollAppContextSwitch = "System.Net.Sockets.UseIoUringSqPoll";
// Configuration matrix (7 surfaces):
// 1) DOTNET_SYSTEM_NET_SOCKETS_IO_URING
// 2) AppContext: System.Net.Sockets.UseIoUring
// 3) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL
// 4) AppContext: System.Net.Sockets.UseIoUringSqPoll
// 5) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE (DEBUG)
// 6) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND (DEBUG)
// 7) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS (DEBUG, in IoUringProvidedBufferRing)
//
// Precedence (same pattern for both gates):
// - env var overrides AppContext switch; AppContext is used only when env is unset.
// All inputs are read once per process and cached in s_cachedConfigInputs.
// Guards growth of s_fdEngineAffinity (see GrowAffinityTable).
private static readonly object s_affinityGrowLock = new object();
+
/// <summary>
/// Records which engine owns <paramref name="fd"/> (stored as engineIndex + 1 so 0 means
/// "no affinity"). Lock-free on the fast path; growth is serialized by GrowAffinityTable.
/// </summary>
internal static void SetFdEngineAffinity(int fd, int engineIndex)
{
    int[]? affinity = s_fdEngineAffinity;
    // Table is allocated lazily by EnsureFdEngineAffinityTable; no table => feature unused.
    if (affinity is null) return;
    if ((uint)fd >= (uint)affinity.Length)
        affinity = GrowAffinityTable(fd);
    Volatile.Write(ref affinity[fd], engineIndex + 1);

    // If the table was concurrently grown after we captured 'affinity', mirror the write
    // into the current canonical table as well.
    int[]? current = s_fdEngineAffinity;
    if (!ReferenceEquals(current, affinity) && current is not null && (uint)fd < (uint)current.Length)
    {
        Volatile.Write(ref current[fd], engineIndex + 1);
    }
}
+
/// <summary>Clears the engine-affinity entry for <paramref name="fd"/> (0 == no affinity).</summary>
internal static void ClearFdEngineAffinity(int fd)
{
    int[]? table = s_fdEngineAffinity;
    if (table is null || (uint)fd >= (uint)table.Length)
    {
        // Feature never enabled, or fd beyond the table: nothing recorded to clear.
        return;
    }

    Volatile.Write(ref table[fd], 0);
}
+
/// <summary>
/// Closes an accepted fd and clears its engine affinity entry. Used on fd-leak-prevention paths.
/// </summary>
private static void CloseAcceptedFd(int fd)
{
    // Clear affinity first so no reader observes affinity for an already-closed (and
    // potentially kernel-reused) descriptor.
    ClearFdEngineAffinity(fd);
    Interop.Sys.Close((IntPtr)fd);
}
+
/// <summary>
/// Lazily allocates the fd→engine affinity table. CompareExchange guarantees only one
/// allocation wins under concurrent first use; losers' arrays are discarded.
/// </summary>
internal static void EnsureFdEngineAffinityTable()
{
    if (s_fdEngineAffinity is null)
        Interlocked.CompareExchange(ref s_fdEngineAffinity, new int[4096], null);
}
+
/// <summary>
/// Grows s_fdEngineAffinity (under s_affinityGrowLock) so that <paramref name="fd"/> is a
/// valid index, and returns the table that now covers it.
/// </summary>
private static int[] GrowAffinityTable(int fd)
{
    lock (s_affinityGrowLock)
    {
        int[]? existing = s_fdEngineAffinity;

        // Another thread may already have grown the table past 'fd' while we waited.
        if (existing is not null && fd < existing.Length)
        {
            return existing;
        }

        // At least double so growth stays amortized O(1) per fd.
        int replacementLength = Math.Max(fd + 1, (existing?.Length ?? 0) * 2);
        int[] replacement = new int[replacementLength];
        existing?.CopyTo(replacement, 0);
        s_fdEngineAffinity = replacement;
        return replacement;
    }
}
+
// user_data layout: high byte = tag, low 56 bits = payload (slot index + generation).
private const ulong IoUringUserDataPayloadMask = 0x00FF_FFFF_FFFF_FFFFUL;
private const int IoUringUserDataTagShift = 56;
private static readonly int s_ioUringPrepareQueueCapacity = GetIoUringPrepareQueueCapacity();
private static readonly int s_ioUringCancellationQueueCapacity = s_ioUringPrepareQueueCapacity;

// Zero-sized-in-use padding struct; Size = 64 separates hot fields onto distinct cache lines.
[StructLayout(LayoutKind.Sequential, Size = 64)]
private struct CacheLinePadding64
{
}
+
// One-shot latch for LogIoUringResolvedConfigurationIfNeeded.
private int _ioUringResolvedConfigurationLogged;
// Event-loop-only counters use bare ++ instead of Interlocked.Increment because the
// event loop is single-threaded. Cross-thread reads use Interlocked.Read for visibility.
// _ioUringNonPinnablePrepareFallbackCount is the exception: it is written from producer
// threads and therefore uses Interlocked.Increment (see RecordIoUringNonPinnablePrepareFallback).
private long _ioUringPendingRetryQueuedToPrepareQueueCount;
private long _ioUringNonPinnablePrepareFallbackCount;
private long _ioUringPublishedNonPinnablePrepareFallbackCount;
// Producer->event-loop handoff queues for SQE preparation and cancellation requests.
private MpscQueue? _ioUringPrepareQueue;
private MpscQueue? _ioUringCancelQueue;
private long _ioUringPrepareQueueOverflowCount;
private long _ioUringCancelQueueOverflowCount;
private long _ioUringPrepareQueueOverflowFallbackCount;
private long _ioUringCompletionSlotExhaustionCount;
private long _ioUringUntrackMismatchCount;
// "Published" counters hold the last value already reported to telemetry, so deltas can be emitted.
private long _ioUringPublishedPrepareQueueOverflowCount;
private long _ioUringPublishedPrepareQueueOverflowFallbackCount;
private long _ioUringPublishedCompletionSlotExhaustionCount;
private int _ioUringDiagnosticsPollCountdown;
private int _ioUringWakeFailureConsecutiveCount;
private int _ioUringPortClosedForTeardown;
// Release-published teardown gate. Readers use Volatile.Read in enqueue/wakeup paths
// to prevent new io_uring work from being published after teardown begins.
private int _ioUringTeardownInitiated;
private int _ioUringSlotCapacity;
private bool _completionSlotDrainInProgress;
// CQ-overflow recovery state machine (see ObserveManagedCqOverflowCounter).
private bool _cqOverflowRecoveryActive;
private IoUringCqOverflowRecoveryBranch _cqOverflowRecoveryBranch;
private long _cqOverflowTrackedSweepDeadlineTicks;
private int _cqOverflowTrackedSweepRearmCount;
// Managed SQ bookkeeping for direct SQE submission.
private uint _ioUringManagedPendingSubmissions;
private uint _ioUringManagedSqTail;
private bool _ioUringManagedSqTailLoaded;
private Interop.Sys.IoUringSqRingInfo _ioUringSqRingInfo;
private bool _managedSqeInvariantsValidated;
private bool _ioUringDirectSqeEnabled;
private ManagedRingState _ringState = ManagedRingState.CreateDefault();

// Per-opcode support flags, populated by ProbeIoUringOpcodeSupport.
private bool _supportsOpSend;
private bool _supportsOpReadFixed;
private bool _supportsOpRecv;
private bool _supportsOpSendMsg;
private bool _supportsOpRecvMsg;
private bool _supportsOpAccept;
private bool _supportsOpConnect;
private bool _supportsOpSendZc;
private bool _supportsOpSendMsgZc;
private bool _supportsOpAsyncCancel;
private bool _supportsMultishotRecv;
private bool _supportsMultishotAccept;
private bool _zeroCopySendEnabled;

private bool _sqPollEnabled;
private bool _ioUringInitialized;
// Accumulated within one drain batch; flushed by FlushIoUringDrainTelemetryBatch.
private long _ioUringDrainBatchProvidedBufferDepletionCount;
private IoUringProvidedBufferRing? _ioUringProvidedBufferRing;
private ushort _ioUringProvidedBufferGroupId;
// SoA split: hot completion slot state and cold native storage/tracking metadata.
private IoUringCompletionSlot[]? _completionSlots;
private IoUringTrackedOperationState[]? _trackedOperations;
private IoUringCompletionSlotStorage[]? _completionSlotStorage;
private unsafe byte* _completionSlotNativeStorage;
private nuint _completionSlotNativeStorageStride;
private System.Buffers.MemoryHandle[]? _zeroCopyPinHolds;
// Intrusive free list over completion slots; -1 == empty.
private int _completionSlotFreeListHead = -1;
private int _completionSlotsInUse;
// Event-loop hot state above this line, then cross-thread queue/counter state.
private CacheLinePadding64 _ioUringEventLoopToContendedPadding;
private long _ioUringPrepareQueueLength;
private long _ioUringCancelQueueLength;
private int _trackedIoUringOperationCount;
private uint _ioUringWakeupGeneration;
// Cross-thread contended state above, cold diagnostics/published counters below.
private CacheLinePadding64 _ioUringContendedToDiagnosticsPadding;
private int _completionSlotsHighWaterMark;
private int _liveAcceptCompletionSlotCount;
private bool _pendingEventFdRead;
private IoUringRecvStrategy _ioUringRecvStrategy = IoUringRecvStrategy.PlainUserBuffer;

#if DEBUG
// Test hook state: forced completion result injection (mirrors native pal_io_uring.c test hooks).
private byte _testForceEagainOnceMask;
private byte _testForceEcanceledOnceMask;
private int _testForceSubmitEpermOnce;
// Test-only observability for cancel-queue full retry path.
private long _testCancelQueueWakeRetryCount;
#endif
// Partial hooks; the DEBUG half of this partial class supplies the bodies.
static partial void ResetDebugTestForcedResult(ref IoUringCompletionSlot slot);
static partial void ResolveDebugTestForcedResult(ref IoUringCompletionSlot slot, ref int result);
partial void ApplyDebugTestForcedResult(ref IoUringCompletionSlot slot, byte opcode);
partial void RestoreDebugTestForcedResultIfNeeded(int slotIndex, byte opcode);
partial void InitializeDebugTestHooksFromEnvironment();

private LinuxIoUringCapabilities _ioUringCapabilities;
+
/// <summary>Whether this engine instance is using io_uring completion mode.</summary>
internal bool IsIoUringCompletionModeEnabled => _ioUringCapabilities.IsCompletionMode;
/// <summary>Whether managed direct SQE submission is enabled.</summary>
internal bool IsIoUringDirectSqeEnabled => _ioUringDirectSqeEnabled;
/// <summary>Whether a connected send payload is eligible for the SEND_ZC path.</summary>
internal bool ShouldTryIoUringDirectSendZeroCopy(int payloadLength) =>
    IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: false);
/// <summary>Whether a message-based send payload is eligible for the SENDMSG_ZC path.</summary>
internal bool ShouldTryIoUringDirectSendMessageZeroCopy(int payloadLength) =>
    IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: true);
+
/// <summary>
/// Centralized zero-copy policy:
/// 1) process-level opt-in, 2) opcode support, 3) payload threshold.
/// The threshold is based on total payload bytes so buffer-list workloads (e.g. 4KB segments)
/// are eligible once the aggregate payload crosses the cutoff.
/// </summary>
private bool IsIoUringZeroCopySendEligible(int payloadLength, bool requiresSendMessageOpcode)
{
    return _zeroCopySendEnabled
        && payloadLength >= IoUringConstants.ZeroCopySendThreshold
        && (requiresSendMessageOpcode ? _supportsOpSendMsgZc : _supportsOpSendZc);
}
+
/// <summary>
/// Reads the total count of pending completions that had to requeue through prepare queues
/// after inline completion-mode re-prepare was not used. Aggregated across all engines.
/// </summary>
internal static long GetIoUringPendingRetryQueuedToPrepareQueueCount()
{
    long sum = 0;
    foreach (SocketAsyncEngine engine in s_engines)
    {
        // Interlocked.Read gives a torn-free 64-bit read even on 32-bit platforms.
        sum += Interlocked.Read(ref engine._ioUringPendingRetryQueuedToPrepareQueueCount);
    }

    return sum;
}
+
/// <summary>
/// Aggregates, across all engines, the number of prepares that fell back because the
/// buffer was not pinnable.
/// </summary>
internal static long GetIoUringNonPinnablePrepareFallbackCount()
{
    long sum = 0;
    foreach (SocketAsyncEngine engine in s_engines)
    {
        // Interlocked.Read gives a torn-free 64-bit read even on 32-bit platforms.
        sum += Interlocked.Read(ref engine._ioUringNonPinnablePrepareFallbackCount);
    }

    return sum;
}
+
/// <summary>
/// DEBUG-only test hook: seeds the non-pinnable fallback counter. The first completion-mode
/// engine receives <paramref name="value"/>; all remaining completion-mode engines (and every
/// published counter) are reset to zero so the aggregate equals exactly <paramref name="value"/>.
/// </summary>
internal static void SetIoUringNonPinnablePrepareFallbackCountForTest(long value)
{
#if DEBUG
    long remaining = value;
    foreach (SocketAsyncEngine engine in s_engines)
    {
        if (!engine.IsIoUringCompletionModeEnabled)
        {
            continue;
        }

        Interlocked.Exchange(ref engine._ioUringNonPinnablePrepareFallbackCount, remaining);
        Interlocked.Exchange(ref engine._ioUringPublishedNonPinnablePrepareFallbackCount, 0);
        remaining = 0;
    }
#else
    _ = value;
#endif
}
+
/// <summary>
/// Reports the resolved io_uring configuration to telemetry exactly once per engine;
/// subsequent calls are no-ops.
/// </summary>
private void LogIoUringResolvedConfigurationIfNeeded(in IoUringResolvedConfiguration resolvedConfiguration)
{
    // One-shot latch: only the caller that flips 0 -> 1 logs.
    if (Interlocked.Exchange(ref _ioUringResolvedConfigurationLogged, 1) == 0)
    {
        SocketsTelemetry.Log.ReportIoUringResolvedConfiguration(resolvedConfiguration.ToLogString());
    }
}
+
/// <summary>
/// Computes the prepare-queue capacity: a DEBUG env-var override if present, otherwise
/// 4x the event buffer count, floored at 512.
/// </summary>
private static int GetIoUringPrepareQueueCapacity()
{
#if DEBUG
    string? configured = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.PrepareQueueCapacity);
    if (configured is not null &&
        int.TryParse(configured, out int overrideCapacity) &&
        overrideCapacity > 0)
    {
        return overrideCapacity;
    }
#endif

    // Raised default to reduce fallback frequency under bursty load.
    const int MinimumCapacity = 512;
    return s_eventBufferCount >= 32
        ? Math.Max(checked(s_eventBufferCount * 4), MinimumCapacity)
        : MinimumCapacity;
}
+
/// <summary>
/// Returns the SQ entry count for io_uring_setup. A DEBUG env-var override is honored only
/// when it is a power of two in [2, IoUringConstants.QueueEntries].
/// </summary>
private static uint GetIoUringQueueEntries()
{
#if DEBUG
    string? configured = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.QueueEntries);
    if (configured is not null &&
        int.TryParse(configured, out int entries) &&
        entries >= 2 &&
        entries <= IoUringConstants.QueueEntries &&
        (entries & (entries - 1)) == 0) // power-of-two check, required by the kernel ring
    {
        return (uint)entries;
    }
#endif

    return IoUringConstants.QueueEntries;
}
+
/// <summary>Creates a capabilities snapshot based on whether the port is io_uring.</summary>
private static LinuxIoUringCapabilities ResolveLinuxIoUringCapabilities(bool isIoUringPort)
{
    LinuxIoUringCapabilities capabilities = default;
    IoUringMode mode = isIoUringPort ? IoUringMode.Completion : IoUringMode.Disabled;
    return capabilities.WithIsIoUringPort(isIoUringPort).WithMode(mode);
}
+
/// <summary>
/// Records provided-buffer-ring / registered-buffer capability and re-derives the
/// receive strategy, which depends on both flags.
/// </summary>
private void SetIoUringProvidedBufferCapabilityState(bool supportsProvidedBufferRings, bool hasRegisteredBuffers)
{
    _ioUringCapabilities = _ioUringCapabilities
        .WithSupportsProvidedBufferRings(supportsProvidedBufferRings)
        .WithHasRegisteredBuffers(hasRegisteredBuffers);
    RecomputeIoUringRecvStrategy();
}
+
/// <summary>
/// Re-derives the baseline receive strategy from opcode support and buffer capabilities.
/// Preference order: fixed buffers, multishot provided buffers, one-shot provided buffers,
/// then plain user buffers.
/// </summary>
private void RecomputeIoUringRecvStrategy()
{
    bool multishotRecv = _supportsOpRecv && _ioUringCapabilities.SupportsProvidedBufferRings;
    _supportsMultishotRecv = multishotRecv;
    _ioUringCapabilities = _ioUringCapabilities.WithSupportsMultishotRecv(multishotRecv);

    _ioUringRecvStrategy =
        _supportsOpReadFixed && _ioUringCapabilities.HasRegisteredBuffers ? IoUringRecvStrategy.FixedRecv :
        multishotRecv ? IoUringRecvStrategy.MultishotProvidedBuffer :
        _ioUringCapabilities.SupportsProvidedBufferRings ? IoUringRecvStrategy.OneshotProvidedBuffer :
        IoUringRecvStrategy.PlainUserBuffer;
}
+
/// <summary>
/// Resolves the per-operation receive strategy from the engine baseline and the
/// operation's characteristics (multishot eligibility, buffer length, pre-pinned state).
/// </summary>
/// <param name="flags">Socket flags for the receive; non-None flags force provided buffers on the fixed path.</param>
/// <param name="allowMultishotRecv">Whether the caller permits multishot receive for this operation.</param>
/// <param name="bufferLen">Payload buffer length; non-positive always resolves to a plain user buffer.</param>
/// <param name="bufferAlreadyPinned">Whether the caller's buffer is already pinned.</param>
private IoUringRecvStrategy ResolveIoUringRecvStrategy(
    SocketFlags flags,
    bool allowMultishotRecv,
    int bufferLen,
    bool bufferAlreadyPinned)
{
    if (bufferLen <= 0)
    {
        return IoUringRecvStrategy.PlainUserBuffer;
    }

    switch (_ioUringRecvStrategy)
    {
        case IoUringRecvStrategy.FixedRecv:
            if (allowMultishotRecv)
            {
                return IoUringRecvStrategy.MultishotProvidedBuffer;
            }

            // For one-shot receives, a buffer that is already pinned can be targeted
            // directly to avoid provided-buffer completion copy overhead.
            if (bufferAlreadyPinned)
            {
                return IoUringRecvStrategy.PlainUserBuffer;
            }

            return flags == SocketFlags.None
                ? IoUringRecvStrategy.FixedRecv
                : IoUringRecvStrategy.OneshotProvidedBuffer;

        // The two provided-buffer baselines resolve identically, so they share one body
        // (previously duplicated verbatim).
        case IoUringRecvStrategy.MultishotProvidedBuffer:
        case IoUringRecvStrategy.OneshotProvidedBuffer:
            return allowMultishotRecv
                ? IoUringRecvStrategy.MultishotProvidedBuffer
                : bufferAlreadyPinned
                    ? IoUringRecvStrategy.PlainUserBuffer
                    : IoUringRecvStrategy.OneshotProvidedBuffer;

        default:
            return IoUringRecvStrategy.PlainUserBuffer;
    }
}
+
/// <summary>Encodes a tag byte and payload into a 64-bit user_data value (tag in the top 8 bits).</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong EncodeIoUringUserData(byte tag, ulong payload)
{
    ulong tagBits = (ulong)tag << IoUringUserDataTagShift;
    return tagBits | (payload & IoUringUserDataPayloadMask);
}
+
/// <summary>Reads the next CQE from the completion ring without advancing the head.</summary>
/// <returns>true and a pointer to the CQE when one is available; false otherwise.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe bool TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe, int eventLoopThreadId)
{
    Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId,
        "TryPeekNextCqe must only be called from the event loop thread (SINGLE_ISSUER contract).");
    cqe = null;
    // Volatile read of the kernel-published tail pairs with the kernel's release store.
    uint cqTail = Volatile.Read(ref *_ringState.CqTailPtr);
    // Ring is empty when our locally cached head has caught up with the tail.
    if (_ringState.CachedCqHead == cqTail) return false;
    uint index = _ringState.CachedCqHead & _ringState.CqMask;
    cqe = _ringState.CqeBase + index;
    return true;
}
+
/// <summary>Advances the CQ head pointer by the given count, making slots available to the kernel.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void AdvanceCqHead(uint count, int eventLoopThreadId)
{
    Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId,
        "AdvanceCqHead must only be called from the event loop thread (SINGLE_ISSUER contract).");
    _ringState.CachedCqHead += count;
    // Volatile (release) store publishes consumed entries back to the kernel.
    Volatile.Write(ref *_ringState.CqHeadPtr, _ringState.CachedCqHead);
}
+
/// <summary>Resets per-drain-batch telemetry accumulators at the start of a CQE drain pass.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void BeginIoUringDrainTelemetryBatch()
{
    _ioUringDrainBatchProvidedBufferDepletionCount = 0;
}
+
/// <summary>
/// Emits the batched provided-buffer-depletion count to telemetry at the end of a drain
/// pass; no event is emitted for an empty batch.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void FlushIoUringDrainTelemetryBatch()
{
    // Event-loop-only field; re-reading it here cannot race.
    if (_ioUringDrainBatchProvidedBufferDepletionCount == 0)
    {
        return;
    }

    SocketsTelemetry.Log.IoUringProvidedBufferDepletion(_ioUringDrainBatchProvidedBufferDepletionCount);
}
+
/// <summary>Accumulates provided-buffer-depletion events into the current drain batch (event-loop only).</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void RecordIoUringProvidedBufferDepletionForDrainBatch(long count = 1)
{
    _ioUringDrainBatchProvidedBufferDepletionCount += count;
}
+
/// <summary>
/// Best-effort cache warm-up: if the next pending CQE is a reserved-completion entry,
/// prefetch its completion slot (and touch its tracked-operation entry) before the
/// current CQE's dispatch finishes. No-op without SSE or before ring init.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void PrefetchNextReservedCompletionSlot()
{
    if (!Sse.IsSupported || _ringState.CqTailPtr is null)
    {
        return;
    }

    // The current CQE sits at CachedCqHead; peek one past it.
    uint nextCqHead = _ringState.CachedCqHead + 1;
    uint cqTail = Volatile.Read(ref *_ringState.CqTailPtr);
    if (nextCqHead == cqTail)
    {
        return;
    }

    uint nextIndex = nextCqHead & _ringState.CqMask;
    Interop.Sys.IoUringCqe* nextCqe = _ringState.CqeBase + nextIndex;
    ulong nextUserData = nextCqe->UserData;
    // Only reserved-completion CQEs map to a slot worth prefetching.
    if ((byte)(nextUserData >> IoUringUserDataTagShift) != IoUringConstants.TagReservedCompletion)
    {
        return;
    }

    IoUringCompletionSlot[]? completionEntries = _completionSlots;
    if (completionEntries is null)
    {
        return;
    }

    int nextSlotIndex = DecodeCompletionSlotIndex(nextUserData & IoUringUserDataPayloadMask);
    if ((uint)nextSlotIndex >= (uint)completionEntries.Length)
    {
        return;
    }

    fixed (IoUringCompletionSlot* completionSlots = completionEntries)
    {
        Sse.Prefetch0((byte*)(completionSlots + nextSlotIndex));
    }

    // Tracked operation entries contain managed references, so use a regular load
    // to warm the cache line instead of taking an unsafe pointer.
    IoUringTrackedOperationState[]? trackedOperations = _trackedOperations;
    if (trackedOperations is not null &&
        (uint)nextSlotIndex < (uint)trackedOperations.Length)
    {
        _ = trackedOperations[nextSlotIndex].TrackedOperationGeneration;
    }
}
+
/// <summary>
/// Performs a deferred drain of the wakeup eventfd when a wakeup CQE was observed during
/// the drain batch. The read value is discarded; only resetting the counter matters.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void FlushPendingManagedWakeupEventFdRead()
{
    if (_pendingEventFdRead && _ringState.WakeupEventFd >= 0)
    {
        _pendingEventFdRead = false;
        ulong drainedValue;
        _ = Interop.Sys.IoUringShimReadEventFd(_ringState.WakeupEventFd, &drainedValue);
    }
}
+
/// <summary>
/// Drains up to IoUringConstants.MaxCqeDrainBatch CQEs (or a full CQ ring's worth during
/// overflow recovery) from the mmap'd completion ring and dispatches each based on the
/// user_data tag. Tag=TagReservedCompletion entries are dispatched through the
/// single/multishot/reuseport-accept handler paths; Tag=TagWakeupSignal entries are
/// handled inline. Returns true when at least one CQE was drained.
/// </summary>
private unsafe bool DrainCqeRingBatch(SocketEventHandler handler)
{
    int eventLoopThreadId = Volatile.Read(ref _eventLoopManagedThreadId);
    Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId,
        "DrainCqeRingBatch must only be called from the event loop thread (SINGLE_ISSUER contract).");
    ObserveManagedCqOverflowCounter();
    int drained = 0;
    // During overflow recovery, allow a full ring's worth per pass to release CQ space quickly.
    int drainLimit = _cqOverflowRecoveryActive
        ? (int)_ringState.CqEntries
        : (int)IoUringConstants.MaxCqeDrainBatch;
    bool drainedAnyCqe = false;
    bool enqueuedFallbackEvent = false;
    // Head advances are batched and published once in the finally block (steady state).
    uint deferredCqHeadAdvance = 0;
    IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
    providedBufferRing?.BeginDeferredRecyclePublish();
    BeginIoUringDrainTelemetryBatch();

    try
    {
        while (drained < drainLimit
            && TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe, eventLoopThreadId))
        {
            drainedAnyCqe = true;
            // Copy CQE fields before the head advance below can let the kernel reuse the entry.
            ulong userData = cqe->UserData;
            int result = cqe->Result;
            uint flags = cqe->Flags;
            if (drained >= IoUringConstants.CqePrefetchThreshold)
            {
                PrefetchNextReservedCompletionSlot();
            }

            if (_cqOverflowRecoveryActive)
            {
                // During overflow recovery, publish head movement per CQE so the kernel can
                // reclaim CQ ring space immediately and avoid extending overflow pressure.
                AdvanceCqHead(1, eventLoopThreadId);
            }
            else
            {
                _ringState.CachedCqHead++;
                deferredCqHeadAdvance++;
            }

            byte tag = (byte)(userData >> IoUringUserDataTagShift);
            ulong payload = userData & IoUringUserDataPayloadMask;

            if (tag == IoUringConstants.TagReservedCompletion)
            {
                // SEND_ZC/SENDMSG_ZC notification CQE: close out the deferred ZC slot.
                if ((flags & IoUringConstants.CqeFNotif) != 0)
                {
                    if (HandleZeroCopyNotification(payload))
                    {
                        handler.DispatchZeroCopyIoUringNotification(payload);
                        // Free the slot AFTER dispatch has taken the tracked operation.
                        FreeCompletionSlot(DecodeCompletionSlotIndex(payload));
                        drained++;
                        continue;
                    }
                }

                bool isMultishotCompletion = false;
                bool isReusePortAccept = false;
                if ((flags & IoUringConstants.CqeFMore) != 0)
                {
                    IoUringCompletionSlot[]? completionEntries = _completionSlots;
                    int slotIndex = DecodeCompletionSlotIndex(payload);
                    if (completionEntries is not null &&
                        (uint)slotIndex < (uint)completionEntries.Length)
                    {
                        ref IoUringCompletionSlot classSlot = ref completionEntries[slotIndex];
                        // SEND_ZC/SENDMSG_ZC result CQEs have CQE_F_MORE because a NOTIF
                        // CQE follows -- this is NOT a multishot indicator. Exclude ZC slots.
                        if (!classSlot.IsZeroCopySend)
                        {
                            IoUringCompletionOperationKind kind = classSlot.Kind;
                            isReusePortAccept = kind == IoUringCompletionOperationKind.ReusePortAccept;
                            isMultishotCompletion = isReusePortAccept ||
                                (kind == IoUringCompletionOperationKind.Message && _ioUringCapabilities.SupportsMultishotRecv) ||
                                (kind == IoUringCompletionOperationKind.Accept && _ioUringCapabilities.SupportsMultishotAccept);
                        }
                    }
                }
                else
                {
                    // Terminal CQE (no CQE_F_MORE). Check if this is a ReusePortAccept slot
                    // so we can route it for graceful cleanup without tracked-operation lookup.
                    IoUringCompletionSlot[]? completionEntries = _completionSlots;
                    int slotIndex = DecodeCompletionSlotIndex(payload);
                    if (completionEntries is not null &&
                        (uint)slotIndex < (uint)completionEntries.Length)
                    {
                        isReusePortAccept = completionEntries[slotIndex].Kind == IoUringCompletionOperationKind.ReusePortAccept;
                    }
                }
                ResolveReservedCompletionSlotMetadata(
                    payload,
                    isMultishotCompletion,
                    ref result,
                    out int completionSocketAddressLen,
                    out int completionControlBufferLen,
                    out uint completionAuxiliaryData,
                    out bool hasFixedRecvBuffer,
                    out ushort fixedRecvBufferId,
                    out bool shouldFreeSlot);

                if (isReusePortAccept)
                {
                    handler.DispatchReusePortAcceptIoUringCompletion(
                        userData,
                        result,
                        flags,
                        completionSocketAddressLen,
                        completionAuxiliaryData);
                    // Terminal CQE (no MORE flag): free the slot.
                    if (!isMultishotCompletion)
                    {
                        shouldFreeSlot = true;
                    }
                }
                else if (isMultishotCompletion)
                {
                    // Dispatch expects full tagged user_data so tracked-ownership decode can validate tag+generation.
                    handler.DispatchMultishotIoUringCompletion(
                        userData,
                        result,
                        flags,
                        completionSocketAddressLen,
                        completionControlBufferLen,
                        completionAuxiliaryData,
                        hasFixedRecvBuffer,
                        fixedRecvBufferId,
                        ref enqueuedFallbackEvent);
                }
                else
                {
                    // Dispatch expects full tagged user_data so tracked-ownership decode can validate tag+generation.
                    handler.DispatchSingleIoUringCompletion(
                        userData,
                        result,
                        flags,
                        completionSocketAddressLen,
                        completionControlBufferLen,
                        completionAuxiliaryData,
                        hasFixedRecvBuffer,
                        fixedRecvBufferId,
                        ref enqueuedFallbackEvent);
                }

                // Free the completion slot AFTER dispatch has taken the tracked
                // operation. FreeCompletionSlot nulls TrackedOperation, so it must
                // run after TryTakeTrackedIoUringOperation in the dispatch methods.
                if (shouldFreeSlot)
                {
                    FreeCompletionSlot(DecodeCompletionSlotIndex(payload));
                }
            }
            else if (tag == IoUringConstants.TagWakeupSignal)
            {
                HandleManagedWakeupSignal(result);
                if ((flags & IoUringConstants.CqeFMore) == 0 &&
                    Volatile.Read(ref _ioUringTeardownInitiated) == 0 &&
                    !QueueManagedWakeupPollAdd())
                {
                    // NOTE(review): failure to re-arm the wakeup poll is deliberately
                    // ignored here -- confirm a fallback wakeup path covers this case.
                }
            }
            else if (tag != IoUringConstants.TagNone)
            {
                Debug.Fail($"Unknown io_uring CQE user_data tag: {tag}.");
            }

            drained++;
        }
    }
    finally
    {
        providedBufferRing?.EndDeferredRecyclePublish();
        FlushIoUringDrainTelemetryBatch();
        FlushPendingManagedWakeupEventFdRead();
        // Publish all deferred head advances in one release store.
        if (deferredCqHeadAdvance != 0 && _ringState.CqHeadPtr is not null)
        {
            Volatile.Write(ref *_ringState.CqHeadPtr, _ringState.CachedCqHead);
        }
    }

    if (enqueuedFallbackEvent)
    {
        EnsureWorkerScheduled();
    }

    TryCompleteManagedCqOverflowRecovery();
    AssertCompletionSlotUsageBounded();

    return drainedAnyCqe;
}
+
/// <summary>
/// Resolves metadata for a reserved completion by applying forced test results and
/// copying operation-specific completion outputs (accept/recvmsg) from native storage.
/// Stale CQEs (generation mismatch) are ignored without mutating slot state.
/// </summary>
private void ResolveReservedCompletionSlotMetadata(
    ulong payload,
    bool isMultishotCompletion,
    ref int result,
    out int completionSocketAddressLen,
    out int completionControlBufferLen,
    out uint completionAuxiliaryData,
    out bool hasFixedRecvBuffer,
    out ushort fixedRecvBufferId,
    out bool shouldFreeSlot)
{
    completionSocketAddressLen = 0;
    completionControlBufferLen = 0;
    completionAuxiliaryData = 0;
    hasFixedRecvBuffer = false;
    fixedRecvBufferId = 0;
    shouldFreeSlot = false;

    int slotIndex = DecodeCompletionSlotIndex(payload);
    if ((uint)slotIndex >= (uint)_completionSlots!.Length)
    {
        return;
    }

    ref IoUringCompletionSlot slot = ref _completionSlots[slotIndex];
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
    // The payload's generation bits must match the slot's current generation.
    ulong completionGeneration = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
    if (completionGeneration != slot.Generation)
    {
        // Stale CQE for a recycled slot; ignore without mutating current slot state.
        return;
    }

    // DEBUG-only hook: may overwrite 'result' with an injected test value.
    ResolveDebugTestForcedResult(ref slot, ref result);

    if (slot.UsesFixedRecvBuffer)
    {
        // Hand the fixed-buffer id to the caller and disarm it on the slot.
        hasFixedRecvBuffer = true;
        fixedRecvBufferId = slot.FixedRecvBufferId;
        slot.UsesFixedRecvBuffer = false;
        slot.FixedRecvBufferId = 0;
        Debug.Assert(!isMultishotCompletion, "Fixed-buffer receive completions are expected to be one-shot.");
    }

    if (slot.Kind == IoUringCompletionOperationKind.Accept &&
        slotStorage.NativeSocketAddressLengthPtr is not null)
    {
        // Accept: surface the kernel-written sockaddr length via auxiliary data.
        int nativeSocketAddressLength = *slotStorage.NativeSocketAddressLengthPtr;
        completionAuxiliaryData = nativeSocketAddressLength >= 0 ? (uint)nativeSocketAddressLength : 0u;
        if (isMultishotCompletion)
        {
            // Multishot accept reuses the same native buffer: zero it and reset the
            // length to full capacity for the next completion.
            int socketAddressCapacity = slotStorage.ReceiveSocketAddressCapacity;
            if (socketAddressCapacity > 0 && slotStorage.NativeSocketAddress is not null)
            {
                Unsafe.InitBlockUnaligned(slotStorage.NativeSocketAddress, 0, (uint)socketAddressCapacity);
            }

            *slotStorage.NativeSocketAddressLengthPtr = socketAddressCapacity >= 0 ? socketAddressCapacity : 0;
        }
    }
    else if (slot.Kind == IoUringCompletionOperationKind.Message)
    {
        // recvmsg/sendmsg: copy address/control lengths out of native msghdr storage.
        CopyMessageCompletionOutputs(
            slotIndex,
            out completionSocketAddressLen,
            out completionControlBufferLen,
            out completionAuxiliaryData);
    }

    if (!isMultishotCompletion)
    {
        if (!slot.IsZeroCopySend)
        {
            shouldFreeSlot = true;
        }
        else if (result < 0)
        {
            // Error completion path may not produce a NOTIF CQE.
            shouldFreeSlot = true;
        }
        else if (!slot.ZeroCopyNotificationPending)
        {
            // First CQE for zero-copy send: keep slot alive until NOTIF CQE arrives.
            slot.ZeroCopyNotificationPending = true;
            AssertZeroCopyNotificationPendingForPayload(payload);
        }
    }
}
+
/// <summary>
/// Handles NOTIF CQEs for zero-copy sends: validates and clears ZC pending state.
/// The caller must free the completion slot after dispatch takes the tracked operation.
/// </summary>
private bool HandleZeroCopyNotification(ulong payload)
{
    return TryValidateAndClearZeroCopyNotificationSlot(payload, out _);
}
+
/// <summary>Returns true when the completion slot for <paramref name="userData"/> is waiting on SEND_ZC NOTIF.</summary>
private bool IsZeroCopyNotificationPending(ulong userData)
{
    IoUringCompletionSlot[]? slots = _completionSlots;
    if (slots is null)
    {
        return false;
    }

    int index = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
    if ((uint)index >= (uint)slots.Length)
    {
        return false;
    }

    return slots[index].IsZeroCopySend && slots[index].ZeroCopyNotificationPending;
}
+
/// <summary>
/// Releases a deferred SEND_ZC completion slot when dispatch cannot reattach ownership.
/// </summary>
private bool TryCleanupDeferredZeroCopyCompletionSlot(ulong userData)
{
    ulong payload = userData & IoUringUserDataPayloadMask;
    if (TryValidateAndClearZeroCopyNotificationSlot(payload, out int slotIndex))
    {
        FreeCompletionSlot(slotIndex);
        return true;
    }

    return false;
}
+
/// <summary>
/// Validates that the completion slot identified by <paramref name="payload"/> is in the
/// expected SEND_ZC NOTIF-pending state (correct generation, ZC flags armed), and clears
/// the zero-copy flags. Returns the slot index for optional follow-up actions (e.g. free).
/// </summary>
private bool TryValidateAndClearZeroCopyNotificationSlot(ulong payload, out int slotIndex)
{
    slotIndex = 0;
    IoUringCompletionSlot[]? slots = _completionSlots;
    if (slots is null)
    {
        return false;
    }

    slotIndex = DecodeCompletionSlotIndex(payload);
    if ((uint)slotIndex >= (uint)slots.Length)
    {
        return false;
    }

    ref IoUringCompletionSlot slot = ref slots[slotIndex];
    ulong expectedGeneration = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
    if (slot.Generation != expectedGeneration ||
        !slot.IsZeroCopySend ||
        !slot.ZeroCopyNotificationPending)
    {
        // Recycled slot or not an armed ZC slot: nothing to clear.
        return false;
    }

    slot.ClearZeroCopyState();
    return true;
}
+
/// <summary>Debug assertion that a reserved completion payload remains armed for SEND_ZC NOTIF.</summary>
[Conditional("DEBUG")]
private void AssertZeroCopyNotificationPendingForPayload(ulong payload)
{
    // Re-encode the tagged user_data the NOTIF CQE will carry and verify the slot is armed.
    ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
    Debug.Assert(
        IsZeroCopyNotificationPending(userData),
        "SEND_ZC first CQE must leave the completion slot pending until NOTIF CQE arrives.");
}
+
/// <summary>Debug assertion that SEND_ZC completion dispatch is deferred until NOTIF arrives.</summary>
[Conditional("DEBUG")]
private void AssertZeroCopyDeferredCompletionState(ulong userData, SocketAsyncContext.AsyncOperation operation)
{
    Debug.Assert(
        operation.IoUringUserData == userData,
        "Deferred SEND_ZC completion must retain the original user_data until NOTIF CQE dispatch.");
    Debug.Assert(
        IsZeroCopyNotificationPending(userData),
        "Deferred SEND_ZC completion requires an armed NOTIF state.");
}
+
/// <summary>Observes kernel CQ overflow count deltas, emits telemetry, and arms overflow recovery.</summary>
private unsafe void ObserveManagedCqOverflowCounter()
{
    if (_ringState.CqOverflowPtr is null)
    {
        return;
    }

    uint observedOverflow = Volatile.Read(ref *_ringState.CqOverflowPtr);
    uint previousOverflow = _ringState.ObservedCqOverflow;
    // The kernel counter is uint32 and wraps; compare via wrapped delta instead of monotonic ordering.
    uint delta = unchecked(observedOverflow - previousOverflow);
    if (delta == 0)
    {
        return;
    }

    _ringState.ObservedCqOverflow = observedOverflow;
    SocketsTelemetry.Log.IoUringCqOverflow(delta);
    // Defer stale-tracked sweep scheduling until recovery completes.
    Volatile.Write(ref _cqOverflowTrackedSweepDeadlineTicks, 0);
    _cqOverflowTrackedSweepRearmCount = 0;

    // A second overflow while recovery is already active forces the DualWave branch;
    // otherwise pick the branch from current engine state.
    IoUringCqOverflowRecoveryBranch branch = _cqOverflowRecoveryActive ?
        IoUringCqOverflowRecoveryBranch.DualWave :
        DetermineCqOverflowRecoveryBranchAtEntry();
    _cqOverflowRecoveryActive = true;
    _cqOverflowRecoveryBranch = branch;
    AssertLiveAcceptSlotsRemainTrackedDuringRecovery(branch);

}
+
/// <summary>Determines the initial recovery branch discriminator for a newly observed CQ overflow.</summary>
private IoUringCqOverflowRecoveryBranch DetermineCqOverflowRecoveryBranchAtEntry()
{
    // Teardown wins over everything else.
    if (Volatile.Read(ref _ioUringTeardownInitiated) != 0)
    {
        return IoUringCqOverflowRecoveryBranch.Teardown;
    }

    bool needsMultishotAcceptRearm =
        _ioUringCapabilities.SupportsMultishotAccept && HasLiveAcceptCompletionSlot();
    return needsMultishotAcceptRearm
        ? IoUringCqOverflowRecoveryBranch.MultishotAcceptArming
        : IoUringCqOverflowRecoveryBranch.DualWave;
}
+
/// <summary>Returns true when at least one active completion slot is currently tracking accept metadata.</summary>
private bool HasLiveAcceptCompletionSlot()
{
    // Intentionally O(1): CQ-overflow branch selection can run frequently on the event-loop hot path,
    // so a maintained counter is read instead of scanning the slot array.
    int liveCount = Volatile.Read(ref _liveAcceptCompletionSlotCount);
    Debug.Assert(liveCount >= 0);
    return liveCount != 0;
}
+
/// <summary>
/// Completes CQ-overflow recovery once the ring is drained and no additional overflow increments are observed.
/// Recovery is best-effort: dropped CQEs cannot be reconstructed, so this only restores steady-state draining.
/// </summary>
private unsafe void TryCompleteManagedCqOverflowRecovery()
{
    // Nothing to do unless recovery is active and the CQ ring pointers are mapped.
    if (!_cqOverflowRecoveryActive ||
        _ringState.CqOverflowPtr is null ||
        _ringState.CqTailPtr is null)
    {
        return;
    }

    // Recovery can only complete once the CQ ring is fully drained (cached head caught up to tail).
    uint cqTail = Volatile.Read(ref *_ringState.CqTailPtr);
    if (_ringState.CachedCqHead != cqTail)
    {
        return;
    }

    // Teardown observed mid-recovery overrides whatever branch was previously selected.
    if (Volatile.Read(ref _ioUringTeardownInitiated) != 0)
    {
        _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.Teardown;
    }

    uint observedOverflow = Volatile.Read(ref *_ringState.CqOverflowPtr);
    // The kernel counter is uint32 and wraps; compare via wrapped subtraction.
    uint delta = unchecked(observedOverflow - _ringState.ObservedCqOverflow);
    if (delta > 0)
    {
        // Overflow incremented again while draining: stay in recovery, downgrade to DualWave
        // (unless teardown already won), and report the new overflow delta.
        _ringState.ObservedCqOverflow = observedOverflow;
        if (_cqOverflowRecoveryBranch != IoUringCqOverflowRecoveryBranch.Teardown)
        {
            _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.DualWave;
        }
        SocketsTelemetry.Log.IoUringCqOverflow(delta);

        return;
    }

    // Recovery complete: arm the delayed stale-tracked sweep and emit recovery telemetry.
    _cqOverflowRecoveryActive = false;
    _cqOverflowTrackedSweepRearmCount = 0;
    Volatile.Write(
        ref _cqOverflowTrackedSweepDeadlineTicks,
        Environment.TickCount64 + CqOverflowTrackedSweepDelayMilliseconds);
    SocketsTelemetry.Log.IoUringCqOverflowRecovery(1);
    if (_cqOverflowRecoveryBranch == IoUringCqOverflowRecoveryBranch.MultishotAcceptArming)
    {
        // Phase 1 spec branch (a): if CQ overflow occurs while multishot accept is live,
        // defer re-arm nudges until after drain completes instead of discarding active state.
        TryQueueDeferredMultishotAcceptRearmAfterRecovery();
    }
    AssertCompletionSlotPoolConsistency();
}
+
/// <summary>
/// After CQ-overflow recovery completes, performs a delayed sweep to retire tracked operations
/// that remain attached despite already transitioning out of the waiting state.
/// </summary>
private void TrySweepStaleTrackedIoUringOperationsAfterCqOverflowRecovery()
{
    // Only meaningful in completion mode, after recovery has finished, on the event-loop thread.
    if (!_ioUringCapabilities.IsCompletionMode ||
        _cqOverflowRecoveryActive ||
        !IsCurrentThreadEventLoopThread())
    {
        return;
    }

    // Deadline of 0 means "not armed"; the TickCount64 comparison uses wrapped subtraction.
    long deadline = Volatile.Read(ref _cqOverflowTrackedSweepDeadlineTicks);
    if (deadline == 0 ||
        unchecked(Environment.TickCount64 - deadline) < 0)
    {
        return;
    }

    // Consume the deadline before the sweep; follow-up work can re-arm it.
    Volatile.Write(ref _cqOverflowTrackedSweepDeadlineTicks, 0);

    IoUringCompletionSlot[]? completionEntries = _completionSlots;
    IoUringTrackedOperationState[]? trackedOperations = _trackedOperations;
    if (completionEntries is null ||
        trackedOperations is null ||
        trackedOperations.Length != completionEntries.Length ||
        IsIoUringTrackingEmpty())
    {
        return;
    }

    // NOTE(review): detachedCount and the zero-copy orphan count below are accumulated but not
    // otherwise consumed in this block — presumably reserved for telemetry; confirm intent.
    int detachedCount = 0;
    int canceledWaitingCount = 0;

    for (int slotIndex = 0; slotIndex < trackedOperations.Length; slotIndex++)
    {
        ref IoUringTrackedOperationState trackedState = ref trackedOperations[slotIndex];
        SocketAsyncContext.AsyncOperation? operation = Volatile.Read(ref trackedState.TrackedOperation);
        if (operation is null)
        {
            continue;
        }

        // Generation 0 indicates the tracked slot is not currently live.
        ulong generation = Volatile.Read(ref trackedState.TrackedOperationGeneration);
        if (generation == 0)
        {
            continue;
        }

        // Only sweep operations whose user_data still matches this slot/generation pairing;
        // a mismatch means the slot has since been reused for a different operation.
        ulong payload = EncodeCompletionSlotUserData(slotIndex, generation);
        ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
        if (operation.IoUringUserData != userData)
        {
            continue;
        }

        // Long-lived multishot accept/recv slots are exempt from the sweep.
        IoUringCompletionOperationKind kind = completionEntries[slotIndex].Kind;
        if (ShouldSkipCqOverflowTrackedSweep(operation, userData, kind))
        {
            continue;
        }

        // Operations still in the waiting state are canceled rather than detached;
        // the cancellation path performs its own cleanup.
        if (operation.IsInWaitingState())
        {
            if (operation.TryCancel())
            {
                canceledWaitingCount++;
            }

            continue;
        }

        // No longer waiting but still tracked: detach the operation and release its slot directly.
        if (TryUntrackTrackedIoUringOperation(userData, operation, out SocketAsyncContext.AsyncOperation? removedOperation) != IoUringTrackedOperationRemoveResult.Removed ||
            removedOperation is null)
        {
            continue;
        }

        removedOperation.ClearIoUringUserData();
        FreeCompletionSlot(slotIndex);
        detachedCount++;
    }

    // Sweep for orphaned SEND_ZC completion slots whose NOTIF CQE was lost to CQ overflow.
    int zeroCopyOrphanCount = SweepOrphanedZeroCopyNotificationSlots(completionEntries, trackedOperations);

    // Canceled-waiting operations need a bounded follow-up sweep so the cancellation can
    // settle before their slots are reconsidered; otherwise reset the re-arm budget.
    if (canceledWaitingCount != 0)
    {
        if (_cqOverflowTrackedSweepRearmCount < CqOverflowTrackedSweepMaxRearms)
        {
            _cqOverflowTrackedSweepRearmCount++;
            Volatile.Write(
                ref _cqOverflowTrackedSweepDeadlineTicks,
                Environment.TickCount64 + CqOverflowTrackedSweepDelayMilliseconds);
        }
    }
    else
    {
        _cqOverflowTrackedSweepRearmCount = 0;
    }
}
+
/// <summary>
/// Scans completion slots for SEND_ZC entries stuck in ZeroCopyNotificationPending state
/// with no corresponding tracked operation, indicating a lost NOTIF CQE from CQ overflow.
/// </summary>
/// <returns>The number of orphaned slots that were force-freed.</returns>
private int SweepOrphanedZeroCopyNotificationSlots(
    IoUringCompletionSlot[] completionEntries,
    IoUringTrackedOperationState[] trackedOperations)
{
    int freedCount = 0;
    for (int slotIndex = 0; slotIndex < completionEntries.Length; slotIndex++)
    {
        ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
        if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending)
        {
            continue;
        }

        // The slot is waiting for a NOTIF CQE. Check whether any tracked operation
        // still references this slot. If not, the first CQE was already processed and
        // the operation was completed/dispatched, meaning the NOTIF CQE is the only
        // thing keeping this slot alive -- and it was lost to CQ overflow.
        ref IoUringTrackedOperationState trackedState = ref trackedOperations[slotIndex];
        if (Volatile.Read(ref trackedState.TrackedOperation) is not null)
        {
            continue;
        }

        // Orphaned: NOTIF-pending with no tracked operation. Force-free the slot.
        slot.ClearZeroCopyState();
        FreeCompletionSlot(slotIndex);
        freedCount++;
    }

    return freedCount;
}
+
/// <summary>
/// Returns true when the tracked operation belongs to an intentionally long-lived multishot
/// slot (accept or persistent recv) that the CQ-overflow sweep must leave alone.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ShouldSkipCqOverflowTrackedSweep(
    SocketAsyncContext.AsyncOperation operation,
    ulong userData,
    IoUringCompletionOperationKind kind)
{
    SocketAsyncContext context = operation.AssociatedContext;

    switch (kind)
    {
        case IoUringCompletionOperationKind.Accept:
            // Active multishot accept slots are intentionally long-lived.
            return context.IsMultishotAcceptArmed &&
                context.MultishotAcceptUserData == userData;

        case IoUringCompletionOperationKind.Message:
            // Persistent multishot recv slots are intentionally long-lived.
            return context.IsPersistentMultishotRecvArmed() &&
                context.PersistentMultishotRecvUserData == userData;

        default:
            return false;
    }
}
+
/// <summary>Debug assertion for Phase-1 branch (a): live multishot-accept slots must remain tracked during recovery.</summary>
[Conditional("DEBUG")]
private void AssertLiveAcceptSlotsRemainTrackedDuringRecovery(IoUringCqOverflowRecoveryBranch branch)
{
    // Only the MultishotAcceptArming branch claims that a live accept slot exists.
    if (branch != IoUringCqOverflowRecoveryBranch.MultishotAcceptArming)
    {
        return;
    }

    IoUringCompletionSlot[]? completionEntries = _completionSlots;
    if (completionEntries is null)
    {
        return;
    }

    // Scan every accept-kind slot and verify at least one is still present in the tracking table.
    bool foundTrackedAccept = false;
    for (int i = 0; i < completionEntries.Length; i++)
    {
        if (completionEntries[i].Kind != IoUringCompletionOperationKind.Accept)
        {
            continue;
        }

        ulong payload = EncodeCompletionSlotUserData(i, completionEntries[i].Generation);
        ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
        if (ContainsTrackedIoUringOperation(userData))
        {
            foundTrackedAccept = true;
            break;
        }
    }

    Debug.Assert(
        foundTrackedAccept,
        "CQ-overflow recovery branch (a) requires at least one live tracked multishot-accept slot.");
}
+
/// <summary>
/// After overflow recovery completes, nudges accept contexts with live multishot accept state
/// so the managed accept pipeline can resume dequeue/prepare flow.
/// </summary>
private void TryQueueDeferredMultishotAcceptRearmAfterRecovery()
{
    // Nothing to re-arm without multishot accept support, and teardown makes nudging pointless.
    if (!_ioUringCapabilities.SupportsMultishotAccept ||
        Volatile.Read(ref _ioUringTeardownInitiated) != 0)
    {
        return;
    }

    IoUringCompletionSlot[]? completionEntries = _completionSlots;
    if (completionEntries is null)
    {
        return;
    }

    for (int slotIndex = 0; slotIndex < completionEntries.Length; slotIndex++)
    {
        if (completionEntries[slotIndex].Kind != IoUringCompletionOperationKind.Accept)
        {
            continue;
        }

        // Resolve the tracked operation for this slot; only accept operations qualify.
        ulong payload = EncodeCompletionSlotUserData(slotIndex, completionEntries[slotIndex].Generation);
        ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
        if (!TryGetTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation) ||
            operation is not SocketAsyncContext.AcceptOperation acceptOperation)
        {
            continue;
        }

        // The context must still consider this slot its armed multishot accept registration.
        SocketAsyncContext context = acceptOperation.AssociatedContext;
        if (!context.IsMultishotAcceptArmed ||
            context.MultishotAcceptUserData != userData)
        {
            continue;
        }

        // Nudge the context through the readiness fallback path as a Read event.
        EnqueueReadinessFallbackEvent(context, Interop.Sys.SocketEvents.Read);
    }
}
+
/// <summary>
/// Handles a wakeup signal CQE by deferring an eventfd read until the drain batch tail.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private unsafe void HandleManagedWakeupSignal(int cqeResult)
{
    // Ignore failed wakeup CQEs (negative result) and engines without a live eventfd.
    if (cqeResult < 0 || _ringState.WakeupEventFd < 0)
    {
        return;
    }

    _pendingEventFdRead = true;
}
+
// FD_CLOEXEC flag value for fcntl(F_GETFD/F_SETFD).
private const int FdCloexec = 1;

/// <summary>io_uring completion mode does not use socket event registration updates.</summary>
partial void LinuxTryChangeSocketEventRegistration(
    IntPtr socketHandle,
    Interop.Sys.SocketEvents currentEvents,
    Interop.Sys.SocketEvents newEvents,
    int data,
    ref Interop.Error error,
    ref bool handled)
{
    // Only short-circuit once io_uring is actually initialized; otherwise leave
    // handled/error untouched so the default registration path runs.
    if (!Volatile.Read(ref _ioUringInitialized))
    {
        return;
    }

    handled = true;
    error = Interop.Error.SUCCESS;
}
+
/// <summary>
/// Sets FD_CLOEXEC on <paramref name="fd"/> via fcntl; returns false when either the
/// F_GETFD read or the F_SETFD update fails.
/// </summary>
private static bool TrySetFdCloseOnExec(int fd)
{
    int flags = Interop.Sys.Fcntl.GetFD((IntPtr)fd);
    if (flags < 0)
    {
        return false;
    }

    int desired = flags | FdCloexec;

    // Already set: nothing to update. Otherwise attempt the update; 0 indicates success.
    return desired == flags ||
        Interop.Sys.Fcntl.SetFD((IntPtr)fd, desired) == 0;
}
+
/// <summary>
/// Probes the kernel for supported io_uring opcodes using IORING_REGISTER_PROBE and
/// populates the per-opcode _supportsOp* capability flags.
/// When the probe syscall is unavailable (older kernels), all flags remain at their
/// default value (<see langword="false"/>).
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private unsafe void ProbeIoUringOpcodeSupport(int ringFd)
{
    // Probe buffer: 16-byte header + 256 * 8-byte ops = 2064 bytes.
    const int maxOps = 256;
    const int probeSize = 16 + maxOps * 8;
    byte* probeBuffer = stackalloc byte[probeSize];
    // The probe structure must be zero-initialized before being handed to the kernel.
    // (Fix: the Span constructor requires an explicit <byte> type argument.)
    new Span<byte>(probeBuffer, probeSize).Clear();

    int result;
    Interop.Error err = Interop.Sys.IoUringShimRegister(
        ringFd, IoUringConstants.RegisterProbe, probeBuffer, (uint)maxOps, &result);

    if (err != Interop.Error.SUCCESS)
    {
        // Probe not supported (for example older kernels): per-opcode flags remain false.
        // Direct SQE prep does not gate on these flags; this mainly affects optional feature light-up.
        return;
    }

    // Parse: ops start at offset 16, each is 8 bytes.
    IoUringProbeOp* ops = (IoUringProbeOp*)(probeBuffer + 16);
    IoUringProbeHeader* header = (IoUringProbeHeader*)probeBuffer;
    int opsCount = Math.Min((int)header->OpsLen, maxOps);

    _supportsOpReadFixed = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.ReadFixed);
    _supportsOpSend = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Send);
    _supportsOpRecv = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Recv);
    _supportsOpSendMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsg);
    _supportsOpRecvMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.RecvMsg);
    _supportsOpAccept = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Accept);
    _supportsOpConnect = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Connect);
    _supportsOpSendZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendZc);
    _supportsOpSendMsgZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsgZc);
    // Zero-copy send additionally requires an explicit opt-in.
    _zeroCopySendEnabled = _supportsOpSendZc && IsZeroCopySendOptedIn();
    _supportsOpAsyncCancel = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.AsyncCancel);
    // Multishot accept can be disabled independently of kernel support.
    _supportsMultishotAccept = _supportsOpAccept && !IsMultishotAcceptDisabled();
    RefreshIoUringMultishotRecvSupport();
}
+
/// <summary>Checks whether a specific opcode is supported by the kernel's io_uring probe result.</summary>
private static unsafe bool IsOpcodeSupported(IoUringProbeOp* ops, int opsCount, byte opcode) =>
    // Opcodes beyond what the kernel reported are unsupported by definition.
    opcode < opsCount &&
    (ops[opcode].Flags & IoUringConstants.ProbeOpFlagSupported) != 0;
+
/// <summary>Publishes the managed SQ tail pointer to make queued SQEs visible to the kernel.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void PublishManagedSqeTail()
{
    // No-op when no local tail is cached or the SQ tail pointer was never mapped.
    if (!_ioUringManagedSqTailLoaded || _ioUringSqRingInfo.SqTailPtr == IntPtr.Zero)
    {
        return;
    }

    Debug.Assert(IsCurrentThreadEventLoopThread(),
        "PublishManagedSqeTail must only be called from the event loop thread (SINGLE_ISSUER contract).");
    // Fix: Unsafe.AsRef requires an explicit type argument when reinterpreting a raw pointer;
    // the bare call does not compile. Volatile.Write provides the release store the kernel expects.
    ref uint sqTailRef = ref Unsafe.AsRef<uint>((void*)_ioUringSqRingInfo.SqTailPtr);
    Volatile.Write(ref sqTailRef, _ioUringManagedSqTail);
    _ioUringManagedSqTailLoaded = false;
}
+
/// <summary>
/// Returns true when the SQPOLL kernel thread has gone idle and needs an explicit wakeup.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe bool SqNeedWakeup()
{
    Debug.Assert(_sqPollEnabled, "SqNeedWakeup should only be checked in SQPOLL mode.");

    uint* sqFlags = _ringState.SqFlagsPtr;

    // Without a mapped flags pointer, conservatively report that a wakeup is needed.
    return sqFlags == null ||
        (Volatile.Read(ref *sqFlags) & IoUringConstants.SqNeedWakeup) != 0;
}
+
/// <summary>Allocates the next available SQE slot from the submission ring.</summary>
/// <param name="sqe">On success, points at the next SQE slot with its tail bytes zeroed.</param>
/// <returns>False when direct SQE mode is off, invariants failed validation, or the ring is full.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe bool TryGetNextManagedSqe(out IoUringSqe* sqe)
{
    sqe = null;
    if (!_ioUringDirectSqeEnabled)
    {
        return false;
    }

    Debug.Assert(IsCurrentThreadEventLoopThread(),
        "TryGetNextManagedSqe must only be called from the event loop thread (SINGLE_ISSUER contract).");
    if (!_managedSqeInvariantsValidated)
    {
        return false;
    }

    ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo;
    Debug.Assert(ringInfo.SqeBase != IntPtr.Zero);
    Debug.Assert(ringInfo.SqHeadPtr != IntPtr.Zero);
    Debug.Assert(ringInfo.SqTailPtr != IntPtr.Zero);
    Debug.Assert(ringInfo.SqEntries != 0);
    Debug.Assert(ringInfo.SqeSize == (uint)sizeof(IoUringSqe));

    // Fix: Unsafe.AsRef requires an explicit type argument when reinterpreting a raw
    // pointer; the bare calls here and below do not compile.
    ref uint sqHeadRef = ref Unsafe.AsRef<uint>((void*)ringInfo.SqHeadPtr);
    uint sqHead = Volatile.Read(ref sqHeadRef);
    // Cache the kernel tail once per batch; PublishManagedSqeTail invalidates the cache.
    if (!_ioUringManagedSqTailLoaded)
    {
        ref uint sqTailRef = ref Unsafe.AsRef<uint>((void*)ringInfo.SqTailPtr);
        _ioUringManagedSqTail = Volatile.Read(ref sqTailRef);
        _ioUringManagedSqTailLoaded = true;
    }

    uint sqTail = _ioUringManagedSqTail;
    // Ring full: the unpublished tail has advanced a full ring ahead of the kernel head.
    // Wrapped uint subtraction keeps this correct across counter wraparound.
    if (sqTail - sqHead >= ringInfo.SqEntries)
    {
        return false;
    }

    uint index = sqTail & ringInfo.SqMask;
    nint sqeOffset = checked((nint)((nuint)index * ringInfo.SqeSize));
    sqe = (IoUringSqe*)((byte*)ringInfo.SqeBase + sqeOffset);
    Debug.Assert(sizeof(IoUringSqe) == 64);
    // Tail fields (bytes 40-63: BufIndex/Personality/SpliceFdIn/Addr3 + trailing padding)
    // are centrally zeroed here; bytes 0-39 are explicitly initialized by each SQE writer.
    Unsafe.InitBlockUnaligned((byte*)sqe + 40, 0, 24);
    _ioUringManagedSqTail = sqTail + 1;
    _ioUringManagedPendingSubmissions++;
    return true;
}
+
/// <summary>Validates immutable SQ ring invariants once at initialization.</summary>
private bool ValidateManagedSqeInitializationInvariants()
{
    ref Interop.Sys.IoUringSqRingInfo info = ref _ioUringSqRingInfo;

    // All mapped pointers must be present and the ring must be non-empty.
    bool pointersValid =
        info.SqeBase != IntPtr.Zero &&
        info.SqHeadPtr != IntPtr.Zero &&
        info.SqTailPtr != IntPtr.Zero &&
        info.SqEntries != 0;
    if (!pointersValid)
    {
        return false;
    }

    if (info.SqeSize == (uint)sizeof(IoUringSqe))
    {
        return true;
    }

    // A size mismatch means the managed SQE layout disagrees with the kernel's.
    Debug.Fail($"Unexpected io_uring SQE size. Expected {sizeof(IoUringSqe)}, got {info.SqeSize}.");
    return false;
}
+
/// <summary>Attempts to acquire an SQE, retrying with intermediate submits on ring full.</summary>
/// <param name="sqe">On success, the acquired SQE slot.</param>
/// <param name="submitError">The terminal submit error, or EAGAIN when all attempts were exhausted.</param>
private unsafe bool TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError)
{
    sqe = null;
    submitError = Interop.Error.SUCCESS;
    Debug.Assert(IsCurrentThreadEventLoopThread(),
        "TryAcquireManagedSqeWithRetry must only be called from the event loop thread (SINGLE_ISSUER contract).");
    // Lazily constructed only if a drain pass is actually needed.
    SocketEventHandler drainHandler = default;
    bool drainHandlerInitialized = false;

    for (int attempt = 0; attempt < MaxIoUringSqeAcquireSubmitAttempts; attempt++)
    {
        if (TryGetNextManagedSqe(out sqe))
        {
            return true;
        }

        // Before retrying submission, run a CQ drain pass so completions can release
        // slots and unblock kernel forward progress. The overflow counter is observed
        // during drain; do not assume a single pass fully clears overflow pressure.
        if (_ringState.CqDrainEnabled &&
            _ringState.CqOverflowPtr is not null &&
            _completionSlotsInUse != 0)
        {
            if (!drainHandlerInitialized)
            {
                drainHandler = new SocketEventHandler(this);
                drainHandlerInitialized = true;
            }
            _ = DrainCqeRingBatch(drainHandler);

            // The drain may have freed SQ space; retry before submitting.
            if (TryGetNextManagedSqe(out sqe))
            {
                return true;
            }
        }

        // Submit what is queued to make room; any submit failure is terminal for this acquire.
        submitError = SubmitIoUringOperationsNormalized();
        if (submitError != Interop.Error.SUCCESS)
        {
            return false;
        }
    }

    // All attempts exhausted with the ring still full.
    submitError = Interop.Error.EAGAIN;
    return false;
}
+
/// <summary>
/// Result container for common direct-SQE setup: carries the allocated completion slot,
/// encoded user data, resolved socket fd/flags, the acquired SQE pointer, and the terminal
/// prepare result/error when setup fails.
/// </summary>
private unsafe struct IoUringDirectSqeSetupResult
{
    // Prepared when the SQE was acquired; otherwise a terminal result for the caller to return.
    public SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult PrepareResult;
    // Allocated completion slot index, or -1 when no slot is held.
    public int SlotIndex;
    // user_data encoded from the slot index and slot generation.
    public ulong UserData;
    // Raw socket fd to place in the SQE.
    public int SqeFd;
    // SQE flags byte to place in the SQE.
    public byte SqeFlags;
    // Acquired SQE pointer; only valid when PrepareResult is Prepared.
    public IoUringSqe* Sqe;
    // Socket error surfaced when PrepareResult is PrepareFailed.
    public SocketError ErrorCode;
}
+
/// <summary>
/// Common setup for direct SQE preparation: allocates a completion slot, encodes user data,
/// resolves the socket fd/flags, applies test hooks, and acquires an SQE. On failure,
/// restores test state and frees the slot.
/// </summary>
/// <returns>
/// A result whose PrepareResult is Prepared if the SQE was acquired
/// (caller must write the SQE and return Prepared),
/// or a terminal result (Unsupported/PrepareFailed) that the caller should return directly.
/// </returns>
private unsafe IoUringDirectSqeSetupResult TrySetupDirectSqe(
    SafeSocketHandle socket,
    byte opcode)
{
    IoUringDirectSqeSetupResult setup = default;
    setup.SlotIndex = -1;
    setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
    setup.ErrorCode = SocketError.Success;

    if (!_ioUringDirectSqeEnabled)
    {
        return setup;
    }

    int slotIndex = AllocateCompletionSlot();
    if (slotIndex < 0)
    {
        // Event-loop-only counter; cross-thread reads use Interlocked.Read.
        _ioUringCompletionSlotExhaustionCount++;

        // Slot pool exhausted: try one reentrancy-guarded CQ drain pass so retired
        // completions can release slots, then retry the allocation once.
        if (!_completionSlotDrainInProgress)
        {
            _completionSlotDrainInProgress = true;
            try
            {
                SocketEventHandler handler = new SocketEventHandler(this);
                if (DrainCqeRingBatch(handler))
                {
                    slotIndex = AllocateCompletionSlot();
                }
            }
            finally
            {
                _completionSlotDrainInProgress = false;
            }
        }

        if (slotIndex < 0)
        {
            return setup;
        }
    }

    setup.SlotIndex = slotIndex;
    ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
    setup.UserData = EncodeCompletionSlotUserData(slotIndex, slot.Generation);

    bool addedSocketRef = false;
    try
    {
        // Keep the fd alive from SQE prep through CQE retirement to avoid fd-reuse races after close.
        socket.DangerousAddRef(ref addedSocketRef);
    }
    catch (ObjectDisposedException)
    {
        // Socket already disposed: release the slot and report an aborted prepare.
        FreeCompletionSlot(slotIndex);
        setup.SlotIndex = -1;
        setup.ErrorCode = SocketError.OperationAborted;
        setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
        return setup;
    }

    if (!addedSocketRef)
    {
        // Ref-count could not be taken: same terminal outcome as the disposed case above.
        FreeCompletionSlot(slotIndex);
        setup.SlotIndex = -1;
        setup.ErrorCode = SocketError.OperationAborted;
        setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
        return setup;
    }

    slotStorage.DangerousRefSocketHandle = socket;
    // GC/rooting contract for fd lifetime:
    // Engine -> _completionSlotStorage[slotIndex].DangerousRefSocketHandle -> SafeSocketHandle.
    // Keep this chain alive across SQE submission through CQE retirement to avoid fd reuse races.
    SafeSocketHandle? operation = slotStorage.DangerousRefSocketHandle;
    Debug.Assert(operation != null);
    int socketFd = (int)(nint)operation!.DangerousGetHandle();
    setup.SqeFd = socketFd;
    setup.SqeFlags = 0;
    ApplyDebugTestForcedResult(ref slot, opcode);

    if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError))
    {
        // SQE acquisition failed: undo the test hook and release the slot before reporting.
        RestoreDebugTestForcedResultIfNeeded(slotIndex, opcode);
        FreeCompletionSlot(slotIndex);
        setup.SlotIndex = -1;

        // Transient conditions (clean submit, EAGAIN/EWOULDBLOCK) surface as Unsupported so
        // the caller can fall back; any other submit error is a terminal prepare failure.
        if (submitError == Interop.Error.SUCCESS ||
            submitError == Interop.Error.EAGAIN ||
            submitError == Interop.Error.EWOULDBLOCK)
        {
            return setup;
        }

        setup.ErrorCode = SocketPal.GetSocketErrorForErrorCode(submitError);
        setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
        return setup;
    }

    setup.Sqe = sqe;
    setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
    return setup;
}
+
/// <summary>
/// Prepares a send SQE, preferring SEND_ZC when eligible and falling back to SEND when unavailable.
/// </summary>
internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendWithZeroCopyFallback(
    SafeSocketHandle socket,
    byte* buffer,
    int bufferLen,
    SocketFlags flags,
    out bool usedZeroCopy,
    out ulong userData,
    out SocketError errorCode)
{
    usedZeroCopy = false;
    if (ShouldTryIoUringDirectSendZeroCopy(bufferLen))
    {
        SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult zeroCopyResult = TryPrepareIoUringDirectSendCore(
            socket, buffer, bufferLen, IoUringOpcodes.SendZc,
            isZeroCopy: true, flags, out userData, out errorCode);
        // Prepared and PrepareFailed are both final for the SEND_ZC attempt;
        // only Unsupported falls through to the plain SEND path below.
        if (zeroCopyResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported)
        {
            usedZeroCopy = zeroCopyResult == SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
            return zeroCopyResult;
        }
    }

    return TryPrepareIoUringDirectSendCore(socket, buffer, bufferLen, IoUringOpcodes.Send,
        isZeroCopy: false, flags, out userData, out errorCode);
}
+
/// <summary>Shared core for send/send_zc SQE preparation via the managed direct path.</summary>
private unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendCore(
    SafeSocketHandle socket,
    byte* buffer,
    int bufferLen,
    byte opcode,
    bool isZeroCopy,
    SocketFlags flags,
    out ulong userData,
    out SocketError errorCode)
{
    userData = 0;
    errorCode = SocketError.Success;

    // Socket flags that cannot be mapped to io_uring rw_flags force the fallback path.
    if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
    {
        return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
    }

    IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, opcode);
    if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
    {
        errorCode = setup.ErrorCode;
        return setup.PrepareResult;
    }

    // SEND_ZC slots must be armed before submission so the later NOTIF CQE is handled correctly.
    if (isZeroCopy)
    {
        _completionSlots![setup.SlotIndex].ArmZeroCopySend();
    }

    WriteSendLikeSqe(setup.Sqe, opcode, setup.SqeFd, setup.SqeFlags, setup.UserData, buffer, (uint)bufferLen, rwFlags);
    userData = setup.UserData;
    return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
}
+
/// <summary>Prepares a recv SQE via the managed direct path.</summary>
internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectRecv(
    SafeSocketHandle socket,
    byte* buffer,
    int bufferLen,
    SocketFlags flags,
    bool allowMultishotRecv,
    bool bufferAlreadyPinned,
    out ulong userData,
    out SocketError errorCode)
{
    userData = 0;
    errorCode = SocketError.Success;

    if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
    {
        return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
    }

    IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Recv);
    if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
    {
        errorCode = setup.ErrorCode;
        return setup.PrepareResult;
    }

    IoUringRecvStrategy recvStrategy = ResolveIoUringRecvStrategy(
        flags,
        allowMultishotRecv,
        bufferLen,
        bufferAlreadyPinned);

    // Strategy fallback chain (via goto): FixedRecv -> OneshotProvidedBuffer -> plain recv,
    // and MultishotProvidedBuffer -> OneshotProvidedBuffer -> plain recv. A caller-pinned
    // buffer skips provided-buffer fallbacks and drops straight to plain recv (default).
    switch (recvStrategy)
    {
        case IoUringRecvStrategy.FixedRecv:
            if (TryPrepareIoUringDirectRecvFixed(
                setup.SlotIndex,
                setup.Sqe,
                setup.SqeFd,
                setup.SqeFlags,
                setup.UserData,
                bufferLen))
            {
                break;
            }

            // Fixed-buffer selection can be transiently unavailable under pressure.
            if (bufferAlreadyPinned)
            {
                goto default;
            }

            goto case IoUringRecvStrategy.OneshotProvidedBuffer;

        case IoUringRecvStrategy.MultishotProvidedBuffer:
            if (TryGetIoUringMultishotRecvBufferGroupId(out ushort multishotBufferGroupId))
            {
                // Multishot recv completes through the Message dispatch path.
                SetCompletionSlotKind(ref _completionSlots![setup.SlotIndex], IoUringCompletionOperationKind.Message);
                WriteProvidedBufferRecvSqe(
                    setup.Sqe,
                    setup.SqeFd,
                    setup.SqeFlags,
                    setup.UserData,
                    requestedLength: 0,
                    rwFlags: 0,
                    multishotBufferGroupId,
                    IoUringConstants.RecvMultishot);
                break;
            }

            if (bufferAlreadyPinned)
            {
                goto default;
            }

            goto case IoUringRecvStrategy.OneshotProvidedBuffer;

        case IoUringRecvStrategy.OneshotProvidedBuffer:
            if (!bufferAlreadyPinned &&
                TryGetIoUringProvidedBufferGroupId(out ushort providedBufferGroupId))
            {
                WriteProvidedBufferRecvSqe(
                    setup.Sqe,
                    setup.SqeFd,
                    setup.SqeFlags,
                    setup.UserData,
                    (uint)bufferLen,
                    rwFlags,
                    providedBufferGroupId);
                break;
            }

            goto default;

        default:
            // Plain oneshot recv into the caller's buffer.
            WriteSendLikeSqe(
                setup.Sqe,
                IoUringOpcodes.Recv,
                setup.SqeFd,
                setup.SqeFlags,
                setup.UserData,
                buffer,
                (uint)bufferLen,
                rwFlags);
            break;
    }

    userData = setup.UserData;
    return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
}
+
/// <summary>
/// Prepares a fixed-buffer recv using a buffer acquired from the provided buffer ring;
/// returns false (without failing the operation) when no buffer is currently available
/// so the caller can fall back to another recv strategy.
/// </summary>
private unsafe bool TryPrepareIoUringDirectRecvFixed(
    int slotIndex,
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    int requestedLength)
{
    IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
    if (providedBufferRing is null)
    {
        return false;
    }

    if (!providedBufferRing.TryAcquireBufferForPreparedReceive(
        out ushort bufferId,
        out byte* fixedBuffer,
        out int fixedBufferLength))
    {
        // Under transient provided-buffer pressure, fall back to normal receive preparation.
        return false;
    }

    // Record the buffer on the slot so CQE processing can return it to the ring.
    Debug.Assert(_completionSlots is not null);
    ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
    slot.UsesFixedRecvBuffer = true;
    slot.FixedRecvBufferId = bufferId;

    // Never read more than the acquired fixed buffer can hold.
    int receiveLength = Math.Min(requestedLength, fixedBufferLength);
    WriteReadFixedSqe(
        sqe,
        sqeFd,
        sqeFlags,
        userData,
        fixedBuffer,
        (uint)receiveLength,
        bufferId);
    return true;
}
+
/// <summary>Prepares an accept SQE via the managed direct path.</summary>
internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectAccept(
    SafeSocketHandle socket, byte* socketAddress, int socketAddressLen,
    out ulong userData, out SocketError errorCode)
{
    // Oneshot accept: delegate to the shared accept core with multishot disabled.
    return TryPrepareIoUringDirectAcceptCore(socket, socketAddress, socketAddressLen,
        multishot: false, out userData, out errorCode);
}
+
/// <summary>Prepares a multishot accept SQE via the managed direct path.</summary>
internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectMultishotAccept(
    SafeSocketHandle socket, byte* socketAddress, int socketAddressLen,
    out ulong userData, out SocketError errorCode)
{
    // Delegate to the shared accept core with multishot enabled.
    return TryPrepareIoUringDirectAcceptCore(socket, socketAddress, socketAddressLen,
        multishot: true, out userData, out errorCode);
}
+
/// <summary>Shared core for accept/multishot-accept SQE preparation.</summary>
private unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectAcceptCore(
    SafeSocketHandle socket, byte* socketAddress, int socketAddressLen,
    bool multishot, out ulong userData, out SocketError errorCode)
{
    userData = 0;
    errorCode = SocketError.Success;
    if (multishot && !_supportsMultishotAccept)
    {
        return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
    }

    IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Accept);
    if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
    {
        errorCode = setup.ErrorCode;
        return setup.PrepareResult;
    }

    ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex];
    SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Accept);
    Debug.Assert(slotStorage.NativeSocketAddressLengthPtr is not null);

    if (multishot)
    {
        // Security hardening: multishot accept reuses a single SQE across shots, so sharing one sockaddr
        // writeback buffer can race and surface mismatched peer addresses under bursty delivery.
        // Transitional multishot accept only needs accepted fds, so request no sockaddr writeback.
        *slotStorage.NativeSocketAddressLengthPtr = 0;
        slotStorage.ReceiveSocketAddressCapacity = 0;
        WriteAcceptSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData,
            socketAddress: null, socketAddressLengthPtr: IntPtr.Zero, multishot: true);
    }
    else
    {
        // Oneshot accept: the kernel writes the peer address back through the slot's native length pointer.
        *slotStorage.NativeSocketAddressLengthPtr = socketAddressLen;
        WriteAcceptSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData,
            socketAddress, (IntPtr)slotStorage.NativeSocketAddressLengthPtr);
    }

    userData = setup.UserData;
    return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
}
+
/// <summary>
/// Prepares a multishot accept SQE for a SO_REUSEPORT shadow listener.
/// The slot uses <see cref="IoUringCompletionOperationKind.ReusePortAccept"/> so CQE dispatch
/// forwards accepted fds to the primary listener's pre-accept queue without tracked-operation lookup.
/// Must be called on this engine's event-loop thread.
/// </summary>
internal unsafe bool TryPrepareReusePortMultishotAccept(
    SafeSocketHandle shadowSocket,
    SocketAsyncContext primaryContext,
    SocketAsyncEngine primaryEngine,
    out ulong userData)
{
    userData = 0;
    if (!_supportsMultishotAccept)
    {
        return false;
    }

    IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(shadowSocket, IoUringOpcodes.Accept);
    if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
    {
        return false;
    }

    // Record the primary context/engine on the slot so CQE dispatch can forward accepted fds.
    ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex];
    SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.ReusePortAccept);
    slotStorage.ReusePortPrimaryContext = primaryContext;
    slotStorage.ReusePortPrimaryEngine = primaryEngine;

    // No sockaddr writeback: only the accepted fds are needed on this path.
    WriteAcceptSqe(
        setup.Sqe,
        setup.SqeFd,
        setup.SqeFlags,
        setup.UserData,
        socketAddress: null,
        socketAddressLengthPtr: IntPtr.Zero,
        multishot: true);
    userData = setup.UserData;
    return true;
}
+
+        /// <summary>Prepares a connect SQE via the managed direct path.</summary>
+        /// <param name="socket">Socket to connect.</param>
+        /// <param name="socketAddress">Pointer to the native sockaddr to connect to.</param>
+        /// <param name="socketAddressLen">Length in bytes of the sockaddr.</param>
+        /// <param name="userData">Receives the SQE user_data token on success; 0 otherwise.</param>
+        /// <param name="errorCode">Receives the setup error when preparation fails; Success otherwise.</param>
+        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectConnect(
+            SafeSocketHandle socket,
+            byte* socketAddress,
+            int socketAddressLen,
+            out ulong userData,
+            out SocketError errorCode)
+        {
+            userData = 0;
+            errorCode = SocketError.Success;
+
+            IoUringDirectSqeSetupResult sqeSetup = TrySetupDirectSqe(socket, IoUringOpcodes.Connect);
+            if (sqeSetup.PrepareResult == SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+            {
+                WriteConnectSqe(sqeSetup.Sqe, sqeSetup.SqeFd, sqeSetup.SqeFlags, sqeSetup.UserData, socketAddress, socketAddressLen);
+                userData = sqeSetup.UserData;
+                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+            }
+
+            // Setup failed; surface its error and result unchanged.
+            errorCode = sqeSetup.ErrorCode;
+            return sqeSetup.PrepareResult;
+        }
+
+        /// <summary>
+        /// Prepares a sendmsg SQE, preferring SENDMSG_ZC when eligible and falling back to SENDMSG otherwise.
+        /// </summary>
+        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
+            SafeSocketHandle socket,
+            Interop.Sys.MessageHeader* messageHeader,
+            int payloadLength,
+            SocketFlags flags,
+            out ulong userData,
+            out SocketError errorCode)
+        {
+            bool attemptZeroCopy = ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength);
+            if (attemptZeroCopy)
+            {
+                SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult zcResult = TryPrepareIoUringDirectMessageCore(
+                    socket, messageHeader, IoUringOpcodes.SendMsgZc,
+                    isReceive: false, isZeroCopy: true, flags, out userData, out errorCode);
+
+                // Only an Unsupported zero-copy outcome falls through to plain SENDMSG;
+                // Prepared and hard failures are surfaced as-is.
+                if (zcResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported)
+                {
+                    return zcResult;
+                }
+            }
+
+            return TryPrepareIoUringDirectMessageCore(socket, messageHeader, IoUringOpcodes.SendMsg,
+                isReceive: false, isZeroCopy: false, flags, out userData, out errorCode);
+        }
+
+        /// <summary>Prepares a recvmsg SQE via the managed direct path.</summary>
+        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectReceiveMessage(
+            SafeSocketHandle socket,
+            Interop.Sys.MessageHeader* messageHeader,
+            SocketFlags flags,
+            out ulong userData,
+            out SocketError errorCode)
+        {
+            // Receive is never zero-copy; delegate to the shared sendmsg/recvmsg core.
+            return TryPrepareIoUringDirectMessageCore(socket, messageHeader, IoUringOpcodes.RecvMsg,
+                isReceive: true, isZeroCopy: false, flags, out userData, out errorCode);
+        }
+
+        /// <summary>Shared core for sendmsg/sendmsg_zc/recvmsg SQE preparation via the managed direct path.</summary>
+        /// <param name="socket">Socket the message operation targets.</param>
+        /// <param name="messageHeader">Managed message header copied into inline native storage.</param>
+        /// <param name="opcode">io_uring opcode (SendMsg, SendMsgZc, or RecvMsg).</param>
+        /// <param name="isReceive">Whether the inline storage is prepared for a receive.</param>
+        /// <param name="isZeroCopy">Whether SEND_ZC NOTIF-based completion semantics apply.</param>
+        /// <param name="flags">Socket flags converted to io_uring rw_flags; unsupported flags abort preparation.</param>
+        /// <param name="userData">Receives the SQE user_data token on success; 0 otherwise.</param>
+        /// <param name="errorCode">Receives the setup error when preparation fails; Success otherwise.</param>
+        private unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectMessageCore(
+            SafeSocketHandle socket,
+            Interop.Sys.MessageHeader* messageHeader,
+            byte opcode,
+            bool isReceive,
+            bool isZeroCopy,
+            SocketFlags flags,
+            out ulong userData,
+            out SocketError errorCode)
+        {
+            userData = 0;
+            errorCode = SocketError.Success;
+
+            // Flags that have no io_uring rw_flags mapping force the caller onto the fallback path.
+            if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+            {
+                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+            }
+
+            IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, opcode);
+            if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+            {
+                errorCode = setup.ErrorCode;
+                return setup.PrepareResult;
+            }
+
+            ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
+            ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex];
+            SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Message);
+
+            if (isZeroCopy)
+            {
+                // Mirror SEND_ZC semantics: first CQE is not final managed completion; operation
+                // completes only after NOTIF CQE confirms kernel/NIC no longer references payload.
+                slot.ArmZeroCopySend();
+            }
+
+            // Inline storage failure must release the slot that TrySetupDirectSqe reserved,
+            // otherwise the slot (and its generation) would leak for this SQE's lifetime.
+            if (!TryPrepareInlineMessageStorage(setup.SlotIndex, messageHeader, isReceive))
+            {
+                FreeCompletionSlot(setup.SlotIndex);
+                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+            }
+
+            WriteSendMsgLikeSqe(setup.Sqe, opcode, setup.SqeFd, setup.SqeFlags, setup.UserData, slotStorage.NativeMsgHdrPtr, rwFlags);
+            userData = setup.UserData;
+            return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+        }
+
+        /// <summary>Debug-only assertion that validates an io_uring operation lifecycle transition.</summary>
+        [Conditional("DEBUG")]
+        private static void AssertIoUringLifecycleTransition(
+            IoUringOperationLifecycleState from,
+            IoUringOperationLifecycleState to)
+        {
+            // Encode the legal edges of the lifecycle state machine as tuple patterns.
+            bool isValid = (from, to) switch
+            {
+                (IoUringOperationLifecycleState.Queued, IoUringOperationLifecycleState.Prepared) => true,
+                (IoUringOperationLifecycleState.Prepared, IoUringOperationLifecycleState.Submitted) => true,
+                (IoUringOperationLifecycleState.Prepared, IoUringOperationLifecycleState.Detached) => true,
+                (IoUringOperationLifecycleState.Submitted,
+                    IoUringOperationLifecycleState.Queued or
+                    IoUringOperationLifecycleState.Completed or
+                    IoUringOperationLifecycleState.Canceled or
+                    IoUringOperationLifecycleState.Detached) => true,
+                _ => false,
+            };
+
+            Debug.Assert(isValid, $"Invalid io_uring lifecycle transition: {from} -> {to}");
+        }
+
+        /// <summary>Checks whether the running kernel meets the minimum version required for io_uring support.</summary>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private static bool IsIoUringKernelVersionSupported()
+        {
+#if DEBUG
+            // Test hook: force the kernel-version gate to report "unsupported".
+            string? forcedUnsupported =
+                Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ForceKernelVersionUnsupported);
+            if (forcedUnsupported == "1")
+            {
+                return false;
+            }
+#endif
+
+            return OperatingSystem.IsOSPlatformVersionAtLeast(
+                "Linux",
+                IoUringConstants.MinKernelMajor,
+                IoUringConstants.MinKernelMinor);
+        }
+
+        /// <summary>
+        /// Recomputes whether multishot recv can be used by this engine instance.
+        /// Requires opcode support and active provided-buffer ring support.
+        /// </summary>
+        /// <returns>The refreshed value of <c>_supportsMultishotRecv</c>.</returns>
+        private bool RefreshIoUringMultishotRecvSupport()
+        {
+            // RecomputeIoUringRecvStrategy is expected to update _supportsMultishotRecv,
+            // which is read immediately below.
+            RecomputeIoUringRecvStrategy();
+            return _supportsMultishotRecv;
+        }
+
+        /// <summary>
+        /// Returns the provided-buffer group id used for buffer-select receive submissions.
+        /// </summary>
+        private bool TryGetIoUringProvidedBufferGroupId(out ushort bufferGroupId)
+        {
+            // Both the capability and a live ring instance are required.
+            if (!_ioUringCapabilities.SupportsProvidedBufferRings || _ioUringProvidedBufferRing is null)
+            {
+                bufferGroupId = default;
+                return false;
+            }
+
+            bufferGroupId = _ioUringProvidedBufferGroupId;
+            return true;
+        }
+
+        /// <summary>
+        /// Returns the provided-buffer group id used for multishot recv submissions.
+        /// Multishot recv remains disabled unless both the opcode probe and provided-ring
+        /// registration succeeded for this engine instance.
+        /// </summary>
+        private bool TryGetIoUringMultishotRecvBufferGroupId(out ushort bufferGroupId)
+        {
+            // The provided-buffer lookup sets bufferGroupId (or default) itself.
+            if (_supportsMultishotRecv)
+            {
+                return TryGetIoUringProvidedBufferGroupId(out bufferGroupId);
+            }
+
+            bufferGroupId = default;
+            return false;
+        }
+
+        /// <summary>Whether the published capabilities report multishot recv support.</summary>
+        internal bool SupportsMultishotRecv => _ioUringCapabilities.SupportsMultishotRecv;
+        /// <summary>Whether the published capabilities report multishot accept support.</summary>
+        internal bool SupportsMultishotAccept => _ioUringCapabilities.SupportsMultishotAccept;
+
+        /// <summary>
+        /// Calls io_uring_setup(2) and negotiates feature flags, peeling optional setup flags
+        /// (newest-kernel first) on EINVAL so the ring still comes up on older kernels.
+        /// </summary>
+        /// <param name="sqPollRequested">Whether IORING_SETUP_SQPOLL was requested via configuration.</param>
+        /// <param name="setupResult">Receives the ring fd, kernel-filled params, and negotiated flags on success.</param>
+        /// <returns>true when the ring was created and FD_CLOEXEC applied; false otherwise.</returns>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private static unsafe bool TrySetupIoUring(bool sqPollRequested, out IoUringSetupResult setupResult)
+        {
+            setupResult = default;
+            uint queueEntries = GetIoUringQueueEntries();
+
+            uint flags = IoUringConstants.SetupCqSize | IoUringConstants.SetupSubmitAll
+                | IoUringConstants.SetupCoopTaskrun | IoUringConstants.SetupSingleIssuer
+                | IoUringConstants.SetupNoSqArray | IoUringConstants.SetupCloexec;
+
+            if (sqPollRequested)
+            {
+                // SQPOLL and DEFER_TASKRUN are mutually exclusive in practice.
+                flags |= IoUringConstants.SetupSqPoll;
+            }
+            else
+            {
+                // DEFER_TASKRUN defers task work to io_uring_enter only, reducing event-loop
+                // CPU vs COOP_TASKRUN (which runs task work at every syscall boundary).
+                // Requires SINGLE_ISSUER and submitter_task == current on every io_uring_enter.
+                // io_uring_setup runs on the event loop thread (deferred from constructor) so
+                // submitter_task is set correctly for the event loop's io_uring_enter calls.
+                flags |= IoUringConstants.SetupDeferTaskrun;
+            }
+
+            // Peel unsupported setup flags on EINVAL and retry, newest first.
+            // IORING_SETUP_NO_SQARRAY: Linux 6.6. IORING_SETUP_CLOEXEC: Linux 5.19.
+            // FIX: restore the element type lost from this declaration; ReadOnlySpan is
+            // generic-only and the elements are uint setup-flag bits.
+            ReadOnlySpan<uint> flagsToPeel = [IoUringConstants.SetupNoSqArray, IoUringConstants.SetupCloexec];
+
+            Interop.Sys.IoUringParams ioParams = default;
+            int ringFd;
+            Interop.Error err;
+            int peelIndex = 0;
+            while (true)
+            {
+                // io_uring_setup writes into params, so reset it on every retry.
+                ioParams = default;
+                ioParams.Flags = flags;
+                ioParams.CqEntries = queueEntries * IoUringConstants.CqEntriesFactor;
+                err = Interop.Sys.IoUringShimSetup(queueEntries, &ioParams, &ringFd);
+
+                if (err != Interop.Error.EINVAL)
+                {
+                    break;
+                }
+
+                // Try peeling the next optional flag that is actually set.
+                while (peelIndex < flagsToPeel.Length && (flags & flagsToPeel[peelIndex]) == 0)
+                {
+                    peelIndex++;
+                }
+
+                if (peelIndex >= flagsToPeel.Length)
+                {
+                    break;
+                }
+
+                flags &= ~flagsToPeel[peelIndex++];
+            }
+
+            if (err != Interop.Error.SUCCESS)
+            {
+                return false;
+            }
+
+            // IORING_SETUP_CLOEXEC removes the fork/exec inheritance window on supporting kernels.
+            // Keep FD_CLOEXEC as a fallback for peeled/older setups.
+            if (!TrySetFdCloseOnExec(ringFd))
+            {
+                // Ensure ring fd is not inherited across fork/exec; inherited ring fds can corrupt ownership.
+                Interop.Sys.IoUringShimCloseFd(ringFd);
+                return false;
+            }
+
+            setupResult.RingFd = ringFd;
+            setupResult.Params = ioParams;
+            setupResult.NegotiatedFlags = flags;
+            setupResult.UsesExtArg = (ioParams.Features & IoUringConstants.FeatureExtArg) != 0;
+            setupResult.SqPollNegotiated = (flags & IoUringConstants.SetupSqPoll) != 0;
+            if (setupResult.SqPollNegotiated)
+            {
+                SocketsTelemetry.Log.ReportIoUringSqPollNegotiatedWarning();
+            }
+            return true;
+        }
+
+
+        /// <summary>Queues a POLL_ADD SQE on the wakeup eventfd for cross-thread signaling.</summary>
+        /// <returns>true when the SQE was written; false when there is no eventfd or no free SQE.</returns>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private unsafe bool QueueManagedWakeupPollAdd()
+        {
+            // A negative fd means the wakeup eventfd was never created (or already torn down).
+            if (_ringState.WakeupEventFd < 0)
+                return false;
+
+            if (!TryGetNextManagedSqe(out IoUringSqe* sqe))
+                return false;
+
+            sqe->Opcode = IoUringOpcodes.PollAdd;
+            sqe->Flags = 0; // No SQE flags for wakeup poll.
+            sqe->Ioprio = 0; // Not used by POLL_ADD.
+            sqe->Fd = _ringState.WakeupEventFd;
+            sqe->Off = 0; // Not used by POLL_ADD.
+            sqe->Addr = 0; // Not used by POLL_ADD.
+            sqe->Len = IoUringConstants.PollAddFlagMulti; // IORING_POLL_ADD_MULTI: one SQE, many wakeup CQEs.
+            sqe->RwFlags = IoUringConstants.PollIn;
+            sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagWakeupSignal, 0);
+            // BufIndex, Personality, SpliceFdIn, Addr3: zeroed by TryGetNextManagedSqe.
+            return true;
+        }
+
+        /// <summary>Attempts to register the ring fd for fixed-fd submission (IORING_REGISTER_RING_FDS).</summary>
+        /// <param name="ringFd">The ring fd to register.</param>
+        /// <param name="registeredRingFd">Receives the kernel-assigned registered index on success; -1 otherwise.</param>
+        /// <returns>true when registration succeeded and exactly one fd was registered.</returns>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private unsafe bool TryRegisterRingFd(int ringFd, out int registeredRingFd)
+        {
+            registeredRingFd = -1;
+
+            // io_uring_rsrc_update: { uint32 offset, uint32 resv, uint64 data }
+            // NOTE(review): the ulong store at +8 bytes assumes the stackalloc block is
+            // 8-byte aligned — confirm, or use an explicit struct layout.
+            uint* update = stackalloc uint[4]; // 16 bytes
+            update[0] = IoUringConstants.RegisterOffsetAuto; // offset = auto-assign
+            update[1] = 0; // resv
+            *(ulong*)(update + 2) = (ulong)ringFd; // data = ring fd
+
+            int result;
+            Interop.Error err = Interop.Sys.IoUringShimRegister(
+                ringFd, IoUringConstants.RegisterRingFds, update, 1u, &result);
+
+            // result is the count of fds registered; anything other than a positive count is failure.
+            if (err != Interop.Error.SUCCESS || result <= 0)
+                return false;
+
+            registeredRingFd = (int)update[0]; // kernel wrote assigned index back
+            return true;
+        }
+
+
+
+        /// <summary>
+        /// Orchestrates complete managed io_uring initialization: kernel version check,
+        /// ring setup with flag negotiation, mmap, opcode probe, eventfd creation,
+        /// ring fd registration, and initial wakeup poll queue.
+        /// </summary>
+        /// <param name="resolvedConfiguration">Process-level io_uring configuration already resolved by the caller.</param>
+        /// <returns>true when the ring is fully initialized; false after cleaning up any partial state.</returns>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private unsafe bool TryInitializeManagedIoUring(in IoUringResolvedConfiguration resolvedConfiguration)
+        {
+            if (!IsIoUringKernelVersionSupported())
+                return false;
+
+            bool sqPollRequested = resolvedConfiguration.SqPollRequested;
+            if (!TrySetupIoUring(sqPollRequested, out IoUringSetupResult setupResult))
+                return false;
+
+            // NOTE: failure paths below must unwind in reverse order of acquisition
+            // (eventfd before rings) — each later step adds another resource to release.
+            if (!TryMmapRings(ref setupResult))
+                return false;
+
+            _sqPollEnabled = setupResult.SqPollNegotiated;
+
+            // Probe opcode support.
+            ProbeIoUringOpcodeSupport(setupResult.RingFd);
+
+            // Create wakeup eventfd.
+            int eventFd;
+            Interop.Error err = Interop.Sys.IoUringShimCreateEventFd(&eventFd);
+            if (err != Interop.Error.SUCCESS)
+            {
+                // Cleanup: unmap and close
+                CleanupManagedRings();
+                return false;
+            }
+
+            if (!TrySetFdCloseOnExec(eventFd))
+            {
+                // Eventfd wake channel must remain process-local across exec to prevent stale cross-process signaling.
+                Interop.Sys.IoUringShimCloseFd(eventFd);
+                CleanupManagedRings();
+                return false;
+            }
+
+            _ringState.WakeupEventFd = eventFd;
+
+            // Try to register the ring fd for faster enter syscalls.
+            // Registration failure is non-fatal: enter falls back to the plain ring fd.
+            if (TryRegisterRingFd(setupResult.RingFd, out int registeredRingFd))
+            {
+                _ioUringSqRingInfo.RegisteredRingFd = registeredRingFd;
+            }
+
+            // Queue the initial wakeup POLL_ADD.
+            // Direct SQE must be enabled for QueueManagedWakeupPollAdd to work.
+            _ioUringDirectSqeEnabled = true;
+            if (!QueueManagedWakeupPollAdd())
+            {
+                _ioUringDirectSqeEnabled = false;
+                Interop.Sys.IoUringShimCloseFd(eventFd);
+                _ringState.WakeupEventFd = -1;
+                CleanupManagedRings();
+                return false;
+            }
+
+            // Respect process-level direct SQE toggle after the required wakeup POLL_ADD is armed.
+            if (resolvedConfiguration.DirectSqeDisabled)
+            {
+                _ioUringDirectSqeEnabled = false;
+            }
+
+            InitializeIoUringProvidedBufferRingIfSupported(setupResult.RingFd);
+            RefreshIoUringMultishotRecvSupport();
+            // _ioUringInitialized is set by the caller (LinuxDetectAndInitializeIoUring)
+            // after the memory barrier + capabilities publication, so cross-thread readers
+            // never observe _ioUringInitialized == true before ring state is fully visible.
+
+            InitializeDebugTestHooksFromEnvironment();
+
+            return true;
+        }
+
+        /// <summary>Validates the managed NativeMsghdr layout contract for direct io_uring message operations.</summary>
+        /// <remarks>
+        /// Gates direct message operations on a 64-bit process and a NativeMsghdr size equal to the
+        /// declared expected size.
+        /// NOTE(review): sizeof(NativeMsghdr) requires an unsafe context — presumably supplied by the
+        /// enclosing type declaration; confirm this compiles without an 'unsafe' modifier here.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private static bool IsNativeMsghdrLayoutSupportedForIoUring() =>
+            IntPtr.Size == 8 && sizeof(NativeMsghdr) == NativeMsghdr.ExpectedSize;
+
+        /// <summary>Detects io_uring support and initializes the managed submission/completion paths.</summary>
+        partial void LinuxDetectAndInitializeIoUring()
+        {
+            IoUringResolvedConfiguration resolvedConfiguration = ResolveIoUringResolvedConfiguration();
+            LogIoUringResolvedConfigurationIfNeeded(in resolvedConfiguration);
+            // Any gate failing (config off, layout mismatch, ring init failure) falls back to the
+            // non-io_uring backend and reports that selection in telemetry.
+            if (!resolvedConfiguration.IoUringEnabled || !IsNativeMsghdrLayoutSupportedForIoUring() || !TryInitializeManagedIoUring(in resolvedConfiguration))
+            {
+                _ioUringCapabilities = ResolveLinuxIoUringCapabilities(isIoUringPort: false);
+                SocketsTelemetry.Log.ReportSocketEngineBackendSelected(
+                    isIoUringPort: false,
+                    isCompletionMode: false,
+                    sqPollEnabled: false);
+
+                return;
+            }
+
+            // Initialize managed-side state before publishing capabilities.
+            // Capabilities must be the last write: cross-thread readers gate on
+            // IsIoUringPort / IsCompletionMode and then access queues and slot pools.
+            InitializeLinuxIoUringDiagnosticsState();
+
+            _ioUringSlotCapacity = (int)Math.Max(_ringState.CqEntries, IoUringConstants.QueueEntries);
+            // Slot pool capacity is 2x slot capacity (currently 8192 with default cq sizing).
+            // Multishot operations retain slots for their full lifetime, so this bounds
+            // concurrent long-lived multishot receives before backpressure/exhaustion.
+            // NOTE(review): MpscQueue appears without generic type arguments here and at the
+            // field declarations while the enqueue sites pass typed work items — confirm whether
+            // '<T>' arguments were lost in this change (angle brackets are stripped elsewhere,
+            // e.g. the ReadOnlySpan in TrySetupIoUring).
+            _ioUringPrepareQueue = new MpscQueue();
+            _ioUringCancelQueue = new MpscQueue();
+            _reusePortShadowSetupQueue = new MpscQueue();
+            int completionSlotCapacity = _ioUringSlotCapacity * IoUringConstants.CompletionOperationPoolCapacityFactor;
+            InitializeCompletionSlotPool(completionSlotCapacity);
+
+            _ringState.CqDrainEnabled = true;
+
+            // Ensure all init state (ring mappings, queues, slot pools) is globally visible
+            // before capabilities are published to cross-thread readers on ARM64.
+            Thread.MemoryBarrier();
+
+            _ioUringCapabilities = default(LinuxIoUringCapabilities)
+                .WithIsIoUringPort(true)
+                .WithMode(IoUringMode.Completion)
+                .WithSupportsMultishotAccept(_supportsMultishotAccept)
+                .WithSupportsZeroCopySend(_zeroCopySendEnabled)
+                .WithSqPollEnabled(_sqPollEnabled)
+                .WithSupportsProvidedBufferRings(_ioUringProvidedBufferRing is not null)
+                .WithHasRegisteredBuffers(_ioUringCapabilities.HasRegisteredBuffers);
+            RecomputeIoUringRecvStrategy();
+            _pendingEventFdRead = false;
+
+            _ioUringInitialized = true;
+
+            SocketsTelemetry.Log.ReportSocketEngineBackendSelected(
+                isIoUringPort: true,
+                isCompletionMode: true,
+                sqPollEnabled: _ioUringCapabilities.SqPollEnabled);
+
+        }
+
+        /// <summary>Tears down io_uring state before native resource cleanup.</summary>
+        /// <param name="closeSocketEventPort">Set to false when this method already closed the native port.</param>
+        partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort)
+        {
+            // Nothing to do when io_uring never initialized or the port is already invalid.
+            if (!_ioUringCapabilities.IsIoUringPort || _port == (IntPtr)(-1))
+            {
+                return;
+            }
+
+            // Publish teardown before draining queues/closing the native port so concurrent
+            // producer paths observe shutdown via acquire reads and stop queueing new work.
+            Volatile.Write(ref _ioUringTeardownInitiated, 1);
+            DrainQueuedIoUringOperationsForTeardown();
+
+            Interop.Error closeError = Interop.Sys.CloseSocketEventPort(_port);
+            if (closeError == Interop.Error.SUCCESS)
+            {
+                // Tell the caller not to double-close, and record closure for teardown diagnostics.
+                closeSocketEventPort = false;
+                Volatile.Write(ref _ioUringPortClosedForTeardown, 1);
+            }
+        }
+
+        /// <summary>Submits pending SQEs on the non-completion-mode event-loop path.</summary>
+        partial void LinuxEventLoopBeforeWait()
+        {
+            if (!_ioUringCapabilities.IsCompletionMode)
+            {
+                Interop.Error batchError = SubmitIoUringBatch();
+                if (batchError != Interop.Error.SUCCESS)
+                {
+                    // FailFast site: the event-loop submit step cannot degrade safely once
+                    // io_uring completion mode is active; losing submit progress would orphan tracked ops.
+                    ThrowInternalException(batchError);
+                }
+            }
+        }
+
+        /// <summary>Attempts a managed completion wait using io_uring_enter with an EXT_ARG timeout.</summary>
+        /// <param name="handler">Handler invoked for each dispatched CQE.</param>
+        /// <param name="numEvents">Set to 0; this path reports completions, not readiness events.</param>
+        /// <param name="numCompletions">Set to 1 when any CQE was drained, 0 otherwise.</param>
+        /// <param name="err">Set to SUCCESS on the handled path.</param>
+        /// <param name="waitHandled">Set to true when this method performed the wait.</param>
+        partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled)
+        {
+            if (!_ioUringCapabilities.IsCompletionMode)
+            {
+                return;
+            }
+
+            // Managed CQE drain path: read CQEs directly from mmap'd ring.
+            // First, try a non-blocking drain of any already-available CQEs.
+            bool hadCqes = DrainCqeRingBatch(handler);
+            bool deferTaskrunEnabled =
+                (_ringState.NegotiatedFlags & IoUringConstants.SetupDeferTaskrun) != 0;
+            bool forceDeferredTaskWorkEnter = hadCqes && deferTaskrunEnabled;
+
+            // Fast-path: skip SubmitIoUringBatch when both cross-thread queues are empty.
+            // Inline re-prepare SQEs from DrainCqeRingBatch write directly to the SQ ring
+            // and are counted in _ioUringManagedPendingSubmissions without touching these queues.
+            if (Volatile.Read(ref _ioUringPrepareQueueLength) != 0 ||
+                Volatile.Read(ref _ioUringCancelQueueLength) != 0)
+            {
+                Interop.Error prepareError = SubmitIoUringBatch(
+                    submitPendingSqes: false,
+                    wakeEventLoopOnBacklog: false);
+                if (prepareError != Interop.Error.SUCCESS)
+                {
+                    ThrowInternalException(prepareError);
+                }
+            }
+
+            // Under SQPOLL the kernel thread consumes the SQ ring itself, so nothing is
+            // passed as to_submit here.
+            uint submitCount = _sqPollEnabled ? 0u : _ioUringManagedPendingSubmissions;
+            if (hadCqes && !forceDeferredTaskWorkEnter && submitCount == 0)
+            {
+                // Work was already delivered and there is nothing to submit or flush:
+                // report progress without entering the kernel at all.
+                numCompletions = 1;
+                numEvents = 0;
+                waitHandled = true;
+                err = Interop.Error.SUCCESS;
+                return;
+            }
+
+            // If CQEs were already drained and DEFER_TASKRUN is active, perform a non-blocking
+            // io_uring_enter(GETEVENTS, minComplete=0) to flush deferred task work.
+            // Otherwise perform the regular bounded wait for at least one CQE.
+            uint minComplete = (forceDeferredTaskWorkEnter || hadCqes) ? 0u : 1u;
+            uint enterFlags = IoUringConstants.EnterGetevents;
+            int ringFd = 0;
+            ResolveRingFd(ref ringFd, ref enterFlags);
+
+            if (_sqPollEnabled &&
+                _ioUringManagedPendingSubmissions != 0 &&
+                SqNeedWakeup())
+            {
+                // The SQPOLL kernel thread idled; it must be woken to see the new SQEs.
+                enterFlags |= IoUringConstants.EnterSqWakeup;
+            }
+
+            if (!_ringState.UsesExtArg)
+            {
+                ThrowInternalException("io_uring completion-mode wait requires EXT_ARG support.");
+            }
+
+            // Bounded wait via EXT_ARG; timeout shortens when wake circuit-breaker is active.
+            enterFlags |= IoUringConstants.EnterExtArg;
+            Interop.Sys.IoUringKernelTimespec timeout = default;
+            timeout.TvNsec = minComplete == 0 ? 0 : GetManagedCompletionWaitTimeoutNanos();
+            Interop.Sys.IoUringGeteventsArg extArg = default;
+            extArg.Ts = (ulong)(nuint)(&timeout);
+
+            err = IoUringEnterExtWithFallback(
+                ref ringFd, submitCount, minComplete, ref enterFlags, &extArg, out int result);
+
+            if (err == Interop.Error.SUCCESS)
+            {
+                // result counts SQEs the kernel consumed; reconcile the pending-submission count.
+                UpdateManagedPendingSubmissionCountAfterEnter(submitCount, result);
+            }
+
+            // Drain after waking. If a producer signalled during the drain, re-drain once
+            // to pick up any work enqueued between the CQE read and the generation check.
+            // Re-snapshot per iteration to avoid unbounded spin when producers are active.
+            hadCqes = false;
+            uint wakeGenCurrent;
+            do
+            {
+                wakeGenCurrent = Volatile.Read(ref _ioUringWakeupGeneration);
+                hadCqes |= DrainCqeRingBatch(handler);
+            }
+            while (Volatile.Read(ref _ioUringWakeupGeneration) != wakeGenCurrent);
+            // CQE dispatch can inline-prepare follow-up SQEs (for example partial send/recv
+            // resubmissions). Submit them immediately to avoid an extra event-loop turn before
+            // they reach the kernel, which can otherwise amplify backpressure latency.
+            if (_ioUringManagedPendingSubmissions != 0)
+            {
+                Interop.Error inlineSubmitError = SubmitIoUringOperationsNormalized();
+                if (inlineSubmitError != Interop.Error.SUCCESS)
+                {
+                    ThrowInternalException(inlineSubmitError);
+                }
+            }
+
+            numCompletions = hadCqes ? 1 : 0;
+            numEvents = 0;
+            waitHandled = true;
+            err = Interop.Error.SUCCESS;
+        }
+
+        /// <summary>Polls diagnostics and sweeps stale tracked operations after each event loop iteration.</summary>
+        partial void LinuxEventLoopAfterIteration()
+        {
+            PollIoUringDiagnosticsIfNeeded(force: false);
+            TrySweepStaleTrackedIoUringOperationsAfterCqOverflowRecovery();
+        }
+
+        /// <summary>Queued request to arm a multishot accept SQE for a SO_REUSEPORT shadow listener on this engine.</summary>
+        private readonly struct ReusePortShadowSetupRequest
+        {
+            /// <summary>The shadow listener socket to arm the accept on.</summary>
+            public readonly SafeSocketHandle ShadowSocket;
+            /// <summary>The primary listener context that receives forwarded accepts.</summary>
+            public readonly SocketAsyncContext PrimaryContext;
+            /// <summary>The engine owning the primary listener.</summary>
+            public readonly SocketAsyncEngine PrimaryEngine;
+
+            /// <summary>Captures a shadow-listener setup request for deferred event-loop processing.</summary>
+            public ReusePortShadowSetupRequest(SafeSocketHandle shadowSocket, SocketAsyncContext primaryContext, SocketAsyncEngine primaryEngine)
+            {
+                ShadowSocket = shadowSocket;
+                PrimaryContext = primaryContext;
+                PrimaryEngine = primaryEngine;
+            }
+        }
+
+        // Deferred SO_REUSEPORT shadow-listener setup requests, drained on the event-loop thread.
+        // NOTE(review): MpscQueue appears here without a generic type argument while TryEnqueue
+        // passes a ReusePortShadowSetupRequest — confirm whether '<ReusePortShadowSetupRequest>'
+        // was lost (angle brackets are stripped elsewhere in this change, e.g. ReadOnlySpan).
+        private MpscQueue? _reusePortShadowSetupQueue;
+
+        /// <summary>
+        /// Enqueues a shadow listener setup request for deferred processing on this engine's event loop.
+        /// The shadow accept SQE will be armed during the next cycle.
+        /// </summary>
+        /// <returns>true when the request was queued and the event loop woken; false on shutdown or missing queue.</returns>
+        internal bool TryEnqueueReusePortShadowSetup(SafeSocketHandle shadowSocket, SocketAsyncContext primaryContext, SocketAsyncEngine primaryEngine)
+        {
+            // Reject once teardown is published, or when not running in completion mode at all.
+            if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+            {
+                return false;
+            }
+
+            if (_reusePortShadowSetupQueue is not { } setupQueue)
+            {
+                return false;
+            }
+
+            if (!setupQueue.TryEnqueue(new ReusePortShadowSetupRequest(shadowSocket, primaryContext, primaryEngine)))
+            {
+                return false;
+            }
+
+            WakeEventLoop();
+            return true;
+        }
+
+        /// <summary>Queued work item pairing an operation with its prepare sequence number for deferred SQE preparation.</summary>
+        private readonly struct IoUringPrepareWorkItem
+        {
+            /// <summary>The operation to prepare.</summary>
+            public readonly SocketAsyncContext.AsyncOperation Operation;
+            /// <summary>The sequence number that must match for the preparation to proceed.</summary>
+            public readonly long PrepareSequence;
+
+            /// <summary>Creates a work item pairing an operation with its prepare sequence number.</summary>
+            public IoUringPrepareWorkItem(SocketAsyncContext.AsyncOperation operation, long prepareSequence)
+            {
+                Operation = operation;
+                PrepareSequence = prepareSequence;
+            }
+        }
+
+        /// <summary>Enqueues an operation for deferred SQE preparation on the event loop thread.</summary>
+        /// <returns>true when the work item was queued and the event loop woken; false on shutdown, overflow, or enqueue failure.</returns>
+        internal bool TryEnqueueIoUringPreparation(SocketAsyncContext.AsyncOperation operation, long prepareSequence)
+        {
+            if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+            {
+                return false;
+            }
+
+            MpscQueue? pendingPreparations = _ioUringPrepareQueue;
+            if (pendingPreparations is null)
+            {
+                return false;
+            }
+
+            // Optimistically reserve a length slot; roll back and count an overflow when
+            // capacity is exceeded or the queue rejects the item.
+            long reservedLength = Interlocked.Increment(ref _ioUringPrepareQueueLength);
+            bool enqueued = reservedLength <= s_ioUringPrepareQueueCapacity &&
+                pendingPreparations.TryEnqueue(new IoUringPrepareWorkItem(operation, prepareSequence));
+            if (!enqueued)
+            {
+                Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+                Interlocked.Increment(ref _ioUringPrepareQueueOverflowCount);
+                return false;
+            }
+
+            WakeEventLoop();
+            return true;
+        }
+
+        /// <summary>Extracts completion-slot index and generation from tracked reserved-completion user_data.</summary>
+        /// <param name="userData">The CQE user_data value to decode.</param>
+        /// <param name="slotIndex">Receives the decoded slot index when decoding succeeds.</param>
+        /// <param name="generation">Receives the decoded slot generation when decoding succeeds.</param>
+        /// <returns>true when userData carries the reserved-completion tag and a valid slot index.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool TryDecodeTrackedIoUringUserData(ulong userData, out int slotIndex, out ulong generation)
+        {
+            generation = 0;
+            slotIndex = 0;
+            // 0 is never a valid tracked token.
+            if (userData == 0)
+            {
+                return false;
+            }
+
+            // Only reserved-completion-tagged tokens refer to the tracked-operation table.
+            if ((byte)(userData >> IoUringUserDataTagShift) != IoUringConstants.TagReservedCompletion)
+            {
+                return false;
+            }
+
+            IoUringCompletionSlot[]? completionEntries = _completionSlots;
+            if (completionEntries is null)
+            {
+                return false;
+            }
+
+            ulong payload = userData & IoUringUserDataPayloadMask;
+            slotIndex = DecodeCompletionSlotIndex(payload);
+            // Unsigned compare also rejects any negative decoded index.
+            if ((uint)slotIndex >= (uint)completionEntries.Length)
+            {
+                return false;
+            }
+
+            generation = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
+            return true;
+        }
+
+        /// <summary>Atomically removes and returns the tracked operation matching the user_data and generation.</summary>
+        /// <param name="userData">Reserved-completion token identifying slot and generation.</param>
+        /// <param name="operation">Receives the removed operation on success; null otherwise.</param>
+        /// <returns>true when this caller won the single-owner handoff for the slot incarnation.</returns>
+        private bool TryTakeTrackedIoUringOperation(ulong userData, out SocketAsyncContext.AsyncOperation? operation)
+        {
+            Debug.Assert(IsCurrentThreadEventLoopThread(),
+                "TryTakeTrackedIoUringOperation must run on the event-loop thread.");
+            operation = null;
+            if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+            {
+                return false;
+            }
+
+            ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+            // CAS retry loop: re-read the entry after any lost race and re-validate.
+            while (true)
+            {
+                SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation);
+                if (currentOperation is null)
+                {
+                    return false;
+                }
+
+                // Writers publish generation before operation; if operation is visible here,
+                // generation must match unless this CQE belongs to an older slot incarnation.
+                if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation)
+                {
+                    return false;
+                }
+
+                // Single-owner handoff: exactly one completion-side CAS can null out TrackedOperation
+                // for this slot incarnation. A racing replace may swap references, but cannot create
+                // two winners for the same user_data token.
+                if (Interlocked.CompareExchange(ref entry.TrackedOperation, null, currentOperation) != currentOperation)
+                {
+                    continue;
+                }
+
+                // Reset generation to zero so TryReattachTrackedIoUringOperation (used by
+                // SEND_ZC to re-register while awaiting the NOTIF CQE) can CAS from 0 to
+                // the new generation. Volatile.Write ensures visibility on ARM64 before the
+                // count decrement below, preventing a concurrent TryTrack from observing
+                // TrackedOperation == null with a stale non-zero generation.
+                Volatile.Write(ref entry.TrackedOperationGeneration, 0UL);
+                DecrementTrackedIoUringOperationCountOnEventLoop();
+                operation = currentOperation;
+                return true;
+            }
+        }
+
+        /// <summary>Returns the tracked operation for the given user_data without untracking it.</summary>
+        /// <returns>true when an operation is tracked under the token's slot and generation.</returns>
+        private bool TryGetTrackedIoUringOperation(ulong userData, out SocketAsyncContext.AsyncOperation? operation)
+        {
+            Debug.Assert(IsCurrentThreadEventLoopThread(),
+                "TryGetTrackedIoUringOperation must run on the event-loop thread.");
+            operation = null;
+            if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+            {
+                return false;
+            }
+
+            ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+            // Read the operation first, then validate the generation; the short-circuit
+            // below preserves that read order.
+            SocketAsyncContext.AsyncOperation? tracked = Volatile.Read(ref entry.TrackedOperation);
+            if (tracked is null || Volatile.Read(ref entry.TrackedOperationGeneration) != generation)
+            {
+                return false;
+            }
+
+            operation = tracked;
+            return true;
+        }
+
+        /// <summary>Returns whether an operation matching the given user_data (slot index + generation) is currently tracked.</summary>
+        private bool ContainsTrackedIoUringOperation(ulong userData)
+        {
+            Debug.Assert(IsCurrentThreadEventLoopThread(),
+                "ContainsTrackedIoUringOperation must run on the event-loop thread.");
+            // Non-destructive lookup: the tracked entry is left in place.
+            return TryGetTrackedIoUringOperation(userData, out _);
+        }
+
+        /// <summary>Re-attaches a completion owner after dispatch-side deferral (for example SEND_ZC waiting on NOTIF CQE).</summary>
+        /// <param name="userData">Reserved-completion token identifying the slot and generation.</param>
+        /// <param name="operation">The operation to re-register as the completion owner.</param>
+        /// <returns>true when the operation was re-attached to a still-live NOTIF-pending slot.</returns>
+        private bool TryReattachTrackedIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation operation)
+        {
+            Debug.Assert(IsCurrentThreadEventLoopThread(),
+                "TryReattachTrackedIoUringOperation must run on the event-loop thread.");
+            if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+            {
+                return false;
+            }
+
+            // Verify the completion slot is still in the expected SEND_ZC NOTIF-pending state
+            // before attempting to reattach. If the slot was freed and reallocated between the
+            // first CQE dispatch and this reattach call, the slot's state will not match.
+            IoUringCompletionSlot[]? completionEntries = _completionSlots;
+            if (completionEntries is null || (uint)slotIndex >= (uint)completionEntries.Length)
+            {
+                return false;
+            }
+
+            ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+            if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending || slot.Generation != generation)
+            {
+                // Slot was freed and possibly reallocated. The NOTIF CQE was either already
+                // processed or will be discarded by HandleZeroCopyNotification's generation check.
+                return false;
+            }
+
+            // Publish generation before operation (take/untrack validate in the reverse order).
+            ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+            if (Interlocked.CompareExchange(ref entry.TrackedOperationGeneration, generation, 0) != 0)
+            {
+                return false;
+            }
+
+            if (Interlocked.CompareExchange(ref entry.TrackedOperation, operation, null) is not null)
+            {
+                // Roll back the generation claim so the slot remains reattachable.
+                // Consistency fix: use the 0UL literal like every other generation store
+                // (see TryTake/TryUntrack) rather than an implicitly-converted int 0.
+                Volatile.Write(ref entry.TrackedOperationGeneration, 0UL);
+                return false;
+            }
+
+            IncrementTrackedIoUringOperationCountOnEventLoop();
+            return true;
+        }
+
+        /// <summary>Atomically replaces the tracked operation for the given user_data.</summary>
+        /// <returns>true when the swap succeeded for the token's live slot incarnation.</returns>
+        private bool TryReplaceTrackedIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation newOperation)
+        {
+            if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+            {
+                return false;
+            }
+
+            ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+            // CAS retry loop: re-validate the entry each time a racing update wins.
+            while (true)
+            {
+                SocketAsyncContext.AsyncOperation? existing = Volatile.Read(ref entry.TrackedOperation);
+                // Operation read precedes the generation check (short-circuit preserves the order).
+                if (existing is null || Volatile.Read(ref entry.TrackedOperationGeneration) != generation)
+                {
+                    return false;
+                }
+
+                if (ReferenceEquals(
+                        Interlocked.CompareExchange(ref entry.TrackedOperation, newOperation, existing),
+                        existing))
+                {
+                    return true;
+                }
+            }
+        }
+
+        /// <summary>Removes a tracked operation, optionally verifying it matches an expected reference.</summary>
+        /// <param name="userData">Reserved-completion token identifying the slot and generation.</param>
+        /// <param name="expectedOperation">When non-null, removal only proceeds if the tracked reference matches.</param>
+        /// <param name="removedOperation">Receives the removed operation on success; null otherwise.</param>
+        /// <returns>Removed, Mismatch (expected reference differs), or NotFound.</returns>
+        private IoUringTrackedOperationRemoveResult TryUntrackTrackedIoUringOperation(
+            ulong userData,
+            SocketAsyncContext.AsyncOperation? expectedOperation,
+            out SocketAsyncContext.AsyncOperation? removedOperation)
+        {
+            removedOperation = null;
+            if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+            {
+                return IoUringTrackedOperationRemoveResult.NotFound;
+            }
+
+            ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+            // CAS retry loop: re-read and re-validate after any lost race.
+            while (true)
+            {
+                SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation);
+                if (currentOperation is null)
+                {
+                    return IoUringTrackedOperationRemoveResult.NotFound;
+                }
+
+                if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation)
+                {
+                    return IoUringTrackedOperationRemoveResult.NotFound;
+                }
+
+                if (expectedOperation is not null && !ReferenceEquals(currentOperation, expectedOperation))
+                {
+                    return IoUringTrackedOperationRemoveResult.Mismatch;
+                }
+
+                if (Interlocked.CompareExchange(ref entry.TrackedOperation, null, currentOperation) != currentOperation)
+                {
+                    continue;
+                }
+
+                // Volatile.Write ensures the generation reset is visible on ARM64 before
+                // the count decrement. This method runs from worker threads (cancellation),
+                // and a plain store could reorder past Interlocked.Decrement, leaving a
+                // window where the event loop sees TrackedOperation == null but generation != 0.
+                Volatile.Write(ref entry.TrackedOperationGeneration, 0UL);
+                Interlocked.Decrement(ref _trackedIoUringOperationCount);
+                removedOperation = currentOperation;
+                // Debug-only: untrack here models the Submitted -> Canceled edge.
+                AssertIoUringLifecycleTransition(
+                    IoUringOperationLifecycleState.Submitted,
+                    IoUringOperationLifecycleState.Canceled);
+                return IoUringTrackedOperationRemoveResult.Removed;
+            }
+        }
+
+ /// <summary>Returns true when no io_uring operations are currently tracked.</summary>
+ /// <remarks>Volatile.Read makes decrements from worker-thread untracking visible here.</remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsIoUringTrackingEmpty() =>
+ Volatile.Read(ref _trackedIoUringOperationCount) == 0;
+
+ /// <summary>Increments the tracked-operation count; event-loop thread only.</summary>
+ /// <remarks>
+ /// Must use Interlocked even though increments are single-writer (event loop):
+ /// worker threads decrement the same field via Interlocked.Decrement in
+ /// TryUntrackTrackedIoUringOperation, so a plain read + Volatile.Write here can
+ /// lose a decrement that lands between the read and the write (lost update),
+ /// leaving the count permanently non-zero and IsIoUringTrackingEmpty() never true.
+ /// </remarks>
+ private void IncrementTrackedIoUringOperationCountOnEventLoop()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Tracked-operation increments must run on the event-loop thread.");
+ Interlocked.Increment(ref _trackedIoUringOperationCount);
+ }
+
+ /// <summary>Decrements the tracked-operation count; event-loop thread only.</summary>
+ /// <remarks>
+ /// Must use Interlocked: worker threads concurrently decrement this field via
+ /// Interlocked.Decrement in TryUntrackTrackedIoUringOperation, so a plain
+ /// read + Volatile.Write can lose a concurrent update and corrupt the count.
+ /// </remarks>
+ private void DecrementTrackedIoUringOperationCountOnEventLoop()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Tracked-operation decrements must run on the event-loop thread.");
+ int nextCount = Interlocked.Decrement(ref _trackedIoUringOperationCount);
+ Debug.Assert(nextCount >= 0, "Tracked-operation count underflow.");
+ }
+
+ /// <summary>Removes an operation from completion-slot tracking, logging on mismatch.</summary>
+ /// <returns>
+ /// False only on Mismatch (token ownership confusion). NotFound is deliberately
+ /// treated as success: the slot may already have been cleared by a racing completion.
+ /// </returns>
+ internal bool TryUntrackIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation? expectedOperation = null)
+ {
+ IoUringTrackedOperationRemoveResult removeResult = TryUntrackTrackedIoUringOperation(userData, expectedOperation, out _);
+ if (removeResult == IoUringTrackedOperationRemoveResult.Mismatch)
+ {
+ Debug.Fail("io_uring tracked operation mismatch while untracking user_data.");
+ Interlocked.Increment(ref _ioUringUntrackMismatchCount);
+
+ return false;
+ }
+
+ return true;
+ }
+
+ /// <summary>Attempts to replace the currently tracked operation for an existing user_data slot.</summary>
+ internal bool TryReplaceIoUringTrackedOperation(ulong userData, SocketAsyncContext.AsyncOperation newOperation)
+ {
+ // Replacement keeps the same slot+generation token; completion ownership is still
+ // resolved by the CompareExchange gate in TryTakeTrackedIoUringOperation.
+ return TryReplaceTrackedIoUringOperation(userData, newOperation);
+ }
+
+ /// <summary>
+ /// Enqueues a user_data for ASYNC_CANCEL on the event loop thread.
+ /// Length accounting uses increment-then-check: the Interlocked.Increment reserves a
+ /// slot against the capacity bound and is rolled back on any failed attempt.
+ /// </summary>
+ private IoUringCancellationEnqueueResult TryEnqueueIoUringCancellation(ulong userData)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || userData == 0 || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return IoUringCancellationEnqueueResult.Failed;
+ }
+
+ MpscQueue? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is null)
+ {
+ return IoUringCancellationEnqueueResult.Failed;
+ }
+
+ // First attempt: enqueue directly.
+ long queueLength = Interlocked.Increment(ref _ioUringCancelQueueLength);
+ if (queueLength <= s_ioUringCancellationQueueCapacity)
+ {
+ if (cancelQueue.TryEnqueue(userData))
+ {
+ return IoUringCancellationEnqueueResult.Enqueued;
+ }
+
+ // Enqueue failed; release the reserved length slot.
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ }
+ else
+ {
+ // Over capacity; release the reservation before the retry path.
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ }
+
+ // Queue-full can be transient under cancellation bursts. Wake the event loop and retry once.
+#if DEBUG
+ // Keep a dedicated test counter so functional tests can verify the wake-and-retry path.
+ Interlocked.Increment(ref _testCancelQueueWakeRetryCount);
+#endif
+ WakeEventLoop();
+ // Retry while SpinWait remains in active-spin mode; once it would yield, take slow-path accounting.
+ SpinWait retryBackoff = default;
+ do
+ {
+ retryBackoff.SpinOnce();
+
+ queueLength = Interlocked.Increment(ref _ioUringCancelQueueLength);
+ if (queueLength <= s_ioUringCancellationQueueCapacity)
+ {
+ if (cancelQueue.TryEnqueue(userData))
+ {
+ return IoUringCancellationEnqueueResult.EnqueuedAndWoke;
+ }
+
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ continue;
+ }
+
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ } while (!retryBackoff.NextSpinWillYield);
+
+ // Persistent overflow: count it and surface via telemetry; caller treats as Failed.
+ Interlocked.Increment(ref _ioUringCancelQueueOverflowCount);
+ SocketsTelemetry.Log.IoUringCancellationQueueOverflow();
+
+ return IoUringCancellationEnqueueResult.Failed;
+ }
+
+ /// <summary>
+ /// Writes an ASYNC_CANCEL SQE directly if the engine is on the event loop thread.
+ /// Returns false when the port is not io_uring, user_data is 0, or no SQE slot
+ /// could be acquired; the SQE is only prepared, not submitted, by this call.
+ /// </summary>
+ private bool TryQueueIoUringAsyncCancel(ulong userData)
+ {
+ if (!_ioUringCapabilities.IsIoUringPort || userData == 0)
+ {
+ return false;
+ }
+
+ if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out _))
+ {
+ return false;
+ }
+
+ WriteAsyncCancelSqe(sqe, userData);
+ return true;
+ }
+
+ /// <summary>Writes to the eventfd to wake the event loop from a blocking wait.</summary>
+ /// <returns>The shim's error result; SUCCESS when the eventfd write succeeded.</returns>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private Interop.Error ManagedWakeEventLoop()
+ {
+ return Interop.Sys.IoUringShimWriteEventFd(_ringState.WakeupEventFd);
+ }
+
+ /// <summary>
+ /// Returns the completion-wait timeout: the shorter fallback timeout while the
+ /// wake-failure circuit breaker is tripped (so missed wakes self-heal via timeout),
+ /// otherwise the normal bounded wait timeout.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private long GetManagedCompletionWaitTimeoutNanos()
+ {
+ return Volatile.Read(ref _ioUringWakeFailureConsecutiveCount) >= IoUringWakeFailureCircuitBreakerThreshold
+ ? IoUringConstants.WakeFailureFallbackWaitTimeoutNanos
+ : IoUringConstants.BoundedWaitTimeoutNanos;
+ }
+
+ /// <summary>Sends a wake signal to the event loop thread.</summary>
+ /// <remarks>
+ /// Write coalescing is handled by the kernel's eventfd mechanism: multiple write(1)
+ /// calls accumulate in the counter, read() drains it, and the multishot POLL_ADD on
+ /// the eventfd fires once per 0-to-nonzero transition. No application-level
+ /// coalescing is needed.
+ /// </remarks>
+ private void WakeEventLoop()
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return;
+ }
+
+ // Advance the wakeup generation so the event loop's post-wake drain loop
+ // detects work enqueued during the blocking syscall.
+ Interlocked.Increment(ref _ioUringWakeupGeneration);
+
+ Interop.Error error = ManagedWakeEventLoop();
+ if (error == Interop.Error.SUCCESS)
+ {
+ // Successful wake resets the circuit-breaker failure streak.
+ Interlocked.Exchange(ref _ioUringWakeFailureConsecutiveCount, 0);
+
+ return;
+ }
+
+ // Failed wake feeds the circuit breaker consulted by GetManagedCompletionWaitTimeoutNanos.
+ Interlocked.Increment(ref _ioUringWakeFailureConsecutiveCount);
+ }
+
+ /// <summary>
+ /// Wakes the io_uring event loop to process deferred cancel CQEs produced by
+ /// shutdown/disconnect during SafeSocketHandle.CloseAsIs. With DEFER_TASKRUN,
+ /// these CQEs are queued as task work and only processed during io_uring_enter.
+ /// </summary>
+ partial void LinuxWakeIoUringEventLoopForSocketClose()
+ {
+ WakeEventLoop();
+ }
+
+ /// <summary>Enqueues a cancellation request and wakes the event loop when needed.</summary>
+ /// <remarks>
+ /// Best-effort: a Failed enqueue is silently dropped. EnqueuedAndWoke already woke
+ /// the loop inside TryEnqueueIoUringCancellation, so only Enqueued triggers a wake here.
+ /// </remarks>
+ internal void TryRequestIoUringCancellation(ulong userData)
+ {
+ IoUringCancellationEnqueueResult enqueueResult = TryEnqueueIoUringCancellation(userData);
+ if (enqueueResult == IoUringCancellationEnqueueResult.Failed)
+ {
+ return;
+ }
+
+ if (enqueueResult == IoUringCancellationEnqueueResult.Enqueued)
+ {
+ WakeEventLoop();
+ }
+ }
+
+ /// <summary>
+ /// Enqueues a readiness fallback event when io_uring submission is congested.
+ /// No-op when <paramref name="events"/> is None; otherwise pushes a SocketIOEvent
+ /// and ensures a worker is scheduled to process it.
+ /// </summary>
+ internal void EnqueueReadinessFallbackEvent(
+ SocketAsyncContext context,
+ Interop.Sys.SocketEvents events,
+ bool countAsPrepareQueueOverflowFallback = false)
+ {
+ if (events == Interop.Sys.SocketEvents.None)
+ {
+ return;
+ }
+
+ _eventQueue.Enqueue(new SocketIOEvent(context, events));
+ if (countAsPrepareQueueOverflowFallback)
+ {
+ // Separate counter distinguishes prepare-queue-overflow fallbacks in diagnostics.
+ Interlocked.Increment(ref _ioUringPrepareQueueOverflowFallbackCount);
+ }
+ EnsureWorkerScheduled();
+ }
+
+ /// <summary>
+ /// Drains queued cancellation requests into ASYNC_CANCEL SQEs, up to an adaptive budget.
+ /// Returns true when at least one SQE was prepared (submission happens later).
+ /// </summary>
+ private bool DrainIoUringCancellationQueue()
+ {
+ MpscQueue? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is null)
+ {
+ return false;
+ }
+
+ int cancelDrainBudget = GetAdaptiveIoUringCancelQueueDrainBudget();
+ bool preparedSqe = false;
+ for (int drained = 0; drained < cancelDrainBudget &&
+ cancelQueue.TryDequeue(out ulong userData); drained++)
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ Debug.Assert(remainingLength >= 0);
+
+ // Cancellation requests can race with terminal completion/untracking.
+ // Skip stale requests to avoid issuing known -ENOENT async-cancel SQEs.
+ if (!IsTrackedIoUringOperation(userData))
+ {
+ continue;
+ }
+
+ if (TryQueueIoUringAsyncCancel(userData))
+ {
+ preparedSqe = true;
+ }
+ }
+ return preparedSqe;
+ }
+
+ /// <summary>
+ /// Drains queued SO_REUSEPORT shadow listener setup requests, arming multishot accept SQEs.
+ /// Unlike the cancel drain, this drains to empty (no budget). Returns true when at
+ /// least one SQE was prepared.
+ /// </summary>
+ private bool DrainReusePortShadowSetupQueue()
+ {
+ MpscQueue? queue = _reusePortShadowSetupQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ bool preparedSqe = false;
+ while (queue.TryDequeue(out ReusePortShadowSetupRequest request))
+ {
+ if (TryPrepareReusePortMultishotAccept(
+ request.ShadowSocket,
+ request.PrimaryContext,
+ request.PrimaryEngine,
+ out ulong userData))
+ {
+ preparedSqe = true;
+ // Record the armed user_data so the primary context can later cancel/track it.
+ request.PrimaryContext.RecordReusePortShadowArmed(userData, _engineIndex);
+ }
+ }
+ return preparedSqe;
+ }
+
+ /// <summary>Computes the prepare-queue drain budget from the current queue length,
+ /// clamped to [MinIoUringPrepareQueueDrainPerSubmit, MaxIoUringPrepareQueueDrainPerSubmit].</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private int GetAdaptiveIoUringPrepareQueueDrainBudget()
+ {
+ long observedLength = Interlocked.Read(ref _ioUringPrepareQueueLength);
+ return ComputeAdaptiveIoUringDrainBudget(
+ observedLength,
+ MinIoUringPrepareQueueDrainPerSubmit,
+ MaxIoUringPrepareQueueDrainPerSubmit);
+ }
+
+ /// <summary>Computes the cancel-queue drain budget from the current queue length,
+ /// clamped to [MinIoUringCancelQueueDrainPerSubmit, MaxIoUringCancelQueueDrainPerSubmit].</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private int GetAdaptiveIoUringCancelQueueDrainBudget()
+ {
+ long observedLength = Interlocked.Read(ref _ioUringCancelQueueLength);
+ return ComputeAdaptiveIoUringDrainBudget(
+ observedLength,
+ MinIoUringCancelQueueDrainPerSubmit,
+ MaxIoUringCancelQueueDrainPerSubmit);
+ }
+
+ /// <summary>
+ /// Clamps an observed queue length into [minBudget, maxBudget]: drain everything
+ /// when the backlog fits in a budget, otherwise cap per-pass work at maxBudget.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static int ComputeAdaptiveIoUringDrainBudget(long observedLength, int minBudget, int maxBudget)
+ {
+ if (observedLength <= minBudget)
+ {
+ return minBudget;
+ }
+
+ if (observedLength >= maxBudget)
+ {
+ return maxBudget;
+ }
+
+ // minBudget < observedLength < maxBudget, so the narrowing cast is safe.
+ return (int)observedLength;
+ }
+
+ /// <summary>
+ /// Drains prepare/cancel queues and optionally submits pending SQEs.
+ /// Event-loop thread only (SINGLE_ISSUER contract).
+ /// </summary>
+ /// <param name="submitPendingSqes">When false, prepares SQEs but defers the io_uring_enter call.</param>
+ /// <param name="wakeEventLoopOnBacklog">When true and budget-limited drains left work queued, self-wakes to continue.</param>
+ private Interop.Error SubmitIoUringBatch(
+ bool submitPendingSqes = true,
+ bool wakeEventLoopOnBacklog = true)
+ {
+ if (!_ioUringCapabilities.IsIoUringPort)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "SubmitIoUringBatch must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ bool preparedSqe = false;
+ if (_ioUringCapabilities.IsCompletionMode)
+ {
+ preparedSqe |= DrainIoUringCancellationQueue();
+ preparedSqe |= DrainReusePortShadowSetupQueue();
+
+ MpscQueue? prepareQueue = _ioUringPrepareQueue;
+ if (prepareQueue is null)
+ {
+ ThrowInternalException("io_uring invariant violation: prepare queue is null while engine is in completion mode");
+ }
+
+ int prepareDrainBudget = GetAdaptiveIoUringPrepareQueueDrainBudget();
+ for (int drained = 0; drained < prepareDrainBudget &&
+ prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem); drained++)
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+ Debug.Assert(remainingLength >= 0);
+ Interop.Error prepareError = TryPrepareAndTrackIoUringOperation(
+ workItem.Operation,
+ workItem.PrepareSequence,
+ out bool preparedOperation);
+ if (prepareError != Interop.Error.SUCCESS)
+ {
+ return prepareError;
+ }
+
+ preparedSqe |= preparedOperation;
+
+ if (!preparedOperation && workItem.Operation.IsInCompletedState())
+ {
+ // Operation completed from early-buffer data during prepare (no SQE needed).
+ // Dispatch the completion callback now.
+ workItem.Operation.AssociatedContext.TryCompleteIoUringOperation(workItem.Operation);
+ continue;
+ }
+
+ if (!preparedOperation && workItem.Operation.IsInWaitingState())
+ {
+ // In completion mode, keep retries in the io_uring prepare queue.
+ // Synthetic readiness fallback can self-amplify into hot loops that
+ // do not produce kernel send/recv progress.
+ bool requeued = workItem.Operation.TryQueueIoUringPreparation();
+ if (requeued)
+ {
+ continue;
+ }
+
+ // Re-queue failed; reset retry accounting and let the next loop pass pick it up.
+ workItem.Operation.ResetIoUringSlotExhaustionRetryCount();
+ WakeEventLoop();
+ continue;
+ }
+ }
+
+ }
+
+ if (!preparedSqe)
+ {
+ // Inline re-prepare paths can write SQEs outside queue drains; ensure they are submitted.
+ if (_ioUringManagedPendingSubmissions != 0)
+ {
+ if (!submitPendingSqes)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ return SubmitIoUringOperationsNormalized();
+ }
+
+ // Budget-limited drains may have left backlog; self-wake so it is processed soon.
+ if (wakeEventLoopOnBacklog &&
+ ((_ioUringCancelQueue?.IsEmpty == false) || (_ioUringPrepareQueue?.IsEmpty == false)))
+ {
+ WakeEventLoop();
+ }
+
+ return Interop.Error.SUCCESS;
+ }
+
+ return submitPendingSqes
+ ? SubmitIoUringOperationsNormalized()
+ : Interop.Error.SUCCESS;
+ }
+
+ /// <summary>
+ /// Prepares an operation for io_uring submission and tracks it in completion-slot metadata.
+ /// On non-prepared paths, clears operation user_data and releases preparation resources.
+ /// </summary>
+ /// <param name="preparedSqe">True when an SQE was written and the operation is now tracked.</param>
+ /// <returns>SUCCESS in all non-throwing paths; invariant violations throw instead.</returns>
+ private Interop.Error TryPrepareAndTrackIoUringOperation(
+ SocketAsyncContext.AsyncOperation operation,
+ long prepareSequence,
+ out bool preparedSqe)
+ {
+ preparedSqe = false;
+
+ bool prepared = operation.TryPrepareIoUring(operation.AssociatedContext, prepareSequence);
+ if (prepared)
+ {
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Queued,
+ IoUringOperationLifecycleState.Prepared);
+ }
+
+ if (prepared && operation.ErrorCode == SocketError.Success)
+ {
+ preparedSqe = true;
+ if (!TryTrackPreparedIoUringOperation(operation))
+ {
+ // Invariant violation: tracking collision after prepare.
+ // A prepared SQE may now complete without a managed owner; do not attempt best-effort recovery.
+ operation.ClearIoUringUserData();
+ ThrowInternalException("io_uring tracking collision: prepared SQE could not be tracked by user_data");
+ }
+
+ return Interop.Error.SUCCESS;
+ }
+
+ // prepared but ErrorCode != Success: the SQE exists yet the operation is detached.
+ if (prepared)
+ {
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Prepared,
+ IoUringOperationLifecycleState.Detached);
+ }
+
+ if (!TryUntrackIoUringOperation(operation.IoUringUserData, operation))
+ {
+ // Mismatch indicates token ownership confusion; avoid releasing
+ // resources that may still be associated with another tracked op.
+ ThrowInternalException("io_uring untrack mismatch: token ownership confusion during prepare cleanup");
+ }
+
+ operation.ClearIoUringUserData();
+ return Interop.Error.SUCCESS;
+ }
+
+ /// <summary>
+ /// Falls back to readiness notification for an operation that remained waiting after
+ /// a failed prepare attempt. No-op when the operation maps to no fallback events.
+ /// </summary>
+ private void EmitReadinessFallbackForUnpreparedOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+ operation.ClearIoUringUserData();
+ Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents();
+ if (fallbackEvents == Interop.Sys.SocketEvents.None)
+ {
+ return;
+ }
+
+ // Mark the operation so the readiness path knows to re-prepare via io_uring later.
+ operation.RequestIoUringFallbackReprepare();
+ EnqueueReadinessFallbackEvent(operation.AssociatedContext, fallbackEvents);
+ }
+
+ /// <summary>
+ /// Registers a prepared operation in completion-slot metadata.
+ /// Event-loop thread only; it is the sole writer that installs new entries.
+ /// </summary>
+ private bool TryTrackPreparedIoUringOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryTrackPreparedIoUringOperation must run on the event-loop thread.");
+ if (!TryDecodeTrackedIoUringUserData(operation.IoUringUserData, out int slotIndex, out ulong generation))
+ {
+ return false;
+ }
+
+ ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+ if (Volatile.Read(ref entry.TrackedOperationGeneration) == 0 &&
+ Volatile.Read(ref entry.TrackedOperation) is null)
+ {
+ // Publish generation before operation so readers never observe a new
+ // operation paired with a stale generation on weakly-ordered CPUs.
+ Volatile.Write(ref entry.TrackedOperationGeneration, generation);
+ Volatile.Write(ref entry.TrackedOperation, operation);
+ IncrementTrackedIoUringOperationCountOnEventLoop();
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Prepared,
+ IoUringOperationLifecycleState.Submitted);
+ return true;
+ }
+
+ // Empty slot still carrying our generation: clean up the stale token.
+ if (Volatile.Read(ref entry.TrackedOperation) is null &&
+ Volatile.Read(ref entry.TrackedOperationGeneration) == generation)
+ {
+ Volatile.Write(ref entry.TrackedOperationGeneration, 0);
+ }
+
+ // Persistent multishot receive can rebind an existing tracked user_data to a new
+ // managed operation before this call. In that case, tracking is already satisfied.
+ return operation.IoUringUserData != 0 &&
+ TryGetTrackedIoUringOperation(operation.IoUringUserData, out SocketAsyncContext.AsyncOperation? trackedOperation) &&
+ ReferenceEquals(trackedOperation, operation);
+ }
+
+ /// <summary>Returns whether the given user_data is currently tracked.</summary>
+ private bool IsTrackedIoUringOperation(ulong userData)
+ {
+ return ContainsTrackedIoUringOperation(userData);
+ }
+
+ /// <summary>
+ /// Returns whether current completion-slot usage indicates likely slot exhaustion
+ /// pressure, i.e. fewer than 16 slots remain free in the pool.
+ /// </summary>
+ private bool IsPotentialCompletionSlotExhaustion()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null || completionEntries.Length == 0)
+ {
+ return false;
+ }
+
+ // Math.Max guards pools smaller than the 16-slot headroom.
+ int threshold = Math.Max(0, completionEntries.Length - 16);
+ return _completionSlotsInUse >= threshold;
+ }
+
+ /// <summary>Debug assertion that tracked completion-slot usage never exceeds pool bounds.</summary>
+ [Conditional("DEBUG")]
+ private void AssertCompletionSlotUsageBounded()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ Debug.Assert(
+ _completionSlotsInUse == 0,
+ "Completion slot usage must be zero when the slot pool is not allocated.");
+ return;
+ }
+
+ Debug.Assert(
+ _completionSlotsInUse >= 0 && _completionSlotsInUse <= completionEntries.Length,
+ $"Completion slot usage out of bounds: inUse={_completionSlotsInUse}, capacity={completionEntries.Length}.");
+ }
+
+ /// <summary>
+ /// Debug assertion that completion-slot free-list topology matches the in-use count:
+ /// walks the free list (cycle-guarded via a visited set) and checks
+ /// capacity - freeCount == _completionSlotsInUse.
+ /// </summary>
+ [Conditional("DEBUG")]
+ private void AssertCompletionSlotPoolConsistency()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ Debug.Assert(_completionSlotsInUse == 0, "Completion slot usage must be zero when slots are not allocated.");
+ Debug.Assert(_completionSlotFreeListHead == -1, "Free-list head must be reset when slots are not allocated.");
+ return;
+ }
+
+ bool[] visited = new bool[completionEntries.Length];
+ int freeCount = 0;
+ int current = _completionSlotFreeListHead;
+ while (current >= 0)
+ {
+ Debug.Assert(
+ (uint)current < (uint)completionEntries.Length,
+ $"Completion-slot free-list index out of range: {current}.");
+ // Break on out-of-range or revisit so a corrupted list cannot loop forever.
+ if ((uint)current >= (uint)completionEntries.Length || visited[current])
+ {
+ break;
+ }
+
+ visited[current] = true;
+ freeCount++;
+ current = completionEntries[current].FreeListNext;
+ }
+
+ int expectedInUse = completionEntries.Length - freeCount;
+ Debug.Assert(
+ expectedInUse == _completionSlotsInUse,
+ $"Completion-slot accounting mismatch: expected in-use={expectedInUse}, actual in-use={_completionSlotsInUse}, free={freeCount}, capacity={completionEntries.Length}.");
+ }
+
+ /// <summary>Returns whether the calling thread is the event loop thread.</summary>
+ /// <remarks>Volatile.Read so checks from worker threads see the published thread id.</remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsCurrentThreadEventLoopThread() =>
+ Volatile.Read(ref _eventLoopManagedThreadId) == Environment.CurrentManagedThreadId;
+
+ /// <summary>Disables the registered ring fd after an EINVAL and falls back to the raw ring fd.</summary>
+ /// <remarks>-1 sentinel makes ResolveRingFd skip the EnterRegisteredRing flag from then on.</remarks>
+ private void DisableRegisteredRingFd()
+ {
+ _ioUringSqRingInfo.RegisteredRingFd = -1;
+ }
+
+ /// <summary>
+ /// Resolves the effective ring fd and enter flags, preferring the registered ring fd
+ /// (plus the EnterRegisteredRing flag) when one is available.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ResolveRingFd(ref int ringFd, ref uint enterFlags)
+ {
+ ringFd = _ringState.RingFd;
+ if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+ {
+ enterFlags |= IoUringConstants.EnterRegisteredRing;
+ ringFd = _ioUringSqRingInfo.RegisteredRingFd;
+ }
+ }
+
+ /// <summary>
+ /// Calls io_uring_enter, automatically falling back from the registered ring fd to the raw
+ /// ring fd on EINVAL. This consolidates the retry pattern used at every enter call site.
+ /// The fast path is inlined; the fallback lives in a NoInlining slow method.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe Interop.Error IoUringEnterWithFallback(
+ ref int ringFd, uint toSubmit, uint minComplete, ref uint enterFlags, out int result)
+ {
+ Interop.Error err = Interop.Sys.IoUringShimEnter(ringFd, toSubmit, minComplete, enterFlags, &result);
+ if (err == Interop.Error.EINVAL && (enterFlags & IoUringConstants.EnterRegisteredRing) != 0)
+ {
+ err = IoUringEnterWithFallbackSlow(ref ringFd, toSubmit, minComplete, ref enterFlags, &result);
+ }
+ return err;
+ }
+
+ /// <summary>
+ /// Slow path for <see cref="IoUringEnterWithFallback"/>: permanently disables the
+ /// registered ring fd, strips the flag, and retries the enter against the raw ring fd.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe Interop.Error IoUringEnterWithFallbackSlow(
+ ref int ringFd, uint toSubmit, uint minComplete, ref uint enterFlags, int* result)
+ {
+ DisableRegisteredRingFd();
+ enterFlags &= ~IoUringConstants.EnterRegisteredRing;
+ ringFd = _ringState.RingFd;
+ return Interop.Sys.IoUringShimEnter(ringFd, toSubmit, minComplete, enterFlags, result);
+ }
+
+ /// <summary>
+ /// Calls io_uring_enter with EXT_ARG, automatically falling back from the registered
+ /// ring fd to the raw ring fd on EINVAL. EXT_ARG variant of IoUringEnterWithFallback.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe Interop.Error IoUringEnterExtWithFallback(
+ ref int ringFd, uint toSubmit, uint minComplete, ref uint enterFlags,
+ Interop.Sys.IoUringGeteventsArg* extArg, out int result)
+ {
+ Interop.Error err = Interop.Sys.IoUringShimEnterExt(ringFd, toSubmit, minComplete, enterFlags, extArg, &result);
+ if (err == Interop.Error.EINVAL && (enterFlags & IoUringConstants.EnterRegisteredRing) != 0)
+ {
+ err = IoUringEnterExtWithFallbackSlow(ref ringFd, toSubmit, minComplete, ref enterFlags, extArg, &result);
+ }
+ return err;
+ }
+
+ /// <summary>
+ /// Slow path for <see cref="IoUringEnterExtWithFallback"/>: disables the registered
+ /// ring fd, strips the flag, and retries the EXT_ARG enter against the raw ring fd.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe Interop.Error IoUringEnterExtWithFallbackSlow(
+ ref int ringFd, uint toSubmit, uint minComplete, ref uint enterFlags,
+ Interop.Sys.IoUringGeteventsArg* extArg, int* result)
+ {
+ DisableRegisteredRingFd();
+ enterFlags &= ~IoUringConstants.EnterRegisteredRing;
+ ringFd = _ringState.RingFd;
+ return Interop.Sys.IoUringShimEnterExt(ringFd, toSubmit, minComplete, enterFlags, extArg, result);
+ }
+
+ /// <summary>
+ /// Completes rejected-but-published SQEs as failed completions so ignored submit
+ /// errors do not re-queue the same work indefinitely. Walks the last
+ /// <paramref name="rejectedSubmitCount"/> SQ ring entries (they still contain our
+ /// user_data) and dispatches each reserved-completion entry with the mapped errno.
+ /// Event-loop thread only.
+ /// </summary>
+ private unsafe void DrainRejectedManagedSqesAsFailedCompletions(uint rejectedSubmitCount, Interop.Error submitError)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "DrainRejectedManagedSqesAsFailedCompletions must run on the event-loop thread.");
+ if (rejectedSubmitCount == 0)
+ {
+ return;
+ }
+
+ ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo;
+ // Bail if the ring mapping looks invalid; walking it would fault.
+ if (ringInfo.SqeBase == IntPtr.Zero || ringInfo.SqEntries == 0 || ringInfo.SqeSize < (uint)sizeof(IoUringSqe))
+ {
+ return;
+ }
+
+ // CQE-style result: negative platform errno for the failed submit.
+ int completionResult = -Interop.Sys.ConvertErrorPalToPlatform(submitError);
+ uint firstRejectedSqTail = _ioUringManagedSqTail - rejectedSubmitCount;
+ SocketEventHandler handler = new SocketEventHandler(this);
+ bool enqueuedFallbackEvent = false;
+
+ for (uint i = 0; i < rejectedSubmitCount; i++)
+ {
+ uint sqTail = firstRejectedSqTail + i;
+ uint ringIndex = sqTail & ringInfo.SqMask;
+ nint sqeOffset = checked((nint)((nuint)ringIndex * ringInfo.SqeSize));
+ IoUringSqe* sqe = (IoUringSqe*)((byte*)ringInfo.SqeBase + sqeOffset);
+ ulong sqeUserData = sqe->UserData;
+ byte tag = (byte)(sqeUserData >> IoUringUserDataTagShift);
+
+ if (tag == IoUringConstants.TagReservedCompletion)
+ {
+ int sqeCompletionResult = completionResult;
+ ulong payload = sqeUserData & IoUringUserDataPayloadMask;
+ ResolveReservedCompletionSlotMetadata(
+ payload,
+ isMultishotCompletion: false,
+ ref sqeCompletionResult,
+ out int completionSocketAddressLen,
+ out int completionControlBufferLen,
+ out uint completionAuxiliaryData,
+ out bool hasFixedRecvBuffer,
+ out ushort fixedRecvBufferId,
+ out bool shouldFreeSlot);
+
+ handler.DispatchSingleIoUringCompletion(
+ sqeUserData,
+ sqeCompletionResult,
+ flags: 0,
+ socketAddressLen: completionSocketAddressLen,
+ controlBufferLen: completionControlBufferLen,
+ auxiliaryData: completionAuxiliaryData,
+ hasFixedRecvBuffer: hasFixedRecvBuffer,
+ fixedRecvBufferId: fixedRecvBufferId,
+ ref enqueuedFallbackEvent);
+
+ // Mirror the normal CQE dispatch ownership model: free slot only
+ // after dispatch has taken/reconciled tracked operation ownership.
+ if (shouldFreeSlot)
+ {
+ FreeCompletionSlot(DecodeCompletionSlotIndex(payload));
+ }
+ }
+ else if (tag != IoUringConstants.TagNone && tag != IoUringConstants.TagWakeupSignal)
+ {
+ Debug.Fail($"Unexpected io_uring SQE user_data tag on rejected submit drain: {tag}.");
+ }
+ }
+
+ if (enqueuedFallbackEvent)
+ {
+ EnsureWorkerScheduled();
+ }
+ }
+
+ /// <summary>
+ /// Returns the accepted SQE count from an io_uring_enter result, clamped to the
+ /// requested submit count; 0 for error/zero results.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ComputeAcceptedSubmissionCount(uint requestedSubmitCount, int enterResult)
+ {
+ if (requestedSubmitCount == 0 || enterResult <= 0)
+ {
+ return 0;
+ }
+
+ // enterResult > 0 here, so the cast to uint is lossless.
+ uint acceptedSubmitCount = (uint)enterResult;
+ return acceptedSubmitCount <= requestedSubmitCount ? acceptedSubmitCount : requestedSubmitCount;
+ }
+
+ /// <summary>
+ /// Updates pending-submission accounting after an io_uring_enter wait call:
+ /// anything the kernel did not accept stays pending for the next submit pass.
+ /// </summary>
+ private void UpdateManagedPendingSubmissionCountAfterEnter(uint requestedSubmitCount, int enterResult)
+ {
+ if (_sqPollEnabled)
+ {
+ // SQPOLL consumes published SQEs asynchronously after wakeup.
+ _ioUringManagedPendingSubmissions = 0;
+ return;
+ }
+
+ uint acceptedSubmitCount = ComputeAcceptedSubmissionCount(requestedSubmitCount, enterResult);
+ uint rejectedSubmitCount = requestedSubmitCount - acceptedSubmitCount;
+ Debug.Assert(
+ acceptedSubmitCount + rejectedSubmitCount == requestedSubmitCount,
+ "Partial-submit accounting mismatch in io_uring wait path.");
+ _ioUringManagedPendingSubmissions = rejectedSubmitCount;
+ }
+
+ /// <summary>
+ /// Submits the specified number of pending SQEs via io_uring_enter, looping until
+ /// all are accepted or an error occurs. Event-loop thread only (SINGLE_ISSUER).
+ /// Under SQPOLL, only issues a wakeup when the kernel poller needs one.
+ /// </summary>
+ /// <param name="acceptedSubmitCount">Number of SQEs the kernel accepted (or assumed accepted under SQPOLL).</param>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe Interop.Error ManagedSubmitPendingEntries(
+ uint toSubmit,
+ out uint acceptedSubmitCount)
+ {
+ acceptedSubmitCount = 0;
+ if (toSubmit == 0)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "ManagedSubmitPendingEntries must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ // DEBUG hook: tests can inject a one-shot submit error here.
+ if (TryConsumeDebugForcedSubmitError(out Interop.Error forcedSubmitError))
+ {
+ return forcedSubmitError;
+ }
+
+ if (_sqPollEnabled)
+ {
+ if (!SqNeedWakeup())
+ {
+ // Kernel poller is awake; published SQEs will be consumed without a syscall.
+ SocketsTelemetry.Log.IoUringSqPollSubmissionSkipped(toSubmit);
+ acceptedSubmitCount = toSubmit;
+ return Interop.Error.SUCCESS;
+ }
+
+ uint wakeupFlags = IoUringConstants.EnterSqWakeup;
+ int wakeupRingFd = 0;
+ ResolveRingFd(ref wakeupRingFd, ref wakeupFlags);
+
+ // Wakeup accounting is intentionally optimistic: this counter tracks wake requests
+ // issued by managed code, not guaranteed kernel-side SQ consumption.
+ SocketsTelemetry.Log.IoUringSqPollWakeup();
+ Interop.Error wakeupError = IoUringEnterWithFallback(
+ ref wakeupRingFd, 0, 0, ref wakeupFlags, out _);
+
+ if (wakeupError == Interop.Error.SUCCESS)
+ {
+ acceptedSubmitCount = toSubmit;
+ }
+
+ return wakeupError;
+ }
+
+ uint enterFlags = 0;
+ int ringFd = 0;
+ ResolveRingFd(ref ringFd, ref enterFlags);
+
+ // Non-SQPOLL: loop until all requested SQEs are accepted (partial submits are legal).
+ while (toSubmit > 0)
+ {
+ Interop.Error err = IoUringEnterWithFallback(
+ ref ringFd, toSubmit, 0, ref enterFlags, out int result);
+
+ if (err != Interop.Error.SUCCESS)
+ return err;
+
+ uint acceptedThisCall = ComputeAcceptedSubmissionCount(toSubmit, result);
+ if (acceptedThisCall == 0)
+ {
+ // Kernel made no progress; report EAGAIN rather than spinning.
+ return Interop.Error.EAGAIN;
+ }
+
+ acceptedSubmitCount += acceptedThisCall;
+ toSubmit -= acceptedThisCall;
+ }
+ return Interop.Error.SUCCESS;
+ }
+
+ /// <summary>
+ /// Computes pending submissions and calls ManagedSubmitPendingEntries; on recoverable
+ /// submit errors, drains rejected SQEs as per-operation failed completions so the
+ /// engine survives. Event-loop thread only (SINGLE_ISSUER).
+ /// </summary>
+ /// <returns>EFAULT when the SQ ring is corrupted (caller FailFasts); SUCCESS otherwise.</returns>
+ private Interop.Error SubmitIoUringOperationsNormalized()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "SubmitIoUringOperationsNormalized must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ PublishManagedSqeTail();
+ uint managedPending = _ioUringManagedPendingSubmissions;
+ _ioUringManagedPendingSubmissions = 0;
+
+ Interop.Error error = ManagedSubmitPendingEntries(managedPending, out uint acceptedSubmitCount);
+ uint rejectedSubmitCount = managedPending - acceptedSubmitCount;
+ Debug.Assert(
+ acceptedSubmitCount + rejectedSubmitCount == managedPending,
+ "Partial-submit accounting mismatch in io_uring submit path.");
+
+ // EFAULT indicates corrupted SQ ring memory; propagate to FailFast.
+ // All other errors drain rejected SQEs as failed completions so individual
+ // operations receive error callbacks and the engine survives.
+ bool fatalSubmitError = error == Interop.Error.EFAULT;
+ if (error != Interop.Error.SUCCESS && rejectedSubmitCount != 0 && !fatalSubmitError)
+ {
+ DrainRejectedManagedSqesAsFailedCompletions(rejectedSubmitCount, error);
+ }
+
+ return fatalSubmitError ? error : Interop.Error.SUCCESS;
+ }
+
+ /// <summary>
+ /// Cancels all queued-but-not-submitted operations during teardown: empties the
+ /// prepare queue (cancelling each operation) and discards the cancel queue,
+ /// keeping both length counters consistent.
+ /// </summary>
+ private void DrainQueuedIoUringOperationsForTeardown()
+ {
+ MpscQueue? prepareQueue = _ioUringPrepareQueue;
+ if (prepareQueue is not null)
+ {
+ while (prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem))
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+ Debug.Assert(remainingLength >= 0);
+
+ SocketAsyncContext.AsyncOperation operation = workItem.Operation;
+ operation.CancelPendingIoUringPreparation(workItem.PrepareSequence);
+ operation.TryCancelForTeardown();
+ operation.ClearIoUringUserData();
+ }
+ }
+
+ MpscQueue? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is not null)
+ {
+ // Cancel requests are just user_data values; discard them, only fix accounting.
+ while (cancelQueue.TryDequeue(out _))
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ Debug.Assert(remainingLength >= 0);
+ }
+ }
+
+ // No reset needed for generation counter; teardown does not re-enter the wait loop.
+ }
+
+ /// <summary>
+ /// Cancels all tracked in-flight operations during teardown.
+ /// This includes any future long-lived operations (for example multishot recv).
+ /// Event-loop thread only.
+ /// </summary>
+ /// <param name="portClosedForTeardown">
+ /// True when the ring fd is already closed: native ownership is detached, so no
+ /// ASYNC_CANCEL SQEs are prepared and resources are released eagerly.
+ /// </param>
+ private void DrainTrackedIoUringOperationsForTeardown(bool portClosedForTeardown)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "DrainTrackedIoUringOperationsForTeardown must run on the event-loop thread.");
+ if (_completionSlots is null || IsIoUringTrackingEmpty())
+ {
+ return;
+ }
+
+ if (_cqOverflowRecoveryActive)
+ {
+ // Phase 1 spec branch (b): teardown preempts overflow-recovery ownership;
+ // tracked-operation drain/cancel paths become the single shutdown owner.
+ _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.Teardown;
+ _cqOverflowRecoveryActive = false;
+ }
+
+ bool queuedAsyncCancel = false;
+ bool canPrepareTeardownCancels = !portClosedForTeardown && IsCurrentThreadEventLoopThread();
+ IoUringTrackedOperationState[]? trackedOperations = _trackedOperations;
+ if (trackedOperations is null)
+ {
+ return;
+ }
+
+ // Teardown uses an explicit array walk to avoid iterator state-machine allocations.
+ for (int i = 0; i < trackedOperations.Length; i++)
+ {
+ // Exchange claims exclusive ownership of the slot against racing untrackers.
+ SocketAsyncContext.AsyncOperation? operation = Interlocked.Exchange(ref trackedOperations[i].TrackedOperation, null);
+ if (operation is null)
+ {
+ continue;
+ }
+
+ Volatile.Write(ref trackedOperations[i].TrackedOperationGeneration, 0UL);
+ DecrementTrackedIoUringOperationCountOnEventLoop();
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Detached);
+
+ ulong userData = operation.IoUringUserData;
+ if (canPrepareTeardownCancels &&
+ TryQueueIoUringAsyncCancel(userData))
+ {
+ queuedAsyncCancel = true;
+ }
+
+ // Teardown policy: if the port was already closed, native ownership has been
+ // detached and it is now safe to release operation-owned resources eagerly.
+ // Otherwise, queue best-effort async cancel before releasing resources.
+ operation.TryCancelForTeardown();
+ operation.ClearIoUringUserData();
+ }
+
+ if (canPrepareTeardownCancels && queuedAsyncCancel)
+ {
+ // Flush the prepared ASYNC_CANCEL SQEs; result intentionally ignored during teardown.
+ _ = SubmitIoUringOperationsNormalized();
+ }
+ }
+
+ /// <summary>Counts a prepare that fell back to the readiness path because its buffer was not pinnable.</summary>
+ internal void RecordIoUringNonPinnablePrepareFallback() =>
+ Interlocked.Increment(ref _ioUringNonPinnablePrepareFallbackCount);
+
+
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
index ae9b6c9095e43f..965482e6bc17e3 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
@@ -4,20 +4,22 @@
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace System.Net.Sockets
{
- internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem
+ internal sealed unsafe partial class SocketAsyncEngine : IThreadPoolWorkItem
{
- private const int EventBufferCount =
+ private const int DefaultEventBufferCount =
#if DEBUG
32;
#else
1024;
#endif
+ private static readonly int s_eventBufferCount = GetEventBufferCount();
// Socket continuations are dispatched to the ThreadPool from the event thread.
// This avoids continuations blocking the event handling.
@@ -25,9 +27,55 @@ internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem
// PreferInlineCompletions defaults to false and can be set to true using the DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS envvar.
internal static readonly bool InlineSocketCompletionsEnabled = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS") == "1";
+#if DEBUG
+ /// <summary>
+ /// Central registry of DEBUG-only io_uring test environment variables.
+ /// These switches are intentionally unsupported for production tuning.
+ /// </summary>
+ private static class IoUringTestEnvironmentVariables
+ {
+ internal const string EventBufferCount = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT";
+ internal const string QueueEntries = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_QUEUE_ENTRIES";
+ internal const string PrepareQueueCapacity = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY";
+ internal const string DirectSqe = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE";
+ internal const string ZeroCopySend = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND";
+ internal const string ForceEagainOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK";
+ internal const string ForceEcanceledOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK";
+ internal const string ForceSubmitEpermOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_SUBMIT_EPERM_ONCE";
+ internal const string ForceEnterEintrRetryLimitOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE";
+ internal const string ForceKernelVersionUnsupported = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_KERNEL_VERSION_UNSUPPORTED";
+ internal const string ForceProvidedBufferRingOomOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_PROVIDED_BUFFER_RING_OOM_ONCE";
+ internal const string ProvidedBufferSize = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE";
+ internal const string AdaptiveBufferSizing = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING";
+ internal const string RegisterBuffers = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS";
+ }
+#endif
+
+ private static int GetEventBufferCount()
+ {
+#if DEBUG
+ // Test-only knob to make wait-buffer saturation deterministic for io_uring diagnostics coverage.
+ // Only available in DEBUG builds so production code never reads test env vars.
+ if (OperatingSystem.IsLinux())
+ {
+ string? configuredValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.EventBufferCount);
+ if (configuredValue is not null &&
+ int.TryParse(configuredValue, out int parsedValue) &&
+ parsedValue >= 1 &&
+ parsedValue <= DefaultEventBufferCount)
+ {
+ return parsedValue;
+ }
+ }
+#endif
+
+ return DefaultEventBufferCount;
+ }
+
private static int GetEngineCount()
{
// The responsibility of SocketAsyncEngine is to get notifications from epoll|kqueue
+ // (or io_uring on Linux when enabled in the native shim)
// and schedule corresponding work items to ThreadPool (socket reads and writes).
//
// Using TechEmpower benchmarks that generate a LOT of SMALL socket reads and writes under a VERY HIGH load
@@ -60,16 +108,46 @@ private static int GetEngineCount()
private static readonly SocketAsyncEngine[] s_engines = CreateEngines();
private static int s_allocateFromEngine = -1;
+ private static volatile int[]? s_fdEngineAffinity;
+ private static int[]? s_cpuToEngineIndex;
+
+ internal static int EngineCount => s_engines.Length;
+
+ internal static SocketAsyncEngine GetEngineByIndex(int index) => s_engines[index];
+
+ internal static int GetEngineIndexForCpu(int cpuIndex)
+ {
+ int[]? cpuToEngineIndex = s_cpuToEngineIndex;
+ if (cpuToEngineIndex is null || (uint)cpuIndex >= (uint)cpuToEngineIndex.Length)
+ {
+ return -1;
+ }
+
+ return Volatile.Read(ref cpuToEngineIndex[cpuIndex]);
+ }
private static SocketAsyncEngine[] CreateEngines()
{
int engineCount = GetEngineCount();
+ int[]? pinnedCpuIndices = null;
+ int[]? cpuToEngineIndex = null;
+ LinuxInitializeEngineAffinityTopology(ref engineCount, ref pinnedCpuIndices, ref cpuToEngineIndex);
+ if (cpuToEngineIndex is not null)
+ {
+ Interlocked.Exchange(ref s_cpuToEngineIndex, cpuToEngineIndex);
+ }
var engines = new SocketAsyncEngine[engineCount];
for (int i = 0; i < engineCount; i++)
{
- engines[i] = new SocketAsyncEngine();
+ int pinnedCpuIndex = -1;
+ if (pinnedCpuIndices is not null && (uint)i < (uint)pinnedCpuIndices.Length)
+ {
+ pinnedCpuIndex = pinnedCpuIndices[i];
+ }
+
+ engines[i] = new SocketAsyncEngine(i, pinnedCpuIndex);
}
return engines;
@@ -85,11 +163,17 @@ private static SocketAsyncEngine[] CreateEngines()
private readonly IntPtr _port;
private readonly Interop.Sys.SocketEvent* _buffer;
+ private readonly int _engineIndex;
+ private readonly int _pinnedCpuIndex;
+ private int _eventLoopManagedThreadId;
+ private readonly ManualResetEventSlim _ioUringInitSignal = new ManualResetEventSlim(false);
+ internal int EngineIndex => _engineIndex;
+ internal int PinnedCpuIndex => _pinnedCpuIndex;
//
// Queue of events generated by EventLoop() that would be processed by the thread pool
//
- private readonly ConcurrentQueue<SocketIOEvent> _eventQueue = new ConcurrentQueue<SocketIOEvent>();
+ private readonly SocketIOEventQueue _eventQueue = new SocketIOEventQueue();
// This flag is used for communication between item enqueuing and workers that process the items.
// There are two states of this flag:
@@ -109,13 +193,39 @@ private static SocketAsyncEngine[] CreateEngines()
//
public static bool TryRegisterSocket(IntPtr socketHandle, SocketAsyncContext context, out SocketAsyncEngine? engine, out Interop.Error error)
{
- int engineIndex = Math.Abs(Interlocked.Increment(ref s_allocateFromEngine) % s_engines.Length);
+ int engineIndex;
+ int[]? affinity = s_fdEngineAffinity;
+ int fd = checked((int)socketHandle);
+ if (affinity is not null && (uint)fd < (uint)affinity.Length)
+ {
+ int value = Interlocked.Exchange(ref affinity[fd], 0);
+ if (value > 0)
+ engineIndex = value - 1;
+ else
+ engineIndex = Math.Abs(Interlocked.Increment(ref s_allocateFromEngine) % s_engines.Length);
+ }
+ else
+ {
+ engineIndex = Math.Abs(Interlocked.Increment(ref s_allocateFromEngine) % s_engines.Length);
+ }
+
SocketAsyncEngine nextEngine = s_engines[engineIndex];
+ nextEngine._ioUringInitSignal.Wait();
bool registered = nextEngine.TryRegisterCore(socketHandle, context, out error);
engine = registered ? nextEngine : null;
return registered;
}
+ internal static bool TryRegisterSocketWithEngine(
+ IntPtr socketHandle,
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out Interop.Error error)
+ {
+ engine._ioUringInitSignal.Wait();
+ return engine.TryRegisterCore(socketHandle, context, out error);
+ }
+
private bool TryRegisterCore(IntPtr socketHandle, SocketAsyncContext context, out Interop.Error error)
{
Debug.Assert(context.GlobalContextIndex == -1);
@@ -143,8 +253,20 @@ private bool TryRegisterCore(IntPtr socketHandle, SocketAsyncContext context, ou
context.GlobalContextIndex = index;
}
- error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None,
- Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex);
+ Interop.Error managedError = default;
+ bool managedHandled = false;
+ LinuxTryChangeSocketEventRegistration(socketHandle, Interop.Sys.SocketEvents.None,
+ Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write,
+ context.GlobalContextIndex, ref managedError, ref managedHandled);
+ if (managedHandled)
+ {
+ error = managedError;
+ }
+ else
+ {
+ error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None,
+ Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex);
+ }
if (error == Interop.Error.SUCCESS)
{
return true;
@@ -168,8 +290,10 @@ public static void UnregisterSocket(SocketAsyncContext context)
context.GlobalContextIndex = -1;
}
- private SocketAsyncEngine()
+ private SocketAsyncEngine(int engineIndex, int pinnedCpuIndex)
{
+ _engineIndex = engineIndex;
+ _pinnedCpuIndex = pinnedCpuIndex;
_port = (IntPtr)(-1);
try
{
@@ -182,19 +306,27 @@ private SocketAsyncEngine()
err = Interop.Sys.CreateSocketEventPort(portPtr);
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
}
fixed (Interop.Sys.SocketEvent** bufferPtr = &_buffer)
{
- err = Interop.Sys.CreateSocketEventBuffer(EventBufferCount, bufferPtr);
+ err = Interop.Sys.CreateSocketEventBuffer(s_eventBufferCount, bufferPtr);
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
}
+ // io_uring initialization is deferred to the event loop thread so that
+ // io_uring_setup sets submitter_task to the event loop thread, which is
+ // required by DEFER_TASKRUN. TryRegisterSocket waits on _ioUringInitSignal
+ // before handing sockets to an engine, so no socket can register before
+ // init completes. This wait cannot deadlock because it runs after the
+ // static initializer finishes (s_engines must be assigned for
+ // TryRegisterSocket to access it).
+
var thread = new Thread(static s => ((SocketAsyncEngine)s!).EventLoop())
{
IsBackground = true,
@@ -204,37 +336,106 @@ private SocketAsyncEngine()
}
catch
{
+ // Constructor failure path only: if construction throws, clean up immediately.
+ // This path is the sole caller of FreeNativeResources().
FreeNativeResources();
throw;
}
}
+ partial void LinuxDetectAndInitializeIoUring();
+ static partial void LinuxInitializeEngineAffinityTopology(ref int engineCount, ref int[]? pinnedCpuIndices, ref int[]? cpuToEngineIndex);
+ partial void LinuxPinEventLoopThreadIfConfigured();
+ partial void LinuxEventLoopBeforeWait();
+ partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled);
+ partial void LinuxEventLoopAfterIteration();
+ partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort);
+ partial void LinuxFreeIoUringResources();
+ partial void LinuxTryChangeSocketEventRegistration(IntPtr socketHandle, Interop.Sys.SocketEvents currentEvents, Interop.Sys.SocketEvents newEvents, int data, ref Interop.Error error, ref bool handled);
+ partial void LinuxWakeIoUringEventLoopForSocketClose();
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void WakeIoUringEventLoopForSocketClose() => LinuxWakeIoUringEventLoopForSocketClose();
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowInternalException(Interop.Error error) =>
+ throw new InternalException(error);
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowInternalException(string message) =>
+ throw new InternalException(message);
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void FailFastEventLoop(Exception exception) =>
+ Environment.FailFast($"Exception thrown from SocketAsyncEngine event loop: {exception}", exception);
+
+ private void RecordAndAssertEventLoopThreadIdentity()
+ {
+ int currentThreadId = Environment.CurrentManagedThreadId;
+#if DEBUG
+ int previousThreadId = Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0);
+ Debug.Assert(
+ previousThreadId == 0 || previousThreadId == currentThreadId,
+ $"SocketAsyncEngine event loop thread changed: previous={previousThreadId}, current={currentThreadId}");
+#else
+ Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0);
+#endif
+ }
+
private void EventLoop()
{
try
{
+ RecordAndAssertEventLoopThreadIdentity();
+ LinuxPinEventLoopThreadIfConfigured();
+ try
+ {
+ LinuxDetectAndInitializeIoUring();
+ }
+ finally
+ {
+ _ioUringInitSignal.Set();
+ }
+
SocketEventHandler handler = new SocketEventHandler(this);
while (true)
{
- int numEvents = EventBufferCount;
- Interop.Error err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents);
+ LinuxEventLoopBeforeWait();
+
+ int numEvents = s_eventBufferCount;
+ int numCompletions = 0;
+ Interop.Error err = default;
+ bool waitHandled = false;
+ LinuxEventLoopTryCompletionWait(handler, ref numEvents, ref numCompletions, ref err, ref waitHandled);
+ if (!waitHandled)
+ {
+ err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents);
+ }
+
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
- // The native shim is responsible for ensuring this condition.
- Debug.Assert(numEvents > 0, $"Unexpected numEvents: {numEvents}");
+ // io_uring completion-mode wait can return with zero surfaced events/completions
+ // when woken only to flush managed prepare/cancel queues.
+ Debug.Assert(waitHandled || numEvents > 0 || numCompletions > 0, $"Unexpected wait result: events={numEvents}, completions={numCompletions}");
- if (handler.HandleSocketEvents(numEvents))
+ if (numEvents > 0 && handler.HandleSocketEvents(numEvents))
{
EnsureWorkerScheduled();
}
+
+ LinuxEventLoopAfterIteration();
}
}
catch (Exception e)
{
- Environment.FailFast("Exception thrown from SocketAsyncEngine event loop: " + e.ToString(), e);
+ FailFastEventLoop(e);
}
}
@@ -257,7 +458,7 @@ void IThreadPoolWorkItem.Execute()
// Checking for items must happen after resetting the processing state.
Interlocked.MemoryBarrier();
- ConcurrentQueue<SocketIOEvent> eventQueue = _eventQueue;
+ SocketIOEventQueue eventQueue = _eventQueue;
if (!eventQueue.TryDequeue(out SocketIOEvent ev))
{
return;
@@ -295,11 +496,22 @@ void IThreadPoolWorkItem.Execute()
private void FreeNativeResources()
{
+ Debug.Assert(
+ Volatile.Read(ref _eventLoopManagedThreadId) == 0,
+ "FreeNativeResources is only used by constructor-failure cleanup; event loop thread must not have started.");
+ bool closeSocketEventPort = true;
+ // Linux io_uring teardown may need to close the port first to ensure native
+ // ownership is detached before managed operation resources are released.
+ LinuxBeforeFreeNativeResources(ref closeSocketEventPort);
+
+ LinuxFreeIoUringResources();
+
if (_buffer != null)
{
Interop.Sys.FreeSocketEventBuffer(_buffer);
}
- if (_port != (IntPtr)(-1))
+
+ if (closeSocketEventPort && _port != (IntPtr)(-1))
{
Interop.Sys.CloseSocketEventPort(_port);
}
@@ -310,14 +522,16 @@ private void FreeNativeResources()
// To avoid this, the event handling logic is delegated to a non-inlined processing method.
// See discussion: https://github.com/dotnet/runtime/issues/37064
// SocketEventHandler holds an on-stack cache of SocketAsyncEngine members needed by the handler method.
- private readonly struct SocketEventHandler
+ private readonly partial struct SocketEventHandler
{
public Interop.Sys.SocketEvent* Buffer { get; }
- private readonly ConcurrentQueue<SocketIOEvent> _eventQueue;
+ private readonly SocketIOEventQueue _eventQueue;
+ private readonly SocketAsyncEngine _engine;
public SocketEventHandler(SocketAsyncEngine engine)
{
+ _engine = engine;
Buffer = engine._buffer;
_eventQueue = engine._eventQueue;
}
@@ -358,6 +572,25 @@ public bool HandleSocketEvents(int numEvents)
}
}
+ private sealed class SocketIOEventQueue
+ {
+#if TARGET_LINUX
+ private readonly MpscQueue<SocketIOEvent> _queue = new MpscQueue<SocketIOEvent>();
+#else
+ private readonly ConcurrentQueue<SocketIOEvent> _queue = new ConcurrentQueue<SocketIOEvent>();
+#endif
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ // Event delivery cannot drop entries. Use Enqueue's retrying contract here;
+ // io_uring prepare/cancel queues use TryEnqueue where fallback paths exist.
+ public void Enqueue(SocketIOEvent socketEvent) => _queue.Enqueue(socketEvent);
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public bool TryDequeue(out SocketIOEvent socketEvent) => _queue.TryDequeue(out socketEvent);
+
+ public bool IsEmpty => _queue.IsEmpty;
+ }
+
private readonly struct SocketIOEvent
{
public SocketAsyncContext Context { get; }
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Wasi.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Wasi.cs
index 0a39feb2699364..8244f9e09f730d 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Wasi.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Wasi.cs
@@ -35,6 +35,39 @@ public static bool TryRegisterSocket(IntPtr socketHandle, SocketAsyncContext con
return true;
}
+ internal static int EngineCount => 1;
+
+ internal int EngineIndex
+ {
+ get
+ {
+ Debug.Assert(this == s_engine);
+ return 0;
+ }
+ }
+
+ internal static SocketAsyncEngine GetEngineByIndex(int index)
+ {
+ Debug.Assert(index == 0);
+ return s_engine;
+ }
+
+ internal static bool TryRegisterSocketWithEngine(
+ IntPtr socketHandle,
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out Interop.Error error)
+ {
+ Debug.Assert(engine == s_engine);
+ return TryRegisterSocket(socketHandle, context, out _, out error);
+ }
+
+ internal void WakeIoUringEventLoopForSocketClose()
+ {
+ Debug.Assert(this == s_engine);
+ // No-op: WASI does not use io_uring.
+ }
+
public static void UnregisterSocket(SocketAsyncContext context)
{
context.unregisterPollHook.Cancel();
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEventArgs.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEventArgs.cs
index 57397ea0ace268..1ed99f87797894 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEventArgs.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEventArgs.cs
@@ -1009,7 +1009,10 @@ internal void FinishOperationSyncSuccess(int bytesTransferred, SocketFlags flags
if (socketError == SocketError.Success)
{
- _acceptSocket = _currentSocket.UpdateAcceptSocket(_acceptSocket!, _currentSocket._rightEndPoint!.Create(remoteSocketAddress));
+ EndPoint? remoteEndPoint = remoteSocketAddress.Size > 0 ?
+ _currentSocket._rightEndPoint!.Create(remoteSocketAddress) :
+ null;
+ _acceptSocket = _currentSocket.UpdateAcceptSocket(_acceptSocket!, remoteEndPoint);
if (NetEventSource.Log.IsEnabled())
{
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs
new file mode 100644
index 00000000000000..38d7ef78334b34
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs
@@ -0,0 +1,12 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Net.Sockets
+{
+ internal static partial class SocketPal
+ {
+ /// Extracts from a completed io_uring recvmsg message header.
+ internal static unsafe IPPacketInformation GetIoUringIPPacketInformation(Interop.Sys.MessageHeader* messageHeader, bool isIPv4, bool isIPv6) =>
+ GetIPPacketInformation(messageHeader, isIPv4, isIPv6);
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
index 1171961a204351..81f181d5940733 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
@@ -14,6 +14,22 @@ internal sealed partial class SocketsTelemetry : EventSource
private const string ConnectActivityName = ActivitySourceName + ".Connect";
private static readonly ActivitySource s_connectActivitySource = new ActivitySource(ActivitySourceName);
+ internal static class IoUringCounterNames
+ {
+ internal const string PrepareNonPinnableFallbacks = "io-uring-prepare-nonpinnable-fallbacks";
+ internal const string SocketEventBufferFull = "io-uring-socket-event-buffer-full";
+ internal const string CqOverflows = "io-uring-cq-overflows";
+ internal const string CqOverflowRecoveries = "io-uring-cq-overflow-recoveries";
+ internal const string PrepareQueueOverflows = "io-uring-prepare-queue-overflows";
+ internal const string PrepareQueueOverflowFallbacks = "io-uring-prepare-queue-overflow-fallbacks";
+ internal const string CompletionSlotExhaustions = "io-uring-completion-slot-exhaustions";
+ internal const string CompletionSlotHighWaterMark = "io-uring-completion-slot-high-water-mark";
+ internal const string CancellationQueueOverflows = "io-uring-cancellation-queue-overflows";
+ internal const string ProvidedBufferDepletions = "io-uring-provided-buffer-depletions";
+ internal const string SqPollWakeups = "io-uring-sqpoll-wakeups";
+ internal const string SqPollSubmissionsSkipped = "io-uring-sqpoll-submissions-skipped";
+ }
+
public static readonly SocketsTelemetry Log = new SocketsTelemetry();
private PollingCounter? _currentOutgoingConnectAttemptsCounter;
@@ -23,6 +39,20 @@ internal sealed partial class SocketsTelemetry : EventSource
private PollingCounter? _bytesSentCounter;
private PollingCounter? _datagramsReceivedCounter;
private PollingCounter? _datagramsSentCounter;
+ // Keep io_uring counter backing fields always present so EventCounter name contracts remain stable
+ // across platforms; OnEventCommand only registers these counters on Linux.
+ private PollingCounter? _ioUringPrepareNonPinnableFallbacksCounter;
+ private PollingCounter? _ioUringSocketEventBufferFullCounter;
+ private PollingCounter? _ioUringCqOverflowCounter;
+ private PollingCounter? _ioUringCqOverflowRecoveriesCounter;
+ private PollingCounter? _ioUringPrepareQueueOverflowsCounter;
+ private PollingCounter? _ioUringPrepareQueueOverflowFallbacksCounter;
+ private PollingCounter? _ioUringCompletionSlotExhaustionsCounter;
+ private PollingCounter? _ioUringCompletionSlotHighWaterMarkCounter;
+ private PollingCounter? _ioUringCancellationQueueOverflowsCounter;
+ private PollingCounter? _ioUringProvidedBufferDepletionsCounter;
+ private PollingCounter? _ioUringSqPollWakeupsCounter;
+ private PollingCounter? _ioUringSqPollSubmissionsSkippedCounter;
private long _currentOutgoingConnectAttempts;
private long _outgoingConnectionsEstablished;
@@ -31,6 +61,19 @@ internal sealed partial class SocketsTelemetry : EventSource
private long _bytesSent;
private long _datagramsReceived;
private long _datagramsSent;
+ // Backing fields stay cross-platform for contract stability; they are only surfaced as counters on Linux.
+ private long _ioUringPrepareNonPinnableFallbacks;
+ private long _ioUringSocketEventBufferFull;
+ private long _ioUringCqOverflow;
+ private long _ioUringCqOverflowRecoveries;
+ private long _ioUringPrepareQueueOverflows;
+ private long _ioUringPrepareQueueOverflowFallbacks;
+ private long _ioUringCompletionSlotExhaustions;
+ private long _ioUringCompletionSlotHighWaterMark;
+ private long _ioUringCancellationQueueOverflows;
+ private long _ioUringProvidedBufferDepletions;
+ private long _ioUringSqPollWakeups;
+ private long _ioUringSqPollSubmissionsSkipped;
[Event(1, Level = EventLevel.Informational)]
private void ConnectStart(string? address)
@@ -80,6 +123,33 @@ private void AcceptFailed(SocketError error, string? exceptionMessage)
}
}
+ [Event(7, Level = EventLevel.Informational)]
+ private void SocketEngineBackendSelected(string backend, int isIoUringPort, int sqPollEnabled)
+ {
+ if (IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ WriteEvent(eventId: 7, backend, isIoUringPort, sqPollEnabled);
+ }
+ }
+
+ [Event(8, Level = EventLevel.Warning)]
+ private void IoUringSqPollNegotiatedWarning(string message)
+ {
+ if (IsEnabled(EventLevel.Warning, EventKeywords.All))
+ {
+ WriteEvent(eventId: 8, message);
+ }
+ }
+
+ [Event(9, Level = EventLevel.Informational)]
+ private void IoUringResolvedConfiguration(string configuration)
+ {
+ if (IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ WriteEvent(eventId: 9, configuration);
+ }
+ }
+
[NonEvent]
public Activity? ConnectStart(SocketAddress address, ProtocolType protocolType, EndPoint endPoint, bool keepActivityCurrent)
{
@@ -189,6 +259,43 @@ public void AcceptStart(EndPoint address)
}
}
+ [NonEvent]
+ internal void ReportSocketEngineBackendSelected(bool isIoUringPort, bool isCompletionMode, bool sqPollEnabled)
+ {
+ if (!IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ return;
+ }
+
+ SocketEngineBackendSelected(
+ isCompletionMode ? "io_uring_completion" : "epoll",
+ isIoUringPort ? 1 : 0,
+ sqPollEnabled ? 1 : 0);
+ }
+
+ [NonEvent]
+ internal void ReportIoUringSqPollNegotiatedWarning()
+ {
+ if (!IsEnabled(EventLevel.Warning, EventKeywords.All))
+ {
+ return;
+ }
+
+ IoUringSqPollNegotiatedWarning(
+ "io_uring SQPOLL negotiated: kernel polling thread is enabled and may increase privileges in containerized environments.");
+ }
+
+ [NonEvent]
+ internal void ReportIoUringResolvedConfiguration(string configuration)
+ {
+ if (!IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ return;
+ }
+
+ IoUringResolvedConfiguration(configuration);
+ }
+
[NonEvent]
public void AfterAccept(SocketError error, string? exceptionMessage = null)
{
@@ -231,6 +338,113 @@ public void DatagramSent()
Interlocked.Increment(ref _datagramsSent);
}
+ [NonEvent]
+ public void IoUringPrepareNonPinnableFallback(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringPrepareNonPinnableFallbacks, count);
+ }
+
+ [NonEvent]
+ public void IoUringSocketEventBufferFull(long count)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringSocketEventBufferFull, count);
+ }
+
+ [NonEvent]
+ public void IoUringCqOverflow(long count)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringCqOverflow, count);
+ }
+
+ [NonEvent]
+ public void IoUringCqOverflowRecovery(long count)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringCqOverflowRecoveries, count);
+ }
+
+ [NonEvent]
+ public void IoUringPrepareQueueOverflow(long count)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringPrepareQueueOverflows, count);
+ }
+
+ [NonEvent]
+ public void IoUringPrepareQueueOverflowFallback(long count)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringPrepareQueueOverflowFallbacks, count);
+ }
+
+ [NonEvent]
+ public void IoUringCompletionSlotExhaustion(long count)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringCompletionSlotExhaustions, count);
+ }
+
+ [NonEvent]
+ public void IoUringCompletionSlotHighWaterMark(long count)
+ {
+ Debug.Assert(count >= 0);
+ while (true)
+ {
+ long observed = Volatile.Read(ref _ioUringCompletionSlotHighWaterMark);
+ if (count <= observed)
+ {
+ return;
+ }
+
+ if (Interlocked.CompareExchange(ref _ioUringCompletionSlotHighWaterMark, count, observed) == observed)
+ {
+ return;
+ }
+ }
+ }
+
+ [NonEvent]
+ public void IoUringCancellationQueueOverflow(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringCancellationQueueOverflows, count);
+ }
+
+ [NonEvent]
+ public void IoUringProvidedBufferDepletion(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringProvidedBufferDepletions, count);
+ }
+
+ [NonEvent]
+ public void IoUringSqPollWakeup(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringSqPollWakeups, count);
+ }
+
+ [NonEvent]
+ public void IoUringSqPollSubmissionSkipped(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ if (IsEnabled())
+ Interlocked.Add(ref _ioUringSqPollSubmissionsSkipped, count);
+ }
+
private static string GetErrorType(SocketError socketError) => socketError switch
{
// Common connect() errors expected to be seen:
@@ -291,6 +505,60 @@ protected override void OnEventCommand(EventCommandEventArgs command)
{
DisplayName = "Datagrams Sent",
};
+
+ if (!OperatingSystem.IsLinux())
+ {
+ return;
+ }
+
+ _ioUringPrepareNonPinnableFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareNonPinnableFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareNonPinnableFallbacks))
+ {
+ DisplayName = "io_uring Prepare Non-Pinnable Fallbacks",
+ };
+ _ioUringSocketEventBufferFullCounter ??= new PollingCounter(IoUringCounterNames.SocketEventBufferFull, this, () => Interlocked.Read(ref _ioUringSocketEventBufferFull))
+ {
+ DisplayName = "io_uring Socket Event Buffer Full",
+ };
+ _ioUringCqOverflowCounter ??= new PollingCounter(IoUringCounterNames.CqOverflows, this, () => Interlocked.Read(ref _ioUringCqOverflow))
+ {
+ DisplayName = "io_uring Completion Queue Overflow",
+ };
+ _ioUringCqOverflowRecoveriesCounter ??= new PollingCounter(IoUringCounterNames.CqOverflowRecoveries, this, () => Interlocked.Read(ref _ioUringCqOverflowRecoveries))
+ {
+ DisplayName = "io_uring Completion Queue Overflow Recoveries",
+ };
+ _ioUringPrepareQueueOverflowsCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflows, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflows))
+ {
+ DisplayName = "io_uring Prepare Queue Overflows",
+ };
+ _ioUringPrepareQueueOverflowFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflowFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflowFallbacks))
+ {
+ DisplayName = "io_uring Prepare Queue Overflow Fallbacks",
+ };
+ _ioUringCompletionSlotExhaustionsCounter ??= new PollingCounter(IoUringCounterNames.CompletionSlotExhaustions, this, () => Interlocked.Read(ref _ioUringCompletionSlotExhaustions))
+ {
+ DisplayName = "io_uring Completion Slot Exhaustions",
+ };
+ _ioUringCompletionSlotHighWaterMarkCounter ??= new PollingCounter(IoUringCounterNames.CompletionSlotHighWaterMark, this, () => Interlocked.Read(ref _ioUringCompletionSlotHighWaterMark))
+ {
+ DisplayName = "io_uring Completion Slot High-Water Mark",
+ };
+ _ioUringCancellationQueueOverflowsCounter ??= new PollingCounter(IoUringCounterNames.CancellationQueueOverflows, this, () => Interlocked.Read(ref _ioUringCancellationQueueOverflows))
+ {
+ DisplayName = "io_uring Cancellation Queue Overflows",
+ };
+ _ioUringProvidedBufferDepletionsCounter ??= new PollingCounter(IoUringCounterNames.ProvidedBufferDepletions, this, () => Interlocked.Read(ref _ioUringProvidedBufferDepletions))
+ {
+ DisplayName = "io_uring Provided Buffer Depletions",
+ };
+ _ioUringSqPollWakeupsCounter ??= new PollingCounter(IoUringCounterNames.SqPollWakeups, this, () => Interlocked.Read(ref _ioUringSqPollWakeups))
+ {
+ DisplayName = "io_uring SQPOLL Wakeups",
+ };
+ _ioUringSqPollSubmissionsSkippedCounter ??= new PollingCounter(IoUringCounterNames.SqPollSubmissionsSkipped, this, () => Interlocked.Read(ref _ioUringSqPollSubmissionsSkipped))
+ {
+ DisplayName = "io_uring SQPOLL Submissions Skipped",
+ };
}
}
}
diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs
new file mode 100644
index 00000000000000..2f3b64787525df
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs
@@ -0,0 +1,609 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Reflection;
+using System.Runtime.ExceptionServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+    /// <summary>
+    /// Linux test-only shim that mirrors internal SocketAsyncEngine test hooks through reflection.
+    /// </summary>
+    internal sealed class SocketAsyncEngine
+    {
+        private const BindingFlags StaticFlags = BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic;
+        private const BindingFlags InstanceFlags = BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic;
+
+        // Keep shim type initialization inert: all reflection is resolved lazily per call.
+        // The DynamicDependency keeps the product SocketAsyncEngine's members alive under trimming.
+        [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.SocketAsyncEngine", "System.Net.Sockets")]
+        static SocketAsyncEngine()
+        {
+        }
+
+        // The wrapped internal System.Net.Sockets.SocketAsyncEngine instance.
+        private readonly object _inner;
+
+        private SocketAsyncEngine(object inner)
+        {
+            _inner = inner;
+        }
+
+        /// <summary>Typed copy of the engine's non-pinnable-fallback publication state.</summary>
+        internal readonly struct IoUringNonPinnableFallbackPublicationState
+        {
+            internal IoUringNonPinnableFallbackPublicationState(long publishedCount, int publishingGate, long fallbackCount)
+            {
+                PublishedCount = publishedCount;
+                PublishingGate = publishingGate;
+                FallbackCount = fallbackCount;
+            }
+
+            internal long PublishedCount { get; }
+            internal int PublishingGate { get; }
+            internal long FallbackCount { get; }
+        }
+
+        /// <summary>Typed copy of the engine's provided-buffer ring snapshot.</summary>
+        internal readonly struct IoUringProvidedBufferSnapshotForTest
+        {
+            internal IoUringProvidedBufferSnapshotForTest(
+                bool hasIoUringPort,
+                bool supportsProvidedBufferRings,
+                bool hasProvidedBufferRing,
+                bool hasRegisteredBuffers,
+                bool adaptiveBufferSizingEnabled,
+                int availableCount,
+                int inUseCount,
+                int totalBufferCount,
+                int bufferSize,
+                int recommendedBufferSize,
+                long recycledCount,
+                long allocationFailureCount)
+            {
+                HasIoUringPort = hasIoUringPort;
+                SupportsProvidedBufferRings = supportsProvidedBufferRings;
+                HasProvidedBufferRing = hasProvidedBufferRing;
+                HasRegisteredBuffers = hasRegisteredBuffers;
+                AdaptiveBufferSizingEnabled = adaptiveBufferSizingEnabled;
+                AvailableCount = availableCount;
+                InUseCount = inUseCount;
+                TotalBufferCount = totalBufferCount;
+                BufferSize = bufferSize;
+                RecommendedBufferSize = recommendedBufferSize;
+                RecycledCount = recycledCount;
+                AllocationFailureCount = allocationFailureCount;
+            }
+
+            internal bool HasIoUringPort { get; }
+            internal bool SupportsProvidedBufferRings { get; }
+            internal bool HasProvidedBufferRing { get; }
+            internal bool HasRegisteredBuffers { get; }
+            internal bool AdaptiveBufferSizingEnabled { get; }
+            internal int AvailableCount { get; }
+            internal int InUseCount { get; }
+            internal int TotalBufferCount { get; }
+            internal int BufferSize { get; }
+            internal int RecommendedBufferSize { get; }
+            internal long RecycledCount { get; }
+            internal long AllocationFailureCount { get; }
+        }
+
+        /// <summary>Typed copy of the engine's zero-copy-send capability snapshot.</summary>
+        internal readonly struct IoUringZeroCopySendSnapshotForTest
+        {
+            internal IoUringZeroCopySendSnapshotForTest(
+                bool hasIoUringPort,
+                bool supportsSendZc,
+                bool supportsSendMsgZc,
+                bool zeroCopySendEnabled)
+            {
+                HasIoUringPort = hasIoUringPort;
+                SupportsSendZc = supportsSendZc;
+                SupportsSendMsgZc = supportsSendMsgZc;
+                ZeroCopySendEnabled = zeroCopySendEnabled;
+            }
+
+            internal bool HasIoUringPort { get; }
+            internal bool SupportsSendZc { get; }
+            internal bool SupportsSendMsgZc { get; }
+            internal bool ZeroCopySendEnabled { get; }
+        }
+
+        /// <summary>Typed copy of the engine's fixed-receive capability snapshot.</summary>
+        internal readonly struct IoUringFixedRecvSnapshotForTest
+        {
+            internal IoUringFixedRecvSnapshotForTest(
+                bool hasIoUringPort,
+                bool supportsReadFixed,
+                bool hasRegisteredBuffers)
+            {
+                HasIoUringPort = hasIoUringPort;
+                SupportsReadFixed = supportsReadFixed;
+                HasRegisteredBuffers = hasRegisteredBuffers;
+            }
+
+            internal bool HasIoUringPort { get; }
+            internal bool SupportsReadFixed { get; }
+            internal bool HasRegisteredBuffers { get; }
+        }
+
+        /// <summary>Typed copy of the engine's SQPOLL configuration snapshot.</summary>
+        internal readonly struct IoUringSqPollSnapshotForTest
+        {
+            internal IoUringSqPollSnapshotForTest(bool hasIoUringPort, bool sqPollEnabled, bool deferTaskrunEnabled)
+            {
+                HasIoUringPort = hasIoUringPort;
+                SqPollEnabled = sqPollEnabled;
+                DeferTaskrunEnabled = deferTaskrunEnabled;
+            }
+
+            internal bool HasIoUringPort { get; }
+            internal bool SqPollEnabled { get; }
+            internal bool DeferTaskrunEnabled { get; }
+        }
+
+        /// <summary>Typed copy of the engine's zero-copy pin-hold snapshot.</summary>
+        internal readonly struct IoUringZeroCopyPinHoldSnapshotForTest
+        {
+            internal IoUringZeroCopyPinHoldSnapshotForTest(
+                bool hasIoUringPort,
+                int activePinHolds,
+                int pendingNotificationCount)
+            {
+                HasIoUringPort = hasIoUringPort;
+                ActivePinHolds = activePinHolds;
+                PendingNotificationCount = pendingNotificationCount;
+            }
+
+            internal bool HasIoUringPort { get; }
+            internal int ActivePinHolds { get; }
+            internal int PendingNotificationCount { get; }
+        }
+
+        /// <summary>Typed copy of the native msghdr layout probed by the engine.</summary>
+        internal readonly struct IoUringNativeMsghdrLayoutSnapshotForTest
+        {
+            internal IoUringNativeMsghdrLayoutSnapshotForTest(
+                int size,
+                int msgNameOffset,
+                int msgNameLengthOffset,
+                int msgIovOffset,
+                int msgIovLengthOffset,
+                int msgControlOffset,
+                int msgControlLengthOffset,
+                int msgFlagsOffset)
+            {
+                Size = size;
+                MsgNameOffset = msgNameOffset;
+                MsgNameLengthOffset = msgNameLengthOffset;
+                MsgIovOffset = msgIovOffset;
+                MsgIovLengthOffset = msgIovLengthOffset;
+                MsgControlOffset = msgControlOffset;
+                MsgControlLengthOffset = msgControlLengthOffset;
+                MsgFlagsOffset = msgFlagsOffset;
+            }
+
+            internal int Size { get; }
+            internal int MsgNameOffset { get; }
+            internal int MsgNameLengthOffset { get; }
+            internal int MsgIovOffset { get; }
+            internal int MsgIovLengthOffset { get; }
+            internal int MsgControlOffset { get; }
+            internal int MsgControlLengthOffset { get; }
+            internal int MsgFlagsOffset { get; }
+        }
+
+        /// <summary>Typed copy of the engine's completion-slot field layout.</summary>
+        internal readonly struct IoUringCompletionSlotLayoutSnapshotForTest
+        {
+            internal IoUringCompletionSlotLayoutSnapshotForTest(
+                int size,
+                int generationOffset,
+                int freeListNextOffset,
+                int packedStateOffset,
+                int fixedRecvBufferIdOffset,
+                int testForcedResultOffset)
+            {
+                Size = size;
+                GenerationOffset = generationOffset;
+                FreeListNextOffset = freeListNextOffset;
+                PackedStateOffset = packedStateOffset;
+                FixedRecvBufferIdOffset = fixedRecvBufferIdOffset;
+                TestForcedResultOffset = testForcedResultOffset;
+            }
+
+            internal int Size { get; }
+            internal int GenerationOffset { get; }
+            internal int FreeListNextOffset { get; }
+            internal int PackedStateOffset { get; }
+            internal int FixedRecvBufferIdOffset { get; }
+            internal int TestForcedResultOffset { get; }
+        }
+
+        /// <summary>Reads the engine's publication state and re-materializes it as the shim's typed struct.</summary>
+        internal static IoUringNonPinnableFallbackPublicationState GetIoUringNonPinnableFallbackPublicationStateForTest()
+        {
+            object state = InvokeStatic("GetIoUringNonPinnableFallbackPublicationStateForTest")!;
+            return new IoUringNonPinnableFallbackPublicationState(
+                ReadProperty<long>(state, "PublishedCount"),
+                ReadProperty<int>(state, "PublishingGate"),
+                ReadProperty<long>(state, "FallbackCount"));
+        }
+
+        /// <summary>
+        /// Writes the publication state back by constructing the engine's internal state type
+        /// (resolved from the setter's parameter) and invoking the internal setter.
+        /// </summary>
+        internal static void SetIoUringNonPinnableFallbackPublicationStateForTest(IoUringNonPinnableFallbackPublicationState state)
+        {
+            MethodInfo setter = GetRequiredMethod(GetEngineType(), "SetIoUringNonPinnableFallbackPublicationStateForTest", StaticFlags);
+            Type stateType = setter.GetParameters()[0].ParameterType;
+            ConstructorInfo constructor = stateType.GetConstructor(
+                BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic,
+                binder: null,
+                types: new[] { typeof(long), typeof(int), typeof(long) },
+                modifiers: null) ?? throw new MissingMethodException(stateType.FullName, ".ctor(long,int,long)");
+
+            object rawState = constructor.Invoke(new object[] { state.PublishedCount, state.PublishingGate, state.FallbackCount });
+            _ = Invoke(setter, null, new object[] { rawState });
+        }
+
+        // Simple static forwarders: each calls the identically named internal hook and casts the result.
+        internal static long GetIoUringNonPinnablePrepareFallbackDeltaForTest() => (long)InvokeStatic("GetIoUringNonPinnablePrepareFallbackDeltaForTest")!;
+        internal static bool IsIoUringEnabledForTest() => (bool)InvokeStatic("IsIoUringEnabledForTest")!;
+        internal static bool IsSqPollRequestedForTest() => (bool)InvokeStatic("IsSqPollRequestedForTest")!;
+        internal static bool IsIoUringDirectSqeDisabledForTest() => (bool)InvokeStatic("IsIoUringDirectSqeDisabledForTest")!;
+        internal static bool IsZeroCopySendOptedInForTest() => (bool)InvokeStatic("IsZeroCopySendOptedInForTest")!;
+        internal static bool IsIoUringRegisterBuffersEnabledForTest() => (bool)InvokeStatic("IsIoUringRegisterBuffersEnabledForTest")!;
+        internal static bool IsNativeMsghdrLayoutSupportedForIoUringForTest(int pointerSize, int nativeMsghdrSize) =>
+            (bool)InvokeStatic("IsNativeMsghdrLayoutSupportedForIoUringForTest", new object?[] { pointerSize, nativeMsghdrSize })!;
+        internal static long GetIoUringPendingRetryQueuedToPrepareQueueCountForTest() => (long)InvokeStatic("GetIoUringPendingRetryQueuedToPrepareQueueCountForTest")!;
+        internal static int GetIoUringCancellationQueueCapacityForTest() => (int)InvokeStatic("GetIoUringCancellationQueueCapacityForTest")!;
+        internal static bool IsIoUringMultishotRecvSupportedForTest() => (bool)InvokeStatic("IsIoUringMultishotRecvSupportedForTest")!;
+        internal static bool IsIoUringMultishotAcceptSupportedForTest() => (bool)InvokeStatic("IsIoUringMultishotAcceptSupportedForTest")!;
+        internal static bool HasActiveIoUringEngineWithInitializedCqStateForTest() => (bool)InvokeStatic("HasActiveIoUringEngineWithInitializedCqStateForTest")!;
+        internal static int GetIoUringCompletionSlotsInUseForTest() => (int)InvokeStatic("GetIoUringCompletionSlotsInUseForTest")!;
+        internal static int GetIoUringTrackedOperationCountForTest() => (int)InvokeStatic("GetIoUringTrackedOperationCountForTest")!;
+        internal static bool IsAnyIoUringSqPollEngineNeedingWakeupForTest() => (bool)InvokeStatic("IsAnyIoUringSqPollEngineNeedingWakeupForTest")!;
+        internal static bool ValidateIoUringProvidedBufferTeardownOrderingForTest() => (bool)InvokeStatic("ValidateIoUringProvidedBufferTeardownOrderingForTest")!;
+        internal static ulong EncodeCompletionSlotUserDataForTest(int slotIndex, ulong generation) =>
+            (ulong)InvokeStatic("EncodeCompletionSlotUserDataForTest", new object?[] { slotIndex, generation })!;
+        internal static ulong IncrementCompletionSlotGenerationForTest(ulong generation) =>
+            (ulong)InvokeStatic("IncrementCompletionSlotGenerationForTest", new object?[] { generation })!;
+
+        internal static bool IsTrackedIoUringUserDataForTest(ulong userData) =>
+            (bool)InvokeStatic("IsTrackedIoUringUserDataForTest", new object?[] { userData })!;
+
+        /// <summary>Forwards the decode hook; out values are read back from the byref argument array.</summary>
+        internal static bool TryDecodeCompletionSlotUserDataForTest(ulong userData, out int slotIndex, out ulong generation)
+        {
+            object?[] args = new object?[] { userData, 0, 0UL };
+            bool result = (bool)InvokeStatic("TryDecodeCompletionSlotUserDataForTest", args)!;
+            slotIndex = (int)args[1]!;
+            generation = (ulong)args[2]!;
+            return result;
+        }
+
+        internal static IoUringNativeMsghdrLayoutSnapshotForTest GetIoUringNativeMsghdrLayoutForTest()
+        {
+            object snapshot = InvokeStatic("GetIoUringNativeMsghdrLayoutForTest")!;
+            return new IoUringNativeMsghdrLayoutSnapshotForTest(
+                ReadProperty<int>(snapshot, "Size"),
+                ReadProperty<int>(snapshot, "MsgNameOffset"),
+                ReadProperty<int>(snapshot, "MsgNameLengthOffset"),
+                ReadProperty<int>(snapshot, "MsgIovOffset"),
+                ReadProperty<int>(snapshot, "MsgIovLengthOffset"),
+                ReadProperty<int>(snapshot, "MsgControlOffset"),
+                ReadProperty<int>(snapshot, "MsgControlLengthOffset"),
+                ReadProperty<int>(snapshot, "MsgFlagsOffset"));
+        }
+
+        internal static IoUringCompletionSlotLayoutSnapshotForTest GetIoUringCompletionSlotLayoutForTest()
+        {
+            object snapshot = InvokeStatic("GetIoUringCompletionSlotLayoutForTest")!;
+            return new IoUringCompletionSlotLayoutSnapshotForTest(
+                ReadProperty<int>(snapshot, "Size"),
+                ReadProperty<int>(snapshot, "GenerationOffset"),
+                ReadProperty<int>(snapshot, "FreeListNextOffset"),
+                ReadProperty<int>(snapshot, "PackedStateOffset"),
+                ReadProperty<int>(snapshot, "FixedRecvBufferIdOffset"),
+                ReadProperty<int>(snapshot, "TestForcedResultOffset"));
+        }
+
+        internal static IoUringProvidedBufferSnapshotForTest GetIoUringProvidedBufferSnapshotForTest()
+        {
+            object snapshot = InvokeStatic("GetIoUringProvidedBufferSnapshotForTest")!;
+            return new IoUringProvidedBufferSnapshotForTest(
+                ReadProperty<bool>(snapshot, "HasIoUringPort"),
+                ReadProperty<bool>(snapshot, "SupportsProvidedBufferRings"),
+                ReadProperty<bool>(snapshot, "HasProvidedBufferRing"),
+                ReadProperty<bool>(snapshot, "HasRegisteredBuffers"),
+                ReadProperty<bool>(snapshot, "AdaptiveBufferSizingEnabled"),
+                ReadProperty<int>(snapshot, "AvailableCount"),
+                ReadProperty<int>(snapshot, "InUseCount"),
+                ReadProperty<int>(snapshot, "TotalBufferCount"),
+                ReadProperty<int>(snapshot, "BufferSize"),
+                ReadProperty<int>(snapshot, "RecommendedBufferSize"),
+                ReadProperty<long>(snapshot, "RecycledCount"),
+                ReadProperty<long>(snapshot, "AllocationFailureCount"));
+        }
+
+        internal static IoUringZeroCopySendSnapshotForTest GetIoUringZeroCopySendSnapshotForTest()
+        {
+            object snapshot = InvokeStatic("GetIoUringZeroCopySendSnapshotForTest")!;
+            return new IoUringZeroCopySendSnapshotForTest(
+                ReadProperty<bool>(snapshot, "HasIoUringPort"),
+                ReadProperty<bool>(snapshot, "SupportsSendZc"),
+                ReadProperty<bool>(snapshot, "SupportsSendMsgZc"),
+                ReadProperty<bool>(snapshot, "ZeroCopySendEnabled"));
+        }
+
+        internal static IoUringFixedRecvSnapshotForTest GetIoUringFixedRecvSnapshotForTest()
+        {
+            object snapshot = InvokeStatic("GetIoUringFixedRecvSnapshotForTest")!;
+            return new IoUringFixedRecvSnapshotForTest(
+                ReadProperty<bool>(snapshot, "HasIoUringPort"),
+                ReadProperty<bool>(snapshot, "SupportsReadFixed"),
+                ReadProperty<bool>(snapshot, "HasRegisteredBuffers"));
+        }
+
+        internal static IoUringSqPollSnapshotForTest GetIoUringSqPollSnapshotForTest()
+        {
+            object snapshot = InvokeStatic("GetIoUringSqPollSnapshotForTest")!;
+            return new IoUringSqPollSnapshotForTest(
+                ReadProperty<bool>(snapshot, "HasIoUringPort"),
+                ReadProperty<bool>(snapshot, "SqPollEnabled"),
+                ReadProperty<bool>(snapshot, "DeferTaskrunEnabled"));
+        }
+
+        internal static IoUringZeroCopyPinHoldSnapshotForTest GetIoUringZeroCopyPinHoldSnapshotForTest()
+        {
+            object snapshot = InvokeStatic("GetIoUringZeroCopyPinHoldSnapshotForTest")!;
+            return new IoUringZeroCopyPinHoldSnapshotForTest(
+                ReadProperty<bool>(snapshot, "HasIoUringPort"),
+                ReadProperty<int>(snapshot, "ActivePinHolds"),
+                ReadProperty<int>(snapshot, "PendingNotificationCount"));
+        }
+
+        internal static bool TryInjectIoUringCqOverflowForTest(uint delta, out int injectedEngineCount)
+        {
+            object?[] args = new object?[] { delta, 0 };
+            bool result = (bool)InvokeStatic("TryInjectIoUringCqOverflowForTest", args)!;
+            injectedEngineCount = (int)args[1]!;
+            return result;
+        }
+
+        internal static bool TryGetIoUringRingFdForTest(out int ringFd)
+        {
+            object?[] args = new object?[] { -1 };
+            bool result = (bool)InvokeStatic("TryGetIoUringRingFdForTest", args)!;
+            ringFd = (int)args[0]!;
+            return result;
+        }
+
+        internal static bool TryGetIoUringWakeupEventFdForTest(out int eventFd)
+        {
+            object?[] args = new object?[] { -1 };
+            bool result = (bool)InvokeStatic("TryGetIoUringWakeupEventFdForTest", args)!;
+            eventFd = (int)args[0]!;
+            return result;
+        }
+
+        internal static bool TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest(out bool matches)
+        {
+            object?[] args = new object?[] { false };
+            bool result = (bool)InvokeStatic("TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest", args)!;
+            matches = (bool)args[0]!;
+            return result;
+        }
+
+        internal static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount)
+        {
+            object?[] args = new object?[] { 0 };
+            bool result = (bool)InvokeStatic("TryForceIoUringProvidedBufferRingExhaustionForTest", args)!;
+            forcedBufferCount = (int)args[0]!;
+            return result;
+        }
+
+        internal static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount)
+        {
+            object?[] args = new object?[] { 0 };
+            bool result = (bool)InvokeStatic("TryRecycleForcedIoUringProvidedBufferRingForTest", args)!;
+            recycledBufferCount = (int)args[0]!;
+            return result;
+        }
+
+        /// <summary>Wraps the first active io_uring engine, if any, in a shim instance.</summary>
+        internal static bool TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine)
+        {
+            object?[] args = new object?[] { null };
+            bool result = (bool)InvokeStatic("TryGetFirstIoUringEngineForTest", args)!;
+            if (!result || args[0] is null)
+            {
+                ioUringEngine = null;
+                return false;
+            }
+
+            ioUringEngine = new SocketAsyncEngine(args[0]);
+            return true;
+        }
+
+        /// <summary>Wraps every active io_uring engine in a shim instance.</summary>
+        internal static SocketAsyncEngine[] GetActiveIoUringEnginesForTest()
+        {
+            Array engines = (Array)InvokeStatic("GetActiveIoUringEnginesForTest")!;
+            var wrappers = new SocketAsyncEngine[engines.Length];
+            for (int i = 0; i < engines.Length; i++)
+            {
+                wrappers[i] = new SocketAsyncEngine(engines.GetValue(i)!);
+            }
+
+            return wrappers;
+        }
+
+        internal static int[] GetEnginePinnedCpuIndicesForTest() =>
+            (int[])InvokeStatic("GetEnginePinnedCpuIndicesForTest")!;
+
+        internal static int GetEngineIndexForCpuForTest(int cpuIndex) =>
+            (int)InvokeStatic("GetEngineIndexForCpuForTest", cpuIndex)!;
+
+        internal static bool TrySetCurrentThreadAffinityForTest(int cpuIndex) =>
+            (bool)InvokeStatic("TrySetCurrentThreadAffinityForTest", cpuIndex)!;
+
+        // Instance test hooks: each forwards to the identically named property on the wrapped engine.
+        internal bool SupportsMultishotAcceptForTest
+        {
+            get => GetInstanceProperty<bool>("SupportsMultishotAcceptForTest");
+            set => SetInstanceProperty("SupportsMultishotAcceptForTest", value);
+        }
+
+        internal bool SupportsOpSendZcForTest
+        {
+            get => GetInstanceProperty<bool>("SupportsOpSendZcForTest");
+            set => SetInstanceProperty("SupportsOpSendZcForTest", value);
+        }
+
+        internal bool ZeroCopySendEnabledForTest
+        {
+            get => GetInstanceProperty<bool>("ZeroCopySendEnabledForTest");
+            set => SetInstanceProperty("ZeroCopySendEnabledForTest", value);
+        }
+
+        internal long IoUringCancelQueueLengthForTest
+        {
+            get => GetInstanceProperty<long>("IoUringCancelQueueLengthForTest");
+            set => SetInstanceProperty("IoUringCancelQueueLengthForTest", value);
+        }
+
+        internal long IoUringCancelQueueOverflowCountForTest => GetInstanceProperty<long>("IoUringCancelQueueOverflowCountForTest");
+        internal long IoUringCancelQueueWakeRetryCountForTest => GetInstanceProperty<long>("IoUringCancelQueueWakeRetryCountForTest");
+
+        internal int IoUringWakeupRequestedForTest
+        {
+            get => GetInstanceProperty<int>("IoUringWakeupRequestedForTest");
+            set => SetInstanceProperty("IoUringWakeupRequestedForTest", value);
+        }
+
+        internal bool TryEnqueueIoUringCancellationForTest(ulong userData)
+            => (bool)InvokeInstance("TryEnqueueIoUringCancellationForTest", userData)!;
+
+        // Convert.ToInt32 normalizes whatever integral type the internal hook returns.
+        internal int SubmitIoUringOperationsNormalizedForTest()
+            => Convert.ToInt32(InvokeInstance("SubmitIoUringOperationsNormalizedForTest"));
+
+        internal static int EngineCount
+        {
+            get
+            {
+                PropertyInfo property = GetRequiredProperty(GetEngineType(), "EngineCount", StaticFlags);
+                return (int)property.GetValue(null)!;
+            }
+        }
+
+        // Invokes via reflection, unwrapping TargetInvocationException so the test observes the
+        // engine's original exception (stack trace preserved via ExceptionDispatchInfo).
+        private static object? Invoke(MethodInfo method, object? target, object?[]? args)
+        {
+            try
+            {
+                return method.Invoke(target, args);
+            }
+            catch (TargetInvocationException tie) when (tie.InnerException is not null)
+            {
+                ExceptionDispatchInfo.Capture(tie.InnerException).Throw();
+                throw; // unreachable
+            }
+        }
+
+        private static object? InvokeStatic(string methodName, params object?[]? args)
+            => Invoke(GetRequiredMethod(GetEngineType(), methodName, StaticFlags), null, args);
+
+        private object? InvokeInstance(string methodName, params object?[]? args)
+            => Invoke(GetRequiredMethod(GetEngineType(), methodName, InstanceFlags), _inner, args);
+
+        private T GetInstanceProperty<T>(string propertyName)
+        {
+            PropertyInfo property = GetRequiredProperty(GetEngineType(), propertyName, InstanceFlags);
+            return (T)property.GetValue(_inner)!;
+        }
+
+        private void SetInstanceProperty(string propertyName, object? value)
+        {
+            PropertyInfo property = GetRequiredProperty(GetEngineType(), propertyName, InstanceFlags);
+            property.SetValue(_inner, value);
+        }
+
+        private static T ReadProperty<T>(object instance, string propertyName)
+        {
+            PropertyInfo property = GetRequiredProperty(instance.GetType(), propertyName, InstanceFlags);
+            return (T)property.GetValue(instance)!;
+        }
+
+        [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+        private static Type GetEngineType()
+        {
+            return typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true, ignoreCase: false)!;
+        }
+
+        private static MethodInfo GetRequiredMethod([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string methodName, BindingFlags flags)
+        {
+            return type.GetMethod(methodName, flags) ?? throw new MissingMethodException(type.FullName, methodName);
+        }
+
+        private static PropertyInfo GetRequiredProperty([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string propertyName, BindingFlags flags)
+        {
+            return type.GetProperty(propertyName, flags) ?? throw new MissingMemberException(type.FullName, propertyName);
+        }
+    }
+
+    /// <summary>
+    /// Linux test-only shim that forwards SocketAsyncContext test hooks through reflection.
+    /// </summary>
+    internal sealed class SocketAsyncContext
+    {
+        private const BindingFlags StaticFlags = BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic;
+        private const BindingFlags InstanceFlags = BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic;
+
+        // The wrapped internal System.Net.Sockets.SocketAsyncContext instance.
+        private readonly object _inner;
+
+        private SocketAsyncContext(object inner)
+        {
+            _inner = inner;
+        }
+
+        // The DynamicDependency keeps the product SocketAsyncContext's members alive under trimming.
+        [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.SocketAsyncContext", "System.Net.Sockets")]
+        internal static bool IsMultishotAcceptArmedForTest(Socket socket)
+            => (bool)InvokeStatic("IsMultishotAcceptArmedForTest", socket)!;
+
+        internal static int GetMultishotAcceptQueueCountForTest(Socket socket)
+            => (int)InvokeStatic("GetMultishotAcceptQueueCountForTest", socket)!;
+
+        /// <summary>Forwards the hook; the out value is read back from the byref argument array.</summary>
+        internal static bool TryGetIncomingCpuForTest(Socket socket, out int cpu)
+        {
+            object?[] args = new object?[] { socket, 0 };
+            bool result = (bool)InvokeStatic("TryGetIncomingCpuForTest", args)!;
+            cpu = (int)args[1]!;
+            return result;
+        }
+
+        internal static bool IsPersistentMultishotRecvArmedForTest(Socket socket)
+            => (bool)InvokeStatic("IsPersistentMultishotRecvArmedForTest", socket)!;
+
+        internal static ulong GetPersistentMultishotRecvUserDataForTest(Socket socket)
+            => (ulong)InvokeStatic("GetPersistentMultishotRecvUserDataForTest", socket)!;
+
+        internal static int GetPersistentMultishotRecvBufferedCountForTest(Socket socket)
+            => (int)InvokeStatic("GetPersistentMultishotRecvBufferedCountForTest", socket)!;
+
+        internal static int GetReusePortShadowListenerCountForTest(Socket socket)
+            => (int)InvokeStatic("GetReusePortShadowListenerCountForTest", socket)!;
+
+        /// <summary>Wraps the socket's internal context, if one exists, in a shim instance.</summary>
+        internal static bool TryGetSocketAsyncContextForTest(Socket socket, out SocketAsyncContext? context)
+        {
+            object?[] args = new object?[] { socket, null };
+            bool result = (bool)InvokeStatic("TryGetSocketAsyncContextForTest", args)!;
+            if (!result || args[1] is null)
+            {
+                context = null;
+                return false;
+            }
+
+            context = new SocketAsyncContext(args[1]);
+            return true;
+        }
+
+        internal bool TryBufferEarlyPersistentMultishotRecvData(byte[] payload)
+            => (bool)InvokeInstance("TryBufferEarlyPersistentMultishotRecvDataForTest", payload)!;
+
+        // Invokes via reflection, unwrapping TargetInvocationException so the test observes the
+        // context's original exception (stack trace preserved via ExceptionDispatchInfo).
+        private static object? Invoke(MethodInfo method, object? target, object?[]? args)
+        {
+            try
+            {
+                return method.Invoke(target, args);
+            }
+            catch (TargetInvocationException tie) when (tie.InnerException is not null)
+            {
+                ExceptionDispatchInfo.Capture(tie.InnerException).Throw();
+                throw; // unreachable
+            }
+        }
+
+        private static object? InvokeStatic(string methodName, params object?[]? args)
+            => Invoke(GetRequiredMethod(GetContextType(), methodName, StaticFlags), null, args);
+
+        private object? InvokeInstance(string methodName, params object?[]? args)
+            => Invoke(GetRequiredMethod(GetContextType(), methodName, InstanceFlags), _inner, args);
+
+        [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+        private static Type GetContextType()
+        {
+            return typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncContext", throwOnError: true, ignoreCase: false)!;
+        }
+
+        private static MethodInfo GetRequiredMethod([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string methodName, BindingFlags flags)
+        {
+            return type.GetMethod(methodName, flags) ?? throw new MissingMethodException(type.FullName, methodName);
+        }
+    }
+
+
+}
diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs
new file mode 100644
index 00000000000000..b3b222613606b1
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs
@@ -0,0 +1,7231 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Buffers;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Net;
+using System.Runtime.InteropServices;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.DotNet.RemoteExecutor;
+using Xunit;
+using IoUringFixedRecvSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringFixedRecvSnapshotForTest;
+using IoUringProvidedBufferSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringProvidedBufferSnapshotForTest;
+using IoUringSqPollSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringSqPollSnapshotForTest;
+using IoUringZeroCopyPinHoldSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringZeroCopyPinHoldSnapshotForTest;
+using IoUringZeroCopySendSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringZeroCopySendSnapshotForTest;
+
+namespace System.Net.Sockets.Tests
+{
+ // io_uring internals and reflection-based test hooks are currently validated on CoreCLR.
+ [ConditionalClass(typeof(PlatformDetection), nameof(PlatformDetection.IsNotMonoRuntime))]
+ public partial class IoUring
+ {
+ // Linux fcntl(2) command/flag values and the getrlimit(2) resource id used by the raw libc
+ // interop below. Values match the common Linux ABIs — TODO confirm for non-mainstream archs.
+ private const int F_GETFD = 1;
+ private const int F_GETFL = 3;
+ private const int FD_CLOEXEC = 1;
+ private const int O_NONBLOCK = 0x800;
+ private const int RLIMIT_NOFILE = 7;
+
+ // Mirrors the native 'struct rlimit' (rlim_cur / rlim_max) for the getrlimit(2) call.
+ [StructLayout(LayoutKind.Sequential)]
+ private struct RLimit
+ {
+ public nuint Current;
+ public nuint Maximum;
+ }
+
+ // Names of the environment variables the io_uring socket engine reads; used by
+ // CreateSocketEngineOptions to configure RemoteExecutor child processes.
+ private static class IoUringEnvironmentVariables
+ {
+ public const string Enabled = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING";
+ public const string ProvidedBufferSize = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE";
+ public const string AdaptiveBufferSizing = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING";
+ public const string RegisterBuffers = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS";
+ public const string SqPoll = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL";
+ public const string ZeroCopySend = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND";
+ public const string DirectSqe = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE";
+ public const string ForceEagainOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK";
+ public const string ForceEcanceledOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK";
+ public const string ForceSubmitEpermOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_SUBMIT_EPERM_ONCE";
+ public const string ForceEnterEintrRetryLimitOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE";
+ public const string ForceKernelVersionUnsupported = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_KERNEL_VERSION_UNSUPPORTED";
+ public const string ForceProvidedBufferRingOomOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_PROVIDED_BUFFER_RING_OOM_ONCE";
+ public const string TestEventBufferCount = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT";
+ public const string PrepareQueueCapacity = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY";
+ public const string QueueEntries = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_QUEUE_ENTRIES";
+ public const string ThreadCount = "DOTNET_SYSTEM_NET_SOCKETS_THREAD_COUNT";
+ public const string DisableReusePortAccept = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_DISABLE_REUSEPORT_ACCEPT";
+ }
+
+ // fcntl uses C int for fd/cmd/return on Linux ABIs.
+ [LibraryImport("libc", EntryPoint = "fcntl", SetLastError = true)]
+ private static partial int Fcntl(int fd, int cmd);
+
+ // getrlimit(2): queries a process resource limit (used with RLIMIT_NOFILE above).
+ [LibraryImport("libc", EntryPoint = "getrlimit", SetLastError = true)]
+ private static partial int GetRLimit(int resource, out RLimit limit);
+
+ // Verifies that when many threads concurrently read the non-pinnable-fallback delta, exactly
+ // one of them observes (publishes) the accumulated count and the rest see zero; a later bump
+ // publishes only the increment. Runs in a child process to isolate process-wide engine state.
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // Uses Linux-only io_uring publication internals.
+ public static async Task IoUringNonPinnableFallbackPublication_ConcurrentPublishers_EmitSingleDelta()
+ {
+ await RemoteExecutor.Invoke(static () =>
+ {
+ // Snapshot the process-wide state so the finally block can restore it.
+ SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState originalState =
+ SocketAsyncEngine.GetIoUringNonPinnableFallbackPublicationStateForTest();
+
+ try
+ {
+ const long firstFallbackCount = 17;
+ const int publisherCount = 16;
+ long[] deltas = new long[publisherCount];
+ using var start = new ManualResetEventSlim(initialState: false);
+ var tasks = new Task[publisherCount];
+
+ SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest(
+ new SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState(
+ publishedCount: 0L,
+ publishingGate: 0,
+ fallbackCount: firstFallbackCount));
+
+ // Hold all publishers on the event so they race the publishing gate together.
+ for (int i = 0; i < publisherCount; i++)
+ {
+ int capturedIndex = i;
+ tasks[i] = Task.Run(() =>
+ {
+ start.Wait();
+ deltas[capturedIndex] = SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest();
+ });
+ }
+
+ start.Set();
+ Task.WaitAll(tasks);
+
+ long deltaTotal = 0;
+ int nonZeroCount = 0;
+ long nonZeroValue = 0;
+ foreach (long delta in deltas)
+ {
+ deltaTotal += delta;
+ if (delta != 0)
+ {
+ nonZeroCount++;
+ nonZeroValue = delta;
+ }
+ }
+
+ // Exactly one concurrent publisher must have emitted the whole delta.
+ Assert.Equal(firstFallbackCount, deltaTotal);
+ Assert.Equal(1, nonZeroCount);
+ Assert.Equal(firstFallbackCount, nonZeroValue);
+
+ // A subsequent bump publishes only the increment, then the next read yields zero.
+ const long secondFallbackCount = 23;
+ SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest(
+ new SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState(
+ publishedCount: firstFallbackCount,
+ publishingGate: 0,
+ fallbackCount: secondFallbackCount));
+ Assert.Equal(secondFallbackCount - firstFallbackCount, SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest());
+ Assert.Equal(0, SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest());
+ }
+ finally
+ {
+ SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest(originalState);
+ }
+ }).DisposeAsync();
+ }
+
+ // Builds RemoteInvokeOptions whose child-process environment configures the io_uring socket
+ // engine. A null argument removes the corresponding variable (inheriting nothing); non-null
+ // values are stringified ("1"/"0" for booleans). Child timeout is 10 minutes.
+ private static RemoteInvokeOptions CreateSocketEngineOptions(
+ string? ioUringValue = "1",
+ string? forceEagainOnceMask = null,
+ string? forceEcanceledOnceMask = null,
+ bool? forceSubmitEpermOnce = null,
+ bool? forceEnterEintrRetryLimitOnce = null,
+ bool? forceKernelVersionUnsupported = null,
+ bool? forceProvidedBufferRingOomOnce = null,
+ int? testEventBufferCount = null,
+ string? testEventBufferCountRaw = null,
+ int? prepareQueueCapacity = null,
+ int? queueEntries = null,
+ int? threadCount = null,
+ int? providedBufferSize = null,
+ bool? adaptiveBufferSizingEnabled = null,
+ bool? registerBuffersEnabled = null,
+ bool? sqPollEnabled = null,
+ bool? directSqeEnabled = null,
+ bool? zeroCopySendEnabled = null,
+ bool? reusePortAcceptDisabled = null)
+ {
+ // null => make sure the child does not inherit the variable from this process.
+ static void SetOrRemoveEnvironmentVariable(RemoteInvokeOptions options, string name, string? value)
+ {
+ if (value is null)
+ {
+ options.StartInfo.EnvironmentVariables.Remove(name);
+ }
+ else
+ {
+ options.StartInfo.EnvironmentVariables[name] = value;
+ }
+ }
+
+ // The typed and raw event-buffer-count knobs target the same variable; reject ambiguity.
+ static void ValidateSocketEngineOptionCombination(int? configuredEventBufferCount, string? configuredEventBufferCountRaw)
+ {
+ if (configuredEventBufferCount.HasValue && configuredEventBufferCountRaw is not null)
+ {
+ throw new ArgumentException(
+ "Specify either testEventBufferCount or testEventBufferCountRaw, not both.",
+ nameof(configuredEventBufferCountRaw));
+ }
+ }
+
+ ValidateSocketEngineOptionCombination(testEventBufferCount, testEventBufferCountRaw);
+
+ RemoteInvokeOptions options = new RemoteInvokeOptions();
+ string? configuredEventBufferCount =
+ testEventBufferCountRaw ?? (testEventBufferCount.HasValue ? testEventBufferCount.Value.ToString() : null);
+ (string Name, string? Value)[] ioUringEnvironmentAssignments =
+ {
+ (IoUringEnvironmentVariables.Enabled, ioUringValue),
+ (IoUringEnvironmentVariables.ProvidedBufferSize, providedBufferSize?.ToString()),
+ (IoUringEnvironmentVariables.AdaptiveBufferSizing, adaptiveBufferSizingEnabled.HasValue ? (adaptiveBufferSizingEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.RegisterBuffers, registerBuffersEnabled.HasValue ? (registerBuffersEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.SqPoll, sqPollEnabled.HasValue ? (sqPollEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.DirectSqe, directSqeEnabled.HasValue ? (directSqeEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ZeroCopySend, zeroCopySendEnabled.HasValue ? (zeroCopySendEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceEagainOnceMask, string.IsNullOrEmpty(forceEagainOnceMask) ? null : forceEagainOnceMask),
+ (IoUringEnvironmentVariables.ForceEcanceledOnceMask, string.IsNullOrEmpty(forceEcanceledOnceMask) ? null : forceEcanceledOnceMask),
+ (IoUringEnvironmentVariables.ForceSubmitEpermOnce, forceSubmitEpermOnce.HasValue ? (forceSubmitEpermOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceEnterEintrRetryLimitOnce, forceEnterEintrRetryLimitOnce.HasValue ? (forceEnterEintrRetryLimitOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceKernelVersionUnsupported, forceKernelVersionUnsupported.HasValue ? (forceKernelVersionUnsupported.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceProvidedBufferRingOomOnce, forceProvidedBufferRingOomOnce.HasValue ? (forceProvidedBufferRingOomOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.TestEventBufferCount, configuredEventBufferCount),
+ (IoUringEnvironmentVariables.PrepareQueueCapacity, prepareQueueCapacity?.ToString()),
+ (IoUringEnvironmentVariables.QueueEntries, queueEntries?.ToString()),
+ (IoUringEnvironmentVariables.ThreadCount, threadCount?.ToString()),
+ (IoUringEnvironmentVariables.DisableReusePortAccept, reusePortAcceptDisabled.HasValue ? (reusePortAcceptDisabled.Value ? "1" : "0") : null),
+ };
+
+ foreach ((string Name, string? Value) assignment in ioUringEnvironmentAssignments)
+ {
+ SetOrRemoveEnvironmentVariable(options, assignment.Name, assignment.Value);
+ }
+
+ options.TimeOut = (int)TimeSpan.FromMinutes(10).TotalMilliseconds;
+ return options;
+ }
+
+ // Normalizes Task/ValueTask to Task so callers can treat either shape uniformly.
+ private static Task ToTask(Task task) => task;
+ private static Task ToTask(ValueTask task) => task.AsTask();
+
+ // Awaits 'task' with a default 15-second timeout, throwing TimeoutException on expiry.
+ private static Task AwaitWithTimeoutAsync(Task task, string operationName) =>
+ AwaitWithTimeoutAsync(task, operationName, TimeSpan.FromSeconds(15));
+
+ private static async Task AwaitWithTimeoutAsync(Task task, string operationName, TimeSpan timeout)
+ {
+ // Task.WhenAny does not throw; awaiting 'task' afterwards propagates its exception, if any.
+ Task completed = await Task.WhenAny(task, Task.Delay(timeout));
+ if (!ReferenceEquals(task, completed))
+ {
+ throw new TimeoutException($"Timed out waiting for {operationName}");
+ }
+
+ await task;
+ }
+
+ // Result-returning counterpart of the pair above: awaits 'task' with a default 15-second
+ // timeout and returns its value. These must be generic over Task<TResult> — as non-generic
+ // overloads they would duplicate the signatures above and 'return await task;' would not
+ // produce a value (callers such as 'Socket server = await AwaitWithTimeoutAsync(acceptTask, ...)'
+ // depend on the generic result).
+ private static Task<TResult> AwaitWithTimeoutAsync<TResult>(Task<TResult> task, string operationName) =>
+ AwaitWithTimeoutAsync(task, operationName, TimeSpan.FromSeconds(15));
+
+ private static async Task<TResult> AwaitWithTimeoutAsync<TResult>(Task<TResult> task, string operationName, TimeSpan timeout)
+ {
+ // Task.WhenAny does not throw; awaiting 'task' afterwards surfaces its result or exception.
+ Task completed = await Task.WhenAny(task, Task.Delay(timeout));
+ if (!ReferenceEquals(task, completed))
+ {
+ throw new TimeoutException($"Timed out waiting for {operationName}");
+ }
+
+ return await task;
+ }
+
+ // Asserts the (non-null) exception represents cancellation or an aborted/interrupted socket
+ // operation — the shapes an in-flight io_uring operation may surface when it is torn down.
+ private static void AssertCanceledOrInterrupted(Exception? ex)
+ {
+ Assert.NotNull(ex);
+ Assert.True(
+ ex is OperationCanceledException ||
+ ex is SocketException socketException &&
+ (socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted),
+ $"Unexpected exception: {ex}");
+ }
+
+ // Looser variant: also tolerates no exception at all and ObjectDisposedException, for paths
+ // that race with socket disposal.
+ private static void AssertCanceledDisposedOrInterrupted(Exception? ex)
+ {
+ if (ex is null)
+ {
+ return;
+ }
+
+ Assert.True(
+ ex is ObjectDisposedException ||
+ ex is OperationCanceledException ||
+ ex is SocketException socketException &&
+ (socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted),
+ $"Unexpected exception: {ex}");
+ }
+
+ // Snapshot predicates: tests use these to skip/branch when the relevant io_uring feature is
+ // not actually active on the current kernel/configuration.
+ private static bool IsProvidedBufferSnapshotUsable(IoUringProvidedBufferSnapshot snapshot) =>
+ snapshot.HasIoUringPort &&
+ snapshot.SupportsProvidedBufferRings &&
+ snapshot.HasProvidedBufferRing &&
+ snapshot.TotalBufferCount > 0;
+
+ private static bool IsAdaptiveSizingUsable(IoUringProvidedBufferSnapshot snapshot) =>
+ IsProvidedBufferSnapshotUsable(snapshot) && snapshot.AdaptiveBufferSizingEnabled;
+
+ private static bool IsFixedRecvEnabled(IoUringFixedRecvSnapshot snapshot) =>
+ snapshot.SupportsReadFixed && snapshot.HasRegisteredBuffers;
+
+ private static bool IsSqPollActive(IoUringSqPollSnapshot snapshot) =>
+ snapshot.HasIoUringPort && snapshot.SqPollEnabled;
+
+ // MemoryManager<byte> over a plain array whose Pin always throws, used to drive the engine's
+ // fallback path for memory that cannot be pinned. MemoryManager and Span require the <byte>
+ // type argument — without it the base type and GetSpan override do not compile.
+ private sealed class NonPinnableMemoryManager : MemoryManager<byte>
+ {
+ private readonly byte[] _buffer;
+
+ public NonPinnableMemoryManager(byte[] buffer)
+ {
+ _buffer = buffer;
+ }
+
+ public override Span<byte> GetSpan() => _buffer;
+
+ // Simulates non-pinnable memory: every pin attempt fails.
+ public override MemoryHandle Pin(int elementIndex = 0)
+ {
+ _ = elementIndex;
+ throw new NotSupportedException("Non-pinnable test memory.");
+ }
+
+ public override void Unpin()
+ {
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ }
+ }
+
+ // Pinnable MemoryManager<byte> that counts Pin/Unpin calls so tests can assert the engine
+ // releases every pin it takes. MemoryManager and Span require the <byte> type argument.
+ private sealed unsafe class TrackingPinnableMemoryManager : MemoryManager<byte>
+ {
+ private readonly byte[] _buffer;
+ private int _pinCount;
+ private int _unpinCount;
+
+ public TrackingPinnableMemoryManager(byte[] buffer)
+ {
+ _buffer = buffer;
+ }
+
+ // Volatile reads: counts are updated from engine threads.
+ public int PinCount => Volatile.Read(ref _pinCount);
+ public int UnpinCount => Volatile.Read(ref _unpinCount);
+
+ public override Span<byte> GetSpan() => _buffer;
+
+ public override MemoryHandle Pin(int elementIndex = 0)
+ {
+ if ((uint)elementIndex > (uint)_buffer.Length)
+ {
+ throw new ArgumentOutOfRangeException(nameof(elementIndex));
+ }
+
+ Interlocked.Increment(ref _pinCount);
+ // The GCHandle travels inside the MemoryHandle; Unpin (via 'this') runs when it is disposed.
+ GCHandle handle = GCHandle.Alloc(_buffer, GCHandleType.Pinned);
+ byte* pointer = (byte*)handle.AddrOfPinnedObject() + elementIndex;
+ return new MemoryHandle(pointer, handle, this);
+ }
+
+ public override void Unpin()
+ {
+ Interlocked.Increment(ref _unpinCount);
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ }
+ }
+
+#if DEBUG
+ // Converts Debug.Fail/Assert failures into exceptions so debug-only assertions fail the test
+ // instead of popping a dialog or being swallowed.
+ private sealed class ThrowingTraceListener : TraceListener
+ {
+ public override void Write(string? message)
+ {
+ }
+
+ public override void WriteLine(string? message)
+ {
+ }
+
+ public override void Fail(string? message, string? detailMessage)
+ {
+ throw new InvalidOperationException($"{message} {detailMessage}");
+ }
+ }
+#endif
+
+ // Maps a selector name onto the corresponding SocketAsyncEngine boolean test hook; throws for
+ // unknown names so a typo in a test surfaces immediately.
+ private static bool InvokeSocketAsyncEngineBoolMethod(string methodName)
+ {
+ return methodName switch
+ {
+ "IsIoUringEnabled" => SocketAsyncEngine.IsIoUringEnabledForTest(),
+ "IsSqPollRequested" => SocketAsyncEngine.IsSqPollRequestedForTest(),
+ "IsIoUringDirectSqeDisabled" => SocketAsyncEngine.IsIoUringDirectSqeDisabledForTest(),
+ "IsZeroCopySendOptedIn" => SocketAsyncEngine.IsZeroCopySendOptedInForTest(),
+ "IsIoUringRegisterBuffersEnabled" => SocketAsyncEngine.IsIoUringRegisterBuffersEnabledForTest(),
+ _ => throw new ArgumentOutOfRangeException(nameof(methodName), methodName, "Unknown SocketAsyncEngine bool selector."),
+ };
+ }
+
+ // Flips the given AppContext switch to true then false and asserts the engine hook observes
+ // the expected value in each state.
+ private static void AssertBooleanAppContextSwitch(
+ string switchName,
+ string methodName,
+ bool expectedWhenSwitchTrue,
+ bool expectedWhenSwitchFalse)
+ {
+ AppContext.SetSwitch(switchName, true);
+ Assert.Equal(expectedWhenSwitchTrue, InvokeSocketAsyncEngineBoolMethod(methodName));
+
+ AppContext.SetSwitch(switchName, false);
+ Assert.Equal(expectedWhenSwitchFalse, InvokeSocketAsyncEngineBoolMethod(methodName));
+ }
+
+ private static long GetIoUringPendingRetryQueuedToPrepareQueueCount()
+ => SocketAsyncEngine.GetIoUringPendingRetryQueuedToPrepareQueueCountForTest();
+
+ // Pins the 64-bit Linux msghdr size and field offsets that the engine's native interop
+ // depends on; a mismatch here means the SQE encoding would corrupt msghdr fields.
+ private static void AssertNativeMsghdrLayoutContractForIoUring()
+ {
+ SocketAsyncEngine.IoUringNativeMsghdrLayoutSnapshotForTest layout =
+ SocketAsyncEngine.GetIoUringNativeMsghdrLayoutForTest();
+
+ Assert.Equal(56, layout.Size);
+ Assert.Equal(0, layout.MsgNameOffset);
+ Assert.Equal(8, layout.MsgNameLengthOffset);
+ Assert.Equal(16, layout.MsgIovOffset);
+ Assert.Equal(24, layout.MsgIovLengthOffset);
+ Assert.Equal(32, layout.MsgControlOffset);
+ Assert.Equal(40, layout.MsgControlLengthOffset);
+ Assert.Equal(48, layout.MsgFlagsOffset);
+ }
+
+ // Only the 8-byte-pointer / 56-byte-msghdr combination is accepted; 32-bit pointers and
+ // unexpected sizes must be rejected.
+ private static void AssertNativeMsghdr32BitRejectionPathForIoUring()
+ {
+ Assert.True(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 8, nativeMsghdrSize: 56));
+ Assert.False(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 4, nativeMsghdrSize: 56));
+ Assert.False(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 8, nativeMsghdrSize: 48));
+ }
+
+ // Pins the completion-slot struct layout; the forced-result field is only asserted when the
+ // snapshot reports it present (offset >= 0).
+ private static void AssertIoUringCompletionSlotLayoutContractForIoUring()
+ {
+ SocketAsyncEngine.IoUringCompletionSlotLayoutSnapshotForTest layout =
+ SocketAsyncEngine.GetIoUringCompletionSlotLayoutForTest();
+
+ Assert.Equal(24, layout.Size);
+ Assert.Equal(0, layout.GenerationOffset);
+ Assert.Equal(8, layout.FreeListNextOffset);
+ Assert.Equal(12, layout.PackedStateOffset);
+ Assert.Equal(16, layout.FixedRecvBufferIdOffset);
+ if (layout.TestForcedResultOffset >= 0)
+ {
+ Assert.Equal(20, layout.TestForcedResultOffset);
+ }
+ }
+
+ // Thin forwarding wrappers over SocketAsyncEngine test hooks, kept here so test bodies read
+ // without repeating the engine type name.
+ private static bool TryInjectIoUringCqOverflowForTest(uint delta, out int injectedEngineCount)
+ => SocketAsyncEngine.TryInjectIoUringCqOverflowForTest(delta, out injectedEngineCount);
+
+ private static bool AssertIoUringCqReflectionTargetsStableForTest()
+ => SocketAsyncEngine.HasActiveIoUringEngineWithInitializedCqStateForTest();
+
+ private static int GetIoUringCompletionSlotsInUseForTest()
+ => SocketAsyncEngine.GetIoUringCompletionSlotsInUseForTest();
+
+ private static int GetIoUringTrackedOperationCountForTest()
+ => SocketAsyncEngine.GetIoUringTrackedOperationCountForTest();
+
+ private static ulong EncodeCompletionSlotUserDataForTest(int slotIndex, ulong generation)
+ => SocketAsyncEngine.EncodeCompletionSlotUserDataForTest(slotIndex, generation);
+
+ private static bool TryDecodeCompletionSlotUserDataForTest(ulong userData, out int slotIndex, out ulong generation)
+ => SocketAsyncEngine.TryDecodeCompletionSlotUserDataForTest(userData, out slotIndex, out generation);
+
+ private static ulong IncrementCompletionSlotGenerationForTest(ulong generation)
+ => SocketAsyncEngine.IncrementCompletionSlotGenerationForTest(generation);
+
+ private static bool IsTrackedIoUringUserDataForTest(ulong userData)
+ => SocketAsyncEngine.IsTrackedIoUringUserDataForTest(userData);
+
+ private static bool TryGetIoUringRingFdForTest(out int ringFd)
+ => SocketAsyncEngine.TryGetIoUringRingFdForTest(out ringFd);
+
+ private static bool TryGetIoUringWakeupEventFdForTest(out int eventFd)
+ => SocketAsyncEngine.TryGetIoUringWakeupEventFdForTest(out eventFd);
+
+ private static bool TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine)
+ {
+ return SocketAsyncEngine.TryGetFirstIoUringEngineForTest(out ioUringEngine);
+ }
+
+ // Boundary contract for completion-slot user-data encoding: the maximum slot index (8191)
+ // with the maximum 43-bit generation must round-trip, and incrementing the maximum generation
+ // must wrap back to 1 and still round-trip.
+ private static void AssertCompletionSlotUserDataEncodingBoundaryContractForIoUring()
+ {
+ const int MaxSlotIndex = 8191;
+ const ulong MaxGeneration = (1UL << 43) - 1;
+
+ ulong encoded = EncodeCompletionSlotUserDataForTest(MaxSlotIndex, MaxGeneration);
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(encoded, out int decodedSlotIndex, out ulong decodedGeneration));
+ Assert.Equal(MaxSlotIndex, decodedSlotIndex);
+ Assert.Equal(MaxGeneration, decodedGeneration);
+
+ ulong wrappedGeneration = IncrementCompletionSlotGenerationForTest(MaxGeneration);
+ Assert.Equal(1UL, wrappedGeneration);
+
+ ulong wrappedEncoded = EncodeCompletionSlotUserDataForTest(MaxSlotIndex, wrappedGeneration);
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(wrappedEncoded, out int wrappedSlotIndex, out ulong wrappedDecodedGeneration));
+ Assert.Equal(MaxSlotIndex, wrappedSlotIndex);
+ Assert.Equal(1UL, wrappedDecodedGeneration);
+ }
+
+ // Polls until the engine's in-use completion-slot count drops to at most 'maxValue' or the
+ // timeout elapses; returns whether the condition ultimately held. The body returns bool, so
+ // the return type must be Task<bool> (callers branch on the awaited result).
+ private static async Task<bool> WaitForIoUringCompletionSlotsInUseAtMostAsync(int maxValue, int timeoutMilliseconds = 10000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringCompletionSlotsInUseForTest() <= maxValue)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ // One last check after the deadline so a just-in-time drop still counts.
+ return GetIoUringCompletionSlotsInUseForTest() <= maxValue;
+ }
+
+ // Polls until the in-use completion-slot count rises above baseline + minimumDelta, or the
+ // timeout elapses; returns whether it did. Must return Task<bool> — the body returns bool.
+ private static async Task<bool> WaitForIoUringCompletionSlotsInUseAboveAsync(int baselineValue, int minimumDelta, int timeoutMilliseconds = 10000)
+ {
+ int threshold = baselineValue + minimumDelta;
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringCompletionSlotsInUseForTest() > threshold)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringCompletionSlotsInUseForTest() > threshold;
+ }
+
+ // Polls until the engine's tracked-operation count drops to at most 'maxValue' or the timeout
+ // elapses; returns whether it did. Must return Task<bool> — the body returns bool.
+ private static async Task<bool> WaitForIoUringTrackedOperationsAtMostAsync(int maxValue, int timeoutMilliseconds = 10000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringTrackedOperationCountForTest() <= maxValue)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringTrackedOperationCountForTest() <= maxValue;
+ }
+
+ // Feature-support probes plus per-listener multishot-accept state, forwarded to internal hooks.
+ private static bool IsIoUringMultishotRecvSupported()
+ => SocketAsyncEngine.IsIoUringMultishotRecvSupportedForTest();
+
+ private static bool IsIoUringMultishotAcceptSupported()
+ => SocketAsyncEngine.IsIoUringMultishotAcceptSupportedForTest();
+
+ private static bool IsListenerMultishotAcceptArmed(Socket listener)
+ => SocketAsyncContext.IsMultishotAcceptArmedForTest(listener);
+
+ private static int GetListenerMultishotAcceptQueueCount(Socket listener)
+ => SocketAsyncContext.GetMultishotAcceptQueueCountForTest(listener);
+
+ // Polls until the listener's multishot-accept armed state matches 'expectedArmed' or the
+ // timeout elapses; returns whether it matched. Must return Task<bool> — the body returns bool.
+ private static async Task<bool> WaitForMultishotAcceptArmedStateAsync(Socket listener, bool expectedArmed, int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (IsListenerMultishotAcceptArmed(listener) == expectedArmed)
+ {
+ return true;
+ }
+
+ await Task.Delay(20);
+ }
+
+ return IsListenerMultishotAcceptArmed(listener) == expectedArmed;
+ }
+
+ // Per-socket persistent multishot-recv state, forwarded to internal context hooks.
+ private static bool IsPersistentMultishotRecvArmed(Socket socket)
+ => SocketAsyncContext.IsPersistentMultishotRecvArmedForTest(socket);
+
+ private static ulong GetPersistentMultishotRecvUserData(Socket socket)
+ => SocketAsyncContext.GetPersistentMultishotRecvUserDataForTest(socket);
+
+ private static int GetPersistentMultishotRecvBufferedCount(Socket socket)
+ => SocketAsyncContext.GetPersistentMultishotRecvBufferedCountForTest(socket);
+
+ // Polls until the socket's persistent multishot-recv armed state matches 'expectedArmed' or
+ // the timeout elapses; returns whether it matched. Must return Task<bool> — the body returns bool.
+ private static async Task<bool> WaitForPersistentMultishotRecvArmedStateAsync(Socket socket, bool expectedArmed, int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (IsPersistentMultishotRecvArmed(socket) == expectedArmed)
+ {
+ return true;
+ }
+
+ await Task.Delay(20);
+ }
+
+ return IsPersistentMultishotRecvArmed(socket) == expectedArmed;
+ }
+
+ // Returns true when RLIMIT_NOFILE permits at least 'requiredDescriptorCount' descriptors.
+ // If getrlimit itself fails, returns true optimistically so the test still runs.
+ private static bool HasSufficientFileDescriptorLimit(int requiredDescriptorCount)
+ {
+ if (requiredDescriptorCount <= 0)
+ {
+ return true;
+ }
+
+ if (GetRLimit(RLIMIT_NOFILE, out RLimit limit) != 0)
+ {
+ return true;
+ }
+
+ return limit.Current >= (nuint)requiredDescriptorCount;
+ }
+
+ // Spawns a /bin/sh child that probes its own /proc/self/fd/{fd}; exit code 0 means the
+ // descriptor was inherited across exec (i.e. it was not CLOEXEC in the parent).
+ private static bool DoesExecChildObserveFileDescriptor(int fd)
+ {
+ if (fd < 0)
+ {
+ return false;
+ }
+
+ using Process process = Process.Start(
+ new ProcessStartInfo
+ {
+ FileName = "/bin/sh",
+ Arguments = $"-c \"[ -e /proc/self/fd/{fd} ]\"",
+ UseShellExecute = false,
+ })!;
+
+ process.WaitForExit();
+ return process.ExitCode == 0;
+ }
+
+ // Polls the zero-copy pin-hold snapshot until 'predicate' accepts it or the timeout elapses;
+ // returns the last snapshot observed either way. The predicate parameter and return type need
+ // their generic arguments (Func<snapshot, bool> / Task<snapshot>) — the body applies the
+ // predicate to a snapshot and returns the snapshot.
+ private static async Task<IoUringZeroCopyPinHoldSnapshot> WaitForZeroCopyPinHoldSnapshotAsync(
+ Func<IoUringZeroCopyPinHoldSnapshot, bool> predicate,
+ int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ IoUringZeroCopyPinHoldSnapshot snapshot = GetIoUringZeroCopyPinHoldSnapshot();
+ while (DateTime.UtcNow < deadline)
+ {
+ if (predicate(snapshot))
+ {
+ return snapshot;
+ }
+
+ await Task.Delay(20);
+ snapshot = GetIoUringZeroCopyPinHoldSnapshot();
+ }
+
+ return snapshot;
+ }
+
+ // Sends a single marker byte from 'client' and asserts 'server' receives exactly that byte.
+ private static async Task AssertConnectedPairRoundTripAsync(Socket client, Socket server, byte marker)
+ {
+ byte[] payload = new byte[] { marker };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(marker, receiveBuffer[0]);
+ }
+
+ // Waits (up to 10s) for every recorded pin to be matched by an unpin, then asserts at least
+ // one pin occurred and the counts balance.
+ private static async Task AssertPinsReleasedAsync(TrackingPinnableMemoryManager manager)
+ {
+ DateTime start = DateTime.UtcNow;
+ while (manager.PinCount != manager.UnpinCount)
+ {
+ if (DateTime.UtcNow - start > TimeSpan.FromSeconds(10))
+ {
+ break;
+ }
+
+ await Task.Delay(20);
+ }
+
+ Assert.True(manager.PinCount > 0, "Expected at least one pin.");
+ Assert.Equal(manager.PinCount, manager.UnpinCount);
+ }
+
+ // Snapshot accessors and SQPOLL/provided-buffer helpers, forwarded to engine test hooks.
+ private static IoUringProvidedBufferSnapshot GetIoUringProvidedBufferSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringProvidedBufferSnapshotForTest();
+ }
+
+ private static IoUringZeroCopySendSnapshot GetIoUringZeroCopySendSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringZeroCopySendSnapshotForTest();
+ }
+
+ private static IoUringFixedRecvSnapshot GetIoUringFixedRecvSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringFixedRecvSnapshotForTest();
+ }
+
+ private static IoUringSqPollSnapshot GetIoUringSqPollSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringSqPollSnapshotForTest();
+ }
+
+ private static bool IsAnyIoUringSqPollEngineNeedingWakeup()
+ => SocketAsyncEngine.IsAnyIoUringSqPollEngineNeedingWakeupForTest();
+
+ // Returns false when validation could not run; otherwise asserts the engine's SqNeedWakeup
+ // view agrees with the raw SQ flag bit.
+ private static bool ValidateSqNeedWakeupMatchesRawSqFlagBit()
+ {
+ if (!SocketAsyncEngine.TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest(out bool matches))
+ {
+ return false;
+ }
+
+ Assert.True(matches, "SqNeedWakeup should match the SQ_NEED_WAKEUP bit contract.");
+ return true;
+ }
+
+ // SQPOLL requires an explicit AppContext opt-in; tests flip it before engine creation.
+ private static void EnableSqPollAppContextOptIn() =>
+ AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", true);
+
+ private static IoUringZeroCopyPinHoldSnapshot GetIoUringZeroCopyPinHoldSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringZeroCopyPinHoldSnapshotForTest();
+ }
+
+ private static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount)
+ => SocketAsyncEngine.TryForceIoUringProvidedBufferRingExhaustionForTest(out forcedBufferCount);
+
+ private static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount)
+ => SocketAsyncEngine.TryRecycleForcedIoUringProvidedBufferRingForTest(out recycledBufferCount);
+
+
+ private static Task<SocketAsyncEventArgs> StartReceiveMessageFromAsync(Socket socket, SocketAsyncEventArgs eventArgs)
+ => StartSocketAsyncEventArgsOperation(socket, eventArgs, static (s, args) => s.ReceiveMessageFromAsync(args));
+
+ // Starts a SocketAsyncEventArgs-based operation and returns a task that completes with the
+ // args instance, whether the operation finishes synchronously (start returns false) or via
+ // the Completed event. Generic arguments are required: the delegate is
+ // Func<Socket, SocketAsyncEventArgs, bool>, the handler EventHandler<SocketAsyncEventArgs>,
+ // and the TCS produces a SocketAsyncEventArgs result.
+ private static Task<SocketAsyncEventArgs> StartSocketAsyncEventArgsOperation(
+ Socket socket,
+ SocketAsyncEventArgs eventArgs,
+ Func<Socket, SocketAsyncEventArgs, bool> startOperation)
+ {
+ var tcs = new TaskCompletionSource<SocketAsyncEventArgs>(TaskCreationOptions.RunContinuationsAsynchronously);
+ EventHandler<SocketAsyncEventArgs> handler = null!;
+ handler = (_, completedArgs) =>
+ {
+ eventArgs.Completed -= handler;
+ tcs.TrySetResult(completedArgs);
+ };
+
+ eventArgs.Completed += handler;
+ if (!startOperation(socket, eventArgs))
+ {
+ // Synchronous completion: Completed will not fire, so finish the task here.
+ eventArgs.Completed -= handler;
+ tcs.TrySetResult(eventArgs);
+ }
+
+ return tcs.Task;
+ }
+
+ // Creates a loopback listener plus a connected client/server pair; the caller owns all three
+ // sockets. AcceptAsync returns Task<Socket> — the generic argument is required so the awaited
+ // accept yields the server socket.
+ private static async Task<(Socket Listener, Socket Client, Socket Server)> CreateConnectedTcpSocketTrioAsync(int listenBacklog = 1)
+ {
+ Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(listenBacklog);
+
+ Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ // Post the accept before connecting so the handshake can complete.
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await AwaitWithTimeoutAsync(client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!), "CreateConnectedTcpSocketTrioAsync_connect");
+ Socket server = await AwaitWithTimeoutAsync(acceptTask, "CreateConnectedTcpSocketTrioAsync_accept");
+ return (listener, client, server);
+ }
+ catch
+ {
+ client.Dispose();
+ throw;
+ }
+ }
+ catch
+ {
+ listener.Dispose();
+ throw;
+ }
+ }
+
+ // Connects a new client to 'endpoint' and accepts it on 'listener', returning the connected
+ // pair. AcceptAsync returns Task<Socket>; the generic argument is required for the await to
+ // yield the server socket.
+ private static async Task<(Socket Client, Socket Server)> AcceptConnectedTcpPairAsync(Socket listener, IPEndPoint endpoint)
+ {
+ Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await AwaitWithTimeoutAsync(client.ConnectAsync(endpoint), "AcceptConnectedTcpPairAsync_connect");
+ Socket server = await AwaitWithTimeoutAsync(acceptTask, "AcceptConnectedTcpPairAsync_accept");
+ return (client, server);
+ }
+ catch
+ {
+ client.Dispose();
+ throw;
+ }
+ }
+
+ // Ping-pongs one byte client->server then server->client for 'iterations' rounds, incrementing
+ // the payload (with wraparound) each round. Each receive is posted before the matching send so
+ // the engine's pending-receive path is exercised.
+ private static async Task RunTcpRoundTripAsync(int iterations)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[] { 1 };
+ byte[] receiveBuffer = new byte[1];
+
+ for (int i = 0; i < iterations; i++)
+ {
+ var serverReceiveTask = server.ReceiveAsync(receiveBuffer, SocketFlags.None);
+ await Task.Yield();
+
+ int clientSent = await client.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, clientSent);
+
+ int serverReceived = await serverReceiveTask;
+ Assert.Equal(1, serverReceived);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ var clientReceiveTask = client.ReceiveAsync(receiveBuffer, SocketFlags.None);
+ await Task.Yield();
+
+ int serverSent = await server.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, serverSent);
+
+ int clientReceived = await clientReceiveTask;
+ Assert.Equal(1, clientReceived);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ // Vary the payload so each round asserts fresh data, not a stale buffer.
+ unchecked
+ {
+ sendBuffer[0]++;
+ }
+ }
+ }
+
+ // Round-trips one byte in each direction over a Unix domain socket pair; no-op when the OS
+ // lacks UDS support. AcceptAsync returns Task<Socket>; the generic argument is required so
+ // 'using Socket server = await acceptTask;' yields a socket.
+ private static async Task RunUnixDomainSocketRoundTripAsync()
+ {
+ if (!Socket.OSSupportsUnixDomainSockets)
+ {
+ return;
+ }
+
+ string path = UnixDomainSocketTest.GetRandomNonExistingFilePath();
+ var endpoint = new UnixDomainSocketEndPoint(path);
+ try
+ {
+ using Socket listener = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+ listener.Bind(endpoint);
+ listener.Listen(1);
+
+ using Socket client = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync(endpoint);
+
+ using Socket server = await acceptTask;
+ await AssertConnectedPairRoundTripAsync(client, server, 0x31);
+ await AssertConnectedPairRoundTripAsync(server, client, 0x32);
+ }
+ finally
+ {
+ // Best-effort cleanup of the bound socket file; ignore races with other deleters.
+ try
+ {
+ System.IO.File.Delete(path);
+ }
+ catch
+ {
+ }
+ }
+ }
+
+ // Exercises a mixed io_uring/epoll engine configuration: after a basic round trip, when
+ // exactly one io_uring engine is active (hybrid under THREAD_COUNT=2), fans out 32 concurrent
+ // connections and round-trips a distinct byte on each. The accept-task array must be
+ // Task<Socket>[] so Task.WhenAll yields Socket[].
+ private static async Task RunHybridIoUringAndEpollEngineScenarioAsync()
+ {
+ await RunTcpRoundTripAsync(4);
+
+ // With DOTNET_SYSTEM_NET_SOCKETS_THREAD_COUNT=2, one io_uring engine indicates a hybrid mix.
+ if (SocketAsyncEngine.GetActiveIoUringEnginesForTest().Length != 1)
+ {
+ return;
+ }
+
+ const int ConnectionCount = 32;
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(ConnectionCount);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ var acceptTasks = new Task<Socket>[ConnectionCount];
+ var clients = new Socket[ConnectionCount];
+ var connectTasks = new Task[ConnectionCount];
+
+ // Post every accept before any connect so all handshakes can proceed concurrently.
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ acceptTasks[i] = listener.AcceptAsync();
+ }
+
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ connectTasks[i] = clients[i].ConnectAsync(endpoint);
+ }
+
+ await Task.WhenAll(connectTasks);
+ Socket[] servers = await Task.WhenAll(acceptTasks);
+
+ try
+ {
+ var work = new Task[ConnectionCount];
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ Socket client = clients[i];
+ Socket server = servers[i];
+ byte value = (byte)(i + 1);
+
+ work[i] = Task.Run(async () =>
+ {
+ byte[] tx = new byte[] { value };
+ byte[] rx = new byte[1];
+
+ int sent = await client.SendAsync(tx, SocketFlags.None);
+ Assert.Equal(1, sent);
+
+ int received = await server.ReceiveAsync(rx, SocketFlags.None);
+ Assert.Equal(1, received);
+ Assert.Equal(value, rx[0]);
+
+ sent = await server.SendAsync(tx, SocketFlags.None);
+ Assert.Equal(1, sent);
+
+ received = await client.ReceiveAsync(rx, SocketFlags.None);
+ Assert.Equal(1, received);
+ Assert.Equal(value, rx[0]);
+ });
+ }
+
+ await Task.WhenAll(work);
+ }
+ finally
+ {
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ servers[i].Dispose();
+ clients[i].Dispose();
+ }
+ }
+ }
+
+        // With exactly one active io_uring engine (thread count forced to 2), runs a
+        // cancellation-heavy workload and sanity-checks the engine's cancel-queue
+        // length and wake-retry counters before and after.
+        private static async Task RunThreadCountTwoCancellationRoutingScenarioAsync()
+        {
+            await RunHybridIoUringAndEpollEngineScenarioAsync();
+
+            SocketAsyncEngine[] engines = SocketAsyncEngine.GetActiveIoUringEnginesForTest();
+            if (engines.Length != 1)
+            {
+                return;
+            }
+
+            SocketAsyncEngine engine = engines[0];
+            long cancelQueueLengthBaseline = engine.IoUringCancelQueueLengthForTest;
+            long wakeRetryBaseline = engine.IoUringCancelQueueWakeRetryCountForTest;
+
+            await RunCancellationSubmitContentionScenarioAsync(connectionCount: 8, cancellationsPerConnection: 64);
+
+            Assert.True(cancelQueueLengthBaseline >= 0);
+            Assert.True(engine.IoUringCancelQueueLengthForTest >= 0);
+            var cancelQueueCapacity = SocketAsyncEngine.GetIoUringCancellationQueueCapacityForTest();
+            Assert.True(engine.IoUringCancelQueueLengthForTest <= cancelQueueCapacity);
+            Assert.True(engine.IoUringCancelQueueWakeRetryCountForTest >= wakeRetryBaseline);
+        }
+
+        // On kernels without io_uring support the socket engine must silently fall
+        // back to epoll: traffic still flows and no io_uring engines stay active.
+        private static async Task RunKernelVersionUnsupportedFallbackScenarioAsync()
+        {
+            await RunTcpRoundTripAsync(4);
+
+            SocketAsyncEngine[] activeEngines = SocketAsyncEngine.GetActiveIoUringEnginesForTest();
+            Assert.Equal(0, activeEngines.Length);
+        }
+
+        // ARM64-only stress: rapidly reuses completion slots across many connections so
+        // generation mismatches surface as stuck operations, then verifies slot usage
+        // and tracked-operation counts drain back to (near) baseline.
+        private static async Task RunTrackedOperationGenerationTransitionStressScenarioAsync(int connectionCount, int iterationsPerConnection)
+        {
+            if (!PlatformDetection.IsArm64Process)
+            {
+                return;
+            }
+
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(connectionCount);
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+            int baselineCompletionSlotsInUse = GetIoUringCompletionSlotsInUseForTest();
+            int baselineTrackedOperations = GetIoUringTrackedOperationCountForTest();
+
+            // List<Socket>: the element type is required for the typed foreach cleanup below.
+            var clients = new List<Socket>(connectionCount);
+            var servers = new List<Socket>(connectionCount);
+            try
+            {
+                for (int i = 0; i < connectionCount; i++)
+                {
+                    (Socket client, Socket server) = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                    clients.Add(client);
+                    servers.Add(server);
+                }
+
+                var workers = new Task[connectionCount];
+                for (int i = 0; i < connectionCount; i++)
+                {
+                    Socket client = clients[i];
+                    Socket server = servers[i];
+                    workers[i] = Task.Run(async () =>
+                    {
+                        byte[] sendBuffer = new byte[1];
+                        byte[] receiveBuffer = new byte[1];
+                        for (int iteration = 0; iteration < iterationsPerConnection; iteration++)
+                        {
+                            // Stress rapid slot reuse so generation mismatches surface as stuck operations
+                            // rather than silently passing under low churn.
+                            Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+                            await Task.Yield();
+
+                            int sent = await client.SendAsync(sendBuffer, SocketFlags.None);
+                            Assert.Equal(1, sent);
+
+                            int received = await receiveTask;
+                            Assert.Equal(1, received);
+                            Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+                            unchecked
+                            {
+                                sendBuffer[0]++;
+                            }
+                        }
+                    });
+                }
+
+                Task workerTask = Task.WhenAll(workers);
+                Task completed = await Task.WhenAny(workerTask, Task.Delay(TimeSpan.FromSeconds(60)));
+                Assert.Same(workerTask, completed);
+                await workerTask;
+            }
+            finally
+            {
+                foreach (Socket server in servers)
+                {
+                    server.Dispose();
+                }
+
+                foreach (Socket client in clients)
+                {
+                    client.Dispose();
+                }
+            }
+
+            // Allow a small slop of 2: unrelated engine activity can hold a couple of slots.
+            Assert.True(
+                await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineCompletionSlotsInUse + 2, timeoutMilliseconds: 15000),
+                "Completion-slot usage remained elevated after ARM64 generation-transition stress.");
+            Assert.True(
+                await WaitForIoUringTrackedOperationsAtMostAsync(baselineTrackedOperations + 2, timeoutMilliseconds: 15000),
+                "Tracked-operation count remained elevated after ARM64 generation-transition stress.");
+        }
+
+        // Validates the user_data generation encoding: the active multishot recv
+        // user_data is tracked, the generation counter wraps to 1 past its encoded
+        // maximum, and a stale wrapped-generation user_data is rejected on dispatch.
+        private static async Task RunGenerationWrapAroundDispatchScenarioAsync()
+        {
+            if (!IsIoUringMultishotRecvSupported())
+            {
+                return;
+            }
+
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket listener = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] receiveBuffer = new byte[1];
+            // Task<int>: the received byte count is asserted below.
+            Task<int> armReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0x5C }, SocketFlags.None));
+            Assert.Equal(1, await armReceive);
+            Assert.True(
+                await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true),
+                "Expected persistent multishot recv to arm before generation-wrap dispatch validation.");
+
+            ulong activeUserData = GetPersistentMultishotRecvUserData(server);
+            Assert.NotEqual(0UL, activeUserData);
+            Assert.True(IsTrackedIoUringUserDataForTest(activeUserData), "Active multishot user_data should be tracked.");
+            Assert.True(TryDecodeCompletionSlotUserDataForTest(activeUserData, out int slotIndex, out ulong generation));
+
+            // Derive max generation from encoding mask and verify helper wrap contract.
+            ulong maxEncodedUserData = EncodeCompletionSlotUserDataForTest(slotIndex, ulong.MaxValue);
+            Assert.True(TryDecodeCompletionSlotUserDataForTest(maxEncodedUserData, out _, out ulong maxGeneration));
+            Assert.Equal(1UL, IncrementCompletionSlotGenerationForTest(maxGeneration));
+
+            ulong staleGeneration = IncrementCompletionSlotGenerationForTest(generation);
+            ulong staleUserData = EncodeCompletionSlotUserDataForTest(slotIndex, staleGeneration);
+            if (staleUserData == activeUserData)
+            {
+                // Increment wrapped back onto the active generation; pick a different one.
+                staleUserData = EncodeCompletionSlotUserDataForTest(slotIndex, generation == 1UL ? 2UL : 1UL);
+            }
+
+            Assert.NotEqual(activeUserData, staleUserData);
+            Assert.False(
+                IsTrackedIoUringUserDataForTest(staleUserData),
+                "Stale wrapped-generation user_data should be rejected during dispatch lookup.");
+            Assert.True(IsTrackedIoUringUserDataForTest(activeUserData));
+        }
+
+        // Sends a payload split across three ArraySegment<byte> buffers in one
+        // buffer-list SendAsync and verifies the receiver observes the full payload.
+        private static async Task RunBufferListSendRoundTripAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] payload = new byte[] { 0x11, 0x22, 0x33, 0x44, 0x55 };
+            var sendBuffers = new List<ArraySegment<byte>>
+            {
+                new ArraySegment<byte>(payload, 0, 2),
+                new ArraySegment<byte>(payload, 2, 1),
+                new ArraySegment<byte>(payload, 3, 2)
+            };
+
+            byte[] receiveBuffer = new byte[payload.Length];
+            Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            await Task.Yield();
+
+            int sent = await client.SendAsync(sendBuffers, SocketFlags.None);
+            Assert.Equal(payload.Length, sent);
+            Assert.Equal(payload.Length, await receiveTask);
+            Assert.Equal(payload, receiveBuffer);
+        }
+
+        // UDP ReceiveMessageFromAsync round trip with packet information enabled:
+        // a single datagram travels sender -> receiver and the reported byte count,
+        // payload, and remote endpoint are verified.
+        private static async Task RunReceiveMessageFromRoundTripAsync()
+        {
+            using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+
+            receiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+            receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            byte[] datagram = { 0x91, 0x92, 0x93 };
+            byte[] inbound = new byte[datagram.Length];
+            EndPoint anySource = new IPEndPoint(IPAddress.Any, 0);
+
+            // Arm the receive first so the send below completes it.
+            var pendingReceive = receiver.ReceiveMessageFromAsync(inbound, SocketFlags.None, anySource);
+            await Task.Yield();
+
+            int sentBytes = await sender.SendToAsync(datagram, SocketFlags.None, receiver.LocalEndPoint!);
+            Assert.Equal(datagram.Length, sentBytes);
+
+            SocketReceiveMessageFromResult messageResult = await pendingReceive;
+            Assert.Equal(datagram.Length, messageResult.ReceivedBytes);
+            Assert.Equal(datagram, inbound);
+            Assert.Equal(sender.LocalEndPoint, messageResult.RemoteEndPoint);
+        }
+
+        // UDP ReceiveMessageFromAsync round trip for IPv4 or IPv6, additionally
+        // validating the packet-information destination/source address reported
+        // alongside the payload.
+        private static async Task RunReceiveMessageFromPacketInformationRoundTripAsync(bool useIpv6)
+        {
+            if (useIpv6 && !Socket.OSSupportsIPv6)
+            {
+                return;
+            }
+
+            AddressFamily addressFamily = useIpv6 ? AddressFamily.InterNetworkV6 : AddressFamily.InterNetwork;
+            SocketOptionLevel optionLevel = useIpv6 ? SocketOptionLevel.IPv6 : SocketOptionLevel.IP;
+            IPAddress loopbackAddress = useIpv6 ? IPAddress.IPv6Loopback : IPAddress.Loopback;
+            IPAddress anyAddress = useIpv6 ? IPAddress.IPv6Any : IPAddress.Any;
+
+            using Socket receiver = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp);
+            using Socket sender = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp);
+
+            receiver.SetSocketOption(optionLevel, SocketOptionName.PacketInformation, true);
+            receiver.Bind(new IPEndPoint(loopbackAddress, 0));
+            sender.Bind(new IPEndPoint(loopbackAddress, 0));
+
+            byte[] payload = useIpv6 ?
+                new byte[] { 0xA1, 0xA2, 0xA3 } :
+                new byte[] { 0x90, 0x91, 0x92, 0x93 };
+            byte[] receiveBuffer = new byte[payload.Length];
+            EndPoint remoteEndPoint = new IPEndPoint(anyAddress, 0);
+
+            // Task<SocketReceiveMessageFromResult>: the result struct is consumed below.
+            Task<SocketReceiveMessageFromResult> receiveTask =
+                ToTask(receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint));
+            await Task.Yield();
+
+            int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+            Assert.Equal(payload.Length, sent);
+
+            SocketReceiveMessageFromResult result = await receiveTask;
+            Assert.Equal(payload.Length, result.ReceivedBytes);
+            Assert.Equal(payload, receiveBuffer);
+            Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint);
+            Assert.Equal(((IPEndPoint)sender.LocalEndPoint!).Address, result.PacketInformation.Address);
+        }
+
+        // Sends from memory that refuses pinning, forcing the io_uring path to fall
+        // back to a copy/non-registered send; the receiver must still get the payload.
+        private static async Task RunNonPinnableMemorySendFallbackScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] payload = new byte[] { 0x71, 0x72, 0x73, 0x74 };
+            // NOTE(review): if NonPinnableMemoryManager is declared generic, this needs
+            // an explicit <byte> argument — confirm against the helper's declaration.
+            using var nonPinnableMemory = new NonPinnableMemoryManager(payload);
+            byte[] receiveBuffer = new byte[payload.Length];
+
+            Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            await Task.Yield();
+            int sent = await client.SendAsync(nonPinnableMemory.Memory, SocketFlags.None);
+            Assert.Equal(payload.Length, sent);
+            Assert.Equal(payload.Length, await receiveTask);
+            Assert.Equal(payload, receiveBuffer);
+        }
+
+        // Receives into memory that refuses pinning, forcing the io_uring path to
+        // fall back; the payload must still arrive intact.
+        private static async Task RunNonPinnableMemoryReceiveFallbackScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] receiveBuffer = new byte[4];
+            // NOTE(review): if NonPinnableMemoryManager is declared generic, this needs
+            // an explicit <byte> argument — confirm against the helper's declaration.
+            using var nonPinnableMemory = new NonPinnableMemoryManager(receiveBuffer);
+            byte[] payload = new byte[] { 0x81, 0x82, 0x83, 0x84 };
+
+            Task<int> receiveTask = ToTask(server.ReceiveAsync(nonPinnableMemory.Memory, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
+            Assert.Equal(payload.Length, await receiveTask);
+            Assert.Equal(payload, receiveBuffer);
+        }
+
+        // Dispatches to the direction-specific non-pinnable-memory fallback scenario.
+        private static Task RunNonPinnableMemoryFallbackScenarioAsync(bool receivePath)
+        {
+            if (receivePath)
+            {
+                return RunNonPinnableMemoryReceiveFallbackScenarioAsync();
+            }
+
+            return RunNonPinnableMemorySendFallbackScenarioAsync();
+        }
+
+        // Verifies that pinned receive buffers are unpinned on all three exits:
+        // normal completion, token cancellation, and socket teardown/abort.
+        private static async Task RunPinnableMemoryPinReleaseLifecycleScenarioAsync()
+        {
+            if (!GetIoUringProvidedBufferSnapshot().HasIoUringPort)
+            {
+                return;
+            }
+
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            // Completion path: receive completes with data and must release pin.
+            byte[] completionPayload = new byte[] { 0x91 };
+            using var completionMemory = new TrackingPinnableMemoryManager(new byte[completionPayload.Length]);
+            Task<int> completionReceive = ToTask(server.ReceiveAsync(completionMemory.Memory, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await client.SendAsync(completionPayload, SocketFlags.None));
+            Assert.Equal(1, await completionReceive);
+            Assert.Equal(completionPayload, completionMemory.GetSpan().ToArray());
+            await AssertPinsReleasedAsync(completionMemory);
+
+            // Cancellation path: pending receive canceled by token must release pin.
+            using var cancellationMemory = new TrackingPinnableMemoryManager(new byte[16]);
+            using (var cts = new CancellationTokenSource())
+            {
+                Task<int> canceledReceive = ToTask(server.ReceiveAsync(cancellationMemory.Memory, SocketFlags.None, cts.Token));
+                await Task.Delay(20);
+                cts.Cancel();
+
+                Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive);
+                AssertCanceledOrInterrupted(canceledException);
+            }
+
+            await AssertPinsReleasedAsync(cancellationMemory);
+
+            // Teardown/abort path: pending receive interrupted by close must release pin.
+            using var teardownMemory = new TrackingPinnableMemoryManager(new byte[16]);
+            Task<int> teardownReceive = ToTask(server.ReceiveAsync(teardownMemory.Memory, SocketFlags.None));
+            await Task.Yield();
+            client.Dispose();
+            server.Dispose();
+
+            Exception? teardownException = await Record.ExceptionAsync(async () => await teardownReceive);
+            AssertCanceledDisposedOrInterrupted(teardownException);
+            await AssertPinsReleasedAsync(teardownMemory);
+        }
+
+        // After a completed receive and a canceled receive, the provided-buffer ring
+        // accounting must remain consistent: total == available + in-use, nothing leaked.
+        private static async Task RunProvidedBufferRegistrationLifecycleScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] receiveBuffer = new byte[1];
+            Task<int> initialReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0xA1 }, SocketFlags.None));
+            Assert.Equal(1, await initialReceive);
+
+            IoUringProvidedBufferSnapshot initialSnapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsProvidedBufferSnapshotUsable(initialSnapshot))
+            {
+                return;
+            }
+
+            Assert.Equal(initialSnapshot.TotalBufferCount, initialSnapshot.AvailableCount + initialSnapshot.InUseCount);
+            Assert.Equal(0, initialSnapshot.InUseCount);
+
+            using (var cts = new CancellationTokenSource())
+            {
+                Task<int> canceledReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None, cts.Token));
+                await Task.Yield();
+                cts.Cancel();
+
+                Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive);
+                AssertCanceledOrInterrupted(canceledException);
+            }
+
+            // Give the engine a moment to recycle the canceled operation's buffer.
+            await Task.Delay(50);
+            IoUringProvidedBufferSnapshot postCancellationSnapshot = GetIoUringProvidedBufferSnapshot();
+            Assert.Equal(initialSnapshot.TotalBufferCount, postCancellationSnapshot.TotalBufferCount);
+            Assert.Equal(postCancellationSnapshot.TotalBufferCount, postCancellationSnapshot.AvailableCount + postCancellationSnapshot.InUseCount);
+            Assert.Equal(0, postCancellationSnapshot.InUseCount);
+        }
+
+        // One buffer-select receive round trip; afterwards the provided-buffer ring
+        // must have returned the buffer (in-use count back to zero).
+        private static async Task RunProvidedBufferSelectReceiveScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsProvidedBufferSnapshotUsable(beforeSnapshot))
+            {
+                return;
+            }
+
+            byte[] receiveBuffer = new byte[1];
+            Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            await Task.Yield();
+
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0xB2 }, SocketFlags.None));
+            Assert.Equal(1, await receiveTask);
+            Assert.Equal(0xB2, receiveBuffer[0]);
+
+            IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+            Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
+            Assert.Equal(0, afterSnapshot.InUseCount);
+        }
+
+        // Cycles more receives than the ring has buffers to prove recycling works:
+        // no allocation failures, stable total count, everything available at the end.
+        private static async Task RunProvidedBufferRecycleReuseScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsProvidedBufferSnapshotUsable(beforeSnapshot))
+            {
+                return;
+            }
+
+            long allocationFailuresBefore = beforeSnapshot.AllocationFailureCount;
+
+            // Exceed the ring size so every buffer is reused at least once.
+            int iterations = Math.Max(beforeSnapshot.TotalBufferCount + 64, 512);
+            byte[] receiveBuffer = new byte[1];
+            byte[] payload = new byte[1];
+
+            for (int i = 0; i < iterations; i++)
+            {
+                Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+                await Task.Yield();
+
+                payload[0] = unchecked((byte)i);
+                Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+                Assert.Equal(1, await receiveTask);
+                Assert.Equal(payload[0], receiveBuffer[0]);
+            }
+
+            IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+            Assert.Equal(allocationFailuresBefore, afterSnapshot.AllocationFailureCount);
+            Assert.Equal(beforeSnapshot.TotalBufferCount, afterSnapshot.TotalBufferCount);
+            Assert.Equal(0, afterSnapshot.InUseCount);
+            Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount);
+        }
+
+        // Forces the provided-buffer ring into exhaustion and verifies a pending
+        // receive fails promptly with SocketError.NoBufferSpaceAvailable (ENOBUFS).
+        private static async Task RunProvidedBufferExhaustionScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] warmupBuffer = new byte[1];
+            Task<int> warmupReceive = ToTask(server.ReceiveAsync(warmupBuffer, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0xC1 }, SocketFlags.None));
+            Assert.Equal(1, await warmupReceive);
+
+            IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsProvidedBufferSnapshotUsable(snapshot))
+            {
+                return;
+            }
+
+            Assert.True(TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount));
+            Assert.True(forcedBufferCount > 0);
+
+            byte[] receiveBuffer = new byte[1];
+            Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            await Task.Yield();
+
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0xC2 }, SocketFlags.None));
+            Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(receiveTask, completed);
+
+            Exception? receiveException = await Record.ExceptionAsync(async () => await receiveTask);
+            // The generic Assert.IsType<T> both checks and casts the exception.
+            SocketException socketException = Assert.IsType<SocketException>(receiveException);
+            Assert.Equal(SocketError.NoBufferSpaceAvailable, socketException.SocketErrorCode);
+        }
+
+        // Runs concurrent TCP and UDP receives against the same provided-buffer
+        // infrastructure and checks the ring accounting stays balanced afterwards.
+        private static async Task RunProvidedBufferMixedWorkloadScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsProvidedBufferSnapshotUsable(beforeSnapshot))
+            {
+                return;
+            }
+
+            using Socket udpReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            using Socket udpSender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            udpReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            udpSender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            byte[] tcpReceiveBuffer = new byte[1];
+            byte[] udpReceiveBuffer = new byte[2];
+
+            Task<int> tcpReceive = ToTask(server.ReceiveAsync(tcpReceiveBuffer, SocketFlags.None));
+            Task<SocketReceiveFromResult> udpReceive = ToTask(
+                udpReceiver.ReceiveFromAsync(
+                    udpReceiveBuffer,
+                    SocketFlags.None,
+                    new IPEndPoint(IPAddress.Any, 0)));
+            await Task.Yield();
+
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0xD1 }, SocketFlags.None));
+            Assert.Equal(2, await udpSender.SendToAsync(new byte[] { 0xE1, 0xE2 }, SocketFlags.None, udpReceiver.LocalEndPoint!));
+
+            Assert.Equal(1, await tcpReceive);
+            Assert.Equal(0xD1, tcpReceiveBuffer[0]);
+
+            SocketReceiveFromResult udpResult = await udpReceive;
+            Assert.Equal(2, udpResult.ReceivedBytes);
+            Assert.Equal(0xE1, udpReceiveBuffer[0]);
+            Assert.Equal(0xE2, udpReceiveBuffer[1]);
+
+            IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+            Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
+            Assert.Equal(0, afterSnapshot.InUseCount);
+        }
+
+        // Sends the entire buffer, looping over partial sends; asserts progress on
+        // every iteration so a stalled socket fails the test instead of hanging.
+        private static async Task SendExactlyAsync(Socket socket, ReadOnlyMemory<byte> buffer)
+        {
+            int totalSent = 0;
+            while (totalSent < buffer.Length)
+            {
+                int sent = await socket.SendAsync(buffer.Slice(totalSent), SocketFlags.None);
+                Assert.True(sent > 0, "Socket.SendAsync returned 0 before sending all bytes.");
+                totalSent += sent;
+            }
+        }
+
+        // Receives until the buffer is full, looping over partial reads; a zero-byte
+        // read (peer closed early) fails the test immediately.
+        private static async Task ReceiveExactlyAsync(Socket socket, Memory<byte> buffer)
+        {
+            int totalReceived = 0;
+            while (totalReceived < buffer.Length)
+            {
+                int received = await socket.ReceiveAsync(buffer.Slice(totalReceived), SocketFlags.None);
+                Assert.True(received > 0, "Socket.ReceiveAsync returned 0 before receiving all expected bytes.");
+                totalReceived += received;
+            }
+        }
+
+        // Polls the provided-buffer snapshot until 'predicate' matches or the timeout
+        // elapses; returns the last snapshot observed either way (callers re-assert).
+        private static async Task<IoUringProvidedBufferSnapshot> WaitForProvidedBufferSnapshotAsync(
+            Func<IoUringProvidedBufferSnapshot, bool> predicate,
+            int timeoutMilliseconds = 10000)
+        {
+            // Monotonic clock: wall-clock (DateTime.UtcNow) adjustments must not
+            // shorten or extend the wait.
+            long startTicks = Environment.TickCount64;
+            IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+            while (Environment.TickCount64 - startTicks < timeoutMilliseconds)
+            {
+                if (predicate(snapshot))
+                {
+                    return snapshot;
+                }
+
+                await Task.Delay(50);
+                snapshot = GetIoUringProvidedBufferSnapshot();
+            }
+
+            return snapshot;
+        }
+
+        // Drives a sustained stream of small (64-byte) messages and waits for the
+        // adaptive provided-buffer sizing to recommend (or apply) a smaller buffer.
+        private static async Task RunAdaptiveProvidedBufferSmallMessageShrinkScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            IoUringProvidedBufferSnapshot startSnapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsAdaptiveSizingUsable(startSnapshot))
+            {
+                return;
+            }
+
+            int initialBufferSize = startSnapshot.BufferSize;
+            Assert.True(initialBufferSize > 0);
+
+            const int payloadSize = 64;
+            byte[] outbound = new byte[payloadSize];
+            byte[] inbound = new byte[payloadSize];
+
+            for (int round = 0; round < 320; round++)
+            {
+                outbound.AsSpan().Fill(unchecked((byte)round));
+                Task pendingReceive = ReceiveExactlyAsync(server, inbound);
+                await SendExactlyAsync(client, outbound);
+                await pendingReceive;
+                Assert.Equal(outbound, inbound);
+            }
+
+            IoUringProvidedBufferSnapshot endSnapshot = await WaitForProvidedBufferSnapshotAsync(
+                s => IsAdaptiveSizingUsable(s) &&
+                     (s.RecommendedBufferSize < initialBufferSize || s.BufferSize < initialBufferSize));
+
+            Assert.True(
+                endSnapshot.RecommendedBufferSize < initialBufferSize || endSnapshot.BufferSize < initialBufferSize,
+                $"Expected adaptive recommendation to shrink from {initialBufferSize}. " +
+                $"actual buffer={endSnapshot.BufferSize}, recommended={endSnapshot.RecommendedBufferSize}");
+        }
+
+        // Drives a sustained stream of buffer-filling messages and waits for the
+        // adaptive provided-buffer sizing to recommend (or apply) a larger buffer.
+        private static async Task RunAdaptiveProvidedBufferLargeMessageGrowScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            IoUringProvidedBufferSnapshot startSnapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsAdaptiveSizingUsable(startSnapshot))
+            {
+                return;
+            }
+
+            int initialBufferSize = startSnapshot.BufferSize;
+            Assert.True(initialBufferSize > 0);
+
+            // Payload exactly fills the current buffer, pushing the sizing upward.
+            byte[] outbound = new byte[initialBufferSize];
+            byte[] inbound = new byte[initialBufferSize];
+            outbound.AsSpan().Fill(0x5A);
+
+            for (int round = 0; round < 320; round++)
+            {
+                Task pendingReceive = ReceiveExactlyAsync(server, inbound);
+                await SendExactlyAsync(client, outbound);
+                await pendingReceive;
+                Assert.Equal(outbound, inbound);
+            }
+
+            IoUringProvidedBufferSnapshot endSnapshot = await WaitForProvidedBufferSnapshotAsync(
+                s => IsAdaptiveSizingUsable(s) &&
+                     (s.RecommendedBufferSize > initialBufferSize || s.BufferSize > initialBufferSize));
+
+            Assert.True(
+                endSnapshot.RecommendedBufferSize > initialBufferSize || endSnapshot.BufferSize > initialBufferSize,
+                $"Expected adaptive recommendation to grow from {initialBufferSize}. " +
+                $"actual buffer={endSnapshot.BufferSize}, recommended={endSnapshot.RecommendedBufferSize}");
+        }
+
+        // Alternates small and buffer-filling messages; with a balanced workload the
+        // adaptive sizing should hold its recommendation at the initial buffer size.
+        private static async Task RunAdaptiveProvidedBufferMixedWorkloadStableScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            IoUringProvidedBufferSnapshot startSnapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsAdaptiveSizingUsable(startSnapshot))
+            {
+                return;
+            }
+
+            int initialBufferSize = startSnapshot.BufferSize;
+            Assert.True(initialBufferSize > 0);
+
+            byte[] smallOutbound = new byte[64];
+            byte[] smallInbound = new byte[64];
+            byte[] largeOutbound = new byte[initialBufferSize];
+            byte[] largeInbound = new byte[initialBufferSize];
+            smallOutbound.AsSpan().Fill(0x11);
+            largeOutbound.AsSpan().Fill(0x77);
+
+            for (int round = 0; round < 320; round++)
+            {
+                // Odd rounds use the large payload, even rounds the small one.
+                bool largeRound = (round & 1) == 1;
+                byte[] outbound = largeRound ? largeOutbound : smallOutbound;
+                byte[] inbound = largeRound ? largeInbound : smallInbound;
+
+                Task pendingReceive = ReceiveExactlyAsync(server, inbound);
+                await SendExactlyAsync(client, outbound);
+                await pendingReceive;
+                Assert.Equal(outbound, inbound);
+            }
+
+            await Task.Delay(250);
+            IoUringProvidedBufferSnapshot endSnapshot = GetIoUringProvidedBufferSnapshot();
+            Assert.True(IsAdaptiveSizingUsable(endSnapshot));
+            Assert.Equal(initialBufferSize, endSnapshot.RecommendedBufferSize);
+        }
+
+        // Drives enough small-message traffic to trigger an adaptive resize ring swap
+        // and verifies every payload round-trips intact while the swap happens.
+        private static async Task RunAdaptiveProvidedBufferResizeSwapNoDataLossScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            IoUringProvidedBufferSnapshot startSnapshot = GetIoUringProvidedBufferSnapshot();
+            if (!IsAdaptiveSizingUsable(startSnapshot))
+            {
+                return;
+            }
+
+            int initialBufferSize = startSnapshot.BufferSize;
+            Assert.True(initialBufferSize > 0);
+
+            const int payloadSize = 64;
+            byte[] outbound = new byte[payloadSize];
+            byte[] inbound = new byte[payloadSize];
+            for (int round = 0; round < 384; round++)
+            {
+                outbound.AsSpan().Fill(unchecked((byte)round));
+                Task pendingReceive = ReceiveExactlyAsync(server, inbound);
+                await SendExactlyAsync(client, outbound);
+                await pendingReceive;
+                Assert.Equal(outbound, inbound);
+            }
+
+            IoUringProvidedBufferSnapshot endSnapshot = await WaitForProvidedBufferSnapshotAsync(
+                s => IsAdaptiveSizingUsable(s) && s.BufferSize < initialBufferSize,
+                timeoutMilliseconds: 15000);
+
+            Assert.True(
+                endSnapshot.BufferSize < initialBufferSize,
+                $"Expected adaptive resize swap to shrink active ring. initial={initialBufferSize}, current={endSnapshot.BufferSize}");
+        }
+
+ private static async Task RunAdaptiveProvidedBufferResizeSwapConcurrentInFlightNoDataLossScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsAdaptiveSizingUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ int initialBufferSize = beforeSnapshot.BufferSize;
+ Assert.True(initialBufferSize > 0);
+
+ const int batchSize = 64;
+ const int rounds = 24;
+
+ // Keep many receives in flight while driving enough completions to trigger adaptive
+ // resize; this exercises ring-swap safety under concurrent tracked receive activity.
+ for (int round = 0; round < rounds; round++)
+ {
+ Task