From db61cfe38e81213f21ff018f70d1e98ce2341f93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20Wei=C3=9F?= Date: Tue, 24 Apr 2018 18:51:24 +0100 Subject: [PATCH] make enums containing ByteBuffer fit in three words (#349) Motivation: NIO's dynamic pipeline comes with one performance caveat: If your type isn't specialised in NIOAny (only types in the NIO module can be) and it's over 3 words (24 bytes on 64-bit platforms), you'll suffer an allocation every time you box it in NIOAny. Very unfortunately, both ByteBuffer and FileRegion were exactly 3 words wide, which means that any enum containing those would be wider than 3 words. Worse, both HTTP{Request,Response}Head contained types that were wider than 3 words. The best solution to this problem is to shrink ByteBuffer and FileRegion to just under 3 words and that's exactly what this PR is doing. That is slightly tricky as ByteBuffer was already bit-packed fairly well (reader/writer indices & slice begin/end were stored as a UInt32). The trick we employ now is to store the slice beginning in a UInt24 and the file region reader index in a UInt56. That saves one byte for both ByteBuffer and FileRegion with very moderate tradeoffs: The reader index in a file now needs to be within 64 PiB (pebibytes) and a byte buffer slice beginning must start within 16 MiB (mebibytes). Note: The reader/writer indices as well as slice ends are _not_ affected and can still be within 4 GiB. Clearly no one would care about the restrictions for FileRegions in the real world, but we might hit the ByteBuffer slice beginning limit, in which case the slice would be copied out. But given that slices are mostly used to slice off headers in network protocols, 16 MiB should be _plenty_. 
Norman was kind enough to measure the perf differences: master (before this PR): ``` $ wrk -c 256 -d 10s -t 4 -s ~/Downloads/pipeline-many.lua -H "X-Host: SomeValue" -H "Host: swiftnio.io" -H "ThereAreEvenMoreHeaders: AndMoreValues" http://localhost:8888 Running 10s test @ http://localhost:8888 4 threads and 256 connections Thread Stats Avg Stdev Max +/- Stdev Latency 93.02ms 158.13ms 1.97s 93.07% Req/Sec 112.79k 26.05k 258.75k 84.50% 4501098 requests in 10.04s, 214.63MB read Socket errors: connect 0, read 111, write 0, timeout 89 Requests/sec: 448485.61 Transfer/sec: 21.39MB ``` after this PR: ``` $ wrk -c 256 -d 10s -t 4 -s ~/Downloads/pipeline-many.lua -H "X-Host: SomeValue" -H "Host: swiftnio.io" -H "ThereAreEvenMoreHeaders: AndMoreValues" http://localhost:8888 Running 10s test @ http://localhost:8888 4 threads and 256 connections Thread Stats Avg Stdev Max +/- Stdev Latency 107.53ms 206.56ms 1.99s 90.55% Req/Sec 124.15k 26.56k 290.41k 89.25% 4952904 requests in 10.03s, 236.17MB read Socket errors: connect 0, read 161, write 0, timeout 22 Requests/sec: 493852.65 Transfer/sec: 23.55MB ``` so we see a nice 10% improvement Modifications: - shrank ByteBuffer by 1 byte, making it 23 bytes in total - shrank FileRegion by 1 byte, making it 23 bytes in total - added @_inlineable to NIOAny where appropriate to not suffer boxed existentials in different places - added tests for the new edge cases Result: - more speed - fewer allocations --- Sources/NIO/ByteBuffer-core.swift | 62 ++++++++- Sources/NIO/FileRegion.swift | 20 ++- Sources/NIO/IntegerTypes.swift | 137 +++++++++++++++++++ Sources/NIO/NIOAny.swift | 3 + Sources/NIO/TypeAssistedChannelHandler.swift | 8 ++ Tests/NIOTests/ByteBufferTest+XCTest.swift | 3 + Tests/NIOTests/ByteBufferTest.swift | 102 ++++++++++++++ Tests/NIOTests/FileRegionTest+XCTest.swift | 3 + Tests/NIOTests/FileRegionTest.swift | 59 ++++++++ 9 files changed, 386 insertions(+), 11 deletions(-) create mode 100644 Sources/NIO/IntegerTypes.swift diff 
--git a/Sources/NIO/ByteBuffer-core.swift b/Sources/NIO/ByteBuffer-core.swift index dcee7a569e..6d74396ba5 100644 --- a/Sources/NIO/ByteBuffer-core.swift +++ b/Sources/NIO/ByteBuffer-core.swift @@ -29,6 +29,43 @@ let sysFree: @convention(c) (UnsafeMutableRawPointer?) -> Void = free } #endif +extension _ByteBufferSlice: Equatable { + static func ==(_ lhs: _ByteBufferSlice, _ rhs: _ByteBufferSlice) -> Bool { + return lhs._begin == rhs._begin && lhs.upperBound == rhs.upperBound + } +} + +/// The slice of a `ByteBuffer`, it's different from `Range` because the lower bound is actually only +/// 24 bits (the upper bound is still 32). Before constructing, you need to make sure the lower bound actually +/// fits within 24 bits, otherwise the behaviour is undefined. +@_versioned +struct _ByteBufferSlice { + @_versioned var upperBound: ByteBuffer.Index + @_versioned var _begin: _UInt24 + @_versioned var lowerBound: ByteBuffer.Index { + return UInt32(self._begin) + } + @_inlineable @_versioned var count: Int { + return Int(self.upperBound - self.lowerBound) + } + init() { + self._begin = 0 + self.upperBound = 0 + } + static var maxSupportedLowerBound: ByteBuffer.Index { + return ByteBuffer.Index(_UInt24.max) + } +} + +extension _ByteBufferSlice { + init(_ range: Range) { + self = _ByteBufferSlice() + + self._begin = _UInt24(range.lowerBound) + self.upperBound = range.upperBound + } +} + /// The preferred allocator for `ByteBuffer` values. The allocation strategy is opaque but is currently libc's /// `malloc`, `realloc` and `free`. /// @@ -163,15 +200,15 @@ public struct ByteBufferAllocator { /// for doing so. In any case, if you use the `get` prefixed methods you are responsible for ensuring that you do not reach into uninitialized memory by taking the `readableBytes` and `readerIndex` into /// account, and ensuring that you have previously written into the area covered by the `index itself. 
public struct ByteBuffer { - typealias Slice = Range + typealias Slice = _ByteBufferSlice typealias Allocator = ByteBufferAllocator typealias Index = UInt32 typealias Capacity = UInt32 + @_versioned private(set) var _storage: _Storage @_versioned private(set) var _readerIndex: Index = 0 @_versioned private(set) var _writerIndex: Index = 0 @_versioned private(set) var _slice: Slice - @_versioned private(set) var _storage: _Storage // MARK: Internal _Storage for CoW @_versioned final class _Storage { @@ -184,7 +221,7 @@ public struct ByteBuffer { self.bytes = bytesNoCopy self.capacity = capacity self.allocator = allocator - self.fullSlice = 0.. _Storage { + public func reallocSlice(_ slice: Range, capacity: Capacity) -> _Storage { assert(slice.count <= capacity) let new = self.allocateStorage(capacity: capacity) self.allocator.memcpy(new.bytes, self.bytes.advanced(by: Int(slice.lowerBound)), slice.count) @@ -223,7 +260,7 @@ public struct ByteBuffer { ptr.bindMemory(to: UInt8.self, capacity: Int(capacity)) self.bytes = ptr self.capacity = capacity - self.fullSlice = 0..> 16) + } +} + +extension _UInt24: Equatable { + static func ==(_ lhs: _UInt24, _ rhs: _UInt24) -> Bool { + return lhs.b12 == rhs.b12 && lhs.b3 == rhs.b3 + } +} + +// MARK: _UInt56 + +/// A 56-bit unsigned integer value type. 
+struct _UInt56: ExpressibleByIntegerLiteral { + typealias IntegerLiteralType = UInt32 + + @_versioned var b1234: UInt32 + @_versioned var b56: UInt16 + @_versioned var b7: UInt8 + + private init(b1234: UInt32, b56: UInt16, b7: UInt8) { + self.b1234 = b1234 + self.b56 = b56 + self.b7 = b7 + } + + init(integerLiteral value: UInt32) { + self.init(b1234: value, b56: 0, b7: 0) + } + + static let bitWidth: Int = 56 + + static var max: _UInt56 { + return .init(b1234: .max, b56: .max, b7: .max) + } + + static let min: _UInt56 = 0 +} + +extension _UInt56 { + init(_ value: UInt64) { + assert(value & 0xff_00_00_00_00_00_00_00 == 0, "value \(value) too large for _UInt56") + self.init(b1234: UInt32(truncatingIfNeeded: (value & 0xff_ff_ff_ff) >> 0 ), + b56: UInt16(truncatingIfNeeded: (value & 0xff_ff_00_00_00_00) >> 32), + b7: UInt8( value >> 48)) + } + + init(_ value: Int) { + self.init(UInt64(value)) + } +} + +extension UInt64 { + init(_ value: _UInt56) { + var newValue: UInt64 = 0 + newValue = UInt64(value.b1234) + newValue |= UInt64(value.b56 ) << 32 + newValue |= UInt64(value.b7 ) << 48 + self = newValue + } +} + +extension Int { + init(_ value: _UInt56) { + self = Int(UInt64(value)) + } +} + +extension _UInt56: Equatable { + static func ==(_ lhs: _UInt56, _ rhs: _UInt56) -> Bool { + return lhs.b1234 == rhs.b1234 && lhs.b56 == rhs.b56 && lhs.b7 == rhs.b7 + } +} diff --git a/Sources/NIO/NIOAny.swift b/Sources/NIO/NIOAny.swift index 75c9771139..d042c79e4d 100644 --- a/Sources/NIO/NIOAny.swift +++ b/Sources/NIO/NIOAny.swift @@ -49,15 +49,18 @@ public struct NIOAny { /// Wrap a value in a `NIOAny`. In most cases you should not create a `NIOAny` directly using this constructor. /// The abstraction that accepts values of type `NIOAny` must also provide a mechanism to do the wrapping. An /// example is a `ChannelInboundHandler` which provides `self.wrapInboundOut(aValueOfTypeInboundOut)`. 
+ @_inlineable public init(_ value: T) { self._storage = _NIOAny(value) } + @_versioned enum _NIOAny { case ioData(IOData) case bufferEnvelope(AddressedEnvelope) case other(Any) + @_inlineable @_versioned init(_ value: T) { switch value { case let value as ByteBuffer: diff --git a/Sources/NIO/TypeAssistedChannelHandler.swift b/Sources/NIO/TypeAssistedChannelHandler.swift index e667e02018..2c970c3c1e 100644 --- a/Sources/NIO/TypeAssistedChannelHandler.swift +++ b/Sources/NIO/TypeAssistedChannelHandler.swift @@ -20,11 +20,13 @@ public protocol _EmittingChannelHandler { associatedtype OutboundOut = Never /// Wrap the provided `OutboundOut` that will be passed to the next `ChannelOutboundHandler` by calling `ChannelHandlerContext.write`. + @_inlineable func wrapOutboundOut(_ value: OutboundOut) -> NIOAny } /// Default implementations for `_EmittingChannelHandler`. extension _EmittingChannelHandler { + @_inlineable public func wrapOutboundOut(_ value: OutboundOut) -> NIOAny { return NIOAny(value) } @@ -41,18 +43,22 @@ public protocol ChannelInboundHandler: _ChannelInboundHandler, _EmittingChannelH associatedtype InboundOut = Never /// Unwrap the provided `NIOAny` that was passed to `channelRead`. + @_inlineable func unwrapInboundIn(_ value: NIOAny) -> InboundIn /// Wrap the provided `InboundOut` that will be passed to the next `ChannelInboundHandler` by calling `ChannelHandlerContext.fireChannelRead`. + @_inlineable func wrapInboundOut(_ value: InboundOut) -> NIOAny } /// Default implementations for `ChannelInboundHandler`. extension ChannelInboundHandler { + @_inlineable public func unwrapInboundIn(_ value: NIOAny) -> InboundIn { return value.forceAs() } + @_inlineable public func wrapInboundOut(_ value: InboundOut) -> NIOAny { return NIOAny(value) } @@ -66,11 +72,13 @@ public protocol ChannelOutboundHandler: _ChannelOutboundHandler, _EmittingChanne associatedtype OutboundIn /// Unwrap the provided `NIOAny` that was passed to `write`. 
+ @_inlineable func unwrapOutboundIn(_ value: NIOAny) -> OutboundIn } /// Default implementations for `ChannelOutboundHandler`. extension ChannelOutboundHandler { + @_inlineable public func unwrapOutboundIn(_ value: NIOAny) -> OutboundIn { return value.forceAs() } diff --git a/Tests/NIOTests/ByteBufferTest+XCTest.swift b/Tests/NIOTests/ByteBufferTest+XCTest.swift index d4bb52e611..f694c71acc 100644 --- a/Tests/NIOTests/ByteBufferTest+XCTest.swift +++ b/Tests/NIOTests/ByteBufferTest+XCTest.swift @@ -110,6 +110,9 @@ extension ByteBufferTest { ("testUnderestimatingSequenceWorks", testUnderestimatingSequenceWorks), ("testZeroSizeByteBufferResizes", testZeroSizeByteBufferResizes), ("testSpecifyTypesAndEndiannessForIntegerMethods", testSpecifyTypesAndEndiannessForIntegerMethods), + ("testByteBufferFitsInACoupleOfEnums", testByteBufferFitsInACoupleOfEnums), + ("testLargeSliceBegin16MBIsOkayAndDoesNotCopy", testLargeSliceBegin16MBIsOkayAndDoesNotCopy), + ("testLargeSliceBeginMoreThan16MBIsOkay", testLargeSliceBeginMoreThan16MBIsOkay), ] } } diff --git a/Tests/NIOTests/ByteBufferTest.swift b/Tests/NIOTests/ByteBufferTest.swift index 26815974b3..4453fa8050 100644 --- a/Tests/NIOTests/ByteBufferTest.swift +++ b/Tests/NIOTests/ByteBufferTest.swift @@ -1214,6 +1214,108 @@ class ByteBufferTest: XCTestCase { self.buf.set(integer: 0xdeadbeef, at: 0, endianness: .little, as: UInt64.self) XCTAssertEqual(0xdeadbeef, self.buf.getInteger(at: 0, endianness: .little, as: UInt64.self)) } + + func testByteBufferFitsInACoupleOfEnums() throws { + enum Level4 { + case case1(ByteBuffer) + case case2(ByteBuffer) + case case3(ByteBuffer) + case case4(ByteBuffer) + } + enum Level3 { + case case1(Level4) + case case2(Level4) + case case3(Level4) + case case4(Level4) + } + enum Level2 { + case case1(Level3) + case case2(Level3) + case case3(Level3) + case case4(Level3) + } + enum Level1 { + case case1(Level2) + case case2(Level2) + case case3(Level2) + case case4(Level2) + } + + 
XCTAssertLessThanOrEqual(MemoryLayout.size, 23) + XCTAssertLessThanOrEqual(MemoryLayout.size, 24) + + XCTAssertLessThanOrEqual(MemoryLayout.size(ofValue: Level1.case1(.case2(.case3(.case4(self.buf))))), 24) + XCTAssertLessThanOrEqual(MemoryLayout.size(ofValue: Level1.case1(.case3(.case4(.case1(self.buf))))), 24) + } + + func testLargeSliceBegin16MBIsOkayAndDoesNotCopy() throws { + var fourMBBuf = self.allocator.buffer(capacity: 4 * 1024 * 1024) + fourMBBuf.write(bytes: repeatElement(0xff, count: fourMBBuf.capacity)) + let totalBufferSize = 5 * fourMBBuf.readableBytes + XCTAssertEqual(4 * 1024 * 1024, fourMBBuf.readableBytes) + var buf = self.allocator.buffer(capacity: totalBufferSize) + for _ in 0..<5 { + var fresh = fourMBBuf + buf.write(buffer: &fresh) + } + + let offset = Int(_UInt24.max) + + // mark some special bytes + buf.set(integer: 0xaa, at: 0, as: UInt8.self) + buf.set(integer: 0xbb, at: offset - 1, as: UInt8.self) + buf.set(integer: 0xcc, at: offset, as: UInt8.self) + buf.set(integer: 0xdd, at: buf.writerIndex - 1, as: UInt8.self) + + XCTAssertEqual(totalBufferSize, buf.readableBytes) + + let oldPtrVal = buf.withUnsafeReadableBytes { + UInt(bitPattern: $0.baseAddress!.advanced(by: offset)) + } + + let expectedReadableBytes = totalBufferSize - offset + let slice = buf.getSlice(at: offset, length: expectedReadableBytes)! + XCTAssertEqual(expectedReadableBytes, slice.readableBytes) + let newPtrVal = slice.withUnsafeReadableBytes { + UInt(bitPattern: $0.baseAddress!) 
+ } + XCTAssertEqual(oldPtrVal, newPtrVal) + + XCTAssertEqual(0xcc, slice.getInteger(at: 0, as: UInt8.self)) + XCTAssertEqual(0xdd, slice.getInteger(at: slice.writerIndex - 1, as: UInt8.self)) + } + + func testLargeSliceBeginMoreThan16MBIsOkay() throws { + var fourMBBuf = self.allocator.buffer(capacity: 4 * 1024 * 1024) + fourMBBuf.write(bytes: repeatElement(0xff, count: fourMBBuf.capacity)) + let totalBufferSize = 5 * fourMBBuf.readableBytes + 1 + XCTAssertEqual(4 * 1024 * 1024, fourMBBuf.readableBytes) + var buf = self.allocator.buffer(capacity: totalBufferSize) + for _ in 0..<5 { + var fresh = fourMBBuf + buf.write(buffer: &fresh) + } + + let offset = Int(_UInt24.max) + 1 + + // mark some special bytes + buf.set(integer: 0xaa, at: 0, as: UInt8.self) + buf.set(integer: 0xbb, at: offset - 1, as: UInt8.self) + buf.set(integer: 0xcc, at: offset, as: UInt8.self) + buf.write(integer: 0xdd, as: UInt8.self) // write extra byte so the slice is the same length as above + XCTAssertEqual(totalBufferSize, buf.readableBytes) + + let expectedReadableBytes = totalBufferSize - offset + let slice = buf.getSlice(at: offset, length: expectedReadableBytes)! 
+ XCTAssertEqual(expectedReadableBytes, slice.readableBytes) + XCTAssertEqual(0, slice.readerIndex) + XCTAssertEqual(expectedReadableBytes, slice.writerIndex) + XCTAssertEqual(Int(UInt32(expectedReadableBytes).nextPowerOf2()), slice.capacity) + + XCTAssertEqual(0xcc, slice.getInteger(at: 0, as: UInt8.self)) + XCTAssertEqual(0xdd, slice.getInteger(at: slice.writerIndex - 1, as: UInt8.self)) + } + } private enum AllocationExpectationState: Int { diff --git a/Tests/NIOTests/FileRegionTest+XCTest.swift b/Tests/NIOTests/FileRegionTest+XCTest.swift index fe0e16e96d..fba1c9c6fd 100644 --- a/Tests/NIOTests/FileRegionTest+XCTest.swift +++ b/Tests/NIOTests/FileRegionTest+XCTest.swift @@ -32,6 +32,9 @@ extension FileRegionTest { ("testWholeFileFileRegion", testWholeFileFileRegion), ("testWholeEmptyFileFileRegion", testWholeEmptyFileFileRegion), ("testFileRegionDuplicatesShareSeekPointer", testFileRegionDuplicatesShareSeekPointer), + ("testMassiveFileRegionThatJustAboutWorks", testMassiveFileRegionThatJustAboutWorks), + ("testMassiveFileRegionReaderIndexWorks", testMassiveFileRegionReaderIndexWorks), + ("testFileRegionAndIODataFitsInACoupleOfEnums", testFileRegionAndIODataFitsInACoupleOfEnums), ] } } diff --git a/Tests/NIOTests/FileRegionTest.swift b/Tests/NIOTests/FileRegionTest.swift index b5fcbbbfc2..45718e950d 100644 --- a/Tests/NIOTests/FileRegionTest.swift +++ b/Tests/NIOTests/FileRegionTest.swift @@ -213,4 +213,63 @@ class FileRegionTest : XCTestCase { } } } + + func testMassiveFileRegionThatJustAboutWorks() { + withTemporaryFile(content: "0123456789") { fh, path in + // just in case someone uses 32bit platforms + let readerIndex = UInt64(_UInt56.max) < UInt64(Int.max) ? 
Int(_UInt56.max) : Int.max + let fr = FileRegion(fileHandle: fh, readerIndex: readerIndex, endIndex: Int.max) + XCTAssertEqual(readerIndex, fr.readerIndex) + XCTAssertEqual(Int.max, fr.endIndex) + } + } + + func testMassiveFileRegionReaderIndexWorks() { + withTemporaryFile(content: "0123456789") { fh, path in + // just in case someone uses 32bit platforms + let readerIndex = (UInt64(_UInt56.max) < UInt64(Int.max) ? Int(_UInt56.max) : Int.max) - 1000 + var fr = FileRegion(fileHandle: fh, readerIndex: readerIndex, endIndex: Int.max) + for i in 0..<1000 { + XCTAssertEqual(readerIndex + i, fr.readerIndex) + XCTAssertEqual(Int.max, fr.endIndex) + fr.moveReaderIndex(forwardBy: 1) + } + } + } + + func testFileRegionAndIODataFitsInACoupleOfEnums() throws { + enum Level4 { + case case1(FileRegion) + case case2(FileRegion) + case case3(IOData) + case case4(IOData) + } + enum Level3 { + case case1(Level4) + case case2(Level4) + case case3(Level4) + case case4(Level4) + } + enum Level2 { + case case1(Level3) + case case2(Level3) + case case3(Level3) + case case4(Level3) + } + enum Level1 { + case case1(Level2) + case case2(Level2) + case case3(Level2) + case case4(Level2) + } + + XCTAssertLessThanOrEqual(MemoryLayout.size, 23) + XCTAssertLessThanOrEqual(MemoryLayout.size, 24) + + XCTAssertNoThrow(try withTemporaryFile(content: "0123456789") { fh, path in + let fr = try FileRegion(fileHandle: fh) + XCTAssertLessThanOrEqual(MemoryLayout.size(ofValue: Level1.case1(.case2(.case3(.case4(.fileRegion(fr)))))), 24) + XCTAssertLessThanOrEqual(MemoryLayout.size(ofValue: Level1.case1(.case3(.case4(.case1(fr))))), 24) + }) + } }