From 2053c8fea0c8c6decfd95128b2126b2c73a9bdf6 Mon Sep 17 00:00:00 2001 From: Eric Patey Date: Sat, 10 Mar 2018 22:11:14 -0500 Subject: [PATCH 1/2] Dramatically optimize algorithm in the common case by excluding matching heads and tails before using LCS. For example, in the case of a single insert, the algorithm changes from O(m*n) to O(m+n). When the arrays contain 1,000 entries, for example, this change reduces the number of comparisons from ~1,000,000 to ~2,000 and the size of the table used by the algorithm from ~1,000,000 to 2. --- Dwifft/Dwifft.swift | 35 ++++++++++++++++++++++++--------- DwifftTests/DwifftTests.swift | 37 +++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/Dwifft/Dwifft.swift b/Dwifft/Dwifft.swift index 188d037..e5d54d2 100644 --- a/Dwifft/Dwifft.swift +++ b/Dwifft/Dwifft.swift @@ -78,6 +78,20 @@ public enum SectionedDiffStep: CustomDebugStringConvertible { /// Namespace for the `diff` and `apply` functions. public enum Dwifft { + internal static func matchingEndsInfo(_ lhs: [Value], _ rhs: [Value]) -> (Int, ArraySlice, ArraySlice) { + let minTotalCount = min(lhs.count, rhs.count) + let matchingHeadCount = zip(lhs, rhs).prefix() { $0.0 == $0.1 }.map() { $0.0 }.count + let matchingTailCount = matchingHeadCount == minTotalCount + ? 0 // if the matching head consumed all of either of the arrays, there's no tail + : zip(lhs.reversed(), rhs.reversed()).prefix(minTotalCount - matchingHeadCount).prefix() { $0.0 == $0.1 }.reversed().map() { $0.0 }.count + + let matchingEndsCount = matchingHeadCount + matchingTailCount + let lhsMiddle = matchingEndsCount < lhs.count ? 
lhs[matchingHeadCount..(_ lhs: [Value], _ rhs: [Value]) -> [DiffStep] { + let (matchingHeadCount, lhs, rhs) = matchingEndsInfo(lhs, rhs) + if lhs.isEmpty { - return rhs.enumerated().map(DiffStep.insert) + return rhs.enumerated().map { DiffStep.insert($0 + matchingHeadCount, $1) } } else if rhs.isEmpty { - return lhs.enumerated().map(DiffStep.delete).reversed() + return lhs.enumerated().map { DiffStep.delete($0 + matchingHeadCount, $1) }.reversed() } let table = MemoizedSequenceComparison.buildTable(lhs, rhs) @@ -241,8 +257,8 @@ public enum Dwifft { private static func diffInternal( _ table: [[Int]], - _ x: [Value], - _ y: [Value], + _ x: ArraySlice, + _ y: ArraySlice, _ i: Int, _ j: Int, _ currentResults: ([DiffStep], [DiffStep]) @@ -252,18 +268,19 @@ public enum Dwifft { } else { return .call { + let prefixCount = x.startIndex var nextResults = currentResults if i == 0 { - nextResults.0 = [DiffStep.insert(j-1, y[j-1])] + nextResults.0 + nextResults.0 = [DiffStep.insert(j-1+prefixCount, y[j-1+prefixCount])] + nextResults.0 return diffInternal(table, x, y, i, j-1, nextResults) } else if j == 0 { - nextResults.1 = nextResults.1 + [DiffStep.delete(i-1, x[i-1])] + nextResults.1 = nextResults.1 + [DiffStep.delete(i-1+prefixCount, x[i-1+prefixCount])] return diffInternal(table, x, y, i - 1, j, nextResults) } else if table[i][j] == table[i][j-1] { - nextResults.0 = [DiffStep.insert(j-1, y[j-1])] + nextResults.0 + nextResults.0 = [DiffStep.insert(j-1+prefixCount, y[j-1+prefixCount])] + nextResults.0 return diffInternal(table, x, y, i, j-1, nextResults) } else if table[i][j] == table[i-1][j] { - nextResults.1 = nextResults.1 + [DiffStep.delete(i-1, x[i-1])] + nextResults.1 = nextResults.1 + [DiffStep.delete(i-1+prefixCount, x[i-1+prefixCount])] return diffInternal(table, x, y, i - 1, j, nextResults) } else { return diffInternal(table, x, y, i-1, j-1, nextResults) @@ -279,7 +296,7 @@ fileprivate enum Result{ } fileprivate struct MemoizedSequenceComparison { - static func 
buildTable(_ x: [T], _ y: [T]) -> [[Int]] { + static func buildTable(_ x: ArraySlice, _ y: ArraySlice) -> [[Int]] { let n = x.count, m = y.count var table = Array(repeating: Array(repeating: 0, count: m + 1), count: n + 1) // using unsafe pointers lets us avoid swift array bounds-checking, which results in a considerable speed boost. diff --git a/DwifftTests/DwifftTests.swift b/DwifftTests/DwifftTests.swift index c758332..ea8e7fa 100644 --- a/DwifftTests/DwifftTests.swift +++ b/DwifftTests/DwifftTests.swift @@ -25,8 +25,44 @@ struct SectionedValuesWrapper: Arbitrary { } } + +// Generator that makes arrays from 0-20 elements long where each element has the value 0...9 +let smallishArrayGen = Gen.fromElements(in: 0...20).flatMap(Gen.fromElements(in: 0...9).proliferate(withSize:)) + +struct smallishArrayWrapper: Arbitrary { + let getArray: [Int] + + static var arbitrary: Gen { + return smallishArrayGen.map {smallishArrayWrapper(getArray:$0)} + } + + public static func shrink(_ x: smallishArrayWrapper) -> [smallishArrayWrapper] { + return Array.shrink(x.getArray).map {smallishArrayWrapper(getArray:$0)} + } +} + class DwifftSwiftCheckTests: XCTestCase { + func testMatchingEndsInfo() { + property("Confirming matching ends info is self consistent", arguments: CheckerArguments(maxAllowableSuccessfulTests: 5000)) <- forAll { (lhs : smallishArrayWrapper, rhs : smallishArrayWrapper) in + let lhs = lhs.getArray, rhs = rhs.getArray + + let (matchingHeadCount, lhsInner, rhsInner) = Dwifft.matchingEndsInfo(lhs, rhs) + + // Generate slices for the head and tail based on the results from matchingEndsInfo + let matchingHead = matchingHeadCount > 0 ? lhs[0..() + let matchingTailCount = lhs.count - matchingHeadCount + lhsInner.count + let matchingTail = matchingTailCount > 0 ? lhs[(matchingHeadCount + lhsInner.count)...] 
: ArraySlice() + + // Now reconstruct the input arrays using the matching head, innards, and tail + let reconstructedLhs = Array(matchingHead + lhsInner + matchingTail) + let reconstructedRhs = Array(matchingHead + rhsInner + matchingTail) + + return (reconstructedLhs == lhs) "Left identity" + ^&&^ + (reconstructedRhs == rhs) "Right identity" + } + } func testDiff() { property("Diffing two arrays, then applying the diff to the first, yields the second") <- forAll { (a1 : ArrayOf, a2 : ArrayOf) in let diff = Dwifft.diff(a1.getArray, a2.getArray) @@ -115,6 +151,7 @@ class DwifftTests: XCTestCase { TestCase("1234", "1224533324", "+2@2+4@3+5@4+3@6+3@7+2@8"), TestCase("thisisatest", "testing123testing", "-a@6-s@5-i@2-h@1+e@1+t@3+n@5+g@6+1@7+2@8+3@9+i@14+n@15+g@16"), TestCase("HUMAN", "CHIMPANZEE", "-U@1+C@0+I@2+P@4+Z@7+E@8+E@9"), + TestCase("1211", "11", "-1@2-2@1"), // Needed to verify matchingEndsInfo bug where tail match size was overstated ] for test in tests { From 82ee099abdb2e9f166e50bd94227373d1c712a5d Mon Sep 17 00:00:00 2001 From: Eric Patey Date: Tue, 20 Mar 2018 10:46:06 -0400 Subject: [PATCH 2/2] Keep sequences lazy for performance of matchingEndsInfo. --- Dwifft/Dwifft.swift | 17 ++++++++++++++--- DwifftTests/DwifftTests.swift | 11 +++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/Dwifft/Dwifft.swift b/Dwifft/Dwifft.swift index e5d54d2..aa0ff46 100644 --- a/Dwifft/Dwifft.swift +++ b/Dwifft/Dwifft.swift @@ -75,16 +75,27 @@ public enum SectionedDiffStep: CustomDebugStringConvertible { } } +// Need to be able to count a sequence without materializing as an array in order to keep matchingEndsInfo below as fast as possible +private extension Sequence { + func count() -> Int { + var i = 0 + for _ in self { + i += 1 + } + return i + } +} + /// Namespace for the `diff` and `apply` functions. 
public enum Dwifft { internal static func matchingEndsInfo(_ lhs: [Value], _ rhs: [Value]) -> (Int, ArraySlice, ArraySlice) { let minTotalCount = min(lhs.count, rhs.count) - let matchingHeadCount = zip(lhs, rhs).prefix() { $0.0 == $0.1 }.map() { $0.0 }.count + let matchingHeadCount = zip(lhs, rhs).lazy.prefix() { $0.0 == $0.1 }.count() let matchingTailCount = matchingHeadCount == minTotalCount ? 0 // if the matching head consumed all of either of the arrays, there's no tail - : zip(lhs.reversed(), rhs.reversed()).prefix(minTotalCount - matchingHeadCount).prefix() { $0.0 == $0.1 }.reversed().map() { $0.0 }.count - + : zip(lhs.reversed(), rhs.reversed()).prefix(minTotalCount - matchingHeadCount).lazy.prefix() { $0.0 == $0.1 }.count() + let matchingEndsCount = matchingHeadCount + matchingTailCount let lhsMiddle = matchingEndsCount < lhs.count ? lhs[matchingHeadCount...fromElements(in: 0...9).proliferate(withSize:1000).generate + let rhs = lhs + [666] + lhs + + measure { + for _ in 0...10000 { + let _ = Dwifft.matchingEndsInfo(lhs, rhs) + } + } + } + func testDiffBenchmark() { let a: [Int] = (0...1000).map({ _ in Int(arc4random_uniform(100)) }).filter({ _ in arc4random_uniform(2) == 0}) let b: [Int] = (0...1000).map({ _ in Int(arc4random_uniform(100)) }).filter({ _ in arc4random_uniform(2) == 0})