From 2053c8fea0c8c6decfd95128b2126b2c73a9bdf6 Mon Sep 17 00:00:00 2001
From: Eric Patey <eric@patey.com>
Date: Sat, 10 Mar 2018 22:11:14 -0500
Subject: [PATCH 1/2] Dramatically optimize algorithm in the common case by
 excluding matching heads and tails before using LCS. For example, in the case
 of a single insert, the algorithm changes from O(m*n) to O(m+n). When the
 arrays contain 1,000 entries, for example, this change reduces the number of
 comparisons from ~1,000,000 to ~2,000 and the size of the table used by the
 algorithm from ~1,000,000 to 2.

---
 Dwifft/Dwifft.swift           | 35 ++++++++++++++++++++++++---------
 DwifftTests/DwifftTests.swift | 37 +++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 9 deletions(-)
diff --git a/Dwifft/Dwifft.swift b/Dwifft/Dwifft.swift
index 188d037..e5d54d2 100644
--- a/Dwifft/Dwifft.swift
+++ b/Dwifft/Dwifft.swift
@@ -78,6 +78,20 @@ public enum SectionedDiffStep<Section, Value>: CustomDebugStringConvertible {
 /// Namespace for the `diff` and `apply` functions.
 public enum Dwifft {
 
+    internal static func matchingEndsInfo<Value: Equatable>(_ lhs: [Value], _ rhs: [Value]) -> (Int, ArraySlice<Value>, ArraySlice<Value>) {
+        let minTotalCount = min(lhs.count, rhs.count)
+        let matchingHeadCount = zip(lhs, rhs).prefix() { $0.0 == $0.1 }.map() { $0.0 }.count
+        let matchingTailCount = matchingHeadCount == minTotalCount
+            ? 0 // if the matching head consumed all of either of the arrays, there's no tail
+            : zip(lhs.reversed(), rhs.reversed()).prefix(minTotalCount - matchingHeadCount).prefix() { $0.0 == $0.1 }.reversed().map() { $0.0 }.count
+        
+        let matchingEndsCount = matchingHeadCount + matchingTailCount
+        let lhsMiddle = matchingEndsCount < lhs.count ? lhs[matchingHeadCount..<lhs.count - matchingTailCount] : []
+        let rhsMiddle = matchingEndsCount < rhs.count ? rhs[matchingHeadCount..<rhs.count - matchingTailCount] : []
+
+        return (matchingHeadCount, lhsMiddle, rhsMiddle)
+    }
+    
     /// Returns the sequence of `DiffStep`s required to transform one array into another.
     ///
     /// - Parameters:
@@ -85,10 +99,12 @@ public enum Dwifft {
     ///   - rhs: another, uh, array
     /// - Returns: the series of transformations that, when applied to `lhs`, will yield `rhs`.
     public static func diff<Value: Equatable>(_ lhs: [Value], _ rhs: [Value]) -> [DiffStep<Value>] {
+        let (matchingHeadCount, lhs, rhs) = matchingEndsInfo(lhs, rhs)
+
         if lhs.isEmpty {
-            return rhs.enumerated().map(DiffStep.insert)
+            return rhs.enumerated().map { DiffStep.insert($0 + matchingHeadCount, $1) }
         } else if rhs.isEmpty {
-            return lhs.enumerated().map(DiffStep.delete).reversed()
+            return lhs.enumerated().map { DiffStep.delete($0 + matchingHeadCount, $1) }.reversed()
         }
 
         let table = MemoizedSequenceComparison.buildTable(lhs, rhs)
@@ -241,8 +257,8 @@ public enum Dwifft {
 
     private static func diffInternal<Value: Equatable>(
         _ table: [[Int]],
-        _ x: [Value],
-        _ y: [Value],
+        _ x: ArraySlice<Value>,
+        _ y: ArraySlice<Value>,
         _ i: Int,
         _ j: Int,
         _ currentResults: ([DiffStep<Value>], [DiffStep<Value>])
@@ -252,18 +268,19 @@ public enum Dwifft {
         }
         else {
             return .call {
+                let prefixCount = x.startIndex
                 var nextResults = currentResults
                 if i == 0 {
-                    nextResults.0 = [DiffStep.insert(j-1, y[j-1])] + nextResults.0
+                    nextResults.0 = [DiffStep.insert(j-1+prefixCount, y[j-1+prefixCount])] + nextResults.0
                     return diffInternal(table, x, y, i, j-1, nextResults)
                 } else if j == 0 {
-                    nextResults.1 = nextResults.1 + [DiffStep.delete(i-1, x[i-1])]
+                    nextResults.1 = nextResults.1 + [DiffStep.delete(i-1+prefixCount, x[i-1+prefixCount])]
                     return diffInternal(table, x, y, i - 1, j, nextResults)
                 } else if table[i][j] == table[i][j-1] {
-                    nextResults.0 = [DiffStep.insert(j-1, y[j-1])] + nextResults.0
+                    nextResults.0 = [DiffStep.insert(j-1+prefixCount, y[j-1+prefixCount])] + nextResults.0
                     return diffInternal(table, x, y, i, j-1, nextResults)
                 } else if table[i][j] == table[i-1][j] {
-                    nextResults.1 = nextResults.1 + [DiffStep.delete(i-1, x[i-1])]
+                    nextResults.1 = nextResults.1 + [DiffStep.delete(i-1+prefixCount, x[i-1+prefixCount])]
                     return diffInternal(table, x, y, i - 1, j, nextResults)
                 } else {
                     return diffInternal(table, x, y, i-1, j-1, nextResults)
@@ -279,7 +296,7 @@ fileprivate enum Result<T>{
 }
 
 fileprivate struct MemoizedSequenceComparison<T: Equatable> {
-    static func buildTable(_ x: [T], _ y: [T]) -> [[Int]] {
+    static func buildTable(_ x: ArraySlice<T>, _ y: ArraySlice<T>) -> [[Int]] {
         let n = x.count, m = y.count
         var table = Array(repeating: Array(repeating: 0, count: m + 1), count: n + 1)
         // using unsafe pointers lets us avoid swift array bounds-checking, which results in a considerable speed boost.
diff --git a/DwifftTests/DwifftTests.swift b/DwifftTests/DwifftTests.swift
index c758332..ea8e7fa 100644
--- a/DwifftTests/DwifftTests.swift
+++ b/DwifftTests/DwifftTests.swift
@@ -25,8 +25,44 @@ struct SectionedValuesWrapper: Arbitrary {
     }
 }
 
+
+// Generator that makes arrays 0 to 20 elements long, where each element is a value in 0...9
+let smallishArrayGen = Gen<Int>.fromElements(in: 0...20).flatMap(Gen<Int>.fromElements(in: 0...9).proliferate(withSize:))
+
+struct smallishArrayWrapper: Arbitrary {
+    let getArray: [Int]
+    
+    static var arbitrary: Gen<smallishArrayWrapper> {
+        return smallishArrayGen.map {smallishArrayWrapper(getArray:$0)}
+    }
+    
+    public static func shrink(_ x: smallishArrayWrapper) -> [smallishArrayWrapper] {
+        return Array<Int>.shrink(x.getArray).map {smallishArrayWrapper(getArray:$0)}
+    }
+}
+
 class DwifftSwiftCheckTests: XCTestCase {
 
+    func testMatchingEndsInfo() {
+        property("Confirming matching ends info is self consistent", arguments: CheckerArguments(maxAllowableSuccessfulTests: 5000)) <- forAll { (lhs : smallishArrayWrapper, rhs : smallishArrayWrapper) in
+            let lhs = lhs.getArray, rhs = rhs.getArray
+            
+            let (matchingHeadCount, lhsInner, rhsInner) = Dwifft.matchingEndsInfo(lhs, rhs)
+            
+            // Generate slices for the head and tail based on the results from matchingEndsInfo
+            let matchingHead = matchingHeadCount > 0 ? lhs[0..<matchingHeadCount] : ArraySlice<Int>()
+            let matchingTailCount = lhs.count - matchingHeadCount + lhsInner.count
+            let matchingTail = matchingTailCount > 0 ? lhs[(matchingHeadCount + lhsInner.count)...] : ArraySlice<Int>()
+            
+            // Now reconstruct the input arrays using the matching head, innards, and tail
+            let reconstructedLhs = Array(matchingHead + lhsInner + matchingTail)
+            let reconstructedRhs = Array(matchingHead + rhsInner + matchingTail)
+
+            return (reconstructedLhs == lhs) <?> "Left identity"
+                ^&&^
+                (reconstructedRhs == rhs) <?> "Right identity"
+        }
+    }
     func testDiff() {
         property("Diffing two arrays, then applying the diff to the first, yields the second") <- forAll { (a1 : ArrayOf<Int>, a2 : ArrayOf<Int>) in
             let diff = Dwifft.diff(a1.getArray, a2.getArray)
@@ -115,6 +151,7 @@ class DwifftTests: XCTestCase {
             TestCase("1234", "1224533324", "+2@2+4@3+5@4+3@6+3@7+2@8"),
             TestCase("thisisatest", "testing123testing", "-a@6-s@5-i@2-h@1+e@1+t@3+n@5+g@6+1@7+2@8+3@9+i@14+n@15+g@16"),
             TestCase("HUMAN", "CHIMPANZEE", "-U@1+C@0+I@2+P@4+Z@7+E@8+E@9"),
+            TestCase("1211", "11", "-1@2-2@1"), // Needed to verify matchingEndsInfo bug where tail match size was overstated
             ]
 
         for test in tests {

From 82ee099abdb2e9f166e50bd94227373d1c712a5d Mon Sep 17 00:00:00 2001
From: Eric Patey <eric@patey.com>
Date: Tue, 20 Mar 2018 10:46:06 -0400
Subject: [PATCH 2/2] Keep sequences lazy for performance of matchingEndsInfo.

---
 Dwifft/Dwifft.swift           | 17 ++++++++++++++---
 DwifftTests/DwifftTests.swift | 11 +++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/Dwifft/Dwifft.swift b/Dwifft/Dwifft.swift
index e5d54d2..aa0ff46 100644
--- a/Dwifft/Dwifft.swift
+++ b/Dwifft/Dwifft.swift
@@ -75,16 +75,27 @@ public enum SectionedDiffStep<Section, Value>: CustomDebugStringConvertible {
     }
 }
 
+// Need to be able to count a sequence without materializing it as an array in order to keep matchingEndsInfo below as fast as possible
+private extension Sequence {
+    func count() -> Int {
+        var i = 0
+        for _ in self {
+            i += 1
+        }
+        return i
+    }
+}
+
 /// Namespace for the `diff` and `apply` functions.
 public enum Dwifft {
 
     internal static func matchingEndsInfo<Value: Equatable>(_ lhs: [Value], _ rhs: [Value]) -> (Int, ArraySlice<Value>, ArraySlice<Value>) {
         let minTotalCount = min(lhs.count, rhs.count)
-        let matchingHeadCount = zip(lhs, rhs).prefix() { $0.0 == $0.1 }.map() { $0.0 }.count
+        let matchingHeadCount = zip(lhs, rhs).lazy.prefix() { $0.0 == $0.1 }.count()
         let matchingTailCount = matchingHeadCount == minTotalCount
             ? 0 // if the matching head consumed all of either of the arrays, there's no tail
-            : zip(lhs.reversed(), rhs.reversed()).prefix(minTotalCount - matchingHeadCount).prefix() { $0.0 == $0.1 }.reversed().map() { $0.0 }.count
-        
+            : zip(lhs.reversed(), rhs.reversed()).prefix(minTotalCount - matchingHeadCount).lazy.prefix() { $0.0 == $0.1 }.count()
+
         let matchingEndsCount = matchingHeadCount + matchingTailCount
         let lhsMiddle = matchingEndsCount < lhs.count ? lhs[matchingHeadCount..<lhs.count - matchingTailCount] : []
         let rhsMiddle = matchingEndsCount < rhs.count ? rhs[matchingHeadCount..<rhs.count - matchingTailCount] : []
diff --git a/DwifftTests/DwifftTests.swift b/DwifftTests/DwifftTests.swift
index ea8e7fa..bf4e6d1 100644
--- a/DwifftTests/DwifftTests.swift
+++ b/DwifftTests/DwifftTests.swift
@@ -163,6 +163,17 @@ class DwifftTests: XCTestCase {
 
     }
 
+    func testMatchingEndsInfoBenchmark() {
+        let lhs = Gen<Int>.fromElements(in: 0...9).proliferate(withSize:1000).generate
+        let rhs = lhs + [666] + lhs
+        
+        measure {
+            for _ in 0...10000 {
+                let _ = Dwifft.matchingEndsInfo(lhs, rhs)
+            }
+        }
+    }
+    
     func testDiffBenchmark() {
         let a: [Int] = (0...1000).map({ _ in Int(arc4random_uniform(100)) }).filter({ _ in arc4random_uniform(2) == 0})
         let b: [Int] = (0...1000).map({ _ in Int(arc4random_uniform(100)) }).filter({ _ in arc4random_uniform(2) == 0})