feat: initial work & failing test

qri-io · Feb 14, 2019 · 70735a9 · 70735a9
commit 70735a9
Show file tree

Hide file tree

Showing 2 changed files with 447 additions and 0 deletions.
diff --git a/difff.go b/difff.go
@@ -0,0 +1,274 @@
+// Package difff calculates the differences of document trees consisting of the
+// standard go types created by unmarshaling from JSON, consisting of two
+// complex types:
+//   * map[string]interface{}
+//   * []interface{}
+// and five scalar types:
+//   * string, int, float64, bool, nil
+//
+// difff is based off an algorithm designed for diffing XML documents outlined in:
+//    Detecting Changes in XML Documents by Grégory Cobéna & Amélie Marian
+//
+// The paper describes an algorithm for generating an edit script that transitions
+// between two states of tree-type data structures (XML). The general
+// approach is as follows: For two given tree states, generate a diff script
+// as a set of Deltas in 6 steps:
+//
+// 1. register in a map a unique signature (hash value) for every
+//    subtree of the d1 (old) document
+// 2. consider every subtree in the d2 document, starting from the
+//    largest. check if it is identical to one of the subtrees in
+//    d1; if so, match both subtrees.
+// 3. attempt to match the parents of two matched subtrees
+//    by checking labels (in our case, types of parent object or array)
+//    controlling for bad matches based on length of path to the
+//    ancestor and the weight of the matching subtrees. eg: a large
+//    subtree may force the matching of its ancestors up to the root
+//    a small subtree may not even force matching of its parent
+// 4. Consider the largest subtrees of d2 in order. If one candidate
+//    has its parent already matched to the parent of the considered
+//    node, it is certainly the best candidate.
+// 5. At this point we might have matched all of d2. A node may not
+//    match b/c it's been inserted, or we missed matching it. We can now
+//    do a peephole optimization pass to retry some of the rejected nodes.
+//    Once no more matchings can be obtained, unmatched nodes in d2
+//    correspond to inserted nodes.
+// 6. consider each matching node and decide if the node is at its right
+//    place, or whether it has been moved.
+package difff
+
+import (
+	"bytes"
+	"encoding/hex"
+	"fmt"
+	"hash"
+	"hash/fnv"
+	"sort"
+	"strconv"
+	"sync"
+)
+
+// Diff computes a slice of deltas that define an edit script for turning the
+// value at d1 into d2.
+//
+// The implementation is not finished: it builds the node tree of each document
+// concurrently, prints the hex-encoded hash and the weight of each root to
+// standard output, and always returns nil.
+func Diff(d1, d2 interface{}) []Delta {
+	var (
+		oldRoot, newRoot Node
+		pending          sync.WaitGroup
+	)
+
+	// build stores the tree for doc in *dst, then signals completion
+	build := func(dst *Node, doc interface{}) {
+		*dst = tree(doc, "", nil)
+		pending.Done()
+	}
+
+	pending.Add(2)
+	go build(&oldRoot, d1)
+	go build(&newRoot, d2)
+	// both roots are only read after Wait returns
+	pending.Wait()
+
+	for _, root := range []Node{oldRoot, newRoot} {
+		fmt.Println(hex.EncodeToString(root.Hash()), root.Weight())
+	}
+	return nil
+}
+
+// DeltaType enumerates the kinds of change a Delta can use to describe the
+// difference between two documents
+type DeltaType uint8
+
+const (
+	// DTUnknown is the zero value; its behaviour is undefined
+	DTUnknown = DeltaType(iota)
+	// DTRemove removes a node: the children of the node become the
+	// children of the node's parent
+	DTRemove
+	// DTInsert is the complement of DTRemove: children of a parent node
+	// are added to a new node, and that new node becomes a child of the
+	// original parent
+	DTInsert
+	// DTMove is a removal followed by an insertion of the same node
+	DTMove
+	// DTChange alters the value of a scalar data type (string, bool, float, etc)
+	DTChange
+)
+
+// Delta describes a single change between two documents
+type Delta struct {
+	// Type is the kind of change being described
+	Type DeltaType
+
+	// SrcPath and DstPath are the path elements on the source and
+	// destination side of the change.
+	// NOTE(review): presumably locations within the old and new document
+	// respectively; nothing in this file populates them yet - confirm
+	SrcPath, DstPath []string
+
+	// SrcVal and DstVal are the values on the source and destination side
+	// of the change.
+	// NOTE(review): presumably the values found at SrcPath and DstPath - confirm
+	SrcVal, DstVal interface{}
+}
+
+// NewHash is the constructor used for every hash this package computes. It is
+// a package-level variable so the algorithm is easy to switch: consumers whose
+// value space is particularly large can assign their own function returning
+// another hash.Hash implementation. The default is 32-bit FNV-1, chosen for
+// fast, cheap hashing
+var NewHash = func() (h hash.Hash) {
+	h = fnv.New32()
+	return h
+}
+
+// NodeType enumerates every kind of value a Node can represent
+type NodeType uint8
+
+const (
+	// NTUnknown is the zero value: a type outside the supported set, which
+	// should never be encountered
+	NTUnknown = NodeType(iota)
+	// NTObject is a dictionary of key / value pairs
+	NTObject
+	// NTArray is an ordered list of values
+	NTArray
+	// NTString is a string scalar
+	NTString
+	// NTFloat is a floating point number scalar
+	NTFloat
+	// NTInt is an integer scalar
+	NTInt
+	// NTBool is a boolean scalar
+	NTBool
+	// NTNull is the null value
+	NTNull
+)
+
+// Node is a single element of a document tree. Both the compound nodes
+// (objects, arrays) and the scalar nodes of this package implement it
+type Node interface {
+	// Name is the name the node was created with
+	Name() string
+	// Type is the kind of value the node holds
+	Type() NodeType
+	// Hash is the signature calculated for the node
+	Hash() []byte
+	// Weight is the size of the subtree rooted at the node; a scalar
+	// always weighs 1
+	Weight() int
+}
+
+// compound is the Node implementation for values that contain other values.
+// tree creates one for every []interface{} and map[string]interface{}
+type compound struct {
+	// t is NTArray or NTObject when set by tree
+	t NodeType
+	// name is the name this node was created with
+	name string
+	// hash is calculated by tree from the hashes of the children
+	hash []byte
+
+	parent   Node
+	children []Node
+	// weight is set by tree to 1 plus the weight of every child
+	weight int
+}
+
+// Type gives the kind of container this node is
+func (c compound) Type() NodeType {
+	return c.t
+}
+
+// Name gives the name this node was created with
+func (c compound) Name() string {
+	return c.name
+}
+
+// Hash gives the stored signature of this subtree
+func (c compound) Hash() []byte {
+	return c.hash
+}
+
+// Weight gives the stored size of this subtree
+func (c compound) Weight() int {
+	return c.weight
+}
+
+// scalar is the Node implementation for leaf values. tree creates one for
+// every nil, float64, string and bool it encounters
+type scalar struct {
+	// t is the kind of leaf value
+	t NodeType
+	// name is the name this node was created with
+	name string
+	// hash is calculated by tree from a text form of the value
+	hash []byte
+
+	parent Node
+}
+
+// Type gives the kind of leaf value this node holds
+func (s scalar) Type() NodeType {
+	return s.t
+}
+
+// Name gives the name this node was created with
+func (s scalar) Name() string {
+	return s.name
+}
+
+// Hash gives the stored signature of the value
+func (s scalar) Hash() []byte {
+	return s.hash
+}
+
+// Weight is always 1: a scalar is a subtree of exactly one node
+func (s scalar) Weight() int {
+	return 1
+}
+
+// tree recursively converts v, a value of the kind created by unmarshaling
+// JSON into an interface{}, into a tree of Nodes. name is the key (for members
+// of an object) or the decimal index (for elements of an array) of v within
+// parent. Callers building a root pass an empty name and a nil parent.
+//
+// Every hash covers the type of the node as well as its content, so values of
+// different types hash differently even when their text forms agree (eg: nil
+// and the string "null"). The hash of an array covers the hash of each element
+// in order. The hash of an object covers the key and hash of each member, with
+// keys visited in sorted order so hashing is deterministic. A node's own name
+// is not part of its own hash.
+//
+// The weight of a compound node is 1 plus the weight of all of its children.
+//
+// Nodes are returned as pointers, so that children refer to their finished
+// parent instead of a copy taken before it was populated, and so that Node
+// values can be compared with ==.
+//
+// tree panics when v holds a value of any type other than nil, bool, int,
+// float64, string, []interface{} or map[string]interface{}.
+func tree(v interface{}, name string, parent Node) Node {
+	switch x := v.(type) {
+	case nil:
+		return scalarNode(NTNull, name, "null", parent)
+	case float64:
+		return scalarNode(NTFloat, name, strconv.FormatFloat(x, 'f', -1, 64), parent)
+	case int:
+		return scalarNode(NTInt, name, strconv.Itoa(x), parent)
+	case string:
+		return scalarNode(NTString, name, x, parent)
+	case bool:
+		return scalarNode(NTBool, name, strconv.FormatBool(x), parent)
+	case []interface{}:
+		n := &compound{
+			t:        NTArray,
+			name:     name,
+			parent:   parent,
+			children: make([]Node, 0, len(x)),
+			weight:   1,
+		}
+
+		hasher := typedHasher(NTArray)
+		for i, elem := range x {
+			child := tree(elem, strconv.Itoa(i), n)
+			// element order is significant, position is implied by write order
+			hasher.Write(child.Hash())
+			n.children = append(n.children, child)
+			n.weight += child.Weight()
+		}
+		n.hash = hasher.Sum(nil)
+
+		return n
+	case map[string]interface{}:
+		n := &compound{
+			t:        NTObject,
+			name:     name,
+			parent:   parent,
+			children: make([]Node, 0, len(x)),
+			weight:   1,
+		}
+
+		// gotta sort keys for consistent hashing
+		keys := make([]string, 0, len(x))
+		for key := range x {
+			keys = append(keys, key)
+		}
+		sort.Strings(keys)
+
+		hasher := typedHasher(NTObject)
+		for _, key := range keys {
+			child := tree(x[key], key, n)
+			// the key is hashed along with the value, otherwise renaming a
+			// key would leave the hash of the object unchanged
+			hasher.Write([]byte(key))
+			hasher.Write([]byte{0})
+			hasher.Write(child.Hash())
+			n.children = append(n.children, child)
+			n.weight += child.Weight()
+		}
+		n.hash = hasher.Sum(nil)
+
+		return n
+	default:
+		panic(fmt.Sprintf("unexpected type: %T", v))
+	}
+}
+
+// typedHasher returns a new hasher that has already consumed t, so that the
+// type of a node always contributes to its hash
+func typedHasher(t NodeType) hash.Hash {
+	h := NewHash()
+	// hash.Hash documents that Write never returns an error
+	h.Write([]byte{byte(t)})
+	return h
+}
+
+// scalarNode builds a leaf node whose hash covers its type and text, the text
+// form of its value. The text is written into the hasher before calling Sum:
+// Sum(b) only appends the current digest to b, it does not hash b.
+func scalarNode(t NodeType, name, text string, parent Node) Node {
+	h := typedHasher(t)
+	h.Write([]byte(text))
+	return &scalar{
+		t:      t,
+		name:   name,
+		hash:   h.Sum(nil),
+		parent: parent,
+	}
+}
+
+// sortAdd inserts n into nodes, keeping the slice sorted by node weight,
+// heaviest to the left, and returns the updated slice. As with append, callers
+// must use the returned slice: the insertion may allocate a new backing array,
+// and always changes the length. nodes must already be sorted heaviest first.
+//
+// If n is already present in nodes the slice is returned unchanged.
+func sortAdd(n Node, nodes []Node) []Node {
+	weight := n.Weight()
+	// nodes is in descending order, so this is the first position holding a
+	// node that is not heavier than n
+	i := sort.Search(len(nodes), func(j int) bool { return nodes[j].Weight() <= weight })
+
+	// if n is already present, it sits somewhere in the run of nodes that
+	// share its weight
+	for j := i; j < len(nodes) && nodes[j].Weight() == weight; j++ {
+		if sameNode(nodes[j], n) {
+			return nodes
+		}
+	}
+
+	// grow by one, shift the tail right, and drop n into the gap
+	nodes = append(nodes, nil)
+	copy(nodes[i+1:], nodes[i:])
+	nodes[i] = n
+	return nodes
+}
+
+// sameNode reports whether a and b are the identical node. Comparing two
+// interface values panics at runtime when they share a dynamic type that is not
+// comparable (eg: a struct with slice fields); such nodes have no identity to
+// compare, so the panic is recovered and they are reported as different.
+func sameNode(a, b Node) (same bool) {
+	defer func() {
+		if recover() != nil {
+			same = false
+		}
+	}()
+	return a == b
+}
+
+// Match pairs up two nodes that come from different trees
+type Match struct {
+	left  Node
+	right Node
+}
+
+// ExactMatch checks if two nodes are the same, which is the case when their
+// hashes are byte-for-byte equal
+func ExactMatch(a, b Node) bool {
+	left, right := a.Hash(), b.Hash()
+	return bytes.Equal(left, right)
+}