-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 70735a9
Showing
2 changed files
with
447 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
// Package difff calculates the differences of document trees consisting of the | ||
// standard go types created by unmarshaling from JSON, consisting of two | ||
// complex types: | ||
// * map[string]interface{} | ||
// * []interface{} | ||
// and five scalar types: | ||
// * string, int, float64, bool, nil | ||
// | ||
// difff is based off an algorithm designed for diffing XML documents outlined in: | ||
// Detecting Changes in XML Documents by Grégory Cobéna & Amélie Marian | ||
// | ||
// The paper describes an algorithm for generating an edit script that transitions | ||
// between two states of tree-type data structures (XML). The general | ||
// approach is as follows: For two given tree states, generate a diff script | ||
// as a set of Deltas in 6 steps: | ||
// | ||
// 1. register in a map a unique signature (hash value) for every | ||
// subtree of the d1 (old) document | ||
// 2. consider every subtree in d2 document, starting from the | ||
// largest. check if it is identitical to some the subtrees in | ||
// d1, if so match both subtrees. | ||
// 3. attempt to match the parents of two matched subtrees | ||
// by checking labels (in our case, types of parent object or array) | ||
// controlling for bad matches based on length of path to the | ||
// ancestor and the weight of the matching subtrees. eg: a large | ||
// subtree may force the matching of its ancestors up to the root | ||
// a small subtree may not even force matching of its parent | ||
// 4. Consider the largest subtrees of d2 in order. If one candidate | ||
// has it's parent already matched to the parent of the considered | ||
// node, it is certianly the best candidate. | ||
// 5. At this point we might have matched all of d2. A node may not | ||
// match b/c its been inserted, or we missed matching it. We can now | ||
// do peephole optimization pass to retry some of the rejected nodes | ||
// once no more matchings can be obtained, unmatched nodes in d2 | ||
// correspond to inserted nodes. | ||
// 6. consider each matching node and decide if the node is at its right | ||
// place, or whether it has been moved. | ||
package difff | ||
|
||
import ( | ||
"bytes" | ||
"encoding/hex" | ||
"fmt" | ||
"hash" | ||
"hash/fnv" | ||
"sort" | ||
"strconv" | ||
"sync" | ||
) | ||
|
||
// Diff computes a slice of deltas that define an edit script for turning the | ||
// value at d1 into d2 | ||
func Diff(d1, d2 interface{}) []Delta { | ||
var ( | ||
wg sync.WaitGroup | ||
t1, t2 Node | ||
) | ||
wg.Add(2) | ||
|
||
go func() { | ||
t1 = tree(d1, "", nil) | ||
wg.Done() | ||
}() | ||
go func() { | ||
t2 = tree(d2, "", nil) | ||
wg.Done() | ||
}() | ||
|
||
wg.Wait() | ||
|
||
fmt.Println(hex.EncodeToString(t1.Hash()), t1.Weight()) | ||
fmt.Println(hex.EncodeToString(t2.Hash()), t2.Weight()) | ||
return nil | ||
} | ||
|
||
// DeltaType defines the types of changes xydiff can create | ||
// to describe the difference between two documents | ||
type DeltaType uint8 | ||
|
||
const ( | ||
// DTUnknown defaults DeltaType to undefined behaviour | ||
DTUnknown DeltaType = iota | ||
// DTRemove means making the children of a node | ||
// become the children of a node's parent | ||
DTRemove | ||
// DTInsert is the compliment of deleting, adding | ||
// children of a parent node to a new node, and making | ||
// that node a child of the original parent | ||
DTInsert | ||
// DTMove is the succession of a deletion & insertion | ||
// of the same node | ||
DTMove | ||
// DTChange is an alteration of a scalar data type (string, bool, float, etc) | ||
DTChange | ||
) | ||
|
||
// Delta represents a change between two documents | ||
type Delta struct { | ||
Type DeltaType | ||
|
||
SrcPath []string | ||
DstPath []string | ||
|
||
SrcVal interface{} | ||
DstVal interface{} | ||
} | ||
|
||
// NewHash returns a new hash interface, wrapped in a function for easy | ||
// hash algorithm switching, package consumers can override NewHash | ||
// with their own desired hash.Hash implementation if the value space is | ||
// particularly large. default is 32-bit FNV 1 for fast, cheap hashing | ||
var NewHash = func() hash.Hash { | ||
return fnv.New32() | ||
} | ||
|
||
// NodeType defines all of the atoms in our universe | ||
type NodeType uint8 | ||
|
||
const ( | ||
// NTUnknown defines a type outside our universe, should never be encountered | ||
NTUnknown NodeType = iota | ||
// NTObject is a dictionary of key / value pairs | ||
NTObject | ||
NTArray | ||
NTString | ||
NTFloat | ||
NTInt | ||
NTBool | ||
NTNull | ||
) | ||
|
||
type Node interface { | ||
Type() NodeType | ||
Hash() []byte | ||
Weight() int | ||
Name() string | ||
} | ||
|
||
type compound struct { | ||
t NodeType | ||
name string | ||
hash []byte | ||
parent Node | ||
children []Node | ||
weight int | ||
} | ||
|
||
func (c compound) Type() NodeType { return c.t } | ||
func (c compound) Name() string { return c.name } | ||
func (c compound) Hash() []byte { return c.hash } | ||
func (c compound) Weight() int { return c.weight } | ||
|
||
type scalar struct { | ||
t NodeType | ||
name string | ||
hash []byte | ||
parent Node | ||
} | ||
|
||
func (s scalar) Type() NodeType { return s.t } | ||
func (s scalar) Name() string { return s.name } | ||
func (s scalar) Hash() []byte { return s.hash } | ||
func (s scalar) Weight() int { return 1 } | ||
|
||
func tree(v interface{}, name string, parent Node) Node { | ||
switch x := v.(type) { | ||
case nil: | ||
return scalar{ | ||
t: NTNull, | ||
name: name, | ||
hash: NewHash().Sum([]byte("null")), | ||
parent: parent, | ||
} | ||
case float64: | ||
fstr := strconv.FormatFloat(x, 'f', -1, 64) | ||
return scalar{ | ||
t: NTFloat, | ||
name: name, | ||
hash: NewHash().Sum([]byte(fstr)), | ||
parent: parent, | ||
} | ||
case string: | ||
return scalar{ | ||
t: NTString, | ||
name: name, | ||
hash: NewHash().Sum([]byte(x)), | ||
parent: parent, | ||
} | ||
case bool: | ||
bstr := "false" | ||
if x { | ||
bstr = "true" | ||
} | ||
return scalar{ | ||
t: NTBool, | ||
name: name, | ||
hash: NewHash().Sum([]byte(bstr)), | ||
} | ||
case []interface{}: | ||
hasher := NewHash() | ||
n := compound{ | ||
t: NTArray, | ||
name: name, | ||
parent: parent, | ||
} | ||
|
||
for i, v := range x { | ||
name := strconv.Itoa(i) | ||
node := tree(v, name, n) | ||
hasher.Write(node.Hash()) | ||
n.children = append(n.children, node) | ||
} | ||
n.hash = hasher.Sum(nil) | ||
|
||
n.weight = 1 | ||
for _, ch := range n.children { | ||
n.weight += ch.Weight() | ||
} | ||
|
||
return n | ||
case map[string]interface{}: | ||
hasher := NewHash() | ||
n := compound{ | ||
t: NTObject, | ||
name: name, | ||
parent: parent, | ||
} | ||
|
||
// gotta sort keys for consistent hashing | ||
names := make([]string, 0, len(x)) | ||
for name := range x { | ||
names = append(names, name) | ||
} | ||
sort.Strings(names) | ||
|
||
for _, name := range names { | ||
node := tree(x[name], name, n) | ||
hasher.Write(node.Hash()) | ||
n.children = append(n.children, node) | ||
} | ||
n.hash = hasher.Sum(nil) | ||
|
||
n.weight = 1 | ||
for _, ch := range n.children { | ||
n.weight += ch.Weight() | ||
} | ||
return n | ||
default: | ||
panic(fmt.Sprintf("unexpected type: %T", v)) | ||
} | ||
} | ||
|
||
// sortAdd inserts n into nodes, keeping the slice sorted by node weight, | ||
// heaviest to the left | ||
func sortAdd(n Node, nodes []Node) { | ||
i := sort.Search(len(nodes), func(i int) bool { return nodes[i].Weight() <= n.Weight() }) | ||
if i < len(nodes) && nodes[i] == n { | ||
fmt.Println(i) | ||
} else { | ||
nodes = append(nodes, nil) | ||
copy(nodes[i+1:], nodes[i:]) | ||
nodes[i] = n | ||
} | ||
} | ||
|
||
// Match connects nodes from different trees | ||
type Match struct { | ||
left, right Node | ||
} | ||
|
||
// ExactMatch checks if two nodes are the same | ||
func ExactMatch(a, b Node) bool { | ||
return bytes.Equal(a.Hash(), b.Hash()) | ||
} |
Oops, something went wrong.