timbray · timbray · Jun 6, 2024 · May 31, 2024 · Jun 4, 2024 · Jun 4, 2024
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -2,6 +2,12 @@
 
 ## Basics
 
+Most of this document is concerned with the mechanics of raising issues
+and posting Pull Requests to offer improvements to Quamina. Following
+this, there is a section entitled **Developing** that describes
+technology issues that potential contributors will face
+and tools that might be helpful.
+
 Quamina is hosted in this GitHub repository 
 at `github.com/timbray/quamina` and welcomes 
 contributions.
@@ -12,7 +18,7 @@ This is important because possibly Quamina already
 does what you want, in which case perhaps what’s 
 needed is a documentation fix. Possibly the idea 
 has been raised before but failed to convince Quamina’s
-maintainers. (Doesn’t mean it won’t find favor now;
+maintainers. (Doesn't mean it won’t find favor now;
 times change.)
 
 Assuming there is agreement that a change in Quamina
@@ -27,7 +33,7 @@ The coding style suggested by the Go community is
 used in Quamina. See the
 [style doc](https://github.com/golang/go/wiki/CodeReviewComments) for details.
 
-Try to limit column width to 120 characters for both code and markdown documents
+Try to limit column width to 120 characters for both code and Markdown documents
 such as this one.
 
 ### Format of the Commit Message
@@ -64,7 +70,7 @@ is recommended to break up your commits using distinct prefixes.
 
 ### Signing commits
 
-Commits should be signed (not just the `-s` “signd off on”) with
+Commits should be signed (not just the `-s` “signed off on”) with
 any of the [styles GitHub supports](https://docs.github.com/en/authentication/managing-commit-signature-verification/signing-commits).
 Note that you can use `git config` to arrange that your commits are
 automatically signed with the right key.
@@ -99,3 +105,52 @@ instructions for installing it.
 
 When opening a new issue, try to roughly follow the commit message format
 conventions above.
+
+## Developing
+
+Quamina works by compiling the Patterns together into a Nondeterministic
+Finite Automaton (NFA) which proceeds byte-at-a-time through the UTF-encoded
+fields and values. NFAs are nondeterministic in the sense that a byte value
+may cause multiple transitions to different states.
+
+The general workflow, for some specific pattern type, is to write code to build 
+an automaton that matches that type. Examples are the functions `makeStringFA` in
+`value_matcher.go` and `makeShellStyleAutomaton` in `shell_style.go`. Then, 
+insert calls to the automaton builder in `value_matcher.go`, which is reasonably
+straightforward code.  It takes care of merging new automata with existing ones
+as required.
+
+NFAs can be difficult to build and to debug.  For this reason, code 
+is provided in `prettyprinter.go` which produces human-readable NFA
+representations.
+
+For example, the `makeShellStyleAutomaton` code has `prettyprinter` call-outs to
+label the states and transitions it creates, and the `TestPP` test in 
+`prettyprinter_test.go` uses this.  The pattern being matched is `"x*9"` and 
+the prettyprinter output is:
+
+```
+ 758 [START HERE] '"' → [910 on " at 0]
+ 910 [on " at 0] 'x' → [821 gS at 2]
+ 821 [gS at 2] '9' → [551 gX on 9 at 3] / ★ → [821 gS at 2]
+ 551 [gX on 9 at 3] '"' → [937 on " at 4] / '9' → [551 gX on 9 at 3] / ★ → [821 gS at 2]
+ 937 [on " at 4] '9' → [551 gX on 9 at 3] / 'ℵ' → [820 last step at 5] / ★ → [821 gS at 2]
+ 820 [last step at 5]  [1 transition(s)]
+```
+
+Each line represents one state.
+
+Each step gets a 3-digit number and a text description. The construct `★ →` represents
+a default transition, which occurs in the case that none of the other transitions match. The
+symbol `ℵ` represents the end of the input value.
+
+In this particular NFA, the `makeShellStyleAutomaton` code labels states corresponding to
+the `*` "glob" character with text including `gS` for "glob spin" and states that escape the
+"glob spin" state with `gX` for "glob escape".
+
+Most of the NFA-building code does not exercise the prettyprinter. Normally, you would insert
+such code while debugging a particular builder and remove it after completion. Since the 
+shell-style builder is unusually complex, the prettyprinting code is un-removed in anticipation
+of future issues and progress to full regular-expression NFAs.
+
+
diff --git a/core_matcher.go b/core_matcher.go
@@ -52,6 +52,12 @@ func (m *coreMatcher) fields() *coreFields {
 // addPattern - the patternBytes is a JSON text which must be an object. The X is what the matcher returns to indicate
 // that the provided pattern has been matched. In many applications it might be a string which is the pattern's name.
 func (m *coreMatcher) addPattern(x X, patternJSON string) error {
+	return m.addPatternWithPrinter(x, patternJSON, sharedNullPrinter)
+}
+
+// addPatternWithPrinter can be called from debugging and under-development code to allow viewing pretty-printed
+// NFAs
+func (m *coreMatcher) addPatternWithPrinter(x X, patternJSON string, printer printer) error {
 	patternFields, err := patternFromJSON([]byte(patternJSON))
 	if err != nil {
 		return err
@@ -97,7 +103,7 @@ func (m *coreMatcher) addPattern(x X, patternJSON string) error {
 			case existsFalseType:
 				ns = state.addExists(false, field)
 			default:
-				ns = state.addTransition(field)
+				ns = state.addTransition(field, printer)
 			}
 
 			nextStates = append(nextStates, ns...)

diff --git a/field_matcher.go b/field_matcher.go
@@ -93,7 +93,7 @@ func (m *fieldMatcher) addExists(exists bool, field *patternField) []*fieldMatch
 	return []*fieldMatcher{trans}
 }
 
-func (m *fieldMatcher) addTransition(field *patternField) []*fieldMatcher {
+func (m *fieldMatcher) addTransition(field *patternField, printer printer) []*fieldMatcher {
 	// we build the new updateable state in freshStart so that we can blast it in atomically once computed
 	current := m.fields()
 	freshStart := &fmFields{
@@ -119,7 +119,7 @@ func (m *fieldMatcher) addTransition(field *patternField) []*fieldMatcher {
 	//  cases where this doesn't happen and reduce the number of fieldMatchStates
 	var nextFieldMatchers []*fieldMatcher
 	for _, val := range field.vals {
-		nextFieldMatchers = append(nextFieldMatchers, vm.addTransition(val))
+		nextFieldMatchers = append(nextFieldMatchers, vm.addTransition(val, printer))
 
 		// if the val is a number, let's add a transition on the canonicalized number
 		// TODO: Only do this if asked

diff --git a/nfa.go b/nfa.go
@@ -21,6 +21,10 @@ func traverseOneFAStep(table *smallTable, index int, val []byte, transitions []*
 		return transitions
 	}
 	index++
+	// 1. Note no effort to traverse multiple next-steps in parallel. The traversal compute is tiny and the
+	//    necessary concurrency apparatus would almost certainly outweigh it
+	// 2. TODO: It would probably be better to implement this iteratively rather than recursively.
+	//    The recursion will potentially go as deep as the val argument is long.
 	for _, nextStep := range nextSteps.steps {
 		transitions = append(transitions, nextStep.fieldTransitions...)
 		transitions = traverseOneFAStep(nextStep.table, index, val, transitions)
@@ -101,66 +105,3 @@ func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState) *faS
 
 	return combined
 }
-
-/**************************************/
-/* debugging apparatus from here down */
-/**************************************/
-/*
-func (t *smallTable) dump() string {
-	return dump1(&faState{table: t}, 0, make(map[*smallTable]bool))
-}
-func dump1(fas *faState, indent int, already map[*smallTable]bool) string {
-	t := fas.table
-	s := " " + st2(t) + "\n"
-	for _, step := range t.steps {
-		if step != nil {
-			for _, state := range step.steps {
-				_, ok := already[state.table]
-				if !ok {
-					already[state.table] = true
-					s += dump1(state, indent+1, already)
-				}
-			}
-		}
-	}
-	return s
-}
-func (t *smallTable) shortDump() string {
-	return fmt.Sprintf("%d-%s", t.serial, t.label)
-}
-
-func (n *faNext) String() string {
-	var snames []string
-	for _, step := range n.steps {
-		snames = append(snames, fmt.Sprintf("%d %s", step.table.serial, step.table.label))
-	}
-	return "[" + strings.Join(snames, " · ") + "]"
-}
-
-func stString(t *smallTable) string {
-	var rows []string
-
-	for i := range t.ceilings {
-		c := t.ceilings[i]
-		if i == 0 {
-			c = 0
-		} else {
-			if c != valueTerminator && c != byte(byteCeiling) {
-				c = t.ceilings[i-1]
-			}
-		}
-		var trailer string
-		if i == len(t.ceilings)-1 && c != valueTerminator && c != byte(byteCeiling) {
-			trailer = "…"
-		} else {
-			trailer = ""
-		}
-		if t.steps[i] != nil {
-			rows = append(rows, fmt.Sprintf("%s%s:%s ", branchChar(c), trailer, t.steps[i].String()))
-		} else {
-			rows = append(rows, fmt.Sprintf("%s%s:nil ", branchChar(c), trailer))
-		}
-	}
-	return fmt.Sprintf("s%d [%s] ", t.serial, t.label) + strings.Join(rows, "/ ")
-}
-*/
diff --git a/nfa_test.go b/nfa_test.go
@@ -58,7 +58,7 @@ func TestFocusedMerge(t *testing.T) {
 
 	for _, shellStyle := range shellStyles {
 		str := `"` + shellStyle + `"`
-		automaton, matcher := makeShellStyleAutomaton([]byte(str))
+		automaton, matcher := makeShellStyleAutomaton([]byte(str), &nullPrinter{})
 		automata = append(automata, automaton)
 		matchers = append(matchers, matcher)
 	}
@@ -76,7 +76,7 @@ func TestFocusedMerge(t *testing.T) {
 		s := statsAccum{
 			fmVisited: make(map[*fieldMatcher]bool),
 			vmVisited: make(map[*valueMatcher]bool),
-			stVisited: make(map[any]bool),
+			stVisited: make(map[*smallTable]bool),
 		}
 		faStats(merged, &s)
 		fmt.Println(s.stStats())

diff --git a/prettyprinter.go b/prettyprinter.go
@@ -0,0 +1,160 @@
+package quamina
+
+import (
+	"fmt"
+	"math/rand"
+	"strings"
+)
+
+// printer is an interface used to generate representations of Quamina data structures to facilitate
+// debugging and optimization. It's an interface rather than a type so that a null implementation can
+// be provided for production that should incur very little performance cost.
+type printer interface {
+	labelTable(table *smallTable, label string)
+	printNFA(table *smallTable) string
+	shortPrintNFA(table *smallTable) string
+}
+
+// nullPrinter is what the name says, a do-nothing implementation of the printer interface which ideally
+// should consume close to zero CPU cycles.
+type nullPrinter struct{}
+
+const noPP = "prettyprinting not enabled"
+
+func (*nullPrinter) labelTable(_ *smallTable, _ string) {
+}
+func (*nullPrinter) printNFA(_ *smallTable) string {
+	return noPP
+}
+func (*nullPrinter) shortPrintNFA(_ *smallTable) string {
+	return noPP
+}
+
+var sharedNullPrinter = &nullPrinter{}
+
+// prettyPrinter makes a human-readable representation of a NFA; each smallTable may be
+// given a label and as a side effect will get a random 3-digit serial number. For an example
+// of the output, see the functions TestPP and TestNullPP in prettyprinter_test.go
+type prettyPrinter struct {
+	randInts     rand.Source
+	tableLabels  map[*smallTable]string
+	tableSerials map[*smallTable]uint
+}
+
+func newPrettyPrinter(seed int) *prettyPrinter {
+	return &prettyPrinter{
+		randInts:     rand.NewSource(int64(seed)),
+		tableLabels:  make(map[*smallTable]string),
+		tableSerials: make(map[*smallTable]uint),
+	}
+}
+
+func (pp *prettyPrinter) tableSerial(t *smallTable) uint {
+	return pp.tableSerials[t]
+}
+func (pp *prettyPrinter) tableLabel(t *smallTable) string {
+	return pp.tableLabels[t]
+}
+
+func (pp *prettyPrinter) labelTable(table *smallTable, label string) {
+	pp.tableLabels[table] = label
+	pp.tableSerials[table] = uint(pp.randInts.Int63()%500 + 500)
+}
+
+func (pp *prettyPrinter) printNFA(t *smallTable) string {
+	return pp.printNFAStep(&faState{table: t}, 0, make(map[*smallTable]bool))
+}
+
+func (pp *prettyPrinter) printNFAStep(fas *faState, indent int, already map[*smallTable]bool) string {
+	t := fas.table
+	trailer := "\n"
+	if len(fas.fieldTransitions) != 0 {
+		trailer = fmt.Sprintf(" [%d transition(s)]\n", len(fas.fieldTransitions))
+	}
+	s := " " + pp.printTable(t) + trailer
+	for _, step := range t.steps {
+		if step != nil {
+			for _, state := range step.steps {
+				_, ok := already[state.table]
+				if !ok {
+					already[state.table] = true
+					s += pp.printNFAStep(state, indent+1, already)
+				}
+			}
+		}
+	}
+	return s
+}
+
+func (pp *prettyPrinter) printTable(t *smallTable) string {
+	// going to build a string rep of a smallTable based on the unpacked form
+	// each line is going to be a range like
+	// 'c' .. 'e' => %X
+	// lines where the *faNext is nil are omitted
+	var rows []string
+	unpacked := unpackTable(t)
+
+	var rangeStart int
+	var b int
+
+	defTrans := unpacked[0]
+
+	for {
+		for b < len(unpacked) && unpacked[b] == nil {
+			b++
+		}
+		if b == len(unpacked) {
+			break
+		}
+		rangeStart = b
+		lastN := unpacked[b]
+		for b < len(unpacked) && unpacked[b] == lastN {
+			b++
+		}
+		if lastN != defTrans {
+			row := ""
+			if b == rangeStart+1 {
+				row += fmt.Sprintf("'%s'", branchChar(byte(rangeStart)))
+			} else {
+				row += fmt.Sprintf("'%s'…'%s'", branchChar(byte(rangeStart)), branchChar(byte(b-1)))
+			}
+			row += " → " + pp.nextString(lastN)
+			rows = append(rows, row)
+		}
+	}
+	serial := pp.tableSerial(t)
+	label := pp.tableLabel(t)
+	if defTrans != nil {
+		dtString := "★ → " + pp.nextString(defTrans)
+		return fmt.Sprintf("%d [%s] ", serial, label) + strings.Join(rows, " / ") + " / " + dtString
+	} else {
+		return fmt.Sprintf("%d [%s] ", serial, label) + strings.Join(rows, " / ")
+	}
+}
+
+func (pp *prettyPrinter) nextString(n *faNext) string {
+	var snames []string
+	for _, step := range n.steps {
+		snames = append(snames, fmt.Sprintf("%d %s",
+			pp.tableSerial(step.table), pp.tableLabel(step.table)))
+	}
+	return "[" + strings.Join(snames, " · ") + "]"
+}
+
+func branchChar(b byte) string {
+	switch b {
+	// TODO: Figure out how to test commented-out cases
+	// case 0:
+	// 	return "∅"
+	case valueTerminator:
+		return "ℵ"
+	// case byte(byteCeiling):
+	// 	return "♾️"
+	default:
+		return fmt.Sprintf("%c", b)
+	}
+}
+
+func (pp *prettyPrinter) shortPrintNFA(table *smallTable) string {
+	return fmt.Sprintf("%d-%s", pp.tableSerials[table], pp.tableLabels[table])
+}