Enhance: Reuse slack client

Alfex4936 · Oct 17, 2024 · 090bdd0 · 090bdd0
1 parent f499111
commit 090bdd0
Show file tree

Hide file tree

Showing 4 changed files with 484 additions and 34 deletions.
diff --git a/backend/util/ahocorasick.go b/backend/util/ahocorasick.go
@@ -0,0 +1,281 @@
+// original code: https://github.com/RRethy/ahocorasick
+// edited version
+package util
+
+import (
+	"fmt"
+	"sort"
+	"unicode/utf8"
+)
+
// unusedState marks a slot of Matcher.check that no trie state occupies yet.
// It has to differ from every real state number. In particular it cannot be
// 0, because 0 is the root state and is stored in check as the parent of the
// root's children.
const unusedState = -1

// Matcher is the pattern matching state machine: an Aho-Corasick automaton
// whose trie is stored as a double array (base/check) indexed by state number.
// State 0 is the root.
type Matcher struct {
	base        []int        // base[s]+runeIndices[r] is the state reached from s on rune r
	check       []int        // check[t] is the parent state of t, or unusedState if t is free
	fail        []int        // fail[s] is the state to fall back to when s has no edge for the next rune
	output      [][]int      // output[s] holds the rune lengths of all patterns ending at s
	runeIndices map[rune]int // mapping from runes to indices
	runes       []rune       // list of unique runes, in order of first appearance in the patterns
}

// Match represents a matched pattern in the text.
type Match struct {
	Word  string // the matched pattern
	Index int    // the start of the match, counted in runes (not bytes) from the start of the text
}

// CompileByteSlices compiles a Matcher from a slice of byte slices. This Matcher can be
// used to find occurrences of each pattern in a text.
//
// Patterns that are not valid UTF-8 are dropped, and empty patterns never match.
func CompileByteSlices(words [][]byte) *Matcher {
	wordRuneSlices := make([][]rune, 0, len(words))
	for _, word := range words {
		runes, err := bytesToRunes(word)
		if err != nil {
			// Best effort: skip the invalid pattern instead of failing the
			// whole compilation. Nothing is appended, so no empty placeholder
			// reaches compile.
			continue
		}
		wordRuneSlices = append(wordRuneSlices, runes)
	}
	return compile(wordRuneSlices)
}

// CompileStrings compiles a Matcher from a slice of strings. This Matcher can
// be used to find occurrences of each pattern in a text.
//
// Empty patterns never match. A pattern listed several times is reported
// once per listing.
func CompileStrings(words []string) *Matcher {
	wordRuneSlices := make([][]rune, len(words))
	for i, word := range words {
		wordRuneSlices[i] = []rune(word)
	}
	return compile(wordRuneSlices)
}

// compile builds the automaton for words. It sorts words in place and then
// creates the trie breadth first, so that the fail link of every new state
// only refers to states whose children already exist.
func compile(words [][]rune) *Matcher {
	m := &Matcher{
		base:        []int{0},
		check:       []int{0}, // the root occupies slot 0
		fail:        []int{0},
		output:      [][]int{nil},
		runeIndices: make(map[rune]int),
	}

	// Build rune to index mapping.
	for _, word := range words {
		for _, r := range word {
			if _, exists := m.runeIndices[r]; !exists {
				m.runeIndices[r] = len(m.runeIndices)
				m.runes = append(m.runes, r)
			}
		}
	}

	// Sorting makes construction deterministic and puts all words sharing a
	// prefix next to each other, with the prefix itself first.
	sort.Slice(words, func(i, j int) bool {
		return lessRuneSlice(words[i], words[j])
	})

	// Empty patterns sort to the front; they have no first rune and are ignored.
	first := 0
	for first < len(words) && len(words[first]) == 0 {
		first++
	}

	// A trieNode is a state together with the range of words that pass
	// through it. Every word in [start, end) is strictly longer than depth.
	type trieNode struct {
		state int
		depth int
		start int
		end   int
	}

	var queue []trieNode
	if first < len(words) {
		queue = append(queue, trieNode{state: 0, depth: 0, start: first, end: len(words)})
	}

	for len(queue) > 0 {
		node := queue[0]
		queue = queue[1:]

		edges := collectEdges(words, node.depth, node.start, node.end)

		base := m.findBase(edges)
		m.base[node.state] = base

		i := node.start
		for _, edge := range edges {
			offset := m.runeIndices[edge]
			newState := base + offset

			m.ensureStateCapacity(newState)
			m.check[newState] = node.state

			// Children of the root fall back to the root; deeper states
			// follow the parent's fail chain.
			failState := 0
			if node.depth > 0 {
				failState = m.getFailState(m.fail[node.state], offset)
			}
			m.fail[newState] = failState

			// Every pattern ending at the fail state is a suffix of the
			// path to newState, so it ends here too.
			m.output[newState] = append(m.output[newState], m.output[failState]...)

			// Consume the words that continue with this edge. Words that end
			// exactly here come first in sort order; they are recorded as
			// output and left out of the child's range, so the child never
			// indexes past the end of a word.
			childStart := i
			for i < node.end && words[i][node.depth] == edge {
				if len(words[i]) == node.depth+1 {
					m.output[newState] = append(m.output[newState], len(words[i]))
					childStart = i + 1
				}
				i++
			}

			if childStart < i {
				queue = append(queue, trieNode{
					state: newState,
					depth: node.depth + 1,
					start: childStart,
					end:   i,
				})
			}
		}
	}

	return m
}

// lessRuneSlice compares two rune slices lexicographically. A slice that is a
// proper prefix of another sorts before it.
func lessRuneSlice(a, b []rune) bool {
	minLen := len(a)
	if len(b) < minLen {
		minLen = len(b)
	}
	for i := 0; i < minLen; i++ {
		if a[i] != b[i] {
			return a[i] < b[i]
		}
	}
	return len(a) < len(b)
}

// collectEdges returns the distinct runes found at position depth of
// words[start:end], in ascending order. Words too short to have a rune at
// depth are skipped.
//
// The words in the range must be sorted and share their first depth runes,
// which makes the runes at depth non-decreasing, so comparing with the last
// collected edge is enough to drop duplicates.
func collectEdges(words [][]rune, depth, start, end int) []rune {
	var edges []rune
	for i := start; i < end; i++ {
		if depth >= len(words[i]) {
			continue
		}
		r := words[i][depth]
		if len(edges) == 0 || edges[len(edges)-1] != r {
			edges = append(edges, r)
		}
	}
	return edges
}

// ensureStateCapacity grows the state arrays so that state is a valid index.
// Newly added check slots are marked unusedState.
func (m *Matcher) ensureStateCapacity(state int) {
	for len(m.check) <= state {
		m.base = append(m.base, 0)
		m.check = append(m.check, unusedState)
		m.fail = append(m.fail, 0)
		m.output = append(m.output, nil)
	}
}

// getFailState follows the fail chain starting at failState until it reaches
// a state with an edge for offset, and returns the target of that edge. It
// returns the root (0) when not even the root has such an edge.
func (m *Matcher) getFailState(failState, offset int) int {
	for failState != 0 && !m.hasEdge(failState, offset) {
		failState = m.fail[failState]
	}
	if m.hasEdge(failState, offset) {
		return m.base[failState] + offset
	}
	return 0
}

// findBase returns the smallest base >= 1 for which base+offset is a free
// slot for every edge. Slots beyond the current end of check count as free;
// the caller grows the arrays when it claims them. Starting at 1 keeps slot
// 0 reserved for the root.
func (m *Matcher) findBase(edges []rune) int {
search:
	for base := 1; ; base++ {
		for _, edge := range edges {
			state := base + m.runeIndices[edge]
			if state < len(m.check) && m.check[state] != unusedState {
				continue search
			}
		}
		return base
	}
}

// hasEdge checks if there is an edge from the given state with the given offset.
func (m *Matcher) hasEdge(state, offset int) bool {
	nextState := m.base[state] + offset
	return nextState < len(m.check) && m.check[nextState] == state
}

// FindAllString finds all instances of the patterns in the text. Match.Index
// counts runes, not bytes. See FindAllRuneSlice for the order of the result.
func (m *Matcher) FindAllString(text string) []*Match {
	return m.FindAllRuneSlice([]rune(text))
}

// FindAllRuneSlice finds all instances of the patterns in the rune slice,
// including overlapping ones. Matches are returned in the order in which the
// scan reaches their last rune; it returns nil when nothing matches.
func (m *Matcher) FindAllRuneSlice(text []rune) []*Match {
	var matches []*Match
	state := 0
	for i, r := range text {
		offset, exists := m.runeIndices[r]
		if !exists {
			// A rune that occurs in no pattern cannot be part of a match.
			state = 0
			continue
		}
		for state != 0 && !m.hasEdge(state, offset) {
			state = m.fail[state]
		}
		if m.hasEdge(state, offset) {
			state = m.base[state] + offset
		} else {
			state = 0
		}
		for _, length := range m.output[state] {
			// The state was reached by consuming at least length runes, so
			// start is never negative.
			start := i - length + 1
			matches = append(matches, &Match{
				Word:  string(text[start : i+1]),
				Index: start,
			})
		}
	}
	return matches
}

// bytesToRunes converts a byte slice to a rune slice. It returns an error
// naming the offending byte offset if text is not valid UTF-8.
func bytesToRunes(text []byte) ([]rune, error) {
	runes := make([]rune, 0, utf8.RuneCount(text))
	for pos := 0; pos < len(text); {
		r, size := utf8.DecodeRune(text[pos:])
		// RuneError with size 1 signals a decoding failure; a literal U+FFFD
		// in the input decodes with size 3 and is accepted.
		if r == utf8.RuneError && size == 1 {
			return nil, fmt.Errorf("invalid UTF-8 encoding at byte offset %d", pos)
		}
		runes = append(runes, r)
		pos += size
	}
	return runes, nil
}
diff --git a/backend/util/ahocorasick_test.go b/backend/util/ahocorasick_test.go
@@ -0,0 +1,104 @@
+package util
+
+import (
+	"testing"
+)
+
+func TestNonASCIICharacters(t *testing.T) {
+	patterns := []string{"안녕하세요", "안녕", "하세요"}
+	matcher := CompileStrings(patterns)
+	text := "안녕하세요 여러분"
+	expectedMatches := []*Match{
+		{Word: "안녕하세요", Index: 0},
+		{Word: "안녕", Index: 0},
+		{Word: "하세요", Index: 2},
+	}
+	matches := matcher.FindAllString(text)
+	if !compareMatches(matches, expectedMatches) {
+		t.Errorf("Expected matches %v, got %v", expectedMatches, matches)
+	}
+}
+
+func TestLongPatterns(t *testing.T) {
+	pattern := ""
+	for i := 0; i < 1000; i++ {
+		pattern += "a"
+	}
+	patterns := []string{pattern}
+	matcher := CompileStrings(patterns)
+	text := ""
+	for i := 0; i < 1000; i++ {
+		text += "a"
+	}
+	matches := matcher.FindAllString(text)
+	if len(matches) != 1 || matches[0].Index != 0 {
+		t.Errorf("Expected one match at index 0, got %v", matches)
+	}
+}
+
+func TestMultipleMatchesAtSamePosition(t *testing.T) {
+	patterns := []string{"he", "he", "he"}
+	matcher := CompileStrings(patterns)
+	text := "he"
+	matches := matcher.FindAllString(text)
+	if len(matches) != 3 {
+		t.Errorf("Expected 3 matches, got %d", len(matches))
+	}
+}
+
+func TestLongText(t *testing.T) {
+	patterns := []string{"test", "long", "text"}
+	matcher := CompileStrings(patterns)
+	text := ""
+	for i := 0; i < 10000; i++ {
+		text += "This is a long text for testing."
+	}
+	matches := matcher.FindAllString(text)
+	if len(matches) == 0 {
+		t.Error("Expected matches, got none")
+	}
+}
+
+func TestSpecialCharacters(t *testing.T) {
+	patterns := []string{"$", "^", "*", "+", "."}
+	matcher := CompileStrings(patterns)
+	text := "This $ is a ^ test * with + special . characters."
+	expectedMatches := []*Match{
+		{Word: "$", Index: 5},
+		{Word: "^", Index: 12},
+		{Word: "*", Index: 19},
+		{Word: "+", Index: 26},
+		{Word: ".", Index: 36},
+	}
+	matches := matcher.FindAllString(text)
+	if !compareMatches(matches, expectedMatches) {
+		t.Errorf("Expected matches %v, got %v", expectedMatches, matches)
+	}
+}
+
+func TestUnicodeCharacters(t *testing.T) {
+	patterns := []string{"😊", "🚀", "🌟"}
+	matcher := CompileStrings(patterns)
+	text := "Hello 😊! Let's go to the moon 🚀 and shine like a star 🌟."
+	expectedMatches := []*Match{
+		{Word: "😊", Index: 6},
+		{Word: "🚀", Index: 31},
+		{Word: "🌟", Index: 53},
+	}
+	matches := matcher.FindAllString(text)
+	if !compareMatches(matches, expectedMatches) {
+		t.Errorf("Expected matches %v, got %v", expectedMatches, matches)
+	}
+}
+
+func compareMatches(a, b []*Match) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i].Index != b[i].Index || a[i].Word != b[i].Word {
+			return false
+		}
+	}
+	return true
+}