Skip to content

Commit

Permalink
perf: add PureNumber regexp for dedu-smart
Browse files Browse the repository at this point in the history
  • Loading branch information
mstxq17 committed Nov 4, 2023
1 parent 871bf2a commit 5701714
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 7 deletions.
12 changes: 9 additions & 3 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,23 @@ var diffCmd = &cobra.Command{
}
if cmpMode == 1 {
for _, line := range onlyInA {
fmt.Println(line)
if line != "" {
fmt.Println(line)
}
}
}
if cmpMode == 2 {
for _, line := range onlyInB {
fmt.Println(line)
if line != "" {
fmt.Println(line)
}
}
}
if cmpMode == 3 {
for _, line := range inBoth {
fmt.Println(line)
if line != "" {
fmt.Println(line)
}
}
}
} else {
Expand Down
32 changes: 28 additions & 4 deletions core/duplicate.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,28 @@ package core
import "regexp"

// Placeholder tokens. generalize substitutes one of these for every piece of a
// line that matches the corresponding pattern.
const (
	// ALPHANUMERIC is the placeholder token "{ALPHANUMERIC}".
	ALPHANUMERIC = "{ALPHANUMERIC}"
	// AlphanumericOtherMixed is the placeholder for runs of eight or more
	// characters drawn from digits, ASCII letters, '_' and '-'.
	AlphanumericOtherMixed = "{ALPHANUMERIC_OTHER_MIXED}"
	// PureNumber is the placeholder for runs of two to seven digits.
	PureNumber = "{PURE_NUMBER}"
)

var (
	// Filters maps each placeholder token to the source of the regular
	// expression whose matches are replaced by that token.
	Filters = map[string]string{
		PureNumber:             `[0-9]{2,7}`,
		AlphanumericOtherMixed: `[0-9A-Za-z_-]{8,}`,
	}

	// OrderFilters lists the keys of Filters in the order the filters are
	// applied. Go map iteration order is unspecified, so the sequence has to
	// be pinned explicitly: the mixed pattern runs first, then the pure-number
	// pattern.
	OrderFilters = []string{AlphanumericOtherMixed, PureNumber}
)

// DuplicateRemover carries the state used while de-duplicating lines.
//
// NOTE(review): the roles of linesMap and linesCount are inferred from their
// names and types only; confirm against RemoveDuplicator.
type DuplicateRemover struct {
	linesMap   map[string]struct{} // set keyed by string; presumably the lines already seen
	linesCount map[string]int      // counter keyed by string; presumably occurrences per generalized line
	threshold  int                 // copied from NewDuplicateRemover's threshold argument
	smart      bool                // copied from NewDuplicateRemover's smart argument

	// ANRegexp maps a placeholder key (see Filters) to its compiled pattern.
	// generalize replaces every match of ANRegexp[key] with key. A field name
	// may be declared only once per struct, so this map is the sole ANRegexp
	// field.
	ANRegexp map[string]*regexp.Regexp
}

func NewDuplicateRemover(threshold int, smart bool) *DuplicateRemover {
Expand All @@ -21,7 +34,15 @@ func NewDuplicateRemover(threshold int, smart bool) *DuplicateRemover {
threshold: threshold,
smart: smart,
}
dr.ANRegexp, _ = regexp.Compile(`[0-9A-Za-z]{10,}`)
// some design problems
// 设计存在问题
dr.ANRegexp, _ = func() (map[string]*regexp.Regexp, error) {
ANRegexp := make(map[string]*regexp.Regexp)
for key, value := range Filters {
ANRegexp[key], _ = regexp.Compile(value)
}
return ANRegexp, nil
}()
return dr
}

Expand All @@ -46,5 +67,8 @@ func (dr *DuplicateRemover) RemoveDuplicator(line string) string {

// generalize rewrites line by applying every filter named in OrderFilters, in
// that order. For each key, all matches of the compiled pattern stored in
// dr.ANRegexp[key] are replaced with the key itself (a placeholder token such
// as "{PURE_NUMBER}"), and the rewritten line is returned.
//
// NOTE(review): the previous comment said this is used for smart filtering
// once a threshold is exceeded; confirm against the caller.
func (dr *DuplicateRemover) generalize(line string) string {
	for _, key := range OrderFilters {
		re := dr.ANRegexp[key]
		if re == nil {
			// No usable pattern for this key: it is absent from the map, or
			// its compilation failed and the error was discarded. Skip it
			// instead of panicking on a nil *regexp.Regexp.
			continue
		}
		line = re.ReplaceAllString(line, key)
	}
	return line
}

0 comments on commit 5701714

Please sign in to comment.