Commit

add rules for geo coordinates
ryzheboka committed Feb 1, 2023
1 parent b4f3cf5 commit 493adce
Showing 4 changed files with 49 additions and 24 deletions.
8 changes: 8 additions & 0 deletions english/golden_rules_test.go
@@ -85,6 +85,14 @@ func TestGoldenRules(t *testing.T) {
}
compareSentences(t, actualText, expected, test)

test = "43. Geo Coordinates"
actualText = "You can find it at N°. 1026.253.553. That is where the treasure is."
expected = []string{
"You can find it at N°. 1026.253.553.",
" That is where the treasure is.",
}
compareSentences(t, actualText, expected, test)

test = "46. Ellipsis at end of quotation"
actualText = "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"
expected = []string{
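To reproduce the new golden rule outside the test file, a minimal sketch along the following lines should work, assuming the upstream import path github.com/neurosnap/sentences and its documented english.NewSentenceTokenizer constructor (neither is part of this diff):

package main

import (
	"fmt"

	"github.com/neurosnap/sentences/english"
)

func main() {
	// Assumption: passing nil storage falls back to the bundled English
	// training data, as in the upstream README.
	tokenizer, err := english.NewSentenceTokenizer(nil)
	if err != nil {
		panic(err)
	}

	text := "You can find it at N°. 1026.253.553. That is where the treasure is."
	for _, s := range tokenizer.Tokenize(text) {
		fmt.Printf("%q\n", s.Text)
	}
	// Expected, per the golden rule above:
	//   "You can find it at N°. 1026.253.553."
	//   " That is where the treasure is."
}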
6 changes: 3 additions & 3 deletions english/main.go
@@ -136,7 +136,7 @@ func (a *MultiPunctWordAnnotation) Annotate(tokens []*sentences.Token) []*senten
}

func (a *MultiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Token) {
-if a.IsListNumber(tokOne) {
+if a.IsListNumber(tokOne) || a.IsCoordinatePartOne(tokOne) {
tokOne.SentBreak = false
return
}
@@ -146,7 +146,7 @@ func (a *MultiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Tok
return
}

-if len(reAbbr.FindAllString(tokOne.Tok, 1)) == 0 && tokOne.Tok!="." && !a.HasUnreliableEndChars(tokOne) {
+if len(reAbbr.FindAllString(tokOne.Tok, 1)) == 0 && tokOne.Tok!="." && !(a.HasUnreliableEndChars(tokOne)) && !(a.IsCoordinatePartTwo(tokOne)) {
return
}

@@ -175,7 +175,7 @@ func (a *MultiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Tok
frequent-sentence-starters list, then label tok as a
sentence break.
*/
-if a.TokenParser.FirstUpper(tokTwo) && (a.SentStarters[nextTyp] != 0 || a.HasUnreliableEndChars(tokOne) || tokOne.Tok==".") {
+if a.TokenParser.FirstUpper(tokTwo) && (a.SentStarters[nextTyp] != 0 || a.HasUnreliableEndChars(tokOne) || tokOne.Tok=="." || a.IsCoordinatePartTwo(tokOne)) {
tokOne.SentBreak = true
return
}
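To make the two new branches in tokenAnnotation concrete, the sketch below walks the token windows of the golden-rule sentence using simplified stand-ins for the predicates (tokenAnnotation itself is unexported, so this does not call the real API; the regular expression is copied from token.go further down):

package main

import (
	"fmt"
	"regexp"
	"unicode"
)

// Copied from token.go in this commit.
var reCoordinateSecondPart = regexp.MustCompile(`^[0-9]*\.[0-9]*\.[0-9]*\.$`)

// Simplified stand-ins for the predicates used by tokenAnnotation.
func isCoordinatePartOne(tok string) bool { return tok == "N°." }
func isCoordinatePartTwo(tok string) bool { return reCoordinateSecondPart.MatchString(tok) }
func firstUpper(tok string) bool          { return unicode.IsUpper([]rune(tok)[0]) }

func main() {
	// Window 1: tokOne = "N°.", tokTwo = "1026.253.553.".
	// The first new condition holds, so SentBreak stays false and the
	// period in "N°." does not end the sentence.
	fmt.Println(isCoordinatePartOne("N°.")) // true

	// Window 2: tokOne = "1026.253.553.", tokTwo = "That".
	// The second new condition holds together with FirstUpper(tokTwo),
	// so SentBreak is set to true and the sentence ends after the coordinate.
	fmt.Println(isCoordinatePartTwo("1026.253.553.") && firstUpper("That")) // true
}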
43 changes: 23 additions & 20 deletions token.go
@@ -36,35 +36,38 @@ func (p *DefaultTokenGrouper) Group(tokens []*Token) [][2]*Token {

// Token stores a token of text with annotations produced during sentence boundary detection.
type Token struct {
-Tok string
-Position int
-SentBreak bool
-ParaStart bool
-LineStart bool
-Abbr bool
-periodFinal bool
-reEllipsis *regexp.Regexp
-reNumeric *regexp.Regexp
-reInitial *regexp.Regexp
-reListNumber *regexp.Regexp
-reAlpha *regexp.Regexp
+Tok string
+Position int
+SentBreak bool
+ParaStart bool
+LineStart bool
+Abbr bool
+periodFinal bool
+reEllipsis *regexp.Regexp
+reNumeric *regexp.Regexp
+reInitial *regexp.Regexp
+reListNumber *regexp.Regexp
+reAlpha *regexp.Regexp
+reCoordinateSecondPart *regexp.Regexp
}

-var reEllipsis = regexp.MustCompile(`\.\.+$`)
+var reEllipsis = regexp.MustCompile(`^\.\.+$`)
var reNumeric = regexp.MustCompile(`-?[\.,]?\d[\d,\.-]*\.?$`)
var reInitial = regexp.MustCompile(`^[A-Za-z]\.$`)
-var reListNumber = regexp.MustCompile(`\d+.?\)?$`)
+var reListNumber = regexp.MustCompile(`^\d+.?\)?$`)
var reAlpha = regexp.MustCompile(`^[A-Za-z]+$`)
+var reCoordinateSecondPart = regexp.MustCompile(`^[0-9]*\.[0-9]*\.[0-9]*\.$`)

// NewToken is the default implementation of the Token struct
func NewToken(token string) *Token {
tok := Token{
-Tok: token,
-reEllipsis: reEllipsis,
-reNumeric: reNumeric,
-reInitial: reInitial,
-reListNumber: reListNumber,
-reAlpha: reAlpha,
+Tok: token,
+reEllipsis: reEllipsis,
+reNumeric: reNumeric,
+reInitial: reInitial,
+reListNumber: reListNumber,
+reAlpha: reAlpha,
+reCoordinateSecondPart: reCoordinateSecondPart,
}

return &tok
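A quick standard-library check of what the re-anchored and newly added patterns accept; the sample strings here are illustrative and not taken from the test suite:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	reEllipsis := regexp.MustCompile(`^\.\.+$`)
	reListNumber := regexp.MustCompile(`^\d+.?\)?$`)
	reCoordinateSecondPart := regexp.MustCompile(`^[0-9]*\.[0-9]*\.[0-9]*\.$`)

	// With the added ^ anchors, the whole token must be an ellipsis or a
	// list number, not merely end like one.
	fmt.Println(reEllipsis.MatchString("..."))     // true
	fmt.Println(reEllipsis.MatchString("wait...")) // false (the unanchored pattern matched this)
	fmt.Println(reListNumber.MatchString("3.)"))   // true
	fmt.Println(reListNumber.MatchString("v1.2)")) // false

	// The coordinate pattern expects three dot-terminated groups of digits.
	fmt.Println(reCoordinateSecondPart.MatchString("1026.253.553.")) // true
	fmt.Println(reCoordinateSecondPart.MatchString("1026.253.553"))  // false: no trailing period
	fmt.Println(reCoordinateSecondPart.MatchString("..."))           // true: the digit groups may be empty
}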
16 changes: 15 additions & 1 deletion word_tokenizer.go
@@ -43,13 +43,16 @@ type TokenExistential interface {
IsNumber(*Token) bool
// True if the token is either a number or is alphabetic.
IsNonPunct(*Token) bool
+// True if the token is the first part of a coordinate.
+IsCoordinatePartOne(*Token) bool
+// True if the token is the second part of a coordinate.
+IsCoordinatePartTwo(*Token) bool
// Does this token end with a period?
HasPeriodFinal(*Token) bool
// Does this token end with a punctuation and a quote?
HasSentEndChars(*Token) bool
// Does this token end with ambiguous punctuation?
HasUnreliableEndChars(*Token) bool

}

// TokenParser is the primary token interface that determines the context and type of a tokenized word.
@@ -223,6 +226,16 @@ func (p *DefaultWordTokenizer) IsAlpha(t *Token) bool {
return t.reAlpha.MatchString(t.Tok)
}

+// IsCoordinatePartOne is true if the token text might be the first part of a coordinate.
+func (p *DefaultWordTokenizer) IsCoordinatePartOne(t *Token) bool {
+return strings.Compare(t.Tok, "N°.") == 0
+}

+// IsCoordinatePartTwo is true if the token text might be the second part of a coordinate.
+func (p *DefaultWordTokenizer) IsCoordinatePartTwo(t *Token) bool {
+return t.reCoordinateSecondPart.MatchString(t.Tok)
+}

// IsNonPunct is true if the token is either a number or is alphabetic.
func (p *DefaultWordTokenizer) IsNonPunct(t *Token) bool {
nonPunct := regexp.MustCompile(p.PunctStrings.NonPunct())
@@ -268,6 +281,7 @@ func (p *DefaultWordTokenizer) HasSentEndChars(t *Token) bool {

return false
}

// Find any punctuation that might mean the end of a sentence but doesn't have to
func (p *DefaultWordTokenizer) HasUnreliableEndChars(t *Token) bool {
enders := []string{
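Finally, a small usage sketch of the two predicates added to DefaultWordTokenizer above, assuming the upstream import path github.com/neurosnap/sentences. Neither method touches the tokenizer's PunctStrings, so a zero-value tokenizer is enough for this sketch:

package main

import (
	"fmt"

	"github.com/neurosnap/sentences"
)

func main() {
	wt := &sentences.DefaultWordTokenizer{}

	first := sentences.NewToken("N°.")
	second := sentences.NewToken("1026.253.553.")

	fmt.Println(wt.IsCoordinatePartOne(first))  // true: exact match against "N°."
	fmt.Println(wt.IsCoordinatePartTwo(second)) // true: matches reCoordinateSecondPart
	fmt.Println(wt.IsCoordinatePartTwo(first))  // false
}

As implemented, strings.Compare(t.Tok, "N°.") == 0 is equivalent to the plain comparison t.Tok == "N°.".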
