Commit

add rules for geo coordinates
ryzheboka committed Feb 1, 2023
1 parent b4f3cf5 commit 493adce
Showing 4 changed files with 49 additions and 24 deletions.
8 changes: 8 additions & 0 deletions english/golden_rules_test.go
@@ -85,6 +85,14 @@ func TestGoldenRules(t *testing.T) {
}
compareSentences(t, actualText, expected, test)

test = "43. Geo Coordinates"
actualText = "You can find it at N°. 1026.253.553. That is where the treasure is."
expected = []string{
"You can find it at N°. 1026.253.553.",
" That is where the treasure is.",
}
compareSentences(t, actualText, expected, test)

test = "46. Ellipsis at end of quotation"
actualText = "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"
expected = []string{
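To reproduce the new golden rule outside the test file, a minimal sketch along the following lines should work, assuming the upstream import path github.com/neurosnap/sentences and its documented english.NewSentenceTokenizer constructor (neither is part of this diff):

package main

import (
	"fmt"

	"github.com/neurosnap/sentences/english"
)

func main() {
	// Assumption: passing nil storage falls back to the bundled English
	// training data, as in the upstream README.
	tokenizer, err := english.NewSentenceTokenizer(nil)
	if err != nil {
		panic(err)
	}

	text := "You can find it at N°. 1026.253.553. That is where the treasure is."
	for _, s := range tokenizer.Tokenize(text) {
		fmt.Printf("%q\n", s.Text)
	}
	// Expected, per the golden rule above:
	//   "You can find it at N°. 1026.253.553."
	//   " That is where the treasure is."
}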
6 changes: 3 additions & 3 deletions english/main.go
@@ -136,7 +136,7 @@ func (a *MultiPunctWordAnnotation) Annotate(tokens []*sentences.Token) []*senten
}

func (a *MultiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Token) {
-if a.IsListNumber(tokOne) {
+if a.IsListNumber(tokOne) || a.IsCoordinatePartOne(tokOne) {
tokOne.SentBreak = false
return
}
@@ -146,7 +146,7 @@ func (a *MultiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Tok
return
}

-if len(reAbbr.FindAllString(tokOne.Tok, 1)) == 0 && tokOne.Tok!="." && !a.HasUnreliableEndChars(tokOne) {
+if len(reAbbr.FindAllString(tokOne.Tok, 1)) == 0 && tokOne.Tok!="." && !(a.HasUnreliableEndChars(tokOne)) && !(a.IsCoordinatePartTwo(tokOne)) {
return
}

@@ -175,7 +175,7 @@ func (a *MultiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Tok
frequent-sentence-starters list, then label tok as a
sentence break.
*/
-if a.TokenParser.FirstUpper(tokTwo) && (a.SentStarters[nextTyp] != 0 || a.HasUnreliableEndChars(tokOne) || tokOne.Tok==".") {
+if a.TokenParser.FirstUpper(tokTwo) && (a.SentStarters[nextTyp] != 0 || a.HasUnreliableEndChars(tokOne) || tokOne.Tok=="." || a.IsCoordinatePartTwo(tokOne)) {
tokOne.SentBreak = true
return
}
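To make the two new branches in tokenAnnotation concrete, the sketch below walks the token windows of the golden-rule sentence using simplified stand-ins for the predicates (tokenAnnotation itself is unexported, so this does not call the real API; the regular expression is copied from token.go further down):

package main

import (
	"fmt"
	"regexp"
	"unicode"
)

// Copied from token.go in this commit.
var reCoordinateSecondPart = regexp.MustCompile(`^[0-9]*\.[0-9]*\.[0-9]*\.$`)

// Simplified stand-ins for the predicates used by tokenAnnotation.
func isCoordinatePartOne(tok string) bool { return tok == "N°." }
func isCoordinatePartTwo(tok string) bool { return reCoordinateSecondPart.MatchString(tok) }
func firstUpper(tok string) bool          { return unicode.IsUpper([]rune(tok)[0]) }

func main() {
	// Window 1: tokOne = "N°.", tokTwo = "1026.253.553.".
	// The first new condition holds, so SentBreak stays false and the
	// period in "N°." does not end the sentence.
	fmt.Println(isCoordinatePartOne("N°.")) // true

	// Window 2: tokOne = "1026.253.553.", tokTwo = "That".
	// The second new condition holds together with FirstUpper(tokTwo),
	// so SentBreak is set to true and the sentence ends after the coordinate.
	fmt.Println(isCoordinatePartTwo("1026.253.553.") && firstUpper("That")) // true
}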
43 changes: 23 additions & 20 deletions token.go
@@ -36,35 +36,38 @@ func (p *DefaultTokenGrouper) Group(tokens []*Token) [][2]*Token {

// Token stores a token of text with annotations produced during sentence boundary detection.
type Token struct {
-Tok string
-Position int
-SentBreak bool
-ParaStart bool
-LineStart bool
-Abbr bool
-periodFinal bool
-reEllipsis *regexp.Regexp
-reNumeric *regexp.Regexp
-reInitial *regexp.Regexp
-reListNumber *regexp.Regexp
-reAlpha *regexp.Regexp
+Tok string
+Position int
+SentBreak bool
+ParaStart bool
+LineStart bool
+Abbr bool
+periodFinal bool
+reEllipsis *regexp.Regexp
+reNumeric *regexp.Regexp
+reInitial *regexp.Regexp
+reListNumber *regexp.Regexp
+reAlpha *regexp.Regexp
+reCoordinateSecondPart *regexp.Regexp
}

-var reEllipsis = regexp.MustCompile(`\.\.+$`)
+var reEllipsis = regexp.MustCompile(`^\.\.+$`)
var reNumeric = regexp.MustCompile(`-?[\.,]?\d[\d,\.-]*\.?$`)
var reInitial = regexp.MustCompile(`^[A-Za-z]\.$`)
-var reListNumber = regexp.MustCompile(`\d+.?\)?$`)
+var reListNumber = regexp.MustCompile(`^\d+.?\)?$`)
var reAlpha = regexp.MustCompile(`^[A-Za-z]+$`)
+var reCoordinateSecondPart = regexp.MustCompile(`^[0-9]*\.[0-9]*\.[0-9]*\.$`)

// NewToken is the default implementation of the Token struct
func NewToken(token string) *Token {
tok := Token{
-Tok: token,
-reEllipsis: reEllipsis,
-reNumeric: reNumeric,
-reInitial: reInitial,
-reListNumber: reListNumber,
-reAlpha: reAlpha,
+Tok: token,
+reEllipsis: reEllipsis,
+reNumeric: reNumeric,
+reInitial: reInitial,
+reListNumber: reListNumber,
+reAlpha: reAlpha,
+reCoordinateSecondPart: reCoordinateSecondPart,
}

return &tok
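A quick standard-library check of what the re-anchored and newly added patterns accept; the sample strings here are illustrative and not taken from the test suite:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	reEllipsis := regexp.MustCompile(`^\.\.+$`)
	reListNumber := regexp.MustCompile(`^\d+.?\)?$`)
	reCoordinateSecondPart := regexp.MustCompile(`^[0-9]*\.[0-9]*\.[0-9]*\.$`)

	// With the added ^ anchors, the whole token must be an ellipsis or a
	// list number, not merely end like one.
	fmt.Println(reEllipsis.MatchString("..."))     // true
	fmt.Println(reEllipsis.MatchString("wait...")) // false (the unanchored pattern matched this)
	fmt.Println(reListNumber.MatchString("3.)"))   // true
	fmt.Println(reListNumber.MatchString("v1.2)")) // false

	// The coordinate pattern expects three dot-terminated groups of digits.
	fmt.Println(reCoordinateSecondPart.MatchString("1026.253.553.")) // true
	fmt.Println(reCoordinateSecondPart.MatchString("1026.253.553"))  // false: no trailing period
	fmt.Println(reCoordinateSecondPart.MatchString("..."))           // true: the digit groups may be empty
}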
16 changes: 15 additions & 1 deletion word_tokenizer.go
@@ -43,13 +43,16 @@ type TokenExistential interface {
IsNumber(*Token) bool
// True if the token is either a number or is alphabetic.
IsNonPunct(*Token) bool
+// True if the token is the first part of a coordinate.
+IsCoordinatePartOne(*Token) bool
+// True if the token is the second part of a coordinate.
+IsCoordinatePartTwo(*Token) bool
// Does this token end with a period?
HasPeriodFinal(*Token) bool
// Does this token end with a punctuation and a quote?
HasSentEndChars(*Token) bool
// Does this token end with ambiguous punctuation?
HasUnreliableEndChars(*Token) bool

}

// TokenParser is the primary token interface that determines the context and type of a tokenized word.
@@ -223,6 +226,16 @@ func (p *DefaultWordTokenizer) IsAlpha(t *Token) bool {
return t.reAlpha.MatchString(t.Tok)
}

+// IsCoordinatePartOne is true if the token text might be the first part of a coordinate.
+func (p *DefaultWordTokenizer) IsCoordinatePartOne(t *Token) bool {
+return strings.Compare(t.Tok, "N°.") == 0
+}

+// IsCoordinatePartTwo is true if the token text might be the second part of a coordinate.
+func (p *DefaultWordTokenizer) IsCoordinatePartTwo(t *Token) bool {
+return t.reCoordinateSecondPart.MatchString(t.Tok)
+}

// IsNonPunct is true if the token is either a number or is alphabetic.
func (p *DefaultWordTokenizer) IsNonPunct(t *Token) bool {
nonPunct := regexp.MustCompile(p.PunctStrings.NonPunct())
@@ -268,6 +281,7 @@ func (p *DefaultWordTokenizer) HasSentEndChars(t *Token) bool {

return false
}

// Find any punctuation that might mean the end of a sentence but doesn't have to
func (p *DefaultWordTokenizer) HasUnreliableEndChars(t *Token) bool {
enders := []string{
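Finally, a small usage sketch of the two predicates added to DefaultWordTokenizer above, assuming the upstream import path github.com/neurosnap/sentences. Neither method touches the tokenizer's PunctStrings, so a zero-value tokenizer is enough for this sketch:

package main

import (
	"fmt"

	"github.com/neurosnap/sentences"
)

func main() {
	wt := &sentences.DefaultWordTokenizer{}

	first := sentences.NewToken("N°.")
	second := sentences.NewToken("1026.253.553.")

	fmt.Println(wt.IsCoordinatePartOne(first))  // true: exact match against "N°."
	fmt.Println(wt.IsCoordinatePartTwo(second)) // true: matches reCoordinateSecondPart
	fmt.Println(wt.IsCoordinatePartTwo(first))  // false
}

As implemented, strings.Compare(t.Tok, "N°.") == 0 is equivalent to the plain comparison t.Tok == "N°.".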
