Correct bounds in SentenceDetectorModel #75

Merged · 1 commit · Jan 4, 2018

@@ -17,12 +17,15 @@ class PragmaticSentenceExtractor(text: String) {
     var lastCharPosition = 0
     var i = 0
     while (i < sentences.length) {
-      val sentenceContent = rawSentences(i)
-      val sentenceLastCharPos = lastCharPosition + sentenceContent.length - 1
+      val rawSentence = rawSentences(i)
+      val sentence = rawSentence.trim()
+      val startPad = rawSentence.indexOf(sentence)
+
+      val sentenceLastCharPos = lastCharPosition + rawSentence.length - 1
       sentences(i) = Sentence(
-        sentenceContent,
-        lastCharPosition,
-        sentenceLastCharPos
+        sentence,
+        lastCharPosition + startPad,
+        lastCharPosition + startPad + sentence.length() - 1
       )
       lastCharPosition = sentenceLastCharPos + 1
       i = i + 1
@@ -39,15 +42,15 @@ class PragmaticSentenceExtractor(text: String) {
     * @return final sentence structure
     */
   def pull: Array[Sentence] = {
-    val splitSentences: Array[String] = text
+    val splitSentences = text
       .split(PragmaticSymbols.UNPROTECTED_BREAK_INDICATOR)
       .map(_.replaceAll(PragmaticSymbols.BREAK_INDICATOR, ""))
       .map(_.trim).filter(_.nonEmpty)
       .map(s => recoverySymbols.replaceAllIn(
         s, m => PragmaticSymbols.symbolRecovery
           .getOrElse(m.matched, throw new IllegalArgumentException("Invalid symbol in sentence recovery"))
       ))
 
     buildSentenceProperties(splitSentences)
+      .filter(_.content.trim.nonEmpty)
   }
 
 }
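
The heart of the fix is in the hunk above: each raw slice from the splitter is trimmed, and startPad (the offset of the trimmed content inside the raw slice) shifts the begin and end bounds onto the first and last visible characters, while lastCharPosition still advances by the full raw length. A minimal, self-contained sketch of that arithmetic, using a stand-in case class rather than the project's Sentence, behaves like this:

// Stand-in for com.johnsnowlabs.nlp.annotators.common.Sentence, for illustration only.
case class SimpleSentence(content: String, begin: Int, end: Int)

def bounds(rawSentences: Array[String]): Array[SimpleSentence] = {
  var lastCharPosition = 0
  rawSentences.map { rawSentence =>
    val sentence = rawSentence.trim()
    val startPad = rawSentence.indexOf(sentence)           // length of the leading whitespace
    val result = SimpleSentence(
      sentence,
      lastCharPosition + startPad,                         // first visible character
      lastCharPosition + startPad + sentence.length - 1    // last visible character
    )
    lastCharPosition += rawSentence.length                 // advance past the whole raw slice
    result
  }
}

// bounds(Array(" Hello World!!", " New Sentence")) yields
// SimpleSentence("Hello World!!", 1, 13) and SimpleSentence("New Sentence", 15, 26).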

@@ -22,8 +22,14 @@ trait RegexTokenizerBehaviors { this: FlatSpec =>
       .collect
       .flatMap { r => r.getSeq[Row](0)}
       .map { a => Annotation(a.getString(0), a.getInt(1), a.getInt(2), a.getString(3), a.getMap[String, String](4)) }
-    val corpus = sentencesAnnotations
-      .map { a => a.result }
+
+    val docAnnotations = documents
+      .collect
+      .flatMap { r => r.getSeq[Row](0)}
+      .map { a => Annotation(a.getString(0), a.getInt(1), a.getInt(2), a.getString(3), a.getMap[String, String](4)) }
+
+    val corpus = docAnnotations
+      .map(d => d.result)
       .mkString("")
   }
 
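Rebuilding the corpus from the document annotations (the untrimmed original text) rather than from the trimmed sentence annotations is presumably what keeps the new begin/end offsets meaningful against the corpus string: the trimmed sentences no longer concatenate back to the original document. A hypothetical check of that property, with a stand-in annotation class rather than the project's Annotation, would be:

// Stand-in class and values, for illustration only.
case class SimpleAnnotation(result: String, begin: Int, end: Int)

val corpus = " Hello World!! New Sentence"   // document text with whitespace intact
val sentences = Seq(
  SimpleAnnotation("Hello World!!", 1, 13),
  SimpleAnnotation("New Sentence", 15, 26)
)

// Bounds expressed against the original text slice back to exactly the annotated content.
sentences.foreach { s =>
  assert(corpus.substring(s.begin, s.end + 1) == s.result)
}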

@@ -0,0 +1,27 @@
+package com.johnsnowlabs.nlp.annotators.sbd.pragmatic
+
+import com.johnsnowlabs.nlp.annotators.common.Sentence
+import org.scalatest.FlatSpec
+
+
+class SentenceDetectorModelBoundsSpec extends FlatSpec {
+
+  val model = new PragmaticMethod(false)
+
+  "SentenceDetectorModel" should "return correct sentence bounds" in {
+    val bounds = model.extractBounds("Hello World!! New Sentence", Array.empty[String])
+
+    assert(bounds.length == 2)
+    assert(bounds(0) == Sentence("Hello World!!", 0, 12))
+    assert(bounds(1) == Sentence("New Sentence", 14, 25))
+  }
+
+  "SentenceDetectorModel" should "correctly return sentence bounds with whitespaces" in {
+    val bounds = model.extractBounds(" Hello World!! .  New Sentence ", Array.empty[String])
+
+    assert(bounds.length == 3)
+    assert(bounds(0) == Sentence("Hello World!!", 1, 13))
+    assert(bounds(1) == Sentence(".", 15, 15))
+    assert(bounds(2) == Sentence("New Sentence", 18, 29))
+  }
+}
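
Assuming the project's usual sbt and ScalaTest setup (not stated in this diff), the new spec can be run on its own with:

sbt "testOnly com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModelBoundsSpec"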