Skip to content

Commit

Permalink
Merge pull request #255 from cozydev-pink/fragments
Browse files Browse the repository at this point in the history
  • Loading branch information
valencik authored Nov 29, 2024
2 parents 82ae0c3 + 56a98bc commit 5b3d185
Show file tree
Hide file tree
Showing 9 changed files with 254 additions and 7 deletions.
1 change: 1 addition & 0 deletions core/src/main/scala/pink/cozydev/protosearch/Hit.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ case class Hit(
val id: Int,
val score: Double,
val fields: Map[String, String],
val highlight: String,
)
13 changes: 9 additions & 4 deletions core/src/main/scala/pink/cozydev/protosearch/MultiIndex.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package pink.cozydev.protosearch

import pink.cozydev.lucille.Query
import pink.cozydev.protosearch.highlight.{FirstMatchHighlighter, FragmentFormatter}

case class MultiIndex(
indexes: Map[String, Index],
Expand All @@ -26,18 +27,22 @@ case class MultiIndex(

private val indexSearcher = IndexSearcher(this, schema.defaultOR)
private val scorer = Scorer(this, schema.defaultOR)
private val highlighter = FirstMatchHighlighter(FragmentFormatter(100, "<b>", "</b>"))
val queryAnalyzer = schema.queryAnalyzer(schema.defaultField)

/** Search the index with a `Query`. Results are sorted by descending score.
*
* @param q The `Query` to search
* @return A list of `Hit`s or error
*/
def search(q: Query): Either[String, List[Hit]] = {
// Searches and scores `q`, then builds one `Hit` per (docId, score) pair produced by the scorer.
// `rawQStr` is the unparsed query text; it is passed unchanged to `highlighter.highlight`,
// so the highlight depends on that raw text rather than on the parsed `Query`.
def search(rawQStr: String, q: Query): Either[String, List[Hit]] = {
val docs = indexSearcher.search(q).flatMap(ds => scorer.score(q, ds))
val lstb = List.newBuilder[Hit]
docs.map(_.foreach { case (docId, score) =>
// Resolve every entry of `fields` for this document id.
val docFields = fields.map { case (k, v) => (k, v(docId)) }
// Only the "body" field is highlighted; a document without a "body" entry gets "".
val highlight =
docFields.get("body").map(b => highlighter.highlight(b, rawQStr)).getOrElse("")
lstb += Hit(docId, score, docFields, highlight)
})
// Reuse the Either held by `docs`: a Left passes through untouched, a Right yields the
// hits accumulated in the builder above.
docs.map(_ => lstb.result())
}
Expand All @@ -48,7 +53,7 @@ case class MultiIndex(
* @return A list of `Hit`s or error
*/
def search(q: String): Either[String, List[Hit]] =
queryAnalyzer.parse(q).flatMap(search)
queryAnalyzer.parse(q).flatMap(pq => search(q, pq))

/** Search the index with a possibly incomplete query. Meant for use in a "search as your type"
* scenario. The last term, which is possibly incomplete, is rewritten to be a prefix.
Expand All @@ -59,7 +64,7 @@ case class MultiIndex(
def searchInteractive(partialQuery: String): Either[String, List[Hit]] = {
val rewriteQ =
queryAnalyzer.parse(partialQuery).map(mq => mq.mapLastTerm(LastTermRewrite.termToPrefix))
rewriteQ.flatMap(search)
rewriteQ.flatMap(newq => search(partialQuery, newq))
}
}
object MultiIndex {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright 2022 CozyDev
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package pink.cozydev.protosearch.highlight

/** Builds a short preview of a text in which the first literal occurrence of the query is
  * wrapped in the formatter's start/end tags.
  *
  * Matching is a plain, case-sensitive substring search (`String.indexOf`) of the whole query
  * string; the query is not parsed or tokenized here.
  *
  * NOTE(review): the text is copied into the result without HTML escaping. If callers render
  * the result as HTML, escaping has to happen somewhere else -- confirm against the callers.
  *
  * @param formatter supplies the highlight tags and the `maxSize` budget of the preview
  */
case class FirstMatchHighlighter(
    formatter: FragmentFormatter
) {

  /** How many characters before the match the preview window may start. */
  val lookBackWindowSize: Int = formatter.maxSize / 2

  /** Strips surrounding whitespace and, when the result is longer than `formatter.maxSize`,
    * keeps the first `maxSize` characters followed by `"..."`.
    */
  def trim(str: String): String = {
    val trimmed = str.trim()
    if (trimmed.size > formatter.maxSize)
      trimmed.take(formatter.maxSize) + "..."
    else trimmed
  }

  /** Returns a preview of `str` with the first occurrence of `queryStr` highlighted.
    *
    * When `queryStr` is empty or does not occur in `str`, the result is simply `trim(str)`.
    * Otherwise the preview starts at most `lookBackWindowSize` characters before the match,
    * is stripped of surrounding whitespace, and is cut to `formatter.maxSize` characters plus
    * `"..."`. The cut never lands inside the highlighted region: if the budget would end inside
    * the tags or the match, the preview is extended to the end of the closing tag instead.
    *
    * @param str the text to preview
    * @param queryStr the literal text to look for
    * @return the formatted preview
    */
  def highlight(str: String, queryStr: String): String = {
    // An empty query "matches" at index 0 and would only produce an empty tag pair.
    val offset = if (queryStr.isEmpty) -1 else str.indexOf(queryStr)
    if (offset == -1) trim(str)
    else {
      val matchEnd = offset + queryStr.length

      // Strip surrounding whitespace up front (same `<= ' '` rule as String.trim), but never
      // past the match itself, so the offset handed to the formatter is always valid.
      val firstVisible = str.indexWhere(_ > ' ', windowStart(str, offset))
      val from = if (firstVisible == -1) offset else math.min(firstVisible, offset)
      val until = math.max(matchEnd, str.lastIndexWhere(_ > ' ') + 1)

      val relOffset = offset - from
      val formatted =
        formatter.format(str.substring(from, until), List(relOffset, queryStr.length))

      // Index just past the closing tag in `formatted`; cutting before it would leave a
      // dangling or truncated tag in the output.
      val highlightEnd =
        relOffset + formatter.startTag.length + queryStr.length + formatter.endTag.length
      val cut = math.max(formatter.maxSize, highlightEnd)
      if (formatted.length <= cut) formatted
      else formatted.take(cut) + "..."
    }
  }

  /** Picks the index in `str` at which the preview for a match at `offset` should begin.
    *
    * Texts shorter than `formatter.maxSize` are shown from the start. Otherwise the window
    * opens `lookBackWindowSize` characters before the match and is moved forward to the next
    * space, newline, tab or '.' so that it does not begin in the middle of a word.
    */
  private def windowStart(str: String, offset: Int): Int =
    if (str.length < formatter.maxSize) 0
    else {
      val start = math.max(0, offset - lookBackWindowSize)
      // The beginning of the text is already a clean boundary; searching forward from it
      // would only throw away the first word.
      if (start == 0) 0
      else {
        val boundary = str.indexWhere(c => " \n\t.".contains(c), start)
        // No boundary between the window start and the match (indexWhere gave -1, or the
        // next boundary lies after the match): fall back to the raw window start rather
        // than producing a negative or shifted match offset.
        if (boundary == -1 || boundary > offset) start else boundary
      }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright 2022 CozyDev
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package pink.cozydev.protosearch.highlight

/** Wraps selected ranges of a string in a start and end tag.
  *
  * @param maxSize size budget for a fragment; `format` itself does not read it
  * @param startTag text inserted before each highlighted range
  * @param endTag text inserted after each highlighted range
  */
case class FragmentFormatter(
    maxSize: Int,
    startTag: String,
    endTag: String,
) {
  private val tagSize = startTag.size + endTag.size

  /** Returns `fragment` with every range described by `offsets` wrapped in the tags.
    *
    * `offsets` is a flat sequence of `(start, length)` pairs: `List(6, 5)` highlights the five
    * characters beginning at index 6. Pairs must be in ascending order and must not overlap.
    * With no offsets the fragment is returned unchanged.
    *
    * @param fragment the text to format
    * @param offsets alternating start index and length values
    * @return the fragment with tags inserted
    * @throws java.lang.AssertionError if `offsets` has an odd number of elements
    * @throws java.lang.IllegalArgumentException if a range is negative, overlaps or precedes
    *   the previous range, or extends past the end of `fragment`
    */
  def format(fragment: String, offsets: Iterable[Int]): String =
    if (offsets.isEmpty) fragment
    else {
      // Materialize once; `offsets` is only an Iterable, so `size` would re-traverse it.
      val offsetArr = offsets.toArray
      assert(offsetArr.size % 2 == 0, "even number of offsets required")

      val sb = new StringBuilder()
      sb.sizeHint(fragment.size + tagSize * (offsetArr.size / 2))

      // Index of the first character of `fragment` not yet copied into `sb`.
      var cursor = 0
      var i = 0
      while (i < offsetArr.size) {
        val start = offsetArr(i)
        val length = offsetArr(i + 1)
        // Validate explicitly instead of waiting for an IndexOutOfBoundsException from the
        // copy: the check is visible, and it does not depend on the runtime raising one.
        // NOTE(review): Scala.js is understood to treat out-of-bounds access as undefined
        // behavior in optimized builds -- confirm against the project's link settings.
        // `length > fragment.length - start` is the overflow-safe form of
        // `start + length > fragment.length`.
        if (start < cursor || length < 0 || length > fragment.length - start)
          throw new IllegalArgumentException("Offset exceeded string length")

        sb.append(fragment.substring(cursor, start))
        sb.append(startTag)
        sb.append(fragment.substring(start, start + length))
        sb.append(endTag)
        cursor = start + length
        i += 2
      }
      // Whatever follows the last highlighted range.
      sb.append(fragment.substring(cursor))
      sb.result()
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Copyright 2022 CozyDev
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package pink.cozydev.protosearch.highlight

/** Exercises `FirstMatchHighlighter` with a 60-character budget and `<b>` tags. */
class FirstMatchHighlighterSuite extends munit.FunSuite {
  val formatter = FragmentFormatter(60, "<b>", "</b>")
  val highlighter = FirstMatchHighlighter(formatter)

  // 100 copies of "hello cat," joined by single spaces, followed by " world".
  private def longDoc: String =
    List.fill(100)("hello cat,").mkString("", " ", " world")

  test("no highlight on no match") {
    assertEquals(highlighter.highlight("hello world", "cat"), "hello world")
  }

  test("highlights simple substring") {
    assertEquals(highlighter.highlight("hello world", "world"), "hello <b>world</b>")
  }

  test("highlights only first match") {
    val doc = "hello world, you, nice world, you"
    assertEquals(
      highlighter.highlight(doc, "world"),
      "hello <b>world</b>, you, nice world, you",
    )
  }

  test("highlights matches near end of long doc") {
    assertEquals(
      highlighter.highlight(longDoc, "world"),
      "cat, hello cat, hello cat, <b>world</b>",
    )
  }

  test("long docs get trimmed with ellipses") {
    val doc = longDoc
    // No match, so the result is the leading `maxSize` characters plus an ellipsis.
    val truncated = doc.take(formatter.maxSize) + "..."
    assertEquals(highlighter.highlight(doc, "fake"), truncated)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright 2022 CozyDev
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package pink.cozydev.protosearch.highlight

import munit.FunSuite

/** Exercises `FragmentFormatter.format` with `<b>` tags; offsets are (start, length) pairs. */
class FragmentFormatterSuite extends FunSuite {
  val formatter = FragmentFormatter(100, "<b>", "</b>")

  // Text shared by most of the cases below.
  private val greeting = "hello world"

  test("formats string with empty offsets") {
    assertEquals(formatter.format(greeting, List.empty), greeting)
  }

  test("formats string with one pair of offsets") {
    assertEquals(formatter.format(greeting, List(6, 5)), "hello <b>world</b>")
  }

  test("formats string with offset in the middle") {
    val sentence = "hello world, how are you?"
    assertEquals(
      formatter.format(sentence, List(6, 5)),
      "hello <b>world</b>, how are you?",
    )
  }

  test("formats string with two pair of offsets") {
    assertEquals(
      formatter.format(greeting, List(0, 5, 6, 5)),
      "<b>hello</b> <b>world</b>",
    )
  }

  test("formats whole string if one offset") {
    assertEquals(formatter.format(greeting, List(0, 11)), "<b>hello world</b>")
  }

  test("throws if offset length exceeds string boundary") {
    // Start 6 plus length 6 runs one character past the end of the text.
    intercept[java.lang.IllegalArgumentException] {
      formatter.format(greeting, List(6, 5 + 1))
    }
  }

  test("throws if odd number of offset integers") {
    intercept[java.lang.AssertionError] {
      formatter.format("", List(1))
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class JsHit(
val id: Int,
val score: Double,
val fields: js.Dictionary[String],
val highlight: String,
) extends js.Object

@JSExportTopLevel("Querier")
Expand All @@ -40,7 +41,7 @@ class Querier(val mIndex: MultiIndex) {
err => { println(err); Nil },
identity,
)
.map(h => new JsHit(h.id, h.score, h.fields.toJSDictionary))
.map(h => new JsHit(h.id, h.score, h.fields.toJSDictionary, h.highlight))
hits.toJSArray
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ function renderDoc(hit) {
const path = hit.fields.path
const link = "../" + hit.fields.path.replace(".txt", ".html")
const title = hit.fields.title
const preview = hit.fields.body.slice(0, 150) + "..."
const preview = hit.highlight
return (
`
<ol>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ function render(hit) {
const htmlPath = hit.fields.path.replace(".txt", ".html")
const link = new URL("../" + htmlPath, baseUrl)
const title = hit.fields.title
const preview = hit.fields.body.slice(0, 150) + "..."
const preview = hit.highlight
return (
`
<ol>
Expand Down

0 comments on commit 5b3d185

Please sign in to comment.