Skip to content

Commit

Permalink
Add support for Fuse.js search format (#2507)
Browse files Browse the repository at this point in the history
* initial "just barely works" Fuse.js support

* implement FuseJavascript; refactor index_for_lang

* support search config

* move fuse index building to its own file

* update doc of Search.index_format

* update config docs

* update search documentation

* use &str where possible

* use libs::serde_json

remember to commit Cargo.lock

* move extension logic to IndexFormat

* move the entire filename logic inside IndexFormat

* move elasticlunr to its own module

* only create elasticlunr.min.js if we're actually using elasticlunr

* move ELASTICLUNR_JS to elasticlunr.js

* hide the details of search's submodules

* optionally include path

* explain include_path better

* remove references to stork

* replace if with match

* support include_description

* specify "permalink"

* move body cleaning and truncation to a function

* update truncate_content_length docs to specify *code points*
  • Loading branch information
SIGSTACKFAULT authored May 31, 2024
1 parent d75b00c commit 8c56a0e
Show file tree
Hide file tree
Showing 10 changed files with 496 additions and 348 deletions.
161 changes: 83 additions & 78 deletions Cargo.lock

Large diffs are not rendered by default.

21 changes: 19 additions & 2 deletions components/config/src/config/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,23 @@ pub enum IndexFormat {
ElasticlunrJson,
#[default]
ElasticlunrJavascript,
FuseJson,
FuseJavascript,
}

impl IndexFormat {
/// file extension which ought to be used for this index format.
fn extension(&self) -> &'static str {
match *self {
IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => "js",
IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json",
}
}

/// the filename which ought to be used for this format and language `lang`
pub fn filename(&self, lang: &str) -> String {
format!("search_index.{}.{}", lang, self.extension())
}
}

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
Expand All @@ -17,7 +34,7 @@ pub struct Search {
/// Includes the whole content in the search index. Ok for small sites but becomes
/// too big on large sites. `true` by default.
pub include_content: bool,
/// Optionally truncate the content down to `n` chars. This might cut content in a word
/// Optionally truncate the content down to `n` code points. This might cut content in a word
pub truncate_content_length: Option<usize>,
/// Includes the description in the search index. When the site becomes too large, you can switch
/// to that instead. `false` by default
Expand All @@ -26,7 +43,7 @@ pub struct Search {
pub include_date: bool,
/// Include the path of the page in the search index. `false` by default.
pub include_path: bool,
/// Foramt of the search index to be produced. Javascript by default
/// Format of the search index to be produced. 'elasticlunr_javascript' by default.
pub index_format: IndexFormat,
}

Expand Down
1 change: 1 addition & 0 deletions components/search/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ errors = { path = "../errors" }
content = { path = "../content" }
config = { path = "../config" }
libs = { path = "../libs" }
serde = { version = "1.0", features = ["derive"] }
236 changes: 236 additions & 0 deletions components/search/src/elasticlunr.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
use config::{Config, Search};
use content::{Library, Section};
use errors::{bail, Result};
use libs::elasticlunr::{lang, Index, IndexBuilder};
use libs::time::format_description::well_known::Rfc3339;
use libs::time::OffsetDateTime;

use crate::clean_and_truncate_body;

pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");

/// Registers on `index` one field per enabled `include_*` setting of
/// `search_config` and returns the resulting builder.
///
/// Fields are added in a fixed order: title, description, date, path, body.
/// The path field is the only one registered with a custom tokenizer
/// (`path_tokenizer`).
fn build_fields(search_config: &Search, index: IndexBuilder) -> IndexBuilder {
    let builder = index;

    let builder =
        if search_config.include_title { builder.add_field("title") } else { builder };

    let builder = if search_config.include_description {
        builder.add_field("description")
    } else {
        builder
    };

    let builder = if search_config.include_date { builder.add_field("date") } else { builder };

    let builder = if search_config.include_path {
        builder.add_field_with_tokenizer("path", Box::new(path_tokenizer))
    } else {
        builder
    };

    if search_config.include_content {
        builder.add_field("body")
    } else {
        builder
    }
}

/// Splits a path into lowercase tokens.
///
/// Whitespace, `-` and `/` all act as separators; empty pieces (produced by
/// leading, trailing or consecutive separators) are dropped.
fn path_tokenizer(text: &str) -> Vec<String> {
    let is_separator = |c: char| c == '/' || c == '-' || c.is_whitespace();

    let mut tokens = Vec::new();
    for piece in text.split(is_separator) {
        if piece.is_empty() {
            continue;
        }
        tokens.push(piece.trim().to_lowercase());
    }
    tokens
}

/// Builds the row of values describing one document.
///
/// Values are pushed in the same order in which `build_fields` registers the
/// fields (title, description, date, path, body), and every enabled field
/// contributes exactly one entry so the row always has as many values as there
/// are fields. A missing title, description or date is represented by an
/// empty string.
///
/// * `search_config` - decides which fields are present and how far the body
///   is truncated.
/// * `title`, `description`, `datetime` - optional metadata of the document.
/// * `path` - path of the document, used when `include_path` is set.
/// * `content` - rendered content, cleaned and truncated by
///   `clean_and_truncate_body` when `include_content` is set.
fn fill_index(
    search_config: &Search,
    title: &Option<String>,
    description: &Option<String>,
    datetime: &Option<OffsetDateTime>,
    path: &str,
    content: &str,
) -> Vec<String> {
    let mut row = vec![];

    if search_config.include_title {
        row.push(title.clone().unwrap_or_default());
    }

    if search_config.include_description {
        row.push(description.clone().unwrap_or_default());
    }

    if search_config.include_date {
        // Always emit a value, even when there is no date (sections never have
        // one) or it cannot be formatted. Skipping the entry would shift the
        // path and body values into the wrong columns.
        // NOTE(review): assumes `Index::add_doc` pairs values with fields by
        // position - confirm against the elasticlunr-rs documentation.
        let date = datetime.as_ref().and_then(|d| d.format(&Rfc3339).ok()).unwrap_or_default();
        row.push(date);
    }

    if search_config.include_path {
        row.push(path.to_string());
    }

    if search_config.include_content {
        row.push(clean_and_truncate_body(search_config.truncate_content_length, content));
    }

    row
}

/// Builds the Elasticlunr index for every section (and its pages) written in
/// `lang` and returns it serialized as JSON.
///
/// Fails if `lang` is not a language code known to Elasticlunr.
/// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> {
    let language = if let Some(found) = lang::from_code(lang) {
        found
    } else {
        bail!("Tried to build search index for language {} which is not supported", lang);
    };

    // Search settings are configured per language.
    let search_config = &config.languages[lang].search;
    let mut index = build_fields(search_config, IndexBuilder::with_language(language)).build();

    for (_, section) in &library.sections {
        if section.lang != lang {
            continue;
        }
        add_section_to_index(&mut index, section, library, search_config);
    }

    Ok(index.to_json())
}

/// Adds `section` and the pages it lists to `index`.
///
/// A section with `in_search_index` turned off contributes nothing, not even
/// its pages. A redirecting section is not indexed itself, but its pages still
/// are. Pages can opt out individually through their own `in_search_index`.
fn add_section_to_index(
    index: &mut Index,
    section: &Section,
    library: &Library,
    search_config: &Search,
) {
    if section.meta.in_search_index {
        // Don't index redirecting sections
        if section.meta.redirect_to.is_none() {
            // Sections carry no date, hence the `&None`.
            let row = fill_index(
                search_config,
                &section.meta.title,
                &section.meta.description,
                &None,
                &section.path,
                &section.content,
            );
            index.add_doc(&section.permalink, &row);
        }

        for key in section.pages.iter() {
            let page = &library.pages[key];
            if page.meta.in_search_index {
                let row = fill_index(
                    search_config,
                    &page.meta.title,
                    &page.meta.description,
                    &page.meta.datetime,
                    &page.path,
                    &page.content,
                );
                index.add_doc(&page.permalink, &row);
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use config::Config;
    use libs::elasticlunr::IndexBuilder;

    /// Title, description, path and content shared by the `fill_index` tests.
    fn sample_doc() -> (Option<String>, Option<String>, String, String) {
        (
            Some(String::from("A title")),
            Some(String::from("A description")),
            String::from("/a/page/"),
            String::from("Some content"),
        )
    }

    #[test]
    fn can_build_fields() {
        // Builds a fresh index for the given settings so its fields can be inspected.
        let index_for = |search: &Search| build_fields(search, IndexBuilder::new()).build();

        let mut config = Config::default();
        assert_eq!(index_for(&config.search).get_fields(), vec!["title", "body"]);

        config.search.include_content = false;
        config.search.include_description = true;
        assert_eq!(index_for(&config.search).get_fields(), vec!["title", "description"]);

        config.search.include_content = true;
        assert_eq!(index_for(&config.search).get_fields(), vec!["title", "description", "body"]);

        config.search.include_title = false;
        assert_eq!(index_for(&config.search).get_fields(), vec!["description", "body"]);
    }

    #[test]
    fn can_fill_index_default() {
        let config = Config::default();
        let (title, description, path, content) = sample_doc();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);
        assert_eq!(row, vec![title.unwrap(), content]);
    }

    #[test]
    fn can_fill_index_description() {
        let mut config = Config::default();
        config.search.include_description = true;
        let (title, description, path, content) = sample_doc();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);
        assert_eq!(row, vec![title.unwrap(), description.unwrap(), content]);
    }

    #[test]
    fn can_fill_index_truncated_content() {
        let mut config = Config::default();
        config.search.truncate_content_length = Some(5);
        let (title, description, path, content) = sample_doc();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);
        assert_eq!(row, vec![title.unwrap(), content[..5].to_string()]);
    }

    #[test]
    fn can_fill_index_date() {
        let mut config = Config::default();
        config.search.include_date = true;
        let (title, description, path, content) = sample_doc();
        let datetime = Some(OffsetDateTime::parse("2023-01-31T00:00:00Z", &Rfc3339).unwrap());

        let row = fill_index(&config.search, &title, &description, &datetime, &path, &content);
        assert_eq!(row, vec![title.unwrap(), "2023-01-31T00:00:00Z".to_string(), content]);
    }
}
76 changes: 76 additions & 0 deletions components/search/src/fuse.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
use config::Search;
use content::Library;
use errors::Result;
use libs::serde_json;

use crate::clean_and_truncate_body;

/// Builds the search index for `lang` in Fuse.js format.
///
/// The result is a JSON array with one object per indexed section or page.
/// Each object always carries `url` (the permalink); `title`, `description`,
/// `body` and `path` are filled in when the matching `include_*` option of
/// `config` is enabled and are `null` otherwise.
///
/// Sections written in another language, or with `in_search_index` turned
/// off, are skipped together with their pages. A redirecting section is not
/// listed itself, but its pages still are, matching the Elasticlunr index
/// builder. Pages can opt out through their own `in_search_index`.
///
/// Returns an error if the items cannot be serialized to JSON.
pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<String> {
    #[derive(serde::Serialize)]
    struct Item<'a> {
        url: &'a str,
        title: Option<&'a str>,
        description: Option<&'a str>,
        body: Option<String>, // AMMONIA.clean has to allocate anyway
        path: Option<&'a str>,
    }

    /// Builds the entry for one section or page, honouring the `include_*` options.
    fn make_item<'a>(
        config: &Search,
        url: &'a str,
        title: &'a Option<String>,
        description: &'a Option<String>,
        path: &'a str,
        content: &str,
    ) -> Item<'a> {
        Item {
            url,
            title: config.include_title.then(|| title.as_deref().unwrap_or_default()),
            description: config
                .include_description
                .then(|| description.as_deref().unwrap_or_default()),
            // Lazy on purpose: the body is only cleaned when it is actually wanted.
            body: config
                .include_content
                .then(|| clean_and_truncate_body(config.truncate_content_length, content)),
            path: config.include_path.then_some(path),
        }
    }

    let mut items: Vec<Item> = Vec::new();
    for (_, section) in &library.sections {
        if section.lang != lang || !section.meta.in_search_index {
            continue;
        }

        // Don't index redirecting sections themselves; their pages are still
        // searchable, so the check must not wrap the page loop below.
        if section.meta.redirect_to.is_none() {
            items.push(make_item(
                config,
                &section.permalink,
                &section.meta.title,
                &section.meta.description,
                &section.path,
                &section.content,
            ));
        }

        for key in &section.pages {
            let page = &library.pages[key];
            if page.meta.in_search_index {
                items.push(make_item(
                    config,
                    &page.permalink,
                    &page.meta.title,
                    &page.meta.description,
                    &page.path,
                    &page.content,
                ));
            }
        }
    }

    Ok(serde_json::to_string(&items)?)
}
Loading

0 comments on commit 8c56a0e

Please sign in to comment.