forked from opensearch-project/documentation-website
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add search-indexer and fix link checker plugins (opensearch-project#2)
- Loading branch information
Showing
2 changed files
with
112 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
# frozen_string_literal: true | ||
|
||
require "jekyll/hooks" | ||
require "jekyll/document" | ||
require "json" | ||
|
||
module Jekyll
  ##
  # This singleton facilitates production of an indexable JSON representation of the content to populate a data source
  # to provide search functionality.
  #
  # Usage: call +init+ with the site, +add+ for every Page/Document whose content should be indexed, and +save+ to
  # write the collected entries to "search-index.json" in the site's destination directory.

  module ContentIndexer
    ##
    # Pattern to identify documents that should be excluded based on their URL

    EXCLUDED_PATHS = /\.(css|js|json|map|xml|txt|yml)$/i.freeze

    ##
    # Pattern to identify block HTML tags (not comprehensive)

    HTML_BLOCK_TAGS =
      /\s*<[?\/]?(article|blockquote|d[dlt]|div|fieldset|form|h|li|main|nav|[ou]l|p|section|table).*?>\s*/im.freeze

    ##
    # Pattern to identify certain HTML tags whose content should be excluded from indexing

    HTML_EXCLUDED_TAGS = /\s*<(head|style|script|h1).*?>.*?<\/\1>/im.freeze

    ##
    # The entries that will get stored as the output, keyed by URL. Keying by URL means that re-adding a page (for
    # example when the site is rebuilt within the same process) replaces its previous entry instead of appending a
    # duplicate. Insertion order is preserved, so the saved array keeps the order in which URLs were first added.

    @data = {}

    ##
    # Initializes the singleton by recording the site
    #
    # site:: the object whose +config+ hash supplies "baseurl", "destination" and "just_the_docs"

    def self.init(site)
      @site = site
    end

    ##
    # Processes a Document or Page and adds it to the collection
    #
    # Nothing is added when the URL matches EXCLUDED_PATHS or when no text remains after the HTML is stripped.
    #
    # page:: an object responding to +url+, +content+ and +data+ (and +collection+ when it is a Jekyll::Document)

    def self.add(page)
      return if EXCLUDED_PATHS.match?(page.url)

      content = extract_text(page.content)
      return if content.empty?

      # "baseurl" may be unset (nil); treat that the same as an empty prefix instead of raising
      base_url = @site.config["baseurl"].to_s
      url = "#{base_url}#{page.url}"

      @data[url] = {
        url: url,
        title: page.data["title"],
        content: content,
        collection: collection_name(page)
      }
    end

    ##
    # Saves the collection as a JSON file named "search-index.json" inside the configured destination

    def self.save
      File.open(File.join(@site.config["destination"], "search-index.json"), 'w') do |f|
        f.puts JSON.pretty_generate(@data.values)
      end
    end

    ##
    # Reduces rendered HTML to plain text suitable for indexing; returns an empty String for nil input

    def self.extract_text(html)
      html.to_s
          .gsub(HTML_EXCLUDED_TAGS, ' ')            # Strip certain HTML blocks
          .gsub(HTML_BLOCK_TAGS, "\n")              # Strip some block HTML tags, replacing with newline
          .gsub(/\s*<[?\/!]?[a-z]+.*?>\s*/im, ' ')  # Strip all remaining HTML tags
          .gsub(/\s*[\r\n]+\s*/, "\n")              # Clean line-breaks
          .gsub(/\s{2,}/, ' ')                      # Trim long spaces
          .gsub(/\s+([.:;,)!\]?])/, '\1')           # Remove spaces before some punctuations
          .strip                                    # Trim leading and trailing whitespaces
    end

    ##
    # Looks up the configured display name of the collection a Document belongs to; returns nil for anything that is
    # not a Jekyll::Document or when any level of the "just_the_docs" -> "collections" -> label -> "name" path is
    # missing from the site configuration

    def self.collection_name(page)
      return unless page.instance_of?(Jekyll::Document)

      @site.config.dig("just_the_docs", "collections", page.collection&.label, "name")
    end

    private_class_method :extract_text, :collection_name
  end
end
|
||
# Hand the site over to the ContentIndexer before any Document or Page gets rendered

Jekyll::Hooks.register(:site, :pre_render) { |site| Jekyll::ContentIndexer.init(site) }

# Index each Page right after its content has been converted

Jekyll::Hooks.register(:pages, :post_convert) { |page| Jekyll::ContentIndexer.add(page) }

# Index each Document right after its content has been converted

Jekyll::Hooks.register(:documents, :post_convert) { |document| Jekyll::ContentIndexer.add(document) }

# Write the collected index out once Jekyll has finished writing the site

Jekyll::Hooks.register(:site, :post_write) { |_site| Jekyll::ContentIndexer.save }