Skip to content

Commit

Permalink
Add search-indexer and fix link checker plugins (opensearch-project#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
AMoo-Miki authored and Miki committed Oct 20, 2021
1 parent efd492c commit a9519e7
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 5 deletions.
15 changes: 10 additions & 5 deletions _plugins/link-checker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,18 @@ def self.verify(site)
@urls.each do |url, pages|
@failures << "#{url}, linked to in ./#{pages.to_a.join(", ./")}" unless self.check(url)
end

msg = "Found #{@failures.size} dead link#{@failures.size > 1 ? 's' : ''}:\n#{@failures.join("\n")}" unless @failures.empty?

if @should_build_fatally
raise msg
unless @failures.empty?
msg = "Found #{@failures.size} dead link#{@failures.size > 1 ? 's' : ''}:\n#{@failures.join("\n")}"

if @should_build_fatally
raise msg
else
puts "\nLinkChecker: [Warning] #{msg}\n"
end

else
puts "\nLinkChecker: [Warning] #{msg}\n"
puts "\nLinkChecker: [Done]\n"
end
end

Expand Down
102 changes: 102 additions & 0 deletions _plugins/search-indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# frozen_string_literal: true

require "jekyll/hooks"
require "jekyll/document"
require "json"

##
# This singleton facilitates production of an indexable JSON representation of the content to populate a data source
# to provide search functionality.
#
# All state is kept in module-level instance variables. The expected call order is:
# +init+ once per build, +add+ for every rendered Page/Document, then +save+.

module Jekyll
  module ContentIndexer
    ##
    # The collection that will get stored as the output

    @data = []

    ##
    # Pattern to identify documents that should be excluded based on their URL

    @excluded_paths = /\.(css|js|json|map|xml|txt|yml)$/i.freeze

    ##
    # Pattern to identify block HTML tags (not comprehensive)

    @html_block_tags = /\s*<[?\/]?(article|blockquote|d[dlt]|div|fieldset|form|h|li|main|nav|[ou]l|p|section|table).*?>\s*/im.freeze

    ##
    # Pattern to identify certain HTML tags whose content should be excluded from indexing

    @html_excluded_tags = /\s*<(head|style|script|h1).*?>.*?<\/\1>/im.freeze

    ##
    # Initializes the singleton by recording the site.
    #
    # Also starts a fresh collection, so that a rebuild within the same process does not append to the entries
    # gathered by the previous build and produce duplicates in the saved index.

    def self.init(site)
      @site = site
      @data = []
    end

    ##
    # Processes a Document or Page and adds it to the collection.
    #
    # Nothing is added when the URL matches the excluded extensions or when no text is left after the markup has
    # been stripped. The recorded entry holds the URL (prefixed with the configured "baseurl", if any), the title
    # from the front matter, the plain-text content and, for Documents only, the display name configured for the
    # collection under just_the_docs.collections (nil when there is no such configuration).

    def self.add(page)
      return if @excluded_paths.match?(page.url)

      # to_s guards against an item that has no content at all
      content = page.content.to_s
        .gsub(@html_excluded_tags, ' ')           # Strip certain HTML blocks
        .gsub(@html_block_tags, "\n")             # Strip some block HTML tags, replacing with newline
        .gsub(/\s*<[?\/!]?[a-z]+.*?>\s*/im, ' ')  # Strip all remaining HTML tags
        .gsub(/\s*[\r\n]+\s*/, "\n")              # Clean line-breaks
        .gsub(/\s{2,}/, ' ')                      # Trim long spaces
        .gsub(/\s+([.:;,)!\]?])/, '\1')           # Remove spaces before some punctuations
        .strip                                    # Trim leading and trailing whitespaces

      return if content.empty?

      # Interpolation turns a missing (nil) baseurl into an empty prefix instead of raising
      url = "#{@site.config['baseurl']}#{page.url}"

      # Only Documents are asked for their collection; dig yields nil rather than raising when the theme
      # configuration, or the entry for this particular collection, is absent
      collection =
        if page.instance_of?(Jekyll::Document)
          @site.config.dig("just_the_docs", "collections", page.collection&.label, "name")
        end

      @data.push(
        url: url,
        title: page.data["title"],
        content: content,
        collection: collection
      )
    end

    ##
    # Saves the collection as a pretty-printed JSON file named search-index.json in the site's destination directory

    def self.save
      File.open(File.join(@site.config["destination"], "search-index.json"), 'w') do |f|
        f.puts JSON.pretty_generate(@data)
      end
    end
  end
end

# Hand the site over to the ContentIndexer before rendering of any Document or Page begins

Jekyll::Hooks.register(:site, :pre_render) { |site| Jekyll::ContentIndexer.init(site) }

# Feed every Page and every Document to the ContentIndexer as soon as its content has been converted.
# Pages are registered first, then documents.

%i[pages documents].each do |owner|
  Jekyll::Hooks.register owner, :post_convert do |item|
    Jekyll::ContentIndexer.add(item)
  end
end

# Once Jekyll has finished writing the site, write out the collected search index

Jekyll::Hooks.register(:site, :post_write) { |_site| Jekyll::ContentIndexer.save }

0 comments on commit a9519e7

Please sign in to comment.