Add search-indexer and fix link checker plugins (opensearch-project#2)

AMoo-Miki · Oct 20, 2021 · a9519e7 · a9519e7
1 parent efd492c
commit a9519e7
Show file tree

Hide file tree

Showing 2 changed files with 112 additions and 5 deletions.
diff --git a/_plugins/link-checker.rb b/_plugins/link-checker.rb
@@ -106,13 +106,18 @@ def self.verify(site)
     @urls.each do |url, pages|
       @failures << "#{url}, linked to in ./#{pages.to_a.join(", ./")}" unless self.check(url)
     end
-
-    msg = "Found #{@failures.size} dead link#{@failures.size > 1 ? 's' : ''}:\n#{@failures.join("\n")}" unless @failures.empty?
 
-    if @should_build_fatally
-      raise msg
+    unless @failures.empty?
+      msg = "Found #{@failures.size} dead link#{@failures.size > 1 ? 's' : ''}:\n#{@failures.join("\n")}"
+
+      if @should_build_fatally
+        raise msg
+      else
+        puts "\nLinkChecker: [Warning] #{msg}\n"
+      end
+
     else
-      puts "\nLinkChecker: [Warning] #{msg}\n"
+      puts "\nLinkChecker: [Done]\n"
     end
   end
 

diff --git a/_plugins/search-indexer.rb b/_plugins/search-indexer.rb
@@ -0,0 +1,102 @@
+# frozen_string_literal: true
+
+require "jekyll/hooks"
+require "jekyll/document"
+require "json"
+
+##
+# This singleton facilitates production of an indexable JSON representation of the content to populate a data source
+# to provide search functionality.
+
+module Jekyll::ContentIndexer
+
+  ##
+  # The collection that will get stored as the output (emptied by init at the start of every build)
+
+  @data = []
+
+  ##
+  # Pattern to identify documents that should be excluded based on their URL
+
+  @excluded_paths = /\.(css|js|json|map|xml|txt|yml)$/i.freeze
+
+  ##
+  # Pattern to identify block HTML tags (not comprehensive)
+
+  @html_block_tags = /\s*<[?\/]?(article|blockquote|d[dlt]|div|fieldset|form|h|li|main|nav|[ou]l|p|section|table).*?>\s*/im.freeze
+
+  ##
+  # Pattern to identify certain HTML tags whose content should be excluded from indexing
+
+  @html_excluded_tags = /\s*<(head|style|script|h1).*?>.*?<\/\1>/im.freeze
+
+  ##
+  # Initializes the singleton by recording the site and emptying the collection, so that entries gathered
+  # by an earlier build in the same process (e.g. a regeneration) are not written to the index twice
+
+  def self.init(site)
+    @site = site
+    @data = []
+  end
+
+  ##
+  # Processes a Document or Page and adds it to the collection; nothing is added when the URL matches the
+  # exclusion pattern or when no text is left once the markup has been stripped
+
+  def self.add(page)
+    return if @excluded_paths.match?(page.url)
+
+    content = page.content.to_s
+                  .gsub(@html_excluded_tags, ' ')             # Strip certain HTML blocks
+                  .gsub(@html_block_tags, "\n")               # Strip some block HTML tags, replacing with newline
+                  .gsub(/\s*<[?\/!]?[a-z]+.*?>\s*/im, ' ')    # Strip all remaining HTML tags
+                  .gsub(/\s*[\r\n]+\s*/, "\n")                # Clean line-breaks
+                  .gsub(/\s{2,}/, ' ')                        # Trim long spaces
+                  .gsub(/\s+([.:;,)!\]?])/, '\1')             # Remove spaces before some punctuation marks
+                  .strip                                      # Trim leading and trailing whitespace
+
+    return if content.empty?
+
+    # A missing "baseurl" gives an empty prefix and an unconfigured collection gives nil, instead of raising
+    url = @site.config["baseurl"].to_s + page.url
+    label = page.collection&.label if page.instance_of?(Jekyll::Document)
+    collection = @site.config.dig("just_the_docs", "collections", label, "name") if label
+
+    @data << {
+      url: url, title: page.data["title"], content: content, collection: collection
+    }
+  end
+
+  ##
+  # Saves the collection as pretty-printed JSON in "search-index.json" inside the configured destination
+
+  def self.save
+    File.open(File.join(@site.config["destination"], "search-index.json"), 'w') do |f|
+      f.puts JSON.pretty_generate(@data)
+    end
+  end
+end
+
+# Hand the site to the ContentIndexer when the site's :pre_render hook fires
+
+Jekyll::Hooks.register(:site, :pre_render) do |current_site|
+  Jekyll::ContentIndexer.init(current_site)
+end
+
+# Index each Page when its :post_convert hook fires
+
+Jekyll::Hooks.register(:pages, :post_convert) do |converted_page|
+  Jekyll::ContentIndexer.add(converted_page)
+end
+
+# Index each Document when its :post_convert hook fires
+
+Jekyll::Hooks.register(:documents, :post_convert) do |converted_document|
+  Jekyll::ContentIndexer.add(converted_document)
+end
+
+# Write the gathered collection out when the site's :post_write hook fires
+
+Jekyll::Hooks.register(:site, :post_write) do |_site|
+  Jekyll::ContentIndexer.save
+end