From 64cd2fbdb9cd36849e26b918156fea9df699f2e9 Mon Sep 17 00:00:00 2001
From: Maruan Al-Shedivat
Date: Wed, 10 Jan 2024 00:20:08 -0500
Subject: [PATCH] Enable specifying explicit list of external posts to display

---
Review notes (not part of the commit message):
  * restored the line structure of this patch, which had been collapsed
    onto two lines;
  * parse_published_date now also accepts Time values instead of raising;
  * added header comments to the new helper methods.

 _plugins/external-posts.rb | 111 ++++++++++++++++++++++++++++++++------
 1 file changed, 93 insertions(+), 18 deletions(-)

diff --git a/_plugins/external-posts.rb b/_plugins/external-posts.rb
index fccaf09d7c67..91ef8d6d7f53 100644
--- a/_plugins/external-posts.rb
+++ b/_plugins/external-posts.rb
@@ -1,6 +1,8 @@
 require 'feedjira'
 require 'httparty'
 require 'jekyll'
+require 'nokogiri'
+require 'time'
 
 module ExternalPosts
   class ExternalPostsGenerator < Jekyll::Generator
@@ -10,27 +12,100 @@ class ExternalPostsGenerator < Jekyll::Generator
     def generate(site)
       if site.config['external_sources'] != nil
         site.config['external_sources'].each do |src|
-          p "Fetching external posts from #{src['name']}:"
-          xml = HTTParty.get(src['rss_url']).body
-          return if xml.nil?
-          feed = Feedjira.parse(xml)
-          feed.entries.each do |e|
-            p "...fetching #{e.url}"
-            slug = e.title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
-            path = site.in_source_dir("_posts/#{slug}.md")
-            doc = Jekyll::Document.new(
-              path, { :site => site, :collection => site.collections['posts'] }
-            )
-            doc.data['external_source'] = src['name'];
-            doc.data['feed_content'] = e.content;
-            doc.data['title'] = "#{e.title}";
-            doc.data['description'] = e.summary;
-            doc.data['date'] = e.published;
-            doc.data['redirect'] = e.url;
-            site.collections['posts'].docs << doc
+          puts "Fetching external posts from #{src['name']}:"
+          if src['rss_url']
+            fetch_from_rss(site, src)
+          elsif src['posts']
+            fetch_from_urls(site, src)
           end
         end
       end
     end
+
+    # Downloads the feed at src['rss_url'] and adds a post for each of its entries.
+    def fetch_from_rss(site, src)
+      xml = HTTParty.get(src['rss_url']).body
+      return if xml.nil?
+      feed = Feedjira.parse(xml)
+      process_entries(site, src, feed.entries)
+    end
+
+    # Adds one post per feed entry, attributed to src['name'].
+    def process_entries(site, src, entries)
+      entries.each do |e|
+        puts "...fetching #{e.url}"
+        create_document(site, src['name'], e.url, {
+          title: e.title,
+          content: e.content,
+          summary: e.summary,
+          published: e.published
+        })
+      end
+    end
+
+    # Builds a Jekyll::Document from the content hash (:title, :content,
+    # :summary, :published), records url as its 'redirect' target and
+    # appends it to the site's posts collection.
+    def create_document(site, source_name, url, content)
+      slug = content[:title].downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
+      path = site.in_source_dir("_posts/#{slug}.md")
+      doc = Jekyll::Document.new(
+        path, { :site => site, :collection => site.collections['posts'] }
+      )
+      doc.data['external_source'] = source_name
+      doc.data['title'] = content[:title]
+      doc.data['feed_content'] = content[:content]
+      doc.data['description'] = content[:summary]
+      doc.data['date'] = content[:published]
+      doc.data['redirect'] = url
+      site.collections['posts'].docs << doc
+    end
+
+    # Fetches every page listed under src['posts'] and adds it as a post dated
+    # with that entry's 'published_date'.
+    def fetch_from_urls(site, src)
+      src['posts'].each do |post|
+        puts "...fetching #{post['url']}"
+        content = fetch_content_from_url(post['url'])
+        content[:published] = parse_published_date(post['published_date'])
+        create_document(site, src['name'], post['url'], content)
+      end
+    end
+
+    # Converts a configured publication date to a UTC Time.
+    # Accepts a String, Time or Date; raises RuntimeError for anything else.
+    def parse_published_date(published_date)
+      case published_date
+      when String
+        Time.parse(published_date).utc
+      when Time
+        # The config loader may hand over a Time already (e.g. an unquoted YAML
+        # timestamp); getutc returns a new object and leaves the config untouched.
+        published_date.getutc
+      when Date
+        published_date.to_time.utc
+      else
+        raise "Invalid date format for #{published_date}"
+      end
+    end
+
+    # Downloads url and extracts the page title, meta description and body HTML,
+    # each defaulting to '' when the element is missing.
+    def fetch_content_from_url(url)
+      html = HTTParty.get(url).body
+      parsed_html = Nokogiri::HTML(html)
+
+      title = parsed_html.at('head title')&.text || ''
+      description = parsed_html.at('head meta[name="description"]')&.attr('content') || ''
+      body_content = parsed_html.at('body')&.inner_html || ''
+
+      {
+        title: title,
+        content: body_content,
+        summary: description
+        # Note: the published date is added by fetch_from_urls.
+      }
+    end
+
   end
 end