From 89b7df6a92739cb3602e8ced21a1930e57c06dc6 Mon Sep 17 00:00:00 2001 From: Kenshi Muto Date: Sat, 18 Aug 2018 13:06:15 +0900 Subject: [PATCH 1/3] add review-epub2html tool --- bin/review-epub2html | 127 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100755 bin/review-epub2html diff --git a/bin/review-epub2html b/bin/review-epub2html new file mode 100755 index 000000000..9fb3fe5e7 --- /dev/null +++ b/bin/review-epub2html @@ -0,0 +1,127 @@ +#!/usr/bin/env ruby +# +# Copyright (c) 2018 Kenshi Muto +# +# This program is free software. +# You can distribute or modify this program under the terms of +# the GNU LGPL, Lesser General Public License version 2.1. +# For details of the GNU LGPL, see the file "COPYING". +# + +require 'zip' +require 'rexml/document' +require 'cgi' + +module ReVIEW + class Epub2Html + def initialize + @opfxml = nil + @htmls = {} + @head = nil + @tail = nil + end + + def parse_epub(epubname) + Zip::File.open(epubname) do |zio| + zio.each do |entry| + if entry.name =~ /.+\.opf\Z/ + opf = entry.get_input_stream.read + @opfxml = REXML::Document.new(opf) + elsif entry.name =~ /.+\.x?html\Z/ + @htmls[entry.name.sub('OEBPS/', '')] = entry.get_input_stream.read.force_encoding('utf-8') + end + end + end + nil + end + + def take_headtail(html) + @head = html.sub(/().*/m, '\1') + @tail = html.sub(%r{.*()}m, '\1') + end + + def sanitize(s) + s = s.sub(/\.x?html\Z/, ''). + sub(%r{\A\./}, '') + 's_' + CGI.escape(s). + gsub(/[.,+%]/, '_') + end + + def modify_html(fname, html) + doc = REXML::Document.new(html) + doc.context[:attribute_quote] = :quote + + ids = {} + + doc.each_element('//*[@id]') do |e| + sid = "#{sanitize(fname)}_#{sanitize(e.attributes['id'])}" + while ids[sid] + sid += 'E' + end + ids[sid] = true + e.attributes['id'] = sid + end + + doc.each_element('//a[@href]') do |e| + href = e.attributes['href'] + if href.start_with?('http:', 'https:', 'ftp:', 'ftps:', 'mailto:') + next + end + + file, anc = href.split('#', 2) + if anc + if file.empty? + anc = "#{sanitize(fname)}_#{sanitize(anc)}" + else + anc = "#{sanitize(file)}_#{sanitize(anc)}" + end + else + anc = sanitize(file) + end + + e.attributes['href'] = "##{anc}" + end + + doc.to_s. + sub(/.*()/m, %Q(
)). + sub(%r{().*}m, '
') + end + + def join_html(reffile) + body = [] + make_list.each do |fname| + if @head.nil? && (reffile.nil? || reffile == fname) + take_headtail(@htmls[fname]) + end + + body << modify_html(fname, @htmls[fname]) + end + "#{@head}\n#{body.join("\n")}\n#{@tail}" + end + + def make_list + items = {} + @opfxml.each_element("//package/manifest/item[@media-type='application/xhtml+xml']") do |e| + items[e.attributes['id']] = e.attributes['href'] + end + + files = [] + @opfxml.each_element('//package/spine/itemref') do |e| + files.push(items[e.attributes['idref']]) + end + + files + end + end +end + +if ARGV[0].nil? || !File.exist?(ARGV[0]) + STDERR.puts < HTML +EOT + exit 1 +end + +eph = ReVIEW::Epub2Html.new +eph.parse_epub(ARGV[0]) +puts eph.join_html(ARGV[1]) From c01860cb2a07013f721b5058666c7c492a8ca43e Mon Sep 17 00:00:00 2001 From: Kenshi Muto Date: Sat, 18 Aug 2018 13:27:13 +0900 Subject: [PATCH 2/3] remove unnecessary subdir finder --- bin/review-epub2html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/review-epub2html b/bin/review-epub2html index 9fb3fe5e7..bfc2dd98f 100755 --- a/bin/review-epub2html +++ b/bin/review-epub2html @@ -101,12 +101,12 @@ module ReVIEW def make_list items = {} - @opfxml.each_element("//package/manifest/item[@media-type='application/xhtml+xml']") do |e| + @opfxml.each_element("/package/manifest/item[@media-type='application/xhtml+xml']") do |e| items[e.attributes['id']] = e.attributes['href'] end files = [] - @opfxml.each_element('//package/spine/itemref') do |e| + @opfxml.each_element('/package/spine/itemref') do |e| files.push(items[e.attributes['idref']]) end From 39a242bc1a5f86ed3a49a07effd65e6cf8a64509 Mon Sep 17 00:00:00 2001 From: Kenshi Muto Date: Mon, 20 Aug 2018 19:13:14 +0900 Subject: [PATCH 3/3] split logic to lib --- bin/review-epub2html | 120 ++---------------------------------- lib/review/epub2html.rb | 133 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 114 deletions(-) create mode 100644 lib/review/epub2html.rb diff --git a/bin/review-epub2html b/bin/review-epub2html index bfc2dd98f..d5d30bfa2 100755 --- a/bin/review-epub2html +++ b/bin/review-epub2html @@ -6,122 +6,14 @@ # You can distribute or modify this program under the terms of # the GNU LGPL, Lesser General Public License version 2.1. # For details of the GNU LGPL, see the file "COPYING". -# - -require 'zip' -require 'rexml/document' -require 'cgi' - -module ReVIEW - class Epub2Html - def initialize - @opfxml = nil - @htmls = {} - @head = nil - @tail = nil - end - - def parse_epub(epubname) - Zip::File.open(epubname) do |zio| - zio.each do |entry| - if entry.name =~ /.+\.opf\Z/ - opf = entry.get_input_stream.read - @opfxml = REXML::Document.new(opf) - elsif entry.name =~ /.+\.x?html\Z/ - @htmls[entry.name.sub('OEBPS/', '')] = entry.get_input_stream.read.force_encoding('utf-8') - end - end - end - nil - end - - def take_headtail(html) - @head = html.sub(/().*/m, '\1') - @tail = html.sub(%r{.*()}m, '\1') - end - - def sanitize(s) - s = s.sub(/\.x?html\Z/, ''). - sub(%r{\A\./}, '') - 's_' + CGI.escape(s). - gsub(/[.,+%]/, '_') - end - - def modify_html(fname, html) - doc = REXML::Document.new(html) - doc.context[:attribute_quote] = :quote - - ids = {} - - doc.each_element('//*[@id]') do |e| - sid = "#{sanitize(fname)}_#{sanitize(e.attributes['id'])}" - while ids[sid] - sid += 'E' - end - ids[sid] = true - e.attributes['id'] = sid - end - doc.each_element('//a[@href]') do |e| - href = e.attributes['href'] - if href.start_with?('http:', 'https:', 'ftp:', 'ftps:', 'mailto:') - next - end +require 'pathname' - file, anc = href.split('#', 2) - if anc - if file.empty? - anc = "#{sanitize(fname)}_#{sanitize(anc)}" - else - anc = "#{sanitize(file)}_#{sanitize(anc)}" - end - else - anc = sanitize(file) - end +bindir = Pathname.new(__FILE__).realpath.dirname +$LOAD_PATH.unshift((bindir + '../lib').realpath) - e.attributes['href'] = "##{anc}" - end +require 'review/epub2html' - doc.to_s. - sub(/.*()/m, %Q(
)). - sub(%r{().*}m, '
') - end - - def join_html(reffile) - body = [] - make_list.each do |fname| - if @head.nil? && (reffile.nil? || reffile == fname) - take_headtail(@htmls[fname]) - end - - body << modify_html(fname, @htmls[fname]) - end - "#{@head}\n#{body.join("\n")}\n#{@tail}" - end - - def make_list - items = {} - @opfxml.each_element("/package/manifest/item[@media-type='application/xhtml+xml']") do |e| - items[e.attributes['id']] = e.attributes['href'] - end - - files = [] - @opfxml.each_element('/package/spine/itemref') do |e| - files.push(items[e.attributes['idref']]) - end - - files - end - end -end - -if ARGV[0].nil? || !File.exist?(ARGV[0]) - STDERR.puts < HTML -EOT - exit 1 +if File.basename($PROGRAM_NAME) == File.basename(__FILE__) + ReVIEW::Epub2Html.execute(*ARGV) end - -eph = ReVIEW::Epub2Html.new -eph.parse_epub(ARGV[0]) -puts eph.join_html(ARGV[1]) diff --git a/lib/review/epub2html.rb b/lib/review/epub2html.rb new file mode 100644 index 000000000..27a1e20d0 --- /dev/null +++ b/lib/review/epub2html.rb @@ -0,0 +1,133 @@ +# +# Copyright (c) 2018 Kenshi Muto +# +# This program is free software. +# You can distribute or modify this program under the terms of +# the GNU LGPL, Lesser General Public License version 2.1. +# For details of the GNU LGPL, see the file "COPYING". + +require 'zip' +require 'rexml/document' +require 'cgi' + +module ReVIEW + class Epub2Html + def self.execute(*args) + new.execute(*args) + end + + def execute(*args) + if args[0].nil? || !File.exist?(args[0]) + STDERR.puts < HTMLfile + file_for_head_and_foot: HTML file to extract header and footer area. + This file must be contained in the EPUB. + If omitted, the first found file is used. +EOT + exit 1 + end + + parse_epub(args[0]) + puts join_html(args[1]) + end + + def initialize + @opfxml = nil + @htmls = {} + @head = nil + @tail = nil + end + + def parse_epub(epubname) + Zip::File.open(epubname) do |zio| + zio.each do |entry| + if entry.name =~ /.+\.opf\Z/ + opf = entry.get_input_stream.read + @opfxml = REXML::Document.new(opf) + elsif entry.name =~ /.+\.x?html\Z/ + @htmls[entry.name.sub('OEBPS/', '')] = entry.get_input_stream.read.force_encoding('utf-8') + end + end + end + nil + end + + def take_headtail(html) + @head = html.sub(/().*/m, '\1') + @tail = html.sub(%r{.*()}m, '\1') + end + + def sanitize(s) + s = s.sub(/\.x?html\Z/, ''). + sub(%r{\A\./}, '') + 's_' + CGI.escape(s). + gsub(/[.,+%]/, '_') + end + + def modify_html(fname, html) + doc = REXML::Document.new(html) + doc.context[:attribute_quote] = :quote + + ids = {} + + doc.each_element('//*[@id]') do |e| + sid = "#{sanitize(fname)}_#{sanitize(e.attributes['id'])}" + while ids[sid] + sid += 'E' + end + ids[sid] = true + e.attributes['id'] = sid + end + + doc.each_element('//a[@href]') do |e| + href = e.attributes['href'] + if href.start_with?('http:', 'https:', 'ftp:', 'ftps:', 'mailto:') + next + end + + file, anc = href.split('#', 2) + if anc + if file.empty? + anc = "#{sanitize(fname)}_#{sanitize(anc)}" + else + anc = "#{sanitize(file)}_#{sanitize(anc)}" + end + else + anc = sanitize(file) + end + + e.attributes['href'] = "##{anc}" + end + + doc.to_s. + sub(/.*()/m, %Q(
)). + sub(%r{().*}m, '
') + end + + def join_html(reffile) + body = [] + make_list.each do |fname| + if @head.nil? && (reffile.nil? || reffile == fname) + take_headtail(@htmls[fname]) + end + + body << modify_html(fname, @htmls[fname]) + end + "#{@head}\n#{body.join("\n")}\n#{@tail}" + end + + def make_list + items = {} + @opfxml.each_element("/package/manifest/item[@media-type='application/xhtml+xml']") do |e| + items[e.attributes['id']] = e.attributes['href'] + end + + files = [] + @opfxml.each_element('/package/spine/itemref') do |e| + files.push(items[e.attributes['idref']]) + end + + files + end + end +end