xmlformat.rb

#!/usr/bin/ruby -w
# vim:set ts=2 sw=2 expandtab:

# xmlformat.rb - XML document reformatter

# Copyright (c) 2004, 2005, Kitebird, LLC.  All rights reserved.
# Some portions are based on the REX shallow XML parser, which
# is Copyright (c) 1998, Robert D. Cameron. These include the
# regular expression parsing variables and the shallow_parse()
# method.
# This software is licensed as described in the file LICENSE,
# which you should have received as part of this distribution.

# Differences from Perl version:
# - Pattern for classifying token as text node is different.
#   (cannot use !~ op for case)
# - It's important to use \A and \z|\Z rather than ^ and $ in pattern
#   matches on tokens, because ^ and $ might match after/before a
#   newline for a token that spans multiple lines!

require "getoptlong"

PROG_NAME = "xmlformat"
PROG_VERSION = "1.04"
PROG_LANG = "Ruby"

# ----------------------------------------------------------------------

# XMLFormat module

# Contains:
# - Methods for parsing XML document
# - Methods for reading configuration file and operating on configuration
#   information.

module XMLFormat


# ----------------------------------------------------------------------

# Module methods

# warn - print message to stderr
# die - print message to stderr and exit

# Warning hook.  Accepts any number of message parts but currently writes
# nothing: the statement that printed them to stderr is disabled, so every
# call is a silent no-op that returns nil.
def warn(*_ignored)
  # $stderr.print args   (output intentionally switched off)
  nil
end

# Print the given message parts to stderr, then terminate the program.
#
# args - zero or more objects; each is written to stderr in order,
#        with nothing added between or after them.
#
# Does not return: raises SystemExit with exit status 1.
def die(*args)
  # Splat the parts so each one is written as-is.  Passing the array
  # itself would print its inspect form (brackets and quotes) on
  # Ruby 1.9 and later.
  $stderr.print(*args)
  exit(1)
end

# ----------------------------------------------------------------------

# Module variables - these do not vary per class invocation

# Regular expressions for parsing document components. Based on REX.

# Compared to Perl version, these variable names use more Ruby-like
# lettercase. (Ruby likes to interpret variables that begin with
# uppercase as constants.)

# spe = shallow parsing expression
# se = scanning expression
# ce = completion expression
# rsb = right square brackets
# qm = question mark

# The fragments below are plain strings rather than Regexp objects so that
# they can be interpolated into larger fragments.  Only the final combined
# pattern (@@xml_spe) is compiled.  Many closing delimiters are optional
# ("...>?"), so a token can match without its terminator; report_errors
# later flags tokens that start with "<" but do not end with ">".

# Text token: one or more characters up to the next "<".
@@text_se = "[^<]+"
# Non-hyphen characters followed by one hyphen.
@@until_hyphen = "[^-]*-"
# Everything up to and including the first two adjacent hyphens.
@@until_2_hyphens = "#{@@until_hyphen}(?:[^-]#{@@until_hyphen})*-"
# Remainder of a comment after "<!--" (closing ">" optional).
@@comment_ce = "#{@@until_2_hyphens}>?"
# Everything up to and including the first run of two or more adjacent "]".
@@until_rsbs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+"
# Remainder of a CDATA section after "<![CDATA[": continues until a run of
# right square brackets is directly followed by ">".
@@cdata_ce = "#{@@until_rsbs}(?:[^\\]>]#{@@until_rsbs})*>"
# One or more whitespace characters (space, newline, tab, carriage return).
@@s = "[ \\n\\t\\r]+"
# Name start character: ASCII letter, "_", ":", or any character outside
# the range \x00-\x7F.
@@name_strt = "[A-Za-z_:]|[^\\x00-\\x7F]"
# Name character: as above, plus digits, "." and "-".
@@name_char = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"
# A complete name: one start character followed by any name characters.
@@name = "(?:#{@@name_strt})(?:#{@@name_char})*"
# A double-quoted or single-quoted string.
@@quote_se = "\"[^\"]*\"|'[^']*'"
# DOCTYPE identification: whitespace, a name, then any number of
# whitespace-separated names or quoted strings.
@@dt_ident_se = "#{@@s}#{@@name}(?:#{@@s}(?:#{@@name}|#{@@quote_se}))*"
# Remainder of a markup declaration: unquoted characters or quoted
# strings, terminated by ">".
@@markup_decl_ce = "(?:[^\\]\"'><]+|#{@@quote_se})*>"
# Exactly one whitespace character.
@@s1 = "[\\n\\r\\t ]"
# Everything up to and including a run of one or more "?".
@@until_qms = "[^?]*\\?+"
# Remainder of a processing instruction after its target name: either an
# immediate "?>", or whitespace followed by content ending in "?>".
@@pi_tail = "\\?>|#{@@s1}#{@@until_qms}(?:[^>?]#{@@until_qms})*>"
# One item inside a DOCTYPE internal subset: a comment, another markup
# declaration, a processing instruction, a "%name;" reference, or whitespace.
@@dt_item_se =
"<(?:!(?:--#{@@until_2_hyphens}>|[^-]#{@@markup_decl_ce})|\\?#{@@name}(?:#{@@pi_tail}))|%#{@@name};|#{@@s}"
# Remainder of a DOCTYPE declaration after the "DOCTYPE" keyword, with an
# optional bracketed internal subset (closing ">" optional).
@@doctype_ce =
"#{@@dt_ident_se}(?:#{@@s})?(?:\\[(?:#{@@dt_item_se})*\\](?:#{@@s})?)?>?"
# What may follow "<!": a comment, a CDATA section, or a DOCTYPE.
@@decl_ce =
"--(?:#{@@comment_ce})?|\\[CDATA\\[(?:#{@@cdata_ce})?|DOCTYPE(?:#{@@doctype_ce})?"
# What may follow "<?": a target name and the optional rest of the PI.
@@pi_ce = "#{@@name}(?:#{@@pi_tail})?"
# What may follow "</": a name, optional whitespace, optional ">".
@@end_tag_ce = "#{@@name}(?:#{@@s})?>?"
# A quoted attribute value that contains no "<".
@@att_val_se = "\"[^<\"]*\"|'[^<']*'"
# What may follow "<" in a start or empty-element tag: a name, any number
# of name=value attributes, then optional whitespace, "/" and ">".
@@elem_tag_se =
"#{@@name}(?:#{@@s}#{@@name}(?:#{@@s})?=(?:#{@@s})?(?:#{@@att_val_se}))*(?:#{@@s})?/?>?"
# Any markup token: "<" followed by one of the completions above.
@@markup_spe =
"<(?:!(?:#{@@decl_ce})?|\\?(?:#{@@pi_ce})?|/(?:#{@@end_tag_ce})?|(?:#{@@elem_tag_se})?)"
# Combine the text and markup alternatives, then textually rewrite every
# literal "+)?" into "*)".  In the fragments above that sequence comes from
# the optional-whitespace group "(?:#{@@s})?", i.e. "(?:[ \n\t\r]+)?",
# which becomes the equivalent "(?:[ \n\t\r]*)".
updated_regex ="#{@@text_se}|#{@@markup_spe}".gsub("+)?","*)")
# The compiled shallow-parsing expression used by shallow_parse.
@@xml_spe = Regexp.new(updated_regex)


# ----------------------------------------------------------------------

# Allowable formatting options and their possible values:
# - The keys of this hash are the allowable option names
# - The value for each key is list of allowable option values
# - If the value is nil, the option value must be numeric
# If any new formatting option is added to this program, it
# must be specified here, *and* a default value for it should
# be listed in the *DOCUMENT and *DEFAULT pseudo-element
# option hashes.

# Table of recognized formatting options.  An array value enumerates the
# legal settings for that option; a nil value marks an option whose
# setting must be an integer.
@@opt_list = {
  "format"        => %w[block inline verbatim],
  "normalize"     => %w[yes no],
  "subindent"     => nil,
  "wrap-length"   => nil,
  "entry-break"   => nil,
  "exit-break"    => nil,
  "element-break" => nil
}

class XMLFormatter

  # Object creation: set up the default formatting configuration
  # and variables for maintaining input and output document.

  def initialize
    # Per-element formatting options, keyed by element name.  Two
    # pseudo-elements are always present so that a complete set of values
    # exists even when the configuration file does not mention them; the
    # configuration file may still override either one.
    #
    # *DOCUMENT governs the top-level children of the document:
    # - entry-break must stay 0 (no extra newlines before the first
    #   element of the output).
    # - exit-break must stay 1 (a newline after the final element).
    # - The remaining values are best left alone, except perhaps
    #   element-break if more spacing is wanted.
    #
    # *DEFAULT supplies the options for every element that the
    # configuration file does not name explicitly.
    @elt_opts = {
      "*DOCUMENT" => {
        "format"        => "block",
        "normalize"     => "no",
        "subindent"     => 0,
        "wrap-length"   => 0,
        "entry-break"   => 0, # do not change
        "exit-break"    => 1, # do not change
        "element-break" => 1
      },
      "*DEFAULT" => {
        "format"        => "block",
        "normalize"     => "no",
        "subindent"     => 2,
        "wrap-length"   => 0,
        "entry-break"   => 1,
        "exit-break"    => 1,
        "element-break" => 1
      }
    }

    problems = 0

    # Pass 1: every built-in value must be accepted by the option checker.
    # Accepted values are stored back, since the checker may convert them.
    @elt_opts.each do |elt_name, opts|
      opts.each do |opt_name, built_in|
        checked, err_msg = check_option(opt_name, built_in)
        if err_msg.nil?
          opts[opt_name] = checked
        else
          warn "LOGIC ERROR: #{elt_name} default option is invalid\n"
          warn "#{err_msg}\n"
          problems += 1
        end
      end
    end

    # Pass 2: every known option must be present in both pseudo-elements.
    @@opt_list.each_key do |opt_name|
      @elt_opts.each do |elt_name, opts|
        next if opts.has_key?(opt_name)
        warn "LOGIC ERROR: #{elt_name} has no default '#{opt_name}' option\n"
        problems += 1
      end
    end

    if problems > 0
      raise "Cannot continue; internal default formatting options must be fixed"
    end
  end

  # Initialize the variables that are used per-document

  def init_doc_vars
    # Names of elements seen in the document that have no explicit entry
    # in the configuration file.
    @unconf_elts = {}

    # Input side: the token list, the starting line number of each token,
    # and the node tree that is built from the tokens.
    @tokens, @line_num, @tree = [], [], []

    # Output side: @out_doc accumulates the finished document text;
    # @pending holds strings that are waiting to be flushed into it.
    @out_doc = ""
    @pending = []

    # Inline elements take their normalize/wrap-length settings from the
    # block that encloses them, and blocks may nest, so the options of the
    # open blocks are kept on a stack.  The name stack is for debugging.
    @block_name_stack, @block_opts_stack = [], []

    # Parallel stack recording the current break type of each open block.
    @block_break_type_stack = []
  end

  # Accessors for token list and resulting output document

  # Reader for the current document's token list.
  def tokens
    @tokens
  end

  # Reader for the output document text accumulated so far.
  def out_doc
    @out_doc
  end

  # Methods for adding strings to output document or
  # to the pending output array

  # Append str to the end of the output document (in place).
  def add_to_doc(str)
    @out_doc.concat(str)
  end

  # Queue str at the end of the pending-output array.
  def add_to_pending(str)
    @pending.push(str)
  end


  # Block stack maintenance methods

  # Push options onto or pop options off from the stack.  When doing
  # this, also push or pop an element onto the break-level stack.

  # Enter a block element: record its name and options on their stacks
  # and start the block off with the "entry-break" break type.
  def begin_block(name, opts)
    @block_name_stack.push(name)
    @block_opts_stack.push(opts)
    @block_break_type_stack.push("entry-break")
  end

  # Leave the current block element: drop the top entry of the name,
  # options, and break-type stacks.  The value returned is the break type
  # that was popped.
  def end_block
    stacks = [ @block_name_stack, @block_opts_stack, @block_break_type_stack ]
    popped = stacks.map { |stack| stack.pop }
    popped.last
  end

  # Return the current block's normalization status or wrap length

  # True when the innermost open block has its "normalize" option set
  # to "yes".
  def block_normalize
    current_opts = @block_opts_stack.last
    current_opts["normalize"] == "yes"
  end

  # The "wrap-length" option of the innermost open block.
  def block_wrap_length
    current_opts = @block_opts_stack.last
    current_opts["wrap-length"]
  end

  # Set the current block's break type, or return the number of newlines
  # for the block's break type

  # Replace the break type recorded for the innermost open block.
  def set_block_break_type(type)
    @block_break_type_stack[-1] = type
  end

  # Look up, in the innermost open block's options, the value stored
  # under that block's current break type.
  def block_break_value
    current_opts = @block_opts_stack.last
    current_opts[@block_break_type_stack.last]
  end


  # Read configuration information.  For each element, construct a hash
  # containing a hash key and value for each option name and value.
  # After reading the file, fill in missing option values for
  # incomplete option structures using the *DEFAULT options.

  # Read formatting options from a configuration file into @elt_opts.
  #
  # File syntax, as parsed here:
  # - Blank lines and lines whose first non-blank character is "#" are
  #   skipped.
  # - A line that does not begin with whitespace lists element names,
  #   separated by whitespace and/or commas.  A trailing backslash
  #   continues the list onto the next non-skipped line.
  # - A line that begins with whitespace sets one option ("name value" or
  #   "name = value") for every element in the most recent name list.
  # - A "#" comment may follow either kind of line.
  #
  # After the file has been read, every element (other than *DEFAULT)
  # that lacks an option receives the *DEFAULT value for it, so each
  # element's hash ends up with every option defined.
  #
  # conf_file - path of the configuration file.
  #
  # Raises RuntimeError (message prefixed with "file:line:") when an
  # option appears before any element names, when an option line cannot
  # be parsed, or when check_option rejects the option.  Errors from
  # File.open (e.g. a missing file) propagate unchanged.
  def read_config(conf_file)
    elt_names = nil
    in_continuation = false
    saved_line = ""

    File.open(conf_file) do |fh|
      fh.each_line do |line|
        line.chomp!
        next if line =~ /^\s*($|#)/       # skip blank lines, comments
        if in_continuation
          line = saved_line + " " + line
          saved_line = ""
          in_continuation = false
        end

        # Location prefix for error messages.  Taken from the file handle
        # itself rather than from the global $. variable.
        where = "#{conf_file}:#{fh.lineno}"

        if line !~ /^\s/
          # Line doesn't begin with whitespace, so it lists element names.
          # Names are separated by whitespace or commas, possibly followed
          # by a continuation character or comment.
          if line =~ /\\$/
            in_continuation = true
            saved_line = line.sub(/\\$/, "")  # remove continuation character
            next
          end
          line.sub!(/\s*#.*$/, "")            # remove any trailing comment
          elt_names = line.split(/[\s,]+/)
          # make sure each name has an entry in the elt_opts structure
          elt_names.each do |elt_name|
            @elt_opts[elt_name] = { } unless @elt_opts.has_key?(elt_name)
          end
        else
          # Line begins with whitespace, so it contains an option
          # to apply to the current element list, possibly followed by
          # a comment.  First check that there is a current list.
          # Then parse the option name/value.

          if elt_names.nil?
            raise "#{where}: Option setting found before any " +
                "elements were named.\n"
          end

          # Keep the text as read so it can be quoted in an error message.
          # ($_ cannot be used for this: each_line does not set it.)
          raw_line = line.dup
          line.sub!(/\s*#.*$/, "")
          setting = /^\s*(\S+)(?:\s+|\s*=\s*)(\S+)$/.match(line)
          raise "#{where}: Malformed line: #{raw_line}\n" if setting.nil?
          opt_name, opt_val = setting[1], setting[2]

          # Check option. If illegal, raise with message. Otherwise,
          # add option to each element in current element list

          opt_val, err_msg = check_option(opt_name, opt_val)
          raise "#{where}: #{err_msg}\n" unless err_msg.nil?
          elt_names.each do |elt_name|
            @elt_opts[elt_name][opt_name] = opt_val
          end

        end
      end
    end

    # For any element that has missing option values, fill in the values
    # using the options for the *DEFAULT pseudo-element.  This speeds up
    # element option lookups later.  It also makes it unnecessary to test
    # each option to see if it's defined: All element option structures
    # will have every option defined.

    def_opts = @elt_opts["*DEFAULT"]

    @elt_opts.each do |elt_name, opts|
      next if elt_name == "*DEFAULT"
      def_opts.each do |opt_name, def_val|
        opts[opt_name] = def_val unless opts.has_key?(opt_name)
      end
    end

  end


  # Check option name to make sure it's legal. Check the value to make sure
  # that it's legal for the name.  Return a two-element array:
  # (value, nil) if the option name and value are legal.
  # (nil, message) if an error was found; message contains error message.
  # For legal values, the returned value should be assigned to the option,
  # because it may get type-converted here.

  # Validate one option setting against @@opt_list.
  #
  # opt_name - option name; must be a key of @@opt_list.
  # opt_val  - proposed value.  For options with a list of allowable
  #            values it must equal one of them.  For the remaining
  #            options it must be an Integer or a String made up only
  #            of decimal digits.
  #
  # Returns [value, nil] when the setting is legal; value is the setting
  # to store (a digit string is converted to an Integer).
  # Returns [nil, message] when the name or the value is illegal.
  def check_option(opt_name, opt_val)
    unless @@opt_list.has_key?(opt_name)
      return [ nil, "Unknown option name: #{opt_name}" ]
    end

    allowable_val = @@opt_list[opt_name]
    if !allowable_val.nil?
      unless allowable_val.include?(opt_val)
        return [ nil, "Unknown '#{opt_name}' value: #{opt_val}" ]
      end
    elsif !opt_val.is_a?(Integer)
      # \A and \z anchor the whole string; ^ and $ would also accept a
      # multi-line value such as "12\nabc".  Values that are not Strings
      # are rejected without attempting a pattern match.
      if opt_val.is_a?(String) && opt_val =~ /\A\d+\z/
        opt_val = opt_val.to_i
      else
        return [ nil, "'#{opt_name}' value (#{opt_val}) should be an integer" ]
      end
    end
    return [ opt_val, nil ]
  end
  private :check_option


  # Return hash of option values for a given element.  If no options are found:
  # - Add the element name to the list of unconfigured elements.
  # - Assign the default options to the element.  (This way the test for the
  #   option fails only once.)

  # Fetch the option hash for elt_name.  An element with no entry is
  # recorded in @unconf_elts and is then given the *DEFAULT option hash
  # itself (the same object, not a copy) as its entry, so later lookups
  # for that element succeed directly.
  def get_opts(elt_name)
    known = @elt_opts[elt_name]
    return known unless known.nil?

    @unconf_elts[elt_name] = 1
    @elt_opts[elt_name] = @elt_opts["*DEFAULT"]
  end
  private :get_opts


  # Display contents of configuration options to be used to process document.
  # For each element named in the elt_opts structure, display its format
  # type, and those options that apply to the type.

  # Print the configuration to standard output.  Elements are listed in
  # sorted name order; each shows its format type followed by the options
  # that are meaningful for that type, then a blank line.
  def display_config
    # Only block elements have additional options worth displaying.
    block_only = %w[entry-break element-break exit-break
                    subindent normalize wrap-length]
    extras_for = { "block" => block_only, "inline" => [ ], "verbatim" => [ ] }

    @elt_opts.keys.sort.each do |elt_name|
      settings = @elt_opts[elt_name]
      fmt = settings["format"]
      puts elt_name
      puts "  format = #{fmt}"
      extras_for[fmt].each { |opt_name| puts "  #{opt_name} = #{settings[opt_name]}" }
      puts
    end
  end


  # Display the list of elements that are used in the document but not
  # configured in the configuration file.

  # Then re-unconfigure the elements so that they won't be considered
  # as configured for the next document, if there is one.

  # Report, on standard output, which document elements had no explicit
  # configuration, then remove their entries from @elt_opts so they are
  # not treated as configured when another document is processed.
  def display_unconfigured_elements
    names = @unconf_elts.keys

    if names.empty?
      puts "The document contains no unconfigured elements."
    else
      puts "The following document elements were assigned no formatting options:"
      wrapped = line_wrap(names.sort.join(" "), 0, 0, 65)
      puts wrapped.join("\n")
    end

    names.each { |elt_name| @elt_opts.delete(elt_name) }
  end

  # ----------------------------------------------------------------------

  # Main document processing routine.
  # - Argument is a string representing an input document
  # - Return value is the reformatted document, or nil. A nil return
  #   signifies either that an error occurred, or that some option was
  #   given that suppresses document output. In either case, don't write
  #   any output for the document.  Any error messages will already have
  #   been printed when this returns.

  # Run the processing pipeline on one document.
  #
  # doc              - the input document, as a single string
  # verbose          - when true, a progress message is passed to warn
  #                    before each stage
  # check_parser     - when true, only verify that the tokens reassemble
  #                    into the original document, print the verdict,
  #                    and stop
  # canonize_only    - when true, print the canonized document and stop
  # show_unconf_elts - when true, list the unconfigured elements and stop
  #
  # Returns the reformatted document string, or nil whenever one of the
  # early exits above is taken or an error stage reports a nonzero count.
  def process_doc(doc, verbose, check_parser, canonize_only,
                  show_unconf_elts)

    init_doc_vars

    # Stage 1: lexical parse into a flat token list.
    warn "Parsing document...\n" if verbose
    shallow_parse(doc)

    if check_parser
      warn "Checking parser...\n" if verbose
      # Joining the tokens back together must reproduce the input exactly.
      puts(doc == tokens.join("") ?
           "Parser is okay" :
           "PARSER ERROR: document token concatenation differs from document")
      return nil
    end

    # Stage 2: record the input line on which each token starts.
    assign_line_numbers

    # Stage 3: stop if the parser produced any error tokens.
    warn "Checking document for errors...\n" if verbose
    if report_errors > 0
      warn "Cannot continue processing document.\n"
      return nil
    end

    # Stage 4: build the node tree from the token list.
    warn "Convert document tokens to tree...\n" if verbose
    if tokens_to_tree > 0
      warn "Cannot continue processing document.\n"
      return nil
    end

    # Optional integrity check (left disabled): stringifying the tree at
    # this point should reproduce the original document exactly.
    #str = tree_stringify
    #print str
    #warn "ERROR: mismatch between document and resulting string\n" if doc != str

    # Stage 5: canonize the tree to remove extraneous whitespace.
    warn "Canonizing document tree...\n" if verbose
    tree_canonize

    if canonize_only
      puts tree_stringify
      return nil
    end

    # Canonizing looks up the options of every element in the document,
    # which as a side effect builds the list of elements that have no
    # explicit configuration.  Show that list if it was asked for.
    if show_unconf_elts
      display_unconfigured_elements
      return nil
    end

    # Stage 6: format the tree into the output document.
    warn "Formatting document tree...\n" if verbose
    tree_format

    # A non-empty result should already end with a newline when the
    # *DOCUMENT options have exit-break = 1.  If it does not, add one,
    # but warn about it instead of repairing it silently.
    formatted = out_doc
    unless formatted.empty? || formatted =~ /\n\z/
      warn "LOGIC ERROR: trailing newline had to be added\n"
      formatted << "\n"
    end

    formatted
  end

  # ----------------------------------------------------------------------

  # Parse XML document into array of tokens and store array

  # Split xml_document into tokens by scanning it with the compiled
  # shallow-parsing expression @@xml_spe, and store the resulting array
  # in @tokens (replacing any previous token list).
  def shallow_parse(xml_document)
    @tokens = xml_document.scan(@@xml_spe)
  end

  # ----------------------------------------------------------------------

  # Extract a tag name from a tag and return it. This uses a subset
  # of the document-parsing pattern elements.

  # Dies if the tag cannot be found, because this is supposed to be
  # called only with a legal tag.

  # Return the element name found at the start of an open or close tag
  # (after "<" or "</").  Raises RuntimeError if the tag does not begin
  # with a recognizable name.
  def extract_tag_name(tag)
    found = %r{\A</?(#{@@name})}.match(tag)
    raise "Cannot find tag name in tag: #{tag}" unless found

    found[1]
  end
  private :extract_tag_name

  # ----------------------------------------------------------------------

  # Assign an input line number to each token.  The number indicates
  # the line number on which the token begins.

  # Rebuild @line_num so that entry i holds the 1-based input line on
  # which token i starts.  The running line count advances by the number
  # of newline characters contained in each token.
  def assign_line_numbers
    starting_line = 1
    @line_num = [ ]

    @tokens.each do |tok|
      @line_num.push(starting_line)
      starting_line += tok.count("\n")
    end
  end
  private :assign_line_numbers

  # ----------------------------------------------------------------------

  # Check token list for errors and report any that are found. Error
  # tokens are those that begin with "<" but do not end with ">".

  # Returns the error count.

  # Does not modify the original token list.

  # Scan the token list for error tokens (tokens that begin with "<" but
  # do not end with ">"), pass a message for each one to warn, and return
  # how many were found.  The token list itself is left unchanged.
  def report_errors
    bad = 0

    @tokens.each_with_index do |tok, idx|
      next unless tok =~ /\A</ && tok !~ />\Z/
      warn "Malformed token at line #{@line_num[idx]}, token #{idx+1}: #{tok}\n"
      bad += 1
    end

    warn "Number of errors found: #{bad}\n" if bad > 0
    bad
  end

  # ----------------------------------------------------------------------

  # Helper routine to print tag stack for tokens_to_tree

  # Pass a labelled, numbered listing of the tags in stack to warn, one
  # tag per message, or a single "none" message when the stack is empty.
  def print_tag_stack(label, stack)
    return warn("  #{label}: none\n") if stack.size < 1

    warn "  #{label}:\n"
    stack.each_with_index { |tag, idx| warn "  #{idx+1}: #{tag}\n" }
  end

  # Convert the list of XML document tokens to a tree representation.
  # The implementation uses a loop and a stack rather than recursion.

  # Does not modify the original token list.

  # Returns an error count.

  # Build the document tree from @tokens (using @line_num for error
  # messages) and store the top-level node list in @tree.
  #
  # Each token is classified by its leading characters and turned into a
  # node.  Open tags push the tag and the current child list onto stacks
  # and start a fresh child list; close tags pop them again and wrap the
  # collected children in an element node.
  #
  # Returns the number of structural errors found (0 when none were
  # detected).  @tree is assigned even when errors were found.
  def tokens_to_tree

    tag_stack = [ ]        # stack for element tags
    children_stack = [ ]   # stack for lists of children
    children = [ ]         # current list of children
    err_count = 0

    # Note: the text token pattern test assumes that all text tokens
    # are non-empty. This should be true, because REX doesn't create
    # empty tokens.

    @tokens.each_index do |i|
      token = @tokens[i]
      line_num = @line_num[i]
      tok_err = "Error near line #{line_num}, token #{i+1} (#{token})"
      # The order of the "<!" tests matters: comment and DOCTYPE are
      # recognized first, and any remaining "<![" token is taken as CDATA.
      case token
      when /\A[^<]/                      # text
        children << text_node(token)
      when /\A<!--/                      # comment
        children << comment_node(token)
      when /\A<\?/                       # processing instruction
        children << pi_node(token)
      when /\A<!DOCTYPE/                 # DOCTYPE
        children << doctype_node(token)
      when /\A<!\[/                      # CDATA
        children << cdata_node(token)
      when /\A<\//                       # element close tag
        if tag_stack.empty?
          warn "#{tok_err}: Close tag w/o preceding open tag; malformed document?\n"
          err_count += 1
          next
        end
        # Paranoia check: both stacks are pushed together below, so this
        # appears unreachable once tag_stack is known to be non-empty.
        if children_stack.empty?
          warn "#{tok_err}: Empty children stack; malformed document?\n"
          err_count += 1
          next
        end
        tag = tag_stack.pop
        open_tag_name = extract_tag_name(tag)
        close_tag_name = extract_tag_name(token)
        if open_tag_name != close_tag_name
          # Note: the open tag has already been popped at this point, but
          # the children stack has not; the token is skipped after the
          # error is counted.
          warn "#{tok_err}: Tag mismatch; malformed document?\n"
          warn "  open tag: #{tag}\n"
          warn "  close tag: #{token}\n"
          print_tag_stack("enclosing tags", tag_stack)
          err_count += 1
          next
        end
        # Wrap the children gathered since the open tag in an element
        # node, then resume the parent's child list and append it there.
        elt = element_node(tag, token, children)
        children = children_stack.pop
        children << elt
      else                              # element open tag
        # If we reach here, we're seeing the open tag for an element:
        # - If the tag is also the close tag (e.g., <abc/>), close the
        #   element immediately, giving it an empty child list.
        # - Otherwise, push tag and child list on stacks, begin new child
        #   list for element body.
        case token
        when /\/>\Z/     # tag is of form <abc/>
          children << element_node(token, "", [ ])
        else              # tag is of form <abc>
          tag_stack << token
          children_stack << children
          children = [ ]
        end
      end
    end

    # At this point, the stacks should be empty if the document is
    # well-formed.

    if !tag_stack.empty?
      warn "Error at EOF: Unclosed tags; malformed document?\n"
      print_tag_stack("unclosed tags", tag_stack)
      err_count += 1
    end
    if !children_stack.empty?
      warn "Error at EOF: Unprocessed child elements; malformed document?\n"
# TODO: print out info about them
      err_count += 1
    end

    # Whatever child list is current becomes the top level of the tree.
    @tree = children
    return err_count
  end


  # Node-generating helper methods for tokens_to_tree

  # Generic node generator

  # Build a node hash holding the given type and content under the
  # "type" and "content" keys.
  def node(type, content)
    { "type" => type, "content" => content }
  end
  private :node

  # Generators for specific non-element nodes

  # Node of type "text" wrapping the given content.
  def text_node(content)
    node("text", content)
  end
  private :text_node

  # Node of type "comment" wrapping the given content.
  def comment_node(content)
    node("comment", content)
  end
  private :comment_node

  # Node of type "pi" (processing instruction) wrapping the given content.
  def pi_node(content)
    node("pi", content)
  end
  private :pi_node

  # Node of type "DOCTYPE" wrapping the given content.
  def doctype_node(content)
    node("DOCTYPE", content)
  end
  private :doctype_node

  # Node of type "CDATA" wrapping the given content.
  def cdata_node(content)
    node("CDATA", content)
  end
  private :cdata_node

  # For an element node, create a standard node with the type and content
  # key/value pairs. Then add pairs for the "name", "open_tag", and
  # "close_tag" hash keys.

  # Build an element node: a standard node of type "elt" whose content is
  # the child list, extended with the element's name (extracted from the
  # open tag) and its literal open and close tags.
  def element_node(open_tag, close_tag, children)
    node("elt", children).update(
      "name"      => extract_tag_name(open_tag),
      "open_tag"  => open_tag,
      "close_tag" => close_tag
    )
  end
  private :element_node

  # ----------------------------------------------------------------------

  # Convert the given XML document tree (or subtree) to string form by
  # concatenating all of its components.  Argument is a reference
  # to a list of nodes at a given level of the tree.  (If argument is
  # missing, use the top level of the tree.)

  # Does not modify the node list.

  # Serialize a list of nodes (by default the top level of @tree) back
  # into one string.  An element contributes its open tag, its children
  # serialized recursively, and its close tag; every other node type
  # contributes its content as-is.  The nodes are not modified.
  def tree_stringify(children = @tree)
    children.inject("") do |text, child|
      if child["type"] == "elt"
        text << child["open_tag"]
        text << tree_stringify(child["content"])
        text << child["close_tag"]
      else
        text << child["content"]
      end
    end
  end

  # ----------------------------------------------------------------------

  # Put tree in "canonical" form by eliminating extraneous whitespace
  # from element text content.

  # children is a list of child nodes

  # This function modifies the node list.

  # Canonizing occurs as follows:
  # - Comment, PI, DOCTYPE, and CDATA nodes remain untouched
  # - Verbatim elements and their descendants remain untouched
  # - Within non-normalized block elements:
  #   - Delete all-whitespace text node children
  #   - Leave other text node children untouched
  # - Within normalized block elements:
  #   - Convert runs of whitespace (including line-endings) to single spaces
  #   - Trim leading whitespace of first text node
  #   - Trim trailing whitespace of last text node
  #   - Trim whitespace that is adjacent to a verbatim or non-normalized
  #     sub-element.  (For example, if a <programlisting> is followed by
  #     more text, delete any whitespace at beginning of that text.)
  # - Within inline elements:
  #   - Normalize the same way as the enclosing block element, with the
  #     exception that a space at the beginning or end is not removed.
  #     (Otherwise, <para>three<literal> blind </literal>mice</para>
  #     would become <para>three<literal>blind</literal>mice</para>).

  # Canonize the whole document: run tree_canonize2 over the top-level
  # node list with the *DOCUMENT pseudo-element as parent, and replace
  # @tree with the list it returns.
  def tree_canonize
    @tree = tree_canonize2(@tree, "*DOCUMENT")
  end

  # Canonize one level of the tree (recursing into child elements).
  #
  # children - list of child nodes of the parent element.  This list is
  #            consumed: nodes are shifted off it until it is empty.
  # par_name - name of the parent element (or "*DOCUMENT").
  #
  # Returns a new list containing the nodes that were kept, in order.
  # Text node content is modified in place when it is normalized.
  def tree_canonize2(children, par_name)

    # Formatting options for parent
    par_opts = get_opts(par_name)

    # If parent is a block element, remember its formatting options on
    # the block stack so they can be used to control canonization of
    # inline child elements.

    if par_opts["format"] == "block"
      begin_block(par_name, par_opts)
    end

    # Iterate through list of child nodes to preserve, modify, or
    # discard whitespace.  Return resulting list of children.

    # Canonize element and text nodes. Leave everything else (comments,
    # processing instructions, etc.) untouched.

    new_children = [ ]

    while !children.empty?

      child = children.shift

      if child["type"] == "elt"

        # Leave verbatim elements untouched. For other element nodes,
        # canonize child list using options appropriate to element.

        if get_opts(child["name"])["format"] != "verbatim"
          child["content"] = tree_canonize2(child["content"], child["name"])
        end

      elsif child["type"] == "text"

        # Delete all-whitespace node or strip whitespace as appropriate.

        # Paranoia check: We should never get here for verbatim elements,
        # because normalization is irrelevant for them.

        if par_opts["format"] == "verbatim"
          die "LOGIC ERROR: trying to canonize verbatim element #{par_name}!\n"
        end

        # block_normalize consults the innermost block on the block stack,
        # which is the parent itself only when the parent is a block.
        if !block_normalize

          # Enclosing block is not normalized:
          # - Delete child all-whitespace text nodes.
          # - Leave other text nodes untouched.

          # ("next" skips the append at the bottom of the loop, which is
          # what discards the node.)
          next if child["content"] =~ /\A\s*\Z/

        else

          # Enclosing block is normalized, so normalize this text node:
          # - Convert runs of whitespace characters (including
          #   line-ending characters) to single spaces.
          # - Trim leading whitespace if this node is the first child
          #   of a block element or it follows a non-normalized node.
          # - Trim trailing whitespace if this node is the last child
          #   of a block element or it precedes a non-normalized node.

          # These are nil if there is no prev or next child
          prev_child = new_children.last
          next_child = children.first

          # Whitespace runs have been collapsed to single spaces by the
          # gsub!, so trimming needs to remove at most one space per end.
          child["content"].gsub!(/\s+/, " ")
          if (prev_child.nil? && par_opts["format"] == "block") ||
              non_normalized_node(prev_child)
            child["content"].sub!(/\A /, "")
          end
          if (next_child.nil? && par_opts["format"] == "block") ||
              non_normalized_node(next_child)
            child["content"].sub!(/ \Z/, "")
          end

          # If resulting text is empty, discard the node.
          next if child["content"] =~ /\A\Z/

        end
      end
      new_children << child
    end

    # Pop block stack if parent was a block element
    end_block if par_opts["format"] == "block"

    return new_children
  end
  private :tree_canonize2


  # Helper function for tree_canonize().

  # Determine whether a node is non-normalized (returns true if so).  This
  # is used to check the node that is adjacent to a given text node (either
  # previous or following).
  # - False if node is nil
  # - True if the node is a verbatim element
  # - If the node is a block element, true or false according to whether
  #   its normalize option is "no"
  # - False if the node is an inline element.  Inlines are normalized
  #   if the parent block is normalized, and this method is not called
  #   except while examining normalized blocks, so their inline children
  #   are also normalized.
  # - True if node is a comment, PI, DOCTYPE, or CDATA section. These are
  #   treated like verbatim elements.

  # Report whether the node adjacent to a text node counts as
  # non-normalized.
  #
  # Returns false for nil and for inline elements; true for verbatim
  # elements, comments, PIs, DOCTYPEs, and CDATA sections; and, for a
  # block element, whether its "normalize" option is "no".  A text node,
  # an unknown node type, or an unknown element format is treated as a
  # logic error and handed to die.
  def non_normalized_node(node)
    return false if node.nil?

    kind = node["type"]
    if kind == "elt"
      elt_opts = get_opts(node["name"])
      fmt = elt_opts["format"]
      if fmt == "verbatim"
        true
      elsif fmt == "block"
        elt_opts["normalize"] == "no"
      elsif fmt == "inline"
        false
      else
        die "LOGIC ERROR: non_normalized_node: unhandled node format.\n"
      end
    elsif [ "comment", "pi", "DOCTYPE", "CDATA" ].include?(kind)
      true
    elsif kind == "text"
      die "LOGIC ERROR: non_normalized_node: got called for text node.\n"
    else
      die "LOGIC ERROR: non_normalized_node: unhandled node type.\n"
    end
  end
  private :non_normalized_node

  # ----------------------------------------------------------------------

  # Format (pretty-print) the document tree

  # Does not modify the node list.

  # The class maintains two variables for storing output:
  # - out_doc stores content that has been seen and "flushed".
  # - pending stores an array of strings (content of text nodes and inline
  #   element tags).  These are held until they need to be flushed, at
  #   which point they are concatenated and possibly wrapped/indented.
  #   Flushing occurs when a break needs to be written, which happens
  #   when something other than a text node or inline element is seen.

  # If parent name and children are not given, format the entire document.
  # Assume prevailing indent = 0 if not given.

  # Parameters:
  # - par_name: name of the element whose children are being formatted;
  #   it is passed to get_opts to look up that element's options.
  # - children: array of node hashes.  Every node has a "type"; text
  #   nodes and non-element nodes carry their text in "content", while
  #   "elt" nodes have "name", "open_tag", "close_tag" and a "content"
  #   array of child nodes.
  # - indent: number of spaces of prevailing indent.
  # Output is produced through add_to_pending/add_to_doc; the recursive
  # calls below do not use the return value.

  def tree_format(par_name = "*DOCUMENT", children = @tree, indent = 0)

    # Formatting options for parent element
    par_opts = get_opts(par_name)

    # If parent is a block element:
    # - Remember its formatting options on the block stack so they can
    #   be used to control formatting of inline child elements.
    # - Set initial break type to entry-break.
    # - Shift prevailing indent right before generating child content.

    if par_opts["format"] == "block"
      begin_block(par_name, par_opts)
      set_block_break_type("entry-break")
      indent += par_opts["subindent"]
    end

    # Variables for keeping track of whether the previous child
    # was a text node. Used for controlling break behavior in
    # non-normalized block elements: No line breaks are added around
    # text in such elements, nor is indenting added.

    prev_child_is_text = false
    cur_child_is_text = false

    children.each do |child|

      # Shift the text flag: what was "current" for the previous
      # iteration is now "previous".
      prev_child_is_text = cur_child_is_text

      # Text nodes: just add text to pending output

      if child["type"] == "text"
        cur_child_is_text = true
        add_to_pending(child["content"])
        next
      end

      cur_child_is_text = false

      # Element nodes: handle depending on format type

      if child["type"] == "elt"

        child_opts = get_opts(child["name"])

        # Verbatim elements:
        # - Print literally without change (use _stringify).
        # - Do not line-wrap or add any indent.

        if child_opts["format"] == "verbatim"
          flush_pending(indent)
          # No break directly after text inside a non-normalized block.
          emit_break(0) unless prev_child_is_text && !block_normalize
          set_block_break_type("element-break")
          add_to_doc(child["open_tag"] +
                    tree_stringify(child["content"]) +
                    child["close_tag"])
          next
        end

        # Inline elements:
        # - Do not break or indent.
        # - Do not line-wrap content; just add content to pending output
        #   and let it be wrapped as part of parent's content.

        if child_opts["format"] == "inline"
          add_to_pending(child["open_tag"])
          # The recursive call skips the block-only steps (block stack,
          # indent shift, exit break) because its parent format is
          # "inline", not "block".
          tree_format(child["name"], child["content"], indent)
          add_to_pending(child["close_tag"])
          next
        end

        # If we get here, node is a block element.

        # - Break and flush any pending output
        # - Break and indent (no indent if break count is zero)
        # - Process element itself:
        #   - Put out opening tag
        #   - Put out element content
        #   - Put out any indent needed before closing tag. None needed if:
        #     - Element's exit-break is 0 (closing tag is not on new line,
        #       so don't indent it)
        #     - There is no separate closing tag (it was in <abc/> format)
        #     - Element has no children (tags will be written as
        #       <abc></abc>, so don't indent closing tag)
        #     - Element has children, but the block is not normalized and
        #       the last child is a text node
        #   - Put out closing tag

        flush_pending(indent)
        emit_break(indent) unless prev_child_is_text && !block_normalize
        set_block_break_type("element-break")
        add_to_doc(child["open_tag"])
        # Pass the current indent unchanged; the recursive call adds the
        # child's own subindent when it finds the child is a block.
        tree_format(child["name"], child["content"], indent)
        unless child_opts["exit-break"] <= 0 ||
            child["close_tag"].empty? ||
            child["content"].empty? ||
            (!child["content"].empty? &&
              child["content"].last["type"] == "text" &&
              child_opts["normalize"] == "no")
          add_to_doc(" " * indent)
        end
        add_to_doc(child["close_tag"])
        next
      end

      # Comments, PIs, etc. (everything other than text and elements),
      # treat similarly to verbatim block:
      # - Flush any pending output
      # - Put out a break
      # - Add node content to collected output

      flush_pending(indent)
      emit_break(0) unless prev_child_is_text && !block_normalize
      set_block_break_type("element-break")
      add_to_doc(child["content"])

    end

    # Carry the last child's text status forward so the exit-break test
    # below sees whether the final child was a text node.
    prev_child_is_text = cur_child_is_text

    # Done processing current element's children now.

    # If current element is a block element:
    # - If there were any children, flush any pending output and put
    #   out the exit break.
    # - Pop the block stack

    if par_opts["format"] == "block"
      if !children.empty?
        flush_pending(indent)
        set_block_break_type("exit-break")
        emit_break(0) unless prev_child_is_text && !block_normalize
      end
      end_block
    end

  end


  # Emit a break - the appropriate number of newlines according to the
  # enclosing block's current break type.

  # In addition, emit the number of spaces indicated by indent.  (indent
  # > 0 when breaking just before emitting an element tag that should
  # be indented within its parent element.)

  # Exception: Emit no indent if break count is zero. That indicates
  # any following output will be written on the same output line, not
  # indented on a new line.

  # Initially, when processing a node's child list, the break type is
  # set to entry-break. Each subsequent break is an element-break.
  # (After child list has been processed, an exit-break is produced as well.)

  def emit_break(indent)
    # How many newlines the enclosing block's current break type asks for.
    newline_count = block_break_value

    # Always hand the (possibly empty) run of newlines to the document.
    add_to_doc("\n" * newline_count)

    # Indent only when an indent was requested and a line break was
    # actually written; otherwise output continues on the same line.
    return unless indent > 0 && newline_count > 0
    add_to_doc(" " * indent)
  end
  private :emit_break


  # Flush pending output to output document collected thus far:
  # - Wrap pending contents as necessary, with indent before *each* line.
  # - Add pending text to output document (thus "flushing" it)
  # - Clear pending array.

  def flush_pending(indent)

    # Nothing collected, nothing to do.
    return if @pending.empty?

    # Non-normalized block: text must not be altered, so the pending
    # items are joined and written as they are (no break, no indent).
    #
    # Normalized block: a break is emitted first.  Then
    # - with wrapping disabled, the joined text is written, preceded by
    #   the indent only if the break value is positive (otherwise the
    #   text directly follows the preceding tag);
    # - with wrapping enabled, the text is wrapped; the first line gets
    #   the prevailing indent only if the break value is positive, and
    #   every following line gets the prevailing indent.
    #
    # Afterwards the pending list is reset and the break type advances
    # to element-break.

    if block_normalize
      emit_break(0)
      max_width = block_wrap_length
      newlines = block_break_value
      if max_width > 0
        lead = newlines > 0 ? indent : 0
        # Wrapped lines are joined by newlines; none is added at the end.
        text = line_wrap(@pending, lead, indent, max_width).join("\n")
      else
        text = newlines > 0 ? " " * indent : ""
        text += @pending.join("")
      end
    else
      text = @pending.join("")
    end

    add_to_doc(text)
    @pending = [ ]
    set_block_break_type("element-break")
  end
  private :flush_pending


  # Perform line-wrapping of string array to lines no longer than given
  # length (including indent).
  # Any word longer than line length appears by itself on line.
  # Return array of lines (not newline-terminated).

  # strs - array of text items to be joined and line-wrapped.
  # Each item may be:
  # - A tag (such as <emphasis role="bold">). This should be treated as
  #   an atomic unit, which is important for preserving inline tags intact.
  # - A possibly multi-word string (such as "This is a string"). In this
  #   latter case, line-wrapping preserves internal whitespace in the
  #   string, with the exception that if whitespace would be placed at
  #   the end of a line, it is discarded.

  # first_indent - indent for first line
  # rest_indent - indent for any remaining lines
  # max_len - maximum length of output lines (including indent)
  
  def line_wrap(strs, first_indent, rest_indent, max_len)
    # Returns an array of output lines (without trailing newlines).
    # The strings in strs are never modified.

    # First, tokenize the strings
    tokens = []
    strs.each do |str|
      if str =~ /\A</
        # String is a tag; treat as atomic unit and don't split
        tokens << str
      else
        # Split text into alternating runs of non-white and white chars.
        tokens.concat(str.scan(/\S+|\s+/))
      end
    end

    # Now merge tokens that are not separated by whitespace tokens. For
    # example, "<i>", "word", "</i>" gets merged to "<i>word</i>".  But
    # "<i>", " ", "word", " ", "</i>" gets left as separate tokens.

    merged = []
    tokens.each do |tok|
      # If there is a previous token that does not end with whitespace,
      # and the current token does not begin with whitespace, append the
      # current token to the previous one.  Otherwise start a new entry.
      if merged.last && merged.last !~ /\s\z/ && tok !~ /\A\s/
        merged.last << tok
      else
        # Store a copy: a tag token is the very string object the caller
        # passed in, and the in-place append above must not alter it
        # (it would also fail outright on a frozen string).
        merged << tok.dup
      end
    end

    lines = [ ]
    line = ""
    llen = 0
    # set the indent for the first line
    indent = first_indent
    # saved-up whitespace to put before next non-white word
    white = ""

    merged.each do |word|
      # If word is whitespace, save it. It gets added before next
      # word if no line-break occurs.
      if word =~ /\A\s/
        white += word
        next
      end
      wlen = word.size
      if llen == 0 || llen + white.length + wlen > max_len
        # Either this is the very first word, or the word (plus saved
        # whitespace) won't fit on the current line.  Start a new line;
        # it always gets at least one word, and saved whitespace is
        # discarded.
        lines << line unless llen == 0
        line = " " * indent + word
        llen = indent + wlen
        indent = rest_indent
      else
        # add word to current line with saved whitespace between
        line << white << word
        llen += white.length + wlen
      end
      white = ""
    end

    # push remaining line, if any
    lines << line unless line.empty?

    return lines
  end
  private :line_wrap

end # class XMLFormatter

end # module XMLFormat

# ----------------------------------------------------------------------

# Begin main program

include XMLFormat

# Help text printed for --help.  Each option that has a short form is
# listed as "--long, -s".
usage = "Usage: #{PROG_NAME} [options] xml-file

Options:
--help, -h
    Print this message and exit.
--backup suffix, -b suffix
    Back up the input document, adding suffix to the input
    filename to create the backup filename.
--canonized-output
    Proceed only as far as the document canonization stage,
    printing the result.
--check-parser
    Parse the document into tokens and verify that their
    concatenation is identical to the original input document.
    This option suppresses further document processing.
--config-file file_name, -f file_name
    Specify the configuration filename. If no file is named,
    xmlformat uses the file named by the environment variable
    XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it
    exists. Otherwise, xmlformat uses built-in formatting
    options.
--in-place, -i
    Format the document in place, replacing the contents of
    the input file with the reformatted document. (It's a
    good idea to use --backup along with this option.)
--show-config
    Show configuration options after reading configuration
    file. This option suppresses document processing.
--show-unconfigured-elements
    Show elements that are used in the document but for
    which no options were specified in the configuration
    file. This option suppresses document output.
--verbose, -v
    Be verbose about processing stages.
--version, -V
    Show version information and exit.
"

# Option settings, initialized to their defaults.  The option loop
# below overwrites them according to the command line.

help = false
backup_suffix = nil
conf_file = nil
canonize_only = false
check_parser = false
in_place = false
show_conf = false
show_unconf_elts = false
show_version = false
verbose = false

opts = GetoptLong.new(
  [ "--help",        "-h",    GetoptLong::NO_ARGUMENT ],
  [ "--backup", "-b",         GetoptLong::REQUIRED_ARGUMENT ],
  [ "--canonized-output",     GetoptLong::NO_ARGUMENT ],
  [ "--check-parser",         GetoptLong::NO_ARGUMENT ],
  [ "--config-file", "-f",    GetoptLong::REQUIRED_ARGUMENT ],
  [ "--in-place", "-i",       GetoptLong::NO_ARGUMENT ],
  [ "--show-config",          GetoptLong::NO_ARGUMENT ],
  # need better name
  [ "--show-unconfigured-elements",          GetoptLong::NO_ARGUMENT ],
  [ "--verbose", "-v",        GetoptLong::NO_ARGUMENT ],
  [ "--version", "-V",        GetoptLong::NO_ARGUMENT ]
)

# GetoptLong signals a bad command line (unknown option, missing
# argument, ...) by raising a subclass of GetoptLong::Error while the
# options are being iterated.  Catch it and show the usage message
# instead of letting the exception end the script with a backtrace.
# The exit status stays non-zero.

begin
  opts.each do |opt, arg|
    case opt
    when "--help"
      help = true
    when "--backup"
      backup_suffix = arg
    when "--canonized-output"
      canonize_only = true
    when "--check-parser"
      check_parser = true
    when "--config-file"
      conf_file = arg
    when "--in-place"
      in_place = true
    when "--show-config"
      show_conf = true
    when "--show-unconfigured-elements"
      show_unconf_elts = true
    when "--version"
      show_version = true
    when "--verbose"
      verbose = true
    else
      die "LOGIC ERROR: unhandled option: #{opt}\n"
    end
  end
rescue GetoptLong::Error
  $stderr.print usage
  exit(1)
end

# --help and --version are informational: print the requested text and
# exit successfully.  --help takes precedence when both are given.

if help || show_version
  puts(help ? usage : "#{PROG_NAME} #{PROG_VERSION} (#{PROG_LANG} version)")
  exit(0)
end

# --in-place option requires a named file

if in_place && ARGV.empty?
  warn "WARNING: --in-place/-i option ignored (requires named input files)\n"
end

# --backup/-b is meaningless without --in-place

if backup_suffix && !in_place
  die "--backup/-b option meaningless without --in-place/-i option\n"
end

# Save input filenames
#in_file = ARGV.dup

xf = XMLFormatter.new

# Choose the configuration file, in order of precedence:
# 1. the file named with --config-file/-f;
# 2. the file named by the XMLFORMAT_CONF environment variable;
# 3. ./xmlformat.conf, but only if it is readable and not a directory.
# If none applies, conf_file stays nil and no configuration is read.

env_conf_file = ENV["XMLFORMAT_CONF"]
def_conf_file = "./xmlformat.conf"

conf_file ||= env_conf_file

if conf_file.nil? &&
    FileTest.readable?(def_conf_file) &&
    !FileTest.directory?(def_conf_file)
  conf_file = def_conf_file
end

# A file named explicitly or through the environment must be usable;
# otherwise stop with an error message.

unless conf_file.nil?
  warn "Reading configuration file...\n" if verbose
  unless FileTest.readable?(conf_file)
    die "Configuration file '#{conf_file}' is not readable.\n"
  end
  if FileTest.directory?(conf_file)
    die "Configuration file '#{conf_file}' is a directory.\n"
  end
  xf.read_config(conf_file)
end

# --show-config: show configuration and exit

if show_conf
  xf.display_config
  exit(0)
end

# Process arguments.
# - If no files named, read string, write to stdout.
# - If files named, read and process each one. Write output to stdout
#   unless --in-place option was given.  Make backup of original file
#   if --backup option was given.

if ARGV.length == 0
  # No files named: read the whole document from standard input and
  # write any result to standard output.
  warn "Reading document...\n" if verbose
  in_doc = ""
  while gets; in_doc << $_; end

  out_doc = xf.process_doc(in_doc,
              verbose, check_parser, canonize_only, show_unconf_elts)
  if !out_doc.nil?
    warn "Writing output document...\n" if verbose
    print out_doc
  end
else
  ARGV.each do |file|
    warn "Reading document #{file}...\n" if verbose
    in_doc = ""
    # A missing or unreadable input file (or a directory) is reported
    # with an error message, in the same way as an unusable
    # configuration file, rather than as an uncaught Errno exception.
    begin
      File.open(file) do |fh|
        fh.each_line do |line|
          in_doc << line
        end
      end
    rescue SystemCallError => e
      die "Cannot read #{file}: #{e.message}\n"
    end
    out_doc = xf.process_doc(in_doc,
                verbose, check_parser, canonize_only, show_unconf_elts)
    # process_doc returned nil: there is nothing to write for this file.
    next if out_doc.nil?
    if in_place
      if backup_suffix
        warn "Making backup of #{file} to #{file}#{backup_suffix}...\n" if verbose
        # The backup is made by renaming the original; the output is
        # then written to a new file under the original name.
        File.rename(file, file + backup_suffix)
      end
      warn "Writing output document to #{file}...\n" if verbose
      File.open(file, "w") do |fh|
        fh.print out_doc
      end
    else
      warn "Writing output document...\n" if verbose
      print out_doc
    end
  end
end

warn "Done!\n" if verbose

exit(0)