Skip to content

Commit

Permalink
Support custom entity handling in simple form parsing (#14)
Browse files Browse the repository at this point in the history
This commit has a couple of changes:

* Supports converting XML 1.0 predefined entities.
* Hard deprecates "handle_entity_reference" event in handlers.
  • Loading branch information
qcam authored Apr 7, 2018
1 parent 291cfce commit e61f469
Show file tree
Hide file tree
Showing 13 changed files with 181 additions and 116 deletions.
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,6 @@ defmodule MyEventHandler do
IO.inspect "Receive characters #{chars}"
[{:chacters, chars} | state]
end

def handle_entity_reference(reference_name) do
MyHTMLEntityConverter.convert(reference_name)
end
end
```

Expand Down
76 changes: 44 additions & 32 deletions lib/saxy.ex
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,19 @@ defmodule Saxy do
## Encoding
Saxy supports ASCII and UTF-8 encodings and respects the encoding set in XML document prolog. That
means that if the prolog declares an encoding that is not supported, it simply stops parsing and returns.
Saxy supports UTF-8 encodings and respects the encoding set in XML document prolog, that
means that if the prolog declares an encoding that Saxy does not support, it stops parsing.
Though encoding declaration is optional in XML, so when encoding is missing in the document, UTF-8 will be
Though encoding declaration is optional in XML, when encoding is missing in the document, UTF-8 will be
the default encoding.
## Reference
Saxy converts character references by default, for example `&#65;` is converted to `"A"` and `&#x26;` is
converted to `"&"`.
Saxy expands character references and XML 1.0 predefined entity references by default, for example `&#65;`
is expanded to `"A"`, `&#x26;` to `"&"`, and `&amp;` to `"&"`.
The parser **DOES NOT** convert any entity reference, the handler that uses `Saxy.Handler` behaviour needs to convert
all entity references during parsing by implementing `handle_entity_reference/1` callback.
See `Saxy.Handler` for more details.
Saxy does not expand external entity references, but provides an option where you can specify the strategy
of how they should be handled. See more in `Saxy.parse_string/4`.
## Creation of atoms
Expand All @@ -45,6 +43,13 @@ defmodule Saxy do
Saxy does not support XSD schemas.
## Shared options
* `:expand_entity` - specifies how external entity references should be handled. Three supported strategies respectively are:
* `:keep` - keep the original binary, for example `Orange &reg;` will be expanded to `"Orange &reg;"`, this is the default strategy.
* `:skip` - skip the original binary, for example `Orange &reg;` will be expanded to `"Orange "`.
* `{mod, fun, args}` - take the applied result of the specified MFA.
"""

alias Saxy.{Parser, ParsingError, State}
Expand All @@ -59,6 +64,10 @@ defmodule Saxy do
The third argument `state` can be used to keep track of data and parsing progress when parsing is happening, which will be
returned when parsing finishes.
### Options
See the “Shared options” section at the module documentation.
## Examples
defmodule MyEventHandler do
Expand Down Expand Up @@ -88,10 +97,6 @@ defmodule Saxy do
IO.inspect "Receive characters #{chars}"
[{:chacters, chars} | state]
end
def handle_entity_reference(reference_name) do
MyEntitiesConverter.convert(reference_name)
end
end
iex> xml = "<?xml version='1.0' ?><foo bar='value'></foo>"
Expand All @@ -107,18 +112,21 @@ defmodule Saxy do

@spec parse_string(
data :: binary,
handler :: module | function,
state :: term
) :: {:ok, state :: term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()}

def parse_string(data, handler, state) when is_binary(data) and is_atom(handler) do
initial_state = %State{
handler :: module() | function(),
initial_state :: term(),
options :: Keyword.t()
) :: {:ok, state :: term()} | {:error, exception :: ParsingError.t()}
def parse_string(data, handler, initial_state, options \\ []) when is_binary(data) and is_atom(handler) do
expand_entity = Keyword.get(options, :expand_entity, :keep)

state = %State{
prolog: nil,
handler: handler,
user_state: state
user_state: initial_state,
expand_entity: expand_entity
}

Parser.parse_document(data, :done, initial_state)
Parser.parse_document(data, :done, state)
end

@doc ~S"""
Expand Down Expand Up @@ -157,10 +165,6 @@ defmodule Saxy do
IO.inspect "Receive characters #{chars}"
[{:chacters, chars} | state]
end
def handle_entity_reference(reference_name) do
MyEntitiesConverter.convert(reference_name)
end
end
iex> stream = File.stream!("/path/to/file.xml")
Expand All @@ -179,21 +183,29 @@ defmodule Saxy do
in each chunk in the file you want to buffer. Anyway, Saxy will try trimming off the parsed parts of buffer
when it exceeds 2048 bytes (this number is not configurable yet) to keep the memory usage in a reasonable limit.
### Options
See the “Shared options” section at the module documentation.
"""

@spec parse_stream(
stream :: File.Stream.t() | Stream.t(),
handler :: module | function,
state :: term
) :: {:ok, state :: term} | {:error, exception :: ParsingError.t()}
handler :: module() | function(),
initial_state :: term(),
options :: Keyword.t()
) :: {:ok, state :: term()} | {:error, exception :: ParsingError.t()}

def parse_stream(%module{} = stream, handler, initial_state, options \\ []) when module in [File.Stream, Stream] do
expand_entity = Keyword.get(options, :expand_entity, :keep)

def parse_stream(%module{} = stream, handler, state) when module in [File.Stream, Stream] do
initial_state = %State{
state = %State{
prolog: nil,
handler: handler,
user_state: state
user_state: initial_state,
expand_entity: expand_entity
}

Parser.parse_document(<<>>, stream, initial_state)
Parser.parse_document(<<>>, stream, state)
end
end
24 changes: 19 additions & 5 deletions lib/saxy/emitter.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,27 @@ defmodule Saxy.Emitter do
end
end

defp do_emit(event_type, data, handler, user_state) do
handler.handle_event(event_type, data, user_state)
end

@compile {:inline, [convert_entity_reference: 2]}

def convert_entity_reference(reference_name, %State{handler: handler}) do
handler.handle_entity_reference(reference_name)
end
def convert_entity_reference("amp", _state), do: [?&]

defp do_emit(event_type, data, handler, user_state) do
handler.handle_event(event_type, data, user_state)
def convert_entity_reference("lt", _state), do: [?<]

def convert_entity_reference("gt", _state), do: [?>]

def convert_entity_reference("apos", _state), do: [?']

def convert_entity_reference("quot", _state), do: [?"]

# Fallback clause for reference names that no more specific clause matched.
# The result is selected by the `expand_entity` strategy stored in `state`.
def convert_entity_reference(reference_name, state) do
case state.expand_entity do
# Rebuild the reference as iodata: `&`, the name, then `;`.
:keep -> [?&, reference_name, ?;]
# Contribute nothing in place of the reference.
:skip -> []
# Call the configured function with the reference name prepended to `args`.
{mod, fun, args} -> apply(mod, fun, [reference_name | args])
end
end
end
35 changes: 1 addition & 34 deletions lib/saxy/handler.ex
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ defmodule Saxy.Handler do
## SAX Events
There are 6 types of event need to be handled in the handler.
There are a couple of events that need to be handled in the handler.
### `:start_document`
Expand Down Expand Up @@ -74,42 +74,9 @@ defmodule Saxy.Handler do
IO.inspect "Receive characters #{chars}"
{:ok, [{:chacters, chars} | state]}
end
def handle_entity_reference(reference_name) do
MyHTMLEntities.convert(reference_name)
end
end
"""

@callback handle_event(event_type :: atom, data :: any, user_state :: any) ::
{:ok, user_state :: any} | {:stop, returning :: any}

@doc """
Callback for entity reference conversion.
Saxy does not handle any entity reference conversion during parsing, this callback will be triggered every
time the parser encounters an entity reference.
## Examples
defmodule MyEventHandler do
def handle_event(_event_type, _event_data, state) do
{:ok, state}
end
def handle_entity_reference("amp") do
"&"
end
def handle_entity_reference("gt") do
">"
end
def handle_entity_reference(_unknown) do
""
end
end
"""

@callback handle_entity_reference(reference_name :: binary) :: binary
end
22 changes: 19 additions & 3 deletions lib/saxy/simple_form.ex
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,25 @@ defmodule Saxy.SimpleForm do
"""

@spec parse_string(data :: binary) :: {:ok, term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()}
@doc """
Parse given string into simple form.
def parse_string(data) when is_binary(data) do
Saxy.parse_string(data, __MODULE__.Handler, [])
## Options
* `:expand_entity` - specifies how external entity references should be handled. Three supported strategies respectively are:
* `:keep` - keep the original binary, for example `Orange &reg;` will be expanded to `"Orange &reg;"`, this is the default strategy.
* `:skip` - skip the original binary, for example `Orange &reg;` will be expanded to `"Orange "`.
* `{mod, fun, args}` - take the applied result of the specified MFA.
"""

@spec parse_string(data :: binary, options :: Keyword.t()) ::
{:ok, term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()}

# Parses `data` with the simple-form handler. The user state handed to
# `Saxy.parse_string/4` is a `{stack, options}` tuple seeded with an empty stack.
def parse_string(data, options \\ []) when is_binary(data) do
case Saxy.parse_string(data, __MODULE__.Handler, {[], options}, options) do
# Unwrap the tuple so the caller receives only the stack.
{:ok, {stack, _options}} -> {:ok, stack}
# Errors are returned unchanged.
{:error, _reason} = error -> error
end
end
end
29 changes: 14 additions & 15 deletions lib/saxy/simple_form/handler.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,44 @@ defmodule Saxy.SimpleForm.Handler do

@behaviour Saxy.Handler

def handle_event(:start_document, _prolog, stack) do
{:ok, stack}
def handle_event(:start_document, _prolog, state) do
{:ok, state}
end

def handle_event(:start_element, {tag_name, attributes}, stack) do
def handle_event(:start_element, {tag_name, attributes}, state) do
{stack, options} = state
tag = {tag_name, attributes, []}

{:ok, [tag | stack]}
{:ok, {[tag | stack], options}}
end

def handle_event(:characters, chars, stack) do
def handle_event(:characters, chars, state) do
{stack, options} = state
[{tag_name, attributes, content} | stack] = stack

current = {tag_name, attributes, [chars | content]}

{:ok, [current | stack]}
{:ok, {[current | stack], options}}
end

def handle_event(:end_element, tag_name, stack) do
def handle_event(:end_element, tag_name, state) do
{stack, options} = state
[{^tag_name, attributes, content} | stack] = stack

current = {tag_name, attributes, Enum.reverse(content)}

case stack do
[] ->
{:ok, [current]}
{:ok, {[current], options}}

[parent | rest] ->
{parent_tag_name, parent_attributes, parent_content} = parent
parent = {parent_tag_name, parent_attributes, [current | parent_content]}
{:ok, [parent | rest]}
{:ok, {[parent | rest], options}}
end
end

def handle_event(:end_document, _, stack) do
{:ok, stack}
end

def handle_entity_reference(name) do
[?&, name, ?;]
def handle_event(:end_document, _, state) do
{:ok, state}
end
end
2 changes: 1 addition & 1 deletion lib/saxy/state.ex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
defmodule Saxy.State do
@moduledoc false

@enforce_keys [:handler, :user_state, :prolog]
@enforce_keys [:handler, :user_state, :prolog, :expand_entity]

defstruct @enforce_keys ++ [stack: []]
end
3 changes: 2 additions & 1 deletion test/saxy/parser/element_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,8 @@ defmodule Saxy.Parser.ElementTest do
%Saxy.State{
prolog: nil,
handler: StackHandler,
user_state: state
user_state: state,
expand_entity: :keep
}
end

Expand Down
3 changes: 2 additions & 1 deletion test/saxy/parser/prolog_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,8 @@ defmodule Saxy.Parser.PrologTest do
%Saxy.State{
prolog: nil,
handler: StackHandler,
user_state: state
user_state: state,
expand_entity: :keep
}
end

Expand Down
8 changes: 4 additions & 4 deletions test/saxy/parser_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ defmodule Saxy.ParserTest do
test "streaming parsing" do
buffer = ""
stream = File.stream!("./test/support/fixture/food.xml", [], 200)
state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []}
state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep}

assert {:ok, state} = parse_document(buffer, stream, state)

assert length(state) == 74

buffer = ""
stream = File.stream!("./test/support/fixture/complex.xml", [], 200)
state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []}
state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep}

assert {:ok, state} = parse_document(buffer, stream, state)

Expand All @@ -25,14 +25,14 @@ defmodule Saxy.ParserTest do

test "binary parsing" do
buffer = File.read!("./test/support/fixture/food.xml")
state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []}
state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep}

assert {:ok, state} = parse_document(buffer, :done, state)

assert length(state) == 74

buffer = File.read!("./test/support/fixture/complex.xml")
state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []}
state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep}

assert {:ok, state} = parse_document(buffer, :done, state)

Expand Down
Loading

0 comments on commit e61f469

Please sign in to comment.