diff --git a/README.md b/README.md
index 3d9e94e..419c4d2 100644
--- a/README.md
+++ b/README.md
@@ -59,10 +59,6 @@ defmodule MyEventHandler do
IO.inspect "Receive characters #{chars}"
[{:chacters, chars} | state]
end
-
- def handle_entity_reference(reference_name) do
- MyHTMLEntityConverter.convert(reference_name)
- end
end
```
diff --git a/lib/saxy.ex b/lib/saxy.ex
index f284a2e..64c5e0a 100644
--- a/lib/saxy.ex
+++ b/lib/saxy.ex
@@ -21,21 +21,19 @@ defmodule Saxy do
## Encoding
- Saxy supports ASCII and UTF-8 encodings and respects the encoding set in XML document prolog. That
- means that if the prolog declares an encoding that is not supported, it simply stops parsing and returns.
+ Saxy supports the UTF-8 encoding and respects the encoding set in the XML document prolog. This
+ means that if the prolog declares an encoding that Saxy does not support, it stops parsing.
- Though encoding declaration is optional in XML, so when encoding is missing in the document, UTF-8 will be
+ The encoding declaration is optional in XML; when it is missing from the document, UTF-8 will be
the default encoding.
## Reference
- Saxy converts character references by default, for example `A` is converted to `"A"` and `&` is
- converted to `"&"`.
+ Saxy expands character references and XML 1.0 predefined entity references by default, for example `&#65;`
+ is expanded to `"A"`, `&#x26;` to `"&"`, and `&amp;` to `"&"`.
- The parser **DOES NOT** convert any entity reference, the handler that uses `Saxy.Handler` behaviour needs to convert
- all entity references during parsing by implementing `handle_entity_reference/1` callback.
-
- See `Saxy.Handler` for more details.
+ Saxy does not expand external entity references, but provides an option where you can specify the strategy
+ of how they should be handled. See more in `Saxy.parse_string/4`.
## Creation of atoms
@@ -45,6 +43,13 @@ defmodule Saxy do
Saxy does not support XSD schemas.
+ ## Shared options
+
+ * `:expand_entity` - specifies how external entity references should be handled. The three supported strategies are:
+ * `:keep` - keep the original binary, for example `Orange &reg;` will be expanded to `"Orange &reg;"`. This is the default strategy.
+ * `:skip` - skip the original binary, for example `Orange &reg;` will be expanded to `"Orange "`.
+ * `{mod, fun, args}` - take the applied result of the specified MFA.
+
"""
alias Saxy.{Parser, ParsingError, State}
@@ -59,6 +64,10 @@ defmodule Saxy do
The third argument `state` can be used to keep track of data and parsing progress when parsing is happening, which will be
returned when parsing finishes.
+ ### Options
+
+ See the “Shared options” section in the module documentation.
+
## Examples
defmodule MyEventHandler do
@@ -88,10 +97,6 @@ defmodule Saxy do
IO.inspect "Receive characters #{chars}"
[{:chacters, chars} | state]
end
-
- def handle_entity_reference(reference_name) do
- MyEntitiesConverter.convert(reference_name)
- end
end
iex> xml = ""
@@ -107,18 +112,21 @@ defmodule Saxy do
@spec parse_string(
data :: binary,
- handler :: module | function,
- state :: term
- ) :: {:ok, state :: term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()}
-
- def parse_string(data, handler, state) when is_binary(data) and is_atom(handler) do
- initial_state = %State{
+ handler :: module() | function(),
+ initial_state :: term(),
+ options :: Keyword.t()
+ ) :: {:ok, state :: term()} | {:error, exception :: ParsingError.t()}
+ def parse_string(data, handler, initial_state, options \\ []) when is_binary(data) and is_atom(handler) do
+ expand_entity = Keyword.get(options, :expand_entity, :keep)
+
+ state = %State{
prolog: nil,
handler: handler,
- user_state: state
+ user_state: initial_state,
+ expand_entity: expand_entity
}
- Parser.parse_document(data, :done, initial_state)
+ Parser.parse_document(data, :done, state)
end
@doc ~S"""
@@ -157,10 +165,6 @@ defmodule Saxy do
IO.inspect "Receive characters #{chars}"
[{:chacters, chars} | state]
end
-
- def handle_entity_reference(reference_name) do
- MyEntitiesConverter.convert(reference_name)
- end
end
iex> stream = File.stream!("/path/to/file.xml")
@@ -179,21 +183,29 @@ defmodule Saxy do
in each chunk in the file you want to buffer. Anyway, Saxy will try trimming off the parsed parts of buffer
when it exceeds 2048 bytes (this number is not configurable yet) to keep the memory usage in a reasonable limit.
+ ### Options
+
+ See the “Shared options” section in the module documentation.
+
"""
@spec parse_stream(
stream :: File.Stream.t() | Stream.t(),
- handler :: module | function,
- state :: term
- ) :: {:ok, state :: term} | {:error, exception :: ParsingError.t()}
+ handler :: module() | function(),
+ initial_state :: term(),
+ options :: Keyword.t()
+ ) :: {:ok, state :: term()} | {:error, exception :: ParsingError.t()}
+
+ def parse_stream(%module{} = stream, handler, initial_state, options \\ []) when module in [File.Stream, Stream] do
+ expand_entity = Keyword.get(options, :expand_entity, :keep)
- def parse_stream(%module{} = stream, handler, state) when module in [File.Stream, Stream] do
- initial_state = %State{
+ state = %State{
prolog: nil,
handler: handler,
- user_state: state
+ user_state: initial_state,
+ expand_entity: expand_entity
}
- Parser.parse_document(<<>>, stream, initial_state)
+ Parser.parse_document(<<>>, stream, state)
end
end
diff --git a/lib/saxy/emitter.ex b/lib/saxy/emitter.ex
index 0a6039e..8208f52 100644
--- a/lib/saxy/emitter.ex
+++ b/lib/saxy/emitter.ex
@@ -16,13 +16,27 @@ defmodule Saxy.Emitter do
end
end
+ defp do_emit(event_type, data, handler, user_state) do
+ handler.handle_event(event_type, data, user_state)
+ end
+
@compile {:inline, [convert_entity_reference: 2]}
- def convert_entity_reference(reference_name, %State{handler: handler}) do
- handler.handle_entity_reference(reference_name)
- end
+ def convert_entity_reference("amp", _state), do: [?&]
- defp do_emit(event_type, data, handler, user_state) do
- handler.handle_event(event_type, data, user_state)
+ def convert_entity_reference("lt", _state), do: [?<]
+
+ def convert_entity_reference("gt", _state), do: [?>]
+
+ def convert_entity_reference("apos", _state), do: [?']
+
+ def convert_entity_reference("quot", _state), do: [?"]
+
+ def convert_entity_reference(reference_name, state) do
+ case state.expand_entity do
+ :keep -> [?&, reference_name, ?;]
+ :skip -> []
+ {mod, fun, args} -> apply(mod, fun, [reference_name | args])
+ end
end
end
diff --git a/lib/saxy/handler.ex b/lib/saxy/handler.ex
index 1f0cde2..5287157 100644
--- a/lib/saxy/handler.ex
+++ b/lib/saxy/handler.ex
@@ -20,7 +20,7 @@ defmodule Saxy.Handler do
## SAX Events
- There are 6 types of event need to be handled in the handler.
+ There are several types of events that need to be handled in the handler.
### `:start_document`
@@ -74,42 +74,9 @@ defmodule Saxy.Handler do
IO.inspect "Receive characters #{chars}"
{:ok, [{:chacters, chars} | state]}
end
-
- def handle_entity_reference(reference_name) do
- MyHTMLEntities.convert(reference_name)
- end
end
"""
@callback handle_event(event_type :: atom, data :: any, user_state :: any) ::
{:ok, user_state :: any} | {:stop, returning :: any}
-
- @doc """
- Callback for entity reference conversion.
-
- Saxy does not handle any entity reference conversion during parsing, this callback will be triggered every
- time the parser encounters an entity reference.
-
- ## Examples
-
- defmodule MyEventHandler do
- def handle_event(_event_type, _event_data, state) do
- {:ok, state}
- end
-
- def handle_entity_reference("amp") do
- "&"
- end
-
- def handle_entity_reference("gt") do
- "<"
- end
-
- def handle_entity_reference(_unknown) do
- ""
- end
- end
- """
-
- @callback handle_entity_reference(reference_name :: binary) :: binary
end
diff --git a/lib/saxy/simple_form.ex b/lib/saxy/simple_form.ex
index ef6c73e..11c4cc6 100644
--- a/lib/saxy/simple_form.ex
+++ b/lib/saxy/simple_form.ex
@@ -50,9 +50,25 @@ defmodule Saxy.SimpleForm do
"""
- @spec parse_string(data :: binary) :: {:ok, term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()}
+ @doc """
+ Parse given string into simple form.
- def parse_string(data) when is_binary(data) do
- Saxy.parse_string(data, __MODULE__.Handler, [])
+ ## Options
+
+ * `:expand_entity` - specifies how external entity references should be handled. The three supported strategies are:
+ * `:keep` - keep the original binary, for example `Orange &reg;` will be expanded to `"Orange &reg;"`. This is the default strategy.
+ * `:skip` - skip the original binary, for example `Orange &reg;` will be expanded to `"Orange "`.
+ * `{mod, fun, args}` - take the applied result of the specified MFA.
+
+ """
+
+ @spec parse_string(data :: binary, options :: Keyword.t()) ::
+ {:ok, term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()}
+
+ def parse_string(data, options \\ []) when is_binary(data) do
+ case Saxy.parse_string(data, __MODULE__.Handler, {[], options}, options) do
+ {:ok, {stack, _options}} -> {:ok, stack}
+ {:error, _reason} = error -> error
+ end
end
end
diff --git a/lib/saxy/simple_form/handler.ex b/lib/saxy/simple_form/handler.ex
index bc6e684..e422080 100644
--- a/lib/saxy/simple_form/handler.ex
+++ b/lib/saxy/simple_form/handler.ex
@@ -3,45 +3,44 @@ defmodule Saxy.SimpleForm.Handler do
@behaviour Saxy.Handler
- def handle_event(:start_document, _prolog, stack) do
- {:ok, stack}
+ def handle_event(:start_document, _prolog, state) do
+ {:ok, state}
end
- def handle_event(:start_element, {tag_name, attributes}, stack) do
+ def handle_event(:start_element, {tag_name, attributes}, state) do
+ {stack, options} = state
tag = {tag_name, attributes, []}
- {:ok, [tag | stack]}
+ {:ok, {[tag | stack], options}}
end
- def handle_event(:characters, chars, stack) do
+ def handle_event(:characters, chars, state) do
+ {stack, options} = state
[{tag_name, attributes, content} | stack] = stack
current = {tag_name, attributes, [chars | content]}
- {:ok, [current | stack]}
+ {:ok, {[current | stack], options}}
end
- def handle_event(:end_element, tag_name, stack) do
+ def handle_event(:end_element, tag_name, state) do
+ {stack, options} = state
[{^tag_name, attributes, content} | stack] = stack
current = {tag_name, attributes, Enum.reverse(content)}
case stack do
[] ->
- {:ok, [current]}
+ {:ok, {[current], options}}
[parent | rest] ->
{parent_tag_name, parent_attributes, parent_content} = parent
parent = {parent_tag_name, parent_attributes, [current | parent_content]}
- {:ok, [parent | rest]}
+ {:ok, {[parent | rest], options}}
end
end
- def handle_event(:end_document, _, stack) do
- {:ok, stack}
- end
-
- def handle_entity_reference(name) do
- [?&, name, ?;]
+ def handle_event(:end_document, _, state) do
+ {:ok, state}
end
end
diff --git a/lib/saxy/state.ex b/lib/saxy/state.ex
index e1398ae..c1bfb9e 100644
--- a/lib/saxy/state.ex
+++ b/lib/saxy/state.ex
@@ -1,7 +1,7 @@
defmodule Saxy.State do
@moduledoc false
- @enforce_keys [:handler, :user_state, :prolog]
+ @enforce_keys [:handler, :user_state, :prolog, :expand_entity]
defstruct @enforce_keys ++ [stack: []]
end
diff --git a/test/saxy/parser/element_test.exs b/test/saxy/parser/element_test.exs
index f3ebe2c..b458056 100644
--- a/test/saxy/parser/element_test.exs
+++ b/test/saxy/parser/element_test.exs
@@ -312,7 +312,8 @@ defmodule Saxy.Parser.ElementTest do
%Saxy.State{
prolog: nil,
handler: StackHandler,
- user_state: state
+ user_state: state,
+ expand_entity: :keep
}
end
diff --git a/test/saxy/parser/prolog_test.exs b/test/saxy/parser/prolog_test.exs
index 87c7549..957d3bb 100644
--- a/test/saxy/parser/prolog_test.exs
+++ b/test/saxy/parser/prolog_test.exs
@@ -236,7 +236,8 @@ defmodule Saxy.Parser.PrologTest do
%Saxy.State{
prolog: nil,
handler: StackHandler,
- user_state: state
+ user_state: state,
+ expand_entity: :keep
}
end
diff --git a/test/saxy/parser_test.exs b/test/saxy/parser_test.exs
index 1df084c..c32094f 100644
--- a/test/saxy/parser_test.exs
+++ b/test/saxy/parser_test.exs
@@ -8,7 +8,7 @@ defmodule Saxy.ParserTest do
test "streaming parsing" do
buffer = ""
stream = File.stream!("./test/support/fixture/food.xml", [], 200)
- state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []}
+ state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep}
assert {:ok, state} = parse_document(buffer, stream, state)
@@ -16,7 +16,7 @@ defmodule Saxy.ParserTest do
buffer = ""
stream = File.stream!("./test/support/fixture/complex.xml", [], 200)
- state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []}
+ state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep}
assert {:ok, state} = parse_document(buffer, stream, state)
@@ -25,14 +25,14 @@ defmodule Saxy.ParserTest do
test "binary parsing" do
buffer = File.read!("./test/support/fixture/food.xml")
- state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []}
+ state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep}
assert {:ok, state} = parse_document(buffer, :done, state)
assert length(state) == 74
buffer = File.read!("./test/support/fixture/complex.xml")
- state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []}
+ state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep}
assert {:ok, state} = parse_document(buffer, :done, state)
diff --git a/test/saxy/simple_form_test.exs b/test/saxy/simple_form_test.exs
index b064488..dbaf537 100644
--- a/test/saxy/simple_form_test.exs
+++ b/test/saxy/simple_form_test.exs
@@ -1,7 +1,7 @@
defmodule Saxy.SimpleFormTest do
use ExUnit.Case, async: true
- test "parse_string/1 with simple XML" do
+ test "parse_string/1 with simple XML and default options" do
xml = """