diff --git a/README.md b/README.md index 3d9e94e..419c4d2 100644 --- a/README.md +++ b/README.md @@ -59,10 +59,6 @@ defmodule MyEventHandler do IO.inspect "Receive characters #{chars}" [{:chacters, chars} | state] end - - def handle_entity_reference(reference_name) do - MyHTMLEntityConverter.convert(reference_name) - end end ``` diff --git a/lib/saxy.ex b/lib/saxy.ex index f284a2e..64c5e0a 100644 --- a/lib/saxy.ex +++ b/lib/saxy.ex @@ -21,21 +21,19 @@ defmodule Saxy do ## Encoding - Saxy supports ASCII and UTF-8 encodings and respects the encoding set in XML document prolog. That - means that if the prolog declares an encoding that is not supported, it simply stops parsing and returns. + Saxy supports UTF-8 encodings and respects the encoding set in XML document prolog, that + means that if the prolog declares an encoding that Saxy does not support, it stops parsing. - Though encoding declaration is optional in XML, so when encoding is missing in the document, UTF-8 will be + Though encoding declaration is optional in XML, when encoding is missing in the document, UTF-8 will be the default encoding. ## Reference - Saxy converts character references by default, for example `&#65;` is converted to `"A"` and `&#x26;` is - converted to `"&"`. + Saxy expands character references and XML 1.0 predefined entity references by default, for example `&#65;` + is expanded to `"A"`, `&#x26;` to `"&"`, and `&amp;` to `"&"`. - The parser **DOES NOT** convert any entity reference, the handler that uses `Saxy.Handler` behaviour needs to convert - all entity references during parsing by implementing `handle_entity_reference/1` callback. - - See `Saxy.Handler` for more details. + Saxy does not expand external entity references, but provides an option where you can specify the strategy + of how they should be handled. See more in `Saxy.parse_string/4`. ## Creation of atoms @@ -45,6 +43,13 @@ defmodule Saxy do Saxy does not support XSD schemas. 
+ ## Shared options + + * `:expand_entity` - specifies how external entity references should be handled. Three supported strategies respectively are: + * `:keep` - keep the original binary, for example `Orange &reg;` will be expanded to `"Orange &reg;"`, this is the default strategy. + * `:skip` - skip the original binary, for example `Orange &reg;` will be expanded to `"Orange "`. + * `{mod, fun, args}` - take the applied result of the specified MFA. + """ alias Saxy.{Parser, ParsingError, State} @@ -59,6 +64,10 @@ defmodule Saxy do The third argument `state` can be used to keep track of data and parsing progress when parsing is happening, which will be returned when parsing finishes. + ### Options + + See the “Shared options” section at the module documentation. + ## Examples defmodule MyEventHandler do @@ -88,10 +97,6 @@ defmodule Saxy do IO.inspect "Receive characters #{chars}" [{:chacters, chars} | state] end - - def handle_entity_reference(reference_name) do - MyEntitiesConverter.convert(reference_name) - end end iex> xml = "" @@ -107,18 +112,21 @@ defmodule Saxy do @spec parse_string( data :: binary, - handler :: module | function, - state :: term - ) :: {:ok, state :: term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()} - - def parse_string(data, handler, state) when is_binary(data) and is_atom(handler) do - initial_state = %State{ + handler :: module() | function(), + initial_state :: term(), + options :: Keyword.t() + ) :: {:ok, state :: term()} | {:error, exception :: ParsingError.t()} + def parse_string(data, handler, initial_state, options \\ []) when is_binary(data) and is_atom(handler) do + expand_entity = Keyword.get(options, :expand_entity, :keep) + + state = %State{ prolog: nil, handler: handler, - user_state: state + user_state: initial_state, + expand_entity: expand_entity } - Parser.parse_document(data, :done, initial_state) + Parser.parse_document(data, :done, state) end @doc ~S""" @@ -157,10 +165,6 @@ defmodule Saxy do IO.inspect 
"Receive characters #{chars}" [{:chacters, chars} | state] end - - def handle_entity_reference(reference_name) do - MyEntitiesConverter.convert(reference_name) - end end iex> stream = File.stream!("/path/to/file.xml") @@ -179,21 +183,29 @@ defmodule Saxy do in each chunk in the file you want to buffer. Anyway, Saxy will try trimming off the parsed parts of buffer when it exceeds 2048 bytes (this number is not configurable yet) to keep the memory usage in a reasonable limit. + ### Options + + See the “Shared options” section at the module documentation. + """ @spec parse_stream( stream :: File.Stream.t() | Stream.t(), - handler :: module | function, - state :: term - ) :: {:ok, state :: term} | {:error, exception :: ParsingError.t()} + handler :: module() | function(), + initial_state :: term(), + options :: Keyword.t() + ) :: {:ok, state :: term()} | {:error, exception :: ParsingError.t()} + + def parse_stream(%module{} = stream, handler, initial_state, options \\ []) when module in [File.Stream, Stream] do + expand_entity = Keyword.get(options, :expand_entity, :keep) - def parse_stream(%module{} = stream, handler, state) when module in [File.Stream, Stream] do - initial_state = %State{ + state = %State{ prolog: nil, handler: handler, - user_state: state + user_state: initial_state, + expand_entity: expand_entity } - Parser.parse_document(<<>>, stream, initial_state) + Parser.parse_document(<<>>, stream, state) end end diff --git a/lib/saxy/emitter.ex b/lib/saxy/emitter.ex index 0a6039e..8208f52 100644 --- a/lib/saxy/emitter.ex +++ b/lib/saxy/emitter.ex @@ -16,13 +16,27 @@ defmodule Saxy.Emitter do end end + defp do_emit(event_type, data, handler, user_state) do + handler.handle_event(event_type, data, user_state) + end + @compile {:inline, [convert_entity_reference: 2]} - def convert_entity_reference(reference_name, %State{handler: handler}) do - handler.handle_entity_reference(reference_name) - end + def convert_entity_reference("amp", _state), do: [?&] - defp 
do_emit(event_type, data, handler, user_state) do - handler.handle_event(event_type, data, user_state) + def convert_entity_reference("lt", _state), do: [?<] + + def convert_entity_reference("gt", _state), do: [?>] + + def convert_entity_reference("apos", _state), do: [?'] + + def convert_entity_reference("quot", _state), do: [?"] + + def convert_entity_reference(reference_name, state) do + case state.expand_entity do + :keep -> [?&, reference_name, ?;] + :skip -> [] + {mod, fun, args} -> apply(mod, fun, [reference_name | args]) + end end end diff --git a/lib/saxy/handler.ex b/lib/saxy/handler.ex index 1f0cde2..5287157 100644 --- a/lib/saxy/handler.ex +++ b/lib/saxy/handler.ex @@ -20,7 +20,7 @@ defmodule Saxy.Handler do ## SAX Events - There are 6 types of event need to be handled in the handler. + There are a couple of events that need to be handled in the handler. ### `:start_document` @@ -74,42 +74,9 @@ defmodule Saxy.Handler do IO.inspect "Receive characters #{chars}" {:ok, [{:chacters, chars} | state]} end - - def handle_entity_reference(reference_name) do - MyHTMLEntities.convert(reference_name) - end end """ @callback handle_event(event_type :: atom, data :: any, user_state :: any) :: {:ok, user_state :: any} | {:stop, returning :: any} - - @doc """ - Callback for entity reference conversion. - - Saxy does not handle any entity reference conversion during parsing, this callback will be triggered every - time the parser encounters an entity reference. 
- - ## Examples - - defmodule MyEventHandler do - def handle_event(_event_type, _event_data, state) do - {:ok, state} - end - - def handle_entity_reference("amp") do - "&" - end - - def handle_entity_reference("gt") do - "<" - end - - def handle_entity_reference(_unknown) do - "" - end - end - """ - - @callback handle_entity_reference(reference_name :: binary) :: binary end diff --git a/lib/saxy/simple_form.ex b/lib/saxy/simple_form.ex index ef6c73e..11c4cc6 100644 --- a/lib/saxy/simple_form.ex +++ b/lib/saxy/simple_form.ex @@ -50,9 +50,25 @@ defmodule Saxy.SimpleForm do """ - @spec parse_string(data :: binary) :: {:ok, term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()} + @doc """ + Parse given string into simple form. - def parse_string(data) when is_binary(data) do - Saxy.parse_string(data, __MODULE__.Handler, []) + ## Options + + * `:expand_entity` - specifies how external entity references should be handled. Three supported strategies respectively are: + * `:keep` - keep the original binary, for example `Orange &reg;` will be expanded to `"Orange &reg;"`, this is the default strategy. + * `:skip` - skip the original binary, for example `Orange &reg;` will be expanded to `"Orange "`. + * `{mod, fun, args}` - take the applied result of the specified MFA. 
+ + """ + + @spec parse_string(data :: binary, options :: Keyword.t()) :: + {:ok, term} | {:error, exception :: Saxy.ParseError.t() | Saxy.HandlerError.t()} + + def parse_string(data, options \\ []) when is_binary(data) do + case Saxy.parse_string(data, __MODULE__.Handler, {[], options}, options) do + {:ok, {stack, _options}} -> {:ok, stack} + {:error, _reason} = error -> error + end end end diff --git a/lib/saxy/simple_form/handler.ex b/lib/saxy/simple_form/handler.ex index bc6e684..e422080 100644 --- a/lib/saxy/simple_form/handler.ex +++ b/lib/saxy/simple_form/handler.ex @@ -3,45 +3,44 @@ defmodule Saxy.SimpleForm.Handler do @behaviour Saxy.Handler - def handle_event(:start_document, _prolog, stack) do - {:ok, stack} + def handle_event(:start_document, _prolog, state) do + {:ok, state} end - def handle_event(:start_element, {tag_name, attributes}, stack) do + def handle_event(:start_element, {tag_name, attributes}, state) do + {stack, options} = state tag = {tag_name, attributes, []} - {:ok, [tag | stack]} + {:ok, {[tag | stack], options}} end - def handle_event(:characters, chars, stack) do + def handle_event(:characters, chars, state) do + {stack, options} = state [{tag_name, attributes, content} | stack] = stack current = {tag_name, attributes, [chars | content]} - {:ok, [current | stack]} + {:ok, {[current | stack], options}} end - def handle_event(:end_element, tag_name, stack) do + def handle_event(:end_element, tag_name, state) do + {stack, options} = state [{^tag_name, attributes, content} | stack] = stack current = {tag_name, attributes, Enum.reverse(content)} case stack do [] -> - {:ok, [current]} + {:ok, {[current], options}} [parent | rest] -> {parent_tag_name, parent_attributes, parent_content} = parent parent = {parent_tag_name, parent_attributes, [current | parent_content]} - {:ok, [parent | rest]} + {:ok, {[parent | rest], options}} end end - def handle_event(:end_document, _, stack) do - {:ok, stack} - end - - def handle_entity_reference(name) do 
- [?&, name, ?;] + def handle_event(:end_document, _, state) do + {:ok, state} end end diff --git a/lib/saxy/state.ex b/lib/saxy/state.ex index e1398ae..c1bfb9e 100644 --- a/lib/saxy/state.ex +++ b/lib/saxy/state.ex @@ -1,7 +1,7 @@ defmodule Saxy.State do @moduledoc false - @enforce_keys [:handler, :user_state, :prolog] + @enforce_keys [:handler, :user_state, :prolog, :expand_entity] defstruct @enforce_keys ++ [stack: []] end diff --git a/test/saxy/parser/element_test.exs b/test/saxy/parser/element_test.exs index f3ebe2c..b458056 100644 --- a/test/saxy/parser/element_test.exs +++ b/test/saxy/parser/element_test.exs @@ -312,7 +312,8 @@ defmodule Saxy.Parser.ElementTest do %Saxy.State{ prolog: nil, handler: StackHandler, - user_state: state + user_state: state, + expand_entity: :keep } end diff --git a/test/saxy/parser/prolog_test.exs b/test/saxy/parser/prolog_test.exs index 87c7549..957d3bb 100644 --- a/test/saxy/parser/prolog_test.exs +++ b/test/saxy/parser/prolog_test.exs @@ -236,7 +236,8 @@ defmodule Saxy.Parser.PrologTest do %Saxy.State{ prolog: nil, handler: StackHandler, - user_state: state + user_state: state, + expand_entity: :keep } end diff --git a/test/saxy/parser_test.exs b/test/saxy/parser_test.exs index 1df084c..c32094f 100644 --- a/test/saxy/parser_test.exs +++ b/test/saxy/parser_test.exs @@ -8,7 +8,7 @@ defmodule Saxy.ParserTest do test "streaming parsing" do buffer = "" stream = File.stream!("./test/support/fixture/food.xml", [], 200) - state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []} + state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep} assert {:ok, state} = parse_document(buffer, stream, state) @@ -16,7 +16,7 @@ defmodule Saxy.ParserTest do buffer = "" stream = File.stream!("./test/support/fixture/complex.xml", [], 200) - state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []} + state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep} 
assert {:ok, state} = parse_document(buffer, stream, state) @@ -25,14 +25,14 @@ defmodule Saxy.ParserTest do test "binary parsing" do buffer = File.read!("./test/support/fixture/food.xml") - state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []} + state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep} assert {:ok, state} = parse_document(buffer, :done, state) assert length(state) == 74 buffer = File.read!("./test/support/fixture/complex.xml") - state = %Saxy.State{user_state: [], handler: StackHandler, prolog: []} + state = %Saxy.State{user_state: [], handler: StackHandler, prolog: [], expand_entity: :keep} assert {:ok, state} = parse_document(buffer, :done, state) diff --git a/test/saxy/simple_form_test.exs b/test/saxy/simple_form_test.exs index b064488..dbaf537 100644 --- a/test/saxy/simple_form_test.exs +++ b/test/saxy/simple_form_test.exs @@ -1,7 +1,7 @@ defmodule Saxy.SimpleFormTest do use ExUnit.Case, async: true - test "parse_string/1 with simple XML" do + test "parse_string/1 with simple XML and default options" do xml = """ @@ -22,14 +22,14 @@ defmodule Saxy.SimpleFormTest do assert [first_element | elements] = elements assert {"movie", [{"id", "tt0120338"}, {"url", "https://www.imdb.com/title/tt0120338/"}], first_children} = first_element - assert first_children == [{"name", [], ["Titanic"]}, {"characters", [], ["Jack & Rose"]}] + assert first_children == [{"name", [], ["Titanic"]}, {"characters", [], ["Jack & Rose"]}] assert [second_element] = elements assert {"movie", [{"id", "tt0109830"}, {"url", "https://www.imdb.com/title/tt0109830/"}], second_children} = second_element - assert second_children == [{"name", [], ["Forest Gump"]}, {"characters", [], ["Forest & Jenny"]}] + assert second_children == [{"name", [], ["Forest Gump"]}, {"characters", [], ["Forest & Jenny"]}] end - test "parse_string/1 with food XML" do + test "parse_string/2 with food XML" do xml = 
File.read!("./test/support/fixture/food.xml") assert {:ok, simple_form} = Saxy.SimpleForm.parse_string(xml) @@ -37,4 +37,36 @@ defmodule Saxy.SimpleFormTest do assert [{"breakfast_menu", [], children}] = simple_form assert length(children) == 5 end + + test "parse_string/1 with customized entity handlers" do + xml = """ + + + + Titanic + Jack & Rose ® + + + Forest Gump + Forest & Jenny + + + """ + + assert {:ok, simple_form} = Saxy.SimpleForm.parse_string(xml, expand_entity: {__MODULE__, :handle_entity_reference, []}) + + assert [{"menu", [], elements}] = simple_form + + assert [first_element | elements] = elements + assert {"movie", [{"id", "tt0120338"}, {"url", "https://www.imdb.com/title/tt0120338/"}], first_children} = first_element + assert first_children == [{"name", [], ["Titanic"]}, {"characters", [], ["Jack & Rose ®"]}] + + assert [second_element] = elements + assert {"movie", [{"id", "tt0109830"}, {"url", "https://www.imdb.com/title/tt0109830/"}], second_children} = second_element + assert second_children == [{"name", [], ["Forest Gump"]}, {"characters", [], ["Forest & Jenny"]}] + end + + def handle_entity_reference("reg") do + "®" + end end diff --git a/test/saxy_test.exs b/test/saxy_test.exs index 2f797ca..3efe19f 100644 --- a/test/saxy_test.exs +++ b/test/saxy_test.exs @@ -17,6 +17,43 @@ defmodule SaxyTest do assert {:ok, _state} = Saxy.parse_string(data, StackHandler, []) end + test "parse_string/4 expanding entities" do + data = """ + + Something &unknown; + """ + + assert {:ok, state} = Saxy.parse_string(data, StackHandler, [], expand_entity: :keep) + + assert state == [ + {:end_document, {}}, + {:end_element, "foo"}, + {:characters, "Something &unknown;"}, + {:start_element, {"foo", []}}, + {:start_document, [version: "1.0"]} + ] + + assert {:ok, state} = Saxy.parse_string(data, StackHandler, [], expand_entity: :skip) + + assert state == [ + {:end_document, {}}, + {:end_element, "foo"}, + {:characters, "Something "}, + {:start_element, {"foo", 
[]}}, + {:start_document, [version: "1.0"]} + ] + + assert {:ok, state} = Saxy.parse_string(data, StackHandler, [], expand_entity: {__MODULE__, :convert_entity, []}) + + assert state == [ + {:end_document, {}}, + {:end_element, "foo"}, + {:characters, "Something known"}, + {:start_element, {"foo", []}}, + {:start_document, [version: "1.0"]} + ] + end + test "parse_stream/3" do stream = File.stream!("./test/support/fixture/food.xml", [], 1024) assert {:ok, _state} = Saxy.parse_stream(stream, StackHandler, []) @@ -59,4 +96,6 @@ defmodule SaxyTest do assert {:error, error} = Saxy.parse_string(data, WrongHandler, []) assert HandlerError.message(error) == "unexpected return :something_wrong in :start_document event handler" end + + def convert_entity("unknown"), do: "known" end diff --git a/test/support/test_handlers.ex b/test/support/test_handlers.ex index 107f584..e4525b3 100644 --- a/test/support/test_handlers.ex +++ b/test/support/test_handlers.ex @@ -4,10 +4,6 @@ defmodule Saxy.TestHandlers.StackHandler do def handle_event(event_type, event_data, acc) do {:ok, [{event_type, event_data} | acc]} end - - def handle_entity_reference("amp") do - "&" - end end defmodule Saxy.TestHandlers.FastReturnHandler do @@ -16,10 +12,6 @@ defmodule Saxy.TestHandlers.FastReturnHandler do def handle_event(_event_type, _event_data, _acc) do {:stop, :fast_return} end - - def handle_entity_reference(_reference_name) do - <<>> - end end defmodule Saxy.TestHandlers.WrongHandler do @@ -28,8 +20,4 @@ defmodule Saxy.TestHandlers.WrongHandler do def handle_event(_event_type, _event_data, _acc) do :something_wrong end - - def handle_entity_reference(_reference_name) do - <<>> - end end