Skip to content

Commit

Permalink
Add include_inputs? option to Floki.text (#459)
Browse files Browse the repository at this point in the history
Closes #391
  • Loading branch information
viniciusmuller authored May 22, 2023
1 parent 07bb4cc commit 34fb89a
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 29 deletions.
9 changes: 6 additions & 3 deletions lib/floki.ex
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,9 @@ defmodule Floki do
iex> Floki.text({"div", [], [{"script", [], ["hello"]}, " world"]})
" world"
iex> Floki.text([{"input", [{"type", "date"}, {"value", "2017-06-01"}], []}], include_inputs: true)
"2017-06-01"
iex> Floki.text({"div", [], [{"script", [], ["hello"]}, " world"]}, js: true)
"hello world"
Expand All @@ -504,7 +507,7 @@ defmodule Floki do

@spec text(html_tree | html_node | binary) :: binary

def text(html, opts \\ [deep: true, js: false, style: true, sep: ""]) do
def text(html, opts \\ [deep: true, js: false, style: true, sep: "", include_inputs: false]) do
cleaned_html_tree =
html
|> parse_it()
Expand All @@ -518,8 +521,8 @@ defmodule Floki do
end

case opts[:sep] do
nil -> search_strategy.get(cleaned_html_tree)
sep -> search_strategy.get(cleaned_html_tree, sep)
nil -> search_strategy.get(cleaned_html_tree, "", opts[:include_inputs])
sep -> search_strategy.get(cleaned_html_tree, sep, opts[:include_inputs])
end
end

Expand Down
32 changes: 21 additions & 11 deletions lib/floki/deep_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,35 @@ defmodule Floki.DeepText do

@type html_tree :: tuple | list

@spec get(html_tree, binary) :: binary
@spec get(html_tree, binary, boolean) :: binary

def get(html_tree, sep \\ "") do
get_text(html_tree, "", sep)
def get(html_tree, sep \\ "", include_inputs? \\ false)

def get(html_tree, sep, include_inputs?) do
get_text(html_tree, "", sep, include_inputs?)
end

defp get_text(text, "", _sep) when is_binary(text), do: text
defp get_text(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep)
defp get_text(text, "", _sep, _) when is_binary(text), do: text
defp get_text(text, acc, sep, _) when is_binary(text), do: Enum.join([acc, text], sep)

defp get_text(nodes, acc, sep) when is_list(nodes) do
defp get_text(nodes, acc, sep, include_inputs?) when is_list(nodes) do
Enum.reduce(nodes, acc, fn child, istr ->
get_text(child, istr, sep)
get_text(child, istr, sep, include_inputs?)
end)
end

defp get_text({:comment, _}, acc, _), do: acc
defp get_text({"br", _, _}, acc, _), do: acc <> "\n"
defp get_text({:comment, _}, acc, _, _), do: acc
defp get_text({"br", _, _}, acc, _, _), do: acc <> "\n"

defp get_text({"input", attrs, _}, acc, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
end

defp get_text({"textarea", attrs, _}, acc, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
end

defp get_text({_, _, nodes}, acc, sep) do
get_text(nodes, acc, sep)
defp get_text({_, _, nodes}, acc, sep, include_inputs?) do
get_text(nodes, acc, sep, include_inputs?)
end
end
35 changes: 20 additions & 15 deletions lib/floki/flat_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,36 @@ defmodule Floki.FlatText do

@type html_tree :: tuple | list

@spec get(html_tree, binary) :: binary
@spec get(html_tree, binary, boolean) :: binary

def get(html_nodes, sep \\ "")
def get(html_nodes, sep \\ "", include_inputs? \\ false)

def get(html_nodes, sep) when is_list(html_nodes) do
def get(html_nodes, sep, include_inputs?) when is_list(html_nodes) do
Enum.reduce(html_nodes, "", fn html_node, acc ->
text_from_node(html_node, acc, sep)
text_from_node(html_node, acc, 0, sep, include_inputs?)
end)
end

def get(html_node, sep) do
text_from_node(html_node, "", sep)
def get(html_node, sep, include_inputs?) do
text_from_node(html_node, "", 0, sep, include_inputs?)
end

defp text_from_node({_tag, _attrs, html_nodes}, acc, sep) do
defp text_from_node({"input", attrs, []}, acc, _, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
end

defp text_from_node({"textarea", attrs, []}, acc, _, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
end

defp text_from_node({_tag, _attrs, html_nodes}, acc, depth, sep, include_inputs?)
when depth < 1 do
Enum.reduce(html_nodes, acc, fn html_node, acc ->
capture_text(html_node, acc, sep)
text_from_node(html_node, acc, depth + 1, sep, include_inputs?)
end)
end

defp text_from_node(text, "", _sep) when is_binary(text), do: text
defp text_from_node(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep)
defp text_from_node(_, acc, _), do: acc

defp capture_text(text, "", _sep) when is_binary(text), do: text
defp capture_text(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep)
defp capture_text(_html_node, acc, _), do: acc
defp text_from_node(text, "", _, _sep, _) when is_binary(text), do: text
defp text_from_node(text, acc, _, sep, _) when is_binary(text), do: Enum.join([acc, text], sep)
defp text_from_node(_, acc, _, _, _), do: acc
end
35 changes: 35 additions & 0 deletions lib/floki/text_extractor.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
defmodule Floki.TextExtractor do
  @moduledoc false

  # Input types whose `value` attribute is returned as text. Any type that is
  # not listed here (for example "password" or "hidden") produces "".
  @allowed_input_types [
    "color",
    "date",
    "datetime-local",
    "email",
    "month",
    "number",
    "search",
    "tel",
    "text",
    "time",
    "url",
    "week"
  ]

  # Returns the `value` attribute found in `attrs` (a list of
  # `{name, value}` tuples), or "" when either:
  #
  #   * the `type` attribute is not one of @allowed_input_types, or
  #   * there is no `value` attribute.
  #
  # A missing `type` attribute is treated as "text". The type keyword is
  # compared ASCII case-insensitively, since HTML enumerated attributes such
  # as `type` are case-insensitive (`type="TEXT"` is a text input).
  @spec extract_input_value([{binary, binary}]) :: binary
  def extract_input_value(attrs) do
    {"type", type} = Enum.find(attrs, {"type", "text"}, &match?({"type", _}, &1))

    if String.downcase(type, :ascii) in @allowed_input_types do
      extract_value(attrs)
    else
      ""
    end
  end

  # Returns the first `value` attribute in `attrs`, defaulting to "".
  defp extract_value(attrs) do
    Enum.find_value(attrs, "", fn
      {"value", value} -> value
      _other -> nil
    end)
  end
end
24 changes: 24 additions & 0 deletions test/floki/deep_text_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,30 @@ defmodule Floki.DeepTextTest do
assert Floki.DeepText.get(node, " ") == "Google"
end

test "extracts text from text input" do
  {:ok, parsed} = Floki.parse_document("<input value='foo' />")

  # The third argument turns on extraction of input values.
  extracted = Floki.DeepText.get(parsed, " ", true)

  assert extracted == "foo"
end

test "extracts text from textarea" do
  {:ok, parsed} = Floki.parse_document("<textarea value='bar' />")

  # The expected text comes from the textarea's `value` attribute.
  extracted = Floki.DeepText.get(parsed, " ", true)

  assert extracted == "bar"
end

test "extracts text from nested inputs" do
  # An input wrapped in a parent element must still be reached.
  input = {"input", [{"value", "bar"}], []}
  wrapper = {"div", [], [input]}

  assert Floki.DeepText.get(wrapper, " ", true) == "bar"
end

test "text from a list of deep nodes" do
nodes = [
{
Expand Down
22 changes: 22 additions & 0 deletions test/floki/flat_text_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,28 @@ defmodule Floki.FlatTextTest do
assert Floki.FlatText.get(node, " ") == "Elixir lang"
end

test "extracts text from text input" do
  # A bare input tuple with only a `value` attribute and no children.
  attrs = [{"value", "foo"}]
  extracted = Floki.FlatText.get({"input", attrs, []}, " ", true)

  assert extracted == "foo"
end

test "extracts text from textarea" do
  # The expected text comes from the textarea's `value` attribute.
  attrs = [{"value", "bar"}]
  extracted = Floki.FlatText.get({"textarea", attrs, []}, " ", true)

  assert extracted == "bar"
end

test "extracts text from nested inputs" do
  # The input is a direct child of the node passed to FlatText.get/3.
  input = {"input", [{"value", "bar"}], []}
  wrapper = {"div", [], [input]}

  assert Floki.FlatText.get(wrapper, " ", true) == "bar"
end

test "a blank string when the node does not have text in the same level" do
node = {"div", [], [{"a", [], ["Something in a deeper node"]}]}

Expand Down

0 comments on commit 34fb89a

Please sign in to comment.