Skip to content

Commit

Permalink
More parsing (#34)
Browse files Browse the repository at this point in the history
* checkpoint

* checkpoint

* checkpoint

* checkpoint

* refactor

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint
  • Loading branch information
dogweather authored Dec 13, 2023
1 parent 48f562c commit 372bbd3
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 21 deletions.
2 changes: 1 addition & 1 deletion config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ config :crawly,
log_to_file: true,

closespider_timeout: 1,
concurrent_requests_per_domain: 8,
concurrent_requests_per_domain: 10,
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Expand Down
47 changes: 27 additions & 20 deletions lib/news/article.ex
Original file line number Diff line number Diff line change
Expand Up @@ -88,40 +88,47 @@ defmodule News.Article do

@doc """
Returns the citations found in `document`.

Candidates are gathered from two places: the document's link hrefs
(`cites_from_hrefs/1`) and the document's text (`cites_from_text/1`).
Anything that is not a binary is dropped, and the remaining list is
handed to `cleanup_list/1`.
"""
@spec find_citations_in_html(Floki.html_tree()) :: [binary()]
def find_citations_in_html(document) do
  (cites_from_hrefs(document) ++ cites_from_text(document))
  |> filter(&is_binary/1)
  |> cleanup_list()
end

@doc """
Converts every href found in `document` with `href_to_cite/1`.

The result has one entry per href, in the order `hrefs/1` returned them.
"""
def cites_from_hrefs(document) do
  links = hrefs(document)
  map(links, &href_to_cite/1)
end

@doc """
Extracts statute citations from the text of `document`.

The text returned by `Floki.text/1` is scanned once per citation format,
and every match is rewritten to a short form:

  * `24-34-402.7 C.R.S.` and `Colo. Rev. Stat. § 24-34-402.7` become
    `C.R.S. 24-34-402.7`
  * `Nev. Rev. Stat. § 603A.302` becomes `NRS 603A.302`
  * `NY Penal Law § 155.25` becomes `NY Penal Law Section 155.25`
  * `Ore. Rev. Stat. § 633.295` becomes `ORS 633.295`
  * `C.R.S. § 1-2-3` and `C.R.S. §§ 1-2-3` lose their section signs
  * `Texas Family Code Section ...` becomes `Tex. Fam. Code Section ...`
    (`Transportation` is shortened to `Transp.` the same way)

Returns a single flat list. Duplicates are not removed here.
"""
def cites_from_text(document) do
  html = Floki.text(document)

  # NOTE(review): the dots in these patterns are unescaped, so they match any
  # character, not only a literal ".". Left as-is to keep matching unchanged.
  [
    simple_cites(html, ~r/(\d+-\d+-\d+(?:\.\d+)?) C.R.S./, &("C.R.S. #{&1}")),
    simple_cites(html, ~r/Colo. Rev. Stat. § (\d+-\d+-\d+(?:\.\d+)?)/, &("C.R.S. #{&1}")),
    simple_cites(html, ~r/Nev. Rev. Stat. § (\d+[A-Z]?\.\d+[A-Z]?)/, &("NRS #{&1}")),
    simple_cites(html, ~r/NY Penal Law § (\d+\.\d+)/, &("NY Penal Law Section #{&1}")),
    simple_cites(html, ~r/Ore. Rev. Stat. § (\d+[A-Z]?\.\d+[A-Z]?)/, &("ORS #{&1}")),

    # The only capture group spans the whole match, so after flatten/1 each
    # hit is present twice (full match + group).
    Regex.scan(~r/(C.R.S. §(?:§)? \d+-\d+-\d+)/, html)
    |> flatten()
    |> map(fn m -> String.replace(m, ~r/§ ?/, "", global: true) end),

    # Texas codes keep their "Code Section" wording; only the state and the
    # code name are abbreviated.
    Regex.scan(~r/(Texas \w+ Code Section [\d\w.]+)/, html)
    |> flatten()
    |> map(fn m -> String.replace(m, "Texas ", "Tex. ") end)
    |> map(fn m -> String.replace(m, "Family ", "Fam. ") end)
    |> map(fn m -> String.replace(m, "Transportation ", "Transp. ") end)
  ]
  |> flatten()
end


(cites_from_hrefs ++ crs_cites_from_text_1 ++ crs_cites_from_text_2 ++ crs_cites_from_text_3 ++ tx_cites_from_text)
|> filter(&is_binary/1)
|> cleanup_list()
@doc """
Scans `html` with `regex` and builds one entry per match.

For each match, the last element of the scan result (the final capture
group, or the whole match when the regex has no groups) is passed to
`replace_func`. The results are then flattened into a single list.
"""
@spec simple_cites(binary(), Regex.t(), (any() -> any())) :: list()
def simple_cites(html, regex, replace_func) do
  matches = Regex.scan(regex, html)

  matches
  |> map(fn groups -> replace_func.(last(groups)) end)
  |> flatten()
end


Expand Down
4 changes: 4 additions & 0 deletions test/news/article_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ defmodule News.ArticleTest do
# Table of snippet cases: each map carries an `html` document and a `cites` list.
# NOTE(review): `cites` is presumably the expected extraction result for `html`;
# the assertion that consumes this table is outside this view — confirm.
@snippet_test_cases [
%{html: "<html></html>", cites: []},
%{html: "<html><p>under Colo. Rev. Stat. § 24-34-402.7 and</p></html>", cites: ["C.R.S. 24-34-402.7"]},
%{html: "<html><p>under Ore. Rev. Stat. § 633.295 and</p></html>", cites: ["ORS 633.295"]},
%{html: "<html><p>under Nev. Rev. Stat. § 675.020 and</p></html>", cites: ["NRS 675.020"]},
%{html: "<html><p>under Nev. Rev. Stat. § 603A.302 and</p></html>", cites: ["NRS 603A.302"]},
%{html: "<html><p>See NY Penal Law § 155.25.</p></html>", cites: ["NY Penal Law Section 155.25"]},
]

Enum.each(@snippet_test_cases, fn %{html: html, cites: cites} ->
Expand Down

0 comments on commit 372bbd3

Please sign in to comment.