Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace ocamlnet HTML parser with Lambda Soup #15

Merged
merged 2 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/dune
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(library
(name river)
(public_name river)
(libraries cohttp cohttp-lwt cohttp-lwt-unix syndic netstring lambdasoup))
(libraries cohttp cohttp-lwt cohttp-lwt-unix str syndic lambdasoup))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not listing str gave a warning on OCaml 5.2.0.

118 changes: 17 additions & 101 deletions lib/post.ml
Original file line number Diff line number Diff line change
Expand Up @@ -22,108 +22,30 @@ type t = {
feed : Feed.t;
author : string;
email : string;
content : Nethtml.document list;
content : Soup.soup Soup.node;
mutable link_response : (string, string) result option;
}

(* [len_prefix_of_html html len] walks a parsed [Nethtml.document list] and
   returns [(remaining, prefix)]: [prefix] is the leading portion of [html]
   containing at most [len] characters of text data, and [remaining] is [len]
   minus the number of data characters consumed (it goes negative once the
   budget is exhausted, which stops further descent via the [len <= 0] guard).
   NOTE(review): the PR discussion says these functions appeared to be dead
   code. *)
let rec len_prefix_of_html html len =
  if len <= 0 then (0, [])
  else
    match html with
    | [] -> (len, [])
    | el :: tl ->
        (* Consume budget left-to-right: first this node, then its siblings. *)
        let len, prefix_el = len_prefix_of_el el len in
        let len, prefix_tl = len_prefix_of_html tl len in
        (len, prefix_el :: prefix_tl)

(* Truncate a single node against the remaining character budget [len]. *)
and len_prefix_of_el el len =
  match el with
  | Nethtml.Data d ->
      let len' = len - String.length d in
      (* If the text overruns the budget, cut it and append an ellipsis.
         NOTE(review): [String.sub] counts bytes, so this may split a
         multi-byte UTF-8 sequence — confirm inputs are ASCII-safe. *)
      (len', if len' >= 0 then el else Data (String.sub d 0 len ^ "…"))
  | Nethtml.Element (tag, args, content) ->
      (* Remove "id" and "name" to avoid duplicate anchors with the whole
         post. *)
      let args = List.filter (fun (n, _) -> n <> "id" && n <> "name") args in
      let len, prefix_content = len_prefix_of_html content len in
      (len, Element (tag, args, prefix_content))

(* [prefix_of_html html len]: the truncated prefix only, budget discarded. *)
let prefix_of_html html len = snd (len_prefix_of_html html len)
Copy link
Contributor Author

@aantron aantron Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These prefix_of... functions appeared to be dead code. Since they also referenced module Nethtml, I removed them.


(* [filter_map l f] maps [f] over [l] and keeps only the [Some] results.
   The argument order (list first) is preserved for backward compatibility
   with existing call sites.  Delegates to the stdlib's tail-recursive
   [List.filter_map] (OCaml >= 4.08) instead of the previous hand-rolled,
   non-tail-recursive recursion, which could overflow the stack on very
   long lists. *)
let filter_map l f = List.filter_map f l
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This became dead code.


(* Escape HTML-special characters in a UTF-8 string; [~prefer_name:false]
   emits numeric character references rather than named entities. *)
let encode_html =
  Netencoding.Html.encode ~prefer_name:false ~in_enc:`Enc_utf8 ()

(* Translate entities in a parsed document tree to/from UTF-8 text.
   [decode_document] replaces entity references with characters after
   parsing; [encode_document] re-escapes them before serialization. *)
let decode_document html = Nethtml.decode ~enc:`Enc_utf8 html
let encode_document html = Nethtml.encode ~enc:`Enc_utf8 html
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These entity-translating functions are not necessary with Lambda Soup, as Markup.ml applies all the necessary encoding and decoding internally, as required in the HTML5 specification. HTML5 defaults to UTF-8.


(* [resolve ?xmlbase html] rewrites relative URIs in a parsed document to
   absolute ones, resolving against [xmlbase] via [Syndic.XML.resolve].
   Only [<a href>] and [<img src>] are rewritten. *)
let rec resolve ?xmlbase html = List.map (resolve_links_el ~xmlbase) html

and resolve_links_el ~xmlbase = function
  | Nethtml.Element ("a", attrs, sub) ->
      let attrs =
        (* [partition] splits off every "href" binding; only the first one
           is kept and rewritten, so any duplicate "href" attributes are
           silently dropped. *)
        match List.partition (fun (t, _) -> t = "href") attrs with
        | [], _ -> attrs
        | (_, h) :: _, attrs ->
            let src =
              Uri.to_string (Syndic.XML.resolve ~xmlbase (Uri.of_string h))
            in
            ("href", src) :: attrs
      in
      (* Anchors may contain nested content with further links: recurse. *)
      Nethtml.Element ("a", attrs, resolve ?xmlbase sub)
  | Nethtml.Element ("img", attrs, sub) ->
      let attrs =
        (* Same first-binding-wins treatment as "href" above. *)
        match List.partition (fun (t, _) -> t = "src") attrs with
        | [], _ -> attrs
        | (_, src) :: _, attrs ->
            let src =
              Uri.to_string (Syndic.XML.resolve ~xmlbase (Uri.of_string src))
            in
            ("src", src) :: attrs
      in
      (* [sub] is passed through unresolved ([img] is a void element). *)
      Nethtml.Element ("img", attrs, sub)
  | Nethtml.Element (e, attrs, sub) ->
      Nethtml.Element (e, attrs, resolve ?xmlbase sub)
  | Data _ as d -> d
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This recursive traversal became soup $$ "a[href]" and soup $$ "img[src]".

(* Rewrite attribute [attr] of element [el] in place, resolving its value
   to an absolute URI against [xmlbase].  [Soup.R.attribute] raises if the
   attribute is absent, so callers must select only elements that carry it
   (e.g. with the "a[href]" / "img[src]" selectors). *)
let resolve_links_attr ~xmlbase attr el =
  let raw = Soup.R.attribute attr el in
  let resolved = Syndic.XML.resolve ~xmlbase (Uri.of_string raw) in
  Soup.set_attribute attr (Uri.to_string resolved) el

(* Tags and attributes that posts should not contain. *)
let undesired_tags = [ "style"; "script" ]
let undesired_attr = [ "id" ]

(* Keep only the attributes whose name is not blacklisted in
   [undesired_attr]. *)
let remove_undesired_attr attrs =
  let keep (name, _value) = not (List.mem name undesired_attr) in
  List.filter keep attrs

(* Recursively strip blacklisted tags ([undesired_tags]) and attributes
   ([undesired_attr]) from a parsed [Nethtml.document list]. *)
let rec remove_undesired_tags html = filter_map html remove_undesired_tags_el

(* Clean one node: [None] drops it (and its whole subtree); [Some] keeps a
   cleaned copy. *)
and remove_undesired_tags_el el =
  match el with
  | Nethtml.Data _ -> Some el
  | Nethtml.Element (tag, attrs, children) ->
      if List.mem tag undesired_tags then None
      else
        let attrs = remove_undesired_attr attrs in
        let children = remove_undesired_tags children in
        Some (Nethtml.Element (tag, attrs, children))

let relaxed_html40_dtd =
(* Allow <font> inside <pre> because blogspot uses it! :-( *)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what this refers to. However, in HTML5, the <font> tag is allowed in <pre>, and Lambda Soup loads that correctly.

let constr =
`Sub_exclusions
( [ "img"; "object"; "applet"; "big"; "small"; "sub"; "sup"; "basefont" ],
`Inline )
in
let dtd = Nethtml.relaxed_html40_dtd in
("pre", (`Block, constr)) :: List.remove_assoc "pre" dtd

let html_of_text ?xmlbase s =
try
Nethtml.parse (new Netchannels.input_string s) ~dtd:relaxed_html40_dtd
|> decode_document |> resolve ?xmlbase |> remove_undesired_tags
with _ -> [ Nethtml.Data (encode_html s) ]
let soup = Soup.parse s in
let ($$) = Soup.($$) in
soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
soup $$ "*" |> Soup.iter (fun el ->
undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
soup

(* Do not trust sites using XML for HTML content. Convert to string and parse
back. (Does not always fix bad HTML unfortunately.) *)
Expand Down Expand Up @@ -184,7 +106,7 @@ let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
| Some (Text s) -> html_of_text s
| Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
| Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
| None -> [])
| None -> Soup.parse "")
in
let author, _ = e.authors in
{
Expand Down Expand Up @@ -241,14 +163,8 @@ let posts_of_feed c =
| Feed.Atom f -> List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries
| Feed.Rss2 ch -> List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items

(* Serialize a [Nethtml.document list] back to an HTML string, re-encoding
   entities first via [encode_document]. *)
let string_of_html html =
  let buffer = Buffer.create 1024 in
  let channel = new Netchannels.output_buffer buffer in
  Nethtml.write channel (encode_document html);
  Buffer.contents buffer

let mk_entry post =
let content = Syndic.Atom.Html (None, string_of_html post.content) in
let content = Syndic.Atom.Html (None, Soup.to_string post.content) in
let contributors =
[ Syndic.Atom.author ~uri:(Uri.of_string post.feed.url) post.feed.name ]
in
Expand Down
2 changes: 1 addition & 1 deletion lib/river.ml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ let date post = post.Post.date
let feed post = post.Post.feed
let author post = post.Post.author
let email post = post.Post.email
let content post = Post.string_of_html post.Post.content
let content post = Soup.to_string post.Post.content

let meta_description post =
match Post.fetch_link post with
Expand Down
1 change: 0 additions & 1 deletion river.opam
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ depends: [
"cohttp-lwt-unix" {>= "5.0.0"}
"ptime"
"lwt"
"ocamlnet"
"lambdasoup"
"odoc" {with-doc}
]
Expand Down