Replace ocamlnet HTML parser with Lambda Soup #15
Changes from 1 commit
@@ -1,4 +1,4 @@
 (library
  (name river)
  (public_name river)
- (libraries cohttp cohttp-lwt cohttp-lwt-unix syndic netstring lambdasoup))
+ (libraries cohttp cohttp-lwt cohttp-lwt-unix str syndic lambdasoup))
@@ -22,108 +22,30 @@ type t = {
   feed : Feed.t;
   author : string;
   email : string;
-  content : Nethtml.document list;
+  content : Soup.soup Soup.node;
   mutable link_response : (string, string) result option;
 }

-let rec len_prefix_of_html html len =
-  if len <= 0 then (0, [])
-  else
-    match html with
-    | [] -> (len, [])
-    | el :: tl ->
-        let len, prefix_el = len_prefix_of_el el len in
-        let len, prefix_tl = len_prefix_of_html tl len in
-        (len, prefix_el :: prefix_tl)
-
-and len_prefix_of_el el len =
-  match el with
-  | Nethtml.Data d ->
-      let len' = len - String.length d in
-      (len', if len' >= 0 then el else Data (String.sub d 0 len ^ "…"))
-  | Nethtml.Element (tag, args, content) ->
-      (* Remove "id" and "name" to avoid duplicate anchors with the whole
-         post. *)
-      let args = List.filter (fun (n, _) -> n <> "id" && n <> "name") args in
-      let len, prefix_content = len_prefix_of_html content len in
-      (len, Element (tag, args, prefix_content))
-
-let prefix_of_html html len = snd (len_prefix_of_html html len)
These …
-let rec filter_map l f =
-  match l with
-  | [] -> []
-  | a :: tl -> (
-      match f a with None -> filter_map tl f | Some a -> a :: filter_map tl f)
This became dead code.
-let encode_html =
-  Netencoding.Html.encode ~prefer_name:false ~in_enc:`Enc_utf8 ()
-
-let decode_document html = Nethtml.decode ~enc:`Enc_utf8 html
-let encode_document html = Nethtml.encode ~enc:`Enc_utf8 html
These entity-translating functions are not necessary with Lambda Soup, as Markup.ml applies all the necessary encoding and decoding internally, as required in the HTML5 specification. HTML5 defaults to UTF-8.
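For illustration only (not part of the diff), a minimal sketch of that behaviour; the sample markup is invented:

(* Illustrative sketch: Soup.parse decodes entities and Soup.to_string
   re-escapes them, so the removed encode/decode helpers need no
   replacement. *)
let () =
  let soup = Soup.parse "<p>Fish &amp; chips</p>" in
  List.iter print_endline (Soup.texts soup);
  (* prints the decoded text: Fish & chips *)
  print_endline (Soup.to_string soup)
  (* the ampersand is escaped again in the serialized output *)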
-let rec resolve ?xmlbase html = List.map (resolve_links_el ~xmlbase) html
-
-and resolve_links_el ~xmlbase = function
-  | Nethtml.Element ("a", attrs, sub) ->
-      let attrs =
-        match List.partition (fun (t, _) -> t = "href") attrs with
-        | [], _ -> attrs
-        | (_, h) :: _, attrs ->
-            let src =
-              Uri.to_string (Syndic.XML.resolve ~xmlbase (Uri.of_string h))
-            in
-            ("href", src) :: attrs
-      in
-      Nethtml.Element ("a", attrs, resolve ?xmlbase sub)
-  | Nethtml.Element ("img", attrs, sub) ->
-      let attrs =
-        match List.partition (fun (t, _) -> t = "src") attrs with
-        | [], _ -> attrs
-        | (_, src) :: _, attrs ->
-            let src =
-              Uri.to_string (Syndic.XML.resolve ~xmlbase (Uri.of_string src))
-            in
-            ("src", src) :: attrs
-      in
-      Nethtml.Element ("img", attrs, sub)
-  | Nethtml.Element (e, attrs, sub) ->
-      Nethtml.Element (e, attrs, resolve ?xmlbase sub)
-  | Data _ as d -> d
This recursive traversal became …
+let resolve_links_attr ~xmlbase attr el =
+  Soup.R.attribute attr el
+  |> Uri.of_string
+  |> Syndic.XML.resolve ~xmlbase
+  |> Uri.to_string
+  |> fun value -> Soup.set_attribute attr value el
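A hedged usage sketch of the new helper (the base URI and markup are invented; it assumes the definitions added in this diff):

(* Resolve a relative href against a feed's xml:base, the same way
   html_of_text below treats every <a href> and <img src>. *)
let () =
  let xmlbase = Some (Uri.of_string "https://example.org/blog/") in
  let soup = Soup.parse {|<a href="post.html">a post</a>|} in
  Soup.(soup $$ "a[href]") |> Soup.iter (resolve_links_attr ~xmlbase "href");
  print_endline (Soup.to_string soup)
  (* the href should now read https://example.org/blog/post.html *)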
 (* Things that posts should not contain *)
 let undesired_tags = [ "style"; "script" ]
 let undesired_attr = [ "id" ]

-let remove_undesired_attr =
-  List.filter (fun (a, _) -> not (List.mem a undesired_attr))
-
-let rec remove_undesired_tags html = filter_map html remove_undesired_tags_el
-
-and remove_undesired_tags_el = function
-  | Nethtml.Element (t, a, sub) ->
-      if List.mem t undesired_tags then None
-      else
-        Some
-          (Nethtml.Element
-             (t, remove_undesired_attr a, remove_undesired_tags sub))
-  | Data _ as d -> Some d
-
-let relaxed_html40_dtd =
-  (* Allow <font> inside <pre> because blogspot uses it! :-( *)
I'm not sure what this refers to. However, in HTML5, the …
-  let constr =
-    `Sub_exclusions
-      ( [ "img"; "object"; "applet"; "big"; "small"; "sub"; "sup"; "basefont" ],
-        `Inline )
-  in
-  let dtd = Nethtml.relaxed_html40_dtd in
-  ("pre", (`Block, constr)) :: List.remove_assoc "pre" dtd
 let html_of_text ?xmlbase s =
-  try
-    Nethtml.parse (new Netchannels.input_string s) ~dtd:relaxed_html40_dtd
-    |> decode_document |> resolve ?xmlbase |> remove_undesired_tags
-  with _ -> [ Nethtml.Data (encode_html s) ]
+  let soup = Soup.parse s in
+  let ($$) = Soup.($$) in
+  soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
+  soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
+  undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
+  soup $$ "*" |> Soup.iter (fun el ->
+    undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
+  soup
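A similarly hedged sketch of the rewritten pipeline as a whole; the input string is invented:

(* Illustrative sketch: undesired tags are deleted and "id" attributes are
   stripped from whatever markup the feed provided. *)
let () =
  let clean = html_of_text {|<p id="intro">Hi<script>alert(1)</script></p>|} in
  print_endline (Soup.to_string clean)
  (* the <script> element and the id attribute are gone from the output *)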
 (* Do not trust sites using XML for HTML content. Convert to string and parse
    back. (Does not always fix bad HTML unfortunately.) *)
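As a small, hedged illustration of that comment (the input is invented): re-parsing with the HTML5 algorithm used by Markup.ml repairs some malformed markup, though, as the comment notes, not all of it.

(* Illustrative sketch: the parser closes dangling elements, so the
   serialized output is well-formed even when the input was not. *)
let () =
  let soup = Soup.parse "<p>unclosed <em>emphasis" in
  print_endline (Soup.to_string soup)
  (* both <em> and <p> are closed in the output *)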
@@ -184,7 +106,7 @@ let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
     | Some (Text s) -> html_of_text s
     | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
     | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
-    | None -> [])
+    | None -> Soup.parse "")
   in
   let author, _ = e.authors in
   {
@@ -241,14 +163,8 @@ let posts_of_feed c =
   | Feed.Atom f -> List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries
   | Feed.Rss2 ch -> List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items

-let string_of_html html =
-  let buffer = Buffer.create 1024 in
-  let channel = new Netchannels.output_buffer buffer in
-  let () = Nethtml.write channel @@ encode_document html in
-  Buffer.contents buffer
-
 let mk_entry post =
-  let content = Syndic.Atom.Html (None, string_of_html post.content) in
+  let content = Syndic.Atom.Html (None, Soup.to_string post.content) in
   let contributors =
     [ Syndic.Atom.author ~uri:(Uri.of_string post.feed.url) post.feed.name ]
   in
@@ -17,7 +17,6 @@ depends: [
   "cohttp-lwt-unix" {>= "5.0.0"}
   "ptime"
   "lwt"
-  "ocamlnet"
   "lambdasoup"
   "odoc" {with-doc}
 ]
Not listing str gave a warning on OCaml 5.2.0.