From 476dc945a908a69548bddd267f143a3e5d9c8a1a Mon Sep 17 00:00:00 2001 From: Anton Bachin Date: Wed, 7 Aug 2024 21:20:49 +0300 Subject: [PATCH 1/2] Replace ocamlnet HTML parser with Lambda Soup --- lib/dune | 2 +- lib/post.ml | 118 ++++++++------------------------------------------- lib/river.ml | 2 +- river.opam | 1 - 4 files changed, 19 insertions(+), 104 deletions(-) diff --git a/lib/dune b/lib/dune index 4088bdd..3eb610f 100644 --- a/lib/dune +++ b/lib/dune @@ -1,4 +1,4 @@ (library (name river) (public_name river) - (libraries cohttp cohttp-lwt cohttp-lwt-unix syndic netstring lambdasoup)) + (libraries cohttp cohttp-lwt cohttp-lwt-unix str syndic lambdasoup)) diff --git a/lib/post.ml b/lib/post.ml index d6e9366..a7e6697 100644 --- a/lib/post.ml +++ b/lib/post.ml @@ -22,108 +22,30 @@ type t = { feed : Feed.t; author : string; email : string; - content : Nethtml.document list; + content : Soup.soup Soup.node; mutable link_response : (string, string) result option; } -let rec len_prefix_of_html html len = - if len <= 0 then (0, []) - else - match html with - | [] -> (len, []) - | el :: tl -> - let len, prefix_el = len_prefix_of_el el len in - let len, prefix_tl = len_prefix_of_html tl len in - (len, prefix_el :: prefix_tl) - -and len_prefix_of_el el len = - match el with - | Nethtml.Data d -> - let len' = len - String.length d in - (len', if len' >= 0 then el else Data (String.sub d 0 len ^ "…")) - | Nethtml.Element (tag, args, content) -> - (* Remove "id" and "name" to avoid duplicate anchors with the whole - post. *) - let args = List.filter (fun (n, _) -> n <> "id" && n <> "name") args in - let len, prefix_content = len_prefix_of_html content len in - (len, Element (tag, args, prefix_content)) - -let prefix_of_html html len = snd (len_prefix_of_html html len) - -let rec filter_map l f = - match l with - | [] -> [] - | a :: tl -> ( - match f a with None -> filter_map tl f | Some a -> a :: filter_map tl f) - -let encode_html = - Netencoding.Html.encode ~prefer_name:false ~in_enc:`Enc_utf8 () - -let decode_document html = Nethtml.decode ~enc:`Enc_utf8 html -let encode_document html = Nethtml.encode ~enc:`Enc_utf8 html - -let rec resolve ?xmlbase html = List.map (resolve_links_el ~xmlbase) html - -and resolve_links_el ~xmlbase = function - | Nethtml.Element ("a", attrs, sub) -> - let attrs = - match List.partition (fun (t, _) -> t = "href") attrs with - | [], _ -> attrs - | (_, h) :: _, attrs -> - let src = - Uri.to_string (Syndic.XML.resolve ~xmlbase (Uri.of_string h)) - in - ("href", src) :: attrs - in - Nethtml.Element ("a", attrs, resolve ?xmlbase sub) - | Nethtml.Element ("img", attrs, sub) -> - let attrs = - match List.partition (fun (t, _) -> t = "src") attrs with - | [], _ -> attrs - | (_, src) :: _, attrs -> - let src = - Uri.to_string (Syndic.XML.resolve ~xmlbase (Uri.of_string src)) - in - ("src", src) :: attrs - in - Nethtml.Element ("img", attrs, sub) - | Nethtml.Element (e, attrs, sub) -> - Nethtml.Element (e, attrs, resolve ?xmlbase sub) - | Data _ as d -> d +let resolve_links_attr ~xmlbase attr el = + Soup.R.attribute attr el + |> Uri.of_string + |> Syndic.XML.resolve ~xmlbase + |> Uri.to_string + |> fun value -> Soup.set_attribute attr value el (* Things that posts should not contain *) let undesired_tags = [ "style"; "script" ] let undesired_attr = [ "id" ] -let remove_undesired_attr = - List.filter (fun (a, _) -> not (List.mem a undesired_attr)) - -let rec remove_undesired_tags html = filter_map html remove_undesired_tags_el - -and remove_undesired_tags_el = function - | 
Nethtml.Element (t, a, sub) ->
-      if List.mem t undesired_tags then None
-      else
-        Some
-          (Nethtml.Element
-             (t, remove_undesired_attr a, remove_undesired_tags sub))
-  | Data _ as d -> Some d
-
-let relaxed_html40_dtd =
-  (* Allow <font> inside <pre> because blogspot uses it! :-( *)
-  let constr =
-    `Sub_exclusions
-      ( [ "img"; "object"; "applet"; "big"; "small"; "sub"; "sup"; "basefont" ],
-        `Inline )
-  in
-  let dtd = Nethtml.relaxed_html40_dtd in
-  ("pre", (`Block, constr)) :: List.remove_assoc "pre" dtd
-
 let html_of_text ?xmlbase s =
-  try
-    Nethtml.parse (new Netchannels.input_string s) ~dtd:relaxed_html40_dtd
-    |> decode_document |> resolve ?xmlbase |> remove_undesired_tags
-  with _ -> [ Nethtml.Data (encode_html s) ]
+  let soup = Soup.parse s in
+  let ($$) = Soup.($$) in
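+  (* Rewrite link targets against the feed's xmlbase, then drop the tags and
+     attributes that aggregated posts should not contain. *)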
+  soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
+  soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
+  undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
+  soup $$ "*" |> Soup.iter (fun el ->
+    undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
+  soup
 
 (* Do not trust sites using XML for HTML content. Convert to string and parse
    back. (Does not always fix bad HTML unfortunately.) *)
@@ -184,7 +106,7 @@ let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
         | Some (Text s) -> html_of_text s
         | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
         | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
-        | None -> [])
+        | None -> Soup.parse "")
   in
   let author, _ = e.authors in
   {
@@ -241,14 +163,8 @@ let posts_of_feed c =
   | Feed.Atom f -> List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries
   | Feed.Rss2 ch -> List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items
 
-let string_of_html html =
-  let buffer = Buffer.create 1024 in
-  let channel = new Netchannels.output_buffer buffer in
-  let () = Nethtml.write channel @@ encode_document html in
-  Buffer.contents buffer
-
 let mk_entry post =
-  let content = Syndic.Atom.Html (None, string_of_html post.content) in
+  let content = Syndic.Atom.Html (None, Soup.to_string post.content) in
   let contributors =
     [ Syndic.Atom.author ~uri:(Uri.of_string post.feed.url) post.feed.name ]
   in
diff --git a/lib/river.ml b/lib/river.ml
index 7916861..11472f7 100644
--- a/lib/river.ml
+++ b/lib/river.ml
@@ -29,7 +29,7 @@ let date post = post.Post.date
 let feed post = post.Post.feed
 let author post = post.Post.author
 let email post = post.Post.email
-let content post = Post.string_of_html post.Post.content
+let content post = Soup.to_string post.Post.content
 
 let meta_description post =
   match Post.fetch_link post with
diff --git a/river.opam b/river.opam
index c3c362f..b2a2f0e 100644
--- a/river.opam
+++ b/river.opam
@@ -17,7 +17,6 @@ depends: [
   "cohttp-lwt-unix" {>= "5.0.0"}
   "ptime"
   "lwt"
-  "ocamlnet"
   "lambdasoup"
   "odoc" {with-doc}
 ]

From c771e5c113602c6fbc2531783446f647d4265170 Mon Sep 17 00:00:00 2001
From: Anton Bachin 
Date: Fri, 9 Aug 2024 17:10:39 +0300
Subject: [PATCH 2/2] Update ocamlnet reference in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cbde8a8..4ff0898 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ RSS2 and Atom feed aggregator for OCaml
 - Supports pagination and generating well-formed html prefix snippets.
 - Support for generating aggregate feeds.
 - Sorts the posts from most recent to oldest.
-- Depends on ocamlnet for html parsing.
+- Depends on Lambda Soup for html parsing.
 
 ## Installation
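
A minimal standalone sketch of the sanitization pattern introduced in patch
1, kept here for reference. The `clean` function and the sample markup are
illustrative assumptions, not part of the patches, but the Soup calls are
the ones the new html_of_text relies on:

  let clean html =
    (* Soup.parse follows the HTML5 parsing algorithm and does not raise on
       bad input, so no fallback branch is needed as with Nethtml.parse. *)
    let soup = Soup.parse html in
    let ($$) = Soup.($$) in
    (* Delete whole elements that posts should not contain. *)
    [ "style"; "script" ]
    |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
    (* Strip "id" attributes to avoid duplicate anchors in aggregate pages. *)
    soup $$ "*" |> Soup.iter (Soup.delete_attribute "id");
    Soup.to_string soup

  let () = print_endline (clean {|<p id="x">hi <script>alert(1)</script></p>|})

Built with e.g. ocamlfind ocamlopt -package lambdasoup -linkpkg, this prints
the cleaned markup with the script element and the id attribute removed.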