From 476dc945a908a69548bddd267f143a3e5d9c8a1a Mon Sep 17 00:00:00 2001 From: Anton Bachin Date: Wed, 7 Aug 2024 21:20:49 +0300 Subject: [PATCH 1/2] Replace ocamlnet HTML parser with Lambda Soup --- lib/dune | 2 +- lib/post.ml | 118 ++++++++------------------------------------------- lib/river.ml | 2 +- river.opam | 1 - 4 files changed, 19 insertions(+), 104 deletions(-) diff --git a/lib/dune b/lib/dune index 4088bdd..3eb610f 100644 --- a/lib/dune +++ b/lib/dune @@ -1,4 +1,4 @@ (library (name river) (public_name river) - (libraries cohttp cohttp-lwt cohttp-lwt-unix syndic netstring lambdasoup)) + (libraries cohttp cohttp-lwt cohttp-lwt-unix str syndic lambdasoup)) diff --git a/lib/post.ml b/lib/post.ml index d6e9366..a7e6697 100644 --- a/lib/post.ml +++ b/lib/post.ml @@ -22,108 +22,30 @@ type t = { feed : Feed.t; author : string; email : string; - content : Nethtml.document list; + content : Soup.soup Soup.node; mutable link_response : (string, string) result option; } -let rec len_prefix_of_html html len = - if len <= 0 then (0, []) - else - match html with - | [] -> (len, []) - | el :: tl -> - let len, prefix_el = len_prefix_of_el el len in - let len, prefix_tl = len_prefix_of_html tl len in - (len, prefix_el :: prefix_tl) - -and len_prefix_of_el el len = - match el with - | Nethtml.Data d -> - let len' = len - String.length d in - (len', if len' >= 0 then el else Data (String.sub d 0 len ^ "…")) - | Nethtml.Element (tag, args, content) -> - (* Remove "id" and "name" to avoid duplicate anchors with the whole - post. *) - let args = List.filter (fun (n, _) -> n <> "id" && n <> "name") args in - let len, prefix_content = len_prefix_of_html content len in - (len, Element (tag, args, prefix_content)) - -let prefix_of_html html len = snd (len_prefix_of_html html len) - -let rec filter_map l f = - match l with - | [] -> [] - | a :: tl -> ( - match f a with None -> filter_map tl f | Some a -> a :: filter_map tl f) - -let encode_html = - Netencoding.Html.encode ~prefer_name:false ~in_enc:`Enc_utf8 () - -let decode_document html = Nethtml.decode ~enc:`Enc_utf8 html -let encode_document html = Nethtml.encode ~enc:`Enc_utf8 html - -let rec resolve ?xmlbase html = List.map (resolve_links_el ~xmlbase) html - -and resolve_links_el ~xmlbase = function - | Nethtml.Element ("a", attrs, sub) -> - let attrs = - match List.partition (fun (t, _) -> t = "href") attrs with - | [], _ -> attrs - | (_, h) :: _, attrs -> - let src = - Uri.to_string (Syndic.XML.resolve ~xmlbase (Uri.of_string h)) - in - ("href", src) :: attrs - in - Nethtml.Element ("a", attrs, resolve ?xmlbase sub) - | Nethtml.Element ("img", attrs, sub) -> - let attrs = - match List.partition (fun (t, _) -> t = "src") attrs with - | [], _ -> attrs - | (_, src) :: _, attrs -> - let src = - Uri.to_string (Syndic.XML.resolve ~xmlbase (Uri.of_string src)) - in - ("src", src) :: attrs - in - Nethtml.Element ("img", attrs, sub) - | Nethtml.Element (e, attrs, sub) -> - Nethtml.Element (e, attrs, resolve ?xmlbase sub) - | Data _ as d -> d +let resolve_links_attr ~xmlbase attr el = + Soup.R.attribute attr el + |> Uri.of_string + |> Syndic.XML.resolve ~xmlbase + |> Uri.to_string + |> fun value -> Soup.set_attribute attr value el (* Things that posts should not contain *) let undesired_tags = [ "style"; "script" ] let undesired_attr = [ "id" ] -let remove_undesired_attr = - List.filter (fun (a, _) -> not (List.mem a undesired_attr)) - -let rec remove_undesired_tags html = filter_map html remove_undesired_tags_el - -and remove_undesired_tags_el = function - | 
Nethtml.Element (t, a, sub) ->
-      if List.mem t undesired_tags then None
-      else
-        Some
-          (Nethtml.Element
-             (t, remove_undesired_attr a, remove_undesired_tags sub))
-  | Data _ as d -> Some d
-
-let relaxed_html40_dtd =
-  (* Allow <font> inside <pre> because blogspot uses it! :-( *)
-  let constr =
-    `Sub_exclusions
-      ( [ "img"; "object"; "applet"; "big"; "small"; "sub"; "sup"; "basefont" ],
-        `Inline )
-  in
-  let dtd = Nethtml.relaxed_html40_dtd in
-  ("pre", (`Block, constr)) :: List.remove_assoc "pre" dtd
-
 let html_of_text ?xmlbase s =
-  try
-    Nethtml.parse (new Netchannels.input_string s) ~dtd:relaxed_html40_dtd
-    |> decode_document |> resolve ?xmlbase |> remove_undesired_tags
-  with _ -> [ Nethtml.Data (encode_html s) ]
+  let soup = Soup.parse s in
+  let ($$) = Soup.($$) in
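+  (* Rewrite link targets against the feed's xmlbase, then drop the tags and
+     attributes that aggregated posts should not contain. *)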
+  soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
+  soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
+  undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
+  soup $$ "*" |> Soup.iter (fun el ->
+    undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
+  soup
 
 (* Do not trust sites using XML for HTML content. Convert to string and parse
    back. (Does not always fix bad HTML unfortunately.) *)
@@ -184,7 +106,7 @@ let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
         | Some (Text s) -> html_of_text s
         | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
         | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
-        | None -> [])
+        | None -> Soup.parse "")
   in
   let author, _ = e.authors in
   {
@@ -241,14 +163,8 @@ let posts_of_feed c =
   | Feed.Atom f -> List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries
   | Feed.Rss2 ch -> List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items
 
-let string_of_html html =
-  let buffer = Buffer.create 1024 in
-  let channel = new Netchannels.output_buffer buffer in
-  let () = Nethtml.write channel @@ encode_document html in
-  Buffer.contents buffer
-
 let mk_entry post =
-  let content = Syndic.Atom.Html (None, string_of_html post.content) in
+  let content = Syndic.Atom.Html (None, Soup.to_string post.content) in
   let contributors =
     [ Syndic.Atom.author ~uri:(Uri.of_string post.feed.url) post.feed.name ]
   in
diff --git a/lib/river.ml b/lib/river.ml
index 7916861..11472f7 100644
--- a/lib/river.ml
+++ b/lib/river.ml
@@ -29,7 +29,7 @@ let date post = post.Post.date
 let feed post = post.Post.feed
 let author post = post.Post.author
 let email post = post.Post.email
-let content post = Post.string_of_html post.Post.content
+let content post = Soup.to_string post.Post.content
 
 let meta_description post =
   match Post.fetch_link post with
diff --git a/river.opam b/river.opam
index c3c362f..b2a2f0e 100644
--- a/river.opam
+++ b/river.opam
@@ -17,7 +17,6 @@ depends: [
   "cohttp-lwt-unix" {>= "5.0.0"}
   "ptime"
   "lwt"
-  "ocamlnet"
   "lambdasoup"
   "odoc" {with-doc}
 ]

From c771e5c113602c6fbc2531783446f647d4265170 Mon Sep 17 00:00:00 2001
From: Anton Bachin 
Date: Fri, 9 Aug 2024 17:10:39 +0300
Subject: [PATCH 2/2] Update ocamlnet reference in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cbde8a8..4ff0898 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ RSS2 and Atom feed aggregator for OCaml
 - Supports pagination and generating well-formed html prefix snippets.
 - Support for generating aggregate feeds.
 - Sorts the posts from most recent to oldest.
-- Depends on ocamlnet for html parsing.
+- Depends on Lambda Soup for html parsing.
 
 ## Installation
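
A minimal standalone sketch of the sanitization pattern introduced in patch
1, kept here for reference. The `clean` function and the sample markup are
illustrative assumptions, not part of the patches, but the Soup calls are
the ones the new html_of_text relies on:

  let clean html =
    (* Soup.parse follows the HTML5 parsing algorithm and does not raise on
       bad input, so no fallback branch is needed as with Nethtml.parse. *)
    let soup = Soup.parse html in
    let ($$) = Soup.($$) in
    (* Delete whole elements that posts should not contain. *)
    [ "style"; "script" ]
    |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
    (* Strip "id" attributes to avoid duplicate anchors in aggregate pages. *)
    soup $$ "*" |> Soup.iter (Soup.delete_attribute "id");
    Soup.to_string soup

  let () = print_endline (clean {|<p id="x">hi <script>alert(1)</script></p>|})

Built with e.g. ocamlfind ocamlopt -package lambdasoup -linkpkg, this prints
the cleaned markup with the script element and the id attribute removed.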