From ed9ff89c96119149815d2371beb7ec489d9514b8 Mon Sep 17 00:00:00 2001
From: Thomas Hansen
Date: Fri, 9 Feb 2024 12:50:56 +0200
Subject: [PATCH] Getting ready for release

---
 backend/backend.csproj                        |   2 +-
 .../crawling/magic.ai.crawl-site-on-thread.hl | 459 ++++++++++++++++++
 .../crawling/magic.ai.crawl-site.hl           | 457 +----------------
 .../crawling/magic.ai.html.extract.hl         | 194 +++----
 backend/slots/Version.cs                      |   2 +-
 5 files changed, 542 insertions(+), 572 deletions(-)
 create mode 100644 backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl

diff --git a/backend/backend.csproj b/backend/backend.csproj
index 9c17a97491..5d3bec2f14 100644
--- a/backend/backend.csproj
+++ b/backend/backend.csproj
@@ -26,7 +26,7 @@
-
+

diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl
new file mode 100644
index 0000000000..4eba515511
--- /dev/null
+++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl
@@ -0,0 +1,459 @@
+
+/*
+ * Crawls the specified website generating training data for machine learning in the process.
+ */
+slots.create:magic.ai.crawl-site-on-thread
+
+   /*
+    * Loading robots.txt from specified [url].
+    */
+   unwrap:x:+/*
+   signal:magic.ai.load-robots
+      url:x:@.arguments/*/url
+      feedback-channel:x:@.arguments/*/feedback-channel
+
+   // Checking if site contains a robots.txt file.
+   if
+      eq:x:@signal/*/found
+         .:bool:true
+      .lambda
+
+         // Site contains a robots.txt file, signaling frontend of that fact.
+         sockets.signal:x:@.arguments/*/feedback-channel
+            args
+               message:Site has robots.txt
+               type:info
+         sleep:100
+
+         // Signaling frontend how many sitemaps we found in robots.txt file.
+         strings.concat
+            .:"Found "
+            get-count:x:@signal/*/sitemap/*
+            .:" sitemaps in robots.txt file"
+         unwrap:x:+/**
+         sockets.signal:x:@.arguments/*/feedback-channel
+            args
+               message:x:@strings.concat
+               type:info
+         sleep:100
+
+         // Checking if robots.txt contains a crawl-delay.
+         if
+            exists:x:@signal/*/crawl-delay
+            .lambda
+
+               // Updating delay to value from robots.txt.
+               remove-nodes:x:@.arguments/*/delay
+               unwrap:x:+/*
+               validators.default:x:@.arguments
+                  delay:x:@signal/*/crawl-delay
+
+
+               // Signaling frontend to inform of that we found a crawl-delay value.
+               strings.concat
+                  .:"Robots.txt file contains a Crawl-Delay value of "
+                  math.divide:x:@signal/*/crawl-delay
+                     .:int:1000
+                  .:" seconds"
+               unwrap:x:+/**
+               sockets.signal:x:@.arguments/*/feedback-channel
+                  args
+                     message:x:@strings.concat
+                     type:info
+
+   else
+
+      // Site does not contain a robots.txt file, signaling that fact to frontend.
+      sockets.signal:x:@.arguments/*/feedback-channel
+         args
+            message:Could not find a robots.txt file for website
+            type:warning
+      sleep:100
+      strings.concat
+         .:"We will try to retrieve sitemap from "
+         get-value:x:@signal/*/sitemap/0
+      unwrap:x:+/**
+      sockets.signal:x:@.arguments/*/feedback-channel
+         args
+            message:x:@strings.concat
+            type:info
+      sleep:100
+
+   /*
+    * Checking if we should filter according to URL, as
+    * in caller provided a sub-folder URL such as foo.com/bar, at which
+    * point we only import URLs below /bar hierarchy.
+    *
+    * Default value is false, implying robots.txt file is solely responsible
+    * for filtering.
+ */ + .filter-on-url:bool:false + strings.split:x:@.arguments/*/url + .:/ + if + mt + get-count:x:@strings.split/* + .:int:2 + .lambda + set-value:x:@.filter-on-url + .:bool:true + + /* + * Trying to load URLs from sitemap returned from above invocation. + */ + add:x:./*/signal/[1,2] + get-nodes:x:@signal/*/sitemap + get-nodes:x:@signal/*/allow + get-nodes:x:@signal/*/disallow + unwrap:x:+/* + signal:magic.ai.load-sitemap + max:x:@.arguments/*/max + feedback-channel:x:@.arguments/*/feedback-channel + url:x:@.arguments/*/url + filter-on-url:x:@.filter-on-url + + // Signaling user what we're about to do. + strings.concat + .:"Deleting old snippets for type '" + get-value:x:@.arguments/*/type + .:"' matching URL of " + get-value:x:@.arguments/*/url + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Deleting all old training snippets matching specified URL and type. + .uri + set-value:x:@.uri + strings.concat + get-value:x:@.arguments/*/url + .:% + data.connect:[generic|magic] + data.execute:@" +delete from vss_ml_training_snippets +where rowid in (select id as rowid from ml_training_snippets where type = @type and uri like @uri); +delete from ml_training_snippets where type = @type and uri like @uri;" + type:x:@.arguments/*/type + uri:x:@.uri + + // Verifying we found at least one sitemap. + if + eq:x:@signal/*/has-sitemap + .:bool:true + .lambda + + /* + * We found at least one sitemap. + * + * Signaling frontend how many URLs we found, and how many there are in total. + */ + get-count:x:@signal/*/urls/* + strings.concat + .:"We found " + get-value:x:@signal/*/total + .:" URLs in sitemap(s), we will be scraping " + get-value:x:@get-count + .:" URLs" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Checking if site contains more URLs than we're scraping. + if + eq:x:@get-count + .:int:0 + .lambda + + // Warning user! + strings.concat + .:"Warning, we could not find a single valid URL in site" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:warning + sleep:100 + else-if + mt + get-value:x:@signal/*/total + get-value:x:@get-count + .lambda + + // Warning user! + strings.concat + .:"Warning, site contains more than " + get-value:x:@get-count + .:" URLs and will only be partially scraped" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:warning + sleep:100 + + // Feedback about URLs we're about to scrape, but only if there are any URLs. + if + mt:x:@get-count + .:int:0 + .lambda + + // Adding spacer. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + sockets.signal:x:@.arguments/*/feedback-channel + args + message:"URLs we will scrape are as follows:" + type:info + sleep:100 + + // Iterating through each URL returned from above invocation. + for-each:x:@signal/*/urls/* + + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@.dp/# + type:info + sleep:10 + + // Iterating through each URL returned from above invocation. + for-each:x:@signal/*/urls/* + + // Making sure we trap exceptions. + try + + // Adding spacer. 
+ sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + + // Scraping currently iterated URL. + unwrap:x:+/* + signal:magic.ai.url.scrape + url:x:@.dp/# + type:x:@.arguments/*/type + threshold:x:@.arguments/*/threshold + feedback-channel:x:@.arguments/*/feedback-channel + + // Verifying we've got more snippets before applying Crawl-Delay + if + neq:x:@.dp/# + get-value:x:@signal/@signal/*/urls/0/- + .lambda + + // Signaling frontend that we're waiting for n seconds. + strings.concat + .:"Waiting for " + math.divide:x:@.arguments/*/delay + .:int:1000 + .:" seconds to avoid exhausting web server" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Sleeping for [delay] milliseconds to avoid exhausting web server. + sleep:x:@.arguments/*/delay + + .catch + + // Logging as error. + log.error:Could not scrape URL + url:x:@.dp/# + message:x:@.arguments/*/message + + // Signaling frontend to inform about error. + strings.concat + .:"Could not scrape URL, error was: '" + get-value:x:@.arguments/*/message + .:"'" + unwrap:x:+/** + sockets.signal:x:@.arguments/@.arguments/*/feedback-channel + roles:root + args + message:x:@strings.concat + type:warning + sleep:100 + + // Adding spacer. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + + /* + * Crawling is done. + * Making sure we notify client that we're done and do some logging. + */ + sockets.signal:magic.backend.message + roles:root + args + message:Done creating OpenAI training data from URL + type:success + sleep:100 + + // Basic logging. + log.info:OpenAI training data successfully created + url:x:@.arguments/*/url + type:x:@.arguments/*/type + + // Checking if caller wants us to execute some lambda object once we're done. + if + exists:x:@.arguments/*/.onafter + .lambda + eval:x:@.arguments/*/.onafter + + else + + /* + * Site did not have a valid sitemap, hence we + * try to crawl it manually instead. + * + * This is the list of URLs we should scrape. + */ + .urls + + // This is the list of URLs we already have scraped. + .done + + // Adding root URL to above list of URLs to be crawled. + unwrap:x:+/*/* + add:x:@.urls + . + .:x:@.arguments/*/url + + // No sitemap(s) found, informing user + sockets.signal:x:@.arguments/*/feedback-channel + args + message:Could not find any valid sitemaps + type:warning + sleep:100 + + // Informing frontend of that we'll try to crawl site. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:Trying to crawl site even though we did not find a valid sitemap + type:info + sleep:100 + + /* + * Looping through all above [.urls] as long as we don't exceed [max] argument, + * and for as long as we have URLs to scrape. + */ + while + and + exists:x:@.urls/* + lt + get-count:x:@.done/* + get-value:x:@.arguments/*/max + .lambda + + // Adding spacer. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + + /* + * Scraping first URL in above [.urls] informing slot that + * we want it to return URLs found during scraping. 
+ */ + unwrap:x:+/* + signal:magic.ai.url.scrape + url:x:@.urls/0 + type:x:@.arguments/*/type + images:bool:true + code:bool:true + lists:bool:true + main:bool:true + empty-completion:bool:false + threshold:x:@.arguments/*/threshold + feedback-channel:x:@.arguments/*/feedback-channel + + /* + * Adding currently iterated URL to [.done] and removing it + * from above [.urls] collection. + */ + add:x:@.done + get-nodes:x:@.urls/0 + remove-nodes:x:@.urls/0 + + /* + * Adding all URLs returned in above invocation to above [.urls] collection, + * unless we've already crawled the URL. + */ + for-each:x:@signal/* + + // Checking if URL has been imported or added before, and that it matches base URL provided by caller. + if + and + not-exists:x:@.done/*/={@.dp/#} + not-exists:x:@.urls/*/={@.dp/#} + strings.starts-with:x:@.dp/# + get-value:x:@.arguments/*/url + .lambda + + // Adding URL to [.urls] collection. + add:x:@.urls + get-nodes:x:@.dp/# + + // Signaling frontend that we're waiting for n seconds. + strings.concat + .:"Waiting for " + math.divide:x:@.arguments/*/delay + .:int:1000 + .:" seconds to avoid exhausting web server" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Sleeping for [delay] milliseconds to avoid exhausting web server. + sleep:x:@.arguments/*/delay + + // Adding spacer. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + + // Informing frontend of that we're done crawling. + strings.concat + .:"Done scraping " + get-count:x:@.done/* + .:" URLs" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Basic logging. + log.info:OpenAI training data successfully created + url:x:@.arguments/*/url + type:x:@.arguments/*/type + + // Checking if caller wants us to execute some lambda object once we're done. + if + exists:x:@.arguments/*/.onafter + .lambda + eval:x:@.arguments/*/.onafter diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl index 6533d40892..9f06629230 100644 --- a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl +++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl @@ -12,459 +12,10 @@ slots.create:magic.ai.crawl-site // Making sure exceptions does not leave thread. try - /* - * Loading robots.txt from specified [url]. - */ - unwrap:x:+/* - signal:magic.ai.load-robots - url:x:@.arguments/*/url - feedback-channel:x:@.arguments/*/feedback-channel - - // Checking if site contains a robots.txt file. - if - eq:x:@signal/*/found - .:bool:true - .lambda - - // Site contains a robots.txt file, signaling frontend of that fact. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:Site has robots.txt - type:info - sleep:100 - - // Signaling frontend how many sitemaps we found in robots.txt file. - strings.concat - .:"Found " - get-count:x:@signal/*/sitemap/* - .:" sitemaps in robots.txt file" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Checking if robots.txt contains a crawl-delay. - if - exists:x:@signal/*/crawl-delay - .lambda - - // Updating delay to value from robots.txt. 
- remove-nodes:x:@.arguments/*/delay - unwrap:x:+/* - validators.default:x:@.arguments - delay:x:@signal/*/crawl-delay - - - // Signaling frontend to inform of that we found a crawl-delay value. - strings.concat - .:"Robots.txt file contains a Crawl-Delay value of " - math.divide:x:@signal/*/crawl-delay - .:int:1000 - .:" seconds" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - - else - - // Site does not contain a robots.txt file, signaling that fact to frontend. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:Could not find a robots.txt file for website - type:warning - sleep:100 - strings.concat - .:"We will try to retrieve sitemap from " - get-value:x:@signal/*/sitemap/0 - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - /* - * Checking if we should filter according to URL, as - * in caller provided a sub-folder URL such as foo.com/bar, at which - * point we only import URLs below /bar hierarchy. - * - * Default value is false, implying robots.txt file is solely responsible - * for filtering. - */ - .filter-on-url:bool:false - strings.split:x:@.arguments/*/url - .:/ - if - mt - get-count:x:@strings.split/* - .:int:2 - .lambda - set-value:x:@.filter-on-url - .:bool:true - - /* - * Trying to load URLs from sitemap returned from above invocation. - */ - add:x:./*/signal/[1,2] - get-nodes:x:@signal/*/sitemap - get-nodes:x:@signal/*/allow - get-nodes:x:@signal/*/disallow - unwrap:x:+/* - signal:magic.ai.load-sitemap - max:x:@.arguments/*/max - feedback-channel:x:@.arguments/*/feedback-channel - url:x:@.arguments/*/url - filter-on-url:x:@.filter-on-url - - // Signaling user what we're about to do. - strings.concat - .:"Deleting old snippets for type '" - get-value:x:@.arguments/*/type - .:"' matching URL of " - get-value:x:@.arguments/*/url - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Deleting all old training snippets matching specified URL and type. - .uri - set-value:x:@.uri - strings.concat - get-value:x:@.arguments/*/url - .:% - data.connect:[generic|magic] - data.execute:@" -delete from vss_ml_training_snippets - where rowid in (select id as rowid from ml_training_snippets where type = @type and uri like @uri); -delete from ml_training_snippets where type = @type and uri like @uri;" - type:x:@.arguments/*/type - uri:x:@.uri - - // Verifying we found at least one sitemap. - if - eq:x:@signal/*/has-sitemap - .:bool:true - .lambda - - /* - * We found at least one sitemap. - * - * Signaling frontend how many URLs we found, and how many there are in total. - */ - get-count:x:@signal/*/urls/* - strings.concat - .:"We found " - get-value:x:@signal/*/total - .:" URLs in sitemap(s), we will be scraping " - get-value:x:@get-count - .:" URLs" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Checking if site contains more URLs than we're scraping. - if - eq:x:@get-count - .:int:0 - .lambda - - // Warning user! 
- strings.concat - .:"Warning, we could not find a single valid URL in site, probably because sitemap or robots.txt file prohibits scraping, or because your filter is entirely excluded from robots.txt for AINIRO selector" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:warning - sleep:100 - else-if - mt - get-value:x:@signal/*/total - get-value:x:@get-count - .lambda - - // Warning user! - strings.concat - .:"Warning, site contains more than " - get-value:x:@get-count - .:" URLs and will only be partially scraped" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:warning - sleep:100 - - // Feedback about URLs we're about to scrape, but only if there are any URLs. - if - mt:x:@get-count - .:int:0 - .lambda - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - sockets.signal:x:@.arguments/*/feedback-channel - args - message:"URLs we will scrape are as follows:" - type:info - sleep:100 - - // Iterating through each URL returned from above invocation. - for-each:x:@signal/*/urls/* - - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@.dp/# - type:info - sleep:100 - - // Iterating through each URL returned from above invocation. - for-each:x:@signal/*/urls/* - - // Making sure we trap exceptions. - try - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - - // Scraping currently iterated URL. - unwrap:x:+/* - signal:magic.ai.url.scrape - url:x:@.dp/# - type:x:@.arguments/*/type - threshold:x:@.arguments/*/threshold - feedback-channel:x:@.arguments/*/feedback-channel - - // Verifying we've got more snippets before applying Crawl-Delay - if - neq:x:@.dp/# - get-value:x:@signal/@signal/*/urls/0/- - .lambda - - // Signaling frontend that we're waiting for n seconds. - strings.concat - .:"Waiting for " - math.divide:x:@.arguments/*/delay - .:int:1000 - .:" seconds to avoid exhausting web server" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Sleeping for [delay] milliseconds to avoid exhausting web server. - sleep:x:@.arguments/*/delay - - .catch - - // Logging as error. - log.error:Could not scrape URL - url:x:@.dp/# - message:x:@.arguments/*/message - - // Signaling frontend to inform about error. - strings.concat - .:"Could not scrape URL, error was: '" - get-value:x:@.arguments/*/message - .:"'" - unwrap:x:+/** - sockets.signal:x:@.arguments/@.arguments/*/feedback-channel - roles:root - args - message:x:@strings.concat - type:warning - sleep:100 - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - - /* - * Crawling is done. - * Making sure we notify client that we're done and do some logging. - */ - sockets.signal:magic.backend.message - roles:root - args - message:Done creating OpenAI training data from URL - type:success - sleep:100 - - // Basic logging. 
- log.info:OpenAI training data successfully created - url:x:@.arguments/*/url - type:x:@.arguments/*/type - - // Checking if caller wants us to execute some lambda object once we're done. - if - exists:x:@.arguments/*/.onafter - .lambda - eval:x:@.arguments/*/.onafter - - else - - /* - * Site did not have a valid sitemap, hence we - * try to crawl it manually instead. - * - * This is the list of URLs we should scrape. - */ - .urls - - // This is the list of URLs we already have scraped. - .done - - // Adding root URL to above list of URLs to be crawled. - unwrap:x:+/*/* - add:x:@.urls - . - .:x:@.arguments/*/url - - // No sitemap(s) found, informing user - sockets.signal:x:@.arguments/*/feedback-channel - args - message:Could not find any valid sitemaps - type:warning - sleep:100 - - // Informing frontend of that we'll try to crawl site. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:Trying to crawl site even though we did not find a valid sitemap - type:info - sleep:100 - - /* - * Looping through all above [.urls] as long as we don't exceed [max] argument, - * and for as long as we have URLs to scrape. - */ - while - and - exists:x:@.urls/* - lt - get-count:x:@.done/* - get-value:x:@.arguments/*/max - .lambda - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - - /* - * Scraping first URL in above [.urls] informing slot that - * we want it to return URLs found during scraping. - */ - unwrap:x:+/* - signal:magic.ai.url.scrape - url:x:@.urls/0 - type:x:@.arguments/*/type - images:bool:true - code:bool:true - lists:bool:true - main:bool:true - empty-completion:bool:false - threshold:x:@.arguments/*/threshold - feedback-channel:x:@.arguments/*/feedback-channel - - /* - * Adding currently iterated URL to [.done] and removing it - * from above [.urls] collection. - */ - add:x:@.done - get-nodes:x:@.urls/0 - remove-nodes:x:@.urls/0 - - /* - * Adding all URLs returned in above invocation to above [.urls] collection, - * unless we've already crawled the URL. - */ - for-each:x:@signal/* - - // Checking if URL has been imported or added before, and that it matches base URL provided by caller. - if - and - not-exists:x:@.done/*/={@.dp/#} - not-exists:x:@.urls/*/={@.dp/#} - strings.starts-with:x:@.dp/# - get-value:x:@.arguments/*/url - .lambda - - // Adding URL to [.urls] collection. - add:x:@.urls - get-nodes:x:@.dp/# - - // Signaling frontend that we're waiting for n seconds. - strings.concat - .:"Waiting for " - math.divide:x:@.arguments/*/delay - .:int:1000 - .:" seconds to avoid exhausting web server" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Sleeping for [delay] milliseconds to avoid exhausting web server. - sleep:x:@.arguments/*/delay - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - - // Informing frontend of that we're done crawling. - strings.concat - .:"Done scraping " - get-count:x:@.done/* - .:" URLs" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Basic logging. 
- log.info:OpenAI training data successfully created - url:x:@.arguments/*/url - type:x:@.arguments/*/type - - // Checking if caller wants us to execute some lambda object once we're done. - if - exists:x:@.arguments/*/.onafter - .lambda - eval:x:@.arguments/*/.onafter + // Invoking slot responsible for doing the actual crawling. + add:x:./*/signal + get-nodes:x:@.arguments/* + signal:magic.ai.crawl-site-on-thread .catch diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl index 0a5991e2eb..c2097ce6ee 100644 --- a/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl +++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl @@ -36,14 +36,12 @@ slots.create:magic.ai.html.extract // Converting HTML to Markdown. html2markdown:x:@.arguments/*/html url:x:@.arguments/*/url - set-value:x:@.markdown - get-value:x:@html2markdown - // Checking if site is SPA, at which point we return early. + // Checking if we have any Markdown at all, and if not we return early. if or - null:x:@.markdown - eq:x:@.markdown + null:x:@html2markdown + eq:x:@html2markdown .: .lambda @@ -54,107 +52,60 @@ slots.create:magic.ai.html.extract meta main:int:0 - // Converting raw HTML to lambda to allow us to extract title, description, hyperlinks, etc. - .html-lambda - add:x:@.html-lambda - html2lambda:x:@.arguments/*/html - - /* - * Finding URLs from document. - * - * Notice, for simplicity reasons we do this by round tripping through HTML, - * for then to convert HTML to lambda, and iterate through each anchor HTML - * element in lambda. - * - * This is not optimal, and could be optimised, but it keeps the code - * DRY at least, since our [html2markdown] slot at this point have resolved - * our relative URLs ... - */ - markdown2html:x:@html2markdown - html2lambda:x:@markdown2html - for-each:x:@html2lambda/**/a/*/\@href - - /* - * Notice, the URLs we return are for the scraper to crawl and scrape, - * so we don't return mailto or tel URLs here, and we only return URLs - * from the same domain. - */ - if - and - not - strings.starts-with:x:@.dp/# - .:"mailto:" - not - strings.starts-with:x:@.dp/# - .:"tel:" - .lambda - - // Removing '#'. - .url - set-value:x:@.url - get-value:x:@.dp/# - strings.split:x:@.url - .:# - set-value:x:@.url - get-value:x:@strings.split/0 - - // Valid URL, now checking if it's the same domain. - .local - strings.split:x:@.arguments/*/url - .:"/" - set-value:x:@.local - get-value:x:@strings.split/1 - .current - strings.split:x:@.url - .:"/" - set-value:x:@.current - get-value:x:@strings.split/1 - if - eq:x:@.local - get-value:x:@.current - .lambda - - // Local URL, now making sure it's not the same URL. - strings.trim-end:x:@.url - .:/ - strings.trim-end:x:@.arguments/*/url - .:/ - if - neq:x:@strings.trim-end - get-value:x:@strings.trim-end/@strings.trim-end - .lambda - - // Not the same URLas the one we'recurrently scraping. - unwrap:x:+/*/* - add:x:@.urls - . - .:x:@.url - - // Setting title and description from document. + // Retrieving title and description from document. set-value:x:@.title - get-value:x:@.html-lambda/**/head/**/title/*/\#text + get-value:x:@html2markdown/*/title set-value:x:@.description - get-value:x:@.html-lambda/**/head/**/meta/*/\@name/=description/./*/\@content + get-value:x:@html2markdown/*/description + + // Adding URLs found as we transformed HTML to Markdown. 
+   add:x:@.urls
+      get-nodes:x:@html2markdown/*/urls/*

-   // Creating our prompt.
+   // Storing Markdown in above buffer node.
+   set-value:x:@.markdown
+      get-value:x:@html2markdown
+
+   /*
+    * Creating our "base prompt", which is the default to be used for
+    * all training snippets found in document.
+    */
    .prompt
    set-value:x:@.prompt
       get-first-value
          get-value:x:@.title
-         get-value:x:@html2lambda/**/h1/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h2/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h3/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h4/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h5/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h6/[0,1]/*/\#text
          get-value:x:@.description
          .:"Page"

-   // Breaking page into sections.
+   /*
+    * Breaking page into sections.
+    *
+    * Here we are breaking the page down into smaller training snippets, based upon
+    * lists found at root, and pre sections (code).
+    *
+    * This is done to avoid creating summaries of lists that typically might contain
+    * navbar parts and URLs, in addition to making sure we keep all code segments as is.
+    *
+    * The [.tmp-prompt] below is being used as the prompt for individual OL, UL and PRE
+    * elements, while the [.remaining] part is the remaining Markdown after we've removed
+    * all UL, OL and PRE elements.
+    */
    .remaining
    .tmp-prompt
    set-value:x:@.tmp-prompt
       get-value:x:@.prompt
+
+   /*
+    * For simplicity reasons we convert Markdown to HTML and then to lambda again,
+    * to allow us to semantically traverse UL elements, OL elements and PRE elements.
+    *
+    * This allows us to extract UL, OL, and PRE elements, and import these as individual
+    * training snippets.
+    */
+   markdown2html:x:@html2markdown
+   html2lambda:x:@markdown2html
+
+   // Iterating through all root nodes found as we converted Markdown back to lambda again.
    for-each:x:@html2lambda/*

       get-name:x:@.dp/#
       switch:x:@get-name

          case:h1
          case:h2
          case:h3
          case:h4
          case:h5
          case:h6
+
+            /*
+             * If page contains Hx element, we append it to our base
+             * prompt, to try to keep as much of the (relevant) information as
+             * possible in our prompt for our UL, OL and PRE training snippets.
+             */
             set-value:x:@.tmp-prompt
-               get-value:x:@.dp/#/*/\#text
+               strings.concat
+                  get-value:x:@.prompt
+                  .:" | "
+                  get-value:x:@.dp/#/*/\#text

          case:ul
          case:ol
          case:pre

             /*
-             * To avoid repeating navbars "everywhere" we remove these from Markdown and returns
-             * these as individual training snippets. This will ensure navbars are only imported
-             * once, as a separated training snippet.
+             * This is a bulleted list, ordered list, or a code segment.
+             *
+             * We keep it exactly as is, but return it as an individual training snippet,
+             * such that it becomes an isolated training snippet during
+             * import.
              */
             lambda2html:x:@.dp/#
             html2markdown:x:@lambda2html
                url:x:@.arguments/*/url
-            strings.concat
-               get-value:x:@.title
-               .:" | "
-               get-value:x:@.tmp-prompt
             unwrap:x:+/*/*/*
             add:x:@.snippets
                .
                   .
-                     prompt:x:@strings.concat
+                     prompt:x:@.tmp-prompt
                      completion:x:@html2markdown

          default

             add:x:@.remaining
                get-nodes:x:@.dp/#

+   /*
+    * Now we have removed all UL, OL and PRE elements, and [.remaining] contains
+    * a lambda node hierarchy of everything that remains in the page, at which
+    * point we can transform it back to HTML for then to transform it to Markdown,
+    * and return that as an individual snippet, without PRE, UL and OL elements.
+    */
+   lambda2html:x:@.remaining/*
    set-value:x:@.markdown
       html2markdown:x:@lambda2html
          url:x:@.arguments/*/url

+   // Adding remaining HTML as an individual training snippet.
    .completion
    set-value:x:@.completion
       get-value:x:@.markdown
    unwrap:x:+/*/*/*
    add:x:@.snippets
       .
          .
             prompt:x:@.prompt
             completion:x:@.completion

-   // Returning snippets to caller if we could find anything.
-   if
-      exists:x:@.snippets/*
-      .lambda
-
-      // Returning snippets and meta information to caller.
-      add:x:./*/return/*/snippets
-         get-nodes:x:@.snippets/*
-      add:x:./*/return/*/meta
-         get-nodes:x:@.meta/*
-      add:x:./*/return/*/urls
-         get-nodes:x:@.urls/*
-      return
-         urls
-         snippets
-         meta
+   // Returning snippets and meta information to caller.
+   add:x:./*/return/*/snippets
+      get-nodes:x:@.snippets/*
+   add:x:./*/return/*/meta
+      get-nodes:x:@.meta/*
+   add:x:./*/return/*/urls
+      get-nodes:x:@.urls/*
+   return
+      urls
+      snippets
+      meta

diff --git a/backend/slots/Version.cs b/backend/slots/Version.cs
index 9b05da7ad2..3a0dbfc9fd 100644
--- a/backend/slots/Version.cs
+++ b/backend/slots/Version.cs
@@ -20,7 +20,7 @@ public class Version : ISlot
         /// Parameters passed from signaler
         public void Signal(ISignaler signaler, Node input)
         {
-            input.Value = "v17.3.3";
+            input.Value = "v17.3.4";
         }
     }
 }
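Note on usage: the refactor above turns [magic.ai.crawl-site] into a thin wrapper that simply forwards its arguments to the new [magic.ai.crawl-site-on-thread] slot. A minimal sketch of how the wrapper slot might be invoked is shown below; the slot and argument names ([url], [type], [delay], [max], [threshold] and [feedback-channel]) are taken from the diff, while the concrete values are illustrative assumptions only. [delay] is given in milliseconds, matching the division by 1000 the code performs when reporting it as seconds.

/*
 * Illustrative invocation only -- argument values below are assumptions, not part of the patch.
 */
signal:magic.ai.crawl-site
   url:"https://example.com"
   type:example-type
   delay:int:2000
   max:int:25
   threshold:int:150
   feedback-channel:example-channel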