Skip to content

Commit

Permalink
Started working on scraping sites without sitemap
Browse files Browse the repository at this point in the history
  • Loading branch information
polterguy committed Oct 8, 2023
1 parent 2a9dacc commit 048d1ec
Show file tree
Hide file tree
Showing 4 changed files with 286 additions and 78 deletions.
295 changes: 218 additions & 77 deletions backend/files/system/openai/magic.startup/magic.ai.crawl-site.hl
Original file line number Diff line number Diff line change
Expand Up @@ -101,106 +101,247 @@ slots.create:magic.ai.crawl-site
unwrap:x:+/*
signal:magic.ai.load-sitemap
max:x:@.arguments/*/max
lambda2hyper:x:-
log.info:x:-

// Signaling frontend how many URLs we found, and how many there are in total.
strings.concat
.:"We found "
get-value:x:@signal/*/total
.:" URLs in sitemap(s)"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:info
sleep:100

// Checking if site contains more URLs than we're scraping.
get-count:x:@signal/*/urls/*
// Verifying we found at least one sitemap.
if
mt
get-value:x:@signal/*/total
get-value:x:@get-count
eq:x:@signal/*/has-sitemap
.:bool:true
.lambda

// Warning user!
/*
* We found at least one sitemap.
*
* Signaling frontend how many URLs we found, and how many there are in total.
*/
strings.concat
.:"Warning, site contains more than "
get-value:x:@get-count
.:" URLs and will only be partially scraped"
.:"We found "
get-value:x:@signal/*/total
.:" URLs in sitemap(s)"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:warning
type:info
sleep:100

// Iterating through each URL returned from above invocation.
for-each:x:@signal/*/urls/*
// Checking if site contains more URLs than we're scraping.
get-count:x:@signal/*/urls/*
if
mt
get-value:x:@signal/*/total
get-value:x:@get-count
.lambda

// Making sure we trap exceptions.
try
// Warning user!
strings.concat
.:"Warning, site contains more than "
get-value:x:@get-count
.:" URLs and will only be partially scraped"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:warning
sleep:100

// Scraping currently iterated URL.
unwrap:x:+/*
signal:magic.ai.url.scrape
url:x:@.dp/#
type:x:@.arguments/*/type
images:bool:true
code:bool:true
lists:bool:true
main:bool:true
empty-completion:bool:true
threshold:x:@.arguments/*/threshold

// Signaling frontend that we're waiting for n seconds.
strings.concat
.:"Waiting for "
math.divide:x:@.arguments/*/delay
.:int:1000
.:" seconds to avoid exhausting web server"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
// Iterating through each URL returned from above invocation.
for-each:x:@signal/*/urls/*

// Making sure we trap exceptions.
try

// Scraping currently iterated URL.
unwrap:x:+/*
signal:magic.ai.url.scrape
url:x:@.dp/#
type:x:@.arguments/*/type
images:bool:true
code:bool:true
lists:bool:true
main:bool:true
empty-completion:bool:true
threshold:x:@.arguments/*/threshold

// Signaling frontend that we're waiting for n seconds.
strings.concat
.:"Waiting for "
math.divide:x:@.arguments/*/delay
.:int:1000
.:" seconds to avoid exhausting web server"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:info
sleep:100

// Sleeping for [delay] milliseconds to avoid exhausting web server.
sleep:x:@.arguments/*/delay

.catch

// Logging as error.
log.error:Could not scrape URL
url:x:@.dp/#
message:x:@.arguments/*/message

/*
* Crawling is done.
* Making sure we notify client that we're done and do some logging.
*/
sockets.signal:magic.backend.message
roles:root
args
message:x:@strings.concat
type:info
message:Done creating OpenAI training data from URL
type:success
sleep:100

// Sleeping for [delay] milliseconds to avoid exhausting web server.
sleep:x:@.arguments/*/delay
// Basic logging.
log.info:OpenAI training data successfully created
url:x:@.arguments/*/url
type:x:@.arguments/*/type

.catch
// Checking if caller wants us to execute some lambda object once we're done.
if
exists:x:@.arguments/*/.onafter
.lambda
eval:x:@.arguments/*/.onafter

// Logging as error.
log.error:Could not scrape URL
url:x:@.dp/#
message:x:@.arguments/*/message
else

/*
* Crawling is done.
* Making sure we notify client that we're done and do some logging.
*/
sockets.signal:magic.backend.message
roles:root
args
message:Done creating OpenAI training data from URL
type:success
sleep:100
/*
* Site did not have a valid sitemap, hence we
* try to crawl it manually instead.
*
* This is the list of URLs we should scrape.
*/
.urls

// Basic logging.
log.info:OpenAI training data successfully created
url:x:@.arguments/*/url
type:x:@.arguments/*/type
// This is the list of URLs we already have scraped.
.done

// Checking if caller wants us to execute some lambda object once we're done.
if
exists:x:@.arguments/*/.onafter
.lambda
eval:x:@.arguments/*/.onafter
// Adding root URL to above list of URLs to be crawled.
unwrap:x:+/*/*
add:x:@.urls
.
.:x:@.arguments/*/url

// Informing frontend that we'll try to crawl the site.
sockets.signal:magic.backend.chatbot
roles:root
args
message:Trying to crawl site even though we did not find a valid sitemap
type:info
sleep:100

/*
* Looping through all above [.urls] as long as we don't exceed [max] argument,
* and for as long as we have URLs to scrape.
*/
while
and
exists:x:@.urls/*
lt
get-count:x:@.done/*
get-value:x:@.arguments/*/max
.lambda

/*
* Scraping first URL in above [.urls] informing slot that
* we want it to return URLs found during scraping.
*/
unwrap:x:+/*
signal:magic.ai.url.scrape
url:x:@.urls/0
type:x:@.arguments/*/type
images:bool:true
code:bool:true
lists:bool:true
main:bool:true
empty-completion:bool:true
threshold:x:@.arguments/*/threshold

/*
* Adding currently iterated URL to [.done] and removing it
* from above [.urls] collection.
*/
add:x:@.done
get-nodes:x:@.urls/0
remove-nodes:x:@.urls/0

/*
* Adding all URLs returned in above invocation to above [.urls] collection,
* unless we've already crawled the URL.
*/
for-each:x:@signal/*

// Verifying we have not already scraped the URL.
.exists:bool:false
for-each:x:@.done/*

// Checking if URL exists in [.done] collection.
if
eq:x:@.dp/#
get-value:x:@.dp/@.dp/#
.lambda

// URL has already been imported.
set-value:x:@.exists
.:bool:true

// Checking if URL has been imported before.
if
eq:x:@.exists
.:bool:false
.lambda

// Adding URL to [.urls] collection.
add:x:@.urls
get-nodes:x:@.dp/#

// Signaling frontend that we're waiting for n seconds.
strings.concat
.:"Waiting for "
math.divide:x:@.arguments/*/delay
.:int:1000
.:" seconds to avoid exhausting web server"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:info
sleep:100

// Sleeping for [delay] milliseconds to avoid exhausting web server.
sleep:x:@.arguments/*/delay

// Informing frontend that we're done crawling.
strings.concat
.:"Done scraping "
get-count:x:@.done/*
.:" URLs"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:info
sleep:100

// Basic logging.
log.info:OpenAI training data successfully created
url:x:@.arguments/*/url
type:x:@.arguments/*/type

// Checking if caller wants us to execute some lambda object once we're done.
if
exists:x:@.arguments/*/.onafter
.lambda
eval:x:@.arguments/*/.onafter

.catch

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,59 @@ slots.create:magic.ai.html.extract-snippets
// Buffer used for snippet to return.
.snippets

// Buffer used to hold all URLs found during scraping.
.urls

// Used to store a reference to above HTML transformed to lambda.
.document
set-value:x:@.document
reference:x:@html2lambda/*/html

/*
* Looping through entire document to find URLs in it such that we
* can return this to caller.
*/
for-each:x:@.document/#/**/a/*/\@href

// Sanity checking currently iterated URL.
if
and
not-null:x:@.dp/#
neq:x:@.dp/#
.:
not
strings.starts-with:x:@.dp/#
.:javascript
not
strings.starts-with:x:@.dp/#
.:void
not
exists:x:./*/rel/=nofollow
.lambda

// Removing hash tag parts, if existing.
strings.split:x:@.dp/#
.:#
unwrap:x:+/*
signal:magic.url.normalize
url:x:@strings.split/0
base:x:@.base
scheme:x:@.scheme

// Verifying this is a local URL.
if
strings.starts-with:x:@signal
get-value:x:@.base
.lambda

// This is a local URL, trimming trailing slash.
strings.trim-end:x:@signal
.:/
unwrap:x:+/*/*
add:x:@.urls
.
.:x:@strings.trim-end

/*
* Checking if caller wants images.
*
Expand Down Expand Up @@ -210,6 +258,9 @@ slots.create:magic.ai.html.extract-snippets
get-nodes:x:@.snippets/*
add:x:./*/return/*/meta
get-nodes:x:@.meta/*
add:x:./*/return/*/urls
get-nodes:x:@.urls/*
return
urls
snippets
meta
Loading

0 comments on commit 048d1ec

Please sign in to comment.