From ed9ff89c96119149815d2371beb7ec489d9514b8 Mon Sep 17 00:00:00 2001
From: Thomas Hansen
Date: Fri, 9 Feb 2024 12:50:56 +0200
Subject: [PATCH] Getting ready for release
---
backend/backend.csproj | 2 +-
.../crawling/magic.ai.crawl-site-on-thread.hl | 459 ++++++++++++++++++
.../crawling/magic.ai.crawl-site.hl | 457 +----------------
.../crawling/magic.ai.html.extract.hl | 194 +++-----
backend/slots/Version.cs | 2 +-
5 files changed, 542 insertions(+), 572 deletions(-)
create mode 100644 backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl
diff --git a/backend/backend.csproj b/backend/backend.csproj
index 9c17a97491..5d3bec2f14 100644
--- a/backend/backend.csproj
+++ b/backend/backend.csproj
@@ -26,7 +26,7 @@
-
+
diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl
new file mode 100644
index 0000000000..4eba515511
--- /dev/null
+++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl
@@ -0,0 +1,459 @@
+
+/*
+ * Crawls the specified website generating training data for machine learning in the process.
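+ *
+ * Uses [url], [type], [threshold], [max], [delay] and [feedback-channel] from its arguments,
+ * in addition to an optional [.onafter] lambda object that is executed once crawling is done.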
+ */
+slots.create:magic.ai.crawl-site-on-thread
+
+ /*
+ * Loading robots.txt from specified [url].
+ */
+ unwrap:x:+/*
+ signal:magic.ai.load-robots
+ url:x:@.arguments/*/url
+ feedback-channel:x:@.arguments/*/feedback-channel
+
+ // Checking if site contains a robots.txt file.
+ if
+ eq:x:@signal/*/found
+ .:bool:true
+ .lambda
+
+ // Site contains a robots.txt file, signaling that fact to frontend.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:Site has robots.txt
+ type:info
+ sleep:100
+
+ // Signaling frontend how many sitemaps we found in robots.txt file.
+ strings.concat
+ .:"Found "
+ get-count:x:@signal/*/sitemap/*
+ .:" sitemaps in robots.txt file"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:info
+ sleep:100
+
+ // Checking if robots.txt contains a crawl-delay.
+ if
+ exists:x:@signal/*/crawl-delay
+ .lambda
+
+ // Updating delay to value from robots.txt.
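+ // Notice, [crawl-delay] is given in milliseconds, which is also what the [sleep] invocations further down expect.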
+ remove-nodes:x:@.arguments/*/delay
+ unwrap:x:+/*
+ validators.default:x:@.arguments
+ delay:x:@signal/*/crawl-delay
+
+
+ // Signaling frontend that we found a crawl-delay value.
+ strings.concat
+ .:"Robots.txt file contains a Crawl-Delay value of "
+ math.divide:x:@signal/*/crawl-delay
+ .:int:1000
+ .:" seconds"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:info
+
+ else
+
+ // Site does not contain a robots.txt file, signaling that fact to frontend.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:Could not find a robots.txt file for website
+ type:warning
+ sleep:100
+ strings.concat
+ .:"We will try to retrieve sitemap from "
+ get-value:x:@signal/*/sitemap/0
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:info
+ sleep:100
+
+ /*
+ * Checking if we should filter according to URL, i.e. if the caller provided a
+ * sub-folder URL such as foo.com/bar, at which point we only import URLs below
+ * the /bar hierarchy.
+ *
+ * Default value is false, implying the robots.txt file is solely responsible
+ * for filtering.
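+ *
+ * The detection below relies on splitting the URL on '/'; a root URL such as
+ * "https://foo.com" yields no more than two parts, while a sub-folder URL such
+ * as "https://foo.com/bar" yields more, at which point we turn filtering on.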
+ */
+ .filter-on-url:bool:false
+ strings.split:x:@.arguments/*/url
+ .:/
+ if
+ mt
+ get-count:x:@strings.split/*
+ .:int:2
+ .lambda
+ set-value:x:@.filter-on-url
+ .:bool:true
+
+ /*
+ * Trying to load URLs from sitemap returned from above invocation.
+ */
+ add:x:./*/signal/[1,2]
+ get-nodes:x:@signal/*/sitemap
+ get-nodes:x:@signal/*/allow
+ get-nodes:x:@signal/*/disallow
+ unwrap:x:+/*
+ signal:magic.ai.load-sitemap
+ max:x:@.arguments/*/max
+ feedback-channel:x:@.arguments/*/feedback-channel
+ url:x:@.arguments/*/url
+ filter-on-url:x:@.filter-on-url
+
+ // Signaling user what we're about to do.
+ strings.concat
+ .:"Deleting old snippets for type '"
+ get-value:x:@.arguments/*/type
+ .:"' matching URL of "
+ get-value:x:@.arguments/*/url
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:info
+ sleep:100
+
+ // Deleting all old training snippets matching specified URL and type.
+ .uri
+ set-value:x:@.uri
+ strings.concat
+ get-value:x:@.arguments/*/url
+ .:%
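+
+ // Appending '%' such that the SQL below performs a LIKE prefix match, deleting every snippet whose URI starts with the specified [url].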
+ data.connect:[generic|magic]
+ data.execute:@"
+delete from vss_ml_training_snippets
+where rowid in (select id as rowid from ml_training_snippets where type = @type and uri like @uri);
+delete from ml_training_snippets where type = @type and uri like @uri;"
+ type:x:@.arguments/*/type
+ uri:x:@.uri
+
+ // Verifying we found at least one sitemap.
+ if
+ eq:x:@signal/*/has-sitemap
+ .:bool:true
+ .lambda
+
+ /*
+ * We found at least one sitemap.
+ *
+ * Signaling frontend how many URLs we found, and how many there are in total.
+ */
+ get-count:x:@signal/*/urls/*
+ strings.concat
+ .:"We found "
+ get-value:x:@signal/*/total
+ .:" URLs in sitemap(s), we will be scraping "
+ get-value:x:@get-count
+ .:" URLs"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:info
+ sleep:100
+
+ // Checking if site contains more URLs than we're scraping.
+ if
+ eq:x:@get-count
+ .:int:0
+ .lambda
+
+ // Warning user!
+ strings.concat
+ .:"Warning, we could not find a single valid URL in site"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:warning
+ sleep:100
+ else-if
+ mt
+ get-value:x:@signal/*/total
+ get-value:x:@get-count
+ .lambda
+
+ // Warning user!
+ strings.concat
+ .:"Warning, site contains more than "
+ get-value:x:@get-count
+ .:" URLs and will only be partially scraped"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:warning
+ sleep:100
+
+ // Feedback about URLs we're about to scrape, but only if there are any URLs.
+ if
+ mt:x:@get-count
+ .:int:0
+ .lambda
+
+ // Adding spacer.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:------------------------------------------------------------------------------------------------------------------------
+ type:info
+ sleep:100
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:"URLs we will scrape are as follows:"
+ type:info
+ sleep:100
+
+ // Iterating through each URL returned from above invocation, informing frontend of the URLs we're about to scrape.
+ for-each:x:@signal/*/urls/*
+
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@.dp/#
+ type:info
+ sleep:10
+
+ // Iterating through each URL returned from above invocation, scraping each of them.
+ for-each:x:@signal/*/urls/*
+
+ // Making sure we trap exceptions.
+ try
+
+ // Adding spacer.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:------------------------------------------------------------------------------------------------------------------------
+ type:info
+ sleep:100
+
+ // Scraping currently iterated URL.
+ unwrap:x:+/*
+ signal:magic.ai.url.scrape
+ url:x:@.dp/#
+ type:x:@.arguments/*/type
+ threshold:x:@.arguments/*/threshold
+ feedback-channel:x:@.arguments/*/feedback-channel
+
+ // Verifying we've got more URLs to scrape before applying Crawl-Delay.
+ if
+ neq:x:@.dp/#
+ get-value:x:@signal/@signal/*/urls/0/-
+ .lambda
+
+ // Signaling frontend that we're waiting for n seconds.
+ strings.concat
+ .:"Waiting for "
+ math.divide:x:@.arguments/*/delay
+ .:int:1000
+ .:" seconds to avoid exhausting web server"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:info
+ sleep:100
+
+ // Sleeping for [delay] milliseconds to avoid exhausting web server.
+ sleep:x:@.arguments/*/delay
+
+ .catch
+
+ // Logging as error.
+ log.error:Could not scrape URL
+ url:x:@.dp/#
+ message:x:@.arguments/*/message
+
+ // Signaling frontend to inform about error.
+ strings.concat
+ .:"Could not scrape URL, error was: '"
+ get-value:x:@.arguments/*/message
+ .:"'"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/@.arguments/*/feedback-channel
+ roles:root
+ args
+ message:x:@strings.concat
+ type:warning
+ sleep:100
+
+ // Adding spacer.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:------------------------------------------------------------------------------------------------------------------------
+ type:info
+ sleep:100
+
+ /*
+ * Crawling is done.
+ * Making sure we notify client that we're done and do some logging.
+ */
+ sockets.signal:magic.backend.message
+ roles:root
+ args
+ message:Done creating OpenAI training data from URL
+ type:success
+ sleep:100
+
+ // Basic logging.
+ log.info:OpenAI training data successfully created
+ url:x:@.arguments/*/url
+ type:x:@.arguments/*/type
+
+ // Checking if caller wants us to execute some lambda object once we're done.
+ if
+ exists:x:@.arguments/*/.onafter
+ .lambda
+ eval:x:@.arguments/*/.onafter
+
+ else
+
+ /*
+ * Site did not have a valid sitemap, hence we
+ * try to crawl it manually instead.
+ *
+ * This is the list of URLs we should scrape.
+ */
+ .urls
+
+ // This is the list of URLs we already have scraped.
+ .done
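+
+ // Notice, [.done] serves two purposes; it prevents us from scraping the same URL twice, and its count is measured against [max] below to know when to stop crawling.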
+
+ // Adding root URL to above list of URLs to be crawled.
+ unwrap:x:+/*/*
+ add:x:@.urls
+ .
+ .:x:@.arguments/*/url
+
+ // No sitemap(s) found, informing user.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:Could not find any valid sitemaps
+ type:warning
+ sleep:100
+
+ // Informing frontend that we'll try to crawl the site.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:Trying to crawl site even though we did not find a valid sitemap
+ type:info
+ sleep:100
+
+ /*
+ * Looping through all above [.urls] as long as we don't exceed [max] argument,
+ * and for as long as we have URLs to scrape.
+ */
+ while
+ and
+ exists:x:@.urls/*
+ lt
+ get-count:x:@.done/*
+ get-value:x:@.arguments/*/max
+ .lambda
+
+ // Adding spacer.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:------------------------------------------------------------------------------------------------------------------------
+ type:info
+ sleep:100
+
+ /*
+ * Scraping first URL in above [.urls] informing slot that
+ * we want it to return URLs found during scraping.
+ */
+ unwrap:x:+/*
+ signal:magic.ai.url.scrape
+ url:x:@.urls/0
+ type:x:@.arguments/*/type
+ images:bool:true
+ code:bool:true
+ lists:bool:true
+ main:bool:true
+ empty-completion:bool:false
+ threshold:x:@.arguments/*/threshold
+ feedback-channel:x:@.arguments/*/feedback-channel
+
+ /*
+ * Adding currently iterated URL to [.done] and removing it
+ * from above [.urls] collection.
+ */
+ add:x:@.done
+ get-nodes:x:@.urls/0
+ remove-nodes:x:@.urls/0
+
+ /*
+ * Adding all URLs returned in above invocation to above [.urls] collection,
+ * unless we've already crawled the URL.
+ */
+ for-each:x:@signal/*
+
+ // Checking if URL has been imported or added before, and that it matches base URL provided by caller.
+ if
+ and
+ not-exists:x:@.done/*/={@.dp/#}
+ not-exists:x:@.urls/*/={@.dp/#}
+ strings.starts-with:x:@.dp/#
+ get-value:x:@.arguments/*/url
+ .lambda
+
+ // Adding URL to [.urls] collection.
+ add:x:@.urls
+ get-nodes:x:@.dp/#
+
+ // Signaling frontend that we're waiting for n seconds.
+ strings.concat
+ .:"Waiting for "
+ math.divide:x:@.arguments/*/delay
+ .:int:1000
+ .:" seconds to avoid exhausting web server"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:info
+ sleep:100
+
+ // Sleeping for [delay] milliseconds to avoid exhausting web server.
+ sleep:x:@.arguments/*/delay
+
+ // Adding spacer.
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:------------------------------------------------------------------------------------------------------------------------
+ type:info
+ sleep:100
+
+ // Informing frontend that we're done crawling.
+ strings.concat
+ .:"Done scraping "
+ get-count:x:@.done/*
+ .:" URLs"
+ unwrap:x:+/**
+ sockets.signal:x:@.arguments/*/feedback-channel
+ args
+ message:x:@strings.concat
+ type:info
+ sleep:100
+
+ // Basic logging.
+ log.info:OpenAI training data successfully created
+ url:x:@.arguments/*/url
+ type:x:@.arguments/*/type
+
+ // Checking if caller wants us to execute some lambda object once we're done.
+ if
+ exists:x:@.arguments/*/.onafter
+ .lambda
+ eval:x:@.arguments/*/.onafter
diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl
index 6533d40892..9f06629230 100644
--- a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl
+++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl
@@ -12,459 +12,10 @@ slots.create:magic.ai.crawl-site
// Making sure exceptions does not leave thread.
try
- /*
- * Loading robots.txt from specified [url].
- */
- unwrap:x:+/*
- signal:magic.ai.load-robots
- url:x:@.arguments/*/url
- feedback-channel:x:@.arguments/*/feedback-channel
-
- // Checking if site contains a robots.txt file.
- if
- eq:x:@signal/*/found
- .:bool:true
- .lambda
-
- // Site contains a robots.txt file, signaling frontend of that fact.
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:Site has robots.txt
- type:info
- sleep:100
-
- // Signaling frontend how many sitemaps we found in robots.txt file.
- strings.concat
- .:"Found "
- get-count:x:@signal/*/sitemap/*
- .:" sitemaps in robots.txt file"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:info
- sleep:100
-
- // Checking if robots.txt contains a crawl-delay.
- if
- exists:x:@signal/*/crawl-delay
- .lambda
-
- // Updating delay to value from robots.txt.
- remove-nodes:x:@.arguments/*/delay
- unwrap:x:+/*
- validators.default:x:@.arguments
- delay:x:@signal/*/crawl-delay
-
-
- // Signaling frontend to inform of that we found a crawl-delay value.
- strings.concat
- .:"Robots.txt file contains a Crawl-Delay value of "
- math.divide:x:@signal/*/crawl-delay
- .:int:1000
- .:" seconds"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:info
-
- else
-
- // Site does not contain a robots.txt file, signaling that fact to frontend.
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:Could not find a robots.txt file for website
- type:warning
- sleep:100
- strings.concat
- .:"We will try to retrieve sitemap from "
- get-value:x:@signal/*/sitemap/0
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:info
- sleep:100
-
- /*
- * Checking if we should filter according to URL, as
- * in caller provided a sub-folder URL such as foo.com/bar, at which
- * point we only import URLs below /bar hierarchy.
- *
- * Default value is false, implying robots.txt file is solely responsible
- * for filtering.
- */
- .filter-on-url:bool:false
- strings.split:x:@.arguments/*/url
- .:/
- if
- mt
- get-count:x:@strings.split/*
- .:int:2
- .lambda
- set-value:x:@.filter-on-url
- .:bool:true
-
- /*
- * Trying to load URLs from sitemap returned from above invocation.
- */
- add:x:./*/signal/[1,2]
- get-nodes:x:@signal/*/sitemap
- get-nodes:x:@signal/*/allow
- get-nodes:x:@signal/*/disallow
- unwrap:x:+/*
- signal:magic.ai.load-sitemap
- max:x:@.arguments/*/max
- feedback-channel:x:@.arguments/*/feedback-channel
- url:x:@.arguments/*/url
- filter-on-url:x:@.filter-on-url
-
- // Signaling user what we're about to do.
- strings.concat
- .:"Deleting old snippets for type '"
- get-value:x:@.arguments/*/type
- .:"' matching URL of "
- get-value:x:@.arguments/*/url
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:info
- sleep:100
-
- // Deleting all old training snippets matching specified URL and type.
- .uri
- set-value:x:@.uri
- strings.concat
- get-value:x:@.arguments/*/url
- .:%
- data.connect:[generic|magic]
- data.execute:@"
-delete from vss_ml_training_snippets
- where rowid in (select id as rowid from ml_training_snippets where type = @type and uri like @uri);
-delete from ml_training_snippets where type = @type and uri like @uri;"
- type:x:@.arguments/*/type
- uri:x:@.uri
-
- // Verifying we found at least one sitemap.
- if
- eq:x:@signal/*/has-sitemap
- .:bool:true
- .lambda
-
- /*
- * We found at least one sitemap.
- *
- * Signaling frontend how many URLs we found, and how many there are in total.
- */
- get-count:x:@signal/*/urls/*
- strings.concat
- .:"We found "
- get-value:x:@signal/*/total
- .:" URLs in sitemap(s), we will be scraping "
- get-value:x:@get-count
- .:" URLs"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:info
- sleep:100
-
- // Checking if site contains more URLs than we're scraping.
- if
- eq:x:@get-count
- .:int:0
- .lambda
-
- // Warning user!
- strings.concat
- .:"Warning, we could not find a single valid URL in site, probably because sitemap or robots.txt file prohibits scraping, or because your filter is entirely excluded from robots.txt for AINIRO selector"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:warning
- sleep:100
- else-if
- mt
- get-value:x:@signal/*/total
- get-value:x:@get-count
- .lambda
-
- // Warning user!
- strings.concat
- .:"Warning, site contains more than "
- get-value:x:@get-count
- .:" URLs and will only be partially scraped"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:warning
- sleep:100
-
- // Feedback about URLs we're about to scrape, but only if there are any URLs.
- if
- mt:x:@get-count
- .:int:0
- .lambda
-
- // Adding spacer.
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:------------------------------------------------------------------------------------------------------------------------
- type:info
- sleep:100
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:"URLs we will scrape are as follows:"
- type:info
- sleep:100
-
- // Iterating through each URL returned from above invocation.
- for-each:x:@signal/*/urls/*
-
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@.dp/#
- type:info
- sleep:100
-
- // Iterating through each URL returned from above invocation.
- for-each:x:@signal/*/urls/*
-
- // Making sure we trap exceptions.
- try
-
- // Adding spacer.
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:------------------------------------------------------------------------------------------------------------------------
- type:info
- sleep:100
-
- // Scraping currently iterated URL.
- unwrap:x:+/*
- signal:magic.ai.url.scrape
- url:x:@.dp/#
- type:x:@.arguments/*/type
- threshold:x:@.arguments/*/threshold
- feedback-channel:x:@.arguments/*/feedback-channel
-
- // Verifying we've got more snippets before applying Crawl-Delay
- if
- neq:x:@.dp/#
- get-value:x:@signal/@signal/*/urls/0/-
- .lambda
-
- // Signaling frontend that we're waiting for n seconds.
- strings.concat
- .:"Waiting for "
- math.divide:x:@.arguments/*/delay
- .:int:1000
- .:" seconds to avoid exhausting web server"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:info
- sleep:100
-
- // Sleeping for [delay] milliseconds to avoid exhausting web server.
- sleep:x:@.arguments/*/delay
-
- .catch
-
- // Logging as error.
- log.error:Could not scrape URL
- url:x:@.dp/#
- message:x:@.arguments/*/message
-
- // Signaling frontend to inform about error.
- strings.concat
- .:"Could not scrape URL, error was: '"
- get-value:x:@.arguments/*/message
- .:"'"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/@.arguments/*/feedback-channel
- roles:root
- args
- message:x:@strings.concat
- type:warning
- sleep:100
-
- // Adding spacer.
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:------------------------------------------------------------------------------------------------------------------------
- type:info
- sleep:100
-
- /*
- * Crawling is done.
- * Making sure we notify client that we're done and do some logging.
- */
- sockets.signal:magic.backend.message
- roles:root
- args
- message:Done creating OpenAI training data from URL
- type:success
- sleep:100
-
- // Basic logging.
- log.info:OpenAI training data successfully created
- url:x:@.arguments/*/url
- type:x:@.arguments/*/type
-
- // Checking if caller wants us to execute some lambda object once we're done.
- if
- exists:x:@.arguments/*/.onafter
- .lambda
- eval:x:@.arguments/*/.onafter
-
- else
-
- /*
- * Site did not have a valid sitemap, hence we
- * try to crawl it manually instead.
- *
- * This is the list of URLs we should scrape.
- */
- .urls
-
- // This is the list of URLs we already have scraped.
- .done
-
- // Adding root URL to above list of URLs to be crawled.
- unwrap:x:+/*/*
- add:x:@.urls
- .
- .:x:@.arguments/*/url
-
- // No sitemap(s) found, informing user
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:Could not find any valid sitemaps
- type:warning
- sleep:100
-
- // Informing frontend of that we'll try to crawl site.
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:Trying to crawl site even though we did not find a valid sitemap
- type:info
- sleep:100
-
- /*
- * Looping through all above [.urls] as long as we don't exceed [max] argument,
- * and for as long as we have URLs to scrape.
- */
- while
- and
- exists:x:@.urls/*
- lt
- get-count:x:@.done/*
- get-value:x:@.arguments/*/max
- .lambda
-
- // Adding spacer.
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:------------------------------------------------------------------------------------------------------------------------
- type:info
- sleep:100
-
- /*
- * Scraping first URL in above [.urls] informing slot that
- * we want it to return URLs found during scraping.
- */
- unwrap:x:+/*
- signal:magic.ai.url.scrape
- url:x:@.urls/0
- type:x:@.arguments/*/type
- images:bool:true
- code:bool:true
- lists:bool:true
- main:bool:true
- empty-completion:bool:false
- threshold:x:@.arguments/*/threshold
- feedback-channel:x:@.arguments/*/feedback-channel
-
- /*
- * Adding currently iterated URL to [.done] and removing it
- * from above [.urls] collection.
- */
- add:x:@.done
- get-nodes:x:@.urls/0
- remove-nodes:x:@.urls/0
-
- /*
- * Adding all URLs returned in above invocation to above [.urls] collection,
- * unless we've already crawled the URL.
- */
- for-each:x:@signal/*
-
- // Checking if URL has been imported or added before, and that it matches base URL provided by caller.
- if
- and
- not-exists:x:@.done/*/={@.dp/#}
- not-exists:x:@.urls/*/={@.dp/#}
- strings.starts-with:x:@.dp/#
- get-value:x:@.arguments/*/url
- .lambda
-
- // Adding URL to [.urls] collection.
- add:x:@.urls
- get-nodes:x:@.dp/#
-
- // Signaling frontend that we're waiting for n seconds.
- strings.concat
- .:"Waiting for "
- math.divide:x:@.arguments/*/delay
- .:int:1000
- .:" seconds to avoid exhausting web server"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:info
- sleep:100
-
- // Sleeping for [delay] milliseconds to avoid exhausting web server.
- sleep:x:@.arguments/*/delay
-
- // Adding spacer.
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:------------------------------------------------------------------------------------------------------------------------
- type:info
- sleep:100
-
- // Informing frontend of that we're done crawling.
- strings.concat
- .:"Done scraping "
- get-count:x:@.done/*
- .:" URLs"
- unwrap:x:+/**
- sockets.signal:x:@.arguments/*/feedback-channel
- args
- message:x:@strings.concat
- type:info
- sleep:100
-
- // Basic logging.
- log.info:OpenAI training data successfully created
- url:x:@.arguments/*/url
- type:x:@.arguments/*/type
-
- // Checking if caller wants us to execute some lambda object once we're done.
- if
- exists:x:@.arguments/*/.onafter
- .lambda
- eval:x:@.arguments/*/.onafter
+ // Invoking slot responsible for doing the actual crawling.
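+ // Notice, all arguments provided by the caller, including any [.onafter] lambda object, are forwarded to the slot as is.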
+ add:x:./*/signal
+ get-nodes:x:@.arguments/*
+ signal:magic.ai.crawl-site-on-thread
.catch
diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl
index 0a5991e2eb..c2097ce6ee 100644
--- a/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl
+++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl
@@ -36,14 +36,12 @@ slots.create:magic.ai.html.extract
// Converting HTML to Markdown.
html2markdown:x:@.arguments/*/html
url:x:@.arguments/*/url
- set-value:x:@.markdown
- get-value:x:@html2markdown
- // Checking if site is SPA, at which point we return early.
+ // Checking if we have any Markdown at all, and if not we return early.
if
or
- null:x:@.markdown
- eq:x:@.markdown
+ null:x:@html2markdown
+ eq:x:@html2markdown
.:
.lambda
@@ -54,107 +52,60 @@ slots.create:magic.ai.html.extract
meta
main:int:0
- // Converting raw HTML to lambda to allow us to extract title, description, hyperlinks, etc.
- .html-lambda
- add:x:@.html-lambda
- html2lambda:x:@.arguments/*/html
-
- /*
- * Finding URLs from document.
- *
- * Notice, for simplicity reasons we do this by round tripping through HTML,
- * for then to convert HTML to lambda, and iterate through each anchor HTML
- * element in lambda.
- *
- * This is not optimal, and could be optimised, but it keeps the code
- * DRY at least, since our [html2markdown] slot at this point have resolved
- * our relative URLs ...
- */
- markdown2html:x:@html2markdown
- html2lambda:x:@markdown2html
- for-each:x:@html2lambda/**/a/*/\@href
-
- /*
- * Notice, the URLs we return are for the scraper to crawl and scrape,
- * so we don't return mailto or tel URLs here, and we only return URLs
- * from the same domain.
- */
- if
- and
- not
- strings.starts-with:x:@.dp/#
- .:"mailto:"
- not
- strings.starts-with:x:@.dp/#
- .:"tel:"
- .lambda
-
- // Removing '#'.
- .url
- set-value:x:@.url
- get-value:x:@.dp/#
- strings.split:x:@.url
- .:#
- set-value:x:@.url
- get-value:x:@strings.split/0
-
- // Valid URL, now checking if it's the same domain.
- .local
- strings.split:x:@.arguments/*/url
- .:"/"
- set-value:x:@.local
- get-value:x:@strings.split/1
- .current
- strings.split:x:@.url
- .:"/"
- set-value:x:@.current
- get-value:x:@strings.split/1
- if
- eq:x:@.local
- get-value:x:@.current
- .lambda
-
- // Local URL, now making sure it's not the same URL.
- strings.trim-end:x:@.url
- .:/
- strings.trim-end:x:@.arguments/*/url
- .:/
- if
- neq:x:@strings.trim-end
- get-value:x:@strings.trim-end/@strings.trim-end
- .lambda
-
- // Not the same URL as the one we're currently scraping.
- unwrap:x:+/*/*
- add:x:@.urls
- .
- .:x:@.url
-
- // Setting title and description from document.
+ // Retrieving title and description from document.
set-value:x:@.title
- get-value:x:@.html-lambda/**/head/**/title/*/\#text
+ get-value:x:@html2markdown/*/title
set-value:x:@.description
- get-value:x:@.html-lambda/**/head/**/meta/*/\@name/=description/./*/\@content
+ get-value:x:@html2markdown/*/description
+
+ // Adding URLs found as we transformed HTML to Markdown.
+ add:x:@.urls
+ get-nodes:x:@html2markdown/*/urls/*
- // Creating our prompt.
+ // Storing Markdown in above buffer node.
+ set-value:x:@.markdown
+ get-value:x:@html2markdown
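+
+ // Notice, [html2markdown] returns the Markdown as its value, with [title], [description] and [urls] found in the document as children.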
+
+ /*
+ * Creating our "base prompt", which is the default to be used for
+ * all training snippets found in document.
+ */
.prompt
set-value:x:@.prompt
get-first-value
get-value:x:@.title
- get-value:x:@html2lambda/**/h1/[0,1]/*/\#text
- get-value:x:@html2lambda/**/h2/[0,1]/*/\#text
- get-value:x:@html2lambda/**/h3/[0,1]/*/\#text
- get-value:x:@html2lambda/**/h4/[0,1]/*/\#text
- get-value:x:@html2lambda/**/h5/[0,1]/*/\#text
- get-value:x:@html2lambda/**/h6/[0,1]/*/\#text
get-value:x:@.description
.:"Page"
- // Breaking page into sections.
+ /*
+ * Breaking page into sections.
+ *
+ * Here we break the page down into smaller training snippets, based upon
+ * lists found at the root level, and pre sections (code).
+ *
+ * This is done to avoid creating summaries of lists that typically might contain
+ * navbar parts and URLs, in addition to making sure we keep all code segments as is.
+ *
+ * The [.tmp-prompt] below is used as the prompt for individual OL, UL and PRE
+ * elements, while [.remaining] holds what remains of the page after we've removed
+ * all UL, OL and PRE elements.
+ */
.remaining
.tmp-prompt
set-value:x:@.tmp-prompt
get-value:x:@.prompt
+
+ /*
+ * For simplicity reasons we convert Markdown to HTML and then to lambda again,
+ * to allow us to semantically traverse UL elements, OL elements and PRE elements.
+ *
+ * This allows us to extract UL, OL, and PRE elements, and import these as individual
+ * training snippets.
+ */
+ markdown2html:x:@html2markdown
+ html2lambda:x:@markdown2html
+
+ // Iterating through all root nodes found as we converted Markdown back to lambda again.
for-each:x:@html2lambda/*
get-name:x:@.dp/#
@@ -166,30 +117,37 @@ slots.create:magic.ai.html.extract
case:h4
case:h5
case:h6
+
+ /*
+ * If the page contains an Hx element, we append it to our base prompt,
+ * to try to keep as much of the relevant information as possible in the
+ * prompt for our UL, OL and PRE training snippets.
+ */
set-value:x:@.tmp-prompt
- get-value:x:@.dp/#/*/\#text
+ strings.concat
+ get-value:x:@.prompt
+ .:" | "
+ get-value:x:@.dp/#/*/\#text
case:ul
case:ol
case:pre
/*
- * To avoid repeating navbars "everywhere" we remove these from Markdown and returns
- * these as individual training snippets. This will ensure navbars are only imported
- * once, as a separated training snippet.
+ * This is a bulleted list, ordered list, or a code segment.
+ *
+ * We keep it exactly as is, but return it as an individual training snippet,
+ * such that it stays isolated during import.
*/
lambda2html:x:@.dp/#
html2markdown:x:@lambda2html
url:x:@.arguments/*/url
- strings.concat
- get-value:x:@.title
- .:" | "
- get-value:x:@.tmp-prompt
unwrap:x:+/*/*/*
add:x:@.snippets
.
.
- prompt:x:@strings.concat
+ prompt:x:@.tmp-prompt
completion:x:@html2markdown
default
@@ -198,11 +156,18 @@ slots.create:magic.ai.html.extract
add:x:@.remaining
get-nodes:x:@.dp/#
+ /*
+ * Now we have removed all UL, OL and PRE elements, and [.remaining] contains
+ * a lambda node hierarchy of everything that remains in the page, at which
+ * point we can transform it back to HTML, then to Markdown, and return that
+ * as an individual snippet, without PRE, UL and OL elements.
+ */
lambda2html:x:@.remaining/*
set-value:x:@.markdown
html2markdown:x:@lambda2html
url:x:@.arguments/*/url
+ // Adding remaining Markdown as an individual training snippet.
.completion
set-value:x:@.completion
get-value:x:@.markdown
@@ -213,19 +178,14 @@ slots.create:magic.ai.html.extract
prompt:x:@.prompt
completion:x:@.completion
- // Returning snippets to caller if we could find anything.
- if
- exists:x:@.snippets/*
- .lambda
-
- // Returning snippets and meta information to caller.
- add:x:./*/return/*/snippets
- get-nodes:x:@.snippets/*
- add:x:./*/return/*/meta
- get-nodes:x:@.meta/*
- add:x:./*/return/*/urls
- get-nodes:x:@.urls/*
- return
- urls
- snippets
- meta
+ // Returning snippets and meta information to caller.
+ add:x:./*/return/*/snippets
+ get-nodes:x:@.snippets/*
+ add:x:./*/return/*/meta
+ get-nodes:x:@.meta/*
+ add:x:./*/return/*/urls
+ get-nodes:x:@.urls/*
+ return
+ urls
+ snippets
+ meta
diff --git a/backend/slots/Version.cs b/backend/slots/Version.cs
index 9b05da7ad2..3a0dbfc9fd 100644
--- a/backend/slots/Version.cs
+++ b/backend/slots/Version.cs
@@ -20,7 +20,7 @@ public class Version : ISlot
/// Parameters passed from signaler
public void Signal(ISignaler signaler, Node input)
{
- input.Value = "v17.3.3";
+ input.Value = "v17.3.4";
}
}
}