From ed9ff89c96119149815d2371beb7ec489d9514b8 Mon Sep 17 00:00:00 2001
From: Thomas Hansen
Date: Fri, 9 Feb 2024 12:50:56 +0200
Subject: [PATCH] Getting ready for release

---
 backend/backend.csproj                        |   2 +-
 .../crawling/magic.ai.crawl-site-on-thread.hl | 459 ++++++++++++++++++
 .../crawling/magic.ai.crawl-site.hl           | 457 +----------------
 .../crawling/magic.ai.html.extract.hl         | 194 +++----
 backend/slots/Version.cs                      |   2 +-
 5 files changed, 542 insertions(+), 572 deletions(-)
 create mode 100644 backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl

diff --git a/backend/backend.csproj b/backend/backend.csproj
index 9c17a97491..5d3bec2f14 100644
--- a/backend/backend.csproj
+++ b/backend/backend.csproj
@@ -26,7 +26,7 @@
-
+

diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl
new file mode 100644
index 0000000000..4eba515511
--- /dev/null
+++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site-on-thread.hl
@@ -0,0 +1,459 @@
+
+/*
+ * Crawls the specified website generating training data for machine learning in the process.
+ */
+slots.create:magic.ai.crawl-site-on-thread
+
+   /*
+    * Loading robots.txt from specified [url].
+    */
+   unwrap:x:+/*
+   signal:magic.ai.load-robots
+      url:x:@.arguments/*/url
+      feedback-channel:x:@.arguments/*/feedback-channel
+
+   // Checking if site contains a robots.txt file.
+   if
+      eq:x:@signal/*/found
+         .:bool:true
+      .lambda
+
+         // Site contains a robots.txt file, signaling frontend of that fact.
+         sockets.signal:x:@.arguments/*/feedback-channel
+            args
+               message:Site has robots.txt
+               type:info
+         sleep:100
+
+         // Signaling frontend how many sitemaps we found in robots.txt file.
+         strings.concat
+            .:"Found "
+            get-count:x:@signal/*/sitemap/*
+            .:" sitemaps in robots.txt file"
+         unwrap:x:+/**
+         sockets.signal:x:@.arguments/*/feedback-channel
+            args
+               message:x:@strings.concat
+               type:info
+         sleep:100
+
+         // Checking if robots.txt contains a crawl-delay.
+         if
+            exists:x:@signal/*/crawl-delay
+            .lambda
+
+               // Updating delay to value from robots.txt.
+               remove-nodes:x:@.arguments/*/delay
+               unwrap:x:+/*
+               validators.default:x:@.arguments
+                  delay:x:@signal/*/crawl-delay
+
+
+               // Signaling frontend to inform of that we found a crawl-delay value.
+               strings.concat
+                  .:"Robots.txt file contains a Crawl-Delay value of "
+                  math.divide:x:@signal/*/crawl-delay
+                     .:int:1000
+                  .:" seconds"
+               unwrap:x:+/**
+               sockets.signal:x:@.arguments/*/feedback-channel
+                  args
+                     message:x:@strings.concat
+                     type:info
+
+   else
+
+      // Site does not contain a robots.txt file, signaling that fact to frontend.
+      sockets.signal:x:@.arguments/*/feedback-channel
+         args
+            message:Could not find a robots.txt file for website
+            type:warning
+      sleep:100
+      strings.concat
+         .:"We will try to retrieve sitemap from "
+         get-value:x:@signal/*/sitemap/0
+      unwrap:x:+/**
+      sockets.signal:x:@.arguments/*/feedback-channel
+         args
+            message:x:@strings.concat
+            type:info
+      sleep:100
+
+   /*
+    * Checking if we should filter according to URL, as
+    * in caller provided a sub-folder URL such as foo.com/bar, at which
+    * point we only import URLs below /bar hierarchy.
+    *
+    * Default value is false, implying robots.txt file is solely responsible
+    * for filtering.
+ */ + .filter-on-url:bool:false + strings.split:x:@.arguments/*/url + .:/ + if + mt + get-count:x:@strings.split/* + .:int:2 + .lambda + set-value:x:@.filter-on-url + .:bool:true + + /* + * Trying to load URLs from sitemap returned from above invocation. + */ + add:x:./*/signal/[1,2] + get-nodes:x:@signal/*/sitemap + get-nodes:x:@signal/*/allow + get-nodes:x:@signal/*/disallow + unwrap:x:+/* + signal:magic.ai.load-sitemap + max:x:@.arguments/*/max + feedback-channel:x:@.arguments/*/feedback-channel + url:x:@.arguments/*/url + filter-on-url:x:@.filter-on-url + + // Signaling user what we're about to do. + strings.concat + .:"Deleting old snippets for type '" + get-value:x:@.arguments/*/type + .:"' matching URL of " + get-value:x:@.arguments/*/url + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Deleting all old training snippets matching specified URL and type. + .uri + set-value:x:@.uri + strings.concat + get-value:x:@.arguments/*/url + .:% + data.connect:[generic|magic] + data.execute:@" +delete from vss_ml_training_snippets +where rowid in (select id as rowid from ml_training_snippets where type = @type and uri like @uri); +delete from ml_training_snippets where type = @type and uri like @uri;" + type:x:@.arguments/*/type + uri:x:@.uri + + // Verifying we found at least one sitemap. + if + eq:x:@signal/*/has-sitemap + .:bool:true + .lambda + + /* + * We found at least one sitemap. + * + * Signaling frontend how many URLs we found, and how many there are in total. + */ + get-count:x:@signal/*/urls/* + strings.concat + .:"We found " + get-value:x:@signal/*/total + .:" URLs in sitemap(s), we will be scraping " + get-value:x:@get-count + .:" URLs" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Checking if site contains more URLs than we're scraping. + if + eq:x:@get-count + .:int:0 + .lambda + + // Warning user! + strings.concat + .:"Warning, we could not find a single valid URL in site" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:warning + sleep:100 + else-if + mt + get-value:x:@signal/*/total + get-value:x:@get-count + .lambda + + // Warning user! + strings.concat + .:"Warning, site contains more than " + get-value:x:@get-count + .:" URLs and will only be partially scraped" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:warning + sleep:100 + + // Feedback about URLs we're about to scrape, but only if there are any URLs. + if + mt:x:@get-count + .:int:0 + .lambda + + // Adding spacer. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + sockets.signal:x:@.arguments/*/feedback-channel + args + message:"URLs we will scrape are as follows:" + type:info + sleep:100 + + // Iterating through each URL returned from above invocation. + for-each:x:@signal/*/urls/* + + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@.dp/# + type:info + sleep:10 + + // Iterating through each URL returned from above invocation. + for-each:x:@signal/*/urls/* + + // Making sure we trap exceptions. + try + + // Adding spacer. 
+ sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + + // Scraping currently iterated URL. + unwrap:x:+/* + signal:magic.ai.url.scrape + url:x:@.dp/# + type:x:@.arguments/*/type + threshold:x:@.arguments/*/threshold + feedback-channel:x:@.arguments/*/feedback-channel + + // Verifying we've got more snippets before applying Crawl-Delay + if + neq:x:@.dp/# + get-value:x:@signal/@signal/*/urls/0/- + .lambda + + // Signaling frontend that we're waiting for n seconds. + strings.concat + .:"Waiting for " + math.divide:x:@.arguments/*/delay + .:int:1000 + .:" seconds to avoid exhausting web server" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Sleeping for [delay] milliseconds to avoid exhausting web server. + sleep:x:@.arguments/*/delay + + .catch + + // Logging as error. + log.error:Could not scrape URL + url:x:@.dp/# + message:x:@.arguments/*/message + + // Signaling frontend to inform about error. + strings.concat + .:"Could not scrape URL, error was: '" + get-value:x:@.arguments/*/message + .:"'" + unwrap:x:+/** + sockets.signal:x:@.arguments/@.arguments/*/feedback-channel + roles:root + args + message:x:@strings.concat + type:warning + sleep:100 + + // Adding spacer. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + + /* + * Crawling is done. + * Making sure we notify client that we're done and do some logging. + */ + sockets.signal:magic.backend.message + roles:root + args + message:Done creating OpenAI training data from URL + type:success + sleep:100 + + // Basic logging. + log.info:OpenAI training data successfully created + url:x:@.arguments/*/url + type:x:@.arguments/*/type + + // Checking if caller wants us to execute some lambda object once we're done. + if + exists:x:@.arguments/*/.onafter + .lambda + eval:x:@.arguments/*/.onafter + + else + + /* + * Site did not have a valid sitemap, hence we + * try to crawl it manually instead. + * + * This is the list of URLs we should scrape. + */ + .urls + + // This is the list of URLs we already have scraped. + .done + + // Adding root URL to above list of URLs to be crawled. + unwrap:x:+/*/* + add:x:@.urls + . + .:x:@.arguments/*/url + + // No sitemap(s) found, informing user + sockets.signal:x:@.arguments/*/feedback-channel + args + message:Could not find any valid sitemaps + type:warning + sleep:100 + + // Informing frontend of that we'll try to crawl site. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:Trying to crawl site even though we did not find a valid sitemap + type:info + sleep:100 + + /* + * Looping through all above [.urls] as long as we don't exceed [max] argument, + * and for as long as we have URLs to scrape. + */ + while + and + exists:x:@.urls/* + lt + get-count:x:@.done/* + get-value:x:@.arguments/*/max + .lambda + + // Adding spacer. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + + /* + * Scraping first URL in above [.urls] informing slot that + * we want it to return URLs found during scraping. 
+ */ + unwrap:x:+/* + signal:magic.ai.url.scrape + url:x:@.urls/0 + type:x:@.arguments/*/type + images:bool:true + code:bool:true + lists:bool:true + main:bool:true + empty-completion:bool:false + threshold:x:@.arguments/*/threshold + feedback-channel:x:@.arguments/*/feedback-channel + + /* + * Adding currently iterated URL to [.done] and removing it + * from above [.urls] collection. + */ + add:x:@.done + get-nodes:x:@.urls/0 + remove-nodes:x:@.urls/0 + + /* + * Adding all URLs returned in above invocation to above [.urls] collection, + * unless we've already crawled the URL. + */ + for-each:x:@signal/* + + // Checking if URL has been imported or added before, and that it matches base URL provided by caller. + if + and + not-exists:x:@.done/*/={@.dp/#} + not-exists:x:@.urls/*/={@.dp/#} + strings.starts-with:x:@.dp/# + get-value:x:@.arguments/*/url + .lambda + + // Adding URL to [.urls] collection. + add:x:@.urls + get-nodes:x:@.dp/# + + // Signaling frontend that we're waiting for n seconds. + strings.concat + .:"Waiting for " + math.divide:x:@.arguments/*/delay + .:int:1000 + .:" seconds to avoid exhausting web server" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Sleeping for [delay] milliseconds to avoid exhausting web server. + sleep:x:@.arguments/*/delay + + // Adding spacer. + sockets.signal:x:@.arguments/*/feedback-channel + args + message:------------------------------------------------------------------------------------------------------------------------ + type:info + sleep:100 + + // Informing frontend of that we're done crawling. + strings.concat + .:"Done scraping " + get-count:x:@.done/* + .:" URLs" + unwrap:x:+/** + sockets.signal:x:@.arguments/*/feedback-channel + args + message:x:@strings.concat + type:info + sleep:100 + + // Basic logging. + log.info:OpenAI training data successfully created + url:x:@.arguments/*/url + type:x:@.arguments/*/type + + // Checking if caller wants us to execute some lambda object once we're done. + if + exists:x:@.arguments/*/.onafter + .lambda + eval:x:@.arguments/*/.onafter diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl index 6533d40892..9f06629230 100644 --- a/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl +++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.crawl-site.hl @@ -12,459 +12,10 @@ slots.create:magic.ai.crawl-site // Making sure exceptions does not leave thread. try - /* - * Loading robots.txt from specified [url]. - */ - unwrap:x:+/* - signal:magic.ai.load-robots - url:x:@.arguments/*/url - feedback-channel:x:@.arguments/*/feedback-channel - - // Checking if site contains a robots.txt file. - if - eq:x:@signal/*/found - .:bool:true - .lambda - - // Site contains a robots.txt file, signaling frontend of that fact. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:Site has robots.txt - type:info - sleep:100 - - // Signaling frontend how many sitemaps we found in robots.txt file. - strings.concat - .:"Found " - get-count:x:@signal/*/sitemap/* - .:" sitemaps in robots.txt file" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Checking if robots.txt contains a crawl-delay. - if - exists:x:@signal/*/crawl-delay - .lambda - - // Updating delay to value from robots.txt. 
- remove-nodes:x:@.arguments/*/delay - unwrap:x:+/* - validators.default:x:@.arguments - delay:x:@signal/*/crawl-delay - - - // Signaling frontend to inform of that we found a crawl-delay value. - strings.concat - .:"Robots.txt file contains a Crawl-Delay value of " - math.divide:x:@signal/*/crawl-delay - .:int:1000 - .:" seconds" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - - else - - // Site does not contain a robots.txt file, signaling that fact to frontend. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:Could not find a robots.txt file for website - type:warning - sleep:100 - strings.concat - .:"We will try to retrieve sitemap from " - get-value:x:@signal/*/sitemap/0 - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - /* - * Checking if we should filter according to URL, as - * in caller provided a sub-folder URL such as foo.com/bar, at which - * point we only import URLs below /bar hierarchy. - * - * Default value is false, implying robots.txt file is solely responsible - * for filtering. - */ - .filter-on-url:bool:false - strings.split:x:@.arguments/*/url - .:/ - if - mt - get-count:x:@strings.split/* - .:int:2 - .lambda - set-value:x:@.filter-on-url - .:bool:true - - /* - * Trying to load URLs from sitemap returned from above invocation. - */ - add:x:./*/signal/[1,2] - get-nodes:x:@signal/*/sitemap - get-nodes:x:@signal/*/allow - get-nodes:x:@signal/*/disallow - unwrap:x:+/* - signal:magic.ai.load-sitemap - max:x:@.arguments/*/max - feedback-channel:x:@.arguments/*/feedback-channel - url:x:@.arguments/*/url - filter-on-url:x:@.filter-on-url - - // Signaling user what we're about to do. - strings.concat - .:"Deleting old snippets for type '" - get-value:x:@.arguments/*/type - .:"' matching URL of " - get-value:x:@.arguments/*/url - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Deleting all old training snippets matching specified URL and type. - .uri - set-value:x:@.uri - strings.concat - get-value:x:@.arguments/*/url - .:% - data.connect:[generic|magic] - data.execute:@" -delete from vss_ml_training_snippets - where rowid in (select id as rowid from ml_training_snippets where type = @type and uri like @uri); -delete from ml_training_snippets where type = @type and uri like @uri;" - type:x:@.arguments/*/type - uri:x:@.uri - - // Verifying we found at least one sitemap. - if - eq:x:@signal/*/has-sitemap - .:bool:true - .lambda - - /* - * We found at least one sitemap. - * - * Signaling frontend how many URLs we found, and how many there are in total. - */ - get-count:x:@signal/*/urls/* - strings.concat - .:"We found " - get-value:x:@signal/*/total - .:" URLs in sitemap(s), we will be scraping " - get-value:x:@get-count - .:" URLs" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Checking if site contains more URLs than we're scraping. - if - eq:x:@get-count - .:int:0 - .lambda - - // Warning user! 
- strings.concat - .:"Warning, we could not find a single valid URL in site, probably because sitemap or robots.txt file prohibits scraping, or because your filter is entirely excluded from robots.txt for AINIRO selector" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:warning - sleep:100 - else-if - mt - get-value:x:@signal/*/total - get-value:x:@get-count - .lambda - - // Warning user! - strings.concat - .:"Warning, site contains more than " - get-value:x:@get-count - .:" URLs and will only be partially scraped" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:warning - sleep:100 - - // Feedback about URLs we're about to scrape, but only if there are any URLs. - if - mt:x:@get-count - .:int:0 - .lambda - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - sockets.signal:x:@.arguments/*/feedback-channel - args - message:"URLs we will scrape are as follows:" - type:info - sleep:100 - - // Iterating through each URL returned from above invocation. - for-each:x:@signal/*/urls/* - - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@.dp/# - type:info - sleep:100 - - // Iterating through each URL returned from above invocation. - for-each:x:@signal/*/urls/* - - // Making sure we trap exceptions. - try - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - - // Scraping currently iterated URL. - unwrap:x:+/* - signal:magic.ai.url.scrape - url:x:@.dp/# - type:x:@.arguments/*/type - threshold:x:@.arguments/*/threshold - feedback-channel:x:@.arguments/*/feedback-channel - - // Verifying we've got more snippets before applying Crawl-Delay - if - neq:x:@.dp/# - get-value:x:@signal/@signal/*/urls/0/- - .lambda - - // Signaling frontend that we're waiting for n seconds. - strings.concat - .:"Waiting for " - math.divide:x:@.arguments/*/delay - .:int:1000 - .:" seconds to avoid exhausting web server" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Sleeping for [delay] milliseconds to avoid exhausting web server. - sleep:x:@.arguments/*/delay - - .catch - - // Logging as error. - log.error:Could not scrape URL - url:x:@.dp/# - message:x:@.arguments/*/message - - // Signaling frontend to inform about error. - strings.concat - .:"Could not scrape URL, error was: '" - get-value:x:@.arguments/*/message - .:"'" - unwrap:x:+/** - sockets.signal:x:@.arguments/@.arguments/*/feedback-channel - roles:root - args - message:x:@strings.concat - type:warning - sleep:100 - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - - /* - * Crawling is done. - * Making sure we notify client that we're done and do some logging. - */ - sockets.signal:magic.backend.message - roles:root - args - message:Done creating OpenAI training data from URL - type:success - sleep:100 - - // Basic logging. 
- log.info:OpenAI training data successfully created - url:x:@.arguments/*/url - type:x:@.arguments/*/type - - // Checking if caller wants us to execute some lambda object once we're done. - if - exists:x:@.arguments/*/.onafter - .lambda - eval:x:@.arguments/*/.onafter - - else - - /* - * Site did not have a valid sitemap, hence we - * try to crawl it manually instead. - * - * This is the list of URLs we should scrape. - */ - .urls - - // This is the list of URLs we already have scraped. - .done - - // Adding root URL to above list of URLs to be crawled. - unwrap:x:+/*/* - add:x:@.urls - . - .:x:@.arguments/*/url - - // No sitemap(s) found, informing user - sockets.signal:x:@.arguments/*/feedback-channel - args - message:Could not find any valid sitemaps - type:warning - sleep:100 - - // Informing frontend of that we'll try to crawl site. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:Trying to crawl site even though we did not find a valid sitemap - type:info - sleep:100 - - /* - * Looping through all above [.urls] as long as we don't exceed [max] argument, - * and for as long as we have URLs to scrape. - */ - while - and - exists:x:@.urls/* - lt - get-count:x:@.done/* - get-value:x:@.arguments/*/max - .lambda - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - - /* - * Scraping first URL in above [.urls] informing slot that - * we want it to return URLs found during scraping. - */ - unwrap:x:+/* - signal:magic.ai.url.scrape - url:x:@.urls/0 - type:x:@.arguments/*/type - images:bool:true - code:bool:true - lists:bool:true - main:bool:true - empty-completion:bool:false - threshold:x:@.arguments/*/threshold - feedback-channel:x:@.arguments/*/feedback-channel - - /* - * Adding currently iterated URL to [.done] and removing it - * from above [.urls] collection. - */ - add:x:@.done - get-nodes:x:@.urls/0 - remove-nodes:x:@.urls/0 - - /* - * Adding all URLs returned in above invocation to above [.urls] collection, - * unless we've already crawled the URL. - */ - for-each:x:@signal/* - - // Checking if URL has been imported or added before, and that it matches base URL provided by caller. - if - and - not-exists:x:@.done/*/={@.dp/#} - not-exists:x:@.urls/*/={@.dp/#} - strings.starts-with:x:@.dp/# - get-value:x:@.arguments/*/url - .lambda - - // Adding URL to [.urls] collection. - add:x:@.urls - get-nodes:x:@.dp/# - - // Signaling frontend that we're waiting for n seconds. - strings.concat - .:"Waiting for " - math.divide:x:@.arguments/*/delay - .:int:1000 - .:" seconds to avoid exhausting web server" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Sleeping for [delay] milliseconds to avoid exhausting web server. - sleep:x:@.arguments/*/delay - - // Adding spacer. - sockets.signal:x:@.arguments/*/feedback-channel - args - message:------------------------------------------------------------------------------------------------------------------------ - type:info - sleep:100 - - // Informing frontend of that we're done crawling. - strings.concat - .:"Done scraping " - get-count:x:@.done/* - .:" URLs" - unwrap:x:+/** - sockets.signal:x:@.arguments/*/feedback-channel - args - message:x:@strings.concat - type:info - sleep:100 - - // Basic logging. 
- log.info:OpenAI training data successfully created - url:x:@.arguments/*/url - type:x:@.arguments/*/type - - // Checking if caller wants us to execute some lambda object once we're done. - if - exists:x:@.arguments/*/.onafter - .lambda - eval:x:@.arguments/*/.onafter + // Invoking slot responsible for doing the actual crawling. + add:x:./*/signal + get-nodes:x:@.arguments/* + signal:magic.ai.crawl-site-on-thread .catch diff --git a/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl b/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl index 0a5991e2eb..c2097ce6ee 100644 --- a/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl +++ b/backend/files/system/openai/magic.startup/crawling/magic.ai.html.extract.hl @@ -36,14 +36,12 @@ slots.create:magic.ai.html.extract // Converting HTML to Markdown. html2markdown:x:@.arguments/*/html url:x:@.arguments/*/url - set-value:x:@.markdown - get-value:x:@html2markdown - // Checking if site is SPA, at which point we return early. + // Checking if we have any Markdown at all, and if not we return early. if or - null:x:@.markdown - eq:x:@.markdown + null:x:@html2markdown + eq:x:@html2markdown .: .lambda @@ -54,107 +52,60 @@ slots.create:magic.ai.html.extract meta main:int:0 - // Converting raw HTML to lambda to allow us to extract title, description, hyperlinks, etc. - .html-lambda - add:x:@.html-lambda - html2lambda:x:@.arguments/*/html - - /* - * Finding URLs from document. - * - * Notice, for simplicity reasons we do this by round tripping through HTML, - * for then to convert HTML to lambda, and iterate through each anchor HTML - * element in lambda. - * - * This is not optimal, and could be optimised, but it keeps the code - * DRY at least, since our [html2markdown] slot at this point have resolved - * our relative URLs ... - */ - markdown2html:x:@html2markdown - html2lambda:x:@markdown2html - for-each:x:@html2lambda/**/a/*/\@href - - /* - * Notice, the URLs we return are for the scraper to crawl and scrape, - * so we don't return mailto or tel URLs here, and we only return URLs - * from the same domain. - */ - if - and - not - strings.starts-with:x:@.dp/# - .:"mailto:" - not - strings.starts-with:x:@.dp/# - .:"tel:" - .lambda - - // Removing '#'. - .url - set-value:x:@.url - get-value:x:@.dp/# - strings.split:x:@.url - .:# - set-value:x:@.url - get-value:x:@strings.split/0 - - // Valid URL, now checking if it's the same domain. - .local - strings.split:x:@.arguments/*/url - .:"/" - set-value:x:@.local - get-value:x:@strings.split/1 - .current - strings.split:x:@.url - .:"/" - set-value:x:@.current - get-value:x:@strings.split/1 - if - eq:x:@.local - get-value:x:@.current - .lambda - - // Local URL, now making sure it's not the same URL. - strings.trim-end:x:@.url - .:/ - strings.trim-end:x:@.arguments/*/url - .:/ - if - neq:x:@strings.trim-end - get-value:x:@strings.trim-end/@strings.trim-end - .lambda - - // Not the same URLas the one we'recurrently scraping. - unwrap:x:+/*/* - add:x:@.urls - . - .:x:@.url - - // Setting title and description from document. + // Retrieving title and description from document. set-value:x:@.title - get-value:x:@.html-lambda/**/head/**/title/*/\#text + get-value:x:@html2markdown/*/title set-value:x:@.description - get-value:x:@.html-lambda/**/head/**/meta/*/\@name/=description/./*/\@content + get-value:x:@html2markdown/*/description + + // Adding URLs found as we transformed HTML to Markdown. 
+   add:x:@.urls
+      get-nodes:x:@html2markdown/*/urls/*

-   // Creating our prompt.
+   // Storing Markdown in above buffer node.
+   set-value:x:@.markdown
+      get-value:x:@html2markdown
+
+   /*
+    * Creating our "base prompt", which is the default to be used for
+    * all training snippets found in document.
+    */
    .prompt
    set-value:x:@.prompt
       get-first-value
          get-value:x:@.title
-         get-value:x:@html2lambda/**/h1/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h2/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h3/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h4/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h5/[0,1]/*/\#text
-         get-value:x:@html2lambda/**/h6/[0,1]/*/\#text
          get-value:x:@.description
          .:"Page"

-   // Breaking page into sections.
+   /*
+    * Breaking page into sections.
+    *
+    * Here we are breaking the page down into smaller training snippets, based upon
+    * lists found at root, and pre sections (code).
+    *
+    * This is done to avoid creating summaries of lists that typically might contain
+    * navbar parts and URLs, in addition to making sure we keep all code segments as is.
+    *
+    * The [.tmp-prompt] below is being used as the prompt for individual OL, UL and PRE
+    * elements, while the [.remaining] part is the remaining Markdown after we've removed
+    * all UL, OL and PRE elements.
+    */
    .remaining
    .tmp-prompt
    set-value:x:@.tmp-prompt
       get-value:x:@.prompt
+
+   /*
+    * For simplicity reasons we convert Markdown to HTML and then to lambda again,
+    * to allow us to semantically traverse UL elements, OL elements and PRE elements.
+    *
+    * This allows us to extract UL, OL, and PRE elements, and import these as individual
+    * training snippets.
+    */
+   markdown2html:x:@html2markdown
+   html2lambda:x:@markdown2html
+
+   // Iterating through all root nodes found as we converted Markdown back to lambda again.
    for-each:x:@html2lambda/*

       get-name:x:@.dp/#
       switch:x:@get-name

          case:h1
          case:h2
          case:h3
          case:h4
          case:h5
          case:h6
+
+            /*
+             * If page contains Hx element, we append it to our base
+             * prompt, to try to keep as much of the (relevant) information as
+             * possible in our prompt for our UL, OL and PRE training snippets.
+             */
             set-value:x:@.tmp-prompt
-               get-value:x:@.dp/#/*/\#text
+               strings.concat
+                  get-value:x:@.prompt
+                  .:" | "
+                  get-value:x:@.dp/#/*/\#text

          case:ul
          case:ol
          case:pre

             /*
-             * To avoid repeating navbars "everywhere" we remove these from Markdown and returns
-             * these as individual training snippets. This will ensure navbars are only imported
-             * once, as a separated training snippet.
+             * This is a bulleted list, ordered list, or a code segment.
+             *
+             * We keep it exactly as is, but return it as an individual training snippet,
+             * such that it becomes an isolated training snippet during
+             * import.
              */
             lambda2html:x:@.dp/#
             html2markdown:x:@lambda2html
                url:x:@.arguments/*/url
-            strings.concat
-               get-value:x:@.title
-               .:" | "
-               get-value:x:@.tmp-prompt
             unwrap:x:+/*/*/*
             add:x:@.snippets
                .
                   .
-                     prompt:x:@strings.concat
+                     prompt:x:@.tmp-prompt
                      completion:x:@html2markdown

          default

             add:x:@.remaining
                get-nodes:x:@.dp/#

+   /*
+    * Now we have removed all UL, OL and PRE elements, and [.remaining] contains
+    * a lambda node hierarchy of everything that remains in the page, at which
+    * point we can transform it back to HTML for then to transform it to Markdown,
+    * and return that as an individual snippet, without PRE, UL and OL elements.
+    */
+   lambda2html:x:@.remaining/*
    set-value:x:@.markdown
       html2markdown:x:@lambda2html
          url:x:@.arguments/*/url

+   // Adding remaining HTML as an individual training snippet.
    .completion
    set-value:x:@.completion
       get-value:x:@.markdown
    unwrap:x:+/*/*/*
    add:x:@.snippets
       .
          .
             prompt:x:@.prompt
             completion:x:@.completion

-   // Returning snippets to caller if we could find anything.
-   if
-      exists:x:@.snippets/*
-      .lambda
-
-      // Returning snippets and meta information to caller.
-      add:x:./*/return/*/snippets
-         get-nodes:x:@.snippets/*
-      add:x:./*/return/*/meta
-         get-nodes:x:@.meta/*
-      add:x:./*/return/*/urls
-         get-nodes:x:@.urls/*
-      return
-         urls
-         snippets
-         meta
+   // Returning snippets and meta information to caller.
+   add:x:./*/return/*/snippets
+      get-nodes:x:@.snippets/*
+   add:x:./*/return/*/meta
+      get-nodes:x:@.meta/*
+   add:x:./*/return/*/urls
+      get-nodes:x:@.urls/*
+   return
+      urls
+      snippets
+      meta

diff --git a/backend/slots/Version.cs b/backend/slots/Version.cs
index 9b05da7ad2..3a0dbfc9fd 100644
--- a/backend/slots/Version.cs
+++ b/backend/slots/Version.cs
@@ -20,7 +20,7 @@ public class Version : ISlot
         /// Parameters passed from signaler
         public void Signal(ISignaler signaler, Node input)
         {
-            input.Value = "v17.3.3";
+            input.Value = "v17.3.4";
         }
     }
 }
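Note on usage: the refactor above turns [magic.ai.crawl-site] into a thin wrapper that simply forwards its arguments to the new [magic.ai.crawl-site-on-thread] slot. A minimal sketch of how the wrapper slot might be invoked is shown below; the slot and argument names ([url], [type], [delay], [max], [threshold] and [feedback-channel]) are taken from the diff, while the concrete values are illustrative assumptions only. [delay] is given in milliseconds, matching the division by 1000 the code performs when reporting it as seconds.

/*
 * Illustrative invocation only -- argument values below are assumptions, not part of the patch.
 */
signal:magic.ai.crawl-site
   url:"https://example.com"
   type:example-type
   delay:int:2000
   max:int:25
   threshold:int:150
   feedback-channel:example-channel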