Skip to content

Commit

Permalink
Started working on scraping sites without sitemap
Browse files Browse the repository at this point in the history
  • Loading branch information
polterguy committed Oct 8, 2023
1 parent 2a9dacc commit 048d1ec
Show file tree
Hide file tree
Showing 4 changed files with 286 additions and 78 deletions.
295 changes: 218 additions & 77 deletions backend/files/system/openai/magic.startup/magic.ai.crawl-site.hl
Original file line number Diff line number Diff line change
Expand Up @@ -101,106 +101,247 @@ slots.create:magic.ai.crawl-site
unwrap:x:+/*
signal:magic.ai.load-sitemap
max:x:@.arguments/*/max
lambda2hyper:x:-
log.info:x:-

// Signaling frontend how many URLs we found, and how many there are in total.
strings.concat
.:"We found "
get-value:x:@signal/*/total
.:" URLs in sitemap(s)"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:info
sleep:100

// Checking if site contains more URLs than we're scraping.
get-count:x:@signal/*/urls/*
// Verifying we found at least one sitemap.
if
mt
get-value:x:@signal/*/total
get-value:x:@get-count
eq:x:@signal/*/has-sitemap
.:bool:true
.lambda

// Warning user!
/*
* We found at least one sitemap.
*
* Signaling frontend how many URLs we found, and how many there are in total.
*/
strings.concat
.:"Warning, site contains more than "
get-value:x:@get-count
.:" URLs and will only be partially scraped"
.:"We found "
get-value:x:@signal/*/total
.:" URLs in sitemap(s)"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:warning
type:info
sleep:100

// Iterating through each URL returned from above invocation.
for-each:x:@signal/*/urls/*
// Checking if site contains more URLs than we're scraping.
get-count:x:@signal/*/urls/*
if
mt
get-value:x:@signal/*/total
get-value:x:@get-count
.lambda

// Making sure we trap exceptions.
try
// Warning user!
strings.concat
.:"Warning, site contains more than "
get-value:x:@get-count
.:" URLs and will only be partially scraped"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:warning
sleep:100

// Scraping currently iterated URL.
unwrap:x:+/*
signal:magic.ai.url.scrape
url:x:@.dp/#
type:x:@.arguments/*/type
images:bool:true
code:bool:true
lists:bool:true
main:bool:true
empty-completion:bool:true
threshold:x:@.arguments/*/threshold

// Signaling frontend that we're waiting for n seconds.
strings.concat
.:"Waiting for "
math.divide:x:@.arguments/*/delay
.:int:1000
.:" seconds to avoid exhausting web server"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
// Iterating through each URL returned from above invocation.
for-each:x:@signal/*/urls/*

// Making sure we trap exceptions.
try

// Scraping currently iterated URL.
unwrap:x:+/*
signal:magic.ai.url.scrape
url:x:@.dp/#
type:x:@.arguments/*/type
images:bool:true
code:bool:true
lists:bool:true
main:bool:true
empty-completion:bool:true
threshold:x:@.arguments/*/threshold

// Signaling frontend that we're waiting for n seconds.
strings.concat
.:"Waiting for "
math.divide:x:@.arguments/*/delay
.:int:1000
.:" seconds to avoid exhausting web server"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:info
sleep:100

// Sleeping for [delay] milliseconds to avoid exhausting web server.
sleep:x:@.arguments/*/delay

.catch

// Logging as error.
log.error:Could not scrape URL
url:x:@.dp/#
message:x:@.arguments/*/message

/*
* Crawling is done.
* Making sure we notify client that we're done and do some logging.
*/
sockets.signal:magic.backend.message
roles:root
args
message:x:@strings.concat
type:info
message:Done creating OpenAI training data from URL
type:success
sleep:100

// Sleeping for [delay] milliseconds to avoid exhausting web server.
sleep:x:@.arguments/*/delay
// Basic logging.
log.info:OpenAI training data successfully created
url:x:@.arguments/*/url
type:x:@.arguments/*/type

.catch
// Checking if caller wants us to execute some lambda object once we're done.
if
exists:x:@.arguments/*/.onafter
.lambda
eval:x:@.arguments/*/.onafter

// Logging as error.
log.error:Could not scrape URL
url:x:@.dp/#
message:x:@.arguments/*/message
else

/*
* Crawling is done.
* Making sure we notify client that we're done and do some logging.
*/
sockets.signal:magic.backend.message
roles:root
args
message:Done creating OpenAI training data from URL
type:success
sleep:100
/*
* Site did not have a valid sitemap, hence we
* try to crawl it manually instead.
*
* This is the list of URLs we should scrape.
*/
.urls

// Basic logging.
log.info:OpenAI training data successfully created
url:x:@.arguments/*/url
type:x:@.arguments/*/type
// This is the list of URLs we already have scraped.
.done

// Checking if caller wants us to execute some lambda object once we're done.
if
exists:x:@.arguments/*/.onafter
.lambda
eval:x:@.arguments/*/.onafter
// Adding root URL to above list of URLs to be crawled.
unwrap:x:+/*/*
add:x:@.urls
.
.:x:@.arguments/*/url

// Informing frontend that we'll try to crawl the site.
sockets.signal:magic.backend.chatbot
roles:root
args
message:Trying to crawl site even though we did not find a valid sitemap
type:info
sleep:100

/*
* Looping through all above [.urls] as long as we don't exceed [max] argument,
* and for as long as we have URLs to scrape.
*/
while
and
exists:x:@.urls/*
lt
get-count:x:@.done/*
get-value:x:@.arguments/*/max
.lambda

/*
* Scraping first URL in above [.urls] informing slot that
* we want it to return URLs found during scraping.
*/
unwrap:x:+/*
signal:magic.ai.url.scrape
url:x:@.urls/0
type:x:@.arguments/*/type
images:bool:true
code:bool:true
lists:bool:true
main:bool:true
empty-completion:bool:true
threshold:x:@.arguments/*/threshold

/*
* Adding currently iterated URL to [.done] and removing it
* from above [.urls] collection.
*/
add:x:@.done
get-nodes:x:@.urls/0
remove-nodes:x:@.urls/0

/*
* Adding all URLs returned in above invocation to above [.urls] collection,
* unless we've already crawled the URL.
*/
for-each:x:@signal/*

// Verifying we have not already scraped the URL.
.exists:bool:false
for-each:x:@.done/*

// Checking if URL exists in [.done] collection.
if
eq:x:@.dp/#
get-value:x:@.dp/@.dp/#
.lambda

// URL has already been imported.
set-value:x:@.exists
.:bool:true

// Checking if URL has been imported before.
if
eq:x:@.exists
.:bool:false
.lambda

// Adding URL to [.urls] collection.
add:x:@.urls
get-nodes:x:@.dp/#

// Signaling frontend that we're waiting for n seconds.
strings.concat
.:"Waiting for "
math.divide:x:@.arguments/*/delay
.:int:1000
.:" seconds to avoid exhausting web server"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:info
sleep:100

// Sleeping for [delay] milliseconds to avoid exhausting web server.
sleep:x:@.arguments/*/delay

// Informing frontend that we're done crawling.
strings.concat
.:"Done scraping "
get-count:x:@.done/*
.:" URLs"
unwrap:x:+/**
sockets.signal:magic.backend.chatbot
roles:root
args
message:x:@strings.concat
type:info
sleep:100

// Basic logging.
log.info:OpenAI training data successfully created
url:x:@.arguments/*/url
type:x:@.arguments/*/type

// Checking if caller wants us to execute some lambda object once we're done.
if
exists:x:@.arguments/*/.onafter
.lambda
eval:x:@.arguments/*/.onafter

.catch

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,59 @@ slots.create:magic.ai.html.extract-snippets
// Buffer used for snippet to return.
.snippets

// Buffer used to hold all URLs found during scraping.
.urls

// Used to store a reference to above HTML transformed to lambda.
.document
set-value:x:@.document
reference:x:@html2lambda/*/html

/*
* Looping through entire document to find URLs in it such that we
* can return this to caller.
*/
for-each:x:@.document/#/**/a/*/\@href

// Sanity checking currently iterated URL.
if
and
not-null:x:@.dp/#
neq:x:@.dp/#
.:
not
strings.starts-with:x:@.dp/#
.:javascript
not
strings.starts-with:x:@.dp/#
.:void
not
exists:x:./*/rel/=nofollow
.lambda

// Removing hash tag parts, if existing.
strings.split:x:@.dp/#
.:#
unwrap:x:+/*
signal:magic.url.normalize
url:x:@strings.split/0
base:x:@.base
scheme:x:@.scheme

// Verifying this is a local URL.
if
strings.starts-with:x:@signal
get-value:x:@.base
.lambda

// This is a local URL, trimming trailing slash.
strings.trim-end:x:@signal
.:/
unwrap:x:+/*/*
add:x:@.urls
.
.:x:@strings.trim-end

/*
* Checking if caller wants images.
*
Expand Down Expand Up @@ -210,6 +258,9 @@ slots.create:magic.ai.html.extract-snippets
get-nodes:x:@.snippets/*
add:x:./*/return/*/meta
get-nodes:x:@.meta/*
add:x:./*/return/*/urls
get-nodes:x:@.urls/*
return
urls
snippets
meta
Loading

0 comments on commit 048d1ec

Please sign in to comment.