Skip to content

Commit

Permalink
Allow crawling of sub-sections of websites
Browse files Browse the repository at this point in the history
  • Loading branch information
polterguy committed Nov 10, 2023
1 parent 12d852c commit 5afe765
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ slots.create:magic.ai.crawl-site
signal:magic.ai.load-sitemap
max:x:@.arguments/*/max
feedback-channel:x:@.arguments/*/feedback-channel
url:x:@.arguments/*/url

// Verifying we found at least one sitemap.
if
Expand Down Expand Up @@ -126,6 +127,20 @@ slots.create:magic.ai.crawl-site

// Checking if site contains more URLs than we're scraping.
if
eq:x:@get-count
.:int:0
.lambda

// Warning user!
strings.concat
.:"Warning, we could not find a single valid URL in site, probably because sitemap or robots.txt file prohibits scraping, or because your filter is entirely excluded from robots.txt for AINIRO selector"
unwrap:x:+/**
sockets.signal:x:@.arguments/*/feedback-channel
args
message:x:@strings.concat
type:warning
sleep:100
else-if
mt
get-value:x:@signal/*/total
get-value:x:@get-count
Expand All @@ -143,19 +158,23 @@ slots.create:magic.ai.crawl-site
type:warning
sleep:100

// Adding spacer.
sockets.signal:x:@.arguments/*/feedback-channel
args
message:------------------------------------------------------------------------------------------------------------------------
type:info
sleep:100
// Feedback about URLs we're about to scrape, but only if there are any URLs.
if
mt:x:@get-count
.:int:0
.lambda

// Feedback
sockets.signal:x:@.arguments/*/feedback-channel
args
message:"URLs we will scrape are as follows:"
type:info
sleep:100
// Adding spacer.
sockets.signal:x:@.arguments/*/feedback-channel
args
message:------------------------------------------------------------------------------------------------------------------------
type:info
sleep:100
sockets.signal:x:@.arguments/*/feedback-channel
args
message:"URLs we will scrape are as follows:"
type:info
sleep:100

// Iterating through each URL returned from above invocation.
for-each:x:@signal/*/urls/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,22 @@ slots.create:magic.ai.load-sitemap
set-value:x:@.allowed
.:bool:true

// Verifying URL starts with base URL.
if
and
exists:x:@.arguments/*/url
not-null:x:@.arguments/*/url
neq:x:@.arguments/*/url
.:
not
strings.starts-with:x:@sort/0
get-value:x:@.arguments/*/url
.lambda

// URL does not match base URL of scraping invocation.
set-value:x:@.allowed
.:bool:false

// Verifying URL is allowed.
if
eq:x:@.allowed
Expand Down

0 comments on commit 5afe765

Please sign in to comment.