Skip to content

Commit

Permalink
Better crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
polterguy committed Oct 8, 2023
1 parent 7035c75 commit 0862d91
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -259,17 +259,17 @@ slots.create:magic.ai.html.extract-main
get-name:x:@.dp/#/.
.:a
exists:x:@.dp/#/./*/\@href
not-null:x:@.dp/#/./*/\@href
neq:x:@.dp/#/./*/\@href
not-null:x:@.dp/#/./*/\@href/[0,1]
neq:x:@.dp/#/./*/\@href/[0,1]
.:
not
strings.ends-with:x:@.dp/#/./*/\@href
strings.ends-with:x:@.dp/#/./*/\@href/[0,1]
.:#
not
strings.starts-with:x:@.dp/#/./*/\@href
strings.starts-with:x:@.dp/#/./*/\@href/[0,1]
.:javascript
not
strings.starts-with:x:@.dp/#/./*/\@href
strings.starts-with:x:@.dp/#/./*/\@href/[0,1]
.:void
.lambda

Expand All @@ -282,7 +282,7 @@ slots.create:magic.ai.html.extract-main
unwrap:x:+/*/*
set-value:x:@.url
signal:magic.url.normalize
url:x:@.dp/#/./*/\@href
url:x:@.dp/#/./*/\@href/[0,1]
base:x:@.arguments/*/base
scheme:x:@.arguments/*/scheme

Expand Down Expand Up @@ -464,17 +464,17 @@ slots.create:magic.ai.html.extract-main
get-name:x:@.dp/#/.
.:a
exists:x:@.dp/#/./*/\@href
not-null:x:@.dp/#/./*/\@href
neq:x:@.dp/#/./*/\@href
not-null:x:@.dp/#/./*/\@href/[0,1]
neq:x:@.dp/#/./*/\@href/[0,1]
.:
not
strings.ends-with:x:@.dp/#/./*/\@href
strings.ends-with:x:@.dp/#/./*/\@href/[0,1]
.:#
not
strings.starts-with:x:@.dp/#/./*/\@href
strings.starts-with:x:@.dp/#/./*/\@href/[0,1]
.:javascript
not
strings.starts-with:x:@.dp/#/./*/\@href
strings.starts-with:x:@.dp/#/./*/\@href/[0,1]
.:void
.lambda

Expand All @@ -487,7 +487,7 @@ slots.create:magic.ai.html.extract-main
unwrap:x:+/*/*
set-value:x:@.url
signal:magic.url.normalize
url:x:@.dp/#/./*/\@href
url:x:@.dp/#/./*/\@href/[0,1]
base:x:@.arguments/*/base
scheme:x:@.arguments/*/scheme

Expand Down Expand Up @@ -689,9 +689,9 @@ slots.create:magic.ai.html.extract-main
if
and
exists:x:@.dp/#/*/\#text
not-null:x:@.dp/#/*/\#text
not-null:x:@.dp/#/*/\#text/[0,1]
neq
strings.trim:x:@.dp/#/*/\#text
strings.trim:x:@.dp/#/*/\#text/[0,1]
.:"  \r\n\t"
.:
.lambda
Expand All @@ -712,7 +712,7 @@ slots.create:magic.ai.html.extract-main
set-value:x:@.tmp
strings.concat
get-value:x:@.tmp
get-value:x:@.dp/#/*/\#text
get-value:x:@.dp/#/*/\#text/[0,1]

// Making sure we track the fact that the currently iterated list item has valuable content.
set-value:x:@.has-content
Expand All @@ -725,31 +725,31 @@ slots.create:magic.ai.html.extract-main
if
and
exists:x:@.dp/#/*/\#text
not-null:x:@.dp/#/*/\#text
not-null:x:@.dp/#/*/\#text/[0,1]
neq
strings.trim:x:@.dp/#/*/\#text
strings.trim:x:@.dp/#/*/\#text/[0,1]
.:"  \r\n\t"
.:
exists:x:@.dp/#/*/\@href
not-null:x:@.dp/#/*/\@href
neq:x:@.dp/#/*/\@href
not-null:x:@.dp/#/*/\@href/[0,1]
neq:x:@.dp/#/*/\@href/[0,1]
.:
neq
strings.trim:x:@.dp/#/*/\@href
strings.trim:x:@.dp/#/*/\@href/[0,1]
.:"  \r\n\t"
.:
not
strings.starts-with:x:@.dp/#/*/\@href
strings.starts-with:x:@.dp/#/*/\@href/[0,1]
.:"javascript:"
not
strings.ends-with:x:@.dp/#/*/\@href
strings.ends-with:x:@.dp/#/*/\@href/[0,1]
.:#
.lambda

// Trimming text and removing CR/LF characters from it. Note that the set of trimmed characters includes a non-breaking space (U+00A0).
.anchor
set-value:x:@.anchor
strings.trim:x:@.dp/#/*/\#text
strings.trim:x:@.dp/#/*/\#text/[0,1]
.:"  \r\n\t"
set-value:x:@.anchor
strings.replace:x:@.anchor
Expand All @@ -775,7 +775,7 @@ slots.create:magic.ai.html.extract-main
// Concatenating hyperlink to [.tmp], making sure we first normalize URL.
unwrap:x:+/*
signal:magic.url.normalize
url:x:@.dp/#/*/\@href
url:x:@.dp/#/*/\@href/[0,1]
base:x:@.arguments/*/base
scheme:x:@.arguments/*/scheme
set-value:x:@.tmp
Expand Down
40 changes: 27 additions & 13 deletions backend/files/system/openai/magic.startup/magic.ai.load-sitemap.hl
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ slots.create:magic.ai.load-sitemap
log.error:Sitemap invocation did not return success status
url:x:@.dp/#
status:x:@http.get
throw:Could not retrieve sitemap, check log for details
throw:Could not retrieve sitemap

// Verifying request returned text/xml MIME type or application/xml MIME type.
if
Expand All @@ -122,15 +122,7 @@ slots.create:magic.ai.load-sitemap
.lambda

// Bogus sitemap file, not XML.
log.error:Sitemap was not valid XML
url:x:@.dp/#
Content-Type:x:@http.get/*/headers/*/Content-Type
content-type:x:@http.get/*/headers/*/content-type
sockets.signal:magic.backend.chatbot
roles:root
args
message:Sitemap was not XML
type:warning
throw:Sitemap was not XML

else

Expand Down Expand Up @@ -254,6 +246,18 @@ slots.create:magic.ai.load-sitemap
add:x:@.urls
get-nodes:x:@signal/*/urls/*

// If the above invocation returned no sitemap and we have zero URLs, we set [.has-sitemap] to false.
if
and
eq:x:@signal/*/has-sitemap
.:bool:false
eq
get-count:x:@.urls/*
.:int:0
.lambda
set-value:x:@.has-sitemap
.:bool:false

/*
* Iterating through each text URL referenced in main sitemap.
*
Expand All @@ -280,6 +284,18 @@ slots.create:magic.ai.load-sitemap
add:x:@.urls
get-nodes:x:@signal/*/urls/*

// If the above invocation returned no sitemap and we have zero URLs, we set [.has-sitemap] to false.
if
and
eq:x:@signal/*/has-sitemap
.:bool:false
eq
get-count:x:@.urls/*
.:int:0
.lambda
set-value:x:@.has-sitemap
.:bool:false

// Total URLs in sitemap(s).
.total
set-value:x:@.total
Expand Down Expand Up @@ -423,11 +439,9 @@ slots.create:magic.ai.load-sitemap
roles:root
args
message:x:@strings.concat
type:info
type:warning
sleep:100

// Returning "no URLs" to caller
return-nodes
has-sitemap:bool:false
total:int:0
urls

0 comments on commit 0862d91

Please sign in to comment.