feat(page): add nextjs build ssg path handling
j-mendez committed Nov 4, 2024
1 parent f088564 commit bc68858
Showing 9 changed files with 170 additions and 49 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

10 changes: 5 additions & 5 deletions spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
@@ -22,7 +22,7 @@ tokio-stream = "0.1"
hashbrown = { version = "0.15", default-features = true }
log = "0.4"
percent-encoding = "2"
regex = { version = "1", optional = true }
regex = { version = "1" }
ua_generator = { version = "^0.5", optional = true }
string_concat = "0.0.1"
lazy_static = "1"
@@ -113,8 +113,8 @@ features = [

[features]
default = ["sync", "reqwest_native_tls_native_roots", "cookies", "ua_generator", "encoding", "string_interner_buffer_backend"]
regex = ["dep:regex"]
glob = ["dep:regex", "dep:itertools"]
regex = []
glob = [ "dep:itertools"]
ua_generator = ["dep:ua_generator"]
jemalloc = ["tikv-jemallocator"]
decentralized = ["serde", "flexbuffers"]
@@ -143,7 +143,7 @@ chrome_intercept = ["chrome"]
chrome_headless_new = ["chrome"]
cookies = ["reqwest/cookies"]
cron = ["dep:async_job", "dep:chrono", "dep:cron", "dep:async-trait"]
smart = ["chrome", "dep:regex", "dep:rand", "chrome_intercept"]
smart = ["chrome", "dep:rand", "chrome_intercept"]
encoding = []
headers = ["dep:httpdate"]
real_browser = ["dep:statrs", "dep:rand"]
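Aside (not part of the commit): the feature changes above make regex a hard dependency rather than an optional one, because the new SSG-manifest parsing in page.rs compiles a regex::bytes pattern unconditionally; the regex and glob feature names are kept (now without dep:regex) so existing --features invocations still resolve. A minimal sketch of what that means for gating, using an illustrative helper name that does not exist in the crate:

use regex::Regex;

// Before this commit a helper like this would have needed `#[cfg(feature = "regex")]`
// because the dependency was optional; now it compiles unconditionally.
fn url_matches(pattern: &Regex, url: &str) -> bool {
    pattern.is_match(url)
}

fn main() {
    let pattern = Regex::new(r"^/blog/").unwrap();
    assert!(url_matches(&pattern, "/blog/post-1"));
    assert!(!url_matches(&pattern, "/about"));
}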
138 changes: 132 additions & 6 deletions spider/src/page.rs
@@ -12,6 +12,7 @@ use crate::Client;
use crate::RelativeSelectors;
use bytes::Bytes;
use hashbrown::HashSet;
use regex::bytes::Regex;
use reqwest::StatusCode;
use tokio::time::Duration;

@@ -26,6 +27,7 @@ use url::Url;
lazy_static! {
/// Wildcard match all domains.
static ref CASELESS_WILD_CARD: CaseInsensitiveString = CaseInsensitiveString::new("*");
static ref SSG_CAPTURE: Regex = Regex::new(r#""(.*?)""#).unwrap();
}

#[cfg(any(feature = "smart", feature = "chrome_intercept"))]
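Aside (not part of the commit): the SSG_CAPTURE pattern added above extracts quoted strings from a Next.js _ssgManifest.js payload; page.rs then decodes them and replaces the \u002F escapes with "/". A standalone sketch of that extraction against a made-up manifest body:

use regex::bytes::Regex;

fn main() {
    // Same pattern as SSG_CAPTURE: non-greedy capture of anything between double quotes.
    let ssg_capture = Regex::new(r#""(.*?)""#).unwrap();

    // Hypothetical manifest body; real manifests list the statically generated routes.
    let manifest: &[u8] =
        br#"self.__SSG_MANIFEST=new Set(["\u002Fabout","\u002Fblog\u002F[slug]"]);"#;

    for cap in ssg_capture.captures_iter(manifest) {
        if let Some(m) = cap.get(1) {
            // page.rs uses auto_encoder::auto_encode_bytes here; lossy UTF-8 is enough for a sketch.
            let href = String::from_utf8_lossy(m.as_bytes()).replace(r"\u002F", "/");
            println!("{href}"); // "/about", then "/blog/[slug]"
        }
    }
}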
@@ -967,20 +969,16 @@ impl Page {
} else {
let html = Box::new(Html::parse_fragment(html));
let mut stream = tokio_stream::iter(html.tree);

// the original url
let parent_host = &selectors.1[0];
// the host schemes
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2; // the domain after redirects
// the base matcher to
let sub_matcher = &selectors.0;

while let Some(node) = stream.next().await {
if let Some(element) = node.as_element() {
let element_name = element.name();

if element_name == "a" {
if element.name() == "a" {
if let Some(href) = element.attr("href") {
self.push_link(
href,
@@ -1002,7 +1000,119 @@

/// Find the links as a stream using string resource validation
#[inline(always)]
#[cfg(all(not(feature = "decentralized"), not(feature = "full_resources"),))]
pub async fn links_stream_base_ssg<A: PartialEq + Eq + std::hash::Hash + From<String>>(
&self,
selectors: &RelativeSelectors,
html: &str,
client: &Client,
) -> HashSet<A> {
use auto_encoder::auto_encode_bytes;

let mut map = HashSet::new();

if !html.is_empty() {
if html.starts_with("<?xml") {
self.links_stream_xml_links_stream_base(selectors, html, &mut map)
.await;
} else {
let html = Box::new(crate::packages::scraper::Html::parse_fragment(html));
let mut stream = tokio_stream::iter(html.tree);

// the original url
let parent_host = &selectors.1[0];
// the host schemes
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2; // the domain after redirects
let sub_matcher = &selectors.0;

let mut build_ssg_path = None;

while let Some(node) = stream.next().await {
if let Some(element) = node.as_element() {
match element.name() {
"a" => {
if let Some(href) = element.attr("href") {
self.push_link(
href,
&mut map,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
);
}
}
"script" if build_ssg_path.is_none() => {
if let Some(source) = element.attr("src") {
if source.starts_with("/_next/static/")
&& source.ends_with("/_ssgManifest.js")
{
build_ssg_path = Some(self.abs_path(source));
}
}
}
_ => (),
}
}
}

if let Some(build_ssg_path) = build_ssg_path {
if let Some(s) = build_ssg_path {
let build_page = Page::new_page(s.as_str(), &client).await;

for cap in SSG_CAPTURE.captures_iter(build_page.get_html_bytes_u8()) {
if let Some(matched) = cap.get(1) {
let href = auto_encode_bytes(&matched.as_bytes())
.replace(r#"\u002F"#, "/");

fn get_last_segment(path: &str) -> &str {
if let Some(pos) = path.rfind('/') {
&path[pos + 1..]
} else {
path
}
}

let last_segment = get_last_segment(&href);

// we can pass in a static map of the dynamic SSG routes pre-hand, custom API endpoint to seed, or etc later.
if !(last_segment.starts_with("[") && last_segment.ends_with("]")) {
self.push_link(
&href,
&mut map,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
);
}
}
}
}
}
}
}
map
}
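Aside (not part of the commit): the last-segment check above is what keeps dynamic route templates such as /blog/[slug] out of the crawl queue, since those manifest entries are placeholders rather than fetchable URLs. A standalone sketch of that filter (the function name is illustrative, not exported by the crate):

/// True when the final path segment is a Next.js dynamic placeholder like [slug] or [...path].
fn is_dynamic_ssg_route(path: &str) -> bool {
    let last_segment = path.rsplit('/').next().unwrap_or(path);
    last_segment.starts_with('[') && last_segment.ends_with(']')
}

fn main() {
    assert!(!is_dynamic_ssg_route("/about"));
    assert!(!is_dynamic_ssg_route("/blog/launch-post"));
    assert!(is_dynamic_ssg_route("/blog/[slug]"));
    assert!(is_dynamic_ssg_route("/docs/[...path]"));
}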

/// Find the links as a stream using string resource validation and parsing the script for nextjs initial SSG paths.
pub async fn links_stream_ssg<A: PartialEq + Eq + std::hash::Hash + From<String>>(
&self,
selectors: &RelativeSelectors,
client: &Client,
) -> HashSet<A> {
if auto_encoder::is_binary_file(self.get_html_bytes_u8()) {
return Default::default();
}
self.links_stream_base_ssg(selectors, &self.get_html(), client)
.await
}

/// Find the links as a stream using string resource validation
#[inline(always)]
#[cfg(all(not(feature = "decentralized"), not(feature = "full_resources")))]
pub async fn links_stream<A: PartialEq + Eq + std::hash::Hash + From<String>>(
&self,
selectors: &RelativeSelectors,
@@ -1365,6 +1475,22 @@
}
}

/// Find all href links and return them using CSS selectors.
#[inline(always)]
pub async fn links_ssg(
&self,
selectors: &RelativeSelectors,
client: &Client,
) -> HashSet<CaseInsensitiveString> {
match self.html.is_some() {
false => Default::default(),
true => {
self.links_stream_ssg::<CaseInsensitiveString>(selectors, client)
.await
}
}
}

/// Find all href links and return them using CSS selectors gathering all resources.
#[inline(always)]
pub async fn links_full(
49 changes: 22 additions & 27 deletions spider/src/website.rs
@@ -1300,33 +1300,24 @@ impl Website {
log::info!("fetch {}", &url);

// allow initial page mutation
match page.final_redirect_destination.as_deref() {
Some(domain) => {
let prior_domain = self.domain_parsed.take();
self.domain_parsed = match url::Url::parse(domain) {
Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))),
_ => None,
};
self.url = Box::new(domain.into());
match self.setup_selectors() {
Some(s) => {
base.0 = s.0;
base.1 = s.1;
match prior_domain {
Some(prior_domain) => match prior_domain.host_str() {
Some(dname) => {
base.2 = dname.into();
}
_ => (),
},
_ => (),
}
if let Some(domain) = page.final_redirect_destination.as_deref() {
let prior_domain = self.domain_parsed.take();
self.domain_parsed = match url::Url::parse(domain) {
Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))),
_ => None,
};
self.url = Box::new(domain.into());
if let Some(s) = self.setup_selectors() {
base.0 = s.0;
base.1 = s.1;

if let Some(prior_domain) = prior_domain {
if let Some(dname) = prior_domain.host_str() {
base.2 = dname.into();
}
_ => (),
}
}
_ => (),
};
}

let links = if !page.is_empty() {
self.links_visited.insert(match self.on_link_find_callback {
Expand All @@ -1336,7 +1327,7 @@ impl Website {
}
_ => *self.url.clone(),
});
page.links(base).await
page.links_ssg(base, client).await
} else {
self.status = CrawlStatus::Empty;
Default::default()
@@ -1438,7 +1429,8 @@
}
_ => *self.url.clone(),
});
let links = HashSet::from(page.links(&base).await);

let links = HashSet::from(page.links_ssg(&base, &client).await);

links
} else {
@@ -2164,10 +2156,13 @@
let return_page_links = self.configuration.return_page_links;
let only_html = self.configuration.only_html && !full_resources;

let (mut interval, throttle) = self.setup_crawl();

let mut links: HashSet<CaseInsensitiveString> =
self.drain_extra_links().collect();
let (mut interval, throttle) = self.setup_crawl();

links.extend(self._crawl_establish(client, &mut selector, false).await);

self.configuration.configure_allowlist();

let mut q = match &self.channel_queue {
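Aside (not part of the commit): with website.rs now calling page.links_ssg during the initial fetch, Next.js SSG routes found in the manifest are seeded into the crawl from the first page onward. A minimal end-to-end sketch, assuming the crate's usual Website::new / crawl / get_links surface and a tokio runtime (the target URL is a placeholder):

use spider::website::Website;

#[tokio::main]
async fn main() {
    // On a Next.js site the initial page now also contributes the routes
    // listed in /_next/static/<build-id>/_ssgManifest.js, when one is present.
    let mut website = Website::new("https://example.com");
    website.crawl().await;

    for link in website.get_links() {
        println!("{:?}", link);
    }
}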
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.11.18"
version = "2.11.20"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
