feat(page): add nextjs build ssg path handling
j-mendez committed Nov 4, 2024
1 parent f088564 commit bc68858
Showing 9 changed files with 170 additions and 49 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

10 changes: 5 additions & 5 deletions spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
@@ -22,7 +22,7 @@ tokio-stream = "0.1"
hashbrown = { version = "0.15", default-features = true }
log = "0.4"
percent-encoding = "2"
regex = { version = "1", optional = true }
regex = { version = "1" }
ua_generator = { version = "^0.5", optional = true }
string_concat = "0.0.1"
lazy_static = "1"
@@ -113,8 +113,8 @@ features = [

[features]
default = ["sync", "reqwest_native_tls_native_roots", "cookies", "ua_generator", "encoding", "string_interner_buffer_backend"]
regex = ["dep:regex"]
glob = ["dep:regex", "dep:itertools"]
regex = []
glob = [ "dep:itertools"]
ua_generator = ["dep:ua_generator"]
jemalloc = ["tikv-jemallocator"]
decentralized = ["serde", "flexbuffers"]
@@ -143,7 +143,7 @@ chrome_intercept = ["chrome"]
chrome_headless_new = ["chrome"]
cookies = ["reqwest/cookies"]
cron = ["dep:async_job", "dep:chrono", "dep:cron", "dep:async-trait"]
smart = ["chrome", "dep:regex", "dep:rand", "chrome_intercept"]
smart = ["chrome", "dep:rand", "chrome_intercept"]
encoding = []
headers = ["dep:httpdate"]
real_browser = ["dep:statrs", "dep:rand"]
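Aside (not part of the commit): the feature changes above make regex a hard dependency rather than an optional one, because the new SSG-manifest parsing in page.rs compiles a regex::bytes pattern unconditionally; the regex and glob feature names are kept (now without dep:regex) so existing --features invocations still resolve. A minimal sketch of what that means for gating, using an illustrative helper name that does not exist in the crate:

use regex::Regex;

// Before this commit a helper like this would have needed `#[cfg(feature = "regex")]`
// because the dependency was optional; now it compiles unconditionally.
fn url_matches(pattern: &Regex, url: &str) -> bool {
    pattern.is_match(url)
}

fn main() {
    let pattern = Regex::new(r"^/blog/").unwrap();
    assert!(url_matches(&pattern, "/blog/post-1"));
    assert!(!url_matches(&pattern, "/about"));
}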
138 changes: 132 additions & 6 deletions spider/src/page.rs
@@ -12,6 +12,7 @@ use crate::Client;
use crate::RelativeSelectors;
use bytes::Bytes;
use hashbrown::HashSet;
use regex::bytes::Regex;
use reqwest::StatusCode;
use tokio::time::Duration;

@@ -26,6 +27,7 @@ use url::Url;
lazy_static! {
/// Wildcard match all domains.
static ref CASELESS_WILD_CARD: CaseInsensitiveString = CaseInsensitiveString::new("*");
static ref SSG_CAPTURE: Regex = Regex::new(r#""(.*?)""#).unwrap();
}

#[cfg(any(feature = "smart", feature = "chrome_intercept"))]
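Aside (not part of the commit): the SSG_CAPTURE pattern added above extracts quoted strings from a Next.js _ssgManifest.js payload; page.rs then decodes them and replaces the \u002F escapes with "/". A standalone sketch of that extraction against a made-up manifest body:

use regex::bytes::Regex;

fn main() {
    // Same pattern as SSG_CAPTURE: non-greedy capture of anything between double quotes.
    let ssg_capture = Regex::new(r#""(.*?)""#).unwrap();

    // Hypothetical manifest body; real manifests list the statically generated routes.
    let manifest: &[u8] =
        br#"self.__SSG_MANIFEST=new Set(["\u002Fabout","\u002Fblog\u002F[slug]"]);"#;

    for cap in ssg_capture.captures_iter(manifest) {
        if let Some(m) = cap.get(1) {
            // page.rs uses auto_encoder::auto_encode_bytes here; lossy UTF-8 is enough for a sketch.
            let href = String::from_utf8_lossy(m.as_bytes()).replace(r"\u002F", "/");
            println!("{href}"); // "/about", then "/blog/[slug]"
        }
    }
}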
@@ -967,20 +969,16 @@ impl Page {
} else {
let html = Box::new(Html::parse_fragment(html));
let mut stream = tokio_stream::iter(html.tree);

// the original url
let parent_host = &selectors.1[0];
// the host schemes
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2; // the domain after redirects
// the base matcher to
let sub_matcher = &selectors.0;

while let Some(node) = stream.next().await {
if let Some(element) = node.as_element() {
let element_name = element.name();

if element_name == "a" {
if element.name() == "a" {
if let Some(href) = element.attr("href") {
self.push_link(
href,
@@ -1002,7 +1000,119 @@

/// Find the links as a stream using string resource validation
#[inline(always)]
#[cfg(all(not(feature = "decentralized"), not(feature = "full_resources"),))]
pub async fn links_stream_base_ssg<A: PartialEq + Eq + std::hash::Hash + From<String>>(
&self,
selectors: &RelativeSelectors,
html: &str,
client: &Client,
) -> HashSet<A> {
use auto_encoder::auto_encode_bytes;

let mut map = HashSet::new();

if !html.is_empty() {
if html.starts_with("<?xml") {
self.links_stream_xml_links_stream_base(selectors, html, &mut map)
.await;
} else {
let html = Box::new(crate::packages::scraper::Html::parse_fragment(html));
let mut stream = tokio_stream::iter(html.tree);

// the original url
let parent_host = &selectors.1[0];
// the host schemes
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2; // the domain after redirects
let sub_matcher = &selectors.0;

let mut build_ssg_path = None;

while let Some(node) = stream.next().await {
if let Some(element) = node.as_element() {
match element.name() {
"a" => {
if let Some(href) = element.attr("href") {
self.push_link(
href,
&mut map,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
);
}
}
"script" if build_ssg_path.is_none() => {
if let Some(source) = element.attr("src") {
if source.starts_with("/_next/static/")
&& source.ends_with("/_ssgManifest.js")
{
build_ssg_path = Some(self.abs_path(source));
}
}
}
_ => (),
}
}
}

if let Some(build_ssg_path) = build_ssg_path {
if let Some(s) = build_ssg_path {
let build_page = Page::new_page(s.as_str(), &client).await;

for cap in SSG_CAPTURE.captures_iter(build_page.get_html_bytes_u8()) {
if let Some(matched) = cap.get(1) {
let href = auto_encode_bytes(&matched.as_bytes())
.replace(r#"\u002F"#, "/");

fn get_last_segment(path: &str) -> &str {
if let Some(pos) = path.rfind('/') {
&path[pos + 1..]
} else {
path
}
}

let last_segment = get_last_segment(&href);

// we can pass in a static map of the dynamic SSG routes pre-hand, custom API endpoint to seed, or etc later.
if !(last_segment.starts_with("[") && last_segment.ends_with("]")) {
self.push_link(
&href,
&mut map,
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
sub_matcher,
);
}
}
}
}
}
}
}
map
}
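Aside (not part of the commit): the last-segment check above is what keeps dynamic route templates such as /blog/[slug] out of the crawl queue, since those manifest entries are placeholders rather than fetchable URLs. A standalone sketch of that filter (the function name is illustrative, not exported by the crate):

/// True when the final path segment is a Next.js dynamic placeholder like [slug] or [...path].
fn is_dynamic_ssg_route(path: &str) -> bool {
    let last_segment = path.rsplit('/').next().unwrap_or(path);
    last_segment.starts_with('[') && last_segment.ends_with(']')
}

fn main() {
    assert!(!is_dynamic_ssg_route("/about"));
    assert!(!is_dynamic_ssg_route("/blog/launch-post"));
    assert!(is_dynamic_ssg_route("/blog/[slug]"));
    assert!(is_dynamic_ssg_route("/docs/[...path]"));
}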

/// Find the links as a stream using string resource validation and parsing the script for nextjs initial SSG paths.
pub async fn links_stream_ssg<A: PartialEq + Eq + std::hash::Hash + From<String>>(
&self,
selectors: &RelativeSelectors,
client: &Client,
) -> HashSet<A> {
if auto_encoder::is_binary_file(self.get_html_bytes_u8()) {
return Default::default();
}
self.links_stream_base_ssg(selectors, &self.get_html(), client)
.await
}

/// Find the links as a stream using string resource validation
#[inline(always)]
#[cfg(all(not(feature = "decentralized"), not(feature = "full_resources")))]
pub async fn links_stream<A: PartialEq + Eq + std::hash::Hash + From<String>>(
&self,
selectors: &RelativeSelectors,
@@ -1365,6 +1475,22 @@
}
}

/// Find all href links and return them using CSS selectors.
#[inline(always)]
pub async fn links_ssg(
&self,
selectors: &RelativeSelectors,
client: &Client,
) -> HashSet<CaseInsensitiveString> {
match self.html.is_some() {
false => Default::default(),
true => {
self.links_stream_ssg::<CaseInsensitiveString>(selectors, client)
.await
}
}
}

/// Find all href links and return them using CSS selectors gathering all resources.
#[inline(always)]
pub async fn links_full(
49 changes: 22 additions & 27 deletions spider/src/website.rs
@@ -1300,33 +1300,24 @@ impl Website {
log::info!("fetch {}", &url);

// allow initial page mutation
match page.final_redirect_destination.as_deref() {
Some(domain) => {
let prior_domain = self.domain_parsed.take();
self.domain_parsed = match url::Url::parse(domain) {
Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))),
_ => None,
};
self.url = Box::new(domain.into());
match self.setup_selectors() {
Some(s) => {
base.0 = s.0;
base.1 = s.1;
match prior_domain {
Some(prior_domain) => match prior_domain.host_str() {
Some(dname) => {
base.2 = dname.into();
}
_ => (),
},
_ => (),
}
if let Some(domain) = page.final_redirect_destination.as_deref() {
let prior_domain = self.domain_parsed.take();
self.domain_parsed = match url::Url::parse(domain) {
Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))),
_ => None,
};
self.url = Box::new(domain.into());
if let Some(s) = self.setup_selectors() {
base.0 = s.0;
base.1 = s.1;

if let Some(prior_domain) = prior_domain {
if let Some(dname) = prior_domain.host_str() {
base.2 = dname.into();
}
_ => (),
}
}
_ => (),
};
}

let links = if !page.is_empty() {
self.links_visited.insert(match self.on_link_find_callback {
Expand All @@ -1336,7 +1327,7 @@ impl Website {
}
_ => *self.url.clone(),
});
page.links(base).await
page.links_ssg(base, client).await
} else {
self.status = CrawlStatus::Empty;
Default::default()
@@ -1438,7 +1429,8 @@
}
_ => *self.url.clone(),
});
let links = HashSet::from(page.links(&base).await);

let links = HashSet::from(page.links_ssg(&base, &client).await);

links
} else {
@@ -2164,10 +2156,13 @@
let return_page_links = self.configuration.return_page_links;
let only_html = self.configuration.only_html && !full_resources;

let (mut interval, throttle) = self.setup_crawl();

let mut links: HashSet<CaseInsensitiveString> =
self.drain_extra_links().collect();
let (mut interval, throttle) = self.setup_crawl();

links.extend(self._crawl_establish(client, &mut selector, false).await);

self.configuration.configure_allowlist();

let mut q = match &self.channel_queue {
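Aside (not part of the commit): with website.rs now calling page.links_ssg during the initial fetch, Next.js SSG routes found in the manifest are seeded into the crawl from the first page onward. A minimal end-to-end sketch, assuming the crate's usual Website::new / crawl / get_links surface and a tokio runtime (the target URL is a placeholder):

use spider::website::Website;

#[tokio::main]
async fn main() {
    // On a Next.js site the initial page now also contributes the routes
    // listed in /_next/static/<build-id>/_ssgManifest.js, when one is present.
    let mut website = Website::new("https://example.com");
    website.crawl().await;

    for link in website.get_links() {
        println!("{:?}", link);
    }
}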
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.11.18"
version = "2.11.20"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.11.18"
version = "2.11.20"
authors = [
"j-mendez <jeff@spider.cloud>"
]
