From 19ed6a7889eb3df46a91e8fdb5ca6cb225bacf17 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Wed, 10 Apr 2024 05:50:28 -0400
Subject: [PATCH] chore(openai): add detailed gpt results output

---
 Cargo.lock               |  8 ++++----
 examples/Cargo.toml      |  2 +-
 spider/Cargo.toml        |  2 +-
 spider/README.md         | 24 ++++++++++++------------
 spider/src/page.rs       | 16 ++++++++++++++--
 spider/src/utils.rs      | 24 ++++++++++++++----------
 spider/src/website.rs    |  4 ++--
 spider_cli/Cargo.toml    |  4 ++--
 spider_worker/Cargo.toml |  4 ++--
 9 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 96ce16411..f1631ae14 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3377,7 +3377,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "1.90.0"
+version = "1.91.1"
 dependencies = [
  "ahash",
  "async-openai",
@@ -3427,7 +3427,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "1.90.0"
+version = "1.91.1"
 dependencies = [
  "clap",
  "env_logger",
@@ -3439,7 +3439,7 @@ dependencies = [
 
 [[package]]
 name = "spider_examples"
-version = "1.83.12"
+version = "1.90.0"
 dependencies = [
  "convert_case 0.6.0",
  "env_logger",
@@ -3450,7 +3450,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "1.90.0"
+version = "1.91.1"
 dependencies = [
  "env_logger",
  "lazy_static",
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index b8eae8000..49bbb419b 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.83.12"
+version = "1.90.0"
 authors = [
     "madeindjs ",
     "j-mendez ",
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 0aa12afb4..c431bf1af 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.90.0"
+version = "1.91.1"
 authors = [
     "madeindjs ",
     "j-mendez ",
diff --git a/spider/README.md b/spider/README.md
index 6aebc857c..60391d583 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom
 
 ```toml
 [dependencies]
-spider = "1.90.0"
+spider = "1.91.1"
 ```
 
 And then the code:
@@ -93,7 +93,7 @@ We have the following optional feature flags.
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["regex", "ua_generator"] }
+spider = { version = "1.91.1", features = ["regex", "ua_generator"] }
 ```
 
 1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -137,7 +137,7 @@ Move processing to a worker, drastically increases performance even if worker is
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["decentralized"] }
+spider = { version = "1.91.1", features = ["decentralized"] }
 ```
 
 ```sh
@@ -168,7 +168,7 @@ Use the subscribe method to get a broadcast channel.
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["sync"] }
+spider = { version = "1.91.1", features = ["sync"] }
 ```
 
 ```rust,no_run
@@ -198,7 +198,7 @@ Allow regex for blacklisting routes
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["regex"] }
+spider = { version = "1.91.1", features = ["regex"] }
 ```
 
 ```rust,no_run
@@ -225,7 +225,7 @@ If you are performing large workloads you may need to control the crawler by ena
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["control"] }
+spider = { version = "1.91.1", features = ["control"] }
 ```
 
 ```rust
@@ -295,7 +295,7 @@ Use cron jobs to run crawls continuously at anytime.
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["sync", "cron"] }
+spider = { version = "1.91.1", features = ["sync", "cron"] }
 ```
 
 ```rust,no_run
@@ -334,7 +334,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["chrome", "chrome_intercept"] }
+spider = { version = "1.91.1", features = ["chrome", "chrome_intercept"] }
 ```
 
 You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
@@ -364,7 +364,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["cache"] }
+spider = { version = "1.91.1", features = ["cache"] }
 ```
 
 You need to set `website.cache` to true to enable as well.
@@ -395,7 +395,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["smart"] }
+spider = { version = "1.91.1", features = ["smart"] }
 ```
 
 ```rust,no_run
@@ -421,7 +421,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["openai"] }
+spider = { version = "1.91.1", features = ["openai"] }
 ```
 
 ```rust
@@ -447,7 +447,7 @@ Set a depth limit to prevent forwarding.
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["budget"] }
+spider = { version = "1.91.1", features = ["budget"] }
 ```
 
 ```rust,no_run
diff --git a/spider/src/page.rs b/spider/src/page.rs
index ce6ae05a8..a47232121 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -63,6 +63,18 @@ lazy_static! {
     };
 }
 
+/// The AI data returned from a GPT.
+#[derive(Debug, Clone, Default)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+pub struct AIResults {
+    /// The prompt used for the GPT.
+    pub input: String,
+    /// The js output of the GPT response.
+    pub js_output: String,
+    /// The content output returned from the GPT response that is not a browser script, example: extracted data from the markup.
+    pub content_output: Vec<String>,
+}
+
 /// Represent a page visited. This page contains HTML scraped with [scraper](https://crates.io/crates/scraper).
 #[derive(Debug, Clone)]
 #[cfg(not(feature = "decentralized"))]
@@ -98,7 +110,7 @@ pub struct Page {
     pub openai_credits_used: Option>,
     #[cfg(feature = "openai")]
     /// The extra data from the AI, example extracting data etc...
-    pub extra_ai_data: Option<Vec<String>>,
+    pub extra_ai_data: Option<Vec<AIResults>>,
 }
 
 /// Represent a page visited. This page contains HTML scraped with [scraper](https://crates.io/crates/scraper).
@@ -128,7 +140,7 @@ pub struct Page {
     pub openai_credits_used: Option>,
     #[cfg(feature = "openai")]
     /// The extra data from the AI, example extracting data etc...
-    pub extra_ai_data: Option<Vec<String>>,
+    pub extra_ai_data: Option<Vec<AIResults>>,
 }
 
 lazy_static! {
diff --git a/spider/src/utils.rs b/spider/src/utils.rs
index c913a141d..5fec4951a 100644
--- a/spider/src/utils.rs
+++ b/spider/src/utils.rs
@@ -71,7 +71,7 @@ pub struct PageResponse {
     pub openai_credits_used: Option>,
     #[cfg(feature = "openai")]
     /// The extra data from the AI, example extracting data etc...
-    pub extra_ai_data: Option<Vec<String>>,
+    pub extra_ai_data: Option<Vec<AIResults>>,
 }
 
 /// wait for event with timeout
@@ -228,24 +228,27 @@ pub fn handle_openai_credits(_page_response: &mut PageResponse, _tokens_used: Op
 
 /// Handle extra OpenAI data used. This does nothing without 'openai' feature flag.
 #[cfg(feature = "openai")]
-pub fn handle_extra_ai_data(page_response: &mut PageResponse, js: &str) -> String {
+pub fn handle_extra_ai_data(page_response: &mut PageResponse, prompt: &str, js: &str) {
     match serde_json::from_str::(&js) {
         Ok(x) => {
+            let ai_response = crate::page::AIResults {
+                input: prompt.into(),
+                js_output: x.js,
+                content_output: x.content,
+            };
+
             match page_response.extra_ai_data.as_mut() {
-                Some(v) => v.extend(x.content),
-                None => page_response.extra_ai_data = Some(x.content),
+                Some(v) => v.push(ai_response),
+                None => page_response.extra_ai_data = Some(Vec::from([ai_response])),
             };
-            x.js
         }
-        _ => Default::default(),
+        _ => (),
     }
 }
 
 #[cfg(not(feature = "openai"))]
 /// Handle extra OpenAI data used. This does nothing without 'openai' feature flag.
-pub fn handle_extra_ai_data(_page_response: &mut PageResponse, _js: &str) -> String {
-    Default::default()
-}
+pub fn handle_extra_ai_data(_page_response: &mut PageResponse, _prompt: &str, _js: &str) {}
 
 #[cfg(feature = "chrome")]
 /// Perform a network request to a resource extracting all content as text streaming via chrome.
@@ -349,7 +352,8 @@ pub async fn fetch_page_html_chrome_base(
         };
 
         let js_script = if gpt_configs.extra_ai_data {
-            handle_extra_ai_data(&mut page_response, &js_script)
+            handle_extra_ai_data(&mut page_response, &prompt, &js_script);
+            js_script
         } else {
             js_script
         };
diff --git a/spider/src/website.rs b/spider/src/website.rs
index f5d5ea1fd..fd32d4f16 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -4410,7 +4410,7 @@ impl Website {
     /// use spider::tokio;
     /// use spider::website::Website;
     /// #[tokio::main]
-    /// 
+    ///
     /// async fn main() {
     ///     let mut website: Website = Website::new("http://example.com");
     ///     let mut rx2 = website.subscribe(18).unwrap();
@@ -4450,7 +4450,7 @@ impl Website {
     /// ```
     /// use spider::tokio;
     /// use spider::website::Website;
-    /// 
+    ///
     /// #[tokio::main]
     /// async fn main() {
     ///     let mut website: Website = Website::new("http://example.com");
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index 3117f2456..b2e42bd5b 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "1.90.0"
+version = "1.91.1"
 authors = [
     "madeindjs ",
     "j-mendez ",
@@ -29,7 +29,7 @@ quote = "1.0.18"
 failure_derive = "0.1.8"
 
 [dependencies.spider]
-version = "1.90.0"
+version = "1.91.1"
 path = "../spider"
 
 [[bin]]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index 3b79dea4d..be4164f0d 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "1.90.0"
+version = "1.91.1"
 authors = [
     "madeindjs ",
     "j-mendez ",
@@ -25,7 +25,7 @@ lazy_static = "1.4.0"
 env_logger = "0.11.3"
 
 [dependencies.spider]
-version = "1.90.0"
+version = "1.91.1"
 path = "../spider"
 features = ["serde", "flexbuffers"]
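
Reviewer note: the sketch below is not part of the patch. It is a rough illustration of how a downstream consumer might read the reshaped `extra_ai_data` field, which now carries `Vec<AIResults>` instead of `Vec<String>`. It assumes the `openai` and `sync` feature flags are enabled, that GPT prompts were configured on the website elsewhere, and the URL and channel capacity are placeholder values.

```rust
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // Placeholder target; the GPT prompt configuration is assumed to be set
    // on the website elsewhere (requires the `openai` feature flag).
    let mut website: Website = Website::new("https://example.com");

    // Subscribe to crawled pages (requires the `sync` feature flag).
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // With this patch, `extra_ai_data` is an Option<Vec<AIResults>>,
            // so the prompt, js output, and content output arrive grouped
            // per GPT response instead of as a flat list of strings.
            if let Some(results) = page.extra_ai_data {
                for r in results {
                    println!("prompt: {}", r.input);
                    println!("js: {}", r.js_output);
                    println!("content: {:?}", r.content_output);
                }
            }
        }
    });

    website.crawl().await;
}
```

Grouping each prompt with its own outputs means multiple GPT calls on a single page are no longer flattened into one undifferentiated list, which is the behavior the old `v.extend(x.content)` path produced.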