From 19ed6a7889eb3df46a91e8fdb5ca6cb225bacf17 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Wed, 10 Apr 2024 05:50:28 -0400
Subject: [PATCH] chore(openai): add detailed gpt results output

---
 Cargo.lock               |  8 ++++----
 examples/Cargo.toml      |  2 +-
 spider/Cargo.toml        |  2 +-
 spider/README.md         | 24 ++++++++++++------------
 spider/src/page.rs       | 16 ++++++++++++++--
 spider/src/utils.rs      | 24 ++++++++++++++----------
 spider/src/website.rs    |  4 ++--
 spider_cli/Cargo.toml    |  4 ++--
 spider_worker/Cargo.toml |  4 ++--
 9 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 96ce16411..f1631ae14 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3377,7 +3377,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "1.90.0"
+version = "1.91.1"
 dependencies = [
  "ahash",
  "async-openai",
@@ -3427,7 +3427,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "1.90.0"
+version = "1.91.1"
 dependencies = [
  "clap",
  "env_logger",
@@ -3439,7 +3439,7 @@ dependencies = [
 
 [[package]]
 name = "spider_examples"
-version = "1.83.12"
+version = "1.90.0"
 dependencies = [
  "convert_case 0.6.0",
  "env_logger",
@@ -3450,7 +3450,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "1.90.0"
+version = "1.91.1"
 dependencies = [
  "env_logger",
  "lazy_static",
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index b8eae8000..49bbb419b 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.83.12"
+version = "1.90.0"
 authors = [
     "madeindjs ",
     "j-mendez ",
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 0aa12afb4..c431bf1af 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.90.0"
+version = "1.91.1"
 authors = [
     "madeindjs ",
     "j-mendez ",
diff --git a/spider/README.md b/spider/README.md
index 6aebc857c..60391d583 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom
 
 ```toml
 [dependencies]
-spider = "1.90.0"
+spider = "1.91.1"
 ```
 
 And then the code:
@@ -93,7 +93,7 @@ We have the following optional feature flags.
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["regex", "ua_generator"] }
+spider = { version = "1.91.1", features = ["regex", "ua_generator"] }
 ```
 
 1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -137,7 +137,7 @@ Move processing to a worker, drastically increases performance even if worker is
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["decentralized"] }
+spider = { version = "1.91.1", features = ["decentralized"] }
 ```
 
 ```sh
@@ -168,7 +168,7 @@ Use the subscribe method to get a broadcast channel.
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["sync"] }
+spider = { version = "1.91.1", features = ["sync"] }
 ```
 
 ```rust,no_run
@@ -198,7 +198,7 @@ Allow regex for blacklisting routes
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["regex"] }
+spider = { version = "1.91.1", features = ["regex"] }
 ```
 
 ```rust,no_run
@@ -225,7 +225,7 @@ If you are performing large workloads you may need to control the crawler by ena
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["control"] }
+spider = { version = "1.91.1", features = ["control"] }
 ```
 
 ```rust
@@ -295,7 +295,7 @@ Use cron jobs to run crawls continuously at anytime.
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["sync", "cron"] }
+spider = { version = "1.91.1", features = ["sync", "cron"] }
 ```
 
 ```rust,no_run
@@ -334,7 +334,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["chrome", "chrome_intercept"] }
+spider = { version = "1.91.1", features = ["chrome", "chrome_intercept"] }
 ```
 
 You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
@@ -364,7 +364,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["cache"] }
+spider = { version = "1.91.1", features = ["cache"] }
 ```
 
 You need to set `website.cache` to true to enable as well.
@@ -395,7 +395,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["smart"] }
+spider = { version = "1.91.1", features = ["smart"] }
 ```
 
 ```rust,no_run
@@ -421,7 +421,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["openai"] }
+spider = { version = "1.91.1", features = ["openai"] }
 ```
 
 ```rust
@@ -447,7 +447,7 @@ Set a depth limit to prevent forwarding.
 
 ```toml
 [dependencies]
-spider = { version = "1.90.0", features = ["budget"] }
+spider = { version = "1.91.1", features = ["budget"] }
 ```
 
 ```rust,no_run
diff --git a/spider/src/page.rs b/spider/src/page.rs
index ce6ae05a8..a47232121 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -63,6 +63,18 @@ lazy_static! {
     };
 }
 
+/// The AI data returned from a GPT.
+#[derive(Debug, Clone, Default)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+pub struct AIResults {
+    /// The prompt used for the GPT.
+    pub input: String,
+    /// The js output of the GPT response.
+    pub js_output: String,
+    /// The content output returned from the GPT response that is not a browser script, example: extracted data from the markup.
+    pub content_output: Vec<String>,
+}
+
 /// Represent a page visited. This page contains HTML scraped with [scraper](https://crates.io/crates/scraper).
 #[derive(Debug, Clone)]
 #[cfg(not(feature = "decentralized"))]
@@ -98,7 +110,7 @@ pub struct Page {
     pub openai_credits_used: Option>,
     #[cfg(feature = "openai")]
     /// The extra data from the AI, example extracting data etc...
-    pub extra_ai_data: Option<Vec<String>>,
+    pub extra_ai_data: Option<Vec<AIResults>>,
 }
 
 /// Represent a page visited. This page contains HTML scraped with [scraper](https://crates.io/crates/scraper).
@@ -128,7 +140,7 @@ pub struct Page {
     pub openai_credits_used: Option>,
     #[cfg(feature = "openai")]
     /// The extra data from the AI, example extracting data etc...
-    pub extra_ai_data: Option<Vec<String>>,
+    pub extra_ai_data: Option<Vec<AIResults>>,
 }
 
 lazy_static! {
diff --git a/spider/src/utils.rs b/spider/src/utils.rs
index c913a141d..5fec4951a 100644
--- a/spider/src/utils.rs
+++ b/spider/src/utils.rs
@@ -71,7 +71,7 @@ pub struct PageResponse {
     pub openai_credits_used: Option>,
     #[cfg(feature = "openai")]
     /// The extra data from the AI, example extracting data etc...
-    pub extra_ai_data: Option<Vec<String>>,
+    pub extra_ai_data: Option<Vec<AIResults>>,
 }
 
 /// wait for event with timeout
@@ -228,24 +228,27 @@ pub fn handle_openai_credits(_page_response: &mut PageResponse, _tokens_used: Op
 
 /// Handle extra OpenAI data used. This does nothing without 'openai' feature flag.
 #[cfg(feature = "openai")]
-pub fn handle_extra_ai_data(page_response: &mut PageResponse, js: &str) -> String {
+pub fn handle_extra_ai_data(page_response: &mut PageResponse, prompt: &str, js: &str) {
     match serde_json::from_str::(&js) {
         Ok(x) => {
+            let ai_response = crate::page::AIResults {
+                input: prompt.into(),
+                js_output: x.js,
+                content_output: x.content,
+            };
+
             match page_response.extra_ai_data.as_mut() {
-                Some(v) => v.extend(x.content),
-                None => page_response.extra_ai_data = Some(x.content),
+                Some(v) => v.push(ai_response),
+                None => page_response.extra_ai_data = Some(Vec::from([ai_response])),
             };
-            x.js
         }
-        _ => Default::default(),
+        _ => (),
     }
 }
 
 #[cfg(not(feature = "openai"))]
 /// Handle extra OpenAI data used. This does nothing without 'openai' feature flag.
-pub fn handle_extra_ai_data(_page_response: &mut PageResponse, _js: &str) -> String {
-    Default::default()
-}
+pub fn handle_extra_ai_data(_page_response: &mut PageResponse, _prompt: &str, _js: &str) {}
 
 #[cfg(feature = "chrome")]
 /// Perform a network request to a resource extracting all content as text streaming via chrome.
@@ -349,7 +352,8 @@ pub async fn fetch_page_html_chrome_base(
         };
 
         let js_script = if gpt_configs.extra_ai_data {
-            handle_extra_ai_data(&mut page_response, &js_script)
+            handle_extra_ai_data(&mut page_response, &prompt, &js_script);
+            js_script
         } else {
             js_script
         };
diff --git a/spider/src/website.rs b/spider/src/website.rs
index f5d5ea1fd..fd32d4f16 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -4410,7 +4410,7 @@ impl Website {
     /// use spider::tokio;
     /// use spider::website::Website;
     /// #[tokio::main]
-    /// 
+    ///
     /// async fn main() {
     ///     let mut website: Website = Website::new("http://example.com");
     ///     let mut rx2 = website.subscribe(18).unwrap();
@@ -4450,7 +4450,7 @@ impl Website {
     /// ```
     /// use spider::tokio;
     /// use spider::website::Website;
-    /// 
+    ///
     /// #[tokio::main]
     /// async fn main() {
     ///     let mut website: Website = Website::new("http://example.com");
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index 3117f2456..b2e42bd5b 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "1.90.0"
+version = "1.91.1"
 authors = [
     "madeindjs ",
     "j-mendez ",
@@ -29,7 +29,7 @@ quote = "1.0.18"
 failure_derive = "0.1.8"
 
 [dependencies.spider]
-version = "1.90.0"
+version = "1.91.1"
 path = "../spider"
 
 [[bin]]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index 3b79dea4d..be4164f0d 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "1.90.0"
+version = "1.91.1"
 authors = [
     "madeindjs ",
     "j-mendez ",
@@ -25,7 +25,7 @@ lazy_static = "1.4.0"
 env_logger = "0.11.3"
 
 [dependencies.spider]
-version = "1.90.0"
+version = "1.91.1"
 path = "../spider"
 features = ["serde", "flexbuffers"]
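
Reviewer note: the sketch below is not part of the patch. It is a rough illustration of how a downstream consumer might read the reshaped `extra_ai_data` field, which now carries `Vec<AIResults>` instead of `Vec<String>`. It assumes the `openai` and `sync` feature flags are enabled, that GPT prompts were configured on the website elsewhere, and the URL and channel capacity are placeholder values.

```rust
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // Placeholder target; the GPT prompt configuration is assumed to be set
    // on the website elsewhere (requires the `openai` feature flag).
    let mut website: Website = Website::new("https://example.com");

    // Subscribe to crawled pages (requires the `sync` feature flag).
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // With this patch, `extra_ai_data` is an Option<Vec<AIResults>>,
            // so the prompt, js output, and content output arrive grouped
            // per GPT response instead of as a flat list of strings.
            if let Some(results) = page.extra_ai_data {
                for r in results {
                    println!("prompt: {}", r.input);
                    println!("js: {}", r.js_output);
                    println!("content: {:?}", r.content_output);
                }
            }
        }
    });

    website.crawl().await;
}
```

Grouping each prompt with its own outputs means multiple GPT calls on a single page are no longer flattened into one undifferentiated list, which is the behavior the old `v.extend(x.content)` path produced.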