diff --git a/crates/jina/Cargo.toml b/crates/jina/Cargo.toml index ad987392fb..f8d87dc8d3 100644 --- a/crates/jina/Cargo.toml +++ b/crates/jina/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2024" [dependencies] -reqwest = { workspace = true } +reqwest = { workspace = true, features = ["json"] } schemars = { workspace = true } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } diff --git a/crates/jina/src/client.rs b/crates/jina/src/client.rs index b3e18d4e5e..9643d0e4dd 100644 --- a/crates/jina/src/client.rs +++ b/crates/jina/src/client.rs @@ -33,7 +33,11 @@ impl JinaClientBuilder { headers.insert(reqwest::header::AUTHORIZATION, auth_value); headers.insert( reqwest::header::ACCEPT, - reqwest::header::HeaderValue::from_static("text/plain"), + reqwest::header::HeaderValue::from_static("application/json"), + ); + headers.insert( + reqwest::header::CONTENT_TYPE, + reqwest::header::HeaderValue::from_static("application/json"), ); let client = reqwest::Client::builder() diff --git a/crates/jina/src/lib.rs b/crates/jina/src/lib.rs index 5d1e8b6a1c..b9061cce02 100644 --- a/crates/jina/src/lib.rs +++ b/crates/jina/src/lib.rs @@ -1,10 +1,14 @@ mod client; mod error; mod reader; +mod search; +mod types; pub use client::*; pub use error::*; pub use reader::*; +pub use search::*; +pub use types::*; macro_rules! common_derives { ($item:item) => { @@ -39,6 +43,44 @@ mod tests { let _ = client .read_url(ReadUrlRequest { url: "https://example.com".to_string(), + respond_with: None, + no_cache: None, + target_selector: None, + wait_for_selector: None, + remove_selector: None, + token_budget: None, + with_links_summary: None, + with_images_summary: None, + retain_images: None, + with_generated_alt: None, + with_iframe: None, + with_shadow_dom: None, + timeout: None, + }) + .await; + } + + #[tokio::test] + #[ignore] + async fn test_search() { + let client = JinaClientBuilder::default() + .api_key("test-key") + .build() + .unwrap(); + + let _ = client + .search(SearchRequest { + q: "latest AI developments".to_string(), + search_type: None, + num: Some(5), + engine: None, + gl: None, + hl: None, + location: None, + page: None, + site: None, + no_cache: None, + token_budget: None, }) .await; } diff --git a/crates/jina/src/reader.rs b/crates/jina/src/reader.rs index 2d191bfd31..0e9a346a48 100644 --- a/crates/jina/src/reader.rs +++ b/crates/jina/src/reader.rs @@ -1,19 +1,64 @@ use crate::client::{JinaClient, check_response}; use crate::common_derives; +use crate::types::{ReaderResponseEnvelope, RespondWith, RetainImages}; common_derives! { + #[serde(rename_all = "camelCase")] pub struct ReadUrlRequest { #[schemars(description = "The URL to read and convert to markdown")] pub url: String, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Response format")] + pub respond_with: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Whether to bypass the cache")] + pub no_cache: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "CSS selectors to focus on specific elements")] + pub target_selector: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "CSS selectors for elements to wait for before reading")] + pub wait_for_selector: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "CSS selectors for elements to remove from the output")] + pub remove_selector: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Maximum number of tokens in the output")] + pub token_budget: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Include a summary of all links at the end")] + pub with_links_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Include a summary of all images at the end")] + pub with_images_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "How to handle images in the output")] + pub retain_images: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Generate alt text for images")] + pub with_generated_alt: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Include iframe content")] + pub with_iframe: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Include shadow DOM content")] + pub with_shadow_dom: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Timeout in milliseconds")] + pub timeout: Option, } } impl JinaClient { pub async fn read_url(&self, req: ReadUrlRequest) -> Result { - let url = format!("https://r.jina.ai/{}", req.url); - - let response = self.client.get(&url).send().await?; + let response = self + .client + .post("https://r.jina.ai/") + .json(&req) + .send() + .await?; let response = check_response(response).await?; - Ok(response.text().await?) + let envelope: ReaderResponseEnvelope = response.json().await?; + Ok(envelope.data.content) } } diff --git a/crates/jina/src/search.rs b/crates/jina/src/search.rs new file mode 100644 index 0000000000..cdbef2a1c2 --- /dev/null +++ b/crates/jina/src/search.rs @@ -0,0 +1,56 @@ +use crate::client::{JinaClient, check_response}; +use crate::common_derives; +use crate::types::{SearchEngine, SearchResponseEnvelope, SearchResultItem, SearchType}; + +common_derives! { + #[serde(rename_all = "camelCase")] + pub struct SearchRequest { + #[schemars(description = "The search query")] + pub q: String, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(rename = "type")] + #[schemars(description = "Type of search: web, images, or news")] + pub search_type: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Number of results to return (0-20)")] + pub num: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Search engine to use")] + pub engine: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Country code for geolocation")] + pub gl: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Language code")] + pub hl: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Location for search results")] + pub location: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Page number for pagination")] + pub page: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Limit results to specific sites")] + pub site: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Whether to bypass the cache")] + pub no_cache: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[schemars(description = "Maximum number of tokens in the output")] + pub token_budget: Option, + } +} + +impl JinaClient { + pub async fn search(&self, req: SearchRequest) -> Result, crate::Error> { + let response = self + .client + .post("https://s.jina.ai/") + .json(&req) + .send() + .await?; + let response = check_response(response).await?; + let envelope: SearchResponseEnvelope = response.json().await?; + Ok(envelope.data) + } +} diff --git a/crates/jina/src/types.rs b/crates/jina/src/types.rs new file mode 100644 index 0000000000..4deb588a89 --- /dev/null +++ b/crates/jina/src/types.rs @@ -0,0 +1,93 @@ +use crate::common_derives; + +common_derives! { + #[serde(rename_all = "camelCase")] + pub struct ReaderResponseEnvelope { + pub code: u16, + pub status: u32, + pub data: ReaderData, + } +} + +common_derives! { + #[serde(rename_all = "camelCase")] + pub struct ReaderData { + #[serde(default)] + pub title: String, + #[serde(default)] + pub url: String, + #[serde(default)] + pub content: String, + #[serde(default)] + pub description: String, + } +} + +common_derives! { + #[serde(rename_all = "camelCase")] + pub struct SearchResponseEnvelope { + pub code: u16, + pub status: u32, + pub data: Vec, + } +} + +common_derives! { + #[serde(rename_all = "camelCase")] + pub struct SearchResultItem { + #[serde(default)] + pub title: String, + #[serde(default)] + pub url: String, + #[serde(default)] + pub content: String, + #[serde(default)] + pub description: String, + } +} + +common_derives! { + pub enum RespondWith { + #[serde(rename = "markdown")] + Markdown, + #[serde(rename = "html")] + Html, + #[serde(rename = "text")] + Text, + #[serde(rename = "screenshot")] + Screenshot, + #[serde(rename = "pageshot")] + Pageshot, + } +} + +common_derives! { + pub enum RetainImages { + #[serde(rename = "none")] + None, + #[serde(rename = "all")] + All, + #[serde(rename = "alt")] + Alt, + } +} + +common_derives! { + pub enum SearchType { + #[serde(rename = "web")] + Web, + #[serde(rename = "images")] + Images, + #[serde(rename = "news")] + News, + } +} + +common_derives! { + pub enum SearchEngine { + #[serde(rename = "google")] + Google, + #[serde(rename = "bing")] + Bing, + } +}