From b8f31db42a0b0212eb1c95c7634d112b929f05b1 Mon Sep 17 00:00:00 2001 From: frc4533-lincoln <132951735+frc4533-lincoln@users.noreply.github.com> Date: Sat, 19 Oct 2024 19:15:17 -0400 Subject: [PATCH] Kinda works --- Cargo.lock | 153 +++++++++++++++++++++++++++++++++- Cargo.toml | 16 +++- assets/dragynfruit.png | Bin 0 -> 2857 bytes src/crawler.rs | 3 - src/lib.rs | 57 +++++++++++++ src/main.rs | 73 ++++++++++++---- src/scrapers/bandcamp.rs | 23 +++++ src/scrapers/duckduckgo.rs | 86 +++++++++++++++++++ src/scrapers/google.rs | 37 ++++++++ src/scrapers/mod.rs | 26 ++++++ src/scrapers/stackexchange.rs | 54 ++++++++++++ src/web.rs | 25 ++++-- views/base.html | 98 +++++++++++++--------- views/index.html | 5 +- views/results.html | 52 +++++++++--- 15 files changed, 621 insertions(+), 87 deletions(-) create mode 100644 assets/dragynfruit.png create mode 100644 src/lib.rs create mode 100644 src/scrapers/bandcamp.rs create mode 100644 src/scrapers/duckduckgo.rs create mode 100644 src/scrapers/google.rs create mode 100644 src/scrapers/mod.rs create mode 100644 src/scrapers/stackexchange.rs diff --git a/Cargo.lock b/Cargo.lock index 8b2e1cf..6ff19e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "addr2line" @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -109,12 +115,31 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "anyhow" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" + [[package]] name = "arc-swap" version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" +[[package]] +name = "async-compression" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" +dependencies = [ + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", +] + [[package]] name = "async-trait" version = "0.1.80" @@ -203,7 +228,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.3", "object", "rustc-demangle", ] @@ -244,6 +269,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata 0.1.10", +] + [[package]] name = "bstr" version = "1.9.1" @@ -335,6 +371,34 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" +[[package]] +name = "cookie" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7efb37c3e1ccb1ff97164ad95ac1606e8ccd35b3fa0a7d99a304c7f4a428cc24" +dependencies = [ + "percent-encoding", + "time", + "version_check", +] + +[[package]] +name = "cookie_store" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "387461abbc748185c3a6e1673d826918b450b87ff22639429c694619a83b6cf6" +dependencies = [ + "cookie", + "idna 0.3.0", + "log", + "publicsuffix", + "serde", + "serde_derive", + "serde_json", + "time", + "url", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -572,6 +636,16 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "flate2" +version = "1.0.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +dependencies = [ + "crc32fast", + "miniz_oxide 0.8.0", +] + [[package]] name = "fnv" version = "1.0.7" @@ -737,7 +811,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57da3b9b5b85bd66f31093f8c408b90a74431672542466497dcbdfdc02034be1" dependencies = [ "aho-corasick", - "bstr", + "bstr 1.9.1", "log", "regex-automata 0.4.6", "regex-syntax 0.8.3", @@ -950,6 +1024,16 @@ dependencies = [ "cc", ] +[[package]] +name = "idna" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "idna" version = "0.5.0" @@ -1193,6 +1277,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "0.8.11" @@ -1625,6 +1718,22 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "psl-types" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" + +[[package]] +name = "publicsuffix" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a8c1bda5ae1af7f99a2962e49df150414a43d62404644d98dd5c3a93d07457" +dependencies = [ + "idna 0.3.0", + "psl-types", +] + [[package]] name = "quote" version = "1.0.36" @@ -1762,8 +1871,11 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" dependencies = [ + "async-compression", "base64", "bytes", + "cookie", + "cookie_store", "encoding_rs", "futures-core", "futures-util", @@ -1790,6 +1902,7 @@ dependencies = [ "system-configuration", "tokio", "tokio-native-tls", + "tokio-util", "tower-service", "url", "wasm-bindgen", @@ -1914,12 +2027,17 @@ dependencies = [ "axum", "env_logger", "log", + "lru", + "once_cell", + "phf 0.11.2", "reqwest", "scraper", "serde", + "serde_qs", "sled", "tantivy", "tera", + "texting_robots", "tokio", "url", ] @@ -2007,6 +2125,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_qs" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd34f36fe4c5ba9654417139a9b3a20d2e1de6012ee678ad14d240c22c78d8d6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2396,6 +2525,22 @@ dependencies = [ "unic-segment", ] +[[package]] +name = "texting_robots" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82a718a28dda2e67ad6e0464597b58eae39e2e4d0451e03d1028d71e81bb4a" +dependencies = [ + "anyhow", + "bstr 0.2.17", + "lazy_static", + "nom", + "percent-encoding", + "regex", + "thiserror", + "url", +] + [[package]] name = "thiserror" version = "1.0.61" @@ -2704,7 +2849,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", - "idna", + "idna 0.5.0", "percent-encoding", ] diff --git a/Cargo.toml b/Cargo.toml index 48be63f..b014c7a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,15 +11,25 @@ env_logger = "0.11.3" log = "0.4.21" lru = "0.12.3" once_cell = "1.19.0" -reqwest = "0.12.4" +reqwest = { version = "0.12.4", features = ["cookies", "gzip", "json"] } scraper = "0.19.0" serde = { version = "1.0.203", features = ["derive"] } sled = "0.34.7" -tantivy = { version = "0.22.0", default-features = false, features = ["zstd-compression", "mmap", "stopwords"] } +tantivy = { version = "0.22.0", default-features = false, features = [ + "zstd-compression", + "mmap", + "stopwords", +] } tera = "1.20.0" texting_robots = "0.2.2" -tokio = { version = "1.38.0", features = ["rt-multi-thread", "macros", "signal"] } +tokio = { version = "1.38.0", features = [ + "rt-multi-thread", + "macros", + "signal", +] } url = "2.5.0" +serde_qs = "0.13.0" +phf = { version = "0.11", features = ["macros"] } [profile.release] lto = true diff --git a/assets/dragynfruit.png b/assets/dragynfruit.png new file mode 100644 index 0000000000000000000000000000000000000000..0ac7c047a4feee29c542cd8b173e4d90d1082ae8 GIT binary patch literal 2857 zcmV+^3)b|BP)pF8FWQhbW?9;ba!ELWdL_~cP?peYja~^aAhuUa%Y?FJQ@H13bsi^ zK~!jg&6;_16jc_+f3LdJTS&|TiD3tUNB|Y1BSBP@&4DNb7##;0#iJrRj(XIi<31yZ z%i)jl%;-3R&VfNuP$w(`E|_r?;=&?si4jl{0_p5qC*56j=Tvn<(n)vH5mY`-om9Vf z>%H&Z``*3xRiQXeki0w_H7r4K9f%WAdl9)^y|wloenM!+ZKPTrL~$3849bII3>xcX z%ADbx=nBZBOVbgHd zdtjR6f*RC~)OC`b9mk2LfSC40-5Z1G08)lr%?am$sh2H8aMvPUO zHo{Q4%h1_>^Qn)%+am@3QF8xE;Hq{_28wM)P=9jat>2in&V=p;^gA_b)8ZucKyyst zsV2>F_v!}*Cn%+^73dpAPV{jLMlAU-W(A(!cYmhR*`lCRX6;wp!1*s8A+aJ5oxDrq zwYvsUlkOmUbGe+gp(3gd0_@FBkPj|Tqbc4VNp8i0VT-O&$`Xxj58O0#QE|{|8!tj} zN5O-c@w|HXAb!Y<9>zbNlx&eVv1@Xws!ferJxJl~KC*RwMg;O=;;C53cT&K_Ed&32 zNF+ENsOs3~_t@CHpda3W&Q1dx8WfFBYWr+}mC5vtt@mRN!C#F2G=JGXqa%ppZ z&Z}iy`S3nQ7gQ@-P@-;1@4yIEg;JD7YHz_wNyvMsYDO0+9#2-s0 z9;bq_ONi1DS1pF88Y8|Ef{d-dhhzm8AiNg;IP1Gqpz@3nTa+-#;%e zpDDTbT7_C>WuO-HE2?Mu(+5p!SkPVN1bY`@#Fus)tH)N3RZif|_-C0$1cWV_g~K0x zUW65`zmtwMG3EI~7J0+oNb||t@egaVzLpd$LDxx6F12T_zZ@;9! zt-tE}YqWrkSxdho(HlTd*he6B4`{)k%*u5AjCruwB?de9HSxgcHTYY*!Z&R^@x7|5 z=Vb2M$<$>BNvx>H-{*Y-DLcUkc55;J(2H1Jb2@)2gzsuP%LU_-*gNv~+_KHaXWkCp z(7PNvRf0?wgKN^8aYprk3Ey?hSH8Z>?X{W8pHc`mUrZKjuqBZCOc^Enzb7*@lL7It zYbJPF$|FOd;M~Q`xba?Q&6>s5jsM}R-$1e>Hg701(7_z%YU{1YQFIEMMiZMwE3fmX z{MS?``ugo)vDGC)qhBWw2!+lA4Y2ixHn~sA;QjCF%<~p?b#>GRVAFx<-c;rlY`H>x z2g~c((q$s6$s+~WEaFeJ2JFFF{`62ilP6E6s;ccESJTu&2sqLJOP4OCprC-l!a}R< zJ2MIEG4GB3$#x2xXYyNTK3{h7fcH`o;_SSkbWZCoVzgSNy@e*EVi7AYEw(s7@MSqq6;*IsZ#Ob95j8L#v`9rp1Xl+o zt}<;B-^ANe8E`Eqr}m3!BQIE(i@~(owK*FboYp9^=#D z<%8hVpr0G+0wLKceZa1T$TkMS+XQ_*p|+sG8?{Z$6UG06I?wcS@C`7_tYmbsKCD;@ zc-HYjV=nU*Ru;DD!rewW*V}uOWk5z|J!kuRVTPrg>Sowk4k=EkG$7dtcb?Kl)Kp~Y zUKp1QzfKReRr=wD?_vG`aJM^k`VCln2!^|$vIz+qNOm7I(@VNeu}zp4P?q7GdIk5W zA*^k&+=}>(Y;5sFJ1foLY81=Ob{piZF2QTF^LAq+7&=qko7mcJ&d%`7;3||^TJxZL14~wAaf8yx|J~;t@t#LUql14U z%xHaHK|R9U0kv#WJGwv9?w?3f016=R=HIpSs15So|lHv)Qhi` z@TzNL_y_f`mYCr3-AkE z0z4-$#}Moyhyfq#29vrYe;^0*v(~mq Self { + Self::General + } +} + +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct Query { + #[serde(rename = "q")] + pub query: String, + #[serde(rename = "k")] + pub kind: Kind, + #[serde(rename = "p")] + pub page: usize, +} + +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct Result { + pub url: String, + pub title: String, + pub general: Option, + pub forum: Option, + pub image: Option, +} + +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct GeneralResult { + pub snippet: String, +} +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct ForumResult { + pub poster_image: Option, + pub poster_username: String, + pub poster_url: Option, + pub tags: Option>, +} +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct ImageResult {} diff --git a/src/main.rs b/src/main.rs index 21b785b..6510037 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,26 +12,28 @@ extern crate lru; mod crawler; mod page; mod ranking; +mod scrapers; mod web; -use std::{ - fs, io, - num::{NonZeroU16, NonZeroUsize}, - sync::Arc, -}; +use std::{fs, num::NonZeroUsize, sync::Arc, time::Instant}; -use axum::{routing::get, Router}; -use crawler::Crawler; +use axum::{ + http::{HeaderMap, HeaderValue}, + routing::get, + Router, +}; use log::LevelFilter; use lru::LruCache; use page::Page; +use reqwest::Client; use scraper::Selector; +use sled::Db; use tantivy::{ doc, query::QueryParser, schema::{Field, Schema, FAST, STORED, TEXT}, store::{Compressor, ZstdCompressor}, - Index, IndexReader, IndexSettings, Searcher, + Index, IndexReader, IndexSettings, }; use tokio::{ net::TcpListener, @@ -44,6 +46,8 @@ pub struct AppState { count_cache: Arc>>, reader: IndexReader, query_parser: QueryParser, + client: Client, + db: Db, url: Field, title: Field, @@ -52,6 +56,42 @@ pub struct AppState { #[tokio::main(worker_threads = 12)] async fn main() { + let mut headers = HeaderMap::new(); + for (key, val) in [ + ( + "User-Agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0", + ), + ( + "Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + ), + ("Accept-Language", "en-US,en;q=0.5"), + ("Accept-Encoding", "gzip"), + ("DNT", "1"), + ("Connection", "keep-alive"), + ("Upgrade-Insecure-Requests", "1"), + ("Sec-Fetch-Dest", "document"), + ("Sec-Fetch-Mode", "navigate"), + ("Sec-Fetch-Site", "none"), + ("Sec-Fetch-User", "?1"), + ("Priority", "u=1"), + ("TE", "trailers"), + ] { + headers.append(key, HeaderValue::from_str(val).unwrap()); + } + let client = reqwest::Client::builder() + .default_headers(headers) + .build() + .unwrap(); + + let st = Instant::now(); + //let res = scrapers::stackexchange::StackExchange::search(client.clone(), Query { query: String::from("rust"), page: 2 }).await; + //println!("{res:?}"); + println!("{:?}", st.elapsed()); + + //println!("{res:?}"); + env_logger::builder() .filter_level(LevelFilter::Info) .parse_default_env() @@ -111,13 +151,13 @@ async fn main() { } }); - info!("initializing crawler"); - let cr = Crawler::new(tx).await; + //info!("initializing crawler"); + //let cr = Crawler::new(tx).await; - info!("starting crawler"); - tokio::spawn(async move { - cr.run().await.unwrap(); - }); + //info!("starting crawler"); + //tokio::spawn(async move { + // cr.run().await.unwrap(); + //}); let query_parser = QueryParser::for_index(&index, vec![title, body]); //let searcher = index.reader().unwrap().searcher(); @@ -144,15 +184,20 @@ async fn main() { let count_cache = Arc::new(Mutex::new(LruCache::new(NonZeroUsize::new(500).unwrap()))); + let db = sled::open("searched-db").unwrap(); + info!("initializing web"); let r = Router::new() .route("/", get(web::search)) .route("/search", get(web::results)) + .route("/assets/dragynfruit.png", get(web::dragynfruit_logo)) .with_state(AppState { //index, count_cache, reader, query_parser, + client, + db, url, title, diff --git a/src/scrapers/bandcamp.rs b/src/scrapers/bandcamp.rs new file mode 100644 index 0000000..0a2f233 --- /dev/null +++ b/src/scrapers/bandcamp.rs @@ -0,0 +1,23 @@ +use scraper::{Html, Selector}; +use url::Url; + +use super::Scraper; + +pub struct Bandcamp; +impl Scraper for Bandcamp { + const HEADERS: &'static [&'static str] = &[]; + + async fn search(state: crate::AppState, query: searched::Query) -> Vec { + let res = state.client.get(Url::parse_with_params("https://bandcamp.com/search", &[ + ("q", query.query.clone()), + ("page", query.page.to_string()) + ]).unwrap()).send().await.unwrap().text().await.unwrap(); + + let html = Html::parse_document(&res); + for result in html.select(&Selector::parse("li[class=searchresult]").unwrap()) { + println!("{}", result.html()); + } + + vec![] + } +} diff --git a/src/scrapers/duckduckgo.rs b/src/scrapers/duckduckgo.rs new file mode 100644 index 0000000..b284c34 --- /dev/null +++ b/src/scrapers/duckduckgo.rs @@ -0,0 +1,86 @@ +use std::collections::HashMap; + +use scraper::{Html, Selector}; + +use super::Scraper; + +pub struct Duckduckgo; +impl Scraper for Duckduckgo { + const HEADERS: &'static [&'static str] = &[]; + async fn search(state: crate::AppState, query: searched::Query) -> Vec { + let mut results = Vec::new(); + + let mut req = state + .client + .post("https://lite.duckduckgo.com/lite/") + .header("Content-Type", "application/x-www-form-urlencoded"); + + let vqds = state.db.open_tree("duckduckgo-vqds").unwrap(); + + let mut form: HashMap<&str, String> = HashMap::new(); + form.insert("q", query.query.clone()); + + let mut offset = 0; + let mut cookies = String::new(); + + if query.page > 1 { + if query.page == 2 { + offset = (query.page - 1) * 20; + } else if query.page > 2 { + offset = 20 + (query.page - 2) * 50; + } + form.insert("s", offset.to_string()); + form.insert("nextParams", String::from("")); + form.insert("v", String::from("l")); + form.insert("o", String::from("json")); + form.insert("dc", (offset + 1).to_string()); + form.insert("api", String::from("d.js")); + req = req.header("Referer", String::from("https://lite.duckduckgo.com/")); + form.insert( + "vqd", + String::from_utf8(vqds.get(&query.query).unwrap().unwrap().to_vec()).unwrap(), + ); + form.insert("kl", String::from("wt-wt")); + cookies.push_str("kl=wt-wt"); + } + + let res = req.form(&form).send().await.unwrap().text().await.unwrap(); + + let html = Html::parse_document(&res); + + if !vqds.contains_key(&query.query).unwrap() { + let vqd = html + .select(&Selector::parse("input[name=vqd]").unwrap()) + .next() + .unwrap() + .attr("value") + .unwrap(); + vqds.insert(query.query.as_str(), vqd).unwrap(); + } + + let link_sel = Selector::parse("a.result-link").unwrap(); + let link_sels = html.select(&link_sel); + + let snippet_sel = Selector::parse("td.result-snippet").unwrap(); + let snippet_sels = html.select(&snippet_sel); + + let result_sels = link_sels.zip(snippet_sels); + + for (link, snippet) in result_sels { + let url = link.attr("href").unwrap().to_string(); + let title = link.inner_html(); + let body_preview = snippet.inner_html(); + + results.push(searched::Result { + url, + title, + general: Some(searched::GeneralResult { + snippet: body_preview, + }), + ..Default::default() + }); + } + + results + } +} diff --git a/src/scrapers/google.rs b/src/scrapers/google.rs new file mode 100644 index 0000000..9a75d03 --- /dev/null +++ b/src/scrapers/google.rs @@ -0,0 +1,37 @@ +use reqwest::header::{self, HeaderMap, HeaderValue}; +use url::Url; + +pub struct Google; +impl Google { + pub async fn get() { + let mut headers = HeaderMap::new(); + for (key, val) in [ + ("User-Agent", ""), + ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"), + ("Accept-Language", "en-US,en;q=0.5"), + ("Accept-Encoding", "gzip"), + ("DNT", "1"), + ("Connection", "keep-alive"), + ("Upgrade-Insecure-Requests", "1"), + ("Sec-Fetch-Dest", "document"), + ("Sec-Fetch-Mode", "navigate"), + ("Sec-Fetch-Site", "none"), + ("Sec-Fetch-User", "?1"), + ("Priority", "u=1"), + ("TE", "trailers"), + ] { + headers.append(key, HeaderValue::from_str(val).unwrap()); + } + let client = reqwest::Client::builder() + .default_headers(headers) + .build() + .unwrap(); + + let res = client + .get(Url::parse_with_params("https://google.com/search", [("q", "rust lang")]).unwrap()) + .send() + .await + .unwrap(); + //println!("{}", res.text().await.unwrap()); + } +} diff --git a/src/scrapers/mod.rs b/src/scrapers/mod.rs new file mode 100644 index 0000000..373c67b --- /dev/null +++ b/src/scrapers/mod.rs @@ -0,0 +1,26 @@ +//pub mod google; +pub mod duckduckgo; +pub mod stackexchange; +pub mod bandcamp; + +use duckduckgo::Duckduckgo; +use searched::Query; +use stackexchange::StackExchange; +use bandcamp::Bandcamp; + +use crate::AppState; + +pub trait Scraper: Sized { + const HEADERS: &'static [&'static str]; + + async fn search(state: AppState, query: Query) -> Vec; +} + +pub async fn search(scraper: &str, state: AppState, query: Query) -> Vec { + match scraper { + "duckduckgo" => Duckduckgo::search(state, query).await, + "stackexchange" => StackExchange::search(state, query).await, + "bandcamp" => Bandcamp::search(state, query).await, + &_ => unimplemented!(), + } +} diff --git a/src/scrapers/stackexchange.rs b/src/scrapers/stackexchange.rs new file mode 100644 index 0000000..61eba90 --- /dev/null +++ b/src/scrapers/stackexchange.rs @@ -0,0 +1,54 @@ +use url::Url; + +use super::Scraper; + +#[derive(Deserialize)] +struct Res { + items: Vec, +} + +#[derive(Deserialize)] +struct ResItem { + link: String, + title: String, + tags: Vec, +} + +pub struct StackExchange; +impl Scraper for StackExchange { + const HEADERS: &'static [&'static str] = &[]; + + async fn search(state: crate::AppState, query: searched::Query) -> Vec { + let data: Res = state + .client + .get( + Url::parse_with_params( + "https://api.stackexchange.com/2.3/search/advanced", + &[ + ("q", query.query), + ("page", query.page.to_string()), + ("site", "stackoverflow".to_string()), + ], + ) + .unwrap(), + ) + .send() + .await + .unwrap() + .json() + .await + .unwrap(); + + data.items + .into_iter() + .map(|item| searched::Result { + title: item.title, + url: item.link, + general: Some(searched::GeneralResult { + snippet: item.tags.join(" "), + }), + ..Default::default() + }) + .collect() + } +} diff --git a/src/web.rs b/src/web.rs index 9a9b521..a022881 100644 --- a/src/web.rs +++ b/src/web.rs @@ -6,22 +6,21 @@ use std::{ }; use axum::{ - extract::{Query, State}, - response::{Html, IntoResponse, Redirect}, + body::Body, extract::{Query, State}, http::header, response::{Html, IntoResponse, Redirect, Response} }; use once_cell::sync::Lazy; use tantivy::{ collector::{Count, TopDocs}, schema::Value, - DocAddress, Score, Searcher, TantivyDocument, + DocAddress, Score, TantivyDocument, }; use tera::{Context, Tera}; use tokio::{sync::Mutex, time::Instant}; -use crate::AppState; +use crate::{scrapers::{self, duckduckgo::Duckduckgo, Scraper}, AppState}; pub static TEMPLATES: Lazy>> = Lazy::new(|| { - let mut tera = match Tera::new("views/**/*") { + let tera = match Tera::new("views/**/*") { Ok(t) => t, Err(e) => { println!("Parsing error(s): {}", e); @@ -46,9 +45,11 @@ pub async fn search(Query(params): Query) -> impl IntoResponse { .into_response() } -#[derive(Deserialize)] +#[derive(Deserialize, Default)] pub struct SearchParams { q: Option, + k: Option, + s: Option, p: Option, } @@ -63,7 +64,7 @@ pub struct SearchResult { pub struct SearchResults { query: String, count: usize, - results: Vec, + results: Vec, parse_time: f32, search_time: f32, gather_time: f32, @@ -77,7 +78,7 @@ pub async fn results( (*TEMPLATES.lock().await).full_reload().unwrap(); let mut results: Vec = Vec::new(); - let reader = st.reader; + let reader = st.reader.clone(); reader.reload().unwrap(); let searcher = reader.searcher(); @@ -117,6 +118,10 @@ pub async fn results( } let gather_time = gather_st.elapsed().as_secs_f32() * 1_000.0; + let search_st = Instant::now(); + let results = scrapers::search(params.s.unwrap().as_str(), st.clone(), searched::Query { query: q.clone(), kind: params.k.unwrap_or_default(), page: params.p.unwrap_or(1) }).await; + let search_time = search_st.elapsed().as_secs_f32() * 1_000.0; + return Html( (*TEMPLATES.lock().await) .render( @@ -138,3 +143,7 @@ pub async fn results( return Redirect::to("/").into_response(); } } + +pub async fn dragynfruit_logo() -> impl IntoResponse { + Response::builder().header(header::CONTENT_TYPE, "image/png").body(Body::from(include_bytes!("../assets/dragynfruit.png").to_vec())).unwrap() +} diff --git a/views/base.html b/views/base.html index 4a5472c..a725aac 100644 --- a/views/base.html +++ b/views/base.html @@ -1,47 +1,65 @@ + {% block title %}Searched{% endblock title %} diff --git a/views/index.html b/views/index.html index 0cb9c8f..c22e5b8 100644 --- a/views/index.html +++ b/views/index.html @@ -6,9 +6,10 @@

Searched

A search engine

- + + +