diff --git a/Cargo.lock b/Cargo.lock index a5161b272..8cb3964f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -567,6 +567,15 @@ dependencies = [ "tokio", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -592,6 +601,33 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "aws-lc-rs" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdd82dba44d209fddb11c190e0a94b78651f95299598e472215667417a03ff1d" +dependencies = [ + "aws-lc-sys", + "mirai-annotations", + "paste", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df7a4168111d7eb622a31b214057b8509c0a7e1794f44c546d742330dc793972" +dependencies = [ + "bindgen", + "cc", + "cmake", + "dunce", + "fs_extra", + "libc", + "paste", +] + [[package]] name = "axum" version = "0.7.9" @@ -698,6 +734,12 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + [[package]] name = "benches" version = "0.0.0" @@ -715,6 +757,29 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags 2.6.0", + "cexpr", + "clang-sys", + "itertools 0.10.5", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 2.0.90", + "which 4.4.2", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -741,6 +806,9 @@ name = "bitflags" version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +dependencies = [ + "serde", +] [[package]] name = "block-buffer" @@ -895,6 +963,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom 7.1.3", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -977,6 +1054,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.23" @@ -1017,6 +1105,15 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" 
+dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.3" @@ -1097,6 +1194,12 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + [[package]] name = "const_fn" version = "0.4.10" @@ -1152,8 +1255,8 @@ checksum = "03a5d7b21829bc7b4bf4754a978a241ae54ea55a40f92bb20216e54096f4b951" dependencies = [ "aes-gcm", "base64 0.13.1", - "hkdf", - "hmac", + "hkdf 0.10.0", + "hmac 0.10.1", "percent-encoding", "rand 0.8.5", "sha2 0.9.9", @@ -1231,6 +1334,21 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcb25d077389e53838a8158c8e99174c5a9d902dee4904320db714f3c653ffba" +[[package]] +name = "crc" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.4.2" @@ -1475,6 +1593,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "der" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f55bf8e7b65898637379c1b74eb1551107c8294ed26d855ceb9fd1a09cfc9bc0" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + [[package]] name = "deranged" version = "0.3.11" @@ -1544,7 +1673,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer 0.10.4", + "const-oid", "crypto-common", + "subtle", ] [[package]] @@ -1600,6 +1731,12 @@ version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" +[[package]] +name = "dotenvy" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + [[package]] name = "dtoa" version = "1.0.9" @@ -1632,6 +1769,9 @@ name = "either" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +dependencies = [ + "serde", +] [[package]] name = "encoding_rs" @@ -1693,6 +1833,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + [[package]] name = "event-listener" version = "2.5.3" @@ -1838,6 +1989,17 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "flume" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" +dependencies = [ + "futures-core", + "futures-sink", + "spin 0.9.8", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1874,6 +2036,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futf" version = "0.1.5" @@ -1926,6 +2094,17 @@ dependencies = [ "futures-util", ] +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot", +] + [[package]] name = "futures-io" version = "0.3.31" @@ -2066,6 +2245,12 @@ version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "gloo-timers" version = "0.3.0" @@ -2137,6 +2322,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "hashbrown" @@ -2150,6 +2339,15 @@ dependencies = [ "serde", ] +[[package]] +name = "hashlink" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "hdrhistogram" version = "7.5.4" @@ -2269,7 +2467,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51ab2f639c231793c5f6114bdb9bbe50a7dbbfcd7c7c6bd8475dec2d991e964f" dependencies = [ "digest 0.9.0", - "hmac", + "hmac 0.10.1", +] + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac 0.12.1", ] [[package]] @@ -2282,6 +2489,15 @@ dependencies = [ "digest 0.9.0", ] +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest 0.10.7", +] + [[package]] name = "home" version = "0.5.9" @@ -2923,6 +3139,9 @@ name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin 0.9.8", +] [[package]] name = "lazycell" @@ -2949,6 +3168,16 @@ version = "0.2.168" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aaeb2981e0606ca11d79718f8bb01164f1d6ed75080182d3abf017e6d244b6d" +[[package]] +name = "libloading" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" +dependencies = [ + "cfg-if", + "windows-targets 0.48.5", +] + [[package]] name = "libm" version = "0.2.11" @@ -2965,6 +3194,17 @@ dependencies = [ "libc", ] +[[package]] +name = "libsqlite3-sys" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -3162,6 +3402,16 @@ dependencies = [ 
"syn 2.0.90", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest 0.10.7", +] + [[package]] name = "memchr" version = "2.7.4" @@ -3242,6 +3492,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "mirai-annotations" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" + [[package]] name = "moka" version = "0.12.8" @@ -3382,6 +3638,23 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + [[package]] name = "num-complex" version = "0.4.6" @@ -3406,6 +3679,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-rational" version = "0.4.2" @@ -3595,6 +3879,15 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -3794,6 +4087,27 @@ dependencies = [ "futures-io", ] +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.31" @@ -3891,6 +4205,16 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "prettyplease" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" +dependencies = [ + "proc-macro2", + "syn 2.0.90", +] + [[package]] name = "proc-macro-crate" version = "1.3.1" @@ -4440,6 +4764,26 @@ dependencies = [ "serde", ] +[[package]] +name = "rsa" +version = "0.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47c75d7c5c6b673e58bf54d8544a9f432e3a925b0e80f7cd3602ab5c50c55519" +dependencies = [ + "const-oid", + "digest 0.10.7", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core 0.6.4", + "signature", + "spki", + "subtle", + "zeroize", +] + [[package]] name = "rustc-demangle" version = "0.1.24" @@ -4536,6 +4880,7 @@ version = 
"0.23.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" dependencies = [ + "aws-lc-rs", "log", "once_cell", "ring 0.17.8", @@ -4581,6 +4926,7 @@ version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ + "aws-lc-rs", "ring 0.17.8", "rustls-pki-types", "untrusted 0.9.0", @@ -4937,6 +5283,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest 0.10.7", + "rand_core 0.6.4", +] + [[package]] name = "simba" version = "0.8.1" @@ -5008,7 +5364,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.21.33" +version = "2.22.2" dependencies = [ "ahash", "aho-corasick", @@ -5054,6 +5410,7 @@ dependencies = [ "sitemap", "smallvec", "spider_chrome", + "sqlx", "statrs", "string-interner", "string_concat", @@ -5071,7 +5428,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.21.33" +version = "2.22.2" dependencies = [ "adblock", "aho-corasick", @@ -5104,7 +5461,7 @@ dependencies = [ "tracing", "tracing-subscriber", "url", - "which", + "which 6.0.3", "winreg 0.52.0", ] @@ -5161,7 +5518,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.21.33" +version = "2.22.2" dependencies = [ "clap", "env_logger", @@ -5186,7 +5543,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.21.33" +version = "2.22.2" dependencies = [ "aho-corasick", "fast_html2md", @@ -5208,7 +5565,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.21.33" +version = "2.22.2" dependencies = [ "indexmap 1.9.3", "serde", @@ -5220,7 +5577,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.21.33" +version = "2.22.2" dependencies = [ "env_logger", "lazy_static", @@ -5239,6 +5596,227 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlformat" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bba3a93db0cc4f7bdece8bb09e77e2e785c20bfebf79eb8340ed80708048790" +dependencies = [ + "nom 7.1.3", + "unicode_categories", +] + +[[package]] +name = "sqlx" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93334716a037193fac19df402f8571269c84a00852f6a7066b5d2616dcd64d3e" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4d8060b456358185f7d50c55d9b5066ad956956fddec42ee2e8567134a8936e" +dependencies = [ + "atoi", + "byteorder", + "bytes", + "crc", + "crossbeam-queue", + "either", + "event-listener 5.3.1", + "futures-channel", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashbrown 0.14.5", + "hashlink", + "hex", + 
"indexmap 2.7.0", + "log", + "memchr", + "native-tls", + "once_cell", + "paste", + "percent-encoding", + "rustls 0.23.20", + "rustls-pemfile", + "serde", + "serde_json", + "sha2 0.10.8", + "smallvec", + "sqlformat", + "thiserror 1.0.69", + "tokio", + "tokio-stream", + "tracing", + "url", + "webpki-roots 0.26.7", +] + +[[package]] +name = "sqlx-macros" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cac0692bcc9de3b073e8d747391827297e075c7710ff6276d9f7a1f3d58c6657" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 2.0.90", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1804e8a7c7865599c9c79be146dc8a9fd8cc86935fa641d3ea58e5f0688abaa5" +dependencies = [ + "dotenvy", + "either", + "heck 0.5.0", + "hex", + "once_cell", + "proc-macro2", + "quote", + "serde", + "serde_json", + "sha2 0.10.8", + "sqlx-core", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", + "syn 2.0.90", + "tempfile", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64bb4714269afa44aef2755150a0fc19d756fb580a67db8885608cf02f47d06a" +dependencies = [ + "atoi", + "base64 0.22.1", + "bitflags 2.6.0", + "byteorder", + "bytes", + "crc", + "digest 0.10.7", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf 0.12.4", + "hmac 0.12.1", + "itoa 1.0.14", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand 0.8.5", + "rsa", + "serde", + "sha1 0.10.6", + "sha2 0.10.8", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror 1.0.69", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fa91a732d854c5d7726349bb4bb879bb9478993ceb764247660aee25f67c2f8" +dependencies = [ + "atoi", + "base64 0.22.1", + "bitflags 2.6.0", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "hex", + "hkdf 0.12.4", + "hmac 0.12.1", + "home", + "itoa 1.0.14", + "log", + "md-5", + "memchr", + "once_cell", + "rand 0.8.5", + "serde", + "serde_json", + "sha2 0.10.8", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror 1.0.69", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5b2cf34a45953bfd3daaf3db0f7a7878ab9b7a6b91b422d24a7a9e4c857b680" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "serde_urlencoded", + "sqlx-core", + "tracing", + "url", +] [[package]] name = "ssri" @@ -5381,6 +5959,17 @@ version = "0.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3c3ee6129eec20fed59acf2e9cfb3ffd20d0bbe39fe334c22af0edc56dfe752" +[[package]] +name = "stringprep" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", + "unicode-properties", +] + [[package]] name = "strsim" version = "0.11.1" @@ -6157,12 +6746,33 @@ version = "2.8.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df" +[[package]] +name = "unicode-bidi" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" + [[package]] name = "unicode-ident" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-properties" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -6187,6 +6797,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "universal-hash" version = "0.4.0" @@ -6366,6 +6982,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + [[package]] name = "wasm-bindgen" version = "0.2.99" @@ -6494,6 +7116,18 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.42", +] + [[package]] name = "which" version = "6.0.3" @@ -6506,6 +7140,16 @@ dependencies = [ "winsafe", ] +[[package]] +name = "whoami" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "372d5b87f58ec45c384ba03563b03544dc5fadc3983e434b286913f5b4a9bb6d" +dependencies = [ + "redox_syscall", + "wasite", +] + [[package]] name = "wide" version = "0.7.30" diff --git a/examples/advanced_configuration.rs b/examples/advanced_configuration.rs index a42bdfe20..a16481fef 100644 --- a/examples/advanced_configuration.rs +++ b/examples/advanced_configuration.rs @@ -42,7 +42,7 @@ async fn main() -> Result<(), Error> { website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff --git a/examples/budget.rs b/examples/budget.rs index 3b12d99dd..eddf012d7 100644 --- a/examples/budget.rs +++ b/examples/budget.rs @@ -22,7 +22,7 @@ async fn main() -> Result<(), Error> { website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff 
--git a/examples/chrome.rs b/examples/chrome.rs index 2f1821f93..4891e721f 100644 --- a/examples/chrome.rs +++ b/examples/chrome.rs @@ -27,7 +27,7 @@ async fn crawl_website(url: &str) -> Result<()> { let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; println!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", diff --git a/examples/chrome_remote.rs b/examples/chrome_remote.rs index e5449c068..57355eb5c 100644 --- a/examples/chrome_remote.rs +++ b/examples/chrome_remote.rs @@ -42,7 +42,7 @@ async fn crawl_website(url: &str) -> Result<()> { let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; println!( "Time elapsed in website.crawl({}) is: {:?} for total pages: {:?}", diff --git a/examples/chrome_screenshot.rs b/examples/chrome_screenshot.rs index 167928691..f926303fa 100644 --- a/examples/chrome_screenshot.rs +++ b/examples/chrome_screenshot.rs @@ -39,7 +39,7 @@ async fn main() { website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; println!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", diff --git a/examples/chrome_screenshot_with_config.rs b/examples/chrome_screenshot_with_config.rs index 74e2c3aa0..52b955520 100644 --- a/examples/chrome_screenshot_with_config.rs +++ b/examples/chrome_screenshot_with_config.rs @@ -32,7 +32,7 @@ async fn main() { let start = crate::tokio::time::Instant::now(); website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; println!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", diff --git a/examples/chrome_viewport.rs b/examples/chrome_viewport.rs index 7ab9ab2a3..fee706727 100644 --- a/examples/chrome_viewport.rs +++ b/examples/chrome_viewport.rs @@ -23,7 +23,7 @@ async fn main() { website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff --git a/examples/chrome_web_automation.rs b/examples/chrome_web_automation.rs index ca527ba91..1ad92cb1a 100644 --- a/examples/chrome_web_automation.rs +++ b/examples/chrome_web_automation.rs @@ -48,7 +48,7 @@ async fn main() { website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff --git a/examples/configuration.rs b/examples/configuration.rs index 9479a350d..eb42c3b2b 100644 --- a/examples/configuration.rs +++ b/examples/configuration.rs @@ -25,7 +25,7 @@ async fn main() -> Result<(), Error> { website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff --git a/examples/css_scrape.rs b/examples/css_scrape.rs index 046f99873..3454a45ea 100644 --- a/examples/css_scrape.rs +++ b/examples/css_scrape.rs @@ -39,7 +39,7 @@ async fn main() { format!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", duration, - website.get_links().len() + website.get_size().await ) .as_bytes(), ) diff --git a/examples/depth.rs b/examples/depth.rs index 69c4ea654..3c8590b30 100644 --- a/examples/depth.rs +++ b/examples/depth.rs 
@@ -17,7 +17,7 @@ async fn main() -> Result<(), Error> { website.crawl().await; let duration: std::time::Duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff --git a/examples/download.rs b/examples/download.rs index 46c252aad..008a196d5 100644 --- a/examples/download.rs +++ b/examples/download.rs @@ -46,11 +46,8 @@ async fn main() { .open(&download_file) .expect("Unable to open file"); - match page.get_bytes() { - Some(b) => { - file.write_all(b).unwrap_or_default(); - } - _ => (), + if let Some(b) = page.get_bytes() { + file.write_all(b).unwrap_or_default(); } log("downloaded", download_file) diff --git a/examples/example.rs b/examples/example.rs index fdaceade8..c41669970 100644 --- a/examples/example.rs +++ b/examples/example.rs @@ -22,7 +22,7 @@ async fn main() { website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff --git a/examples/loop.rs b/examples/loop.rs index ef7494b34..155d1acae 100644 --- a/examples/loop.rs +++ b/examples/loop.rs @@ -55,7 +55,7 @@ async fn main() { format!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}\n", duration, - website.get_links().len() + website.get_size().await ) .as_bytes(), ) diff --git a/examples/openai.rs b/examples/openai.rs index 77e722281..86297f6d9 100644 --- a/examples/openai.rs +++ b/examples/openai.rs @@ -58,7 +58,7 @@ async fn main() { let start = crate::tokio::time::Instant::now(); website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; println!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", diff --git a/examples/openai_cache.rs b/examples/openai_cache.rs index e3ab36468..2d1042580 100644 --- a/examples/openai_cache.rs +++ b/examples/openai_cache.rs @@ -63,7 +63,7 @@ async fn main() { let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; println!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", diff --git a/examples/openai_extra.rs b/examples/openai_extra.rs index 7fb72bce8..01b6f24df 100644 --- a/examples/openai_extra.rs +++ b/examples/openai_extra.rs @@ -44,7 +44,7 @@ async fn main() { let start = crate::tokio::time::Instant::now(); website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; println!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", diff --git a/examples/openai_multi.rs b/examples/openai_multi.rs index fb24d3d05..1f6299d2f 100644 --- a/examples/openai_multi.rs +++ b/examples/openai_multi.rs @@ -45,7 +45,7 @@ async fn main() { let start = crate::tokio::time::Instant::now(); website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; println!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", diff --git a/examples/queue.rs b/examples/queue.rs index ba07034e9..0ea987664 100644 --- a/examples/queue.rs +++ b/examples/queue.rs @@ -43,6 +43,6 @@ async fn main() { println!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", duration, - website.get_links().len() + website.get_size().await ) } diff --git a/examples/real_world.rs 
b/examples/real_world.rs index 5b8acdb8e..fcb050d9a 100644 --- a/examples/real_world.rs +++ b/examples/real_world.rs @@ -46,7 +46,7 @@ async fn crawl_website(url: &str) -> Result<()> { async move { website.crawl().await; website.unsubscribe(); - website.get_links() + website.get_all_links_visited().await }, async move { while let Ok(page) = rx2.recv().await { diff --git a/examples/serde.rs b/examples/serde.rs index 6afe47d40..64deeb83e 100644 --- a/examples/serde.rs +++ b/examples/serde.rs @@ -12,7 +12,7 @@ async fn main() { website.crawl().await; - let links = website.get_links(); + let links = website.get_all_links_visited().await; let mut s = flexbuffers::FlexbufferSerializer::new(); diff --git a/examples/sitemap.rs b/examples/sitemap.rs index 267ad3052..47301459c 100644 --- a/examples/sitemap.rs +++ b/examples/sitemap.rs @@ -27,7 +27,7 @@ async fn main() { let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff --git a/examples/smart.rs b/examples/smart.rs index bb606f897..f12500f83 100644 --- a/examples/smart.rs +++ b/examples/smart.rs @@ -17,5 +17,5 @@ async fn main() { website.crawl_smart().await; - println!("Links found {:?}", website.get_links().len()); + println!("Links found {:?}", website.get_size().await); } diff --git a/examples/subscribe.rs b/examples/subscribe.rs index 28e248b84..2bedde6ca 100644 --- a/examples/subscribe.rs +++ b/examples/subscribe.rs @@ -32,7 +32,7 @@ async fn main() { format!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", duration, - website.get_links().len() + website.get_size().await ) .as_bytes(), ) diff --git a/examples/subscribe_download.rs b/examples/subscribe_download.rs index a9d8c1102..7abe1e284 100644 --- a/examples/subscribe_download.rs +++ b/examples/subscribe_download.rs @@ -48,11 +48,8 @@ async fn main() { .await .expect("Unable to open file"); - match page.get_bytes() { - Some(b) => { - file.write_all(b).await.unwrap_or_default(); - } - _ => (), + if let Some(b) = page.get_bytes() { + file.write_all(b).await.unwrap_or_default(); } log("downloaded", download_file) diff --git a/examples/transform_markdown.rs b/examples/transform_markdown.rs index fc8c5a279..f71c5d713 100644 --- a/examples/transform_markdown.rs +++ b/examples/transform_markdown.rs @@ -40,7 +40,7 @@ async fn main() { format!( "Time elapsed in website.crawl() is: {:?} for total pages: {:?}", duration, - website.get_links().len() + website.get_size().await ) .as_bytes(), ) diff --git a/examples/url_glob_subdomains.rs b/examples/url_glob_subdomains.rs index 6eb5a4525..20df4d348 100644 --- a/examples/url_glob_subdomains.rs +++ b/examples/url_glob_subdomains.rs @@ -16,7 +16,7 @@ async fn main() { website.crawl().await; let duration = start.elapsed(); - let links = website.get_links(); + let links = website.get_all_links_visited().await; for link in links.iter() { println!("- {:?}", link.as_ref()); diff --git a/spider/Cargo.toml b/spider/Cargo.toml index fed6fa89c..9de29f1a9 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.21.33" +version = "2.22.2" authors = [ "j-mendez " ] @@ -70,6 +70,7 @@ statrs = { version = "0.17", optional = true } aho-corasick = { version = "1", optional = true } tracing = { version = "0.1", default-features = false, features = ["std"], optional = true } sysinfo = { version = "0.33", default-features = false, features = ["system"], optional = true } 
+sqlx = { version = "0.8", features = [ "runtime-tokio", "sqlite" ], optional = true }

 [dependencies.spider_chrome]
 version = "2"
@@ -113,7 +114,10 @@ reqwest = { version = "0.12", features = [
 ] }

 [features]
-default = ["sync", "reqwest_native_tls_native_roots", "cookies", "ua_generator", "encoding", "string_interner_buffer_backend", "balance"]
+default = ["sync", "reqwest_native_tls_native_roots", "disk_native_tls", "cookies", "ua_generator", "encoding", "string_interner_buffer_backend", "balance"]
+disk = ["dep:sqlx"]
+disk_native_tls = ["disk", "sqlx/runtime-tokio-native-tls"]
+disk_aws = ["disk", "sqlx/tls-rustls-aws-lc-rs"]
 adblock = ["chrome", "spider_chrome/adblock"]
 balance = ["dep:sysinfo"]
 regex = []
diff --git a/spider/README.md b/spider/README.md
index 268cd30c3..0ecfcd138 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -97,7 +97,8 @@ spider = { version = "2", features = ["regex", "ua_generator"] }
 ```
 1. `ua_generator`: Enables auto generating a random real User-Agent.
-1. `regex`: Enables blacklisting paths with regx
+1. `regex`: Enables blacklisting and whitelisting paths with regex.
+1. `disk`: Enables SQLite hybrid disk storage to balance memory usage.
 1. `jemalloc`: Enables the [jemalloc](https://github.com/jemalloc/jemalloc) memory backend.
 1. `decentralized`: Enables decentralized processing of IO, requires the [spider_worker](../spider_worker/README.md) startup before crawls.
 1. `sync`: Subscribe to changes for Page data processing async. [Enabled by default]
@@ -132,6 +133,9 @@ spider = { version = "2", features = ["regex", "ua_generator"] }
 1. `headers`: Enables the extraction of header information on each retrieved page. Adds a `headers` field to the page struct.
 1. `decentralized_headers`: Enables the extraction of suppressed header information of the decentralized processing of IO. This is needed if `headers` is set in both [spider](../spider/README.md) and [spider_worker](../spider_worker/README.md).
+1. `string_interner_buffer_backend`: Enables the String interning using the buffer backend [default].
+1. `string_interner_string_backend`: Enables the String interning using the string backend.
+1. `string_interner_bucket_backend`: Enables the String interning using the bucket backend.

 ### Decentralization
diff --git a/spider/src/features/disk.rs b/spider/src/features/disk.rs
new file mode 100644
index 000000000..f95e6e15d
--- /dev/null
+++ b/spider/src/features/disk.rs
@@ -0,0 +1,376 @@
+#[cfg(feature = "disk")]
+use case_insensitive_string::CaseInsensitiveString;
+#[cfg(feature = "disk")]
+use hashbrown::HashSet;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+#[cfg(feature = "disk")]
+use crate::utils::emit_log;
+#[cfg(feature = "disk")]
+use sqlx::{sqlite::SqlitePool, Sqlite, Transaction};
+
+#[derive(Default, Debug, Clone)]
+#[cfg(feature = "disk")]
+/// Manage Sqlite database operations
+pub struct DatabaseHandler {
+    /// Persist after drop.
+    pub persist: bool,
+    /// The crawl ID.
+    pub crawl_id: Option<String>,
+    /// The connection pool.
+    pool: tokio::sync::OnceCell<SqlitePool>,
+    /// Initial seed ran.
+    pub seeded: bool,
+}
+
+#[derive(Default, Debug, Clone)]
+#[cfg(not(feature = "disk"))]
+/// Manage Sqlite database operations
+pub struct DatabaseHandler {
+    /// Persist after drop.
+    pub persist: bool,
+}
+
+#[cfg(not(feature = "disk"))]
+impl DatabaseHandler {
+    /// A new DB handler.
+    pub fn new(_crawl_id: &Option<String>) -> Self {
+        Default::default()
+    }
+    /// Delete the db by id.
+    pub fn delete_db_by_id(&mut self) {}
+}
+
+#[cfg(feature = "disk")]
+impl DatabaseHandler {
+    /// A new DB handler.
+    pub fn new(crawl_id: &Option<String>) -> Self {
+        Self {
+            persist: false,
+            pool: tokio::sync::OnceCell::const_new(),
+            crawl_id: match crawl_id {
+                Some(id) => Some(format!("{}_{}", id.replace(".", "_"), get_id())),
+                _ => None,
+            },
+            seeded: false,
+        }
+    }
+
+    /// Determine if the pool is initialized.
+    pub fn pool_inited(&self) -> bool {
+        self.pool.initialized()
+    }
+
+    /// Determine if a seed was already done.
+    pub fn ready(&self) -> bool {
+        self.seeded
+    }
+
+    /// Get or initialize the database pool
+    pub async fn get_db_pool(&self) -> &SqlitePool {
+        self.pool
+            .get_or_init(|| async {
+                let db_path = get_db_path(&self.crawl_id);
+                let direct = db_path.starts_with("sqlite://");
+
+                // not a shared sqlite db.
+                if direct {
+                    create_file_and_directory(&db_path[9..]).await;
+                } else {
+                    create_file_and_directory(&db_path).await;
+                }
+
+                let db_url = if direct {
+                    db_path
+                } else {
+                    format!("sqlite://{}", db_path)
+                };
+
+                let pool =
+                    SqlitePool::connect_lazy(&db_url).expect("Failed to connect to the database");
+
+                sqlx::query(
+                    r#"
+                    CREATE TABLE IF NOT EXISTS resources (
+                        id INTEGER PRIMARY KEY,
+                        url TEXT NOT NULL COLLATE NOCASE
+                    );
+                    CREATE INDEX IF NOT EXISTS idx_url ON resources (url COLLATE NOCASE);
+                    "#,
+                )
+                .execute(&pool)
+                .await
+                .expect("Failed to create table and index.");
+
+                pool
+            })
+            .await
+    }
+
+    /// Check if a URL exists (ignore case)
+    pub async fn url_exists(&self, pool: &SqlitePool, url_to_check: &str) -> bool {
+        match sqlx::query("SELECT 1 FROM resources WHERE url = ? LIMIT 1")
+            .bind(url_to_check)
+            .fetch_optional(pool)
+            .await
+        {
+            Ok(result) => result.is_some(),
+            Err(e) => {
+                if let Some(db_err) = e.as_database_error() {
+                    emit_log(db_err.message());
+                } else {
+                    emit_log(&format!("A non-database error occurred: {:?}", e));
+                }
+                false
+            }
+        }
+    }
+
+    /// Insert a new URL if it doesn't exist
+    pub async fn insert_url(&self, pool: &SqlitePool, new_url: &CaseInsensitiveString) {
+        if !self.url_exists(pool, new_url).await {
+            if let Err(e) = sqlx::query("INSERT INTO resources (url) VALUES (?)")
+                .bind(new_url.to_string())
+                .execute(pool)
+                .await
+            {
+                if let Some(db_err) = e.as_database_error() {
+                    emit_log(db_err.message());
+                } else {
+                    emit_log(&format!("A non-database error occurred: {:?}", e));
+                }
+            }
+        }
+    }
+
+    /// Seed the database and manage URLs
+    pub async fn seed(
+        &self,
+        pool: &SqlitePool,
+        mut urls: HashSet<CaseInsensitiveString>,
+    ) -> Result<HashSet<CaseInsensitiveString>, sqlx::Error> {
+        const CHUNK_SIZE: usize = 500;
+        const KEEP_COUNT: usize = 100;
+
+        let mut tx: Transaction<'_, Sqlite> = pool.begin().await?;
+        let mut keep_urls = HashSet::with_capacity(KEEP_COUNT);
+
+        for url in urls.iter().take(KEEP_COUNT) {
+            keep_urls.insert(url.clone());
+        }
+
+        urls.retain(|url| !keep_urls.contains(url));
+
+        for chunk in keep_urls.iter().collect::<Vec<_>>().chunks(CHUNK_SIZE) {
+            let mut query = "INSERT OR IGNORE INTO resources (url) VALUES ".to_string();
+            query.push_str(&vec!["(?)"; chunk.len()].join(", "));
+            let mut statement = sqlx::query(&query);
+
+            for url in chunk {
+                statement = statement.bind(url.to_string());
+            }
+
+            statement.execute(&mut *tx).await?;
+        }
+
+        for chunk in urls.drain().collect::<Vec<_>>().chunks(CHUNK_SIZE) {
+            let mut query = "INSERT OR IGNORE INTO resources (url) VALUES ".to_string();
+            query.push_str(&vec!["(?)"; chunk.len()].join(", "));
+            let mut statement = sqlx::query(&query);
+
+            for url in chunk {
+                statement = statement.bind(url.to_string());
+            }
+
+            statement.execute(&mut *tx).await?;
+        }
+
+        tx.commit().await?;
+
+        Ok(keep_urls)
+    }
+
+    /// Count the records stored.
+    pub async fn count_records(pool: &SqlitePool) -> Result<u64, sqlx::Error> {
+        let result = sqlx::query_scalar::<_, u64>("SELECT COUNT(*) FROM resources")
+            .fetch_one(pool)
+            .await?;
+        Ok(result)
+    }
+
+    /// Get all the resources stored.
+    pub async fn get_all_resources(
+        pool: &SqlitePool,
+    ) -> Result<HashSet<CaseInsensitiveString>, sqlx::Error> {
+        use sqlx::Row;
+        let rows = sqlx::query("SELECT url FROM resources")
+            .fetch_all(pool) // Fetches all rows at once.
+            .await?;
+
+        let urls = rows
+            .into_iter()
+            .map(|row| row.get::<String, _>("url").into())
+            .collect();
+
+        Ok(urls)
+    }
+
+    /// Clear the DB by id.
+    pub fn delete_db_by_id(&self) {
+        let _ = std::fs::remove_file(get_db_path(&self.crawl_id));
+    }
+
+    /// Clear the resources table.
+    pub async fn clear_table(pool: &SqlitePool) -> Result<(), sqlx::Error> {
+        sqlx::query("DELETE FROM resources").execute(pool).await?;
+        Ok(())
+    }
+}
+
+#[cfg(feature = "disk")]
+impl Drop for DatabaseHandler {
+    fn drop(&mut self) {
+        if !self.persist {
+            self.delete_db_by_id();
+        }
+    }
+}
+
+/// Simple counter to get the next ID.
+#[cfg(feature = "disk")]
+fn get_id() -> usize {
+    static COUNTER: AtomicUsize = AtomicUsize::new(1);
+
+    let mut current = COUNTER.load(Ordering::Relaxed);
+    loop {
+        let next = if current == usize::MAX {
+            1
+        } else {
+            current + 1
+        };
+        match COUNTER.compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed) {
+            Ok(_) => return current,
+            Err(updated) => current = updated,
+        }
+    }
+}
+
+/// Get the db path.
+pub fn get_db_path(crawl_id: &Option<String>) -> String {
+    // Get the base database URL or default to a temporary directory
+    let base_url = std::env::var("SQLITE_DATABASE_URL").unwrap_or_else(|_| {
+        let temp_dir = std::env::temp_dir();
+        temp_dir.to_string_lossy().into_owned()
+    });
+
+    let delim = if base_url.starts_with("sqlite://memory:") {
+        ":"
+    } else {
+        "/"
+    };
+
+    // Determine the db_path
+    let db_path = match crawl_id {
+        Some(crawl_id) => {
+            format!(
+                "{}{delim}spider_{}.db",
+                base_url.trim_end_matches('/'),
+                crawl_id.replace(".", "_")
+            )
+        }
+        None => format!("{}{delim}spider.db", base_url.trim_end_matches('/')),
+    };
+
+    db_path
+}
+
+/// Create the file and its parent directory when stored locally.
+#[cfg(feature = "disk")]
+async fn create_file_and_directory(file_path: &str) {
+    let path = std::path::Path::new(file_path);
+
+    if let Some(parent) = path.parent() {
+        let _ = tokio::fs::create_dir_all(parent).await;
+    }
+
+    let _ = tokio::fs::File::create(path).await;
+}
+
+#[cfg(test)]
+#[cfg(feature = "disk")]
+mod tests {
+    use super::*;
+    use tokio;
+
+    #[tokio::test]
+    async fn test_connect_db() {
+        let handler = DatabaseHandler::new(&Some("example.com".into()));
+        let test_url = CaseInsensitiveString::new("http://example.com");
+        let pool = handler.get_db_pool().await;
+
+        if handler.url_exists(pool, &test_url).await {
+            println!("URL '{}' already exists in the database.", test_url);
+        } else {
+            handler.insert_url(pool, &test_url).await;
+            println!("URL '{}' was inserted into the database.", test_url);
+        }
+
+        assert!(
+            handler.url_exists(pool, &test_url).await,
+            "URL should exist after insertion."
+        );
+    }
+
+    #[tokio::test]
+    async fn test_url_insert_and_exists() {
+        let handler = DatabaseHandler::new(&Some("example.com".into()));
+        let new_url = CaseInsensitiveString::new("http://new-example.com");
+        let pool = handler.get_db_pool().await;
+
+        assert!(
+            !handler.url_exists(pool, &new_url).await,
+            "URL should not exist initially."
+        );
+
+        handler.insert_url(pool, &new_url).await;
+        assert!(
+            handler.url_exists(pool, &new_url).await,
+            "URL should exist after insertion."
+        );
+    }
+
+    #[tokio::test]
+    async fn test_url_case_insensitivity() {
+        let handler = DatabaseHandler::new(&Some("case-test.com".into()));
+        let url1 = CaseInsensitiveString::new("http://case-test.com");
+        let url2 = CaseInsensitiveString::new("http://CASE-TEST.com");
+        let pool = handler.get_db_pool().await;
+
+        handler.insert_url(pool, &url1).await;
+        assert!(
+            handler.url_exists(pool, &url2).await,
+            "URL check should be case-insensitive."
+        );
+    }
+
+    #[tokio::test]
+    async fn test_seed_urls() {
+        let handler = DatabaseHandler::new(&Some("example.com".into()));
+        let mut urls = HashSet::new();
+        urls.insert(CaseInsensitiveString::new("http://foo.com"));
+        urls.insert(CaseInsensitiveString::new("http://bar.com"));
+        let pool = handler.get_db_pool().await;
+
+        handler
+            .seed(pool, urls.clone())
+            .await
+            .expect("Seeding failed");
+
+        for url in urls {
+            assert!(
+                handler.url_exists(pool, &url).await,
+                "Seeded URL should exist after seeding."
+            );
+        }
+    }
+}
diff --git a/spider/src/features/mod.rs b/spider/src/features/mod.rs
index a95c9b493..3e39202d1 100644
--- a/spider/src/features/mod.rs
+++ b/spider/src/features/mod.rs
@@ -12,6 +12,8 @@ pub mod chrome_viewport;
 /// Decentralized header handling
 #[cfg(feature = "decentralized_headers")]
 pub mod decentralized_headers;
+/// Disk options
+pub mod disk;
 /// URL globbing
 #[cfg(feature = "glob")]
 pub mod glob;
@@ -22,3 +24,15 @@ pub mod openai;
 pub mod openai_common;
 /// Spoof the refereer
 pub mod spoof_referrer;
+
+lazy_static::lazy_static! {
+    /// The max links to store in memory.
+    pub(crate) static ref LINKS_VISITED_MEMORY_LIMIT: usize = {
+        const DEFAULT_LIMIT: usize = 15_000;
+
+        match std::env::var("LINKS_VISITED_MEMORY_LIMIT") {
+            Ok(limit) => limit.parse::<usize>().unwrap_or(DEFAULT_LIMIT),
+            _ => DEFAULT_LIMIT
+        }
+    };
+}
diff --git a/spider/src/page.rs b/spider/src/page.rs
index a28402400..477cf8da8 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -10,7 +10,7 @@ use crate::RelativeSelectors;
 use auto_encoder::auto_encode_bytes;
 use bytes::Bytes;
 use hashbrown::HashSet;
-use lol_html::{AsciiCompatibleEncoding, Settings};
+use lol_html::AsciiCompatibleEncoding;
 use regex::bytes::Regex;
 use reqwest::StatusCode;
 use tokio::time::Duration;
@@ -1318,7 +1318,7 @@ impl Page {
         let base_input_domain = &selectors.2; // the domain after redirects
         let sub_matcher = &selectors.0;

-        let rewriter_settings = Settings {
+        let rewriter_settings = lol_html::Settings {
             element_content_handlers: vec![lol_html::element!("a[href]", |el| {
                 if let Some(href) = el.get_attribute("href") {
                     push_link(
@@ -1413,7 +1413,7 @@ impl Page {
         let base_input_domain = &selectors.2; // the domain after redirects
         let sub_matcher = &selectors.0;

-        let rewriter_settings = Settings {
+        let rewriter_settings = lol_html::Settings {
             element_content_handlers: vec![
                 lol_html::element!("a[href]", |el| {
                     if let Some(href) = el.get_attribute("href") {
@@ -1661,7 +1661,7 @@ impl Page {

         let mut static_app = false;

-        let rewriter_settings = Settings {
+        let rewriter_settings = lol_html::Settings {
             element_content_handlers: vec![
                 element!("script", |element| {
                     if !static_app {
diff --git a/spider/src/utils/detect_cpu.rs b/spider/src/utils/detect_cpu.rs
deleted file mode 100644
index ff2340b1b..000000000
--- a/spider/src/utils/detect_cpu.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-use std::sync::atomic::{AtomicI8, Ordering};
-use sysinfo::System;
-use tokio::sync::OnceCell;
-use tokio::time::sleep;
-
-/// The CPU state for the crawl.
-///
-static CPU_STATE: AtomicI8 = AtomicI8::new(0);
-
-/// `OnceCell` CPU tracking.
-static INIT: OnceCell<()> = OnceCell::const_new();
-
-/// Get the total avg CPU being used.
-fn get_cpu_usage(sys: &System) -> f32 {
-    sys.cpus()
-        .iter()
-        .map(|cpu| cpu.cpu_usage() / sys.cpus().len() as f32)
-        .sum::<f32>()
-}
-
-/// Update the cpu usage being used.
-async fn update_cpu_usage() {
-    if sysinfo::IS_SUPPORTED_SYSTEM {
-        let mut sys = System::new();
-
-        loop {
-            sys.refresh_cpu_usage();
-            let usage = get_cpu_usage(&sys);
-            let state = if usage >= 70.0 {
-                1
-            } else if usage >= 95.0 {
-                2
-            } else {
-                0
-            };
-            CPU_STATE.store(state, Ordering::Relaxed);
-            sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL).await;
-        }
-    }
-}
-
-/// Setup the cpu tracker.
-async fn init_once() {
-    INIT.get_or_init(|| async {
-        tokio::spawn(update_cpu_usage());
-    })
-    .await;
-}
-
-/// Get the cpu usage being used utility.
-pub async fn get_global_cpu_usage() -> i8 {
-    init_once().await;
-    CPU_STATE.load(Ordering::Relaxed)
-}
diff --git a/spider/src/utils/detect_system.rs b/spider/src/utils/detect_system.rs
new file mode 100644
index 000000000..277be003b
--- /dev/null
+++ b/spider/src/utils/detect_system.rs
@@ -0,0 +1,119 @@
+use std::sync::atomic::{AtomicI8, Ordering};
+use sysinfo::System;
+use tokio::sync::OnceCell;
+use tokio::time::sleep;
+
+/// The CPU state for the crawl.
+static CPU_STATE: AtomicI8 = AtomicI8::new(0);
+
+/// The System Memory state for the crawl.
+#[cfg(feature = "disk")]
+static MEMORY_STATE: AtomicI8 = AtomicI8::new(0);
+
+/// `OnceCell` CPU tracking.
+static INIT: OnceCell<()> = OnceCell::const_new();
+
+/// Get the total avg CPU being used.
+fn get_cpu_usage(sys: &System) -> f32 {
+    sys.cpus()
+        .iter()
+        .map(|cpu| cpu.cpu_usage() / sys.cpus().len() as f32)
+        .sum::<f32>()
+}
+
+/// The percentage of total memory used.
+#[cfg(feature = "disk")]
+fn get_memory_limits(sys: &System) -> u64 {
+    let total_memory = sys.total_memory();
+    let used_memory = sys.used_memory();
+    used_memory * 100 / total_memory.max(1)
+}
+
+/// The CPU state to determine how to use concurrency and delays.
+/// 0 = Full Concurrency.
+/// 1 = Shared Concurrency.
+/// 2 = Shared Concurrency with delays.
+fn determine_cpu_state(usage: f32) -> i8 {
+    if usage >= 95.0 {
+        2
+    } else if usage >= 70.0 {
+        1
+    } else {
+        0
+    }
+}
+
+/// The Memory state to determine how to use concurrency and delays.
+/// 0 = Full Memory.
+/// 1 = Hybrid Memory/Disk.
+/// 2 = Full Disk.
+#[cfg(feature = "disk")]
+fn determine_memory_state(usage: u64) -> i8 {
+    if usage >= 80 {
+        2
+    } else if usage >= 50 {
+        1
+    } else {
+        0
+    }
+}
+
+/// Update the memory used.
+#[cfg(feature = "disk")]
+fn update_memory(sys: &mut System) {
+    sys.refresh_memory();
+    MEMORY_STATE.store(
+        determine_memory_state(get_memory_limits(&sys)),
+        Ordering::Relaxed,
+    );
+}
+
+/// Update the memory used.
+#[cfg(not(feature = "disk"))]
+fn update_memory(_sys: &mut System) {}
+
+/// Update the cpu used.
+fn update_cpu(sys: &mut System) {
+    sys.refresh_cpu_usage();
+    CPU_STATE.store(determine_cpu_state(get_cpu_usage(&sys)), Ordering::Relaxed);
+}
+
+/// Update the cpu usage being used.
+async fn update_cpu_usage() {
+    if sysinfo::IS_SUPPORTED_SYSTEM {
+        let mut sys = System::new();
+
+        loop {
+            update_cpu(&mut sys);
+            update_memory(&mut sys);
+            sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL).await;
+        }
+    }
+}
+
+/// Setup the cpu tracker.
+async fn init_once() {
+    INIT.get_or_init(|| async {
+        tokio::spawn(update_cpu_usage());
+    })
+    .await;
+}
+
+/// Get the cpu usage being used state utility.
+pub async fn get_global_cpu_state() -> i8 {
+    init_once().await;
+    CPU_STATE.load(Ordering::Relaxed)
+}
+
+/// Get the memory usage being used state utility.
+#[cfg(feature = "disk")]
+pub async fn get_global_memory_state() -> i8 {
+    init_once().await;
+    MEMORY_STATE.load(Ordering::Relaxed)
+}
+
+/// Get the memory usage being used state utility.
+#[cfg(not(feature = "disk"))]
+pub async fn get_global_memory_state() -> i8 {
+    0
+}
diff --git a/spider/src/utils/mod.rs b/spider/src/utils/mod.rs
index 051452b31..70718757c 100644
--- a/spider/src/utils/mod.rs
+++ b/spider/src/utils/mod.rs
@@ -8,8 +8,8 @@ pub mod interner;
 pub mod trie;

 #[cfg(feature = "balance")]
-/// CPU detection to balance limitations.
-pub mod detect_cpu;
+/// CPU and Memory detection to balance limitations.
+pub mod detect_system;

 use crate::RelativeSelectors;
 use abs::parse_absolute_url;
@@ -3006,7 +3006,7 @@ const REBALANCE_TIME: std::time::Duration = std::time::Duration::from_millis(100
 #[cfg(feature = "balance")]
 pub async fn get_semaphore(semaphore: &Arc<Semaphore>, detect: bool) -> &Arc<Semaphore> {
     let cpu_load = if detect {
-        crate::utils::detect_cpu::get_global_cpu_usage().await
+        crate::utils::detect_system::get_global_cpu_state().await
     } else {
         0
     };
diff --git a/spider/src/website.rs b/spider/src/website.rs
index 3538be4a2..8715343e1 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -4,6 +4,8 @@ use crate::configuration::{
     self, get_ua, AutomationScriptsMap, Configuration, ExecutionScriptsMap, RedirectPolicy,
 };
 use crate::features::chrome_common::RequestInterceptConfiguration;
+#[cfg(feature = "disk")]
+use crate::features::disk::DatabaseHandler;
 use crate::packages::robotparser::parser::RobotFileParser;
 use crate::page::{Page, PageLinkBuildSettings};
 use crate::utils::abs::{convert_abs_url, parse_absolute_url};
@@ -268,12 +270,14 @@ pub struct Website {
     /// The initial status code of the first request.
     initial_status_code: StatusCode,
     /// Set the crawl ID to track. This allows explicit targeting for shutdown, pause, and etc.
-    #[cfg(feature = "control")]
     pub crawl_id: Box<String>,
     /// The website was manually stopped.
     shutdown: bool,
     /// The request client. Stored for re-use between runs.
     client: Option<Client>,
+    #[cfg(feature = "disk")]
+    /// The disk handler to use.
+    sqlite: DatabaseHandler,
 }

 impl Website {
@@ -320,6 +324,117 @@ impl Website {
         self
     }

+    /// Get the target id for a crawl. This takes the crawl ID and the url and concatenates them without delimiters.
+    pub fn target_id(&self) -> String {
+        string_concat!(self.crawl_id, self.url.inner())
+    }
+
+    /// Setup SQLite. This does nothing without the [disk] flag enabled.
+    #[cfg(feature = "disk")]
+    pub fn setup_disk(&mut self) {
+        if !self.sqlite.pool_inited() {
+            self.sqlite
+                .clone_from(&DatabaseHandler::new(&Some(self.target_id())));
+        }
+    }
+
+    /// Setup SQLite. This does nothing without the [disk] flag enabled.
+    #[cfg(not(feature = "disk"))]
+    pub fn setup_disk(&mut self) {}
+
+    /// Get the db pool. Only available with the [disk] flag enabled.
+    #[cfg(feature = "disk")]
+    async fn get_db_pool(&self) -> &sqlx::SqlitePool {
+        self.sqlite.get_db_pool().await
+    }
+
+    /// Check if a URL is allowed to crawl (not already stored on disk, ignore case). Always true without the [disk] flag enabled.
+    #[cfg(feature = "disk")]
+    async fn is_allowed_disk(&self, url_to_check: &str) -> bool {
+        if !self.sqlite.ready() {
+            true
+        } else {
+            !self
+                .sqlite
+                .url_exists(self.get_db_pool().await, url_to_check)
+                .await
+        }
+    }
+
+    /// Check if a URL is allowed to crawl (not already stored on disk, ignore case). Always true without the [disk] flag enabled.
+    #[cfg(not(feature = "disk"))]
+    async fn is_allowed_disk(&self, _url_to_check: &str) -> bool {
+        true
+    }
+
+    /// Clear the disk. This does nothing without the [disk] flag enabled.
+    #[cfg(feature = "disk")]
+    async fn clear_disk(&self) {
+        let _ = DatabaseHandler::clear_table(self.get_db_pool().await).await;
+    }
+
+    /// Clear the disk. This does nothing without the [disk] flag enabled.
+    #[cfg(not(feature = "disk"))]
+    async fn clear_disk(&self) {}
+
+    /// Insert a new URL to disk if it doesn't exist. Only available with the [disk] flag enabled.
+    #[cfg(feature = "disk")]
+    async fn insert_url_disk(&self, new_url: &CaseInsensitiveString) {
+        self.sqlite
+            .insert_url(self.get_db_pool().await, new_url)
+            .await
+    }
+
+    /// Insert a new URL if it doesn't exist, spilling to disk under memory pressure.
+    #[cfg(feature = "disk")]
+    async fn insert_link(&mut self, new_url: CaseInsensitiveString) {
+        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
+        let beyond_memory_limits =
+            self.links_visited.len() >= *crate::features::LINKS_VISITED_MEMORY_LIMIT;
+
+        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;
+
+        // Seed the DB with the in-memory links before spilling to disk.
+        if seed_check && !self.sqlite.ready() {
+            let _ = self.seed().await;
+        }
+
+        if mem_load == 2 || beyond_memory_limits {
+            self.insert_url_disk(&new_url).await
+        } else if mem_load == 1 {
+            if self.links_visited.len() <= 100 {
+                self.links_visited.insert(new_url);
+            } else {
+                self.insert_url_disk(&new_url).await
+            }
+        } else {
+            self.links_visited.insert(new_url);
+        }
+    }
+
+    /// Insert a new URL if it doesn't exist. This does nothing without the [disk] flag enabled.
+    #[cfg(not(feature = "disk"))]
+    async fn insert_link(&mut self, link: CaseInsensitiveString) {
+        self.links_visited.insert(link);
+    }
+
+    /// Seed the DB and clear the Hashset. This does nothing without the [disk] flag enabled.
+    #[cfg(feature = "disk")]
+    async fn seed(&mut self) -> Result<(), sqlx::Error> {
+        let links = self.get_links();
+
+        if let Ok(links) = self.sqlite.seed(self.get_db_pool().await, links).await {
+            self.links_visited.clear();
+
+            for link in links {
+                self.links_visited.insert(link);
+            }
+
+            self.sqlite.seeded = true;
+        }
+
+        Ok(())
+    }
+
     /// Return `false` if the crawl should shutdown. Process in between each link.
     async fn handle_process(
         &self,
@@ -568,11 +683,32 @@ impl Website {
         }
     }
 
-    /// Amount of pages crawled.
+    /// Amount of pages crawled in memory only. Use `get_size` for the full count across memory and disk.
     pub fn size(&self) -> usize {
         self.links_visited.len()
     }
 
+    /// Get the amount of resources collected.
+    #[cfg(not(feature = "disk"))]
+    pub async fn get_size(&self) -> usize {
+        self.links_visited.len()
+    }
 
+    /// Get the amount of resources collected.
+    #[cfg(feature = "disk")]
+    pub async fn get_size(&self) -> usize {
+        use crate::features::LINKS_VISITED_MEMORY_LIMIT;
+        let disk_count = DatabaseHandler::count_records(self.get_db_pool().await).await;
+        let disk_count = disk_count.unwrap_or_default() as usize;
+        let mut mem_count = self.links_visited.len();
+
+        // Discount links that have also been seeded to disk.
+        if mem_count >= *LINKS_VISITED_MEMORY_LIMIT {
+            mem_count -= *LINKS_VISITED_MEMORY_LIMIT;
+        }
+
+        disk_count + mem_count
+    }
+
     /// Drain the extra links used for things like the sitemap.
     pub fn drain_extra_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
         self.extra_links.drain()
@@ -614,6 +750,12 @@ impl Website {
         &self.extra_links
     }
 
+    /// Clear all pages, disk, and links stored in memory.
+    pub async fn clear_all(&mut self) {
+        self.clear();
+        self.clear_disk().await;
+    }
+
     /// Clear all pages and links stored.
     pub fn clear(&mut self) {
         self.links_visited.clear();
@@ -631,7 +773,40 @@ impl Website {
         self.pages.as_ref()
     }
 
-    /// Links visited getter.
+    /// Links visited getter for disk. This does nothing without the [disk] flag enabled.
+    #[cfg(not(feature = "disk"))]
+    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
+        Default::default()
+    }
+
+    /// Links visited getter for disk. This does nothing without the [disk] flag enabled.
+    #[cfg(feature = "disk")]
+    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
+        if let Ok(links) = DatabaseHandler::get_all_resources(self.get_db_pool().await).await {
+            links
+        } else {
+            Default::default()
+        }
+    }
+
+    /// All the links visited between memory and disk.
+    #[cfg(feature = "disk")]
+    pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
+        let mut l = self.get_links_disk().await;
+        let m = self.links_visited.get_links();
+
+        l.extend(m);
+
+        l
+    }
+
+    /// All the links visited between memory and disk.
+    #[cfg(not(feature = "disk"))]
+    pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
+        self.get_links()
+    }
+
+    /// Links visited getter for memory resources.
     pub fn get_links(&self) -> HashSet<CaseInsensitiveString> {
         self.links_visited.get_links()
     }
@@ -1142,7 +1317,8 @@ impl Website {
         use crate::utils::{Handler, CONTROLLER};
         let c: Arc<AtomicI8> = Arc::new(AtomicI8::new(0));
         let handle = c.clone();
-        let target_id = string_concat!(self.crawl_id, self.url.inner());
+        let target_id = self.target_id();
+
         let c_lock = CONTROLLER.clone();
 
         let join_handle = spawn_task("control_handler", async move {
@@ -1207,9 +1383,10 @@ impl Website {
     #[cfg(feature = "control")]
     async fn setup(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
         self.determine_limits();
+        self.setup_disk();
 
         if self.status != CrawlStatus::Active {
-            self.clear();
+            self.clear_all().await;
         }
 
         let client = match self.client.take() {
@@ -1227,9 +1404,10 @@ impl Website {
     #[cfg(not(feature = "control"))]
     async fn setup(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
         self.determine_limits();
+        self.setup_disk();
 
         if self.status != CrawlStatus::Active {
-            self.clear();
+            self.clear_all().await;
         }
 
         let client = match self.client.take() {
@@ -1379,13 +1557,14 @@ impl Website {
 
         emit_log(url);
 
-        self.links_visited.insert(match self.on_link_find_callback {
+        self.insert_link(match self.on_link_find_callback {
             Some(cb) => {
                 let c = cb(*self.url.clone(), None);
                 c.0
             }
             _ => *self.url.clone(),
-        });
+        })
+        .await;
 
         if page.is_empty() {
             self.status = CrawlStatus::Empty;
@@ -1477,14 +1656,15 @@ impl Website {
 
         emit_log(&self.url.inner());
 
-        self.links_visited.insert(match self.on_link_find_callback {
+        self.insert_link(match self.on_link_find_callback {
             Some(cb) => {
                 let c = cb(*self.url.clone(), None);
                 c.0
             }
             _ => *self.url.clone(),
-        });
+        })
+        .await;
 
         // setup link tracking.
         if self.configuration.return_page_links && page.page_links.is_none() {
@@ -1663,14 +1843,15 @@ impl Website {
 
         emit_log(&self.url.inner());
 
-        self.links_visited.insert(match self.on_link_find_callback {
+        self.insert_link(match self.on_link_find_callback {
             Some(cb) => {
                 let c = cb(*self.url.clone(), None);
                 c.0
             }
             _ => *self.url.clone(),
-        });
+        })
+        .await;
 
         let links = if !page_links.is_empty() {
             page_links
@@ -1731,14 +1912,15 @@ impl Website {
         )
         .await;
 
-        self.links_visited.insert(match self.on_link_find_callback {
+        self.insert_link(match self.on_link_find_callback {
             Some(cb) => {
                 let c = cb(*self.url.to_owned(), None);
                 c.0
             }
             _ => *self.url.to_owned(),
-        });
+        })
+        .await;
 
         self.initial_status_code = page.status_code;
 
@@ -1785,7 +1967,7 @@ impl Website {
             if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                 break;
             }
-            if allowed.eq(&ProcessLinkStatus::Blocked) {
+            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                 continue;
            }
 
@@ -1807,7 +1989,7 @@ impl Website {
                 _ => (u, None),
             };
 
-            self.links_visited.insert(link_result.0);
+            self.insert_link(link_result.0).await;
 
             if self.configuration.return_page_links {
                 page.page_links = Some(Default::default());
@@ -1842,7 +2024,7 @@ impl Website {
             if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                 break;
             }
-            if allowed.eq(&ProcessLinkStatus::Blocked) {
+            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                 continue;
             }
 
@@ -1861,7 +2043,7 @@ impl Website {
                 _ => (u, None),
             };
 
-            self.links_visited.insert(link_result.0);
+            self.insert_link(link_result.0).await;
 
             if self.configuration.return_page_links {
                 page.page_links = Some(Default::default());
@@ -1906,7 +2088,7 @@ impl Website {
                 break;
             }
 
-            if allowed.eq(&ProcessLinkStatus::Blocked) {
+            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                 continue;
             }
 
@@ -1934,7 +2116,7 @@ impl Website {
                 _ => (u, None),
             };
 
-            self.links_visited.insert(link_result.0);
+            self.insert_link(link_result.0).await;
 
             if !page.is_empty() {
                 if self.configuration.return_page_links {
@@ -2075,11 +2257,12 @@ impl Website {
             w.crawl().await;
         });
 
-        if let Some(p) = self.pages.as_mut() {
+        if let Some(mut p) = self.pages.as_mut().cloned() {
             while let Ok(res) = rx2.recv().await {
-                self.links_visited.insert(res.get_url().into());
+                self.insert_link(res.get_url().into()).await;
                 p.push(res);
             }
+            self.pages.replace(p);
         }
     }
 
@@ -2096,11 +2279,12 @@ impl Website {
             w.crawl_raw().await;
         });
 
-        if let Some(p) = self.pages.as_mut() {
+        if let Some(mut p) = self.pages.as_mut().cloned() {
             while let Ok(res) = rx2.recv().await {
-                self.links_visited.insert(res.get_url().into());
+                self.insert_link(res.get_url().into()).await;
                 p.push(res);
             }
+            self.pages.replace(p);
         }
     }
 
@@ -2117,11 +2301,12 @@ impl Website {
             w.crawl_smart().await;
         });
 
-        if let Some(p) = self.pages.as_mut() {
+        if let Some(mut p) = self.pages.as_mut().cloned() {
             while let Ok(res) = rx2.recv().await {
-                self.links_visited.insert(res.get_url().into());
+                self.insert_link(res.get_url().into()).await;
                 p.push(res);
             }
+            self.pages.replace(p);
         }
     }
 
@@ -2138,11 +2323,12 @@ impl Website {
             w.crawl_sitemap().await;
         });
 
-        if let Some(p) = self.pages.as_mut() {
+        if let Some(mut p) = self.pages.as_mut().cloned() {
             while let Ok(res) = rx2.recv().await {
-                self.links_visited.insert(res.get_url().into());
+                self.insert_link(res.get_url().into()).await;
                 p.push(res);
             }
+            self.pages.replace(p);
         }
     }
 
@@ -2238,13 +2424,13 @@ impl Website {
                         break;
                     }
 
-                    if allowed.eq(&ProcessLinkStatus::Blocked) {
+                    if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                         continue;
                     }
 
                     emit_log(link.inner());
 
-                    self.links_visited.insert(link.clone());
+                    self.insert_link(link.clone()).await;
 
                     if let Ok(permit) = semaphore.clone().acquire_owned().await {
                         let shared = shared.clone();
@@ -2349,9 +2535,10 @@ impl Website {
                                 exceeded_budget = true;
                                 break;
                             }
-                            if allowed.eq(&ProcessLinkStatus::Blocked) {
+                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&s).await {
                                 continue;
                             }
+                            self.links_visited.extend_with_new_links(&mut links, s);
                         }
                     }
@@ -2509,13 +2696,13 @@ impl Website {
                             exceeded_budget = true;
                             break;
                         }
-                        if allowed.eq(&ProcessLinkStatus::Blocked) {
+                        if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                            continue;
                         }
 
                         emit_log(&link.inner());
 
-                        self.links_visited.insert(link.clone());
+                        self.insert_link(link.clone()).await;
 
                         if let Ok(permit) = semaphore.clone().acquire_owned().await {
                             let shared = shared.clone();
@@ -2783,13 +2970,15 @@ impl Website {
                                     exceeded_budget = true;
                                     break;
                                 }
-                                if allowed.eq(&ProcessLinkStatus::Blocked) {
+                                if allowed.eq(&ProcessLinkStatus::Blocked)
+                                    || !self.is_allowed_disk(&link).await
+                                {
                                     continue;
                                 }
 
                                 emit_log(&link.inner());
 
-                                self.links_visited.insert(link.clone());
+                                self.insert_link(link.clone()).await;
 
                                 if let Ok(permit) = SEM.acquire().await {
                                     let client = client.clone();
@@ -2826,7 +3015,9 @@ impl Website {
                                             exceeded_budget = true;
                                             break;
                                         }
-                                        if allowed.eq(&ProcessLinkStatus::Blocked) {
+                                        if allowed.eq(&ProcessLinkStatus::Blocked)
+                                            || !self.is_allowed_disk(&s).await
+                                        {
                                             continue;
                                         }
 
@@ -2972,12 +3163,12 @@ impl Website {
                         exceeded_budget = true;
                         break;
                     }
-                    if allowed.eq(&ProcessLinkStatus::Blocked) {
+                    if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                        continue;
                     }
 
                     emit_log(&link.inner());
 
-                    self.links_visited.insert(link.clone());
+                    self.insert_link(link.clone()).await;
 
                     if let Ok(permit) = semaphore.clone().acquire_owned().await {
                         let shared = shared.clone();
@@ -3082,7 +3273,7 @@ impl Website {
                             exceeded_budget = true;
                             break;
                         }
-                        if allowed.eq(&ProcessLinkStatus::Blocked) {
+                        if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&s).await {
                             continue;
                         }
 
@@ -3272,7 +3463,7 @@ impl Website {
                             continue;
                         }
 
-                        self.links_visited.insert(link.clone());
+                        self.insert_link(link.clone()).await;
 
                         let client = client.clone();
                         let tx = tx.clone();
@@ -3375,7 +3566,9 @@ impl Website {
                                 exceeded_budget = true;
                                 break;
                             }
-                            if allowed.eq(&ProcessLinkStatus::Blocked) {
+                            if allowed.eq(&ProcessLinkStatus::Blocked)
+                                || !self.is_allowed_disk(&s).await
+                            {
                                 continue;
                             }
 
@@ -3545,8 +3738,8 @@ impl Website {
                                     continue;
                                 }
 
-                                self.links_visited
-                                    .insert(link.clone());
+                                self.insert_link(link.clone())
+                                    .await;
 
                                 let client = client.clone();
                                 let tx = tx.clone();
diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml
index e0b2cdf6a..9500fc1d7 100644
--- a/spider_chrome/Cargo.toml
+++ b/spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_chrome"
-version = "2.21.33"
+version = "2.22.2"
 rust-version = "1.70"
 authors = [
     "j-mendez "
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index 70ee26d9a..cc88b54d1 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "2.21.33"
+version = "2.22.2"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml
index 4bbdc9ba1..562b0062c 100644
--- a/spider_transformations/Cargo.toml
+++ b/spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_transformations"
-version = "2.21.33"
+version = "2.22.2"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml
index e359410e5..7df06e5f3 100644
--- a/spider_utils/Cargo.toml
+++ b/spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_utils"
-version = "2.21.33"
+version = "2.22.2"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index 83e16dddf..8eb330966 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "2.21.33"
+version = "2.22.2"
 authors = [
     "j-mendez "
 ]
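
The new detect_system module exposes the sampled states through get_global_cpu_state and get_global_memory_state. Below is a minimal sketch of consuming the CPU state to throttle work, assuming the `balance` feature is enabled so the module is compiled in at spider::utils::detect_system; `run_batch`, the batch sizes, and the backoff duration are illustrative placeholders, not crate API.

```rust
use std::time::Duration;

// Placeholder for issuing a bounded batch of requests (hypothetical).
async fn run_batch(_concurrency: usize) {}

async fn throttled_step() {
    // 0 = full concurrency, 1 = shared concurrency, 2 = shared with delays.
    match spider::utils::detect_system::get_global_cpu_state().await {
        0 => run_batch(64).await,
        1 => run_batch(32).await,
        _ => {
            // Under heavy load: shrink the batch and back off briefly.
            run_batch(16).await;
            tokio::time::sleep(Duration::from_millis(100)).await;
        }
    }
}

#[tokio::main]
async fn main() {
    throttled_step().await;
}
```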
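The spill-to-disk decision inside insert_link can be read as a pure function of the memory state and the in-memory link count. The sketch below mirrors the patch's branches, including the hard-coded hybrid cutoff of 100 links; `Target` and `choose_target` are illustrative names, not exported crate API.

```rust
#[derive(Debug, PartialEq)]
enum Target {
    Memory,
    Disk,
}

fn choose_target(mem_state: i8, in_memory: usize, limit: usize) -> Target {
    let beyond_memory_limits = in_memory >= limit;

    if mem_state == 2 || beyond_memory_limits {
        // Full-disk pressure, or the hashset is saturated: persist.
        Target::Disk
    } else if mem_state == 1 && in_memory > 100 {
        // Hybrid: only a small hot set stays in memory.
        Target::Disk
    } else {
        // Default: the in-memory hashset.
        Target::Memory
    }
}

fn main() {
    assert_eq!(choose_target(0, 50, 10_000), Target::Memory);
    assert_eq!(choose_target(1, 500, 10_000), Target::Disk);
    assert_eq!(choose_target(2, 5, 10_000), Target::Disk);
}
```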
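Finally, a sketch of how the new accessors pair up from the caller's side, assuming the `disk` feature is enabled; `https://example.com` is a placeholder target. Note that size() keeps its synchronous signature and reports memory only, while the async get_size() and get_all_links_visited() fold in the SQLite-backed records.

```rust
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("https://example.com");

    website.crawl().await;

    // In-memory count only (pre-existing behavior).
    println!("in memory: {}", website.size());

    // Combined memory + disk total.
    println!("total crawled: {}", website.get_size().await);

    // Union of the disk-persisted and in-memory link sets.
    for link in website.get_all_links_visited().await {
        println!("{}", link.inner());
    }

    // Drops the pages, the in-memory links, and the on-disk table.
    website.clear_all().await;
}
```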