diff --git a/.gitignore b/.gitignore index e37cd72..880597d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target .data -.env \ No newline at end of file +.env +.fastembed_cache diff --git a/Cargo.lock b/Cargo.lock index 023690e..467a750 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,7 +30,7 @@ dependencies = [ "actix-service", "actix-utils", "ahash", - "base64", + "base64 0.21.7", "bitflags 2.4.2", "brotli", "bytes", @@ -65,7 +65,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -178,7 +178,7 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -288,7 +288,7 @@ checksum = "7e1df052c2bd7b241fc828bc2fda74ce9a7ef05e0a593c37275aaaba52caf49d" dependencies = [ "async-convert", "backoff", - "base64", + "base64 0.21.7", "derive_builder", "futures", "rand", @@ -311,7 +311,7 @@ checksum = "fbc0b1877fb1bc415caa14d1899f0f477e8eb38f2fe16f54be196d7c4a92e15c" dependencies = [ "async-convert", "backoff", - "base64", + "base64 0.21.7", "bytes", "derive_builder", "futures", @@ -347,7 +347,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -358,7 +358,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -441,12 +441,24 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "bincode" version = "1.3.3" @@ -475,7 +487,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.52", + "syn 2.0.61", "which", ] @@ -589,6 +601,15 @@ dependencies = [ "bytes", ] +[[package]] +name = "castaway" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +dependencies = [ + "rustversion", +] + [[package]] name = "cbc" version = "0.1.2" @@ -680,6 +701,19 @@ dependencies = [ "libloading", ] +[[package]] +name = "compact_str" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "ryu", + "static_assertions", +] + [[package]] name = "config" version = "0.14.0" @@ -700,6 +734,19 @@ dependencies = [ "yaml-rust", ] +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "unicode-width", + "windows-sys 0.52.0", +] + [[package]] name = "const-random" version = "0.1.18" @@ -841,7 +888,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -948,6 +995,27 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + [[package]] name = "dlv-list" version = "0.5.2" @@ -990,6 +1058,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "encoding_rs" version = "0.8.33" @@ -1015,6 +1089,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" + [[package]] name = "eventsource-stream" version = "0.2.3" @@ -1054,12 +1134,40 @@ dependencies = [ "regex", ] +[[package]] +name = "fastembed" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3037262d3bb0f50b32737a863b6117035772345f2f337b580d69fa9f57500cb" +dependencies = [ + "anyhow", + "hf-hub", + "ndarray", + "ort", + "rayon", + "serde_json", + "tokenizers", + "variant_count", +] + [[package]] name = "fastrand" version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "windows-sys 0.52.0", +] + [[package]] name = "flate2" version = "1.0.28" @@ -1166,7 +1274,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -1266,7 +1374,7 @@ dependencies = [ "bstr", "log", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.2", ] [[package]] @@ -1336,6 +1444,23 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hf-hub" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" +dependencies = [ + "dirs", + "indicatif", + "log", + "native-tls", + "rand", + "serde", + "serde_json", + "thiserror", + "ureq", +] + [[package]] name = "home" version = "0.5.9" @@ -1442,11 +1567,11 @@ dependencies = [ "http", "hyper", "log", - "rustls", + "rustls 0.21.10", "rustls-native-certs", "tokio", "tokio-rustls", - "webpki-roots", + "webpki-roots 0.25.4", ] [[package]] @@ -1549,6 +1674,19 @@ dependencies = [ "hashbrown 0.14.3", ] +[[package]] +name = "indicatif" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" +dependencies = [ + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", +] + [[package]] name = "inout" version = "0.1.3" @@ -1658,6 +1796,16 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +[[package]] +name = "libredox" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.4.2", + "libc", +] + [[package]] name = "libsql" version = "0.3.1" @@ -1667,7 +1815,7 @@ dependencies = [ "anyhow", "async-stream", "async-trait", - "base64", + "base64 0.21.7", "bincode", "bitflags 2.4.2", "bytes", @@ -1744,7 +1892,7 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e260ef2a25e4d75bad7a45355a7f777429e59bf0867bbe43322cdc8d9eada774" dependencies = [ - "base64", + "base64 0.21.7", "bytes", "libsql-ffi", "libsql-rusqlite", @@ -1832,6 +1980,22 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" +[[package]] +name = "macro_rules_attribute" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" + [[package]] name = "markup5ever" version = "0.11.0" @@ -1852,6 +2016,16 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +[[package]] +name = "matrixmultiply" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "memchr" version = "2.7.1" @@ -1901,6 +2075,27 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "monostate" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e" +dependencies = [ + "monostate-impl", + "serde", +] + +[[package]] +name = "monostate-impl" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.61", +] + [[package]] name = "native-tls" version = "0.2.11" @@ -1919,6 +2114,19 @@ dependencies = [ "tempfile", ] +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + [[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -1935,12 +2143,30 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "num-complex" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.18" @@ -1960,6 +2186,12 @@ dependencies = [ "libc", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "object" version = "0.32.2" @@ -1975,6 +2207,28 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "onig" +version = "6.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" +dependencies = [ + "bitflags 1.3.2", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "openssl" version = "0.10.64" @@ -1998,7 +2252,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -2019,6 +2273,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "ordered-multimap" version = "0.6.0" @@ -2029,6 +2289,31 @@ dependencies = [ "hashbrown 0.13.2", ] +[[package]] +name = "ort" +version = "2.0.0-rc.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8e5caf4eb2ead4bc137c3ff4e347940e3e556ceb11a4180627f04b63d7342dd" +dependencies = [ + "compact_str", + "ndarray", + "ort-sys", + "thiserror", + "tracing", +] + +[[package]] +name = "ort-sys" +version = "2.0.0-rc.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3d9c1373fc813d3f024d394f621f4c6dde0734c79b1c17113c3bb5bf0084bbe" +dependencies = [ + "flate2", + "sha2", + "tar", + "ureq", +] + [[package]] name = "parking_lot" version = "0.12.1" @@ -2116,7 +2401,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -2199,7 +2484,7 @@ dependencies = [ "phf_shared 0.11.2", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -2238,7 +2523,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -2259,6 +2544,12 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +[[package]] +name = "portable-atomic" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" + [[package]] name = "powerfmt" version = "0.2.0" @@ -2284,14 +2575,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" dependencies = [ "unicode-ident", ] @@ -2316,7 +2607,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -2358,6 +2649,43 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +dependencies = [ + "either", + "itertools", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.4.1" @@ -2367,6 +2695,17 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_users" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +dependencies = [ + "getrandom", + "libredox", + "thiserror", +] + [[package]] name = "regex" version = "1.10.3" @@ -2376,7 +2715,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.2", ] [[package]] @@ -2387,9 +2726,15 @@ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.2", ] +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + [[package]] name = "regex-syntax" version = "0.8.2" @@ -2402,7 +2747,7 @@ version = "0.11.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eea5a9eb898d3783f17c6407670e3592fd174cb81a10e51d4c37f49450b9946" dependencies = [ - "base64", + "base64 0.21.7", "bytes", "encoding_rs", "futures-core", @@ -2422,7 +2767,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls", + "rustls 0.21.10", "rustls-native-certs", "rustls-pemfile", "serde", @@ -2480,7 +2825,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b91f7eff05f748767f183df4320a63d6936e9c6107d97c9e6bdd9784f4289c94" dependencies = [ - "base64", + "base64 0.21.7", "bitflags 2.4.2", "serde", "serde_derive", @@ -2538,10 +2883,24 @@ checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" dependencies = [ "log", "ring", - "rustls-webpki", + "rustls-webpki 0.101.7", "sct", ] +[[package]] +name = "rustls" +version = "0.22.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" +dependencies = [ + "log", + "ring", + "rustls-pki-types", + "rustls-webpki 0.102.3", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" version = "0.6.3" @@ -2560,9 +2919,15 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "base64", + "base64 0.21.7", ] +[[package]] +name = "rustls-pki-types" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "beb461507cee2c2ff151784c52762cf4d9ff6a61f3e80968600ed24fa837fa54" + [[package]] name = "rustls-webpki" version = "0.101.7" @@ -2573,6 +2938,17 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustls-webpki" +version = "0.102.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3bce581c0dd41bce533ce695a1437fa16a7ab5ac3ccfa99fe1a620a7885eabf" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.14" @@ -2710,7 +3086,7 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -2838,12 +3214,30 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "string_cache" version = "0.8.7" @@ -2876,6 +3270,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + [[package]] name = "syn" version = "1.0.109" @@ -2889,9 +3289,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.52" +version = "2.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "c993ed8ccba56ae856363b1845da7266a7cb78e1d146c8a32d54b45a8b831fc9" dependencies = [ "proc-macro2", "quote", @@ -2925,6 +3325,17 @@ dependencies = [ "libc", ] +[[package]] +name = "tar" +version = "0.4.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.10.1" @@ -2987,7 +3398,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -2998,7 +3409,7 @@ checksum = "40894b788eb28bbb7e36bdc8b7b1b1488b9c93fa3730f315ab965330c94c0842" dependencies = [ "anyhow", "async-openai 0.14.3", - "base64", + "base64 0.21.7", "bstr", "fancy-regex", "lazy_static", @@ -3061,6 +3472,37 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizers" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9be88c795d8b9f9c4002b3a8f26a6d0876103a6f523b32ea3bac52d8560c17c" +dependencies = [ + "aho-corasick", + "derive_builder", + "esaxx-rs", + "getrandom", + "itertools", + "lazy_static", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax 0.7.5", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.36.0" @@ -3098,7 +3540,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -3117,7 +3559,7 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls", + "rustls 0.21.10", "tokio", ] @@ -3189,7 +3631,7 @@ dependencies = [ "async-stream", "async-trait", "axum", - "base64", + "base64 0.21.7", "bytes", "h2", "http", @@ -3199,7 +3641,7 @@ dependencies = [ "percent-encoding", "pin-project", "prost", - "rustls", + "rustls 0.21.10", "rustls-native-certs", "rustls-pemfile", "tokio", @@ -3209,7 +3651,7 @@ dependencies = [ "tower-layer", "tower-service", "tracing", - "webpki-roots", + "webpki-roots 0.25.4", ] [[package]] @@ -3218,7 +3660,7 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fddb2a37b247e6adcb9f239f4e5cefdcc5ed526141a416b943929f13aea2cce" dependencies = [ - "base64", + "base64 0.21.7", "bytes", "http", "http-body", @@ -3304,7 +3746,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] @@ -3423,6 +3865,15 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + [[package]] name = "unicode-segmentation" version = "1.11.0" @@ -3435,12 +3886,38 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "untrusted" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "2.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" +dependencies = [ + "base64 0.22.1", + "flate2", + "log", + "native-tls", + "once_cell", + "rustls 0.22.4", + "rustls-pki-types", + "rustls-webpki 0.102.3", + "serde", + "serde_json", + "url", + "webpki-roots 0.26.1", +] + [[package]] name = "url" version = "2.5.0" @@ -3468,6 +3945,16 @@ dependencies = [ "serde", ] +[[package]] +name = "variant_count" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae2faf80ac463422992abf4de234731279c058aaf33171ca70277c98406b124" +dependencies = [ + "quote", + "syn 1.0.109", +] + [[package]] name = "vcpkg" version = "0.2.15" @@ -3526,7 +4013,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", "wasm-bindgen-shared", ] @@ -3560,7 +4047,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3600,6 +4087,15 @@ version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.2" @@ -3803,6 +4299,17 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "xattr" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f" +dependencies = [ + "libc", + "linux-raw-sys", + "rustix", +] + [[package]] name = "yaml-rust" version = "0.4.5" @@ -3822,6 +4329,7 @@ dependencies = [ "chrono", "config", "dotenvy", + "fastembed", "libsql", "reqwest", "scraper", @@ -3850,7 +4358,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.61", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 89613b0..0024f12 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,10 @@ path = "src/bin/sm_scraper.rs" name = "embed_scrapes" path = "src/bin/embed_scrapes.rs" +[[bin]] +name = "fast_embed_scrapes" +path = "src/bin/fast_embed_scrapes.rs" + [[bin]] name = "migrate" path = "src/bin/migrate.rs" @@ -39,3 +43,4 @@ async-openai = "0.19.1" tiktoken-rs = { version = "0.5.8", features = ["async-openai"] } tera = "1.19.1" dotenvy = "0.15.7" +fastembed = "3.5.0" diff --git a/src/bin/fast_embed_scrapes.rs b/src/bin/fast_embed_scrapes.rs new file mode 100644 index 0000000..63c4873 --- /dev/null +++ b/src/bin/fast_embed_scrapes.rs @@ -0,0 +1,12 @@ +use zero2prod::jobs; + +#[tokio::main] +async fn main() -> Result<(), Box> { + match jobs::fast_embed_scrapes::main().await { + Ok(v) => Ok(v), + Err(e) => { + println!("{:?}", e); + Err(e.into()) + } + } +} diff --git a/src/jobs/fast_embed_scrapes.rs b/src/jobs/fast_embed_scrapes.rs new file mode 100644 index 0000000..d58baaa --- /dev/null +++ b/src/jobs/fast_embed_scrapes.rs @@ -0,0 +1,50 @@ +use crate::{ + db::start_db, + models::{fast_embeds, sm_scrape::get_page}, +}; +use fastembed::{InitOptions, TextEmbedding}; +use uuid::Uuid; + +pub async fn main() -> Result<(), Box> { + let db = start_db().await.unwrap(); + let conn = db.connect().unwrap(); + + let model = TextEmbedding::try_new(InitOptions { + show_download_progress: true, + ..Default::default() + })?; + let limit = 10; + let mut page = 0; + let mut scrapes = get_page(conn.clone(), limit, limit * page, false) + .await + .unwrap(); + while !scrapes.is_empty() { + println!( + "Processing {:?} scrapes. Ids {:?}", + scrapes.len(), + scrapes + .iter() + .map(|f| f.id.to_string()) + .collect::>() + ); + let docs = scrapes.iter().map(|f| f.content.to_string()).collect(); + let embeddings = model.embed(docs, None)?; + for (idx, scrape) in scrapes.iter().enumerate() { + let new_fast_embed = fast_embeds::FastEmbed { + id: Uuid::new_v4().to_string(), + doc_type: String::from("sm_scrape"), + doc_id: scrape.id.to_string(), + embedding: embeddings[idx] + .iter() + .flat_map(|f| f.to_ne_bytes().to_vec()) + .collect(), + }; + new_fast_embed.insert(&conn).await?; + } + page += 1; + scrapes = get_page(conn.clone(), limit, limit * page, false) + .await + .unwrap(); + } + Ok(()) +} diff --git a/src/jobs/mod.rs b/src/jobs/mod.rs index 32f515c..5c570c7 100644 --- a/src/jobs/mod.rs +++ b/src/jobs/mod.rs @@ -1,4 +1,5 @@ pub mod embed_scrapes; +pub mod fast_embed_scrapes; pub mod migrations; pub mod process; pub mod scrape_sm; diff --git a/src/models/fast_embeds.rs b/src/models/fast_embeds.rs new file mode 100644 index 0000000..df4ae05 --- /dev/null +++ b/src/models/fast_embeds.rs @@ -0,0 +1,48 @@ +use libsql::Connection; + +#[derive(Debug)] +pub struct FastEmbed { + pub id: String, + pub doc_type: String, + pub doc_id: String, + pub embedding: Vec, +} + +impl FastEmbed { + pub fn new() -> Self { + Self { + id: String::new(), + doc_type: String::new(), + doc_id: String::new(), + embedding: Vec::new(), + } + } + + pub async fn insert( + &self, + conn: &Connection, + ) -> Result<&FastEmbed, Box> { + let mut stmt = conn + .prepare( + r#" + INSERT INTO fast_embeds (id, doc_type, doc_id, embedding) + VALUES (?, ?, ?, ?) + "#, + ) + .await?; + stmt.execute(( + self.id.to_string(), + self.doc_type.to_string(), + self.doc_id.to_string(), + self.embedding.clone(), + )) + .await?; + Ok(self) + } +} + +impl Default for FastEmbed { + fn default() -> Self { + Self::new() + } +} diff --git a/src/models/mod.rs b/src/models/mod.rs index 4b870c9..ea767e7 100644 --- a/src/models/mod.rs +++ b/src/models/mod.rs @@ -1,3 +1,4 @@ +pub mod fast_embeds; pub mod jobs; pub mod sm_scrape; pub mod subscriptions;