From aece18e02eac7dee28710932fb219306afb6bacb Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 21 Mar 2022 18:56:00 +1100 Subject: [PATCH] update patch to remove failing hunk --- .../patches/0001-Upgrade-pyo3-to-0.15.patch | 2249 ++++++++++++++++- 1 file changed, 2215 insertions(+), 34 deletions(-) diff --git a/recipe/patches/0001-Upgrade-pyo3-to-0.15.patch b/recipe/patches/0001-Upgrade-pyo3-to-0.15.patch index a631890..ad9ea70 100644 --- a/recipe/patches/0001-Upgrade-pyo3-to-0.15.patch +++ b/recipe/patches/0001-Upgrade-pyo3-to-0.15.patch @@ -1,47 +1,2228 @@ -From 00932743fa1f7eb4a9d7392ea86f9a82e3a19e5c Mon Sep 17 00:00:00 2001 +From e8cc2692f1a377621b645908a3e4d7088151d17f Mon Sep 17 00:00:00 2001 From: messense Date: Wed, 10 Mar 2021 10:45:50 +0800 Subject: [PATCH] Upgrade pyo3 to 0.15 Rebased-By: H. Vetinari --- - bindings/python/.cargo/config.toml | 11 + - bindings/python/Cargo.lock | 313 +++++++++---------- - bindings/python/Cargo.toml | 10 +- - bindings/python/src/decoders.rs | 26 +- - bindings/python/src/encoding.rs | 24 +- - bindings/python/src/error.rs | 2 +- - bindings/python/src/models.rs | 40 +-- - bindings/python/src/normalizers.rs | 54 ++-- - bindings/python/src/pre_tokenizers.rs | 52 +-- - bindings/python/src/processors.rs | 24 +- - bindings/python/src/token.rs | 2 +- - bindings/python/src/tokenizer.rs | 57 ++-- - bindings/python/src/trainers.rs | 16 +- - bindings/python/src/utils/iterators.rs | 2 +- - bindings/python/src/utils/normalization.rs | 40 +-- - bindings/python/src/utils/pretokenization.rs | 16 +- - bindings/python/src/utils/regex.rs | 4 +- - 17 files changed, 348 insertions(+), 345 deletions(-) - create mode 100644 bindings/python/.cargo/config.toml + 0001-Upgrade-pyo3-to-0.15.patch | 2192 ++++++++++++++++++ + bindings/python/Cargo.lock | 313 ++- + bindings/python/Cargo.toml | 10 +- + bindings/python/src/decoders.rs | 26 +- + bindings/python/src/encoding.rs | 24 +- + bindings/python/src/error.rs | 2 +- + bindings/python/src/models.rs | 40 +- + bindings/python/src/normalizers.rs | 54 +- + bindings/python/src/pre_tokenizers.rs | 52 +- + bindings/python/src/processors.rs | 24 +- + bindings/python/src/token.rs | 2 +- + bindings/python/src/tokenizer.rs | 57 +- + bindings/python/src/trainers.rs | 16 +- + bindings/python/src/utils/iterators.rs | 2 +- + bindings/python/src/utils/normalization.rs | 40 +- + bindings/python/src/utils/pretokenization.rs | 16 +- + bindings/python/src/utils/regex.rs | 4 +- + 17 files changed, 2529 insertions(+), 345 deletions(-) + create mode 100644 0001-Upgrade-pyo3-to-0.15.patch -diff --git a/bindings/python/.cargo/config.toml b/bindings/python/.cargo/config.toml +diff --git a/0001-Upgrade-pyo3-to-0.15.patch b/0001-Upgrade-pyo3-to-0.15.patch new file mode 100644 -index 0000000..d47f983 +index 0000000..a631890 --- /dev/null -+++ b/bindings/python/.cargo/config.toml -@@ -0,0 +1,11 @@ -+[target.x86_64-apple-darwin] -+rustflags = [ -+ "-C", "link-arg=-undefined", -+ "-C", "link-arg=dynamic_lookup", -+] ++++ b/0001-Upgrade-pyo3-to-0.15.patch +@@ -0,0 +1,2192 @@ ++From 00932743fa1f7eb4a9d7392ea86f9a82e3a19e5c Mon Sep 17 00:00:00 2001 ++From: messense ++Date: Wed, 10 Mar 2021 10:45:50 +0800 ++Subject: [PATCH] Upgrade pyo3 to 0.15 ++ ++Rebased-By: H. Vetinari ++--- ++ bindings/python/.cargo/config.toml | 11 + ++ bindings/python/Cargo.lock | 313 +++++++++---------- ++ bindings/python/Cargo.toml | 10 +- ++ bindings/python/src/decoders.rs | 26 +- ++ bindings/python/src/encoding.rs | 24 +- ++ bindings/python/src/error.rs | 2 +- ++ bindings/python/src/models.rs | 40 +-- ++ bindings/python/src/normalizers.rs | 54 ++-- ++ bindings/python/src/pre_tokenizers.rs | 52 +-- ++ bindings/python/src/processors.rs | 24 +- ++ bindings/python/src/token.rs | 2 +- ++ bindings/python/src/tokenizer.rs | 57 ++-- ++ bindings/python/src/trainers.rs | 16 +- ++ bindings/python/src/utils/iterators.rs | 2 +- ++ bindings/python/src/utils/normalization.rs | 40 +-- ++ bindings/python/src/utils/pretokenization.rs | 16 +- ++ bindings/python/src/utils/regex.rs | 4 +- ++ 17 files changed, 348 insertions(+), 345 deletions(-) ++ create mode 100644 bindings/python/.cargo/config.toml ++ ++diff --git a/bindings/python/.cargo/config.toml b/bindings/python/.cargo/config.toml ++new file mode 100644 ++index 0000000..d47f983 ++--- /dev/null +++++ b/bindings/python/.cargo/config.toml ++@@ -0,0 +1,11 @@ +++[target.x86_64-apple-darwin] +++rustflags = [ +++ "-C", "link-arg=-undefined", +++ "-C", "link-arg=dynamic_lookup", +++] +++ +++[target.aarch64-apple-darwin] +++rustflags = [ +++ "-C", "link-arg=-undefined", +++ "-C", "link-arg=dynamic_lookup", +++] ++diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock ++index 823f4a2..286cd68 100644 ++--- a/bindings/python/Cargo.lock +++++ b/bindings/python/Cargo.lock ++@@ -90,9 +90,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "bumpalo" ++-version = "3.7.0" +++version = "3.8.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" +++checksum = "8f1e260c3a9040a7c19a12468758f4c16f31a81a1fe087482be9570ec864bb6c" ++ ++ [[package]] ++ name = "byteorder" ++@@ -152,9 +152,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "cc" ++-version = "1.0.70" +++version = "1.0.71" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0" +++checksum = "79c2681d6594606957bbb8631c4b90a7fcaaa72cdb714743a437b156d6a7eedd" ++ ++ [[package]] ++ name = "cfg-if" ++@@ -185,13 +185,13 @@ dependencies = [ ++ ++ [[package]] ++ name = "console" ++-version = "0.14.1" +++version = "0.15.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "3993e6445baa160675931ec041a5e03ca84b9c6e32a056150d3aa2bdda0a1f45" +++checksum = "a28b32d32ca44b70c3e4acd7db1babf555fa026e385fb95f18028f88848b3c31" ++ dependencies = [ ++ "encode_unicode", ++- "lazy_static", ++ "libc", +++ "once_cell", ++ "regex", ++ "terminal_size", ++ "unicode-width", ++@@ -200,9 +200,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "core-foundation" ++-version = "0.9.1" +++version = "0.9.2" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" +++checksum = "6888e10551bb93e424d8df1d07f1a8b4fceb0001a3a4b048bfc47554946f47b3" ++ dependencies = [ ++ "core-foundation-sys", ++ "libc", ++@@ -210,9 +210,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "core-foundation-sys" ++-version = "0.8.2" +++version = "0.8.3" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" +++checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" ++ ++ [[package]] ++ name = "cpufeatures" ++@@ -276,16 +276,6 @@ dependencies = [ ++ "lazy_static", ++ ] ++ ++-[[package]] ++-name = "ctor" ++-version = "0.1.21" ++-source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "ccc0a48a9b826acdf4028595adc9db92caea352f7af011a3034acd172a52a0aa" ++-dependencies = [ ++- "quote", ++- "syn", ++-] ++- ++ [[package]] ++ name = "darling" ++ version = "0.10.2" ++@@ -389,9 +379,9 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" ++ ++ [[package]] ++ name = "encoding_rs" ++-version = "0.8.28" +++version = "0.8.29" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +++checksum = "a74ea89a0a1b98f6332de42c95baff457ada66d1cb4030f9ff151b2041a1c746" ++ dependencies = [ ++ "cfg-if 1.0.0", ++ ] ++@@ -432,9 +422,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "flate2" ++-version = "1.0.21" +++version = "1.0.22" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "80edafed416a46fb378521624fab1cfa2eb514784fd8921adbe8a8d8321da811" +++checksum = "1e6988e897c1c9c485f43b47a529cef42fde0547f9d8d41a7062518f1d8fc53f" ++ dependencies = [ ++ "cfg-if 1.0.0", ++ "crc32fast", ++@@ -570,17 +560,6 @@ dependencies = [ ++ "wasi 0.10.2+wasi-snapshot-preview1", ++ ] ++ ++-[[package]] ++-name = "ghost" ++-version = "0.1.2" ++-source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "1a5bcf1bbeab73aa4cf2fde60a846858dc036163c7c33bec309f8d17de785479" ++-dependencies = [ ++- "proc-macro2", ++- "quote", ++- "syn", ++-] ++- ++ [[package]] ++ name = "glob" ++ version = "0.3.0" ++@@ -589,9 +568,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" ++ ++ [[package]] ++ name = "h2" ++-version = "0.3.4" +++version = "0.3.7" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "d7f3675cfef6a30c8031cf9e6493ebdc3bb3272a3fea3923c4210d1830e6a472" +++checksum = "7fd819562fcebdac5afc5c113c3ec36f902840b70fd4fc458799c8ce4607ae55" ++ dependencies = [ ++ "bytes", ++ "fnv", ++@@ -623,9 +602,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "http" ++-version = "0.2.4" +++version = "0.2.5" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" +++checksum = "1323096b05d41827dadeaee54c9981958c0f94e670bc94ed80037d1a7b8b186b" ++ dependencies = [ ++ "bytes", ++ "fnv", ++@@ -634,9 +613,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "http-body" ++-version = "0.4.3" +++version = "0.4.4" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "399c583b2979440c60be0821a6199eca73bc3c8dcd9d070d75ac726e2c6186e5" +++checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" ++ dependencies = [ ++ "bytes", ++ "http", ++@@ -666,9 +645,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "hyper" ++-version = "0.14.12" +++version = "0.14.14" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "13f67199e765030fa08fe0bd581af683f0d5bc04ea09c2b1102012c5fb90e7fd" +++checksum = "2b91bb1f221b6ea1f1e4371216b70f40748774c2fb5971b450c07773fb92d26b" ++ dependencies = [ ++ "bytes", ++ "futures-channel", ++@@ -765,35 +744,13 @@ dependencies = [ ++ ++ [[package]] ++ name = "instant" ++-version = "0.1.10" +++version = "0.1.12" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "bee0328b1209d157ef001c94dd85b4f8f64139adb0eac2659f4b08382b2f474d" +++checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" ++ dependencies = [ ++ "cfg-if 1.0.0", ++ ] ++ ++-[[package]] ++-name = "inventory" ++-version = "0.1.10" ++-source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "0f0f7efb804ec95e33db9ad49e4252f049e37e8b0a4652e3cd61f7999f2eff7f" ++-dependencies = [ ++- "ctor", ++- "ghost", ++- "inventory-impl", ++-] ++- ++-[[package]] ++-name = "inventory-impl" ++-version = "0.1.10" ++-source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "75c094e94816723ab936484666968f5b58060492e880f3c8d00489a1e244fa51" ++-dependencies = [ ++- "proc-macro2", ++- "quote", ++- "syn", ++-] ++- ++ [[package]] ++ name = "ipnet" ++ version = "2.3.1" ++@@ -826,9 +783,9 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" ++ ++ [[package]] ++ name = "js-sys" ++-version = "0.3.53" +++version = "0.3.55" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "e4bf49d50e2961077d9c99f4b7997d770a1114f087c3c2e0069b36c13fc2979d" +++checksum = "7cc9ffccd38c451a86bf13657df244e9c3f37493cce8e5e21e940963777acc84" ++ dependencies = [ ++ "wasm-bindgen", ++ ] ++@@ -854,9 +811,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "libc" ++-version = "0.2.101" +++version = "0.2.107" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" +++checksum = "fbe5e23404da5b4f555ef85ebed98fb4083e55a00c317800bc2a50ede9f3d219" ++ ++ [[package]] ++ name = "lock_api" ++@@ -891,6 +848,15 @@ dependencies = [ ++ "rawpointer", ++ ] ++ +++[[package]] +++name = "matrixmultiply" +++version = "0.3.1" +++source = "registry+https://github.com/rust-lang/crates.io-index" +++checksum = "5a8a15b776d9dfaecd44b03c5828c2199cddff5247215858aac14624f8d6b741" +++dependencies = [ +++ "rawpointer", +++] +++ ++ [[package]] ++ name = "memchr" ++ version = "2.3.4" ++@@ -924,9 +890,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "mio" ++-version = "0.7.13" +++version = "0.7.14" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" +++checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc" ++ dependencies = [ ++ "libc", ++ "log", ++@@ -968,8 +934,21 @@ version = "0.13.1" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++ checksum = "ac06db03ec2f46ee0ecdca1a1c34a99c0d188a0d83439b84bf0cb4b386e4ab09" ++ dependencies = [ ++- "matrixmultiply", ++- "num-complex", +++ "matrixmultiply 0.2.4", +++ "num-complex 0.2.4", +++ "num-integer", +++ "num-traits", +++ "rawpointer", +++] +++ +++[[package]] +++name = "ndarray" +++version = "0.15.3" +++source = "registry+https://github.com/rust-lang/crates.io-index" +++checksum = "08e854964160a323e65baa19a0b1a027f76d590faba01f05c0cbc3187221a8c9" +++dependencies = [ +++ "matrixmultiply 0.3.1", +++ "num-complex 0.4.0", ++ "num-integer", ++ "num-traits", ++ "rawpointer", ++@@ -1007,6 +986,15 @@ dependencies = [ ++ "num-traits", ++ ] ++ +++[[package]] +++name = "num-complex" +++version = "0.4.0" +++source = "registry+https://github.com/rust-lang/crates.io-index" +++checksum = "26873667bbbb7c5182d4a37c1add32cdf09f841af72da53318fdb81543c15085" +++dependencies = [ +++ "num-traits", +++] +++ ++ [[package]] ++ name = "num-integer" ++ version = "0.1.44" ++@@ -1044,14 +1032,14 @@ checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" ++ ++ [[package]] ++ name = "numpy" ++-version = "0.12.2" +++version = "0.15.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "9fd9e8e652becf4ba6c11803945f8bf463c23f482f704bb33f70ae9d22482d10" +++checksum = "e590538dba8432d54d3587b06df73d7c044e83cfa4b200cbc7d0567f924ac0a7" ++ dependencies = [ ++ "cfg-if 0.1.10", ++ "libc", ++- "ndarray", ++- "num-complex", +++ "ndarray 0.15.3", +++ "num-complex 0.4.0", ++ "num-traits", ++ "pyo3", ++ ] ++@@ -1064,9 +1052,9 @@ checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" ++ ++ [[package]] ++ name = "onig" ++-version = "6.2.0" +++version = "6.3.1" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "b16fd3c0e73b516af509c13c4ba76ec0c987ce20d78b38cff356b8d01fc6a6c0" +++checksum = "67ddfe2c93bb389eea6e6d713306880c7f6dcc99a75b659ce145d962c861b225" ++ dependencies = [ ++ "bitflags", ++ "lazy_static", ++@@ -1076,9 +1064,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "onig_sys" ++-version = "69.7.0" +++version = "69.7.1" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "9fd9442a09e4fbd08d196ddf419b2c79a43c3a46c800320cc841d45c2449a240" +++checksum = "5dd3eee045c84695b53b20255bb7317063df090b68e18bfac0abb6c39cf7f33e" ++ dependencies = [ ++ "cc", ++ "pkg-config", ++@@ -1092,9 +1080,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" ++ ++ [[package]] ++ name = "openssl" ++-version = "0.10.36" +++version = "0.10.38" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "8d9facdb76fec0b73c406f125d44d86fdad818d66fef0531eec9233ca425ff4a" +++checksum = "0c7ae222234c30df141154f159066c5093ff73b63204dcda7121eb082fc56a95" ++ dependencies = [ ++ "bitflags", ++ "cfg-if 1.0.0", ++@@ -1112,9 +1100,9 @@ checksum = "28988d872ab76095a6e6ac88d99b54fd267702734fd7ffe610ca27f533ddb95a" ++ ++ [[package]] ++ name = "openssl-sys" ++-version = "0.9.66" +++version = "0.9.70" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "1996d2d305e561b70d1ee0c53f1542833f4e1ac6ce9a6708b6ff2738ca67dc82" +++checksum = "c6517987b3f8226b5da3661dad65ff7f300cc59fb5ea8333ca191fc65fde3edf" ++ dependencies = [ ++ "autocfg", ++ "cc", ++@@ -1193,15 +1181,15 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" ++ ++ [[package]] ++ name = "pkg-config" ++-version = "0.3.19" +++version = "0.3.22" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" +++checksum = "12295df4f294471248581bc09bef3c38a5e46f1e36d6a37353621a0c6c357e1f" ++ ++ [[package]] ++ name = "ppv-lite86" ++-version = "0.2.10" +++version = "0.2.15" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" +++checksum = "ed0cfbc8191465bed66e1718596ee0b0b35d5ee1f41c5df2189d0fe8bde535ba" ++ ++ [[package]] ++ name = "proc-macro-hack" ++@@ -1211,9 +1199,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" ++ ++ [[package]] ++ name = "proc-macro2" ++-version = "1.0.29" +++version = "1.0.32" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" +++checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" ++ dependencies = [ ++ "unicode-xid", ++ ] ++@@ -1224,38 +1212,48 @@ version = "0.1.0" ++ ++ [[package]] ++ name = "pyo3" ++-version = "0.12.4" +++version = "0.15.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "bf6bbbe8f70d179260b3728e5d04eb012f4f0c7988e58c11433dd689cecaa72e" +++checksum = "64664505ce285a59b8b7e940fbe54ad65b1758a0810eddc5bc26df6f6ec8c557" ++ dependencies = [ ++- "ctor", +++ "cfg-if 1.0.0", ++ "indoc", ++- "inventory", ++ "libc", ++ "parking_lot", ++ "paste 0.1.18", ++- "pyo3cls", +++ "pyo3-build-config", +++ "pyo3-macros", ++ "unindent", ++ ] ++ ++ [[package]] ++-name = "pyo3-derive-backend" ++-version = "0.12.4" +++name = "pyo3-build-config" +++version = "0.15.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "10ecd0eb6ed7b3d9965b4f4370b5b9e99e3e5e8742000e1c452c018f8c2a322f" +++checksum = "5f1e4a72de84cdcd69f62211b62f62753d0c11b7b5715f3467f3754dab22a7ca" ++ dependencies = [ ++- "proc-macro2", +++ "once_cell", +++] +++ +++[[package]] +++name = "pyo3-macros" +++version = "0.15.0" +++source = "registry+https://github.com/rust-lang/crates.io-index" +++checksum = "244f21d0a3887a9c02018b94e3b78d693dc7eca5c56839b7796a499cc364deb4" +++dependencies = [ +++ "pyo3-macros-backend", ++ "quote", ++ "syn", ++ ] ++ ++ [[package]] ++-name = "pyo3cls" ++-version = "0.12.4" +++name = "pyo3-macros-backend" +++version = "0.15.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "d344fdaa6a834a06dd1720ff104ea12fe101dad2e8db89345af9db74c0bb11a0" +++checksum = "b3d3d18ac41d05199bb82645d56e39f8c8b4909a0137c6f2640f03685b29f672" ++ dependencies = [ ++- "pyo3-derive-backend", +++ "proc-macro2", +++ "pyo3-build-config", ++ "quote", ++ "syn", ++ ] ++@@ -1268,9 +1266,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" ++ ++ [[package]] ++ name = "quote" ++-version = "1.0.9" +++version = "1.0.10" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +++checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" ++ dependencies = [ ++ "proc-macro2", ++ ] ++@@ -1451,9 +1449,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "reqwest" ++-version = "0.11.4" +++version = "0.11.6" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22" +++checksum = "66d2927ca2f685faf0fc620ac4834690d29e7abb153add10f5812eef20b5e280" ++ dependencies = [ ++ "base64 0.13.0", ++ "bytes", ++@@ -1473,6 +1471,7 @@ dependencies = [ ++ "percent-encoding", ++ "pin-project-lite", ++ "serde", +++ "serde_json", ++ "serde_urlencoded", ++ "tokio", ++ "tokio-native-tls", ++@@ -1550,9 +1549,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "serde_json" ++-version = "1.0.67" +++version = "1.0.69" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950" +++checksum = "e466864e431129c7e0d3476b92f20458e5879919a0596c6472738d9fa2d342f8" ++ dependencies = [ ++ "itoa", ++ "ryu", ++@@ -1573,9 +1572,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "sha2" ++-version = "0.9.6" +++version = "0.9.8" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "9204c41a1597a8c5af23c82d1c921cb01ec0a4c59e07a9c7306062829a3903f3" +++checksum = "b69f9a4c9740d74c5baa3fd2e547f9525fa8088a8a958e0ca2409a514e33f5fa" ++ dependencies = [ ++ "block-buffer", ++ "cfg-if 1.0.0", ++@@ -1586,21 +1585,21 @@ dependencies = [ ++ ++ [[package]] ++ name = "slab" ++-version = "0.4.4" +++version = "0.4.5" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "c307a32c1c5c437f38c7fd45d753050587732ba8628319fbdf12a7e289ccc590" +++checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" ++ ++ [[package]] ++ name = "smallvec" ++-version = "1.6.1" +++version = "1.7.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" +++checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" ++ ++ [[package]] ++ name = "socket2" ++-version = "0.4.1" +++version = "0.4.2" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "765f090f0e423d2b55843402a07915add955e7d60657db13707a159727326cad" +++checksum = "5dc90fe6c7be1a323296982db1836d1ea9e47b6839496dde9a541bc496df3516" ++ dependencies = [ ++ "libc", ++ "winapi", ++@@ -1638,9 +1637,9 @@ checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c" ++ ++ [[package]] ++ name = "syn" ++-version = "1.0.76" +++version = "1.0.81" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "c6f107db402c2c2055242dbf4d2af0e69197202e9faacbef9571bbe47f5a1b84" +++checksum = "f2afee18b8beb5a596ecb4a2dce128c719b4ba399d34126b9e4396e3f9860966" ++ dependencies = [ ++ "proc-macro2", ++ "quote", ++@@ -1708,18 +1707,18 @@ dependencies = [ ++ ++ [[package]] ++ name = "thiserror" ++-version = "1.0.29" +++version = "1.0.30" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88" +++checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" ++ dependencies = [ ++ "thiserror-impl", ++ ] ++ ++ [[package]] ++ name = "thiserror-impl" ++-version = "1.0.29" +++version = "1.0.30" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c" +++checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" ++ dependencies = [ ++ "proc-macro2", ++ "quote", ++@@ -1738,9 +1737,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "tinyvec" ++-version = "1.3.1" +++version = "1.5.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "848a1e1181b9f6753b5e96a092749e29b11d19ede67dfbbd6c7dc7e0f49b5338" +++checksum = "f83b2a3d4d9091d0abd7eba4dc2710b1718583bd4d8992e2190720ea38f391f7" ++ dependencies = [ ++ "tinyvec_macros", ++ ] ++@@ -1789,7 +1788,7 @@ dependencies = [ ++ "env_logger", ++ "itertools 0.9.0", ++ "libc", ++- "ndarray", +++ "ndarray 0.13.1", ++ "numpy", ++ "onig", ++ "pyo3", ++@@ -1802,9 +1801,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "tokio" ++-version = "1.11.0" +++version = "1.13.0" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "b4efe6fc2395938c8155973d7be49fe8d03a843726e285e100a8a383cc0154ce" +++checksum = "588b2d10a336da58d877567cd8fb8a14b463e2104910f8132cd054b4b96e29ee" ++ dependencies = [ ++ "autocfg", ++ "bytes", ++@@ -1828,9 +1827,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "tokio-util" ++-version = "0.6.8" +++version = "0.6.9" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "08d3725d3efa29485e87311c5b699de63cde14b00ed4d256b8318aa30ca452cd" +++checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" ++ dependencies = [ ++ "bytes", ++ "futures-core", ++@@ -1848,9 +1847,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" ++ ++ [[package]] ++ name = "tracing" ++-version = "0.1.26" +++version = "0.1.29" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" +++checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105" ++ dependencies = [ ++ "cfg-if 1.0.0", ++ "pin-project-lite", ++@@ -1859,9 +1858,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "tracing-core" ++-version = "0.1.19" +++version = "0.1.21" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "2ca517f43f0fb96e0c3072ed5c275fe5eece87e8cb52f4a77b69226d3b1c9df8" +++checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4" ++ dependencies = [ ++ "lazy_static", ++ ] ++@@ -1880,9 +1879,9 @@ checksum = "b63708a265f51345575b27fe43f9500ad611579e764c79edbc2037b1121959ec" ++ ++ [[package]] ++ name = "unicode-bidi" ++-version = "0.3.6" +++version = "0.3.7" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "246f4c42e67e7a4e3c6106ff716a5d067d4132a642840b242e357e468a2a0085" +++checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" ++ ++ [[package]] ++ name = "unicode-normalization" ++@@ -1910,9 +1909,9 @@ checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" ++ ++ [[package]] ++ name = "unicode-width" ++-version = "0.1.8" +++version = "0.1.9" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" +++checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" ++ ++ [[package]] ++ name = "unicode-xid" ++@@ -1986,21 +1985,19 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" ++ ++ [[package]] ++ name = "wasm-bindgen" ++-version = "0.2.76" +++version = "0.2.78" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "8ce9b1b516211d33767048e5d47fa2a381ed8b76fc48d2ce4aa39877f9f183e0" +++checksum = "632f73e236b219150ea279196e54e610f5dbafa5d61786303d4da54f84e47fce" ++ dependencies = [ ++ "cfg-if 1.0.0", ++- "serde", ++- "serde_json", ++ "wasm-bindgen-macro", ++ ] ++ ++ [[package]] ++ name = "wasm-bindgen-backend" ++-version = "0.2.76" +++version = "0.2.78" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "cfe8dc78e2326ba5f845f4b5bf548401604fa20b1dd1d365fb73b6c1d6364041" +++checksum = "a317bf8f9fba2476b4b2c85ef4c4af8ff39c3c7f0cdfeed4f82c34a880aa837b" ++ dependencies = [ ++ "bumpalo", ++ "lazy_static", ++@@ -2013,9 +2010,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "wasm-bindgen-futures" ++-version = "0.4.26" +++version = "0.4.28" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "95fded345a6559c2cfee778d562300c581f7d4ff3edb9b0d230d69800d213972" +++checksum = "8e8d7523cb1f2a4c96c1317ca690031b714a51cc14e05f712446691f413f5d39" ++ dependencies = [ ++ "cfg-if 1.0.0", ++ "js-sys", ++@@ -2025,9 +2022,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "wasm-bindgen-macro" ++-version = "0.2.76" +++version = "0.2.78" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "44468aa53335841d9d6b6c023eaab07c0cd4bddbcfdee3e2bb1e8d2cb8069fef" +++checksum = "d56146e7c495528bf6587663bea13a8eb588d39b36b679d83972e1a2dbbdacf9" ++ dependencies = [ ++ "quote", ++ "wasm-bindgen-macro-support", ++@@ -2035,9 +2032,9 @@ dependencies = [ ++ ++ [[package]] ++ name = "wasm-bindgen-macro-support" ++-version = "0.2.76" +++version = "0.2.78" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "0195807922713af1e67dc66132c7328206ed9766af3858164fb583eedc25fbad" +++checksum = "7803e0eea25835f8abdc585cd3021b3deb11543c6fe226dcd30b228857c5c5ab" ++ dependencies = [ ++ "proc-macro2", ++ "quote", ++@@ -2048,15 +2045,15 @@ dependencies = [ ++ ++ [[package]] ++ name = "wasm-bindgen-shared" ++-version = "0.2.76" +++version = "0.2.78" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "acdb075a845574a1fa5f09fd77e43f7747599301ea3417a9fbffdeedfc1f4a29" +++checksum = "0237232789cf037d5480773fe568aac745bfe2afbc11a863e97901780a6b47cc" ++ ++ [[package]] ++ name = "web-sys" ++-version = "0.3.53" +++version = "0.3.55" ++ source = "registry+https://github.com/rust-lang/crates.io-index" ++-checksum = "224b2f6b67919060055ef1a67807367c2066ed520c3862cc013d26cf893a783c" +++checksum = "38eb105f1c59d9eaa6b5cdc92b859d85b926e82cb2e0945cd0c9259faa6fe9fb" ++ dependencies = [ ++ "js-sys", ++ "wasm-bindgen", ++diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml ++index 7ac0dd9..9123be2 100644 ++--- a/bindings/python/Cargo.toml +++++ b/bindings/python/Cargo.toml ++@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]} ++ serde_json = "1.0" ++ libc = "0.2" ++ env_logger = "0.7.1" ++-pyo3 = "0.12" ++-numpy = "0.12" +++pyo3 = "0.15.0" +++numpy = "0.15.0" ++ ndarray = "0.13" ++ onig = { version = "6.0", default-features = false } ++ itertools = "0.9" ++@@ -29,9 +29,3 @@ tempfile = "3.1" ++ ++ [features] ++ default = ["pyo3/extension-module"] ++- ++-[target.x86_64-apple-darwin] ++-rustflags = [ ++- "-C", "link-arg=-undefined", ++- "-C", "link-arg=dynamic_lookup", ++-] ++diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs ++index 5f15838..a5524dc 100644 ++--- a/bindings/python/src/decoders.rs +++++ b/bindings/python/src/decoders.rs ++@@ -21,7 +21,7 @@ use super::error::ToPyResult; ++ /// ++ /// This class is not supposed to be instantiated directly. Instead, any implementation of ++ /// a Decoder will return an instance of this class when instantiated. ++-#[pyclass(dict, module = "tokenizers.decoders", name=Decoder)] +++#[pyclass(dict, module = "tokenizers.decoders", name = "Decoder")] ++ #[derive(Clone, Deserialize, Serialize)] ++ pub struct PyDecoder { ++ #[serde(flatten)] ++@@ -97,7 +97,7 @@ impl PyDecoder { ++ /// ++ /// Returns: ++ /// :obj:`str`: The decoded string ++- #[text_signature = "(self, tokens)"] +++ #[pyo3(text_signature = "(self, tokens)")] ++ fn decode(&self, tokens: Vec) -> PyResult { ++ ToPyResult(self.decoder.decode(tokens)).into() ++ } ++@@ -141,8 +141,8 @@ macro_rules! setter { ++ /// ++ /// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel` ++ /// :class:`~tokenizers.pre_tokenizers.PreTokenizer`. ++-#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=ByteLevel)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteLevel")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyByteLevelDec {} ++ #[pymethods] ++ impl PyByteLevelDec { ++@@ -161,8 +161,8 @@ impl PyByteLevelDec { ++ /// cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`): ++ /// Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, ++ /// and some abbreviated english forms. ++-#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=WordPiece)] ++-#[text_signature = "(self, prefix=\"##\", cleanup=True)"] +++#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "WordPiece")] +++#[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")] ++ pub struct PyWordPieceDec {} ++ #[pymethods] ++ impl PyWordPieceDec { ++@@ -203,8 +203,8 @@ impl PyWordPieceDec { ++ /// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): ++ /// Whether to add a space to the first word if there isn't already one. This ++ /// lets us treat `hello` exactly like `say hello`. ++-#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=Metaspace)] ++-#[text_signature = "(self, replacement = \"▁\", add_prefix_space = True)"] +++#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")] +++#[pyo3(text_signature = "(self, replacement = \"▁\", add_prefix_space = True)")] ++ pub struct PyMetaspaceDec {} ++ #[pymethods] ++ impl PyMetaspaceDec { ++@@ -244,8 +244,8 @@ impl PyMetaspaceDec { ++ /// suffix (:obj:`str`, `optional`, defaults to :obj:``): ++ /// The suffix that was used to caracterize an end-of-word. This suffix will ++ /// be replaced by whitespaces during the decoding ++-#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=BPEDecoder)] ++-#[text_signature = "(self, suffix=\"\")"] +++#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")] +++#[pyo3(text_signature = "(self, suffix=\"\")")] ++ pub struct PyBPEDecoder {} ++ #[pymethods] ++ impl PyBPEDecoder { ++@@ -276,8 +276,8 @@ impl PyBPEDecoder { ++ /// cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`): ++ /// Whether to cleanup some tokenization artifacts. ++ /// Mainly spaces before punctuation, and some abbreviated english forms. ++-#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=CTC)] ++-#[text_signature = "(self, pad_token=\"\", word_delimiter_token=\"|\", cleanup=True)"] +++#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "CTC")] +++#[pyo3(text_signature = "(self, pad_token=\"\", word_delimiter_token=\"|\", cleanup=True)")] ++ pub struct PyCTCDecoder {} ++ #[pymethods] ++ impl PyCTCDecoder { ++@@ -421,7 +421,7 @@ mod test { ++ let gil = Python::acquire_gil(); ++ assert_eq!( ++ "tokenizers.decoders.Metaspace", ++- py_meta.as_ref(gil.python()).get_type().name() +++ py_meta.as_ref(gil.python()).get_type().name().unwrap() ++ ); ++ } ++ ++diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs ++index 976b2c6..c4e2743 100644 ++--- a/bindings/python/src/encoding.rs +++++ b/bindings/python/src/encoding.rs ++@@ -9,7 +9,7 @@ use tokenizers as tk; ++ use crate::error::{deprecation_warning, PyError}; ++ ++ /// The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`. ++-#[pyclass(dict, module = "tokenizers", name=Encoding)] +++#[pyclass(dict, module = "tokenizers", name = "Encoding")] ++ #[repr(transparent)] ++ pub struct PyEncoding { ++ pub encoding: tk::tokenizer::Encoding, ++@@ -86,7 +86,7 @@ impl PyEncoding { ++ /// :class:`~tokenizers.Encoding`: The resulting Encoding ++ #[staticmethod] ++ #[args(growing_offsets = true)] ++- #[text_signature = "(encodings, growing_offsets=True)"] +++ #[pyo3(text_signature = "(encodings, growing_offsets=True)")] ++ fn merge(encodings: Vec>, growing_offsets: bool) -> PyEncoding { ++ tk::tokenizer::Encoding::merge( ++ encodings.into_iter().map(|e| e.encoding.clone()), ++@@ -108,7 +108,7 @@ impl PyEncoding { ++ /// ++ /// Set the given sequence index for the whole range of tokens contained in this ++ /// :class:`~tokenizers.Encoding`. ++- #[text_signature = "(self, sequence_id)"] +++ #[pyo3(text_signature = "(self, sequence_id)")] ++ fn set_sequence_id(&mut self, sequence_id: usize) { ++ self.encoding.set_sequence_id(sequence_id); ++ } ++@@ -270,7 +270,7 @@ impl PyEncoding { ++ /// Returns: ++ /// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` ++ #[args(sequence_index = 0)] ++- #[text_signature = "(self, word_index, sequence_index=0)"] +++ #[pyo3(text_signature = "(self, word_index, sequence_index=0)")] ++ fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> { ++ self.encoding.word_to_tokens(word_index, sequence_index) ++ } ++@@ -286,7 +286,7 @@ impl PyEncoding { ++ /// Returns: ++ /// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` ++ #[args(sequence_index = 0)] ++- #[text_signature = "(self, word_index, sequence_index=0)"] +++ #[pyo3(text_signature = "(self, word_index, sequence_index=0)")] ++ fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option { ++ self.encoding.word_to_chars(word_index, sequence_index) ++ } ++@@ -302,7 +302,7 @@ impl PyEncoding { ++ /// ++ /// Returns: ++ /// :obj:`int`: The sequence id of the given token ++- #[text_signature = "(self, token_index)"] +++ #[pyo3(text_signature = "(self, token_index)")] ++ fn token_to_sequence(&self, token_index: usize) -> Option { ++ self.encoding.token_to_sequence(token_index) ++ } ++@@ -319,7 +319,7 @@ impl PyEncoding { ++ /// ++ /// Returns: ++ /// :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` ++- #[text_signature = "(self, token_index)"] +++ #[pyo3(text_signature = "(self, token_index)")] ++ fn token_to_chars(&self, token_index: usize) -> Option { ++ let (_, offsets) = self.encoding.token_to_chars(token_index)?; ++ Some(offsets) ++@@ -337,7 +337,7 @@ impl PyEncoding { ++ /// ++ /// Returns: ++ /// :obj:`int`: The index of the word in the relevant input sequence. ++- #[text_signature = "(self, token_index)"] +++ #[pyo3(text_signature = "(self, token_index)")] ++ fn token_to_word(&self, token_index: usize) -> Option { ++ let (_, word_idx) = self.encoding.token_to_word(token_index)?; ++ Some(word_idx) ++@@ -354,7 +354,7 @@ impl PyEncoding { ++ /// Returns: ++ /// :obj:`int`: The index of the token that contains this char in the encoded sequence ++ #[args(sequence_index = 0)] ++- #[text_signature = "(self, char_pos, sequence_index=0)"] +++ #[pyo3(text_signature = "(self, char_pos, sequence_index=0)")] ++ fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option { ++ self.encoding.char_to_token(char_pos, sequence_index) ++ } ++@@ -370,7 +370,7 @@ impl PyEncoding { ++ /// Returns: ++ /// :obj:`int`: The index of the word that contains this char in the input sequence ++ #[args(sequence_index = 0)] ++- #[text_signature = "(self, char_pos, sequence_index=0)"] +++ #[pyo3(text_signature = "(self, char_pos, sequence_index=0)")] ++ fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option { ++ self.encoding.char_to_word(char_pos, sequence_index) ++ } ++@@ -393,7 +393,7 @@ impl PyEncoding { ++ /// pad_token (:obj:`str`, defaults to `[PAD]`): ++ /// The pad token to use ++ #[args(kwargs = "**")] ++- #[text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"] +++ #[pyo3(text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')")] ++ fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> { ++ let mut pad_id = 0; ++ let mut pad_type_id = 0; ++@@ -445,7 +445,7 @@ impl PyEncoding { ++ /// Truncate direction ++ #[args(stride = "0")] ++ #[args(direction = "\"right\"")] ++- #[text_signature = "(self, max_length, stride=0, direction='right')"] +++ #[pyo3(text_signature = "(self, max_length, stride=0, direction='right')")] ++ fn truncate(&mut self, max_length: usize, stride: usize, direction: &str) -> PyResult<()> { ++ let tdir = match direction { ++ "left" => Ok(TruncationDirection::Left), ++diff --git a/bindings/python/src/error.rs b/bindings/python/src/error.rs ++index a6bcaf3..1e8c5a1 100644 ++--- a/bindings/python/src/error.rs +++++ b/bindings/python/src/error.rs ++@@ -37,7 +37,7 @@ impl ToPyResult { ++ pub(crate) fn deprecation_warning(version: &str, message: &str) -> PyResult<()> { ++ let gil = pyo3::Python::acquire_gil(); ++ let python = gil.python(); ++- let deprecation_warning = python.import("builtins")?.get("DeprecationWarning")?; +++ let deprecation_warning = python.import("builtins")?.getattr("DeprecationWarning")?; ++ let full_message = format!("Deprecated in {}: {}", version, message); ++ pyo3::PyErr::warn(python, deprecation_warning, &full_message, 0) ++ } ++diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs ++index e7aaa79..779d66e 100644 ++--- a/bindings/python/src/models.rs +++++ b/bindings/python/src/models.rs ++@@ -24,7 +24,7 @@ use super::error::{deprecation_warning, ToPyResult}; ++ /// will contain and manage the learned vocabulary. ++ /// ++ /// This class cannot be constructed directly. Please use one of the concrete models. ++-#[pyclass(module = "tokenizers.models", name=Model)] +++#[pyclass(module = "tokenizers.models", name = "Model")] ++ #[derive(Clone, Serialize, Deserialize)] ++ pub struct PyModel { ++ #[serde(flatten)] ++@@ -132,7 +132,7 @@ impl PyModel { ++ /// ++ /// Returns: ++ /// A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens ++- #[text_signature = "(self, sequence)"] +++ #[pyo3(text_signature = "(self, sequence)")] ++ fn tokenize(&self, sequence: &str) -> PyResult> { ++ Ok(ToPyResult(self.model.read().unwrap().tokenize(sequence)) ++ .into_py()? ++@@ -149,7 +149,7 @@ impl PyModel { ++ /// ++ /// Returns: ++ /// :obj:`int`: The ID associated to the token ++- #[text_signature = "(self, tokens)"] +++ #[pyo3(text_signature = "(self, tokens)")] ++ fn token_to_id(&self, token: &str) -> Option { ++ self.model.read().unwrap().token_to_id(token) ++ } ++@@ -162,7 +162,7 @@ impl PyModel { ++ /// ++ /// Returns: ++ /// :obj:`str`: The token associated to the ID ++- #[text_signature = "(self, id)"] +++ #[pyo3(text_signature = "(self, id)")] ++ fn id_to_token(&self, id: u32) -> Option { ++ self.model.read().unwrap().id_to_token(id) ++ } ++@@ -182,7 +182,7 @@ impl PyModel { ++ /// ++ /// Returns: ++ /// :obj:`List[str]`: The list of saved files ++- #[text_signature = "(self, folder, prefix)"] +++ #[pyo3(text_signature = "(self, folder, prefix)")] ++ fn save<'a>( ++ &self, ++ folder: &str, ++@@ -248,8 +248,8 @@ impl PyModel { ++ /// ++ /// fuse_unk (:obj:`bool`, `optional`): ++ /// Whether to fuse any subsequent unknown tokens into a single one ++-#[pyclass(extends=PyModel, module = "tokenizers.models", name=BPE)] ++-#[text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)"] +++#[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")] +++#[pyo3(text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)")] ++ pub struct PyBPE {} ++ ++ impl PyBPE { ++@@ -437,7 +437,7 @@ impl PyBPE { ++ /// A :obj:`Tuple` with the vocab and the merges: ++ /// The vocabulary and merges loaded into memory ++ #[staticmethod] ++- #[text_signature = "(self, vocab, merges)"] +++ #[pyo3(text_signature = "(self, vocab, merges)")] ++ fn read_file(vocab: &str, merges: &str) -> PyResult<(Vocab, Merges)> { ++ BPE::read_file(vocab, merges).map_err(|e| { ++ exceptions::PyException::new_err(format!( ++@@ -469,7 +469,7 @@ impl PyBPE { ++ /// :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files ++ #[classmethod] ++ #[args(kwargs = "**")] ++- #[text_signature = "(cls, vocab, merge, **kwargs)"] +++ #[pyo3(text_signature = "(cls, vocab, merge, **kwargs)")] ++ fn from_file( ++ _cls: &PyType, ++ py: Python, ++@@ -502,8 +502,8 @@ impl PyBPE { ++ /// ++ /// max_input_chars_per_word (:obj:`int`, `optional`): ++ /// The maximum number of characters to authorize in a single word. ++-#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordPiece)] ++-#[text_signature = "(self, vocab, unk_token, max_input_chars_per_word)"] +++#[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordPiece")] +++#[pyo3(text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")] ++ pub struct PyWordPiece {} ++ ++ impl PyWordPiece { ++@@ -613,7 +613,7 @@ impl PyWordPiece { ++ /// Returns: ++ /// :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` ++ #[staticmethod] ++- #[text_signature = "(vocab)"] +++ #[pyo3(text_signature = "(vocab)")] ++ fn read_file(vocab: &str) -> PyResult { ++ WordPiece::read_file(vocab).map_err(|e| { ++ exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e)) ++@@ -639,7 +639,7 @@ impl PyWordPiece { ++ /// :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file ++ #[classmethod] ++ #[args(kwargs = "**")] ++- #[text_signature = "(vocab, **kwargs)"] +++ #[pyo3(text_signature = "(vocab, **kwargs)")] ++ fn from_file( ++ _cls: &PyType, ++ py: Python, ++@@ -663,8 +663,8 @@ impl PyWordPiece { ++ /// ++ /// unk_token (:obj:`str`, `optional`): ++ /// The unknown token to be used by the model. ++-#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)] ++-#[text_signature = "(self, vocab, unk_token)"] +++#[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordLevel")] +++#[pyo3(text_signature = "(self, vocab, unk_token)")] ++ pub struct PyWordLevel {} ++ ++ #[pymethods] ++@@ -725,7 +725,7 @@ impl PyWordLevel { ++ /// Returns: ++ /// :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` ++ #[staticmethod] ++- #[text_signature = "(vocab)"] +++ #[pyo3(text_signature = "(vocab)")] ++ fn read_file(vocab: &str) -> PyResult { ++ WordLevel::read_file(vocab).map_err(|e| { ++ exceptions::PyException::new_err(format!("Error while reading WordLevel file: {}", e)) ++@@ -751,7 +751,7 @@ impl PyWordLevel { ++ /// :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file ++ #[classmethod] ++ #[args(unk_token = "None")] ++- #[text_signature = "(vocab, unk_token)"] +++ #[pyo3(text_signature = "(vocab, unk_token)")] ++ fn from_file( ++ _cls: &PyType, ++ py: Python, ++@@ -773,8 +773,8 @@ impl PyWordLevel { ++ /// Args: ++ /// vocab (:obj:`List[Tuple[str, float]]`, `optional`): ++ /// A list of vocabulary items and their relative score [("am", -0.2442),...] ++-#[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)] ++-#[text_signature = "(self, vocab)"] +++#[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")] +++#[pyo3(text_signature = "(self, vocab)")] ++ pub struct PyUnigram {} ++ ++ #[pymethods] ++@@ -810,7 +810,7 @@ mod test { ++ let gil = Python::acquire_gil(); ++ assert_eq!( ++ "tokenizers.models.BPE", ++- py_bpe.as_ref(gil.python()).get_type().name() +++ py_bpe.as_ref(gil.python()).get_type().name().unwrap() ++ ); ++ } ++ ++diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs ++index fb1a5cc..59f9c64 100644 ++--- a/bindings/python/src/normalizers.rs +++++ b/bindings/python/src/normalizers.rs ++@@ -43,7 +43,7 @@ impl PyNormalizedStringMut<'_> { ++ /// ++ /// This class is not supposed to be instantiated directly. Instead, any implementation of a ++ /// Normalizer will return an instance of this class when instantiated. ++-#[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)] +++#[pyclass(dict, module = "tokenizers.normalizers", name = "Normalizer")] ++ #[derive(Clone, Serialize, Deserialize)] ++ pub struct PyNormalizer { ++ #[serde(flatten)] ++@@ -144,7 +144,7 @@ impl PyNormalizer { ++ /// normalized (:class:`~tokenizers.NormalizedString`): ++ /// The normalized string on which to apply this ++ /// :class:`~tokenizers.normalizers.Normalizer` ++- #[text_signature = "(self, normalized)"] +++ #[pyo3(text_signature = "(self, normalized)")] ++ fn normalize(&self, mut normalized: PyNormalizedStringMut) -> PyResult<()> { ++ normalized.normalize_with(&self.normalizer) ++ } ++@@ -162,7 +162,7 @@ impl PyNormalizer { ++ /// ++ /// Returns: ++ /// :obj:`str`: A string after normalization ++- #[text_signature = "(self, sequence)"] +++ #[pyo3(text_signature = "(self, sequence)")] ++ fn normalize_str(&self, sequence: &str) -> PyResult { ++ let mut normalized = NormalizedString::from(sequence); ++ ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?; ++@@ -217,8 +217,8 @@ macro_rules! setter { ++ /// ++ /// lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`): ++ /// Whether to lowercase. ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)] ++-#[text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")] +++#[pyo3(text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)")] ++ pub struct PyBertNormalizer {} ++ #[pymethods] ++ impl PyBertNormalizer { ++@@ -287,8 +287,8 @@ impl PyBertNormalizer { ++ } ++ ++ /// NFD Unicode Normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFD)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFD")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyNFD {} ++ #[pymethods] ++ impl PyNFD { ++@@ -299,8 +299,8 @@ impl PyNFD { ++ } ++ ++ /// NFKD Unicode Normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKD)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKD")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyNFKD {} ++ #[pymethods] ++ impl PyNFKD { ++@@ -311,8 +311,8 @@ impl PyNFKD { ++ } ++ ++ /// NFC Unicode Normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFC)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFC")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyNFC {} ++ #[pymethods] ++ impl PyNFC { ++@@ -323,8 +323,8 @@ impl PyNFC { ++ } ++ ++ /// NFKC Unicode Normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKC)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKC")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyNFKC {} ++ #[pymethods] ++ impl PyNFKC { ++@@ -340,7 +340,7 @@ impl PyNFKC { ++ /// Args: ++ /// normalizers (:obj:`List[Normalizer]`): ++ /// A list of Normalizer to be run as a sequence ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Sequence")] ++ pub struct PySequence {} ++ #[pymethods] ++ impl PySequence { ++@@ -373,8 +373,8 @@ impl PySequenceProtocol for PySequence { ++ } ++ ++ /// Lowercase Normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Lowercase)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Lowercase")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyLowercase {} ++ #[pymethods] ++ impl PyLowercase { ++@@ -385,8 +385,8 @@ impl PyLowercase { ++ } ++ ++ /// Strip normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Strip)] ++-#[text_signature = "(self, left=True, right=True)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Strip")] +++#[pyo3(text_signature = "(self, left=True, right=True)")] ++ pub struct PyStrip {} ++ #[pymethods] ++ impl PyStrip { ++@@ -418,8 +418,8 @@ impl PyStrip { ++ } ++ ++ /// StripAccents normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyStripAccents {} ++ #[pymethods] ++ impl PyStripAccents { ++@@ -430,8 +430,8 @@ impl PyStripAccents { ++ } ++ ++ /// Nmt normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Nmt")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyNmt {} ++ #[pymethods] ++ impl PyNmt { ++@@ -443,8 +443,8 @@ impl PyNmt { ++ ++ /// Precompiled normalizer ++ /// Don't use manually it is used for compatiblity for SentencePiece. ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)] ++-#[text_signature = "(self, precompiled_charsmap)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")] +++#[pyo3(text_signature = "(self, precompiled_charsmap)")] ++ pub struct PyPrecompiled {} ++ #[pymethods] ++ impl PyPrecompiled { ++@@ -466,8 +466,8 @@ impl PyPrecompiled { ++ } ++ ++ /// Replace normalizer ++-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)] ++-#[text_signature = "(self, pattern, content)"] +++#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Replace")] +++#[pyo3(text_signature = "(self, pattern, content)")] ++ pub struct PyReplace {} ++ #[pymethods] ++ impl PyReplace { ++@@ -631,7 +631,7 @@ mod test { ++ let gil = Python::acquire_gil(); ++ assert_eq!( ++ "tokenizers.normalizers.NFC", ++- py_nfc.as_ref(gil.python()).get_type().name() +++ py_nfc.as_ref(gil.python()).get_type().name().unwrap() ++ ); ++ } ++ ++diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs ++index 947e267..3d8ab58 100644 ++--- a/bindings/python/src/pre_tokenizers.rs +++++ b/bindings/python/src/pre_tokenizers.rs ++@@ -28,7 +28,7 @@ use super::utils::*; ++ /// ++ /// This class is not supposed to be instantiated directly. Instead, any implementation of a ++ /// PreTokenizer will return an instance of this class when instantiated. ++-#[pyclass(dict, module = "tokenizers.pre_tokenizers", name=PreTokenizer)] +++#[pyclass(dict, module = "tokenizers.pre_tokenizers", name = "PreTokenizer")] ++ #[derive(Clone, Serialize, Deserialize)] ++ pub struct PyPreTokenizer { ++ #[serde(flatten)] ++@@ -146,7 +146,7 @@ impl PyPreTokenizer { ++ /// pretok (:class:`~tokenizers.PreTokenizedString): ++ /// The pre-tokenized string on which to apply this ++ /// :class:`~tokenizers.pre_tokenizers.PreTokenizer` ++- #[text_signature = "(self, pretok)"] +++ #[pyo3(text_signature = "(self, pretok)")] ++ fn pre_tokenize(&self, pretok: &mut PyPreTokenizedString) -> PyResult<()> { ++ ToPyResult(self.pretok.pre_tokenize(&mut pretok.pretok)).into() ++ } ++@@ -166,7 +166,7 @@ impl PyPreTokenizer { ++ /// Returns: ++ /// :obj:`List[Tuple[str, Offsets]]`: ++ /// A list of tuple with the pre-tokenized parts and their offsets ++- #[text_signature = "(self, sequence)"] +++ #[pyo3(text_signature = "(self, sequence)")] ++ fn pre_tokenize_str(&self, s: &str) -> PyResult> { ++ let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s); ++ ++@@ -228,8 +228,8 @@ macro_rules! setter { ++ /// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): ++ /// Whether to add a space to the first word if there isn't already one. This ++ /// lets us treat `hello` exactly like `say hello`. ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=ByteLevel)] ++-#[text_signature = "(self, add_prefix_space=True)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "ByteLevel")] +++#[pyo3(text_signature = "(self, add_prefix_space=True)")] ++ pub struct PyByteLevel {} ++ #[pymethods] ++ impl PyByteLevel { ++@@ -263,7 +263,7 @@ impl PyByteLevel { ++ /// Returns: ++ /// :obj:`List[str]`: A list of characters that compose the alphabet ++ #[staticmethod] ++- #[text_signature = "()"] +++ #[pyo3(text_signature = "()")] ++ fn alphabet() -> Vec { ++ ByteLevel::alphabet() ++ .into_iter() ++@@ -273,8 +273,8 @@ impl PyByteLevel { ++ } ++ ++ /// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Whitespace)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Whitespace")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyWhitespace {} ++ #[pymethods] ++ impl PyWhitespace { ++@@ -285,8 +285,8 @@ impl PyWhitespace { ++ } ++ ++ /// This pre-tokenizer simply splits on the whitespace. Works like `.split()` ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=WhitespaceSplit)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "WhitespaceSplit")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyWhitespaceSplit {} ++ #[pymethods] ++ impl PyWhitespaceSplit { ++@@ -313,8 +313,8 @@ impl PyWhitespaceSplit { ++ /// ++ /// invert (:obj:`bool`, `optional`, defaults to :obj:`False`): ++ /// Whether to invert the pattern. ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Split)] ++-#[text_signature = "(self, pattern, behavior, invert=False)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Split")] +++#[pyo3(text_signature = "(self, pattern, behavior, invert=False)")] ++ pub struct PySplit {} ++ #[pymethods] ++ impl PySplit { ++@@ -343,7 +343,7 @@ impl PySplit { ++ /// Args: ++ /// delimiter: str: ++ /// The delimiter char that will be used to split input ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=CharDelimiterSplit)] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "CharDelimiterSplit")] ++ pub struct PyCharDelimiterSplit {} ++ #[pymethods] ++ impl PyCharDelimiterSplit { ++@@ -374,8 +374,8 @@ impl PyCharDelimiterSplit { ++ /// ++ /// This pre-tokenizer splits tokens on spaces, and also on punctuation. ++ /// Each occurence of a punctuation character will be treated separately. ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=BertPreTokenizer)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyBertPreTokenizer {} ++ #[pymethods] ++ impl PyBertPreTokenizer { ++@@ -392,8 +392,8 @@ impl PyBertPreTokenizer { ++ /// The behavior to use when splitting. ++ /// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next", ++ /// "contiguous" ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)] ++-#[text_signature = "(self, behavior=\"isolated\")"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Punctuation")] +++#[pyo3(text_signature = "(self, behavior=\"isolated\")")] ++ pub struct PyPunctuation {} ++ #[pymethods] ++ impl PyPunctuation { ++@@ -405,8 +405,8 @@ impl PyPunctuation { ++ } ++ ++ /// This pre-tokenizer composes other pre_tokenizers and applies them in sequence ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Sequence)] ++-#[text_signature = "(self, pretokenizers)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Sequence")] +++#[pyo3(text_signature = "(self, pretokenizers)")] ++ pub struct PySequence {} ++ #[pymethods] ++ impl PySequence { ++@@ -446,8 +446,8 @@ impl PySequence { ++ /// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): ++ /// Whether to add a space to the first word if there isn't already one. This ++ /// lets us treat `hello` exactly like `say hello`. ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Metaspace)] ++-#[text_signature = "(self, replacement=\"_\", add_prefix_space=True)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")] +++#[pyo3(text_signature = "(self, replacement=\"_\", add_prefix_space=True)")] ++ pub struct PyMetaspace {} ++ #[pymethods] ++ impl PyMetaspace { ++@@ -496,8 +496,8 @@ impl PyMetaspace { ++ /// If set to False, digits will grouped as follows:: ++ /// ++ /// "Call 123 please" -> "Call ", "123", " please" ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Digits)] ++-#[text_signature = "(self, individual_digits=False)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Digits")] +++#[pyo3(text_signature = "(self, individual_digits=False)")] ++ pub struct PyDigits {} ++ #[pymethods] ++ impl PyDigits { ++@@ -522,8 +522,8 @@ impl PyDigits { ++ /// It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt ++ /// Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too. ++ /// This mimicks SentencePiece Unigram implementation. ++-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=UnicodeScripts)] ++-#[text_signature = "(self)"] +++#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "UnicodeScripts")] +++#[pyo3(text_signature = "(self)")] ++ pub struct PyUnicodeScripts {} ++ #[pymethods] ++ impl PyUnicodeScripts { ++@@ -687,7 +687,7 @@ mod test { ++ let gil = Python::acquire_gil(); ++ assert_eq!( ++ "tokenizers.pre_tokenizers.Whitespace", ++- py_wsp.as_ref(gil.python()).get_type().name() +++ py_wsp.as_ref(gil.python()).get_type().name().unwrap() ++ ); ++ } ++ ++diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs ++index 12990b3..4fd7c50 100644 ++--- a/bindings/python/src/processors.rs +++++ b/bindings/python/src/processors.rs ++@@ -20,7 +20,7 @@ use tokenizers as tk; ++ /// ++ /// This class is not supposed to be instantiated directly. Instead, any implementation of ++ /// a PostProcessor will return an instance of this class when instantiated. ++-#[pyclass(dict, module = "tokenizers.processors", name=PostProcessor)] +++#[pyclass(dict, module = "tokenizers.processors", name = "PostProcessor")] ++ #[derive(Clone, Deserialize, Serialize)] ++ pub struct PyPostProcessor { ++ #[serde(flatten)] ++@@ -100,7 +100,7 @@ impl PyPostProcessor { ++ /// ++ /// Returns: ++ /// :obj:`int`: The number of tokens to add ++- #[text_signature = "(self, is_pair)"] +++ #[pyo3(text_signature = "(self, is_pair)")] ++ fn num_special_tokens_to_add(&self, is_pair: bool) -> usize { ++ self.processor.added_tokens(is_pair) ++ } ++@@ -120,7 +120,7 @@ impl PyPostProcessor { ++ /// Return: ++ /// :class:`~tokenizers.Encoding`: The final encoding ++ #[args(pair = "None", add_special_tokens = "true")] ++- #[text_signature = "(self, encoding, pair=None, add_special_tokens=True)"] +++ #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")] ++ fn process( ++ &self, ++ encoding: &PyEncoding, ++@@ -149,8 +149,8 @@ impl PyPostProcessor { ++ /// ++ /// cls (:obj:`Tuple[str, int]`): ++ /// A tuple with the string representation of the CLS token, and its id ++-#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=BertProcessing)] ++-#[text_signature = "(self, sep, cls)"] +++#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "BertProcessing")] +++#[pyo3(text_signature = "(self, sep, cls)")] ++ pub struct PyBertProcessing {} ++ #[pymethods] ++ impl PyBertProcessing { ++@@ -191,8 +191,8 @@ impl PyBertProcessing { ++ /// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): ++ /// Whether the add_prefix_space option was enabled during pre-tokenization. This ++ /// is relevant because it defines the way the offsets are trimmed out. ++-#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=RobertaProcessing)] ++-#[text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)"] +++#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "RobertaProcessing")] +++#[pyo3(text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")] ++ pub struct PyRobertaProcessing {} ++ #[pymethods] ++ impl PyRobertaProcessing { ++@@ -226,8 +226,8 @@ impl PyRobertaProcessing { ++ /// Args: ++ /// trim_offsets (:obj:`bool`): ++ /// Whether to trim the whitespaces from the produced offsets. ++-#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=ByteLevel)] ++-#[text_signature = "(self, trim_offsets=True)"] +++#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")] +++#[pyo3(text_signature = "(self, trim_offsets=True)")] ++ pub struct PyByteLevel {} ++ #[pymethods] ++ impl PyByteLevel { ++@@ -378,8 +378,8 @@ impl FromPyObject<'_> for PyTemplate { ++ /// ++ /// The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have ++ /// the same length. ++-#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=TemplateProcessing)] ++-#[text_signature = "(self, single, pair, special_tokens)"] +++#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "TemplateProcessing")] +++#[pyo3(text_signature = "(self, single, pair, special_tokens)")] ++ pub struct PyTemplateProcessing {} ++ #[pymethods] ++ impl PyTemplateProcessing { ++@@ -429,7 +429,7 @@ mod test { ++ let gil = Python::acquire_gil(); ++ assert_eq!( ++ "tokenizers.processors.BertProcessing", ++- py_bert.as_ref(gil.python()).get_type().name() +++ py_bert.as_ref(gil.python()).get_type().name().unwrap() ++ ); ++ } ++ ++diff --git a/bindings/python/src/token.rs b/bindings/python/src/token.rs ++index eb2a472..f1db997 100644 ++--- a/bindings/python/src/token.rs +++++ b/bindings/python/src/token.rs ++@@ -1,7 +1,7 @@ ++ use pyo3::prelude::*; ++ use tk::Token; ++ ++-#[pyclass(module = "tokenizers", name=Token)] +++#[pyclass(module = "tokenizers", name = "Token")] ++ #[derive(Clone)] ++ pub struct PyToken { ++ token: Token, ++diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs ++index 89073a4..cfa7358 100644 ++--- a/bindings/python/src/tokenizer.rs +++++ b/bindings/python/src/tokenizer.rs ++@@ -55,8 +55,8 @@ use crate::utils::{MaybeSizedIterator, PyBufferedIterator}; ++ /// lowercasing the text, the token could be extract from the input ``"I saw a lion ++ /// Yesterday"``. ++ /// ++-#[pyclass(dict, module = "tokenizers", name=AddedToken)] ++-#[text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"] +++#[pyclass(dict, module = "tokenizers", name = "AddedToken")] +++#[pyo3(text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")] ++ pub struct PyAddedToken { ++ pub content: String, ++ pub is_special_token: bool, ++@@ -285,6 +285,7 @@ impl FromPyObject<'_> for PyArrayUnicode { ++ let seq = (0..n_elem) ++ .map(|i| { ++ let bytes = &all_bytes[i * elsize..(i + 1) * elsize]; +++ #[allow(deprecated)] ++ let unicode = pyo3::ffi::PyUnicode_FromUnicode( ++ bytes.as_ptr() as *const _, ++ elsize as isize / alignment as isize, ++@@ -438,8 +439,8 @@ type Tokenizer = TokenizerImpl PyResult { ++ let tokenizer: PyResult<_> = ToPyResult(json.parse()).into(); ++ Ok(Self::new(tokenizer?)) ++@@ -518,7 +519,7 @@ impl PyTokenizer { ++ /// Returns: ++ /// :class:`~tokenizers.Tokenizer`: The new tokenizer ++ #[staticmethod] ++- #[text_signature = "(path)"] +++ #[pyo3(text_signature = "(path)")] ++ fn from_file(path: &str) -> PyResult { ++ let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into(); ++ Ok(Self::new(tokenizer?)) ++@@ -533,7 +534,7 @@ impl PyTokenizer { ++ /// Returns: ++ /// :class:`~tokenizers.Tokenizer`: The new tokenizer ++ #[staticmethod] ++- #[text_signature = "(buffer)"] +++ #[pyo3(text_signature = "(buffer)")] ++ fn from_buffer(buffer: &PyBytes) -> PyResult { ++ let tokenizer = serde_json::from_slice(buffer.as_bytes()).map_err(|e| { ++ exceptions::PyValueError::new_err(format!( ++@@ -561,7 +562,7 @@ impl PyTokenizer { ++ /// :class:`~tokenizers.Tokenizer`: The new tokenizer ++ #[staticmethod] ++ #[args(revision = "String::from(\"main\")", auth_token = "None")] ++- #[text_signature = "(identifier, revision=\"main\", auth_token=None)"] +++ #[pyo3(text_signature = "(identifier, revision=\"main\", auth_token=None)")] ++ fn from_pretrained( ++ identifier: &str, ++ revision: String, ++@@ -590,7 +591,7 @@ impl PyTokenizer { ++ /// Returns: ++ /// :obj:`str`: A string representing the serialized Tokenizer ++ #[args(pretty = false)] ++- #[text_signature = "(self, pretty=False)"] +++ #[pyo3(text_signature = "(self, pretty=False)")] ++ fn to_str(&self, pretty: bool) -> PyResult { ++ ToPyResult(self.tokenizer.to_string(pretty)).into() ++ } ++@@ -604,7 +605,7 @@ impl PyTokenizer { ++ /// pretty (:obj:`bool`, defaults to :obj:`True`): ++ /// Whether the JSON file should be pretty formatted. ++ #[args(pretty = true)] ++- #[text_signature = "(self, path, pretty=True)"] +++ #[pyo3(text_signature = "(self, path, pretty=True)")] ++ fn save(&self, path: &str, pretty: bool) -> PyResult<()> { ++ ToPyResult(self.tokenizer.save(path, pretty)).into() ++ } ++@@ -612,7 +613,7 @@ impl PyTokenizer { ++ /// Return the number of special tokens that would be added for single/pair sentences. ++ /// :param is_pair: Boolean indicating if the input would be a single sentence or a pair ++ /// :return: ++- #[text_signature = "(self, is_pair)"] +++ #[pyo3(text_signature = "(self, is_pair)")] ++ fn num_special_tokens_to_add(&self, is_pair: bool) -> usize { ++ self.tokenizer ++ .get_post_processor() ++@@ -628,7 +629,7 @@ impl PyTokenizer { ++ /// Returns: ++ /// :obj:`Dict[str, int]`: The vocabulary ++ #[args(with_added_tokens = true)] ++- #[text_signature = "(self, with_added_tokens=True)"] +++ #[pyo3(text_signature = "(self, with_added_tokens=True)")] ++ fn get_vocab(&self, with_added_tokens: bool) -> HashMap { ++ self.tokenizer.get_vocab(with_added_tokens) ++ } ++@@ -642,7 +643,7 @@ impl PyTokenizer { ++ /// Returns: ++ /// :obj:`int`: The size of the vocabulary ++ #[args(with_added_tokens = true)] ++- #[text_signature = "(self, with_added_tokens=True)"] +++ #[pyo3(text_signature = "(self, with_added_tokens=True)")] ++ fn get_vocab_size(&self, with_added_tokens: bool) -> usize { ++ self.tokenizer.get_vocab_size(with_added_tokens) ++ } ++@@ -664,7 +665,7 @@ impl PyTokenizer { ++ /// direction (:obj:`str`, defaults to :obj:`right`): ++ /// Truncate direction ++ #[args(kwargs = "**")] ++- #[text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')"] +++ #[pyo3(text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')")] ++ fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> { ++ let mut params = TruncationParams { ++ max_length, ++@@ -714,7 +715,7 @@ impl PyTokenizer { ++ } ++ ++ /// Disable truncation ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn no_truncation(&mut self) { ++ self.tokenizer.with_truncation(None); ++ } ++@@ -764,7 +765,7 @@ impl PyTokenizer { ++ /// If specified, the length at which to pad. If not specified we pad using the size of ++ /// the longest sequence in a batch. ++ #[args(kwargs = "**")] ++- #[text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"] +++ #[pyo3(text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)")] ++ fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> { ++ let mut params = PaddingParams::default(); ++ ++@@ -822,7 +823,7 @@ impl PyTokenizer { ++ } ++ ++ /// Disable padding ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn no_padding(&mut self) { ++ self.tokenizer.with_padding(None); ++ } ++@@ -891,7 +892,7 @@ impl PyTokenizer { ++ /// :class:`~tokenizers.Encoding`: The encoded result ++ /// ++ #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")] ++- #[text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"] +++ #[pyo3(text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)")] ++ fn encode( ++ &self, ++ sequence: &PyAny, ++@@ -956,7 +957,7 @@ impl PyTokenizer { ++ /// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch ++ /// ++ #[args(is_pretokenized = "false", add_special_tokens = "true")] ++- #[text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)"] +++ #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")] ++ fn encode_batch( ++ &self, ++ input: Vec<&PyAny>, ++@@ -999,7 +1000,7 @@ impl PyTokenizer { ++ /// Returns: ++ /// :obj:`str`: The decoded string ++ #[args(skip_special_tokens = true)] ++- #[text_signature = "(self, ids, skip_special_tokens=True)"] +++ #[pyo3(text_signature = "(self, ids, skip_special_tokens=True)")] ++ fn decode(&self, ids: Vec, skip_special_tokens: bool) -> PyResult { ++ ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into() ++ } ++@@ -1016,7 +1017,7 @@ impl PyTokenizer { ++ /// Returns: ++ /// :obj:`List[str]`: A list of decoded strings ++ #[args(skip_special_tokens = true)] ++- #[text_signature = "(self, sequences, skip_special_tokens=True)"] +++ #[pyo3(text_signature = "(self, sequences, skip_special_tokens=True)")] ++ fn decode_batch( ++ &self, ++ sequences: Vec>, ++@@ -1036,7 +1037,7 @@ impl PyTokenizer { ++ /// ++ /// Returns: ++ /// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary ++- #[text_signature = "(self, token)"] +++ #[pyo3(text_signature = "(self, token)")] ++ fn token_to_id(&self, token: &str) -> Option { ++ self.tokenizer.token_to_id(token) ++ } ++@@ -1049,7 +1050,7 @@ impl PyTokenizer { ++ /// ++ /// Returns: ++ /// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary ++- #[text_signature = "(self, id)"] +++ #[pyo3(text_signature = "(self, id)")] ++ fn id_to_token(&self, id: u32) -> Option { ++ self.tokenizer.id_to_token(id) ++ } ++@@ -1066,7 +1067,7 @@ impl PyTokenizer { ++ /// ++ /// Returns: ++ /// :obj:`int`: The number of tokens that were created in the vocabulary ++- #[text_signature = "(self, tokens)"] +++ #[pyo3(text_signature = "(self, tokens)")] ++ fn add_tokens(&mut self, tokens: &PyList) -> PyResult { ++ let tokens = tokens ++ .into_iter() ++@@ -1103,7 +1104,7 @@ impl PyTokenizer { ++ /// ++ /// Returns: ++ /// :obj:`int`: The number of tokens that were created in the vocabulary ++- #[text_signature = "(self, tokens)"] +++ #[pyo3(text_signature = "(self, tokens)")] ++ fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult { ++ let tokens = tokens ++ .into_iter() ++@@ -1137,7 +1138,7 @@ impl PyTokenizer { ++ /// trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`): ++ /// An optional trainer that should be used to train our Model ++ #[args(trainer = "None")] ++- #[text_signature = "(self, files, trainer = None)"] +++ #[pyo3(text_signature = "(self, files, trainer = None)")] ++ fn train(&mut self, files: Vec, trainer: Option<&mut PyTrainer>) -> PyResult<()> { ++ let mut trainer = ++ trainer.map_or_else(|| self.tokenizer.get_model().get_trainer(), |t| t.clone()); ++@@ -1173,7 +1174,7 @@ impl PyTokenizer { ++ /// The total number of sequences in the iterator. This is used to ++ /// provide meaningful progress tracking ++ #[args(trainer = "None", length = "None")] ++- #[text_signature = "(self, iterator, trainer=None, length=None)"] +++ #[pyo3(text_signature = "(self, iterator, trainer=None, length=None)")] ++ fn train_from_iterator( ++ &mut self, ++ py: Python, ++@@ -1239,7 +1240,7 @@ impl PyTokenizer { ++ /// Returns: ++ /// :class:`~tokenizers.Encoding`: The final post-processed encoding ++ #[args(pair = "None", add_special_tokens = true)] ++- #[text_signature = "(self, encoding, pair=None, add_special_tokens=True)"] +++ #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")] ++ fn post_process( ++ &self, ++ encoding: &PyEncoding, ++diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs ++index 7def6fc..98082dd 100644 ++--- a/bindings/python/src/trainers.rs +++++ b/bindings/python/src/trainers.rs ++@@ -15,9 +15,9 @@ use crate::utils::PyChar; ++ /// ++ /// This class is not supposed to be instantiated directly. Instead, any implementation of a ++ /// Trainer will return an instance of this class when instantiated. ++-#[pyclass(name=Trainer, module = "tokenizers.trainers", name=Trainer)] +++#[pyclass(module = "tokenizers.trainers", name = "Trainer")] ++ #[derive(Clone)] ++-#[text_signature = "(self, vocab_size=30000, min_frequency=0,show_progress=True, special_tokens=[],limit_alphabet=None, initial_alphabet = [], continuing_subword_prefix=None, end_of_word_suffix=None)"] +++#[pyo3(text_signature = "(self, vocab_size=30000, min_frequency=0,show_progress=True, special_tokens=[],limit_alphabet=None, initial_alphabet = [], continuing_subword_prefix=None, end_of_word_suffix=None)")] ++ pub struct PyTrainer { ++ pub trainer: Arc>, ++ } ++@@ -132,7 +132,7 @@ macro_rules! setter { ++ /// ++ /// end_of_word_suffix (:obj:`str`, `optional`): ++ /// A suffix to be used for every subword that is a end-of-word. ++-#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=BpeTrainer)] +++#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "BpeTrainer")] ++ pub struct PyBpeTrainer {} ++ #[pymethods] ++ impl PyBpeTrainer { ++@@ -335,8 +335,8 @@ impl PyBpeTrainer { ++ /// ++ /// end_of_word_suffix (:obj:`str`, `optional`): ++ /// A suffix to be used for every subword that is a end-of-word. ++-#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordPieceTrainer)] ++-#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"] +++#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")] +++#[pyo3(text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)")] ++ pub struct PyWordPieceTrainer {} ++ #[pymethods] ++ impl PyWordPieceTrainer { ++@@ -525,7 +525,7 @@ impl PyWordPieceTrainer { ++ /// ++ /// special_tokens (:obj:`List[Union[str, AddedToken]]`): ++ /// A list of special tokens the model should know of. ++-#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordLevelTrainer)] +++#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordLevelTrainer")] ++ pub struct PyWordLevelTrainer {} ++ #[pymethods] ++ impl PyWordLevelTrainer { ++@@ -681,8 +681,8 @@ impl PyWordLevelTrainer { ++ /// n_sub_iterations (:obj:`int`): ++ /// The number of iterations of the EM algorithm to perform before ++ /// pruning the vocabulary. ++-#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=UnigramTrainer)] ++-#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"] +++#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")] +++#[pyo3(text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)")] ++ pub struct PyUnigramTrainer {} ++ #[pymethods] ++ impl PyUnigramTrainer { ++diff --git a/bindings/python/src/utils/iterators.rs b/bindings/python/src/utils/iterators.rs ++index 0715df5..cf6310b 100644 ++--- a/bindings/python/src/utils/iterators.rs +++++ b/bindings/python/src/utils/iterators.rs ++@@ -1,5 +1,5 @@ ++ use pyo3::prelude::*; ++-use pyo3::{AsPyPointer, PyNativeType}; +++use pyo3::AsPyPointer; ++ use std::collections::VecDeque; ++ ++ /// An simple iterator that can be instantiated with a specified length. ++diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs ++index 39b1b73..d60d91c 100644 ++--- a/bindings/python/src/utils/normalization.rs +++++ b/bindings/python/src/utils/normalization.rs ++@@ -192,7 +192,7 @@ fn slice( ++ /// Args: ++ /// sequence: str: ++ /// The string sequence used to initialize this NormalizedString ++-#[pyclass(module = "tokenizers", name=NormalizedString)] +++#[pyclass(module = "tokenizers", name = "NormalizedString")] ++ #[derive(Clone)] ++ pub struct PyNormalizedString { ++ pub(crate) normalized: NormalizedString, ++@@ -217,91 +217,91 @@ impl PyNormalizedString { ++ } ++ ++ /// Runs the NFD normalization ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn nfd(&mut self) { ++ self.normalized.nfd(); ++ } ++ ++ /// Runs the NFKD normalization ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn nfkd(&mut self) { ++ self.normalized.nfkd(); ++ } ++ ++ /// Runs the NFC normalization ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn nfc(&mut self) { ++ self.normalized.nfc(); ++ } ++ ++ /// Runs the NFKC normalization ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn nfkc(&mut self) { ++ self.normalized.nfkc(); ++ } ++ ++ /// Lowercase the string ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn lowercase(&mut self) { ++ self.normalized.lowercase(); ++ } ++ ++ /// Uppercase the string ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn uppercase(&mut self) { ++ self.normalized.uppercase(); ++ } ++ ++ /// Prepend the given sequence to the string ++- #[text_signature = "(self, s)"] +++ #[pyo3(text_signature = "(self, s)")] ++ fn prepend(&mut self, s: &str) { ++ self.normalized.prepend(s); ++ } ++ ++ /// Append the given sequence to the string ++- #[text_signature = "(self, s)"] +++ #[pyo3(text_signature = "(self, s)")] ++ fn append(&mut self, s: &str) { ++ self.normalized.append(s); ++ } ++ ++ /// Strip the left of the string ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn lstrip(&mut self) { ++ self.normalized.lstrip(); ++ } ++ ++ /// Strip the right of the string ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn rstrip(&mut self) { ++ self.normalized.rstrip(); ++ } ++ ++ /// Strip both ends of the string ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn strip(&mut self) { ++ self.normalized.strip(); ++ } ++ ++ /// Clears the string ++- #[text_signature = "(self)"] +++ #[pyo3(text_signature = "(self)")] ++ fn clear(&mut self) { ++ self.normalized.clear(); ++ } ++ ++ /// Slice the string using the given range ++- #[text_signature = "(self, range)"] +++ #[pyo3(text_signature = "(self, range)")] ++ fn slice(&self, range: PyRange) -> PyResult> { ++ slice(&self.normalized, &range) ++ } ++ ++ /// Filter each character of the string using the given func ++- #[text_signature = "(self, func)"] +++ #[pyo3(text_signature = "(self, func)")] ++ fn filter(&mut self, func: &PyAny) -> PyResult<()> { ++ filter(&mut self.normalized, func) ++ } ++ ++ /// Calls the given function for each character of the string ++- #[text_signature = "(self, func)"] +++ #[pyo3(text_signature = "(self, func)")] ++ fn for_each(&self, func: &PyAny) -> PyResult<()> { ++ for_each(&self.normalized, func) ++ } ++@@ -310,7 +310,7 @@ impl PyNormalizedString { ++ /// ++ /// Replaces each character of the string using the returned value. Each ++ /// returned value **must** be a str of length 1 (ie a character). ++- #[text_signature = "(self, func)"] +++ #[pyo3(text_signature = "(self, func)")] ++ fn map(&mut self, func: &PyAny) -> PyResult<()> { ++ map(&mut self.normalized, func) ++ } ++@@ -328,7 +328,7 @@ impl PyNormalizedString { ++ /// ++ /// Returns: ++ /// A list of NormalizedString, representing each split ++- #[text_signature = "(self, pattern, behavior)"] +++ #[pyo3(text_signature = "(self, pattern, behavior)")] ++ fn split( ++ &mut self, ++ pattern: PyPattern, ++@@ -349,7 +349,7 @@ impl PyNormalizedString { ++ /// ++ /// content: str: ++ /// The content to be used as replacement ++- #[text_signature = "(self, pattern, content)"] +++ #[pyo3(text_signature = "(self, pattern, content)")] ++ fn replace(&mut self, pattern: PyPattern, content: &str) -> PyResult<()> { ++ ToPyResult(self.normalized.replace(pattern, content)).into() ++ } ++@@ -389,7 +389,7 @@ impl From for NormalizedString { ++ } ++ } ++ ++-#[pyclass(module = "tokenizers", name=NormalizedStringRefMut)] +++#[pyclass(module = "tokenizers", name = "NormalizedStringRefMut")] ++ #[derive(Clone)] ++ pub struct PyNormalizedStringRefMut { ++ inner: RefMutContainer, ++diff --git a/bindings/python/src/utils/pretokenization.rs b/bindings/python/src/utils/pretokenization.rs ++index b4d5a66..fb692c7 100644 ++--- a/bindings/python/src/utils/pretokenization.rs +++++ b/bindings/python/src/utils/pretokenization.rs ++@@ -147,8 +147,8 @@ fn to_encoding( ++ /// Args: ++ /// sequence: str: ++ /// The string sequence used to initialize this PreTokenizedString ++-#[pyclass(module = "tokenizers", name=PreTokenizedString)] ++-#[text_signature = "(self, sequence)"] +++#[pyclass(module = "tokenizers", name = "PreTokenizedString")] +++#[pyo3(text_signature = "(self, sequence)")] ++ pub struct PyPreTokenizedString { ++ pub(crate) pretok: tk::PreTokenizedString, ++ } ++@@ -182,7 +182,7 @@ impl PyPreTokenizedString { ++ /// just return it directly. ++ /// In order for the offsets to be tracked accurately, any returned `NormalizedString` ++ /// should come from calling either `.split` or `.slice` on the received one. ++- #[text_signature = "(self, func)"] +++ #[pyo3(text_signature = "(self, func)")] ++ fn split(&mut self, func: &PyAny) -> PyResult<()> { ++ split(&mut self.pretok, func) ++ } ++@@ -194,7 +194,7 @@ impl PyPreTokenizedString { ++ /// The function used to normalize each underlying split. This function ++ /// does not need to return anything, just calling the methods on the provided ++ /// NormalizedString allow its modification. ++- #[text_signature = "(self, func)"] +++ #[pyo3(text_signature = "(self, func)")] ++ fn normalize(&mut self, func: &PyAny) -> PyResult<()> { ++ normalize(&mut self.pretok, func) ++ } ++@@ -205,7 +205,7 @@ impl PyPreTokenizedString { ++ /// func: Callable[[str], List[Token]]: ++ /// The function used to tokenize each underlying split. This function must return ++ /// a list of Token generated from the input str. ++- #[text_signature = "(self, func)"] +++ #[pyo3(text_signature = "(self, func)")] ++ fn tokenize(&mut self, func: &PyAny) -> PyResult<()> { ++ tokenize(&mut self.pretok, func) ++ } ++@@ -224,7 +224,7 @@ impl PyPreTokenizedString { ++ /// Returns: ++ /// An Encoding ++ #[args(type_id = "0", word_idx = "None")] ++- #[text_signature = "(self, type_id=0, word_idx=None)"] +++ #[pyo3(text_signature = "(self, type_id=0, word_idx=None)")] ++ fn to_encoding(&self, type_id: u32, word_idx: Option) -> PyResult { ++ to_encoding(&self.pretok, type_id, word_idx) ++ } ++@@ -249,7 +249,7 @@ impl PyPreTokenizedString { ++ offset_referential = "PyOffsetReferential(OffsetReferential::Original)", ++ offset_type = "PyOffsetType(OffsetType::Char)" ++ )] ++- #[text_signature = "(self, offset_referential=\"original\", offset_type=\"char\")"] +++ #[pyo3(text_signature = "(self, offset_referential=\"original\", offset_type=\"char\")")] ++ fn get_splits( ++ &self, ++ offset_referential: PyOffsetReferential, ++@@ -259,7 +259,7 @@ impl PyPreTokenizedString { ++ } ++ } ++ ++-#[pyclass(module = "tokenizers", name=PreTokenizedString)] +++#[pyclass(module = "tokenizers", name = "PreTokenizedString")] ++ #[derive(Clone)] ++ pub struct PyPreTokenizedStringRefMut { ++ inner: RefMutContainer, ++diff --git a/bindings/python/src/utils/regex.rs b/bindings/python/src/utils/regex.rs ++index 8170ffc..9e0d424 100644 ++--- a/bindings/python/src/utils/regex.rs +++++ b/bindings/python/src/utils/regex.rs ++@@ -3,8 +3,8 @@ use pyo3::exceptions; ++ use pyo3::prelude::*; ++ ++ /// Instantiate a new Regex with the given pattern ++-#[pyclass(module = "tokenizers", name=Regex)] ++-#[text_signature = "(self, pattern)"] +++#[pyclass(module = "tokenizers", name = "Regex")] +++#[pyo3(text_signature = "(self, pattern)")] ++ pub struct PyRegex { ++ pub inner: Regex, ++ pub pattern: String, ++-- ++2.35.1.windows.2 + -+[target.aarch64-apple-darwin] -+rustflags = [ -+ "-C", "link-arg=-undefined", -+ "-C", "link-arg=dynamic_lookup", -+] diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 823f4a2..286cd68 100644 --- a/bindings/python/Cargo.lock