diff --git a/poetry.lock b/poetry.lock index c9a078f955..68f21010e0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -60,22 +60,22 @@ files = [ [[package]] name = "attrs" -version = "22.2.0" +version = "23.1.0" description = "Classes Without Boilerplate" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, - {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, + {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, + {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, ] [package.extras] -cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] -tests = ["attrs[tests-no-zope]", "zope.interface"] -tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] +cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] +dev = ["attrs[docs,tests]", "pre-commit"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"] +tests = ["attrs[tests-no-zope]", "zope-interface"] +tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] [[package]] name = "black" @@ -122,6 +122,18 @@ d = ["aiohttp (>=3.7.4)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "blinker" +version = "1.6.2" +description = "Fast, simple object-to-object and broadcast signaling" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "blinker-1.6.2-py3-none-any.whl", hash = "sha256:c3d739772abb7bc2860abf5f2ec284223d9ad5c76da018234f6f50d6f31ab1f0"}, + {file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"}, +] + [[package]] name = "blis" version = "0.7.9" @@ -487,31 +499,31 @@ toml = ["tomli"] [[package]] name = "cryptography" -version = "40.0.1" +version = "40.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
category = "dev" optional = false python-versions = ">=3.6" files = [ - {file = "cryptography-40.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:918cb89086c7d98b1b86b9fdb70c712e5a9325ba6f7d7cfb509e784e0cfc6917"}, - {file = "cryptography-40.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9618a87212cb5200500e304e43691111570e1f10ec3f35569fdfcd17e28fd797"}, - {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a4805a4ca729d65570a1b7cac84eac1e431085d40387b7d3bbaa47e39890b88"}, - {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63dac2d25c47f12a7b8aa60e528bfb3c51c5a6c5a9f7c86987909c6c79765554"}, - {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a4e3406cfed6b1f6d6e87ed243363652b2586b2d917b0609ca4f97072994405"}, - {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1e0af458515d5e4028aad75f3bb3fe7a31e46ad920648cd59b64d3da842e4356"}, - {file = "cryptography-40.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d8aa3609d337ad85e4eb9bb0f8bcf6e4409bfb86e706efa9a027912169e89122"}, - {file = "cryptography-40.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:cf91e428c51ef692b82ce786583e214f58392399cf65c341bc7301d096fa3ba2"}, - {file = "cryptography-40.0.1-cp36-abi3-win32.whl", hash = "sha256:650883cc064297ef3676b1db1b7b1df6081794c4ada96fa457253c4cc40f97db"}, - {file = "cryptography-40.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:a805a7bce4a77d51696410005b3e85ae2839bad9aa38894afc0aa99d8e0c3160"}, - {file = "cryptography-40.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cd033d74067d8928ef00a6b1327c8ea0452523967ca4463666eeba65ca350d4c"}, - {file = "cryptography-40.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d36bbeb99704aabefdca5aee4eba04455d7a27ceabd16f3b3ba9bdcc31da86c4"}, - {file = "cryptography-40.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:32057d3d0ab7d4453778367ca43e99ddb711770477c4f072a51b3ca69602780a"}, - {file = "cryptography-40.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:f5d7b79fa56bc29580faafc2ff736ce05ba31feaa9d4735048b0de7d9ceb2b94"}, - {file = "cryptography-40.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7c872413353c70e0263a9368c4993710070e70ab3e5318d85510cc91cce77e7c"}, - {file = "cryptography-40.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:28d63d75bf7ae4045b10de5413fb1d6338616e79015999ad9cf6fc538f772d41"}, - {file = "cryptography-40.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6f2bbd72f717ce33100e6467572abaedc61f1acb87b8d546001328d7f466b778"}, - {file = "cryptography-40.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cc3a621076d824d75ab1e1e530e66e7e8564e357dd723f2533225d40fe35c60c"}, - {file = "cryptography-40.0.1.tar.gz", hash = "sha256:2803f2f8b1e95f614419926c7e6f55d828afc614ca5ed61543877ae668cc3472"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:8f79b5ff5ad9d3218afb1e7e20ea74da5f76943ee5edb7f76e56ec5161ec782b"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:05dc219433b14046c476f6f09d7636b92a1c3e5808b9a6536adf4932b3b2c440"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4df2af28d7bedc84fe45bd49bc35d710aede676e2a4cb7fc6d103a2adc8afe4d"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0dcca15d3a19a66e63662dc8d30f8036b07be851a8680eda92d079868f106288"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a04386fb7bc85fab9cd51b6308633a3c271e3d0d3eae917eebab2fac6219b6d2"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:adc0d980fd2760c9e5de537c28935cc32b9353baaf28e0814df417619c6c8c3b"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d5a1bd0e9e2031465761dfa920c16b0065ad77321d8a8c1f5ee331021fda65e9"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a95f4802d49faa6a674242e25bfeea6fc2acd915b5e5e29ac90a32b1139cae1c"}, + {file = "cryptography-40.0.2-cp36-abi3-win32.whl", hash = "sha256:aecbb1592b0188e030cb01f82d12556cf72e218280f621deed7d806afd2113f9"}, + {file = "cryptography-40.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:b12794f01d4cacfbd3177b9042198f3af1c856eedd0a98f10f141385c809a14b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:142bae539ef28a1c76794cca7f49729e7c54423f615cfd9b0b1fa90ebe53244b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:956ba8701b4ffe91ba59665ed170a2ebbdc6fc0e40de5f6059195d9f2b33ca0e"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4f01c9863da784558165f5d4d916093737a75203a5c5286fde60e503e4276c7a"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3daf9b114213f8ba460b829a02896789751626a2a4e7a43a28ee77c04b5e4958"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48f388d0d153350f378c7f7b41497a54ff1513c816bcbbcafe5b829e59b9ce5b"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c0764e72b36a3dc065c155e5b22f93df465da9c39af65516fe04ed3c68c92636"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:cbaba590180cba88cb99a5f76f90808a624f18b169b90a4abb40c1fd8c19420e"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7a38250f433cd41df7fcb763caa3ee9362777fdb4dc642b9a349721d2bf47404"}, + {file = "cryptography-40.0.2.tar.gz", hash = "sha256:c33c0d32b8594fa647d2e01dbccc303478e16fdd7cf98652d5b3ed11aa5e5c99"}, ] [package.dependencies] @@ -606,14 +618,14 @@ files = [ [[package]] name = "domdf-python-tools" -version = "3.6.0" +version = "3.6.1" description = "Helpful functions for Python 🐍 🛠️" category = "dev" optional = false python-versions = ">=3.6" files = [ - {file = "domdf_python_tools-3.6.0-py3-none-any.whl", hash = "sha256:7a0a3b2c716854465b09b5c0c5f53d41f37562c5a0cd8746cd042ad7955430f1"}, - {file = "domdf_python_tools-3.6.0.tar.gz", hash = "sha256:0ac5efa2ac648dca5653e386fe73aa995e66b215c9d16b7ee87e931322a1e6c8"}, + {file = "domdf_python_tools-3.6.1-py3-none-any.whl", hash = "sha256:e18158460850957f18e740eb94ede56f580ddb0cb162ab9d9834ed8bbb1b6431"}, + {file = "domdf_python_tools-3.6.1.tar.gz", hash = "sha256:acc04563d23bce4d437dd08af6b9bea788328c412772a044d8ca428a7ad861be"}, ] [package.dependencies] @@ -714,14 +726,14 @@ peewee = "*" [[package]] name = "faker" -version = "18.4.0" +version = "18.5.1" description = "Faker is a Python package that generates fake data for you." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "Faker-18.4.0-py3-none-any.whl", hash = "sha256:170ead9d0d140916168b142df69c44722b8f622ced2070802d0af9c476f0cb84"}, - {file = "Faker-18.4.0.tar.gz", hash = "sha256:977ad0b7aa7a61ed57287d6a0723a827e9d3dd1f8cc82aaf08707f281b33bacc"}, + {file = "Faker-18.5.1-py3-none-any.whl", hash = "sha256:137c6667583b0b458599b11305eed5a486e3932a14cb792b2b5b82ad1ad1a430"}, + {file = "Faker-18.5.1.tar.gz", hash = "sha256:64e9ab619d75684cc0593aa9f336170b0b58fa77c07fc0ebc7b2b1258e53b67d"}, ] [package.dependencies] @@ -786,19 +798,19 @@ test = ["pytest"] [[package]] name = "filelock" -version = "3.11.0" +version = "3.12.0" description = "A platform independent file lock." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "filelock-3.11.0-py3-none-any.whl", hash = "sha256:f08a52314748335c6460fc8fe40cd5638b85001225db78c2aa01c8c0db83b318"}, - {file = "filelock-3.11.0.tar.gz", hash = "sha256:3618c0da67adcc0506b015fd11ef7faf1b493f0b40d87728e19986b536890c37"}, + {file = "filelock-3.12.0-py3-none-any.whl", hash = "sha256:ad98852315c2ab702aeb628412cbf7e95b7ce8c3bf9565670b4eaecf1db370a9"}, + {file = "filelock-3.12.0.tar.gz", hash = "sha256:fc03ae43288c013d2ea83c8597001b1129db351aad9c57fe2409327916b8e718"}, ] [package.extras] -docs = ["furo (>=2023.3.27)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.2.2)", "diff-cover (>=7.5)", "pytest (>=7.2.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"] +docs = ["furo (>=2023.3.27)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"] [[package]] name = "flake8" @@ -852,35 +864,25 @@ files = [ domdf-python-tools = ">=2.0.0" flake8 = ">=3.8.4" -[[package]] -name = "flashtext" -version = "2.7" -description = "Extract/Replaces keywords in sentences." -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "flashtext-2.7.tar.gz", hash = "sha256:a1be2b93e09d4f0deee4aad72b91a7127b61fb8b8034ca9a9c78ea745d8b05cf"}, -] - [[package]] name = "flask" -version = "2.2.3" +version = "2.3.1" description = "A simple framework for building complex web applications." 
category = "dev" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "Flask-2.2.3-py3-none-any.whl", hash = "sha256:c0bec9477df1cb867e5a67c9e1ab758de9cb4a3e52dd70681f59fa40a62b3f2d"}, - {file = "Flask-2.2.3.tar.gz", hash = "sha256:7eb373984bf1c770023fce9db164ed0c3353cd0b53f130f4693da0ca756a2e6d"}, + {file = "Flask-2.3.1-py3-none-any.whl", hash = "sha256:8ba2a854608fdd603b67dccd4514a46450132227fb9df40127a8d0c1de8769ec"}, + {file = "Flask-2.3.1.tar.gz", hash = "sha256:a6059db4297106e5a64b3215fa16ae641822c1cb97ecb498573549b2478602cb"}, ] [package.dependencies] -click = ">=8.0" +blinker = ">=1.6.2" +click = ">=8.1.3" importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} -itsdangerous = ">=2.0" -Jinja2 = ">=3.0" -Werkzeug = ">=2.2.2" +itsdangerous = ">=2.1.2" +Jinja2 = ">=3.1.2" +Werkzeug = ">=2.3.0" [package.extras] async = ["asgiref (>=3.2)"] @@ -912,6 +914,42 @@ ufo = ["fs (>=2.2.0,<3)"] unicode = ["unicodedata2 (>=15.0.0)"] woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] +[[package]] +name = "fsspec" +version = "2023.4.0" +description = "File-system specification" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fsspec-2023.4.0-py3-none-any.whl", hash = "sha256:f398de9b49b14e9d84d2c2d11b7b67121bc072fe97b930c4e5668ac3917d8307"}, + {file = "fsspec-2023.4.0.tar.gz", hash = "sha256:bf064186cd8808f0b2f6517273339ba0a0c8fb1b7048991c28bc67f58b8b67cd"}, +] + +[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +devel = ["pytest", "pytest-cov"] +dropbox = ["dropbox", "dropboxdrivefs", "requests"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"] +libarchive = ["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +tqdm = ["tqdm"] + [[package]] name = "ghp-import" version = "2.1.0" @@ -932,61 +970,61 @@ dev = ["flake8", "markdown", "twine", "wheel"] [[package]] name = "grpcio" -version = "1.53.0" +version = "1.54.0" description = "HTTP/2-based RPC framework" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "grpcio-1.53.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:752d2949b40e12e6ad3ed8cc552a65b54d226504f6b1fb67cab2ccee502cc06f"}, - {file = "grpcio-1.53.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:8a48fd3a7222be226bb86b7b413ad248f17f3101a524018cdc4562eeae1eb2a3"}, - {file = "grpcio-1.53.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:f3e837d29f0e1b9d6e7b29d569e2e9b0da61889e41879832ea15569c251c303a"}, - {file = "grpcio-1.53.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aef7d30242409c3aa5839b501e877e453a2c8d3759ca8230dd5a21cda029f046"}, - {file = "grpcio-1.53.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6f90698b5d1c5dd7b3236cd1fa959d7b80e17923f918d5be020b65f1c78b173"}, - {file = "grpcio-1.53.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a96c3c7f564b263c5d7c0e49a337166c8611e89c4c919f66dba7b9a84abad137"}, - {file = "grpcio-1.53.0-cp310-cp310-musllinux_1_1_x86_64.whl", 
hash = "sha256:ee81349411648d1abc94095c68cd25e3c2812e4e0367f9a9355be1e804a5135c"}, - {file = "grpcio-1.53.0-cp310-cp310-win32.whl", hash = "sha256:fdc6191587de410a184550d4143e2b24a14df495c86ca15e59508710681690ac"}, - {file = "grpcio-1.53.0-cp310-cp310-win_amd64.whl", hash = "sha256:658ffe1e39171be00490db5bd3b966f79634ac4215a1eb9a85c6cd6783bf7f6e"}, - {file = "grpcio-1.53.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:1b172e6d497191940c4b8d75b53de82dc252e15b61de2951d577ec5b43316b29"}, - {file = "grpcio-1.53.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:82434ba3a5935e47908bc861ce1ebc43c2edfc1001d235d6e31e5d3ed55815f7"}, - {file = "grpcio-1.53.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:1c734a2d4843e4e14ececf5600c3c4750990ec319e1299db7e4f0d02c25c1467"}, - {file = "grpcio-1.53.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6a2ead3de3b2d53119d473aa2f224030257ef33af1e4ddabd4afee1dea5f04c"}, - {file = "grpcio-1.53.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a34d6e905f071f9b945cabbcc776e2055de1fdb59cd13683d9aa0a8f265b5bf9"}, - {file = "grpcio-1.53.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eaf8e3b97caaf9415227a3c6ca5aa8d800fecadd526538d2bf8f11af783f1550"}, - {file = "grpcio-1.53.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:da95778d37be8e4e9afca771a83424f892296f5dfb2a100eda2571a1d8bbc0dc"}, - {file = "grpcio-1.53.0-cp311-cp311-win32.whl", hash = "sha256:e4f513d63df6336fd84b74b701f17d1bb3b64e9d78a6ed5b5e8a198bbbe8bbfa"}, - {file = "grpcio-1.53.0-cp311-cp311-win_amd64.whl", hash = "sha256:ddb2511fbbb440ed9e5c9a4b9b870f2ed649b7715859fd6f2ebc585ee85c0364"}, - {file = "grpcio-1.53.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:2a912397eb8d23c177d6d64e3c8bc46b8a1c7680b090d9f13a640b104aaec77c"}, - {file = "grpcio-1.53.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:55930c56b8f5b347d6c8c609cc341949a97e176c90f5cbb01d148d778f3bbd23"}, - {file = "grpcio-1.53.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:6601d812105583948ab9c6e403a7e2dba6e387cc678c010e74f2d6d589d1d1b3"}, - {file = "grpcio-1.53.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c705e0c21acb0e8478a00e7e773ad0ecdb34bd0e4adc282d3d2f51ba3961aac7"}, - {file = "grpcio-1.53.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba074af9ca268ad7b05d3fc2b920b5fb3c083da94ab63637aaf67f4f71ecb755"}, - {file = "grpcio-1.53.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:14817de09317dd7d3fbc8272864288320739973ef0f4b56bf2c0032349da8cdf"}, - {file = "grpcio-1.53.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c7ad9fbedb93f331c2e9054e202e95cf825b885811f1bcbbdfdc301e451442db"}, - {file = "grpcio-1.53.0-cp37-cp37m-win_amd64.whl", hash = "sha256:dad5b302a4c21c604d88a5d441973f320134e6ff6a84ecef9c1139e5ffd466f6"}, - {file = "grpcio-1.53.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:fa8eaac75d3107e3f5465f2c9e3bbd13db21790c6e45b7de1756eba16b050aca"}, - {file = "grpcio-1.53.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:104a2210edd3776c38448b4f76c2f16e527adafbde171fc72a8a32976c20abc7"}, - {file = "grpcio-1.53.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:dbc1ba968639c1d23476f75c356e549e7bbf2d8d6688717dcab5290e88e8482b"}, - {file = "grpcio-1.53.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:95952d3fe795b06af29bb8ec7bbf3342cdd867fc17b77cc25e6733d23fa6c519"}, - {file = "grpcio-1.53.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:f144a790f14c51b8a8e591eb5af40507ffee45ea6b818c2482f0457fec2e1a2e"}, - {file = "grpcio-1.53.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0698c094688a2dd4c7c2f2c0e3e142cac439a64d1cef6904c97f6cde38ba422f"}, - {file = "grpcio-1.53.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6b6d60b0958be711bab047e9f4df5dbbc40367955f8651232bfdcdd21450b9ab"}, - {file = "grpcio-1.53.0-cp38-cp38-win32.whl", hash = "sha256:1948539ce78805d4e6256ab0e048ec793956d54787dc9d6777df71c1d19c7f81"}, - {file = "grpcio-1.53.0-cp38-cp38-win_amd64.whl", hash = "sha256:df9ba1183b3f649210788cf80c239041dddcb375d6142d8bccafcfdf549522cd"}, - {file = "grpcio-1.53.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:19caa5b7282a89b799e63776ff602bb39604f7ca98db6df27e2de06756ae86c3"}, - {file = "grpcio-1.53.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:b5bd026ac928c96cc23149e6ef79183125542062eb6d1ccec34c0a37e02255e7"}, - {file = "grpcio-1.53.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:7dc8584ca6c015ad82e186e82f4c0fe977394588f66b8ecfc4ec873285314619"}, - {file = "grpcio-1.53.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2eddaae8af625e45b5c8500dcca1043264d751a6872cde2eda5022df8a336959"}, - {file = "grpcio-1.53.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5fb6f3d7824696c1c9f2ad36ddb080ba5a86f2d929ef712d511b4d9972d3d27"}, - {file = "grpcio-1.53.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8270d1dc2c98ab57e6dbf36fa187db8df4c036f04a398e5d5e25b4e01a766d70"}, - {file = "grpcio-1.53.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:976a7f24eb213e8429cab78d5e120500dfcdeb01041f1f5a77b17b9101902615"}, - {file = "grpcio-1.53.0-cp39-cp39-win32.whl", hash = "sha256:9c84a481451e7174f3a764a44150f93b041ab51045aa33d7b5b68b6979114e48"}, - {file = "grpcio-1.53.0-cp39-cp39-win_amd64.whl", hash = "sha256:6beb84f83360ff29a3654f43f251ec11b809dcb5524b698d711550243debd289"}, - {file = "grpcio-1.53.0.tar.gz", hash = "sha256:a4952899b4931a6ba12951f9a141ef3e74ff8a6ec9aa2dc602afa40f63595e33"}, + {file = "grpcio-1.54.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:a947d5298a0bbdd4d15671024bf33e2b7da79a70de600ed29ba7e0fef0539ebb"}, + {file = "grpcio-1.54.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:e355ee9da9c1c03f174efea59292b17a95e0b7b4d7d2a389265f731a9887d5a9"}, + {file = "grpcio-1.54.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:73c238ef6e4b64272df7eec976bb016c73d3ab5a6c7e9cd906ab700523d312f3"}, + {file = "grpcio-1.54.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c59d899ee7160638613a452f9a4931de22623e7ba17897d8e3e348c2e9d8d0b"}, + {file = "grpcio-1.54.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48cb7af77238ba16c77879009003f6b22c23425e5ee59cb2c4c103ec040638a5"}, + {file = "grpcio-1.54.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2262bd3512ba9e9f0e91d287393df6f33c18999317de45629b7bd46c40f16ba9"}, + {file = "grpcio-1.54.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:224166f06ccdaf884bf35690bf4272997c1405de3035d61384ccb5b25a4c1ca8"}, + {file = "grpcio-1.54.0-cp310-cp310-win32.whl", hash = "sha256:ed36e854449ff6c2f8ee145f94851fe171298e1e793f44d4f672c4a0d78064e7"}, + {file = "grpcio-1.54.0-cp310-cp310-win_amd64.whl", hash = "sha256:27fb030a4589d2536daec5ff5ba2a128f4f155149efab578fe2de2cb21596d3d"}, + {file = "grpcio-1.54.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:f4a7dca8ccd8023d916b900aa3c626f1bd181bd5b70159479b142f957ff420e4"}, + {file = 
"grpcio-1.54.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:1209d6b002b26e939e4c8ea37a3d5b4028eb9555394ea69fb1adbd4b61a10bb8"}, + {file = "grpcio-1.54.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:860fcd6db7dce80d0a673a1cc898ce6bc3d4783d195bbe0e911bf8a62c93ff3f"}, + {file = "grpcio-1.54.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3930669c9e6f08a2eed824738c3d5699d11cd47a0ecc13b68ed11595710b1133"}, + {file = "grpcio-1.54.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62117486460c83acd3b5d85c12edd5fe20a374630475388cfc89829831d3eb79"}, + {file = "grpcio-1.54.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e3e526062c690517b42bba66ffe38aaf8bc99a180a78212e7b22baa86902f690"}, + {file = "grpcio-1.54.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ebff0738be0499d7db74d20dca9f22a7b27deae31e1bf92ea44924fd69eb6251"}, + {file = "grpcio-1.54.0-cp311-cp311-win32.whl", hash = "sha256:21c4a1aae861748d6393a3ff7867473996c139a77f90326d9f4104bebb22d8b8"}, + {file = "grpcio-1.54.0-cp311-cp311-win_amd64.whl", hash = "sha256:3db71c6f1ab688d8dfc102271cedc9828beac335a3a4372ec54b8bf11b43fd29"}, + {file = "grpcio-1.54.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:960b176e0bb2b4afeaa1cd2002db1e82ae54c9b6e27ea93570a42316524e77cf"}, + {file = "grpcio-1.54.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:d8ae6e0df3a608e99ee1acafaafd7db0830106394d54571c1ece57f650124ce9"}, + {file = "grpcio-1.54.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:c33744d0d1a7322da445c0fe726ea6d4e3ef2dfb0539eadf23dce366f52f546c"}, + {file = "grpcio-1.54.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d109df30641d050e009105f9c9ca5a35d01e34d2ee2a4e9c0984d392fd6d704"}, + {file = "grpcio-1.54.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:775a2f70501370e5ba54e1ee3464413bff9bd85bd9a0b25c989698c44a6fb52f"}, + {file = "grpcio-1.54.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c55a9cf5cba80fb88c850915c865b8ed78d5e46e1f2ec1b27692f3eaaf0dca7e"}, + {file = "grpcio-1.54.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1fa7d6ddd33abbd3c8b3d7d07c56c40ea3d1891ce3cd2aa9fa73105ed5331866"}, + {file = "grpcio-1.54.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ed3d458ded32ff3a58f157b60cc140c88f7ac8c506a1c567b2a9ee8a2fd2ce54"}, + {file = "grpcio-1.54.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:5942a3e05630e1ef5b7b5752e5da6582460a2e4431dae603de89fc45f9ec5aa9"}, + {file = "grpcio-1.54.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:125ed35aa3868efa82eabffece6264bf638cfdc9f0cd58ddb17936684aafd0f8"}, + {file = "grpcio-1.54.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:b7655f809e3420f80ce3bf89737169a9dce73238af594049754a1128132c0da4"}, + {file = "grpcio-1.54.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87f47bf9520bba4083d65ab911f8f4c0ac3efa8241993edd74c8dd08ae87552f"}, + {file = "grpcio-1.54.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16bca8092dd994f2864fdab278ae052fad4913f36f35238b2dd11af2d55a87db"}, + {file = "grpcio-1.54.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d2f62fb1c914a038921677cfa536d645cb80e3dd07dc4859a3c92d75407b90a5"}, + {file = "grpcio-1.54.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a7caf553ccaf715ec05b28c9b2ab2ee3fdb4036626d779aa09cf7cbf54b71445"}, + {file = "grpcio-1.54.0-cp38-cp38-win32.whl", hash = "sha256:2585b3c294631a39b33f9f967a59b0fad23b1a71a212eba6bc1e3ca6e6eec9ee"}, 
+ {file = "grpcio-1.54.0-cp38-cp38-win_amd64.whl", hash = "sha256:3b170e441e91e4f321e46d3cc95a01cb307a4596da54aca59eb78ab0fc03754d"}, + {file = "grpcio-1.54.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:1382bc499af92901c2240c4d540c74eae8a671e4fe9839bfeefdfcc3a106b5e2"}, + {file = "grpcio-1.54.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:031bbd26656e0739e4b2c81c172155fb26e274b8d0312d67aefc730bcba915b6"}, + {file = "grpcio-1.54.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:a97b0d01ae595c997c1d9d8249e2d2da829c2d8a4bdc29bb8f76c11a94915c9a"}, + {file = "grpcio-1.54.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:533eaf5b2a79a3c6f35cbd6a095ae99cac7f4f9c0e08bdcf86c130efd3c32adf"}, + {file = "grpcio-1.54.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49eace8ea55fbc42c733defbda1e4feb6d3844ecd875b01bb8b923709e0f5ec8"}, + {file = "grpcio-1.54.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:30fbbce11ffeb4f9f91c13fe04899aaf3e9a81708bedf267bf447596b95df26b"}, + {file = "grpcio-1.54.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:650f5f2c9ab1275b4006707411bb6d6bc927886874a287661c3c6f332d4c068b"}, + {file = "grpcio-1.54.0-cp39-cp39-win32.whl", hash = "sha256:02000b005bc8b72ff50c477b6431e8886b29961159e8b8d03c00b3dd9139baed"}, + {file = "grpcio-1.54.0-cp39-cp39-win_amd64.whl", hash = "sha256:6dc1e2c9ac292c9a484ef900c568ccb2d6b4dfe26dfa0163d5bc815bb836c78d"}, + {file = "grpcio-1.54.0.tar.gz", hash = "sha256:eb0807323572642ab73fd86fe53d88d843ce617dd1ddf430351ad0759809a0ae"}, ] [package.extras] -protobuf = ["grpcio-tools (>=1.53.0)"] +protobuf = ["grpcio-tools (>=1.54.0)"] [[package]] name = "gunicorn" @@ -1011,18 +1049,19 @@ tornado = ["tornado (>=0.2)"] [[package]] name = "huggingface-hub" -version = "0.13.4" +version = "0.14.1" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" category = "main" optional = false python-versions = ">=3.7.0" files = [ - {file = "huggingface_hub-0.13.4-py3-none-any.whl", hash = "sha256:4d3d40593de6673d624a4baaaf249b9bf5165bfcafd1ad58de361931f0b4fda5"}, - {file = "huggingface_hub-0.13.4.tar.gz", hash = "sha256:db83d9c2f76aed8cf49893ffadd6be24e82074da2f64b1d36b8ba40eb255e115"}, + {file = "huggingface_hub-0.14.1-py3-none-any.whl", hash = "sha256:9fc619170d800ff3793ad37c9757c255c8783051e1b5b00501205eb43ccc4f27"}, + {file = "huggingface_hub-0.14.1.tar.gz", hash = "sha256:9ab899af8e10922eac65e290d60ab956882ab0bf643e3d990b1394b6b47b7fbc"}, ] [package.dependencies] filelock = "*" +fsspec = "*" packaging = ">=20.9" pyyaml = ">=5.1" requests = "*" @@ -1030,26 +1069,26 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", 
"types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"] tensorflow = ["graphviz", "pydot", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "jedi", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "soundfile"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "gradio", "jedi", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "soundfile"] torch = ["torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] [[package]] name = "identify" -version = "2.5.22" +version = "2.5.23" description = "File identification library for Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "identify-2.5.22-py2.py3-none-any.whl", hash = "sha256:f0faad595a4687053669c112004178149f6c326db71ee999ae4636685753ad2f"}, - {file = "identify-2.5.22.tar.gz", hash = "sha256:f7a93d6cf98e29bd07663c60728e7a4057615068d7a639d132dc883b2d54d31e"}, + {file = "identify-2.5.23-py2.py3-none-any.whl", hash = "sha256:17d9351c028a781456965e781ed2a435755cac655df1ebd930f7186b54399312"}, + {file = "identify-2.5.23.tar.gz", hash = "sha256:50b01b9d5f73c6b53e5fa2caf9f543d3e657a9d0bbdeb203ebb8d45960ba7433"}, ] [package.extras] @@ -1069,14 +1108,14 @@ files = [ [[package]] name = "importlib-metadata" -version = "6.3.0" +version = "6.6.0" description = "Read metadata from Python packages" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "importlib_metadata-6.3.0-py3-none-any.whl", hash = "sha256:8f8bd2af397cf33bd344d35cfe7f489219b7d14fc79a3f854b75b8417e9226b0"}, - {file = "importlib_metadata-6.3.0.tar.gz", hash = "sha256:23c2bcae4762dfb0bbe072d358faec24957901d75b6c4ab11172c0c982532402"}, + {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"}, + {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"}, ] [package.dependencies] @@ -1815,14 +1854,14 @@ files = [ [[package]] name = "packaging" -version = "23.0" +version = "23.1" description = "Core utilities for Python packages" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] [[package]] @@ -1960,19 +1999,19 @@ files = [ [[package]] name = "platformdirs" -version = "3.2.0" +version = "3.4.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.2.0-py3-none-any.whl", hash = "sha256:ebe11c0d7a805086e99506aa331612429a72ca7cd52a1f0d277dc4adc20cb10e"}, - {file = "platformdirs-3.2.0.tar.gz", hash = "sha256:d5b638ca397f25f979350ff789db335903d7ea010ab28903f57b27e1b16c2b08"}, + {file = "platformdirs-3.4.0-py3-none-any.whl", hash = "sha256:01437886022decaf285d8972f9526397bfae2ac55480ed372ed6d9eca048870a"}, + {file = "platformdirs-3.4.0.tar.gz", hash = "sha256:a5e1536e5ea4b1c238a1364da17ff2993d5bd28e15600c2c8224008aff6bbcad"}, ] [package.extras] -docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.2.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] +docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] [[package]] name = "pluggy" @@ -2270,14 +2309,14 @@ files = [ [[package]] name = "pygments" -version = "2.14.0" +version = "2.15.1" description = "Pygments is a syntax highlighting package written in Python." category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "Pygments-2.14.0-py3-none-any.whl", hash = "sha256:fa7bd7bd2771287c0de303af8bfdfc731f51bd2c6a47ab69d117138893b82717"}, - {file = "Pygments-2.14.0.tar.gz", hash = "sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297"}, + {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, + {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, ] [package.extras] @@ -2944,14 +2983,14 @@ tornado = ["tornado (>=5)"] [[package]] name = "setuptools" -version = "67.6.1" +version = "67.7.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "setuptools-67.6.1-py3-none-any.whl", hash = "sha256:e728ca814a823bf7bf60162daf9db95b93d532948c4c0bea762ce62f60189078"}, - {file = "setuptools-67.6.1.tar.gz", hash = "sha256:257de92a9d50a60b8e22abfcbb771571fde0dbf3ec234463212027a4eeecbe9a"}, + {file = "setuptools-67.7.2-py3-none-any.whl", hash = "sha256:23aaf86b85ca52ceb801d32703f12d77517b2556af839621c641fca11287952b"}, + {file = "setuptools-67.7.2.tar.gz", hash = "sha256:f104fa03692a2602fa0fec6c6a9e63b6c8a968de13e17c026957dd1f53d80990"}, ] [package.extras] @@ -3431,15 +3470,14 @@ vision = ["Pillow"] [[package]] name = "tritonclient" -version = "2.32.0" +version = "2.33.0" description = "Python client library and utilities for communicating with Triton Inference Server" category = "main" optional = false python-versions = "*" files = [ - {file = "tritonclient-2.32.0-py3-none-any.whl", hash = "sha256:2a9077a0f1424ef521ee89a519cdbe29892a058068b195641e83ba97817f150d"}, - {file = "tritonclient-2.32.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:0431006258c0cca6ecb840d0af5b97bd40e13ec3224005dcd31da14f9d396421"}, - {file = "tritonclient-2.32.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8a1476c64c07f8873a65025bfd14d40ffc153e0966bb2b361939ad4eceec0a6a"}, + {file = "tritonclient-2.33.0-py3-none-any.whl", hash = "sha256:8fd7db59c76a6e3e4506e682a3d5ba549685b70baf7c7ff560701852774ba0f5"}, + {file = 
"tritonclient-2.33.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:92c162dac8ed25724bc80d8d8cc6dd77d8518cd6d8fa0903dae75261a609c24c"}, ] [package.dependencies] @@ -3525,14 +3563,14 @@ files = [ [[package]] name = "types-pyopenssl" -version = "23.1.0.1" +version = "23.1.0.2" description = "Typing stubs for pyOpenSSL" category = "dev" optional = false python-versions = "*" files = [ - {file = "types-pyOpenSSL-23.1.0.1.tar.gz", hash = "sha256:59044283c475eaa5a29b36a903c123d52bdf4a7e012f0a1ca0e41115b99216da"}, - {file = "types_pyOpenSSL-23.1.0.1-py3-none-any.whl", hash = "sha256:ac7fbc240930c2f9a1cbd2d04f9cb14ad0f15b0ad8d6528732a83747b1b2086e"}, + {file = "types-pyOpenSSL-23.1.0.2.tar.gz", hash = "sha256:20b80971b86240e8432a1832bd8124cea49c3088c7bfc77dfd23be27ffe4a517"}, + {file = "types_pyOpenSSL-23.1.0.2-py3-none-any.whl", hash = "sha256:b050641aeff6dfebf231ad719bdac12d53b8ee818d4afb67b886333484629957"}, ] [package.dependencies] @@ -3610,14 +3648,14 @@ files = [ [[package]] name = "types-urllib3" -version = "1.26.25.10" +version = "1.26.25.11" description = "Typing stubs for urllib3" category = "dev" optional = false python-versions = "*" files = [ - {file = "types-urllib3-1.26.25.10.tar.gz", hash = "sha256:c44881cde9fc8256d05ad6b21f50c4681eb20092552351570ab0a8a0653286d6"}, - {file = "types_urllib3-1.26.25.10-py3-none-any.whl", hash = "sha256:12c744609d588340a07e45d333bf870069fc8793bcf96bae7a96d4712a42591d"}, + {file = "types-urllib3-1.26.25.11.tar.gz", hash = "sha256:697102ddf4f781eed6f692353f40cee1098643526f5a8b99f49d2ede90fd3754"}, + {file = "types_urllib3-1.26.25.11-py3-none-any.whl", hash = "sha256:04235e792139cf3624b25d38faab593456738fbdb7439634046172e3b1339400"}, ] [[package]] @@ -3682,24 +3720,24 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "virtualenv" -version = "20.21.0" +version = "20.22.0" description = "Virtual Python Environment builder" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.21.0-py3-none-any.whl", hash = "sha256:31712f8f2a17bd06234fa97fdf19609e789dd4e3e4bf108c3da71d710651adbc"}, - {file = "virtualenv-20.21.0.tar.gz", hash = "sha256:f50e3e60f990a0757c9b68333c9fdaa72d7188caa417f96af9e52407831a3b68"}, + {file = "virtualenv-20.22.0-py3-none-any.whl", hash = "sha256:48fd3b907b5149c5aab7c23d9790bea4cac6bc6b150af8635febc4cfeab1275a"}, + {file = "virtualenv-20.22.0.tar.gz", hash = "sha256:278753c47aaef1a0f14e6db8a4c5e1e040e90aea654d0fc1dc7e0d8a42616cc3"}, ] [package.dependencies] distlib = ">=0.3.6,<1" -filelock = ">=3.4.1,<4" -platformdirs = ">=2.4,<4" +filelock = ">=3.11,<4" +platformdirs = ">=3.2,<4" [package.extras] -docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=22.12)"] -test = ["covdefaults (>=2.2.2)", "coverage (>=7.1)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23)", "pytest (>=7.2.1)", "pytest-env (>=0.8.1)", "pytest-freezegun (>=0.4.2)", "pytest-mock (>=3.10)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)"] +docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=22.12)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.3.1)", "pytest-env (>=0.8.1)", "pytest-freezegun (>=0.4.2)", "pytest-mock (>=3.10)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)"] [[package]] name = 
"wasabi" @@ -3770,21 +3808,21 @@ bracex = ">=2.1.1" [[package]] name = "werkzeug" -version = "2.2.3" +version = "2.3.0" description = "The comprehensive WSGI web application library." category = "dev" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"}, - {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"}, + {file = "Werkzeug-2.3.0-py3-none-any.whl", hash = "sha256:340335057f72974d9281dbaf52c8090a9f9a59ba304ae814bf0656e6559c0020"}, + {file = "Werkzeug-2.3.0.tar.gz", hash = "sha256:3b6b46926d052b8ebca97c4dc73c12e47bdd07d57ab0600c039c3155450227bc"}, ] [package.dependencies] MarkupSafe = ">=2.1.1" [package.extras] -watchdog = ["watchdog"] +watchdog = ["watchdog (>=2.3)"] [[package]] name = "zipp" @@ -3805,4 +3843,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "e6751e5126b0605f9ace1fbd2f85f277503d05516d92eeeae1bbf8d18d884ee7" +content-hash = "5e4c5c585943a94c95de49e5005fc1bcd1bb13e2149746384cb155aba1f9fa68" diff --git a/pyproject.toml b/pyproject.toml index 38f99d2326..4c3a941300 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,6 @@ elasticsearch = "~8.5.3" pymongo = "~3.12.0" spacy = "~3.4.1" dacite = "~1.6.0" -flashtext = "~2.7" langid = "~1.1.6" influxdb-client = "~1.34.0" jsonschema = "~4.4.0" diff --git a/robotoff/prediction/category/matcher.py b/robotoff/prediction/category/matcher.py index 9a786aa461..ff392fd1a6 100644 --- a/robotoff/prediction/category/matcher.py +++ b/robotoff/prediction/category/matcher.py @@ -5,14 +5,13 @@ import re from typing import Iterable, Optional -from flashtext import KeywordProcessor - from robotoff import settings from robotoff.products import ProductDataset from robotoff.taxonomy import TaxonomyType, get_taxonomy from robotoff.types import Prediction, PredictionType, ServerType from robotoff.utils import dump_json, get_logger, load_json from robotoff.utils.text import ( + KeywordProcessor, get_lemmatizing_nlp, strip_accents_v1, strip_consecutive_spaces, diff --git a/robotoff/prediction/category/neural/keras_category_classifier_3_0/preprocessing.py b/robotoff/prediction/category/neural/keras_category_classifier_3_0/preprocessing.py index 55a5bd25a1..9b5f7d7507 100644 --- a/robotoff/prediction/category/neural/keras_category_classifier_3_0/preprocessing.py +++ b/robotoff/prediction/category/neural/keras_category_classifier_3_0/preprocessing.py @@ -6,11 +6,11 @@ from typing import Optional import numpy as np -from flashtext import KeywordProcessor from robotoff import settings from robotoff.taxonomy import Taxonomy, fetch_taxonomy from robotoff.types import JSONType +from robotoff.utils.text import KeywordProcessor from .text_utils import fold, get_tag diff --git a/robotoff/prediction/ocr/brand.py b/robotoff/prediction/ocr/brand.py index 49b2599535..7d3da8f395 100644 --- a/robotoff/prediction/ocr/brand.py +++ b/robotoff/prediction/ocr/brand.py @@ -1,13 +1,11 @@ import functools from typing import Iterable, Optional, Union -from flashtext import KeywordProcessor - from robotoff import settings from robotoff.brands import get_brand_blacklist, keep_brand_from_taxonomy from robotoff.types import Prediction, PredictionType from robotoff.utils import get_logger, text_file_iter -from robotoff.utils.text import get_tag +from 
robotoff.utils.text import KeywordProcessor, get_tag from .dataclass import OCRResult, get_match_bounding_box, get_text from .utils import generate_keyword_processor diff --git a/robotoff/prediction/ocr/image_flag.py b/robotoff/prediction/ocr/image_flag.py index 127e2ccea9..a05e558356 100644 --- a/robotoff/prediction/ocr/image_flag.py +++ b/robotoff/prediction/ocr/image_flag.py @@ -1,11 +1,10 @@ import functools from typing import Optional, Union -from flashtext import KeywordProcessor - from robotoff import settings from robotoff.types import Prediction, PredictionType from robotoff.utils import text_file_iter +from robotoff.utils.text import KeywordProcessor from .dataclass import OCRResult, SafeSearchAnnotationLikelihood, get_text diff --git a/robotoff/prediction/ocr/label.py b/robotoff/prediction/ocr/label.py index 15f02fc723..ff81e196a9 100644 --- a/robotoff/prediction/ocr/label.py +++ b/robotoff/prediction/ocr/label.py @@ -2,11 +2,10 @@ import re from typing import Iterable, Optional, Union -from flashtext import KeywordProcessor - from robotoff import settings from robotoff.types import Prediction, PredictionType from robotoff.utils import get_logger, text_file_iter +from robotoff.utils.text import KeywordProcessor from .dataclass import OCRField, OCRRegex, OCRResult, get_match_bounding_box, get_text from .utils import generate_keyword_processor diff --git a/robotoff/prediction/ocr/location.py b/robotoff/prediction/ocr/location.py index 88b9eb66cd..09d9f94d4e 100644 --- a/robotoff/prediction/ocr/location.py +++ b/robotoff/prediction/ocr/location.py @@ -5,13 +5,11 @@ from pathlib import Path from typing import BinaryIO, Iterable, Optional, Union -from flashtext import KeywordProcessor - from robotoff import settings from robotoff.types import Prediction, PredictionType from robotoff.utils import get_logger from robotoff.utils.cache import CachedStore -from robotoff.utils.text import strip_accents_v1 +from robotoff.utils.text import KeywordProcessor, strip_accents_v1 from .dataclass import OCRResult diff --git a/robotoff/prediction/ocr/packager_code.py b/robotoff/prediction/ocr/packager_code.py index d2e046e535..84a088fa0e 100644 --- a/robotoff/prediction/ocr/packager_code.py +++ b/robotoff/prediction/ocr/packager_code.py @@ -1,12 +1,11 @@ import re from typing import Optional, Union -from flashtext import KeywordProcessor - from robotoff import settings from robotoff.types import Prediction, PredictionType from robotoff.utils import text_file_iter from robotoff.utils.cache import CachedStore +from robotoff.utils.text import KeywordProcessor from .dataclass import OCRField, OCRRegex, OCRResult, get_match_bounding_box, get_text from .utils import generate_keyword_processor diff --git a/robotoff/prediction/ocr/utils.py b/robotoff/prediction/ocr/utils.py index d1f91a2ea7..1ea7eaed19 100644 --- a/robotoff/prediction/ocr/utils.py +++ b/robotoff/prediction/ocr/utils.py @@ -1,6 +1,6 @@ from typing import Callable, Iterable, Optional -from flashtext import KeywordProcessor +from robotoff.utils.text import KeywordProcessor def generate_keyword_processor( diff --git a/robotoff/utils/text.py b/robotoff/utils/text/__init__.py similarity index 97% rename from robotoff/utils/text.py rename to robotoff/utils/text/__init__.py index ce5af52271..bfb48e98a1 100644 --- a/robotoff/utils/text.py +++ b/robotoff/utils/text/__init__.py @@ -6,6 +6,7 @@ from robotoff.utils import get_logger +from .flashtext import KeywordProcessor # noqa: F401 from .fold_to_ascii import fold, fold_without_insertion_deletion 
logger = get_logger(__name__) diff --git a/robotoff/utils/text/flashtext.py b/robotoff/utils/text/flashtext.py new file mode 100644 index 0000000000..4e6dee2f12 --- /dev/null +++ b/robotoff/utils/text/flashtext.py @@ -0,0 +1,759 @@ +"""Copied and adapted from https://github.com/vi3k6i5/flashtext (MIT-licensed). + +The flashtext library is no longer maintained, and we needed some bugs fixed +(especially https://github.com/vi3k6i5/flashtext/issues/119). +""" + + +import functools +import io +import os +import string +from pathlib import Path +from typing import Optional, Union + + +class KeywordProcessor: + """KeywordProcessor + + Attributes: + _keyword (str): Used as the key to store keywords in the trie dictionary. + Defaults to '_keyword_' + non_word_boundaries (set(str)): Characters that will determine if the word is continuing. + Defaults to set([A-Za-z0-9_]) + keyword_trie_dict (dict): Trie dict built character by character, used for lookups. + Defaults to empty dictionary + case_sensitive (bool): whether the search should be case sensitive. + Defaults to False + + Examples: + >>> # import module + >>> from robotoff.utils.text import KeywordProcessor + >>> # Create an object of KeywordProcessor + >>> keyword_processor = KeywordProcessor() + >>> # add keywords + >>> keyword_names = ['NY', 'new-york', 'SF'] + >>> clean_names = ['new york', 'new york', 'san francisco'] + >>> for keyword_name, clean_name in zip(keyword_names, clean_names): + >>> keyword_processor.add_keyword(keyword_name, clean_name) + >>> keywords_found = keyword_processor.extract_keywords('I love SF and NY. new-york is the best.') + >>> keywords_found + >>> ['san francisco', 'new york', 'new york'] + + Note: + * loosely based on the `Aho-Corasick algorithm <https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm>`_. + * the idea came from a Stack Overflow question about fast keyword replacement. + """ + + def __init__(self, case_sensitive: bool = False): + """ + Args: + case_sensitive (bool): whether keyword search should be case sensitive. + Defaults to False + """ + self._keyword = "_keyword_" + self._white_space_chars = set([".", "\t", "\n", "\a", " ", ","]) + self.non_word_boundaries = set(string.digits + string.ascii_letters + "_") + self.keyword_trie_dict = dict()  # type: ignore + self.case_sensitive = case_sensitive + self._terms_in_trie = 0 + + def __len__(self) -> int: + """Number of terms present in the keyword_trie_dict + + Returns: + length : int + Count of the number of distinct terms in the trie dictionary. + + """ + return self._terms_in_trie + + def __contains__(self, word: str) -> bool: + """Check if a word is present in the keyword_trie_dict + + Args: + word : string + word that you want to check + + Returns: + status : bool + True if the word is present as-is in keyword_trie_dict, else False + + Examples: + >>> keyword_processor.add_keyword('Big Apple') + >>> 'Big Apple' in keyword_processor + >>> # True + + """ + if not self.case_sensitive: + word = word.lower() + current_dict = self.keyword_trie_dict + len_covered = 0 + for char in word: + if char in current_dict: + current_dict = current_dict[char] + len_covered += 1 + else: + break + return self._keyword in current_dict and len_covered == len(word)
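+ + # Note: the trie is a plain nested dict keyed by single characters; the reserved + # `_keyword_` key marks a terminal node and holds the clean name. For example, + # add_keyword('NY', 'new york') stores (lowercased, since case_sensitive defaults + # to False): {'n': {'y': {'_keyword_': 'new york'}}}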
+ + def __getitem__(self, word: str) -> Optional[str]: + """If the word is present in keyword_trie_dict, return the clean name for it. + + Args: + word : string + word that you want to check + + Returns: + keyword : string + If the word is present as-is in keyword_trie_dict, we return the keyword mapped to it. + + Examples: + >>> keyword_processor.add_keyword('Big Apple', 'New York') + >>> keyword_processor['Big Apple'] + >>> # New York + """ + if not self.case_sensitive: + word = word.lower() + current_dict = self.keyword_trie_dict + len_covered = 0 + for char in word: + if char in current_dict: + current_dict = current_dict[char] + len_covered += 1 + else: + break + if self._keyword in current_dict and len_covered == len(word): + return current_dict[self._keyword] + + return None + + def __setitem__(self, keyword: str, clean_name: Optional[str] = None) -> bool: + """Add a keyword to the dictionary; + pass the keyword and the clean name it maps to. + + Args: + keyword : string + keyword that you want to identify + + clean_name : string + clean term for that keyword that you would want to get back in return or replace + if not provided, the keyword will be used as the clean name as well. + + Examples: + >>> keyword_processor['Big Apple'] = 'New York' + """ + status = False + if not clean_name and keyword: + clean_name = keyword + + if keyword and clean_name: + if not self.case_sensitive: + keyword = keyword.lower() + current_dict = self.keyword_trie_dict + for letter in keyword: + current_dict = current_dict.setdefault(letter, {}) + if self._keyword not in current_dict: + status = True + self._terms_in_trie += 1 + current_dict[self._keyword] = clean_name + return status + + def __delitem__(self, keyword: str) -> bool: + """Remove a keyword from the dictionary; + pass the keyword you want to remove. + + Args: + keyword : string + keyword that you want to remove if it's present + + Examples: + >>> keyword_processor.add_keyword('Big Apple') + >>> del keyword_processor['Big Apple'] + """ + status = False + if keyword: + if not self.case_sensitive: + keyword = keyword.lower() + current_dict = self.keyword_trie_dict + character_trie_list = [] + for letter in keyword: + if letter in current_dict: + character_trie_list.append((letter, current_dict)) + current_dict = current_dict[letter] + else: + # if the character is not found, break out of the loop + current_dict = None  # type: ignore + break + # remove the characters from the trie dict if there are no other keywords with them + if current_dict and self._keyword in current_dict: + # we found a complete match for the input keyword. + character_trie_list.append((self._keyword, current_dict)) + character_trie_list.reverse() + + for key_to_remove, dict_pointer in character_trie_list: + if len(dict_pointer.keys()) == 1: + dict_pointer.pop(key_to_remove) + else: + # more than one key means more than one path; + # delete the path that is no longer required and keep the other + dict_pointer.pop(key_to_remove) + break + # successfully removed keyword + status = True + self._terms_in_trie -= 1 + return status + + def __iter__(self): + """Disabled iteration as get_all_keywords() is the right way to iterate""" + raise NotImplementedError("Please use get_all_keywords() instead") + + def set_non_word_boundaries(self, non_word_boundaries: set[str]) -> None: + """Set the characters that will be considered part of a word. + + Args: + non_word_boundaries (set(str)): + Set of characters that will be considered as part of a word. + + """ + self.non_word_boundaries = non_word_boundaries + + def add_non_word_boundary(self, character: str) -> None: + """Add a character that will be considered part of a word. + + Args: + character (char): + Character that will be considered as part of a word. + + """ + self.non_word_boundaries.add(character)
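+ + # For instance, making '-' part of words prevents matches ending at hyphens: with + # the default boundaries the keyword 'beta' would match inside 'beta-carotene' + # (illustrative keywords), but not after this call: + # processor = KeywordProcessor() + # processor.add_non_word_boundary('-')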
+ + """ + self.non_word_boundaries.add(character) + + def add_keyword(self, keyword: str, clean_name: Optional[str] = None) -> bool: + """To add one or more keywords to the dictionary + pass the keyword and the clean name it maps to. + + Args: + keyword : string + keyword that you want to identify + + clean_name : string + clean term for that keyword that you would want to get back in return or replace + if not provided, keyword will be used as the clean name also. + + Returns: + status : bool + The return value. True for success, False otherwise. + + Examples: + >>> keyword_processor.add_keyword('Big Apple', 'New York') + >>> # This case 'Big Apple' will return 'New York' + >>> # OR + >>> keyword_processor.add_keyword('Big Apple') + >>> # This case 'Big Apple' will return 'Big Apple' + """ + return self.__setitem__(keyword, clean_name) + + def remove_keyword(self, keyword: str) -> bool: + """To remove one or more keywords from the dictionary + pass the keyword and the clean name it maps to. + + Args: + keyword : string + keyword that you want to remove if it's present + + Returns: + status : bool + The return value. True for success, False otherwise. + + Examples: + >>> keyword_processor.add_keyword('Big Apple') + >>> keyword_processor.remove_keyword('Big Apple') + >>> # Returns True + >>> # This case 'Big Apple' will no longer be a recognized keyword + >>> keyword_processor.remove_keyword('Big Apple') + >>> # Returns False + + """ + return self.__delitem__(keyword) + + def get_keyword(self, word: str) -> Optional[str]: + """if word is present in keyword_trie_dict return the clean name for it. + + Args: + word : string + word that you want to check + + Returns: + keyword : string + If word is present as it is in keyword_trie_dict then we return keyword mapped to it. + + Examples: + >>> keyword_processor.add_keyword('Big Apple', 'New York') + >>> keyword_processor.get('Big Apple') + >>> # New York + """ + return self.__getitem__(word) + + def add_keyword_from_file( + self, keyword_file: Union[Path, str], encoding: str = "utf-8" + ) -> None: + """To add keywords from a file + + Args: + keyword_file : path to keywords file + encoding : specify the encoding of the file + + Examples: + keywords file format can be like: + + >>> # Option 1: keywords.txt content + >>> # java_2e=>java + >>> # java programing=>java + >>> # product management=>product management + >>> # product management techniques=>product management + + >>> # Option 2: keywords.txt content + >>> # java + >>> # python + >>> # c++ + + >>> keyword_processor.add_keyword_from_file('keywords.txt') + + Raises: + IOError: If `keyword_file` path is not valid + + """ + if not os.path.isfile(keyword_file): + raise IOError("Invalid file path {}".format(keyword_file)) + with io.open(keyword_file, encoding=encoding) as f: + for line in f: + if "=>" in line: + keyword, clean_name = line.split("=>") + self.add_keyword(keyword, clean_name.strip()) + else: + keyword = line.strip() + self.add_keyword(keyword) + + def add_keywords_from_dict(self, keyword_dict: dict[str, str]) -> None: + """To add keywords from a dictionary + + Args: + keyword_dict (dict): A dictionary with `str` key and (list `str`) as value + + Examples: + >>> keyword_dict = { + "java": ["java_2e", "java programing"], + "product management": ["PM", "product manager"] + } + >>> keyword_processor.add_keywords_from_dict(keyword_dict) + + Raises: + AttributeError: If value for a key in `keyword_dict` is not a list. 
+
+        """
+        for clean_name, keywords in keyword_dict.items():
+            if not isinstance(keywords, list):
+                raise AttributeError(
+                    "Value of key {} should be a list".format(clean_name)
+                )
+
+            for keyword in keywords:
+                self.add_keyword(keyword, clean_name)
+
+    def remove_keywords_from_dict(self, keyword_dict: dict[str, list[str]]) -> None:
+        """Remove keywords from a dictionary.
+
+        Args:
+            keyword_dict (dict): A dictionary with `str` keys and `list[str]` values
+
+        Examples:
+            >>> keyword_dict = {
+                    "java": ["java_2e", "java programing"],
+                    "product management": ["PM", "product manager"]
+                }
+            >>> keyword_processor.remove_keywords_from_dict(keyword_dict)
+
+        Raises:
+            AttributeError: If value for a key in `keyword_dict` is not a list.
+
+        """
+        for clean_name, keywords in keyword_dict.items():
+            if not isinstance(keywords, list):
+                raise AttributeError(
+                    "Value of key {} should be a list".format(clean_name)
+                )
+
+            for keyword in keywords:
+                self.remove_keyword(keyword)
+
+    def add_keywords_from_list(self, keyword_list: list[str]) -> None:
+        """Add keywords from a list.
+
+        Args:
+            keyword_list (list(str)): List of keywords to add
+
+        Examples:
+            >>> keyword_processor.add_keywords_from_list(["java", "python"])
+
+        Raises:
+            AttributeError: If `keyword_list` is not a list.
+
+        """
+        if not isinstance(keyword_list, list):
+            raise AttributeError("keyword_list should be a list")
+
+        for keyword in keyword_list:
+            self.add_keyword(keyword)
+
+    def remove_keywords_from_list(self, keyword_list: list[str]) -> None:
+        """Remove the keywords present in the list.
+
+        Args:
+            keyword_list (list(str)): List of keywords to remove
+
+        Examples:
+            >>> keyword_processor.remove_keywords_from_list(["java", "python"])
+
+        Raises:
+            AttributeError: If `keyword_list` is not a list.
+
+        """
+        if not isinstance(keyword_list, list):
+            raise AttributeError("keyword_list should be a list")
+
+        for keyword in keyword_list:
+            self.remove_keyword(keyword)
+
+    def get_all_keywords(
+        self, term_so_far: str = "", current_dict: Optional[dict] = None
+    ) -> dict:
+        """Recursively build a dictionary of the keywords present in the trie,
+        mapped to the clean names associated with those keywords.
+
+        Args:
+            term_so_far : string
+                term built so far by adding all previous characters
+            current_dict : dict
+                current recursive position in the dictionary
+
+        Returns:
+            terms_present : dict
+                A map where each key is a term present in keyword_trie_dict
+                and the value is the clean name mapped to it.
+
+        Examples:
+            >>> keyword_processor = KeywordProcessor()
+            >>> keyword_processor.add_keyword('j2ee', 'Java')
+            >>> keyword_processor.add_keyword('Python', 'Python')
+            >>> keyword_processor.get_all_keywords()
+            >>> {'j2ee': 'Java', 'python': 'Python'}
+            >>> # NOTE: if case_insensitive, all keys are lowercased.
+        """
+        terms_present = {}
+        if not term_so_far:
+            term_so_far = ""
+        if current_dict is None:
+            current_dict = self.keyword_trie_dict
+        for key in current_dict:
+            if key == self._keyword:
+                terms_present[term_so_far] = current_dict[key]
+            else:
+                sub_values = self.get_all_keywords(term_so_far + key, current_dict[key])
+                for sub_key in sub_values:
+                    terms_present[sub_key] = sub_values[sub_key]
+        return terms_present
+
+    def extract_keywords(
+        self, sentence: str, span_info: bool = False, max_cost: int = 0
+    ) -> list[Union[str, tuple[str, int, int]]]:
+        """Search the string for all keywords present in the corpus.
+        Keywords found are added to the list `keywords_extracted` and returned.
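+        Only the longest match starting at a given position is returned: if
+        both 'Big Apple' and 'Big Apple Pie' are keywords, a sentence
+        containing 'Big Apple Pie' only yields the clean name of the longer
+        keyword (illustrative example).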
+
+        Args:
+            sentence (str): Line of text where we will search for keywords
+            span_info (bool): True to also return the (start, end) boundaries of each extraction
+            max_cost (int): maximum Levenshtein distance to accept when extracting keywords
+
+        Returns:
+            keywords_extracted (list(str)): List of terms/keywords found in the sentence that match our corpus
+
+        Examples:
+            >>> from robotoff.utils.text import KeywordProcessor
+            >>> keyword_processor = KeywordProcessor()
+            >>> keyword_processor.add_keyword('Big Apple', 'New York')
+            >>> keyword_processor.add_keyword('Bay Area')
+            >>> keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
+            >>> keywords_found
+            >>> ['New York', 'Bay Area']
+            >>> keywords_found = keyword_processor.extract_keywords('I love Big Aple and Baay Area.', max_cost=1)
+            >>> keywords_found
+            >>> ['New York', 'Bay Area']
+        """
+        keywords_extracted: list[Union[str, tuple[str, int, int]]] = []
+        if not sentence:
+            # if sentence is empty or None, just return an empty list
+            return keywords_extracted
+
+        index_mapping = get_index_mapping(sentence, self.case_sensitive)
+        get_span_indices = functools.partial(
+            _get_span_indices, index_mapping=index_mapping
+        )
+        if not self.case_sensitive:
+            sentence = sentence.lower()
+        current_dict = self.keyword_trie_dict
+        sequence_start_pos = 0
+        sequence_end_pos = 0
+        reset_current_dict = False
+        idx = 0
+        sentence_len = len(sentence)
+        curr_cost = max_cost
+        while idx < sentence_len:
+            char = sentence[idx]
+            # when we reach a character that might denote a word end
+            if char not in self.non_word_boundaries:
+
+                # if end is present in current_dict
+                if self._keyword in current_dict or char in current_dict:
+                    # update longest sequence found
+                    sequence_found = None
+                    longest_sequence_found = None
+                    is_longer_seq_found = False
+                    if self._keyword in current_dict:
+                        sequence_found = current_dict[self._keyword]
+                        longest_sequence_found = current_dict[self._keyword]
+                        sequence_end_pos = idx
+
+                    # re-look for the longest sequence from this position
+                    if char in current_dict:
+                        current_dict_continued = current_dict[char]
+
+                        idy = idx + 1
+                        while idy < sentence_len:
+                            inner_char = sentence[idy]
+                            if (
+                                inner_char not in self.non_word_boundaries
+                                and self._keyword in current_dict_continued
+                            ):
+                                # update longest sequence found
+                                longest_sequence_found = current_dict_continued[
+                                    self._keyword
+                                ]
+                                sequence_end_pos = idy
+                                is_longer_seq_found = True
+                            if inner_char in current_dict_continued:
+                                current_dict_continued = current_dict_continued[
+                                    inner_char
+                                ]
+                            elif curr_cost > 0:
+                                next_word = self.get_next_word(sentence[idy:])
+                                current_dict_continued, cost, _ = next(
+                                    self.levensthein(
+                                        next_word,
+                                        max_cost=curr_cost,
+                                        start_node=current_dict_continued,
+                                    ),
+                                    ({}, 0, 0),
+                                )  # current_dict_continued defaults to an empty dict, so the next iteration goes to a `break`
+                                curr_cost -= cost
+                                idy += len(next_word) - 1
+                                if not current_dict_continued:
+                                    break
+                            else:
+                                break
+                            idy += 1
+                        else:
+                            # end of sentence reached.
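+                            # (note: this is the `while ... else` construct;
+                            # the block runs only when the loop was not
+                            # exited via `break`)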
+                            if self._keyword in current_dict_continued:
+                                # update longest sequence found
+                                longest_sequence_found = current_dict_continued[
+                                    self._keyword
+                                ]
+                                sequence_end_pos = idy
+                                is_longer_seq_found = True
+                    if is_longer_seq_found:
+                        idx = sequence_end_pos
+                    current_dict = self.keyword_trie_dict
+                    if longest_sequence_found:
+                        keywords_extracted.append(
+                            (  # type: ignore
+                                longest_sequence_found,
+                                *get_span_indices(sequence_start_pos, idx),
+                            )
+                        )
+                        curr_cost = max_cost
+                    reset_current_dict = True
+                else:
+                    # we reset current_dict
+                    current_dict = self.keyword_trie_dict
+                    reset_current_dict = True
+            elif char in current_dict:
+                # we can continue from this char
+                current_dict = current_dict[char]
+            elif curr_cost > 0:
+                next_word = self.get_next_word(sentence[idx:])
+                current_dict, cost, _ = next(
+                    self.levensthein(
+                        next_word, max_cost=curr_cost, start_node=current_dict
+                    ),
+                    (self.keyword_trie_dict, 0, 0),
+                )
+                curr_cost -= cost
+                idx += len(next_word) - 1
+            else:
+                # we reset current_dict
+                current_dict = self.keyword_trie_dict
+                reset_current_dict = True
+                # skip to end of word
+                idy = idx + 1
+                while idy < sentence_len:
+                    char = sentence[idy]
+                    if char not in self.non_word_boundaries:
+                        break
+                    idy += 1
+                idx = idy
+            # if we are at the end of the sentence and have a sequence discovered
+            if idx + 1 >= sentence_len:
+                if self._keyword in current_dict:
+                    sequence_found = current_dict[self._keyword]
+                    keywords_extracted.append(
+                        (
+                            sequence_found,
+                            *get_span_indices(sequence_start_pos, sentence_len),
+                        )
+                    )
+            idx += 1
+            if reset_current_dict:
+                reset_current_dict = False
+                sequence_start_pos = idx
+        if span_info:
+            return keywords_extracted
+        return [value[0] for value in keywords_extracted]
+
+    def get_next_word(self, sentence: str) -> str:
+        """
+        Retrieve the next word in the sentence:
+        iterate over the string until finding the first char not in non_word_boundaries.
+
+        Args:
+            sentence (str): Line of text where we will look for the next word
+
+        Returns:
+            next_word (str): The next word in the sentence
+
+        Examples:
+            >>> from robotoff.utils.text import KeywordProcessor
+            >>> keyword_processor = KeywordProcessor()
+            >>> keyword_processor.add_keyword('Big Apple')
+            >>> keyword_processor.get_next_word('Big Apple')
+            >>> 'Big'
+        """
+        next_word = ""
+        for char in sentence:
+            if char not in self.non_word_boundaries:
+                break
+            next_word += char
+        return next_word
+
+    def levensthein(
+        self, word: str, max_cost: int = 2, start_node: Optional[dict] = None
+    ):
+        """
+        Retrieve the nodes where there is a fuzzy match,
+        via Levenshtein distance, and with respect to max_cost
+
+        Args:
+            word (str): word to find a fuzzy match for
+            max_cost (int): maximum Levenshtein distance when performing the fuzzy match
+            start_node (dict): Trie node from which the search is performed
+
+        Yields:
+            node, cost, depth (tuple): A tuple containing the final node,
+                the cost (i.e. the distance), and the depth in the trie
+
+        Examples:
+            >>> from robotoff.utils.text import KeywordProcessor
+            >>> keyword_processor = KeywordProcessor(case_sensitive=True)
+            >>> keyword_processor.add_keyword('Marie', 'Mary')
+            >>> next(keyword_processor.levensthein('Maria', max_cost=1))
+            >>> ({'_keyword_': 'Mary'}, 1, 5)
+            ...
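+            >>> # in the example above: cost=1 for the single substitution
+            >>> # ('a' vs 'e'), and depth=5 = len('Marie'), the number of
+            >>> # trie levels traversed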
+            >>> keyword_processor = KeywordProcessor(case_sensitive=True)
+            >>> keyword_processor.add_keyword('Marie Blanc', 'Mary')
+            >>> next(keyword_processor.levensthein('Mari', max_cost=1))
+            >>> ({' ': {'B': {'l': {'a': {'n': {'c': {'_keyword_': 'Mary'}}}}}}}, 1, 5)
+        """
+        start_node = start_node or self.keyword_trie_dict
+        rows = range(len(word) + 1)
+
+        for char, node in start_node.items():
+            yield from self._levenshtein_rec(char, node, word, rows, max_cost, depth=1)
+
+    def _levenshtein_rec(self, char, node, word, rows, max_cost, depth=0):
+        n_columns = len(word) + 1
+        new_rows = [rows[0] + 1]
+        cost = 0
+
+        for col in range(1, n_columns):
+            insert_cost = new_rows[col - 1] + 1
+            delete_cost = rows[col] + 1
+            replace_cost = rows[col - 1] + int(word[col - 1] != char)
+            cost = min((insert_cost, delete_cost, replace_cost))
+            new_rows.append(cost)
+
+        stop_crit = isinstance(node, dict) and node.keys() & (
+            self._white_space_chars | {self._keyword}
+        )
+        if new_rows[-1] <= max_cost and stop_crit:
+            yield node, cost, depth
+
+        elif isinstance(node, dict) and min(new_rows) <= max_cost:
+            for new_char, new_node in node.items():
+                yield from self._levenshtein_rec(
+                    new_char, new_node, word, new_rows, max_cost, depth=depth + 1
+                )
+
+
+def _get_span_indices(
+    start_idx: int, end_idx: int, index_mapping: Optional[list[int]] = None
+) -> tuple[int, int]:
+    """Return the span indices (start_index, end_index), taking into account
+    the index shift due to lowercasing. See `get_index_mapping` for further
+    explanations.
+
+    :param start_idx: start index of the match
+    :param end_idx: end index of the match
+    :param index_mapping: optional index mapping, defaults to None
+    :return: a (start_idx, end_idx) tuple, possibly shifted if `index_mapping`
+        is not None
+    """
+    if index_mapping is None:
+        return start_idx, end_idx
+    return index_mapping[start_idx], index_mapping[end_idx - 1] + 1
+
+
+# LATIN CAPITAL LETTER I WITH DOT ABOVE is the only letter that changes length
+# when lowercased: see http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt
+LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = "İ"
+
+
+def get_index_mapping(sentence: str, case_sensitive: bool) -> Optional[list[int]]:
+    """Get the character index mapping (a list of indices of the same length
+    as the lowercased version of `sentence`, or None).
+
+    When lowercasing a string, the string changes length if it contains LATIN
+    CAPITAL LETTER I WITH DOT ABOVE (`İ`): the length of the lowercased
+    version of this letter is 2 (instead of 1).
+    If `case_sensitive=True` or if there is no `İ` in the string, this function
+    returns None: we don't need to account for character index shifts during
+    keyword extraction.
+    Otherwise, we return a list of indices of the same length as the lowercased
+    version of `sentence` that gives, for each position, the character index in
+    the original sentence.
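+    For instance (illustrative): for the sentence "İd", the lowercased form
+    "i̇d" has 3 characters, and the returned mapping is [0, 0, 1]: the first
+    two lowercased characters both map back to the `İ` at index 0.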
+ + :param sentence: the original non-lowercased sentence + :param case_sensitive: whether the keyword extraction is case sensitive + """ + if case_sensitive or LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE not in sentence: + return None + offsets = [] + for idx, char in enumerate(sentence): + if char == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE: + offsets.append(idx) + offsets.append(idx) + return offsets diff --git a/robotoff/utils/fold_to_ascii.py b/robotoff/utils/text/fold_to_ascii.py similarity index 100% rename from robotoff/utils/fold_to_ascii.py rename to robotoff/utils/text/fold_to_ascii.py diff --git a/tests/unit/data/flashtext/keyword_extractor_test_cases.json b/tests/unit/data/flashtext/keyword_extractor_test_cases.json new file mode 100644 index 0000000000..c776e92c94 --- /dev/null +++ b/tests/unit/data/flashtext/keyword_extractor_test_cases.json @@ -0,0 +1 @@ +[{"sentence":"I like python","keyword_dict":{"Python":["python"]},"explanation":"Keyword at the end of the sentence.","keywords":["Python"],"keywords_case_sensitive":["Python"]},{"sentence":"I like python","keyword_dict":{"Pythonizer":["pythonizer"]},"explanation":"Incomplete keyword at the end of the sentence.","keywords":[],"keywords_case_sensitive":[]},{"sentence":"python I like","keyword_dict":{"Python":["python"]},"explanation":"Keyword at the beginning of the sentence.","keywords":["Python"],"keywords_case_sensitive":["Python"]},{"sentence":"I like python also","keyword_dict":{"Python":["python"]},"explanation":"Keyword before the end of the sentence.","keywords":["Python"],"keywords_case_sensitive":["Python"]},{"sentence":"I like python java","keyword_dict":{"Python":["python"],"Java":["java"]},"explanation":"Multiple keywords in the end of the sentence.","keywords":["Python","Java"],"keywords_case_sensitive":["Python","Java"]},{"sentence":"I like python and java","keyword_dict":{"Python":["python"],"Java":["java"]},"explanation":"Multiple keywords in the sentence with other word in between.","keywords":["Python","Java"],"keywords_case_sensitive":["Python","Java"]},{"sentence":"python","keyword_dict":{"Python":["python"]},"explanation":"Single keyword in the sentence.","keywords":["Python"],"keywords_case_sensitive":["Python"]},{"sentence":" python","keyword_dict":{"Python":["python"]},"explanation":"Single keyword in the sentence with space prefix.","keywords":["Python"],"keywords_case_sensitive":["Python"]},{"sentence":"I like r","keyword_dict":{"R":["r"]},"explanation":"Single char keyword at the end of the sentence.","keywords":["R"],"keywords_case_sensitive":["R"]},{"sentence":"r I like","keyword_dict":{"R":["r"]},"explanation":"Single char keyword at the beginning of the sentence.","keywords":["R"],"keywords_case_sensitive":["R"]},{"sentence":"I like R also","keyword_dict":{"R":["r"]},"explanation":"Single char keyword before the end of the sentence.","keywords":["R"],"keywords_case_sensitive":[]},{"sentence":"I like R java","keyword_dict":{"R":["r"],"Java":["java"]},"explanation":"Multiple keywords in the end of the sentence.","keywords":["R","Java"],"keywords_case_sensitive":["Java"]},{"sentence":"I like R and java","keyword_dict":{"R":["R"],"Java":["java"]},"explanation":"Multiple keywords in the sentence with other word in between.","keywords":["R","Java"],"keywords_case_sensitive":["R","Java"]},{"sentence":"R","keyword_dict":{"R":["r"]},"explanation":"Single character keyword in the sentence.","keywords":["R"],"keywords_case_sensitive":[]},{"sentence":" R","keyword_dict":{"R":["R"]},"explanation":"Single character 
keyword in the sentence with space prefix.","keywords":["R"],"keywords_case_sensitive":["R"]},{"sentence":"I like distributed super computing","keyword_dict":{"Distributed Super Computing":["distributed super computing"]},"explanation":"Multi word Keyword at the end of the sentence.","keywords":["Distributed Super Computing"],"keywords_case_sensitive":["Distributed Super Computing"]},{"sentence":"distributed super computing I like","keyword_dict":{"Distributed Super Computing":["distributed super computing"]},"explanation":"Multi word Keyword at the beginning of the sentence.","keywords":["Distributed Super Computing"],"keywords_case_sensitive":["Distributed Super Computing"]},{"sentence":"I like distributed super computing also","keyword_dict":{"Distributed Super Computing":["distributed super computing"]},"explanation":"Multi word Keyword before the end of the sentence.","keywords":["Distributed Super Computing"],"keywords_case_sensitive":["Distributed Super Computing"]},{"sentence":"I like distributed super computing java","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Java":["java"]},"explanation":"Multi word Keyword at the end of the sentence.","keywords":["Distributed Super Computing","Java"],"keywords_case_sensitive":["Distributed Super Computing","Java"]},{"sentence":"I like distributed super computing java programing","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Java":["java programing"]},"explanation":"Multiple Multi word Keyword at the end of the sentence.","keywords":["Distributed Super Computing","Java"],"keywords_case_sensitive":["Distributed Super Computing","Java"]},{"sentence":"I like distributed super computing and java","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Java":["java"]},"explanation":"Multiple keywords in the sentence with other word in between.","keywords":["Distributed Super Computing","Java"],"keywords_case_sensitive":["Distributed Super Computing","Java"]},{"sentence":"distributed super computing","keyword_dict":{"Distributed Super Computing":["distributed super computing"]},"explanation":"Single Multi word Keyword in the sentence.","keywords":["Distributed Super Computing"],"keywords_case_sensitive":["Distributed Super Computing"]},{"sentence":" distributed super computing","keyword_dict":{"Distributed Super Computing":["distributed super computing"]},"explanation":"Single Multi word Keyword in the sentence with space prefix.","keywords":["Distributed Super Computing"],"keywords_case_sensitive":["Distributed Super Computing"]},{"sentence":"distributed super computing distributed super computing","keyword_dict":{"Distributed Super Computing":["distributed super computing"]},"explanation":"Multi word Keyword twice","keywords":["Distributed Super Computing","Distributed Super Computing"],"keywords_case_sensitive":["Distributed Super Computing","Distributed Super Computing"]},{"sentence":"distributed super distributed super computing","keyword_dict":{"Distributed Super Computing":["distributed super computing"]},"explanation":"Multi word Keyword partial then complete.","keywords":["Distributed Super Computing"],"keywords_case_sensitive":["Distributed Super Computing"]},{"sentence":"distributed super distributed super computing java","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Java":["java"]},"explanation":"","keywords":["Distributed Super Computing","Java"],"keywords_case_sensitive":["Distributed Super 
Computing","Java"]},{"sentence":"distributed super distributed super computing institute","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Distributed Super Computing Institute":["distributed super computing institute"]},"explanation":"","keywords":["Distributed Super Computing Institute"],"keywords_case_sensitive":["Distributed Super Computing Institute"]},{"sentence":"distributed super distributed super computing insti","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Distributed Super Computing Institute":["distributed super computing institute"]},"explanation":"","keywords":["Distributed Super Computing"],"keywords_case_sensitive":["Distributed Super Computing"]},{"sentence":"distributed super distributed super computing insti java","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Distributed Super Computing Institute":["distributed super computing institute"],"Java":["java"]},"explanation":"","keywords":["Distributed Super Computing","Java"],"keywords_case_sensitive":["Distributed Super Computing","Java"]},{"sentence":"distributed super distributed super computing institute java","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Distributed Super Computing Institute":["distributed super computing institute"],"Java":["java"]},"explanation":"","keywords":["Distributed Super Computing Institute","Java"],"keywords_case_sensitive":["Distributed Super Computing Institute","Java"]},{"sentence":"distributed super distributed super computing institute and java","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Distributed Super Computing Institute":["distributed super computing institute"],"Java":["java"]},"explanation":"","keywords":["Distributed Super Computing Institute","Java"],"keywords_case_sensitive":["Distributed Super Computing Institute","Java"]},{"sentence":"distributed super distributed super computing insti r","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Distributed Super Computing Institute":["distributed super computing institute"],"R":["r"]},"explanation":"","keywords":["Distributed Super Computing","R"],"keywords_case_sensitive":["Distributed Super Computing","R"]},{"sentence":"distributed super distributed super computing institute r","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Distributed Super Computing Institute":["distributed super computing institute"],"R":["r"]},"explanation":"","keywords":["Distributed Super Computing Institute","R"],"keywords_case_sensitive":["Distributed Super Computing Institute","R"]},{"sentence":"distributed super distributed super computing institute and r","keyword_dict":{"Distributed Super Computing":["distributed super computing"],"Distributed Super Computing Institute":["distributed super computing institute"],"R":["r"]},"explanation":"","keywords":["Distributed Super Computing Institute","R"],"keywords_case_sensitive":["Distributed Super Computing Institute","R"]},{"sentence":"distributed pronoun game","keyword_dict":{"Distributed Programing":["distributed programing"],"Pronoun Game":["pronoun game"]},"explanation":"","keywords":["Pronoun Game"],"keywords_case_sensitive":["Pronoun Game"]},{"sentence":"distributed super computer game","keyword_dict":{"Distributed Super Computer":["distributed super computer"],"Computer Game":["computer game"]},"explanation":"","keywords":["Distributed Super 
Computer"],"keywords_case_sensitive":["Distributed Super Computer"]},{"sentence":"distributed super computer game","keyword_dict":{"Distributed Super Company":["distributed super company"],"Computer Game":["computer game"]},"explanation":"","keywords":["Computer Game"],"keywords_case_sensitive":["Computer Game"]},{"sentence":"distributed super computer game","keyword_dict":{"Distributed Super Company":["distributed super company"],"Super Computer":["super computer"],"Computer Game":["computer game"]},"explanation":"","keywords":["Super Computer"],"keywords_case_sensitive":["Super Computer"]},{"sentence":"distributed super compute game","keyword_dict":{"Distributed Super Company":["distributed super company"],"Super Computer":["super computer"],"Computer Game":["computer game"]},"explanation":"","keywords":[],"keywords_case_sensitive":[]},{"sentence":"computer game development","keyword_dict":{"Computer Game":["computer game"],"Computer Game Development":["computer game development"]},"explanation":"","keywords":["Computer Game Development"],"keywords_case_sensitive":["Computer Game Development"]},{"sentence":"computer game development","keyword_dict":{"Computer Gaming":["computer gaming"],"Computer Game Development":["computer game development"]},"explanation":"","keywords":["Computer Game Development"],"keywords_case_sensitive":["Computer Game Development"]},{"sentence":"I like .net","keyword_dict":{".NET":[".net"]},"explanation":"keyword with special character","keywords":[".NET"],"keywords_case_sensitive":[".NET"]},{"sentence":"I like c++","keyword_dict":{"Cpp":["c++"]},"explanation":"keyword with special character","keywords":["Cpp"],"keywords_case_sensitive":["Cpp"]},{"sentence":"python.","keyword_dict":{"Python":["python."]},"explanation":"Ending with special character","keywords":["Python"],"keywords_case_sensitive":["Python"]},{"sentence":"python ","keyword_dict":{"Python":["python"]},"explanation":"Ending with special character","keywords":["Python"],"keywords_case_sensitive":["Python"]},{"sentence":"i like python programming","keyword_dict":{"Python":["python prog"]},"explanation":"Negative test case","keywords":[],"keywords_case_sensitive":[]},{"sentence":"distributed super distributed super computing institute java","keyword_dict":{"Java":["java"],"Distributed Super Computing Institutes":["distributed super computing institutes"],"Institute":["institute"],"Distributed Super Computing":["distributed super computing"]},"explanation":"Negative test case","keywords":["Distributed Super Computing","Institute","Java"],"keywords_case_sensitive":["Distributed Super Computing","Institute","Java"]},{"sentence":"targets relative to targets of the IRE1/XBP1s and PERK arms of the UPR","keyword_dict":{"IRE1":["IRE1"],"XBP1s":["XBP1s"],"UPR":["upr"]},"explanation":"","keywords":["IRE1","XBP1s","UPR"],"keywords_case_sensitive":["IRE1","XBP1s"]},{"sentence":"spring framework","keyword_dict":{"spring framework":["spring","spring framework"],"framework":["framework"]},"explanation":"","keywords":["spring framework"],"keywords_case_sensitive":["spring framework"]}] \ No newline at end of file diff --git a/tests/unit/data/flashtext/keywords_format_one.txt b/tests/unit/data/flashtext/keywords_format_one.txt new file mode 100644 index 0000000000..17b6a16f4e --- /dev/null +++ b/tests/unit/data/flashtext/keywords_format_one.txt @@ -0,0 +1,4 @@ +java_2e=>java +java programing=>java +product management=>product management +product management techniques=>product management \ No newline at end of file diff 
--git a/tests/unit/data/flashtext/keywords_format_two.txt b/tests/unit/data/flashtext/keywords_format_two.txt new file mode 100644 index 0000000000..352be1d153 --- /dev/null +++ b/tests/unit/data/flashtext/keywords_format_two.txt @@ -0,0 +1,2 @@ +java +product management \ No newline at end of file diff --git a/tests/unit/prediction/ocr/test_location.py b/tests/unit/prediction/ocr/test_location.py index 6af91b1ab0..dec32a6504 100644 --- a/tests/unit/prediction/ocr/test_location.py +++ b/tests/unit/prediction/ocr/test_location.py @@ -1,5 +1,4 @@ import pytest -from flashtext import KeywordProcessor from robotoff import settings from robotoff.prediction.ocr.location import ( @@ -8,6 +7,7 @@ find_locations, load_cities_fr, ) +from robotoff.utils.text import KeywordProcessor module = "robotoff.prediction.ocr.location" diff --git a/tests/unit/utils/text/flashtext/__init__.py b/tests/unit/utils/text/flashtext/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/utils/text/flashtext/test_dictionary_loading.py b/tests/unit/utils/text/flashtext/test_dictionary_loading.py new file mode 100644 index 0000000000..78b165059e --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_dictionary_loading.py @@ -0,0 +1,37 @@ +import logging +import unittest + +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestDictionaryLoad(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + + def tearDown(self): + logger.info("Ending.") + + def test_dictionary_loading(self): + keyword_processor = KeywordProcessor() + keyword_dict = { + "java": ["java_2e", "java programing"], + "product management": [ + "product management techniques", + "product management", + ], + } + keyword_processor.add_keywords_from_dict(keyword_dict) + + sentence = "I know java_2e and product management techniques" + keywords_extracted = keyword_processor.extract_keywords(sentence) + self.assertEqual( + keywords_extracted, + ["java", "product management"], + "Failed file format one test", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_extract_fuzzy.py b/tests/unit/utils/text/flashtext/test_extract_fuzzy.py new file mode 100644 index 0000000000..c3a8fbf8ca --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_extract_fuzzy.py @@ -0,0 +1,186 @@ +import logging +import unittest + +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestExtractFuzzy(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + + def tearDown(self): + logger.info("Ending.") + + def test_extract_deletion(self): + """ + Fuzzy deletion + """ + keyword_proc = KeywordProcessor() + for keyword in (("skype", "messenger"),): + keyword_proc.add_keyword(*keyword) + + sentence = "hello, do you have skpe ?" 
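+        # "skpe" is one edit (a deleted 'y') away from "skype": with
+        # max_cost=1 the fuzzy match still yields the clean name "messenger",
+        # spanning indices 19-23 of the sentence above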
+ extracted_keywords = [("messenger", 19, 23)] + self.assertEqual( + keyword_proc.extract_keywords(sentence, span_info=True, max_cost=1), + extracted_keywords, + ) + + def test_extract_addition(self): + """ + Fuzzy addition + """ + keyword_proc = KeywordProcessor() + for keyword in (("colour here", "couleur ici"), ("and heere", "et ici")): + keyword_proc.add_keyword(*keyword) + + sentence = "color here blabla and here" + + extracted_keywords = [("couleur ici", 0, 10), ("et ici", 18, 26)] + self.assertListEqual( + keyword_proc.extract_keywords(sentence, span_info=True, max_cost=1), + extracted_keywords, + ) + + def test_correct_keyword_on_addition(self): + """ + Test for simple additions using the levensthein function + We ensure we end up on the right node in the trie when starting from the current node + """ + keyword_proc = KeywordProcessor() + for keyword in (("colour here", "couleur ici"), ("and heere", "et ici")): + keyword_proc.add_keyword(*keyword) + + current_dict = keyword_proc.keyword_trie_dict["c"]["o"]["l"]["o"] + closest_node, cost, depth = next( + keyword_proc.levensthein("r", max_cost=1, start_node=current_dict), + ({}, 0, 0), + ) + self.assertDictEqual(closest_node, current_dict["u"]["r"]) + self.assertEqual(cost, 1) + self.assertEqual(depth, 2) + + current_dict_continued = {"e": {"e": {"r": {"e": {"_keyword_": "et ici"}}}}} + closest_node, cost, depth = next( + keyword_proc.levensthein( + "ere", max_cost=1, start_node=current_dict_continued + ), + ({}, 0, 0), + ) + self.assertDictEqual(closest_node, current_dict_continued["e"]["e"]["r"]["e"]) + self.assertEqual(cost, 1) + self.assertEqual(depth, 4) + + def test_correct_keyword_on_deletion(self): + """ + Test for simple deletions using the levensthein function + We ensure we end up on the right node in the trie when starting from the current node + """ + keyword_proc = KeywordProcessor() + keyword_proc.add_keyword("skype") + current_dict = {"y": {"p": {"e": {"_keyword_": "skype"}}}} + + closest_node, cost, depth = next( + keyword_proc.levensthein("pe", max_cost=1, start_node=current_dict), + ({}, 0, 0), + ) + + self.assertDictEqual(closest_node, current_dict["y"]["p"]["e"]) + self.assertEqual(cost, 1) + self.assertEqual(depth, 3) + + def test_correct_keyword_on_substitution(self): + """ + Test for simple substitions using the levensthein function + We ensure we end up on the right node in the trie when starting from the current node + """ + keyword_proc = KeywordProcessor() + for keyword in (("skype", "messenger"),): + keyword_proc.add_keyword(*keyword) + + current_dict = keyword_proc.keyword_trie_dict["s"]["k"] + closest_node, cost, depth = next( + keyword_proc.levensthein("ope", max_cost=1, start_node=current_dict), + ({}, 0, 0), + ) + self.assertDictEqual(closest_node, current_dict["y"]["p"]["e"]) + self.assertEqual(cost, 1) + self.assertEqual(depth, 3) + + def test_extract_cost_spread_over_multiple_words(self): + """ + Here we try to extract a keyword made of different words + the current cost should be decreased by one when encountering 'maade' (1 insertion) + and again by one when encountering 'multple' (1 deletion) + """ + keyword_proc = KeywordProcessor() + keyword_made_of_multiple_words = "made of multiple words" + keyword_proc.add_keyword(keyword_made_of_multiple_words) + sentence = "this sentence contains a keyword maade of multple words" + + extracted_keywords = [(keyword_made_of_multiple_words, 33, 55)] + self.assertEqual( + keyword_proc.extract_keywords(sentence, span_info=True, max_cost=2), + 
extracted_keywords, + ) + + def test_extract_multiple_keywords(self): + keyword_proc = KeywordProcessor() + keyword_proc.add_keyword("first keyword") + keyword_proc.add_keyword("second keyword") + sentence = "starts with a first kyword then add a secand keyword" + extracted_keywords = [ + ("first keyword", 14, 26), + ("second keyword", 38, 52), + ] + self.assertEqual( + keyword_proc.extract_keywords(sentence, span_info=True, max_cost=1), + extracted_keywords, + ) + + def test_intermediate_match(self): + """ + In this test, we have an intermediate fuzzy match with a keyword (the shortest one) + We first check that we extract the longest keyword if the max_cost is big enough + Then we retry with a smaller max_cost, excluding the longest, and check that the shortest is extracted + """ + keyword_proc = KeywordProcessor() + keyword_proc.add_keyword("keyword") + keyword_proc.add_keyword("keyword with many words") + sentence = "This sentence contains a keywrd with many woords" + + shortest_keyword = ("keyword", 25, 31) + longest_keyword = ("keyword with many words", 25, 48) + + self.assertEqual( + keyword_proc.extract_keywords(sentence, span_info=True, max_cost=2), + [longest_keyword], + ) + self.assertEqual( + keyword_proc.extract_keywords(sentence, span_info=True, max_cost=1), + [shortest_keyword], + ) + + def test_intermediate_match_then_no_match(self): + """ + In this test, we have an intermediate fuzzy match with a keyword (the shortest one) + We check that we get only the shortest keyword when going further into fuzzy match is too + expansive to get the longest keyword. We also extract a classic match later in the string, + to check that the inner data structures all have a correct state + """ + keyword_proc = KeywordProcessor() + keyword_proc.add_keyword("keyword") + keyword_proc.add_keyword("keyword with many words") + sentence = "This sentence contains a keywrd with many items inside, a keyword at the end" + + keywords = [("keyword", 25, 31), ("keyword", 58, 65)] + self.assertEqual( + keyword_proc.extract_keywords(sentence, span_info=True, max_cost=2), + keywords, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_extractor.py b/tests/unit/utils/text/flashtext/test_extractor.py new file mode 100644 index 0000000000..f8645e45e9 --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_extractor.py @@ -0,0 +1,83 @@ +import json +import logging +import unittest + +from robotoff import settings +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestKeywordExtractor(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + with open( + settings.TEST_DATA_DIR / "flashtext/keyword_extractor_test_cases.json" + ) as f: + self.test_cases = json.load(f) + + def tearDown(self): + logger.info("Ending.") + + def test_extract_keywords(self): + """For each of the test case initialize a new KeywordProcessor. + Add the keywords the test case to KeywordProcessor. + Extract keywords and check if they match the expected result for the test case. 
+ + """ + for test_id, test_case in enumerate(self.test_cases): + keyword_processor = KeywordProcessor() + keyword_processor.add_keywords_from_dict(test_case["keyword_dict"]) + keywords_extracted = keyword_processor.extract_keywords( + test_case["sentence"] + ) + self.assertEqual( + keywords_extracted, + test_case["keywords"], + "keywords_extracted don't match the expected results for test case: {}".format( + test_id + ), + ) + + def test_extract_keywords_case_sensitive(self): + """For each of the test case initialize a new KeywordProcessor. + Add the keywords the test case to KeywordProcessor. + Extract keywords and check if they match the expected result for the test case. + + """ + for test_id, test_case in enumerate(self.test_cases): + keyword_processor = KeywordProcessor(case_sensitive=True) + keyword_processor.add_keywords_from_dict(test_case["keyword_dict"]) + keywords_extracted = keyword_processor.extract_keywords( + test_case["sentence"] + ) + self.assertEqual( + keywords_extracted, + test_case["keywords_case_sensitive"], + "keywords_extracted don't match the expected results for test case: {}".format( + test_id + ), + ) + + def test_extract_keywords_case_insensitive_with_string_length_change(self): + sentence = "Word İngredients LTD İmages nutriments i̇ngredients PROTEİNS" + keyword_processor = KeywordProcessor(case_sensitive=False) + keyword_processor.add_keyword("İngredients", "ingredients") + keyword_processor.add_keyword("nutriments", "nutriments") + keyword_processor.add_keyword("PROTEİNS", "proteins") + extracted_keywords = keyword_processor.extract_keywords( + sentence, span_info=True + ) + self.assertEqual( + extracted_keywords, + [ + ("ingredients", 5, 16), + ("nutriments", 28, 38), + ("ingredients", 39, 51), + ("proteins", 52, 60), + ], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_file_load.py b/tests/unit/utils/text/flashtext/test_file_load.py new file mode 100644 index 0000000000..6544a1a763 --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_file_load.py @@ -0,0 +1,45 @@ +import logging +import unittest + +from robotoff import settings +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestFileLoad(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + + def tearDown(self): + logger.info("Ending.") + + def test_file_format_one(self): + keyword_processor = KeywordProcessor() + keyword_processor.add_keyword_from_file( + settings.TEST_DATA_DIR / "flashtext/keywords_format_one.txt" + ) + sentence = "I know java_2e and product management techniques" + keywords_extracted = keyword_processor.extract_keywords(sentence) + self.assertEqual( + keywords_extracted, + ["java", "product management"], + "Failed file format one test", + ) + + def test_file_format_two(self): + keyword_processor = KeywordProcessor() + keyword_processor.add_keyword_from_file( + settings.TEST_DATA_DIR / "flashtext/keywords_format_two.txt" + ) + sentence = "I know java and product management" + keywords_extracted = keyword_processor.extract_keywords(sentence) + self.assertEqual( + keywords_extracted, + ["java", "product management"], + "Failed file format one test", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_kp_exceptions.py b/tests/unit/utils/text/flashtext/test_kp_exceptions.py new file mode 100644 index 0000000000..48a994c63a --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_kp_exceptions.py @@ -0,0 +1,66 @@ 
+import logging +import unittest + +import pytest + +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestKPExceptions(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + + def tearDown(self): + logger.info("Ending.") + + def test_iterator_NotImplementedError(self): + keyword_processor = KeywordProcessor() + keyword_processor.add_keyword("j2ee", "Java") + keyword_processor.add_keyword("colour", "color") + keyword_processor.get_all_keywords() + with pytest.raises(NotImplementedError): + for _ in keyword_processor: + pass + + def test_add_keyword_file_missing(self): + keyword_processor = KeywordProcessor() + with pytest.raises(IOError): + keyword_processor.add_keyword_from_file("missing_file") + + def test_add_keyword_from_list(self): + keyword_processor = KeywordProcessor() + keyword_list = "java" + with pytest.raises(AttributeError): + keyword_processor.add_keywords_from_list(keyword_list) + + def test_add_keyword_from_dictionary(self): + keyword_processor = KeywordProcessor() + keyword_dict = {"java": "java_2e", "product management": "product manager"} + with pytest.raises(AttributeError): + keyword_processor.add_keywords_from_dict(keyword_dict) + + def test_remove_keyword_from_list(self): + keyword_processor = KeywordProcessor() + keyword_list = "java" + with pytest.raises(AttributeError): + keyword_processor.remove_keywords_from_list(keyword_list) + + def test_remove_keyword_from_dictionary(self): + keyword_processor = KeywordProcessor() + keyword_dict = {"java": "java_2e", "product management": "product manager"} + with pytest.raises(AttributeError): + keyword_processor.remove_keywords_from_dict(keyword_dict) + + def test_empty_string(self): + keyword_processor = KeywordProcessor() + self.assertEqual( + keyword_processor.extract_keywords(""), + [], + "new_sentence don't match the expected result", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_kp_extract_span.py b/tests/unit/utils/text/flashtext/test_kp_extract_span.py new file mode 100644 index 0000000000..b7d3e5572b --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_kp_extract_span.py @@ -0,0 +1,70 @@ +import json +import logging +import unittest + +from robotoff import settings +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestKPExtractorSpan(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + with open( + settings.TEST_DATA_DIR / "flashtext/keyword_extractor_test_cases.json" + ) as f: + self.test_cases = json.load(f) + + def tearDown(self): + logger.info("Ending.") + + def test_extract_keywords(self): + """For each of the test case initialize a new KeywordProcessor. + Add the keywords the test case to KeywordProcessor. + Extract keywords and check if they match the expected result for the test case. 
+
+        """
+        for test_id, test_case in enumerate(self.test_cases):
+            keyword_processor = KeywordProcessor()
+            for key in test_case["keyword_dict"]:
+                keyword_processor.add_keywords_from_list(test_case["keyword_dict"][key])
+            keywords_extracted = keyword_processor.extract_keywords(
+                test_case["sentence"], span_info=True
+            )
+            for kwd in keywords_extracted:
+                # the returned keyword, lowercased, should match the span from the sentence
+                self.assertEqual(
+                    kwd[0].lower(),
+                    test_case["sentence"].lower()[kwd[1] : kwd[2]],
+                    "keyword spans don't match the expected results for test case: {}".format(
+                        test_id
+                    ),
+                )
+
+    def test_extract_keywords_case_sensitive(self):
+        """For each test case, initialize a new KeywordProcessor.
+        Add the keywords from the test case to the KeywordProcessor.
+        Extract keywords and check if they match the expected result for the test case.
+
+        """
+        for test_id, test_case in enumerate(self.test_cases):
+            keyword_processor = KeywordProcessor(case_sensitive=True)
+            for key in test_case["keyword_dict"]:
+                keyword_processor.add_keywords_from_list(test_case["keyword_dict"][key])
+            keywords_extracted = keyword_processor.extract_keywords(
+                test_case["sentence"], span_info=True
+            )
+            for kwd in keywords_extracted:
+                # the returned keyword should match the span from the sentence
+                self.assertEqual(
+                    kwd[0],
+                    test_case["sentence"][kwd[1] : kwd[2]],
+                    "keyword spans don't match the expected results for test case: {}".format(
+                        test_id
+                    ),
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/utils/text/flashtext/test_kp_get_all_keywords.py b/tests/unit/utils/text/flashtext/test_kp_get_all_keywords.py
new file mode 100644
index 0000000000..ea8eb78599
--- /dev/null
+++ b/tests/unit/utils/text/flashtext/test_kp_get_all_keywords.py
@@ -0,0 +1,29 @@
+import logging
+import unittest
+
+from robotoff.utils.text import KeywordProcessor
+
+logger = logging.getLogger(__name__)
+
+
+class TestKPGetAllKeywords(unittest.TestCase):
+    def setUp(self):
+        logger.info("Starting...")
+
+    def tearDown(self):
+        logger.info("Ending.")
+
+    def test_get_all_keywords(self):
+        keyword_processor = KeywordProcessor()
+        keyword_processor.add_keyword("j2ee", "Java")
+        keyword_processor.add_keyword("colour", "color")
+        keyword_processor.get_all_keywords()
+        self.assertEqual(
+            keyword_processor.get_all_keywords(),
+            {"colour": "color", "j2ee": "Java"},
+            "get_all_keywords didn't match expected results.",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/utils/text/flashtext/test_kp_len.py b/tests/unit/utils/text/flashtext/test_kp_len.py
new file mode 100644
index 0000000000..b4b8939405
--- /dev/null
+++ b/tests/unit/utils/text/flashtext/test_kp_len.py
@@ -0,0 +1,62 @@
+import json
+import logging
+import sys
+import unittest
+from collections import defaultdict
+
+from robotoff import settings
+from robotoff.utils.text import KeywordProcessor
+
+logger = logging.getLogger(__name__)
+logger.level = logging.DEBUG
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class TestKPLen(unittest.TestCase):
+    def setUp(self):
+        logger.info("Starting...")
+        with open(
+            settings.TEST_DATA_DIR / "flashtext/keyword_remover_test_cases.json"
+        ) as f:
+            self.test_cases = json.load(f)
+
+    def tearDown(self):
+        logger.info("Ending.")
+
+    def test_remove_keywords_dictionary_len(self):
+        """For each test case, initialize a new KeywordProcessor.
+        Add the keywords from the test case to the KeywordProcessor.
+ Remove the keywords in remove_keyword_dict + Extract keywords and check if they match the expected result for the test case. + """ + for test_id, test_case in enumerate(self.test_cases): + keyword_processor = KeywordProcessor() + keyword_processor.add_keywords_from_dict(test_case["keyword_dict"]) + keyword_processor.remove_keywords_from_dict( + test_case["remove_keyword_dict"] + ) + + kp_len = len(keyword_processor) + + new_dictionary = defaultdict(list) + for key, values in test_case["keyword_dict"].items(): + for value in values: + if not ( + key in test_case["remove_keyword_dict"] + and value in test_case["remove_keyword_dict"][key] + ): + new_dictionary[key].append(value) + + keyword_processor_two = KeywordProcessor() + keyword_processor_two.add_keywords_from_dict(new_dictionary) + kp_len_two = len(keyword_processor_two) + self.assertEqual( + kp_len, + kp_len_two, + "keyword processor length doesn't match for Text ID {}".format(test_id), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_kp_next_word.py b/tests/unit/utils/text/flashtext/test_kp_next_word.py new file mode 100644 index 0000000000..5a1d1ce648 --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_kp_next_word.py @@ -0,0 +1,27 @@ +import logging +import unittest + +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestKPNextWord(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + + def tearDown(self): + logger.info("Ending.") + + def test_next_word(self): + """ + Test for next word extraction + """ + keyword_proc = KeywordProcessor() + self.assertEqual(keyword_proc.get_next_word(""), "") + self.assertEqual(keyword_proc.get_next_word("random sentence"), "random") + self.assertEqual(keyword_proc.get_next_word(" random sentence"), "") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_kp_term_in_kp.py b/tests/unit/utils/text/flashtext/test_kp_term_in_kp.py new file mode 100644 index 0000000000..49d03acbfb --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_kp_term_in_kp.py @@ -0,0 +1,72 @@ +import logging +import unittest + +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestKPDictionaryLikeFeatures(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + + def tearDown(self): + logger.info("Ending.") + + def test_term_in_dictionary(self): + keyword_processor = KeywordProcessor() + keyword_processor.add_keyword("j2ee", "Java") + keyword_processor.add_keyword("colour", "color") + keyword_processor.get_keyword("j2ee") + self.assertEqual( + keyword_processor.get_keyword("j2ee"), + "Java", + "get_keyword didn't return expected Keyword", + ) + self.assertEqual( + keyword_processor["colour"], + "color", + "get_keyword didn't return expected Keyword", + ) + self.assertEqual( + keyword_processor["Test"], + None, + "get_keyword didn't return expected Keyword", + ) + self.assertTrue( + "colour" in keyword_processor, "get_keyword didn't return expected Keyword" + ) + self.assertFalse( + "Test" in keyword_processor, "get_keyword didn't return expected Keyword" + ) + + def test_term_in_dictionary_case_sensitive(self): + keyword_processor = KeywordProcessor(case_sensitive=True) + keyword_processor.add_keyword("j2ee", "Java") + keyword_processor.add_keyword("colour", "color") + keyword_processor.get_keyword("j2ee") + self.assertEqual( + keyword_processor.get_keyword("j2ee"), + "Java", + "get_keyword 
didn't return expected Keyword", + ) + self.assertEqual( + keyword_processor["colour"], + "color", + "get_keyword didn't return expected Keyword", + ) + self.assertEqual( + keyword_processor["J2ee"], + None, + "get_keyword didn't return expected Keyword", + ) + self.assertTrue( + "colour" in keyword_processor, "get_keyword didn't return expected Keyword" + ) + self.assertFalse( + "Colour" in keyword_processor, "get_keyword didn't return expected Keyword" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_loading_keyword_list.py b/tests/unit/utils/text/flashtext/test_loading_keyword_list.py new file mode 100644 index 0000000000..ea226d3051 --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_loading_keyword_list.py @@ -0,0 +1,30 @@ +import logging +import unittest + +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestListLoad(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + + def tearDown(self): + logger.info("Ending.") + + def test_list_loading(self): + keyword_processor = KeywordProcessor() + keyword_list = ["java", "product management"] + keyword_processor.add_keywords_from_list(keyword_list) + sentence = "I know java and product management" + keywords_extracted = keyword_processor.extract_keywords(sentence) + self.assertEqual( + keywords_extracted, + ["java", "product management"], + "Failed file format one test", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/text/flashtext/test_remove_keywords.py b/tests/unit/utils/text/flashtext/test_remove_keywords.py new file mode 100644 index 0000000000..ba6c1623d2 --- /dev/null +++ b/tests/unit/utils/text/flashtext/test_remove_keywords.py @@ -0,0 +1,105 @@ +import json +import logging +import unittest +from collections import defaultdict + +from robotoff import settings +from robotoff.utils.text import KeywordProcessor + +logger = logging.getLogger(__name__) + + +class TestKeywordRemover(unittest.TestCase): + def setUp(self): + logger.info("Starting...") + with open( + settings.TEST_DATA_DIR / "flashtext/keyword_remover_test_cases.json" + ) as f: + self.test_cases = json.load(f) + + def tearDown(self): + logger.info("Ending.") + + def test_remove_keywords(self): + """For each of the test case initialize a new KeywordProcessor. + Add the keywords the test case to KeywordProcessor. + Remove the keywords in remove_keyword_dict + Extract keywords and check if they match the expected result for the test case. + """ + for test_id, test_case in enumerate(self.test_cases): + keyword_processor = KeywordProcessor() + keyword_processor.add_keywords_from_dict(test_case["keyword_dict"]) + keyword_processor.remove_keywords_from_dict( + test_case["remove_keyword_dict"] + ) + keywords_extracted = keyword_processor.extract_keywords( + test_case["sentence"] + ) + self.assertEqual( + keywords_extracted, + test_case["keywords"], + "keywords_extracted don't match the expected results for test case: {}".format( + test_id + ), + ) + + def test_remove_keywords_using_list(self): + """For each of the test case initialize a new KeywordProcessor. + Add the keywords the test case to KeywordProcessor. + Remove the keywords in remove_keyword_dict + Extract keywords and check if they match the expected result for the test case. 
+ """ + for test_id, test_case in enumerate(self.test_cases): + keyword_processor = KeywordProcessor() + keyword_processor.add_keywords_from_dict(test_case["keyword_dict"]) + for key in test_case["remove_keyword_dict"]: + keyword_processor.remove_keywords_from_list( + test_case["remove_keyword_dict"][key] + ) + keywords_extracted = keyword_processor.extract_keywords( + test_case["sentence"] + ) + self.assertEqual( + keywords_extracted, + test_case["keywords"], + "keywords_extracted don't match the expected results for test case: {}".format( + test_id + ), + ) + + def test_remove_keywords_dictionary_compare(self): + """For each of the test case initialize a new KeywordProcessor. + Add the keywords the test case to KeywordProcessor. + Remove the keywords in remove_keyword_dict + Extract keywords and check if they match the expected result for the test case. + """ + for test_id, test_case in enumerate(self.test_cases): + keyword_processor = KeywordProcessor() + keyword_processor.add_keywords_from_dict(test_case["keyword_dict"]) + keyword_processor.remove_keywords_from_dict( + test_case["remove_keyword_dict"] + ) + keyword_trie_dict = keyword_processor.keyword_trie_dict + + new_dictionary = defaultdict(list) + for key, values in test_case["keyword_dict"].items(): + for value in values: + if not ( + key in test_case["remove_keyword_dict"] + and value in test_case["remove_keyword_dict"][key] + ): + new_dictionary[key].append(value) + + keyword_processor_two = KeywordProcessor() + keyword_processor_two.add_keywords_from_dict(new_dictionary) + keyword_trie_dict_two = keyword_processor_two.keyword_trie_dict + self.assertTrue( + keyword_trie_dict == keyword_trie_dict_two, + "keywords_extracted don't match the expected results for test case: {}".format( + test_id + ), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/utils/test_text.py b/tests/unit/utils/text/test_text.py similarity index 100% rename from tests/unit/utils/test_text.py rename to tests/unit/utils/text/test_text.py