From 1e419640ef977e12b6489adc0cb4d13b875fbedd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 31 May 2023 07:51:01 -0400 Subject: [PATCH 01/52] Draft files --- services/worker/poetry.lock | 101 +++++++++++++++++- services/worker/pyproject.toml | 2 + .../job_runners/split/index_elasticsearch.py | 20 ++++ .../worker/job_runners/split/index_parquet.py | 52 +++++++++ .../worker/job_runners/split/read_index.py | 22 ++++ 5 files changed, 194 insertions(+), 3 deletions(-) create mode 100644 services/worker/src/worker/job_runners/split/index_elasticsearch.py create mode 100644 services/worker/src/worker/job_runners/split/index_parquet.py create mode 100644 services/worker/src/worker/job_runners/split/read_index.py diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index 3e747ec224..9563703a02 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -986,6 +986,101 @@ files = [ dnssec = ["ecdsa (>=0.13)", "pycryptodome"] idna = ["idna (>=2.1)"] +[[package]] +name = "duckdb" +version = "0.8.0" +description = "DuckDB embedded database" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6455aee00af30770c20f4a8c5e4347918cf59b578f49ee996a13807b12911871"}, + {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8cf0622ae7f86d4ce72791f8928af4357a46824aadf1b6879c7936b3db65344"}, + {file = "duckdb-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6132e8183ca3ae08a593e43c97cb189794077dedd48546e27ce43bd6a51a9c33"}, + {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe29e5343fa2a95f2cde4519a4f4533f4fd551a48d2d9a8ab5220d40ebf53610"}, + {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:945165987ca87c097dc0e578dcf47a100cad77e1c29f5dd8443d53ce159dc22e"}, + {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:673c60daf7ada1d9a8518286a6893ec45efabb64602954af5f3d98f42912fda6"}, + {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5075fe1ff97ae62331ca5c61e3597e6e9f7682a6fdd418c23ba5c4873ed5cd1"}, + {file = "duckdb-0.8.0-cp310-cp310-win32.whl", hash = "sha256:001f5102f45d3d67f389fa8520046c8f55a99e2c6d43b8e68b38ea93261c5395"}, + {file = "duckdb-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb00800f2e1e865584b13221e0121fce9341bb3a39a93e569d563eaed281f528"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2707096d6df4321044fcde2c9f04da632d11a8be60957fd09d49a42fae71a29"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b27df1b70ae74d2c88efb5ffca8490954fdc678099509a9c4404ca30acc53426"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75a97c800271b52dd0f37696d074c50576dcb4b2750b6115932a98696a268070"}, + {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:804cac261a5e016506a6d67838a65d19b06a237f7949f1704f0e800eb708286a"}, + {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6b9abca7fa6713e1d031c18485343b4de99742c7e1b85c10718aa2f31a4e2c6"}, + {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:51aa6d606d49072abcfeb3be209eb559ac94c1b5e70f58ac3adbb94aca9cd69f"}, + {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7c8dc769aaf2be0a1c57995ca657e5b92c1c56fc8437edb720ca6cab571adf14"}, + {file = 
"duckdb-0.8.0-cp311-cp311-win32.whl", hash = "sha256:c4207d18b42387c4a035846d8878eb967070198be8ac26fd77797ce320d1a400"}, + {file = "duckdb-0.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:0c392257547c20794c3072fcbca99a49ef0a49974005d755e93893e2b4875267"}, + {file = "duckdb-0.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2832379e122020814dbe869af7b9ddf3c9f21474cf345531145b099c63ffe17e"}, + {file = "duckdb-0.8.0-cp36-cp36m-win32.whl", hash = "sha256:914896526f7caba86b170f2c4f17f11fd06540325deeb0000cb4fb24ec732966"}, + {file = "duckdb-0.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:022ebda86d0e3204cdc206e4af45aa9f0ae0668b34c2c68cf88e08355af4a372"}, + {file = "duckdb-0.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:96a31c0f3f4ccbf0f5b18f94319f37691205d82f80aae48c6fe04860d743eb2c"}, + {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a07c73c6e6a8cf4ce1a634625e0d1b17e5b817242a8a530d26ed84508dfbdc26"}, + {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424acbd6e857531b06448d757d7c2557938dbddbff0632092090efbf413b4699"}, + {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c83cfd2a868f1acb0692b9c3fd5ef1d7da8faa1348c6eabf421fbf5d8c2f3eb8"}, + {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5c6f6b2d8db56936f662c649539df81856b5a8cb769a31f9544edf18af2a11ff"}, + {file = "duckdb-0.8.0-cp37-cp37m-win32.whl", hash = "sha256:0bd6376b40a512172eaf4aa816813b1b9d68994292ca436ce626ccd5f77f8184"}, + {file = "duckdb-0.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:931221885bcf1e7dfce2400f11fd048a7beef566b775f1453bb1db89b828e810"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:42e7853d963d68e72403ea208bcf806b0f28c7b44db0aa85ce49bb124d56c133"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcc338399175be3d43366576600aef7d72e82114d415992a7a95aded98a0f3fd"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03dd08a4624d6b581a59f9f9dbfd34902416398d16795ad19f92361cf21fd9b5"}, + {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c7c24ea0c9d8563dbd5ad49ccb54b7a9a3c7b8c2833d35e5d32a08549cacea5"}, + {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb58f6505cc0f34b4e976154302d26563d2e5d16b206758daaa04b65e55d9dd8"}, + {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef37ac7880100c4b3f913c8483a29a13f8289313b9a07df019fadfa8e7427544"}, + {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2a4f5ee913ca8a6a069c78f8944b9934ffdbc71fd935f9576fdcea2a6f476f1"}, + {file = "duckdb-0.8.0-cp38-cp38-win32.whl", hash = "sha256:73831c6d7aefcb5f4072cd677b9efebecbf6c578946d21710791e10a1fc41b9a"}, + {file = "duckdb-0.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:faa36d2854734364d234f37d7ef4f3d763b73cd6b0f799cbc2a0e3b7e2575450"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50a31ec237ed619e50f9ab79eb0ec5111eb9697d4475da6e0ab22c08495ce26b"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:351abb4cc2d229d043920c4bc2a4c29ca31a79fef7d7ef8f6011cf4331f297bf"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:568550a163aca6a787bef8313e358590254de3f4019025a8d68c3a61253fedc1"}, + {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:2b82617f0e7f9fc080eda217090d82b42d4fad083bc9f6d58dfda9cecb7e3b29"}, + {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01c9be34d272532b75e8faedda0ff77fa76d1034cde60b8f5768ae85680d6d3"}, + {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8549d6a6bf5f00c012b6916f605416226507e733a3ffc57451682afd6e674d1b"}, + {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d145c6d51e55743c3ed1a74cffa109d9e72f82b07e203b436cfa453c925313a"}, + {file = "duckdb-0.8.0-cp39-cp39-win32.whl", hash = "sha256:f8610dfd21e90d7b04e8598b244bf3ad68599fd6ba0daad3428c03cbfd74dced"}, + {file = "duckdb-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:d0f0f104d30418808bafbe9bccdcd238588a07bd246b3cff13842d60bfd8e8ba"}, + {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, +] + +[[package]] +name = "elastic-transport" +version = "8.4.0" +description = "Transport classes and utilities shared among Python Elastic client libraries" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, + {file = "elastic_transport-8.4.0-py3-none-any.whl", hash = "sha256:19db271ab79c9f70f8c43f8f5b5111408781a6176b54ab2e54d713b6d9ceb815"}, +] + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.26.2,<2" + +[package.extras] +develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "trustme"] + +[[package]] +name = "elasticsearch" +version = "8.8.0" +description = "Python client for Elasticsearch" +category = "main" +optional = false +python-versions = ">=3.6, <4" +files = [ + {file = "elasticsearch-8.8.0-py3-none-any.whl", hash = "sha256:2223ee9daaa3c80c25b28ec3f7c48e66fce6b767a338333d9a81886046a07df6"}, + {file = "elasticsearch-8.8.0.tar.gz", hash = "sha256:6878313cd598c7c90079fed1d4be72e198da35cba57f4083e6bee91f9c70b0eb"}, +] + +[package.dependencies] +elastic-transport = ">=8,<9" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +requests = ["requests (>=2.4.0,<3.0.0)"] + [[package]] name = "environs" version = "9.5.0" @@ -1740,7 +1835,6 @@ optional = false python-versions = "*" files = [ {file = "libclang-15.0.6.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:8621795e07b87e17fc7aac9f071bc7fe6b52ed6110c0a96a9975d8113c8c2527"}, - {file = "libclang-15.0.6.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:0bf192c48a8d2992fc5034393ddc99e772ac30e105df84927d62fc88ef8a659f"}, {file = "libclang-15.0.6.1-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:69b01a23ab543908a661532595daa23cf88bd96d80e41f58ba0eaa6a378fe0d8"}, {file = "libclang-15.0.6.1-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:4a5188184b937132c198ee9de9a8a2316d5fdd1a825398d5ad1a8f5e06f9b40e"}, {file = "libclang-15.0.6.1-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:f7ffa02ac5e586cfffde039dcccc439d88d0feac7d77bf9426d9ba7543d16545"}, @@ -4343,7 +4437,6 @@ files = [ {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, - {file = 
"soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, @@ -4642,6 +4735,8 @@ python-versions = ">=3.8" files = [ {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:db464c88e10e927725997f9b872a21c9d057789d3b7e9a26e4ef1af41d0bcc8c"}, {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:172277c33cb1ae0da19f98c5bcd4946149cfa73c8ea05c6ba18365d58dd3c6f2"}, + {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:9c9b14fbb73ec4cb0f209722a1489020fd8614c92ae22589f2309c48cefdf21f"}, + {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6a54539bd076746f69ae8bef7282f981674fe4dbf59c3a84c4af86ae6bae9d5c"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e3fa53e63672fd71998bbd71cc5478c74dbe5a2d9291d1801c575358c28403c2"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:5499312c21ed3ed47cc6b4cf861896e9564c2c32d8d3c2ef1437c5ca31adfc73"}, {file = "tensorflow_macos-2.12.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:84cb873c90be63efabfecca53fdc48b734a037d0750532b55cb7ce7c343b5cac"}, @@ -5551,4 +5646,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "2a3dd73c87ace648b1ae56a4b2139c6f658a095b4cb24f1d8bf96a5c5f748903" +content-hash = "64b8f4a53abc1ae8c6bed9553eff820bab0171629c43a35d2b61acb3985f7920" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 0e9ff2ef5d..0a14102caf 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -47,6 +47,8 @@ typer = "^0.4.2" wget = "^3.2" mirakuru = "^2.4.2" pytest-asyncio = "^0.21.0" +duckdb = "0.8.0" +elasticsearch = "^8.8.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" diff --git a/services/worker/src/worker/job_runners/split/index_elasticsearch.py b/services/worker/src/worker/job_runners/split/index_elasticsearch.py new file mode 100644 index 0000000000..d788c5993a --- /dev/null +++ b/services/worker/src/worker/job_runners/split/index_elasticsearch.py @@ -0,0 +1,20 @@ +from datasets import load_dataset +from elasticsearch import Elasticsearch +from datetime import datetime + +duorc = load_dataset("LLMs/Alpaca-ShareGPT", split="train") +es = Elasticsearch("http://localhost:9200") +start_time = datetime.now() + +for i, row in enumerate(duorc): + doc = { + "config": "LLMs--Alpaca-ShareGPT", + "split": "train", + "index": i, + "row": row, + } + + es.index(index="LLMs--Alpaca-ShareGPT".lower(), id=i, document=doc) + print(f"indexed row {i}") +end_time = datetime.now() +print(f"Duration: {end_time - start_time}") \ No newline at end of file diff --git a/services/worker/src/worker/job_runners/split/index_parquet.py b/services/worker/src/worker/job_runners/split/index_parquet.py new file mode 100644 index 0000000000..16b32e2bb6 --- /dev/null +++ b/services/worker/src/worker/job_runners/split/index_parquet.py @@ -0,0 +1,52 @@ + +from typing import List +import duckdb 
+import pandas as pd +import requests +from datetime import datetime + +DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" +PARQUET_REVISION="refs/convert/parquet" + +EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT" + +con = duckdb.connect('datasets-server.db') + +def get_parquet_urls(dataset: str) -> List[str]: + splits = requests.get(f"{DATASETS_SERVER_ENDPOINT}/splits?dataset={dataset}", timeout=60).json().get("splits") + split = splits[0] + response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={split['config']}", timeout=60) + if response.status_code != 200: + raise Exception(response) + + response = response.json() + parquet_files = response["parquet_files"] + urls = [content["url"] for content in parquet_files if content["split"] == split["split"]] + if len(urls) == 0: + raise Exception("No parquet files found for dataset") + return urls + +def import_data(): + start_time = datetime.now() + + duckdb.execute("INSTALL 'httpfs';") + duckdb.execute("LOAD 'httpfs';") + duckdb.execute("INSTALL 'fts';") + duckdb.execute("LOAD 'fts';") + # duckdb.sql("select * from duckdb_extensions();").show() + + # Import data + index + parquet_url = get_parquet_urls(EXAMPLE_DATASET_NAME)[0] + print("parquet_url", parquet_url) + con.sql("CREATE SEQUENCE serial START 1;") + # We need a sequence id column for Full text search + # I'm very rusty in SQL so it's very possible there are simpler ways. + + con.sql(f"CREATE TABLE data AS SELECT nextval('serial') AS id, * FROM '{parquet_url}';") + con.sql("PRAGMA create_fts_index('data', 'id', '*');") + + con.sql("DESCRIBE SELECT * FROM data").show() + end_time = datetime.now() + print(f"Duration: {end_time - start_time}") + +import_data() \ No newline at end of file diff --git a/services/worker/src/worker/job_runners/split/read_index.py b/services/worker/src/worker/job_runners/split/read_index.py new file mode 100644 index 0000000000..d4a2e49a54 --- /dev/null +++ b/services/worker/src/worker/job_runners/split/read_index.py @@ -0,0 +1,22 @@ +import duckdb +import pandas as pd + +DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" +PARQUET_REVISION="refs/convert/parquet" + +EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT" + +con = duckdb.connect('datasets-server.db') + +def run_command(query: str) -> pd.DataFrame: + try: + result = con.execute("SELECT fts_main_data.match_bm25(id, ?) 
AS score, id, instruction, input, output FROM data WHERE score IS NOT NULL ORDER BY score DESC;", [query]) + print("Ok") + except Exception as error: + print(f"Error: {str(error)}") + return pd.DataFrame({"Error": [f"❌ {str(error)}"]}) + print(result) + return result.df() + +result = run_command("Jonny Walker") +print(result) \ No newline at end of file From f37a829bba707e862edf8ccbb8ac7956dd357517 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 08:11:34 -0400 Subject: [PATCH 02/52] Adding duckdb index job runner --- libs/libcommon/src/libcommon/config.py | 24 ++++ libs/libcommon/src/libcommon/constants.py | 2 + libs/libcommon/src/libcommon/exceptions.py | 7 + libs/libcommon/src/libcommon/storage.py | 15 ++ .../src/libcommon/viewer_utils/index_utils.py | 20 +++ services/api/poetry.lock | 60 +++++++- services/api/pyproject.toml | 1 + services/worker/src/worker/config.py | 3 + .../worker/src/worker/job_runner_factory.py | 12 +- .../worker/job_runners/split/duckdb_index.py | 129 ++++++++++++++++++ .../job_runners/split/index_elasticsearch.py | 20 --- .../worker/job_runners/split/index_parquet.py | 52 ------- .../worker/job_runners/split/read_index.py | 22 --- services/worker/src/worker/main.py | 4 +- .../worker/src/worker/start_worker_loop.py | 4 +- services/worker/src/worker/utils.py | 4 + 16 files changed, 280 insertions(+), 99 deletions(-) create mode 100644 libs/libcommon/src/libcommon/viewer_utils/index_utils.py create mode 100644 services/worker/src/worker/job_runners/split/duckdb_index.py delete mode 100644 services/worker/src/worker/job_runners/split/index_elasticsearch.py delete mode 100644 services/worker/src/worker/job_runners/split/index_parquet.py delete mode 100644 services/worker/src/worker/job_runners/split/read_index.py diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 716a82ce11..a5de43748b 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -29,6 +29,7 @@ PROCESSING_STEP_SPLIT_IMAGE_URL_COLUMNS_VERSION, PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_COUNT_VERSION, PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_SCAN_VERSION, + PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, ) from libcommon.processing_graph import ProcessingGraphSpecification @@ -104,6 +105,22 @@ def from_env(cls) -> "ParquetMetadataConfig": ) +DUCKDB_INDEX_STORAGE_DIRECTORY = None + + +@dataclass(frozen=True) +class DuckDbIndexConfig: + storage_directory: Optional[str] = DUCKDB_INDEX_STORAGE_DIRECTORY + + @classmethod + def from_env(cls) -> "ParquetMetadataConfig": + env = Env(expand_vars=True) + with env.prefixed("DUCKDB_INDEX_"): + return cls( + storage_directory=env.str(name="STORAGE_DIRECTORY", default=DUCKDB_INDEX_STORAGE_DIRECTORY), + ) + + COMMON_HF_ENDPOINT = "https://huggingface.co" COMMON_HF_TOKEN = None @@ -320,6 +337,13 @@ class ProcessingGraphConfig: "triggered_by": ["dataset-config-names", "config-opt-in-out-urls-count"], "job_runner_version": PROCESSING_STEP_DATASET_OPT_IN_OUT_URLS_COUNT_VERSION, }, + "split-duckdb-index": { + "input_type": "split", + "triggered_by": [ + "split-first-rows-from-streaming", "split-first-rows-from-parquet", "config-parquet", + ], + "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, + } } ) diff --git a/libs/libcommon/src/libcommon/constants.py b/libs/libcommon/src/libcommon/constants.py index 0a3a549420..a9dcbf92f7 100644 --- a/libs/libcommon/src/libcommon/constants.py +++ b/libs/libcommon/src/libcommon/constants.py @@ -6,6 +6,7 @@ CACHE_MONGOENGINE_ALIAS 
= "cache" CACHED_ASSETS_CACHE_APPNAME = "datasets_server_cached_assets" PARQUET_METADATA_CACHE_APPNAME = "datasets_server_parquet_metadata" +DUCKDB_INDEX_CACHE_APPNAME="datasets_server_duckdb_index" METRICS_COLLECTION_CACHE_TOTAL_METRIC = "cacheTotalMetric" METRICS_COLLECTION_JOB_TOTAL_METRIC = "jobTotalMetric" METRICS_MONGOENGINE_ALIAS = "metrics" @@ -35,6 +36,7 @@ PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_COUNT_VERSION = 2 PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_SCAN_VERSION = 4 PROCESSING_STEP_SPLIT_IMAGE_URL_COLUMNS_VERSION = 1 +PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION = 1 PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = 100 PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = 100 diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 0f97f5c699..06721e9e47 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -505,3 +505,10 @@ class UnsupportedExternalFilesError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, True) + + +class NoIndexableColumnsError(CacheableError): + """Raised when split does not have string columns to index.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) diff --git a/libs/libcommon/src/libcommon/storage.py b/libs/libcommon/src/libcommon/storage.py index bbef1442be..46f402df18 100644 --- a/libs/libcommon/src/libcommon/storage.py +++ b/libs/libcommon/src/libcommon/storage.py @@ -13,6 +13,7 @@ ASSETS_CACHE_APPNAME, CACHED_ASSETS_CACHE_APPNAME, PARQUET_METADATA_CACHE_APPNAME, + DUCKDB_INDEX_CACHE_APPNAME, ) StrPath = Union[str, PathLike[str]] @@ -81,6 +82,20 @@ def init_parquet_metadata_dir(directory: Optional[StrPath] = None) -> StrPath: return init_dir(directory, appname=PARQUET_METADATA_CACHE_APPNAME) +def init_duckdb_index_dir(directory: Optional[StrPath] = None) -> StrPath: + """Initialize the duckdb index directory. + + If directory is None, it will be set to the default duckdb index location on the machine. + + Args: + directory (Optional[Union[str, PathLike[str]]], optional): The directory to initialize. Defaults to None. + + Returns: + Union[str, PathLike[str]]: The directory. + """ + return init_dir(directory, appname=DUCKDB_INDEX_CACHE_APPNAME) + + def exists(path: StrPath) -> bool: """Check if a path exists. diff --git a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py new file mode 100644 index 0000000000..22b346a2c9 --- /dev/null +++ b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2022 The HuggingFace Authors. 
+ +from os import makedirs +from pathlib import Path +from libcommon.storage import StrPath +from typing import Tuple + +DATASET_SEPARATOR = "--" +INDEX_DIR_MODE = 0o755 + + +def create_index_dir_split( + dataset: str, config: str, split: str, index_directory: StrPath +) -> Tuple[str, str]: + split_path = dataset / DATASET_SEPARATOR / config / split + dir_path = Path(index_directory).resolve() / split_path + makedirs(dir_path, INDEX_DIR_MODE, exist_ok=True) + return split_path, dir_path + diff --git a/services/api/poetry.lock b/services/api/poetry.lock index a906e4f0b3..9c64231e25 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -711,6 +711,63 @@ files = [ dnssec = ["ecdsa (>=0.13)", "pycryptodome"] idna = ["idna (>=2.1)"] +[[package]] +name = "duckdb" +version = "0.8.0" +description = "DuckDB embedded database" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6455aee00af30770c20f4a8c5e4347918cf59b578f49ee996a13807b12911871"}, + {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8cf0622ae7f86d4ce72791f8928af4357a46824aadf1b6879c7936b3db65344"}, + {file = "duckdb-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6132e8183ca3ae08a593e43c97cb189794077dedd48546e27ce43bd6a51a9c33"}, + {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe29e5343fa2a95f2cde4519a4f4533f4fd551a48d2d9a8ab5220d40ebf53610"}, + {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:945165987ca87c097dc0e578dcf47a100cad77e1c29f5dd8443d53ce159dc22e"}, + {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:673c60daf7ada1d9a8518286a6893ec45efabb64602954af5f3d98f42912fda6"}, + {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5075fe1ff97ae62331ca5c61e3597e6e9f7682a6fdd418c23ba5c4873ed5cd1"}, + {file = "duckdb-0.8.0-cp310-cp310-win32.whl", hash = "sha256:001f5102f45d3d67f389fa8520046c8f55a99e2c6d43b8e68b38ea93261c5395"}, + {file = "duckdb-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb00800f2e1e865584b13221e0121fce9341bb3a39a93e569d563eaed281f528"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2707096d6df4321044fcde2c9f04da632d11a8be60957fd09d49a42fae71a29"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b27df1b70ae74d2c88efb5ffca8490954fdc678099509a9c4404ca30acc53426"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75a97c800271b52dd0f37696d074c50576dcb4b2750b6115932a98696a268070"}, + {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:804cac261a5e016506a6d67838a65d19b06a237f7949f1704f0e800eb708286a"}, + {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6b9abca7fa6713e1d031c18485343b4de99742c7e1b85c10718aa2f31a4e2c6"}, + {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:51aa6d606d49072abcfeb3be209eb559ac94c1b5e70f58ac3adbb94aca9cd69f"}, + {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7c8dc769aaf2be0a1c57995ca657e5b92c1c56fc8437edb720ca6cab571adf14"}, + {file = "duckdb-0.8.0-cp311-cp311-win32.whl", hash = "sha256:c4207d18b42387c4a035846d8878eb967070198be8ac26fd77797ce320d1a400"}, + {file = "duckdb-0.8.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:0c392257547c20794c3072fcbca99a49ef0a49974005d755e93893e2b4875267"}, + {file = "duckdb-0.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2832379e122020814dbe869af7b9ddf3c9f21474cf345531145b099c63ffe17e"}, + {file = "duckdb-0.8.0-cp36-cp36m-win32.whl", hash = "sha256:914896526f7caba86b170f2c4f17f11fd06540325deeb0000cb4fb24ec732966"}, + {file = "duckdb-0.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:022ebda86d0e3204cdc206e4af45aa9f0ae0668b34c2c68cf88e08355af4a372"}, + {file = "duckdb-0.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:96a31c0f3f4ccbf0f5b18f94319f37691205d82f80aae48c6fe04860d743eb2c"}, + {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a07c73c6e6a8cf4ce1a634625e0d1b17e5b817242a8a530d26ed84508dfbdc26"}, + {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424acbd6e857531b06448d757d7c2557938dbddbff0632092090efbf413b4699"}, + {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c83cfd2a868f1acb0692b9c3fd5ef1d7da8faa1348c6eabf421fbf5d8c2f3eb8"}, + {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5c6f6b2d8db56936f662c649539df81856b5a8cb769a31f9544edf18af2a11ff"}, + {file = "duckdb-0.8.0-cp37-cp37m-win32.whl", hash = "sha256:0bd6376b40a512172eaf4aa816813b1b9d68994292ca436ce626ccd5f77f8184"}, + {file = "duckdb-0.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:931221885bcf1e7dfce2400f11fd048a7beef566b775f1453bb1db89b828e810"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:42e7853d963d68e72403ea208bcf806b0f28c7b44db0aa85ce49bb124d56c133"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcc338399175be3d43366576600aef7d72e82114d415992a7a95aded98a0f3fd"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03dd08a4624d6b581a59f9f9dbfd34902416398d16795ad19f92361cf21fd9b5"}, + {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c7c24ea0c9d8563dbd5ad49ccb54b7a9a3c7b8c2833d35e5d32a08549cacea5"}, + {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb58f6505cc0f34b4e976154302d26563d2e5d16b206758daaa04b65e55d9dd8"}, + {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef37ac7880100c4b3f913c8483a29a13f8289313b9a07df019fadfa8e7427544"}, + {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2a4f5ee913ca8a6a069c78f8944b9934ffdbc71fd935f9576fdcea2a6f476f1"}, + {file = "duckdb-0.8.0-cp38-cp38-win32.whl", hash = "sha256:73831c6d7aefcb5f4072cd677b9efebecbf6c578946d21710791e10a1fc41b9a"}, + {file = "duckdb-0.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:faa36d2854734364d234f37d7ef4f3d763b73cd6b0f799cbc2a0e3b7e2575450"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50a31ec237ed619e50f9ab79eb0ec5111eb9697d4475da6e0ab22c08495ce26b"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:351abb4cc2d229d043920c4bc2a4c29ca31a79fef7d7ef8f6011cf4331f297bf"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:568550a163aca6a787bef8313e358590254de3f4019025a8d68c3a61253fedc1"}, + {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b82617f0e7f9fc080eda217090d82b42d4fad083bc9f6d58dfda9cecb7e3b29"}, + {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d01c9be34d272532b75e8faedda0ff77fa76d1034cde60b8f5768ae85680d6d3"}, + {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8549d6a6bf5f00c012b6916f605416226507e733a3ffc57451682afd6e674d1b"}, + {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d145c6d51e55743c3ed1a74cffa109d9e72f82b07e203b436cfa453c925313a"}, + {file = "duckdb-0.8.0-cp39-cp39-win32.whl", hash = "sha256:f8610dfd21e90d7b04e8598b244bf3ad68599fd6ba0daad3428c03cbfd74dced"}, + {file = "duckdb-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:d0f0f104d30418808bafbe9bccdcd238588a07bd246b3cff13842d60bfd8e8ba"}, + {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, +] + [[package]] name = "environs" version = "9.5.0" @@ -2880,7 +2937,6 @@ files = [ {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, - {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, @@ -3441,4 +3497,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "4e76b1586360769e88d2439840cbbd3cb91c8b1087d4b17b0e4246d465cc163c" +content-hash = "1cbdff67ee9555ae24c1f162b595c50a5fa9fa2e37c2d3784728b01ebdb5a278" diff --git a/services/api/pyproject.toml b/services/api/pyproject.toml index 16a6fd6f3d..a16ca6c02f 100644 --- a/services/api/pyproject.toml +++ b/services/api/pyproject.toml @@ -19,6 +19,7 @@ starlette = "^0.27.0" starlette-prometheus = "^0.9.0" uvicorn = "^0.20.0" watchdog = { extras = ["watchmedo"], version = "^2.2.1" } +duckdb = "^0.8.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" diff --git a/services/worker/src/worker/config.py b/services/worker/src/worker/config.py index b2c2ecf99c..cc27936f41 100644 --- a/services/worker/src/worker/config.py +++ b/services/worker/src/worker/config.py @@ -13,6 +13,7 @@ ParquetMetadataConfig, ProcessingGraphConfig, QueueConfig, + DuckDbIndexConfig, ) WORKER_CONTENT_MAX_BYTES = 10_000_000 @@ -232,6 +233,7 @@ class AppConfig: worker: WorkerConfig = field(default_factory=WorkerConfig) urls_scan: OptInOutUrlsScanConfig = field(default_factory=OptInOutUrlsScanConfig) parquet_metadata: ParquetMetadataConfig = field(default_factory=ParquetMetadataConfig) + duckdb_index: DuckDbIndexConfig = field(default_factory=DuckDbIndexConfig) @classmethod def from_env(cls) -> "AppConfig": @@ -249,4 +251,5 @@ def from_env(cls) -> "AppConfig": worker=WorkerConfig.from_env(), urls_scan=OptInOutUrlsScanConfig.from_env(), parquet_metadata=ParquetMetadataConfig.from_env(), + duckdb_index=DuckDbIndexConfig.from_env(), ) diff --git a/services/worker/src/worker/job_runner_factory.py 
b/services/worker/src/worker/job_runner_factory.py index 0e4bfe8259..82b071b7e5 100644 --- a/services/worker/src/worker/job_runner_factory.py +++ b/services/worker/src/worker/job_runner_factory.py @@ -47,7 +47,7 @@ from worker.job_runners.split.opt_in_out_urls_scan_from_streaming import ( SplitOptInOutUrlsScanJobRunner, ) - +from worker.job_runners.split.duckdb_index import SplitDuckDbIndexJobRunner class BaseJobRunnerFactory(ABC): """ @@ -73,6 +73,7 @@ class JobRunnerFactory(BaseJobRunnerFactory): hf_datasets_cache: Path assets_directory: StrPath parquet_metadata_directory: StrPath + duckdb_index_directory: StrPath def _create_job_runner(self, job_info: JobInfo) -> JobRunner: job_type = job_info["type"] @@ -213,6 +214,14 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner: processing_step=processing_step, ) + if job_type == SplitDuckDbIndexJobRunner.get_job_type(): + return SplitDuckDbIndexJobRunner( + job_info=job_info, + app_config=self.app_config, + processing_step=processing_step, + duckdb_index_directory=self.duckdb_index_directory, + ) + supported_job_types = [ DatasetConfigNamesJobRunner.get_job_type(), ConfigSplitNamesFromStreamingJobRunner.get_job_type(), @@ -232,5 +241,6 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner: SplitOptInOutUrlsCountJobRunner.get_job_type(), ConfigOptInOutUrlsCountJobRunner.get_job_type(), DatasetOptInOutUrlsCountJobRunner.get_job_type(), + SplitDuckDbIndexJobRunner.get_job_type(), ] raise ValueError(f"Unsupported job type: '{job_type}'. The supported job types are: {supported_job_types}") diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py new file mode 100644 index 0000000000..45586c3ff1 --- /dev/null +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. 
+ +import logging + +import duckdb +from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION +from libcommon.exceptions import ParquetResponseEmptyError, PreviousStepFormatError, NoIndexableColumnsError +from libcommon.processing_graph import ProcessingStep +from libcommon.storage import StrPath +from libcommon.utils import JobInfo +from libcommon.viewer_utils.index_utils import create_index_dir_split + +from worker.config import AppConfig +from worker.job_runners.split.split_job_runner import SplitJobRunner +from worker.utils import ( + CompleteJobResult, + IndexRowsResponse, + get_previous_step_or_raise, +) + +STRING_FEATURE_DTYPE = "string" +VALUE_FEATURE_TYPE = "Value" +DUCKDB_DEFAULT_DB_NAME = "index.db" + +def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: + logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") + + # get the first rows from previous job + upstream_response = get_previous_step_or_raise( + kinds=["split-first-rows-from-streaming", "split-first-rows-from-parquet"], + dataset=dataset, + config=config, + split=split, + ) + try: + first_rows = upstream_response.response["content"] + features = first_rows["features"] + except KeyError as e: + raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e + + # look for string columns using the first rows + string_columns = [ + feature["name"] + for feature in features + if "dtype" in feature["type"] + and "_type" in feature["type"] + and feature["type"]["dtype"] == STRING_FEATURE_DTYPE + and feature["type"]["_type"] == VALUE_FEATURE_TYPE + ] + + if not string_columns: + raise NoIndexableColumnsError("No string columns available to index.") + + # get list of parquet urls + config_parquet = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) + try: + parquet_files = config_parquet.response["content"]["parquet_files"] + parquet_urls = [content["url"] for content in parquet_files if content["split"] == split] + + if not parquet_urls: + raise ParquetResponseEmptyError("No parquet files found.") + except Exception as e: + raise PreviousStepFormatError("Previous step did not return the expected content.") from e + + # create duckdb index location + # TODO: Need to manage re index, maybe delete folder/file or perform a table drop/delete? + split_path, dir_path = create_index_dir_split( + dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory + ) + duck_db_name = split_path / DUCKDB_DEFAULT_DB_NAME + db_location = dir_path / DUCKDB_DEFAULT_DB_NAME + + # configure duckdb extensions + duckdb.execute("INSTALL 'httpfs';") + duckdb.execute("LOAD 'httpfs';") + duckdb.execute("INSTALL 'fts';") + duckdb.execute("LOAD 'fts';") + logging.info(str(db_location)) + + # index + con = duckdb.connect(str(db_location)) + con.sql("CREATE SEQUENCE serial START 1;") + # TODO: We need a sequence id column for Full text search, maybe there is a better way + filter_columns = ",".join(string_columns) # TODO: What if already exists an id? 
need to create an identity column + con.sql( + f"CREATE TABLE data AS SELECT nextval('serial') AS id, {filter_columns} FROM read_parquet({parquet_urls});" + ) + con.sql("PRAGMA create_fts_index('data', 'id', '*');") + + return IndexRowsResponse( + duckdb_db_name=str(duck_db_name) + ) + + +class SplitDuckDbIndexJobRunner(SplitJobRunner): + duckdb_index_directory: StrPath + + def __init__( + self, + job_info: JobInfo, + app_config: AppConfig, + processing_step: ProcessingStep, + duckdb_index_directory: StrPath, + ) -> None: + super().__init__( + job_info=job_info, + app_config=app_config, + processing_step=processing_step, + ) + self.duckdb_index_directory = duckdb_index_directory + + @staticmethod + def get_job_type() -> str: + return "split-duckdb-index" + + @staticmethod + def get_job_runner_version() -> int: + return PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION + + def compute(self) -> CompleteJobResult: + return CompleteJobResult( + compute_index_rows( + dataset=self.dataset, + config=self.config, + split=self.split, + assets_directory=self.assets_directory, + ) + ) diff --git a/services/worker/src/worker/job_runners/split/index_elasticsearch.py b/services/worker/src/worker/job_runners/split/index_elasticsearch.py deleted file mode 100644 index d788c5993a..0000000000 --- a/services/worker/src/worker/job_runners/split/index_elasticsearch.py +++ /dev/null @@ -1,20 +0,0 @@ -from datasets import load_dataset -from elasticsearch import Elasticsearch -from datetime import datetime - -duorc = load_dataset("LLMs/Alpaca-ShareGPT", split="train") -es = Elasticsearch("http://localhost:9200") -start_time = datetime.now() - -for i, row in enumerate(duorc): - doc = { - "config": "LLMs--Alpaca-ShareGPT", - "split": "train", - "index": i, - "row": row, - } - - es.index(index="LLMs--Alpaca-ShareGPT".lower(), id=i, document=doc) - print(f"indexed row {i}") -end_time = datetime.now() -print(f"Duration: {end_time - start_time}") \ No newline at end of file diff --git a/services/worker/src/worker/job_runners/split/index_parquet.py b/services/worker/src/worker/job_runners/split/index_parquet.py deleted file mode 100644 index 16b32e2bb6..0000000000 --- a/services/worker/src/worker/job_runners/split/index_parquet.py +++ /dev/null @@ -1,52 +0,0 @@ - -from typing import List -import duckdb -import pandas as pd -import requests -from datetime import datetime - -DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" -PARQUET_REVISION="refs/convert/parquet" - -EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT" - -con = duckdb.connect('datasets-server.db') - -def get_parquet_urls(dataset: str) -> List[str]: - splits = requests.get(f"{DATASETS_SERVER_ENDPOINT}/splits?dataset={dataset}", timeout=60).json().get("splits") - split = splits[0] - response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={split['config']}", timeout=60) - if response.status_code != 200: - raise Exception(response) - - response = response.json() - parquet_files = response["parquet_files"] - urls = [content["url"] for content in parquet_files if content["split"] == split["split"]] - if len(urls) == 0: - raise Exception("No parquet files found for dataset") - return urls - -def import_data(): - start_time = datetime.now() - - duckdb.execute("INSTALL 'httpfs';") - duckdb.execute("LOAD 'httpfs';") - duckdb.execute("INSTALL 'fts';") - duckdb.execute("LOAD 'fts';") - # duckdb.sql("select * from duckdb_extensions();").show() - - # Import data + index - parquet_url = get_parquet_urls(EXAMPLE_DATASET_NAME)[0] - 
print("parquet_url", parquet_url) - con.sql("CREATE SEQUENCE serial START 1;") - # We need a sequence id column for Full text search - # I'm very rusty in SQL so it's very possible there are simpler ways. - - con.sql(f"CREATE TABLE data AS SELECT nextval('serial') AS id, * FROM '{parquet_url}';") - con.sql("PRAGMA create_fts_index('data', 'id', '*');") - - con.sql("DESCRIBE SELECT * FROM data").show() - end_time = datetime.now() - print(f"Duration: {end_time - start_time}") - -import_data() \ No newline at end of file diff --git a/services/worker/src/worker/job_runners/split/read_index.py b/services/worker/src/worker/job_runners/split/read_index.py deleted file mode 100644 index d4a2e49a54..0000000000 --- a/services/worker/src/worker/job_runners/split/read_index.py +++ /dev/null @@ -1,22 +0,0 @@ -import duckdb -import pandas as pd - -DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" -PARQUET_REVISION="refs/convert/parquet" - -EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT" - -con = duckdb.connect('datasets-server.db') - -def run_command(query: str) -> pd.DataFrame: - try: - result = con.execute("SELECT fts_main_data.match_bm25(id, ?) AS score, id, instruction, input, output FROM data WHERE score IS NOT NULL ORDER BY score DESC;", [query]) - print("Ok") - except Exception as error: - print(f"Error: {str(error)}") - return pd.DataFrame({"Error": [f"❌ {str(error)}"]}) - print(result) - return result.df() - -result = run_command("Jonny Walker") -print(result) \ No newline at end of file diff --git a/services/worker/src/worker/main.py b/services/worker/src/worker/main.py index da297ccc67..31d6686956 100644 --- a/services/worker/src/worker/main.py +++ b/services/worker/src/worker/main.py @@ -6,7 +6,7 @@ from libcommon.log import init_logging from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource -from libcommon.storage import init_assets_dir, init_parquet_metadata_dir +from libcommon.storage import init_assets_dir, init_parquet_metadata_dir, init_duckdb_index_dir from worker.config import AppConfig from worker.executor import WorkerExecutor @@ -27,6 +27,7 @@ # ^ set first to have logs as soon as possible assets_directory = init_assets_dir(directory=app_config.assets.storage_directory) parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory) + duckdb_index_directory = init_duckdb_index_dir(directory=app_config.duckdb_index.storage_directory) processing_graph = ProcessingGraph(app_config.processing_graph.specification) @@ -54,6 +55,7 @@ hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, + duckdb_index_directory=duckdb_index_directory, ) worker_executor = WorkerExecutor( app_config=app_config, diff --git a/services/worker/src/worker/start_worker_loop.py b/services/worker/src/worker/start_worker_loop.py index 92be5d69bd..3e66ea1ed5 100644 --- a/services/worker/src/worker/start_worker_loop.py +++ b/services/worker/src/worker/start_worker_loop.py @@ -6,7 +6,7 @@ from libcommon.log import init_logging from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource -from libcommon.storage import init_assets_dir, init_parquet_metadata_dir +from libcommon.storage import init_assets_dir, init_parquet_metadata_dir, init_duckdb_index_dir from worker.config import AppConfig from worker.job_runner_factory import 
JobRunnerFactory @@ -26,6 +26,7 @@ # ^ set first to have logs as soon as possible assets_directory = init_assets_dir(directory=app_config.assets.storage_directory) parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory) + duckdb_index_directory = init_duckdb_index_dir(directory=app_config.duckdb_index.storage_directory) processing_graph = ProcessingGraph(app_config.processing_graph.specification) @@ -53,6 +54,7 @@ hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, + duckdb_index_directory=duckdb_index_directory, ) loop = Loop( library_cache_paths=libraries_resource.storage_paths, diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 17ccd75b33..69aaba10c9 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -132,6 +132,10 @@ class ImageUrlColumnsResponse(TypedDict): columns: List[str] +class IndexRowsResponse(TypedDict): + duckdb_db_name: str + + Row = Mapping[str, Any] From 340d85edf017c38cce0033cce09ff36b18a309eb Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 08:21:01 -0400 Subject: [PATCH 03/52] Fix style --- libs/libcommon/src/libcommon/config.py | 10 ++++++---- libs/libcommon/src/libcommon/constants.py | 2 +- libs/libcommon/src/libcommon/exceptions.py | 1 + libs/libcommon/src/libcommon/storage.py | 2 +- .../src/libcommon/viewer_utils/index_utils.py | 10 ++++------ services/worker/src/worker/config.py | 2 +- services/worker/src/worker/job_runner_factory.py | 3 ++- .../src/worker/job_runners/split/duckdb_index.py | 16 +++++++++------- services/worker/src/worker/main.py | 6 +++++- services/worker/src/worker/start_worker_loop.py | 6 +++++- services/worker/tests/conftest.py | 12 +++++++++++- services/worker/tests/test_executor.py | 2 ++ services/worker/tests/test_job_runner_factory.py | 2 ++ 13 files changed, 50 insertions(+), 24 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index a5de43748b..47031b0ab4 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -24,12 +24,12 @@ PROCESSING_STEP_DATASET_PARQUET_VERSION, PROCESSING_STEP_DATASET_SIZE_VERSION, PROCESSING_STEP_DATASET_SPLIT_NAMES_VERSION, + PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION, PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION, PROCESSING_STEP_SPLIT_IMAGE_URL_COLUMNS_VERSION, PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_COUNT_VERSION, PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_SCAN_VERSION, - PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, ) from libcommon.processing_graph import ProcessingGraphSpecification @@ -113,7 +113,7 @@ class DuckDbIndexConfig: storage_directory: Optional[str] = DUCKDB_INDEX_STORAGE_DIRECTORY @classmethod - def from_env(cls) -> "ParquetMetadataConfig": + def from_env(cls) -> "DuckDbIndexConfig": env = Env(expand_vars=True) with env.prefixed("DUCKDB_INDEX_"): return cls( @@ -340,10 +340,12 @@ class ProcessingGraphConfig: "split-duckdb-index": { "input_type": "split", "triggered_by": [ - "split-first-rows-from-streaming", "split-first-rows-from-parquet", "config-parquet", + "split-first-rows-from-streaming", + "split-first-rows-from-parquet", + "config-parquet", ], "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, - } + }, } ) diff --git a/libs/libcommon/src/libcommon/constants.py 
b/libs/libcommon/src/libcommon/constants.py index a9dcbf92f7..26f089f970 100644 --- a/libs/libcommon/src/libcommon/constants.py +++ b/libs/libcommon/src/libcommon/constants.py @@ -6,7 +6,7 @@ CACHE_MONGOENGINE_ALIAS = "cache" CACHED_ASSETS_CACHE_APPNAME = "datasets_server_cached_assets" PARQUET_METADATA_CACHE_APPNAME = "datasets_server_parquet_metadata" -DUCKDB_INDEX_CACHE_APPNAME="datasets_server_duckdb_index" +DUCKDB_INDEX_CACHE_APPNAME = "datasets_server_duckdb_index" METRICS_COLLECTION_CACHE_TOTAL_METRIC = "cacheTotalMetric" METRICS_COLLECTION_JOB_TOTAL_METRIC = "jobTotalMetric" METRICS_MONGOENGINE_ALIAS = "metrics" diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 06721e9e47..d9cead1daa 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -100,6 +100,7 @@ def as_response(self) -> ErrorResponse: "JobManagerCrashedError", "JobManagerExceededMaximumDurationError", "MissingSpawningTokenError", + "NoIndexableColumnsError", "NormalRowsError", "ParameterMissingError", "ParquetResponseEmptyError", diff --git a/libs/libcommon/src/libcommon/storage.py b/libs/libcommon/src/libcommon/storage.py index 46f402df18..5a8230107e 100644 --- a/libs/libcommon/src/libcommon/storage.py +++ b/libs/libcommon/src/libcommon/storage.py @@ -12,8 +12,8 @@ from libcommon.constants import ( ASSETS_CACHE_APPNAME, CACHED_ASSETS_CACHE_APPNAME, - PARQUET_METADATA_CACHE_APPNAME, DUCKDB_INDEX_CACHE_APPNAME, + PARQUET_METADATA_CACHE_APPNAME, ) StrPath = Union[str, PathLike[str]] diff --git a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py index 22b346a2c9..d00b4754cc 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py +++ b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py @@ -3,18 +3,16 @@ from os import makedirs from pathlib import Path -from libcommon.storage import StrPath from typing import Tuple +from libcommon.storage import StrPath + DATASET_SEPARATOR = "--" INDEX_DIR_MODE = 0o755 -def create_index_dir_split( - dataset: str, config: str, split: str, index_directory: StrPath -) -> Tuple[str, str]: - split_path = dataset / DATASET_SEPARATOR / config / split +def create_index_dir_split(dataset: str, config: str, split: str, index_directory: StrPath) -> Tuple[str, Path]: + split_path = f"{dataset}/{DATASET_SEPARATOR}/{config}/{split}" dir_path = Path(index_directory).resolve() / split_path makedirs(dir_path, INDEX_DIR_MODE, exist_ok=True) return split_path, dir_path - diff --git a/services/worker/src/worker/config.py b/services/worker/src/worker/config.py index cc27936f41..7a0697d2e2 100644 --- a/services/worker/src/worker/config.py +++ b/services/worker/src/worker/config.py @@ -9,11 +9,11 @@ AssetsConfig, CacheConfig, CommonConfig, + DuckDbIndexConfig, LogConfig, ParquetMetadataConfig, ProcessingGraphConfig, QueueConfig, - DuckDbIndexConfig, ) WORKER_CONTENT_MAX_BYTES = 10_000_000 diff --git a/services/worker/src/worker/job_runner_factory.py b/services/worker/src/worker/job_runner_factory.py index 82b071b7e5..bd73c19c46 100644 --- a/services/worker/src/worker/job_runner_factory.py +++ b/services/worker/src/worker/job_runner_factory.py @@ -34,6 +34,7 @@ from worker.job_runners.dataset.parquet import DatasetParquetJobRunner from worker.job_runners.dataset.size import DatasetSizeJobRunner from worker.job_runners.dataset.split_names import DatasetSplitNamesJobRunner +from worker.job_runners.split.duckdb_index import 
SplitDuckDbIndexJobRunner from worker.job_runners.split.first_rows_from_parquet import ( SplitFirstRowsFromParquetJobRunner, ) @@ -47,7 +48,7 @@ from worker.job_runners.split.opt_in_out_urls_scan_from_streaming import ( SplitOptInOutUrlsScanJobRunner, ) -from worker.job_runners.split.duckdb_index import SplitDuckDbIndexJobRunner + class BaseJobRunnerFactory(ABC): """ diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 45586c3ff1..19f0b0e368 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -5,7 +5,11 @@ import duckdb from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION -from libcommon.exceptions import ParquetResponseEmptyError, PreviousStepFormatError, NoIndexableColumnsError +from libcommon.exceptions import ( + NoIndexableColumnsError, + ParquetResponseEmptyError, + PreviousStepFormatError, +) from libcommon.processing_graph import ProcessingStep from libcommon.storage import StrPath from libcommon.utils import JobInfo @@ -23,6 +27,7 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_DB_NAME = "index.db" + def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") @@ -68,7 +73,7 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc split_path, dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) - duck_db_name = split_path / DUCKDB_DEFAULT_DB_NAME + duck_db_name = f"{split_path}/{DUCKDB_DEFAULT_DB_NAME}" db_location = dir_path / DUCKDB_DEFAULT_DB_NAME # configure duckdb extensions @@ -76,7 +81,6 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc duckdb.execute("LOAD 'httpfs';") duckdb.execute("INSTALL 'fts';") duckdb.execute("LOAD 'fts';") - logging.info(str(db_location)) # index con = duckdb.connect(str(db_location)) @@ -88,9 +92,7 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc ) con.sql("PRAGMA create_fts_index('data', 'id', '*');") - return IndexRowsResponse( - duckdb_db_name=str(duck_db_name) - ) + return IndexRowsResponse(duckdb_db_name=duck_db_name) class SplitDuckDbIndexJobRunner(SplitJobRunner): @@ -124,6 +126,6 @@ def compute(self) -> CompleteJobResult: dataset=self.dataset, config=self.config, split=self.split, - assets_directory=self.assets_directory, + duckdb_index_directory=self.duckdb_index_directory, ) ) diff --git a/services/worker/src/worker/main.py b/services/worker/src/worker/main.py index 31d6686956..5a866aa74f 100644 --- a/services/worker/src/worker/main.py +++ b/services/worker/src/worker/main.py @@ -6,7 +6,11 @@ from libcommon.log import init_logging from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource -from libcommon.storage import init_assets_dir, init_parquet_metadata_dir, init_duckdb_index_dir +from libcommon.storage import ( + init_assets_dir, + init_duckdb_index_dir, + init_parquet_metadata_dir, +) from worker.config import AppConfig from worker.executor import WorkerExecutor diff --git a/services/worker/src/worker/start_worker_loop.py b/services/worker/src/worker/start_worker_loop.py index 3e66ea1ed5..039f69811a 100644 --- a/services/worker/src/worker/start_worker_loop.py +++ 
b/services/worker/src/worker/start_worker_loop.py @@ -6,7 +6,11 @@ from libcommon.log import init_logging from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource -from libcommon.storage import init_assets_dir, init_parquet_metadata_dir, init_duckdb_index_dir +from libcommon.storage import ( + init_assets_dir, + init_duckdb_index_dir, + init_parquet_metadata_dir, +) from worker.config import AppConfig from worker.job_runner_factory import JobRunnerFactory diff --git a/services/worker/tests/conftest.py b/services/worker/tests/conftest.py index 987c2b0d57..5b32a89726 100644 --- a/services/worker/tests/conftest.py +++ b/services/worker/tests/conftest.py @@ -8,7 +8,12 @@ from libcommon.queue import _clean_queue_database from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import _clean_cache_database -from libcommon.storage import StrPath, init_assets_dir, init_parquet_metadata_dir +from libcommon.storage import ( + StrPath, + init_assets_dir, + init_duckdb_index_dir, + init_parquet_metadata_dir, +) from pytest import MonkeyPatch, fixture from worker.config import AppConfig @@ -114,6 +119,11 @@ def parquet_metadata_directory(app_config: AppConfig) -> StrPath: return init_parquet_metadata_dir(app_config.parquet_metadata.storage_directory) +@fixture +def duckdb_index_directory(app_config: AppConfig) -> StrPath: + return init_duckdb_index_dir(app_config.duckdb_index.storage_directory) + + @fixture def test_processing_graph() -> ProcessingGraph: return ProcessingGraph( diff --git a/services/worker/tests/test_executor.py b/services/worker/tests/test_executor.py index a34fa1f3aa..4dc2c47862 100644 --- a/services/worker/tests/test_executor.py +++ b/services/worker/tests/test_executor.py @@ -199,6 +199,7 @@ def job_runner_factory( libraries_resource: LibrariesResource, assets_directory: StrPath, parquet_metadata_directory: StrPath, + duckdb_index_directory: StrPath, ) -> JobRunnerFactory: processing_graph = ProcessingGraph(app_config.processing_graph.specification) return JobRunnerFactory( @@ -207,6 +208,7 @@ def job_runner_factory( hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, + duckdb_index_directory=duckdb_index_directory, ) diff --git a/services/worker/tests/test_job_runner_factory.py b/services/worker/tests/test_job_runner_factory.py index 3ed3b0e7e6..982c0ae2a5 100644 --- a/services/worker/tests/test_job_runner_factory.py +++ b/services/worker/tests/test_job_runner_factory.py @@ -39,6 +39,7 @@ def test_create_job_runner( libraries_resource: LibrariesResource, assets_directory: StrPath, parquet_metadata_directory: StrPath, + duckdb_index_directory: StrPath, job_type: str, expected_job_runner: Optional[str], ) -> None: @@ -48,6 +49,7 @@ def test_create_job_runner( hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, + duckdb_index_directory=duckdb_index_directory, ) job_info: JobInfo = { "type": job_type, From c53af5f6a8f80e165ec678937630073bce22f42e Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 08:33:23 -0400 Subject: [PATCH 04/52] WIP adding fts on API --- services/api/src/api/app.py | 12 ++++ services/api/src/api/routes/fts.py | 95 ++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 services/api/src/api/routes/fts.py diff --git 
a/services/api/src/api/app.py b/services/api/src/api/app.py index 13479c33ad..8b291f56f1 100644 --- a/services/api/src/api/app.py +++ b/services/api/src/api/app.py @@ -16,6 +16,7 @@ from api.config import AppConfig, EndpointConfig, UvicornConfig from api.jwt_token import fetch_jwt_public_key from api.routes.endpoint import EndpointsDefinition, create_endpoint +from api.routes.fts import create_fts_endpoint from api.routes.healthcheck import healthcheck_endpoint from api.routes.metrics import create_metrics_endpoint from api.routes.rows import create_rows_endpoint @@ -121,6 +122,17 @@ def create_app_with_config(app_config: AppConfig, endpoint_config: EndpointConfi max_age_short=app_config.api.max_age_short, ), ), + Route( + "/fts", + endpoint=create_fts_endpoint( + processing_graph=processing_graph, + cached_assets_directory=cached_assets_directory, + hf_endpoint=app_config.common.hf_endpoint, + hf_token=app_config.common.hf_token, + max_age_long=app_config.api.max_age_long, + max_age_short=app_config.api.max_age_short, + ), + ), ] return Starlette(routes=routes, middleware=middleware, on_shutdown=[resource.release for resource in resources]) diff --git a/services/api/src/api/routes/fts.py b/services/api/src/api/routes/fts.py new file mode 100644 index 0000000000..833766f824 --- /dev/null +++ b/services/api/src/api/routes/fts.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2022 The HuggingFace Authors. + +import logging +from os import PathLike +from typing import List, Optional, Set, Union + +import duckdb +from libcommon.processing_graph import ProcessingGraph +from libcommon.prometheus import StepProfiler +from libcommon.simple_cache import get_valid_datasets +from starlette.requests import Request +from starlette.responses import Response + +from api.routes.endpoint import get_cache_entry_from_steps +from api.utils import ( + Endpoint, + MissingRequiredParameterError, + UnexpectedError, + are_valid_parameters, + get_json_api_error_response, + get_json_ok_response, +) + + +def get_valid(processing_graph: ProcessingGraph) -> List[str]: + # a dataset is considered valid if at least one response for PROCESSING_STEPS_FOR_VALID + # is valid. 
+ datasets: Optional[Set[str]] = None + for processing_step in processing_graph.get_processing_steps_required_by_dataset_viewer(): + kind_datasets = get_valid_datasets(kind=processing_step.cache_kind) + if datasets is None: + # first iteration fills the set of datasets + datasets = kind_datasets + else: + # next iterations remove the datasets that miss a required processing step + datasets.intersection_update(kind_datasets) + # note that the list is sorted alphabetically for consistency + return [] if datasets is None else sorted(datasets) + + +StrPath = Union[str, PathLike[str]] + + +def create_fts_endpoint( + processing_graph: ProcessingGraph, + cached_assets_directory: StrPath, + hf_endpoint: str, + max_age_long: int = 0, + max_age_short: int = 0, + hf_token: Optional[str] = None, +) -> Endpoint: + async def fts_endpoint(request: Request) -> Response: + with StepProfiler(method="fts_endpoint", step="all"): + try: + logging.info("/fts") + # processing_step = processing_graph.get_processing_step("split-duckdb-index") + dataset = request.query_params.get("dataset") + config = request.query_params.get("config") + split = request.query_params.get("split") + query = request.query_params.get("query") + if not dataset or not config or not split or not are_valid_parameters([dataset, config, split]): + raise MissingRequiredParameterError("Parameter 'dataset', 'config' and 'split' are required") + if not query: + raise MissingRequiredParameterError("Parameter 'query' is required") + # upstream_result = get_cache_entry_from_steps( + # processing_steps=[processing_step], + # dataset=dataset, + # config=config, + # split=split, + # processing_graph=processing_graph, + # hf_endpoint=hf_endpoint, + # hf_token=hf_token, + # ) + # content = result["content"] + # duck_db_name = content["duckdb_db_name"] + + except Exception as e: + with StepProfiler(method="fts_endpoint", step="generate API error response"): + return get_json_api_error_response(UnexpectedError("Unexpected error.", e), max_age=max_age_short) + duckdb.execute("INSTALL 'fts';") + duckdb.execute("LOAD 'fts';") + # db_location = cached_assets_directory / duck_db_name + db_location = "/tmp/asoria/openfire/--/default/train/index.db" + con = duckdb.connect(str(db_location)) + result = con.execute( + ( + "SELECT fts_main_data.match_bm25(id, ?) 
AS score, * FROM data WHERE score IS NOT NULL ORDER BY" + " score DESC;" + ), + [query], + ).df() + return get_json_ok_response({"result": result.to_json()}, max_age=max_age_long) + + return fts_endpoint From 8cac1c54efe99d9441a2b1a336d98de0036c78fd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 16:01:33 -0400 Subject: [PATCH 05/52] Remove non used code --- libs/libcommon/src/libcommon/exceptions.py | 8 ++ services/api/src/api/app.py | 12 --- services/api/src/api/routes/fts.py | 95 ------------------- .../worker/job_runners/split/duckdb_index.py | 28 ++++-- 4 files changed, 28 insertions(+), 115 deletions(-) delete mode 100644 services/api/src/api/routes/fts.py diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index d9cead1daa..45dcbbcce7 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -116,6 +116,7 @@ def as_response(self) -> ErrorResponse: "TooManyColumnsError", "UnexpectedError", "UnsupportedExternalFilesError", + "UnsupportedIndexableColumnsError", ] @@ -513,3 +514,10 @@ class NoIndexableColumnsError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) + + +class UnsupportedIndexableColumnsError(CacheableError): + """Raised when some unsupported indexable columns present.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedIndexableColumnsError", cause, True) diff --git a/services/api/src/api/app.py b/services/api/src/api/app.py index 8b291f56f1..13479c33ad 100644 --- a/services/api/src/api/app.py +++ b/services/api/src/api/app.py @@ -16,7 +16,6 @@ from api.config import AppConfig, EndpointConfig, UvicornConfig from api.jwt_token import fetch_jwt_public_key from api.routes.endpoint import EndpointsDefinition, create_endpoint -from api.routes.fts import create_fts_endpoint from api.routes.healthcheck import healthcheck_endpoint from api.routes.metrics import create_metrics_endpoint from api.routes.rows import create_rows_endpoint @@ -122,17 +121,6 @@ def create_app_with_config(app_config: AppConfig, endpoint_config: EndpointConfi max_age_short=app_config.api.max_age_short, ), ), - Route( - "/fts", - endpoint=create_fts_endpoint( - processing_graph=processing_graph, - cached_assets_directory=cached_assets_directory, - hf_endpoint=app_config.common.hf_endpoint, - hf_token=app_config.common.hf_token, - max_age_long=app_config.api.max_age_long, - max_age_short=app_config.api.max_age_short, - ), - ), ] return Starlette(routes=routes, middleware=middleware, on_shutdown=[resource.release for resource in resources]) diff --git a/services/api/src/api/routes/fts.py b/services/api/src/api/routes/fts.py deleted file mode 100644 index 833766f824..0000000000 --- a/services/api/src/api/routes/fts.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2022 The HuggingFace Authors. 
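
For reference, the prototype /fts route above boils down to a single BM25 query against an index file produced by the split-duckdb-index job runner. A minimal standalone sketch of that query, assuming an index built with the row id column named id as in the prototype (later patches rename it to __id); the database path and search string are placeholders, not values from the codebase:

    import duckdb

    # Placeholders: a local index file written by the job runner, and a search term.
    db_location = "/tmp/duckdb-index/some-dataset/default/train/index.db"
    search_query = "fire"

    con = duckdb.connect(db_location)
    con.execute("INSTALL 'fts';")
    con.execute("LOAD 'fts';")
    # BM25 ranking over every indexed column of the 'data' table, best matches first.
    results = con.execute(
        "SELECT fts_main_data.match_bm25(id, ?) AS score, * "
        "FROM data WHERE score IS NOT NULL ORDER BY score DESC;",
        [search_query],
    ).df()  # .df() needs pandas; .fetchall() works without it
    print(results.head())
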
- -import logging -from os import PathLike -from typing import List, Optional, Set, Union - -import duckdb -from libcommon.processing_graph import ProcessingGraph -from libcommon.prometheus import StepProfiler -from libcommon.simple_cache import get_valid_datasets -from starlette.requests import Request -from starlette.responses import Response - -from api.routes.endpoint import get_cache_entry_from_steps -from api.utils import ( - Endpoint, - MissingRequiredParameterError, - UnexpectedError, - are_valid_parameters, - get_json_api_error_response, - get_json_ok_response, -) - - -def get_valid(processing_graph: ProcessingGraph) -> List[str]: - # a dataset is considered valid if at least one response for PROCESSING_STEPS_FOR_VALID - # is valid. - datasets: Optional[Set[str]] = None - for processing_step in processing_graph.get_processing_steps_required_by_dataset_viewer(): - kind_datasets = get_valid_datasets(kind=processing_step.cache_kind) - if datasets is None: - # first iteration fills the set of datasets - datasets = kind_datasets - else: - # next iterations remove the datasets that miss a required processing step - datasets.intersection_update(kind_datasets) - # note that the list is sorted alphabetically for consistency - return [] if datasets is None else sorted(datasets) - - -StrPath = Union[str, PathLike[str]] - - -def create_fts_endpoint( - processing_graph: ProcessingGraph, - cached_assets_directory: StrPath, - hf_endpoint: str, - max_age_long: int = 0, - max_age_short: int = 0, - hf_token: Optional[str] = None, -) -> Endpoint: - async def fts_endpoint(request: Request) -> Response: - with StepProfiler(method="fts_endpoint", step="all"): - try: - logging.info("/fts") - # processing_step = processing_graph.get_processing_step("split-duckdb-index") - dataset = request.query_params.get("dataset") - config = request.query_params.get("config") - split = request.query_params.get("split") - query = request.query_params.get("query") - if not dataset or not config or not split or not are_valid_parameters([dataset, config, split]): - raise MissingRequiredParameterError("Parameter 'dataset', 'config' and 'split' are required") - if not query: - raise MissingRequiredParameterError("Parameter 'query' is required") - # upstream_result = get_cache_entry_from_steps( - # processing_steps=[processing_step], - # dataset=dataset, - # config=config, - # split=split, - # processing_graph=processing_graph, - # hf_endpoint=hf_endpoint, - # hf_token=hf_token, - # ) - # content = result["content"] - # duck_db_name = content["duckdb_db_name"] - - except Exception as e: - with StepProfiler(method="fts_endpoint", step="generate API error response"): - return get_json_api_error_response(UnexpectedError("Unexpected error.", e), max_age=max_age_short) - duckdb.execute("INSTALL 'fts';") - duckdb.execute("LOAD 'fts';") - # db_location = cached_assets_directory / duck_db_name - db_location = "/tmp/asoria/openfire/--/default/train/index.db" - con = duckdb.connect(str(db_location)) - result = con.execute( - ( - "SELECT fts_main_data.match_bm25(id, ?) 
AS score, * FROM data WHERE score IS NOT NULL ORDER BY" - " score DESC;" - ), - [query], - ).df() - return get_json_ok_response({"result": result.to_json()}, max_age=max_age_long) - - return fts_endpoint diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 19f0b0e368..696ed71964 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -9,6 +9,7 @@ NoIndexableColumnsError, ParquetResponseEmptyError, PreviousStepFormatError, + UnsupportedIndexableColumnsError, ) from libcommon.processing_graph import ProcessingStep from libcommon.storage import StrPath @@ -26,6 +27,7 @@ STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_DB_NAME = "index.db" +UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: @@ -57,6 +59,15 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") + # look for image, audio and binary columns, if present, raise exeception do not supported yet and index everything + if any( + feature["name"] + for feature in features + if "_type" in feature["type"] + and feature["type"]["_type"] in UNSUPPORTED_FEATURES_MAGIC_STRINGS + ): + raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") + # get list of parquet urls config_parquet = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) try: @@ -69,7 +80,6 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc raise PreviousStepFormatError("Previous step did not return the expected content.") from e # create duckdb index location - # TODO: Need to manage re index, maybe delete folder/file or perform a table drop/delete? split_path, dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) @@ -84,13 +94,15 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc # index con = duckdb.connect(str(db_location)) - con.sql("CREATE SEQUENCE serial START 1;") - # TODO: We need a sequence id column for Full text search, maybe there is a better way - filter_columns = ",".join(string_columns) # TODO: What if already exists an id? need to create an identity column - con.sql( - f"CREATE TABLE data AS SELECT nextval('serial') AS id, {filter_columns} FROM read_parquet({parquet_urls});" - ) - con.sql("PRAGMA create_fts_index('data', 'id', '*');") + con.sql("CREATE OR REPLACE SEQUENCE serial START 1;") + + # TODO: What if already exists an __id field? need to create an identity column, maybe some random name? 
+ con.sql(f"CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM read_parquet({parquet_urls});") + con.sql("PRAGMA drop_fts_index('data');") + + # TODO: by default, 'porter' stemmer is being used, we might need to use a specific one by dataset language in the future + # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter + con.sql("PRAGMA create_fts_index('data', '__id', '*');") return IndexRowsResponse(duckdb_db_name=duck_db_name) From 23ce3eee617eb6af6efb125863f02a09f4d8e02b Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 16:05:51 -0400 Subject: [PATCH 06/52] Fix style --- .../worker/src/worker/job_runners/split/duckdb_index.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 696ed71964..a146694da1 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -63,8 +63,7 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc if any( feature["name"] for feature in features - if "_type" in feature["type"] - and feature["type"]["_type"] in UNSUPPORTED_FEATURES_MAGIC_STRINGS + if "_type" in feature["type"] and feature["type"]["_type"] in UNSUPPORTED_FEATURES_MAGIC_STRINGS ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") @@ -99,8 +98,8 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc # TODO: What if already exists an __id field? need to create an identity column, maybe some random name? con.sql(f"CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM read_parquet({parquet_urls});") con.sql("PRAGMA drop_fts_index('data');") - - # TODO: by default, 'porter' stemmer is being used, we might need to use a specific one by dataset language in the future + + # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter con.sql("PRAGMA create_fts_index('data', '__id', '*');") From ac0a2d9af7f78d4e34c73bc600b2f772dd3131dd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 16:48:57 -0400 Subject: [PATCH 07/52] Adding chart objects --- chart/Chart.yaml | 2 +- chart/templates/_envDuckDbIndex.tpl | 7 ++++++ chart/templates/_helpers.tpl | 9 +++++++ chart/templates/_initContainerDuckDBIndex.tpl | 21 ++++++++++++++++ chart/templates/_volumeMountDuckDBIndex.tpl | 10 ++++++++ chart/templates/worker/_container.tpl | 2 ++ chart/templates/worker/_deployment.yaml | 1 + chart/values.yaml | 3 +++ .../worker/job_runners/split/duckdb_index.py | 25 +++++++++++-------- tools/docker-compose-datasets-server.yml | 3 +++ tools/docker-compose-dev-datasets-server.yml | 3 +++ 11 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 chart/templates/_envDuckDbIndex.tpl create mode 100644 chart/templates/_initContainerDuckDBIndex.tpl create mode 100644 chart/templates/_volumeMountDuckDBIndex.tpl diff --git a/chart/Chart.yaml b/chart/Chart.yaml index a315e64960..9a4a0a379f 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -18,7 +18,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. 
# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.12.4 +version: 1.13.4 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/chart/templates/_envDuckDbIndex.tpl b/chart/templates/_envDuckDbIndex.tpl new file mode 100644 index 0000000000..a0a12059bb --- /dev/null +++ b/chart/templates/_envDuckDbIndex.tpl @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. + +{{- define "envDuckDBIndex" -}} +- name: DUCKDB_INDEX_STORAGE_DIRECTORY + value: {{ .Values.duckDBIndex.storageDirectory | quote }} +{{- end -}} diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl index 12aeec4c7d..ac9370e664 100644 --- a/chart/templates/_helpers.tpl +++ b/chart/templates/_helpers.tpl @@ -169,6 +169,15 @@ The parquet-metadata/ subpath in the NFS {{- printf "%s/%s/%s/" .Chart.Name .Release.Name "parquet-metadata" }} {{- end }} +{{/* +The duckdb-index/ subpath in the NFS +- in a subdirectory named as the chart (datasets-server/), and below it, +- in a subdirectory named as the Release, so that Releases will not share the same dir +*/}} +{{- define "duckDBIndex.subpath" -}} +{{- printf "%s/%s/%s/" .Chart.Name .Release.Name "duckdb-index" }} +{{- end }} + {{/* The datasets library will use this directory as a cache - in a subdirectory named as the chart (datasets-server/), and below it, diff --git a/chart/templates/_initContainerDuckDBIndex.tpl b/chart/templates/_initContainerDuckDBIndex.tpl new file mode 100644 index 0000000000..ed7cb43bc3 --- /dev/null +++ b/chart/templates/_initContainerDuckDBIndex.tpl @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. + +{{- define "initContainerDuckDBIndex" -}} +- name: prepare-duckdb-index + image: ubuntu:focal + imagePullPolicy: {{ .Values.images.pullPolicy }} + command: ["/bin/sh", "-c"] + args: + - chown {{ .Values.uid }}:{{ .Values.gid }} /mounted-path; + volumeMounts: + - mountPath: /mounted-path + mountPropagation: None + name: data + subPath: "{{ include "duckDBIndex.subpath" . }}" + readOnly: false + securityContext: + runAsNonRoot: false + runAsUser: 0 + runAsGroup: 0 +{{- end -}} diff --git a/chart/templates/_volumeMountDuckDBIndex.tpl b/chart/templates/_volumeMountDuckDBIndex.tpl new file mode 100644 index 0000000000..01c37b8919 --- /dev/null +++ b/chart/templates/_volumeMountDuckDBIndex.tpl @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. + +{{- define "volumeMountDuckDBIndexRW" -}} +- mountPath: {{ .Values.duckDBIndex.storageDirectory | quote }} + mountPropagation: None + name: data + subPath: "{{ include "duckDBIndex.subpath" . }}" + readOnly: false +{{- end -}} diff --git a/chart/templates/worker/_container.tpl b/chart/templates/worker/_container.tpl index 899c8c5800..d5c1ed9527 100644 --- a/chart/templates/worker/_container.tpl +++ b/chart/templates/worker/_container.tpl @@ -9,6 +9,7 @@ {{ include "envAssets" . | nindent 2 }} {{ include "envCache" . | nindent 2 }} {{ include "envParquetMetadata" . | nindent 2 }} + {{ include "envDuckDBIndex" . | nindent 2 }} {{ include "envQueue" . | nindent 2 }} {{ include "envCommon" . | nindent 2 }} {{ include "envLog" . | nindent 2 }} @@ -26,6 +27,7 @@ {{ include "volumeMountAssetsRW" . | nindent 2 }} {{ include "volumeMountCache" . | nindent 2 }} {{ include "volumeMountParquetMetadataRW" . 
| nindent 2 }} + {{ include "volumeMountDuckDBIndexRW" . | nindent 2 }} securityContext: allowPrivilegeEscalation: false resources: {{ toYaml .workerValues.resources | nindent 4 }} diff --git a/chart/templates/worker/_deployment.yaml b/chart/templates/worker/_deployment.yaml index e06d319c65..03a70646ae 100644 --- a/chart/templates/worker/_deployment.yaml +++ b/chart/templates/worker/_deployment.yaml @@ -26,6 +26,7 @@ spec: {{ include "initContainerAssets" . | nindent 8 }} {{ include "initContainerCache" . | nindent 8 }} {{ include "initContainerParquetMetadata" . | nindent 8 }} + {{ include "initContainerDuckDBIndex" . | nindent 8 }} containers: {{ include "containerWorker" . | nindent 8 }} nodeSelector: {{ toYaml .workerValues.nodeSelector | nindent 8 }} tolerations: {{ toYaml .workerValues.tolerations | nindent 8 }} diff --git a/chart/values.yaml b/chart/values.yaml index 0afac4b83e..d378e3d472 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -207,6 +207,9 @@ parquetMetadata: # Directory on the shared storage (parquet metadata files used for random access in /rows) storageDirectory: "/parquet-metadata" +duckDBIndex: + # Directory on the shared storage (duckdb db files used for datasets indexing) + storageDirectory: "/duckdb-index" # Directory where the cache data will be stored cacheDirectory: "/datasets-server-cache" diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index a146694da1..0f2a996465 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -28,6 +28,13 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_DB_NAME = "index.db" UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] +CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" +DROP_INDEX_COMMAND = "PRAGMA drop_fts_index('data');" +CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*');" +CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" +INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" +LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" +# TODO: What if __id field already exist? def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: @@ -86,22 +93,20 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc db_location = dir_path / DUCKDB_DEFAULT_DB_NAME # configure duckdb extensions - duckdb.execute("INSTALL 'httpfs';") - duckdb.execute("LOAD 'httpfs';") - duckdb.execute("INSTALL 'fts';") - duckdb.execute("LOAD 'fts';") + duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) + duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="httpfs")) + duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="fts")) + duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) # index con = duckdb.connect(str(db_location)) - con.sql("CREATE OR REPLACE SEQUENCE serial START 1;") - - # TODO: What if already exists an __id field? need to create an identity column, maybe some random name? 
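
Put together, the command constants introduced in this patch describe a small, self-contained build script. A minimal sketch of the same indexing flow run outside the job runner, assuming a publicly reachable list of parquet files and a writable local target (both placeholders); it mirrors the __id sequence column and FTS pragma used here:

    import duckdb

    # Placeholders: parquet shards to index and the local database file to create.
    parquet_urls = ["https://example.com/train-00000-of-00001.parquet"]
    db_location = "/tmp/duckdb-index/index.db"

    con = duckdb.connect(db_location)
    # httpfs lets read_parquet() fetch remote files; fts provides the index pragma.
    for extension in ("httpfs", "fts"):
        con.execute(f"INSTALL '{extension}';")
        con.execute(f"LOAD '{extension}';")

    # Monotonic row id used as the full-text-search document identifier.
    con.sql("CREATE OR REPLACE SEQUENCE serial START 1;")
    con.sql(
        "CREATE OR REPLACE TABLE data AS "
        f"SELECT nextval('serial') AS __id, * FROM read_parquet({parquet_urls});"
    )
    # Index every column of 'data', keyed on __id; DuckDB defaults to the 'porter' stemmer.
    con.sql("PRAGMA create_fts_index('data', '__id', '*');")
    con.close()
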
- con.sql(f"CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM read_parquet({parquet_urls});") - con.sql("PRAGMA drop_fts_index('data');") + con.sql(CREATE_SEQUENCE_COMMAND) + con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") + con.sql(DROP_INDEX_COMMAND) # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter - con.sql("PRAGMA create_fts_index('data', '__id', '*');") + con.sql(CREATE_INDEX_COMMAND) return IndexRowsResponse(duckdb_db_name=duck_db_name) diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index cead78fa98..de36efd79c 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -90,6 +90,7 @@ services: volumes: - assets:${ASSETS_STORAGE_DIRECTORY-/assets}:rw - parquet-metadata:${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata}:rw + - duckdb-index:${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index}:rw extends: file: docker-compose-base.yml service: datasets-worker @@ -110,6 +111,7 @@ services: PARQUET_AND_INFO_TARGET_REVISION: ${PARQUET_AND_INFO_TARGET_REVISION-refs/convert/parquet} PARQUET_AND_INFO_URL_TEMPLATE: ${PARQUET_AND_INFO_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} + DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} @@ -143,3 +145,4 @@ volumes: parquet-modules-cache: parquet-numba-cache: parquet-metadata: + duckdb-index: diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index 6489bc4155..ad6c44f3f8 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -94,6 +94,7 @@ services: volumes: - assets:${ASSETS_STORAGE_DIRECTORY-/assets}:rw - parquet-metadata:${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata}:rw + - duckdb-index:${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index}:rw extends: file: docker-compose-dev-base.yml service: datasets-worker @@ -114,6 +115,7 @@ services: PARQUET_AND_INFO_TARGET_REVISION: ${PARQUET_AND_INFO_TARGET_REVISION-refs/convert/parquet} PARQUET_AND_INFO_URL_TEMPLATE: ${PARQUET_AND_INFO_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} + DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} @@ -145,3 +147,4 @@ volumes: parquet-modules-cache: parquet-numba-cache: parquet-metadata: + duckdb-index: From dff50cffa5cb7963dd8509311e99443652e9b6d8 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 16:51:11 -0400 Subject: [PATCH 08/52] Rollback dependency in API --- services/api/poetry.lock | 59 +------------------------------------ services/api/pyproject.toml | 1 - 2 files changed, 1 insertion(+), 59 deletions(-) diff --git a/services/api/poetry.lock b/services/api/poetry.lock index 
a1e1e4ec2b..e5c9e22603 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -711,63 +711,6 @@ files = [ dnssec = ["ecdsa (>=0.13)", "pycryptodome"] idna = ["idna (>=2.1)"] -[[package]] -name = "duckdb" -version = "0.8.0" -description = "DuckDB embedded database" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6455aee00af30770c20f4a8c5e4347918cf59b578f49ee996a13807b12911871"}, - {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8cf0622ae7f86d4ce72791f8928af4357a46824aadf1b6879c7936b3db65344"}, - {file = "duckdb-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6132e8183ca3ae08a593e43c97cb189794077dedd48546e27ce43bd6a51a9c33"}, - {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe29e5343fa2a95f2cde4519a4f4533f4fd551a48d2d9a8ab5220d40ebf53610"}, - {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:945165987ca87c097dc0e578dcf47a100cad77e1c29f5dd8443d53ce159dc22e"}, - {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:673c60daf7ada1d9a8518286a6893ec45efabb64602954af5f3d98f42912fda6"}, - {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5075fe1ff97ae62331ca5c61e3597e6e9f7682a6fdd418c23ba5c4873ed5cd1"}, - {file = "duckdb-0.8.0-cp310-cp310-win32.whl", hash = "sha256:001f5102f45d3d67f389fa8520046c8f55a99e2c6d43b8e68b38ea93261c5395"}, - {file = "duckdb-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb00800f2e1e865584b13221e0121fce9341bb3a39a93e569d563eaed281f528"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2707096d6df4321044fcde2c9f04da632d11a8be60957fd09d49a42fae71a29"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b27df1b70ae74d2c88efb5ffca8490954fdc678099509a9c4404ca30acc53426"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75a97c800271b52dd0f37696d074c50576dcb4b2750b6115932a98696a268070"}, - {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:804cac261a5e016506a6d67838a65d19b06a237f7949f1704f0e800eb708286a"}, - {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6b9abca7fa6713e1d031c18485343b4de99742c7e1b85c10718aa2f31a4e2c6"}, - {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:51aa6d606d49072abcfeb3be209eb559ac94c1b5e70f58ac3adbb94aca9cd69f"}, - {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7c8dc769aaf2be0a1c57995ca657e5b92c1c56fc8437edb720ca6cab571adf14"}, - {file = "duckdb-0.8.0-cp311-cp311-win32.whl", hash = "sha256:c4207d18b42387c4a035846d8878eb967070198be8ac26fd77797ce320d1a400"}, - {file = "duckdb-0.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:0c392257547c20794c3072fcbca99a49ef0a49974005d755e93893e2b4875267"}, - {file = "duckdb-0.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2832379e122020814dbe869af7b9ddf3c9f21474cf345531145b099c63ffe17e"}, - {file = "duckdb-0.8.0-cp36-cp36m-win32.whl", hash = "sha256:914896526f7caba86b170f2c4f17f11fd06540325deeb0000cb4fb24ec732966"}, - {file = "duckdb-0.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:022ebda86d0e3204cdc206e4af45aa9f0ae0668b34c2c68cf88e08355af4a372"}, - {file = "duckdb-0.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:96a31c0f3f4ccbf0f5b18f94319f37691205d82f80aae48c6fe04860d743eb2c"}, - 
{file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a07c73c6e6a8cf4ce1a634625e0d1b17e5b817242a8a530d26ed84508dfbdc26"}, - {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424acbd6e857531b06448d757d7c2557938dbddbff0632092090efbf413b4699"}, - {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c83cfd2a868f1acb0692b9c3fd5ef1d7da8faa1348c6eabf421fbf5d8c2f3eb8"}, - {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5c6f6b2d8db56936f662c649539df81856b5a8cb769a31f9544edf18af2a11ff"}, - {file = "duckdb-0.8.0-cp37-cp37m-win32.whl", hash = "sha256:0bd6376b40a512172eaf4aa816813b1b9d68994292ca436ce626ccd5f77f8184"}, - {file = "duckdb-0.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:931221885bcf1e7dfce2400f11fd048a7beef566b775f1453bb1db89b828e810"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:42e7853d963d68e72403ea208bcf806b0f28c7b44db0aa85ce49bb124d56c133"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcc338399175be3d43366576600aef7d72e82114d415992a7a95aded98a0f3fd"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03dd08a4624d6b581a59f9f9dbfd34902416398d16795ad19f92361cf21fd9b5"}, - {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c7c24ea0c9d8563dbd5ad49ccb54b7a9a3c7b8c2833d35e5d32a08549cacea5"}, - {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb58f6505cc0f34b4e976154302d26563d2e5d16b206758daaa04b65e55d9dd8"}, - {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef37ac7880100c4b3f913c8483a29a13f8289313b9a07df019fadfa8e7427544"}, - {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2a4f5ee913ca8a6a069c78f8944b9934ffdbc71fd935f9576fdcea2a6f476f1"}, - {file = "duckdb-0.8.0-cp38-cp38-win32.whl", hash = "sha256:73831c6d7aefcb5f4072cd677b9efebecbf6c578946d21710791e10a1fc41b9a"}, - {file = "duckdb-0.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:faa36d2854734364d234f37d7ef4f3d763b73cd6b0f799cbc2a0e3b7e2575450"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50a31ec237ed619e50f9ab79eb0ec5111eb9697d4475da6e0ab22c08495ce26b"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:351abb4cc2d229d043920c4bc2a4c29ca31a79fef7d7ef8f6011cf4331f297bf"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:568550a163aca6a787bef8313e358590254de3f4019025a8d68c3a61253fedc1"}, - {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b82617f0e7f9fc080eda217090d82b42d4fad083bc9f6d58dfda9cecb7e3b29"}, - {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01c9be34d272532b75e8faedda0ff77fa76d1034cde60b8f5768ae85680d6d3"}, - {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8549d6a6bf5f00c012b6916f605416226507e733a3ffc57451682afd6e674d1b"}, - {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d145c6d51e55743c3ed1a74cffa109d9e72f82b07e203b436cfa453c925313a"}, - {file = "duckdb-0.8.0-cp39-cp39-win32.whl", hash = "sha256:f8610dfd21e90d7b04e8598b244bf3ad68599fd6ba0daad3428c03cbfd74dced"}, - {file = "duckdb-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:d0f0f104d30418808bafbe9bccdcd238588a07bd246b3cff13842d60bfd8e8ba"}, - {file = "duckdb-0.8.0.tar.gz", hash = 
"sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, -] - [[package]] name = "environs" version = "9.5.0" @@ -3497,4 +3440,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "1cbdff67ee9555ae24c1f162b595c50a5fa9fa2e37c2d3784728b01ebdb5a278" +content-hash = "4e76b1586360769e88d2439840cbbd3cb91c8b1087d4b17b0e4246d465cc163c" diff --git a/services/api/pyproject.toml b/services/api/pyproject.toml index a16ca6c02f..16a6fd6f3d 100644 --- a/services/api/pyproject.toml +++ b/services/api/pyproject.toml @@ -19,7 +19,6 @@ starlette = "^0.27.0" starlette-prometheus = "^0.9.0" uvicorn = "^0.20.0" watchdog = { extras = ["watchmedo"], version = "^2.2.1" } -duckdb = "^0.8.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" From 4659117a71fa27c3ce2148f7ee690001eae111d5 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 6 Jun 2023 10:50:35 -0400 Subject: [PATCH 09/52] Depend on parquet an split --- libs/libcommon/src/libcommon/config.py | 4 +- .../worker/job_runners/split/duckdb_index.py | 78 ++++++++++++++----- .../split/first_rows_from_parquet.py | 31 +------- services/worker/src/worker/utils.py | 30 ++++++- .../split/test_first_rows_from_parquet.py | 4 +- 5 files changed, 96 insertions(+), 51 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 5e01500cbf..b925e58bc5 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -341,8 +341,8 @@ class ProcessingGraphConfig: "split-duckdb-index": { "input_type": "split", "triggered_by": [ - "split-first-rows-from-streaming", - "split-first-rows-from-parquet", + "config-split-names-from-info", + "config-split-names-from-streaming", "config-parquet", ], "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 0f2a996465..0001f9609b 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -2,31 +2,40 @@ # Copyright 2023 The HuggingFace Authors. import logging +from functools import partial +from typing import List, Optional import duckdb +from datasets import Features from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( + FileSystemError, NoIndexableColumnsError, ParquetResponseEmptyError, PreviousStepFormatError, UnsupportedIndexableColumnsError, + SplitNotFoundError, ) from libcommon.processing_graph import ProcessingStep from libcommon.storage import StrPath from libcommon.utils import JobInfo from libcommon.viewer_utils.index_utils import create_index_dir_split +from pyarrow.parquet import ParquetFile +from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitJobRunner from worker.utils import ( CompleteJobResult, IndexRowsResponse, + get_hf_fs, + get_hf_parquet_uris, get_previous_step_or_raise, ) STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" -DUCKDB_DEFAULT_DB_NAME = "index.db" +DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" DROP_INDEX_COMMAND = "PRAGMA drop_fts_index('data');" @@ -37,22 +46,57 @@ # TODO: What if __id field already exist? 
-def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: +def compute_index_rows( + dataset: str, + config: str, + split: str, + duckdb_index_directory: StrPath, + hf_token: Optional[str], +) -> IndexRowsResponse: logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") - # get the first rows from previous job - upstream_response = get_previous_step_or_raise( - kinds=["split-first-rows-from-streaming", "split-first-rows-from-parquet"], - dataset=dataset, - config=config, - split=split, + # validate split + split_names_best_response = get_previous_step_or_raise( + kinds=["config-split-names-from-streaming", "config-split-names-from-info"], dataset=dataset, config=config ) try: - first_rows = upstream_response.response["content"] - features = first_rows["features"] - except KeyError as e: + splits_content = split_names_best_response.response["content"]["splits"] + except Exception as e: raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e + if split not in [split_item["split"] for split_item in splits_content]: + raise SplitNotFoundError(f"The split '{split}' does not exist for the config '{config}' of the dataset.") + + # get parquet content + config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) + + try: + parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] + sources = sorted( + f"{config}/{parquet_file['filename']}" + for parquet_file in parquet_files_content + if parquet_file["split"] == split and parquet_file["config"] == config + ) + if not sources: + raise ParquetResponseEmptyError("No parquet files found.") + except Exception as e: + raise PreviousStepFormatError("Previous step did not return the expected content.") from e + + logging.debug(f"Found {len(sources)} parquet files for {dataset=}, {config=}, {split=}: {sources}") + + fs = get_hf_fs(hf_token=hf_token) + source_uris = get_hf_parquet_uris(sources, dataset=dataset) + desc = f"{dataset}/{config}/{split}" + try: + parquet_files: List[ParquetFile] = thread_map( + partial(ParquetFile, filesystem=fs), source_uris, desc=desc, unit="pq", disable=True + ) + except Exception as e: + raise FileSystemError(f"Could not read the parquet files: {e}") from e + + # get the features + features = Features.from_arrow_schema(parquet_files[0].schema.to_arrow_schema()) + # look for string columns using the first rows string_columns = [ feature["name"] @@ -74,11 +118,8 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") - # get list of parquet urls - config_parquet = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) try: - parquet_files = config_parquet.response["content"]["parquet_files"] - parquet_urls = [content["url"] for content in parquet_files if content["split"] == split] + parquet_urls = [content["url"] for content in parquet_files_content if content["split"] == split] if not parquet_urls: raise ParquetResponseEmptyError("No parquet files found.") @@ -89,8 +130,8 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc split_path, dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) - duck_db_name = f"{split_path}/{DUCKDB_DEFAULT_DB_NAME}" - db_location = dir_path / 
DUCKDB_DEFAULT_DB_NAME + duckdb_index_filename = f"{split_path}/{DUCKDB_DEFAULT_INDEX_FILENAME}" + db_location = dir_path / DUCKDB_DEFAULT_INDEX_FILENAME # configure duckdb extensions duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) @@ -108,7 +149,7 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter con.sql(CREATE_INDEX_COMMAND) - return IndexRowsResponse(duckdb_db_name=duck_db_name) + return IndexRowsResponse(duckdb_index_filename=duckdb_index_filename) class SplitDuckDbIndexJobRunner(SplitJobRunner): @@ -143,5 +184,6 @@ def compute(self) -> CompleteJobResult: config=self.config, split=self.split, duckdb_index_directory=self.duckdb_index_directory, + hf_token=self.app_config.common.hf_token, ) ) diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py index c8685b7a6f..e39ac4ff34 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py @@ -2,15 +2,12 @@ # Copyright 2022 The HuggingFace Authors. import logging -from functools import lru_cache, partial +from functools import partial from typing import List, Optional import pyarrow as pa from datasets import Features -from huggingface_hub import HfFileSystem -from huggingface_hub.hf_file_system import safe_quote from libcommon.constants import ( - PARQUET_REVISION, PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION, PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION, ) @@ -38,6 +35,8 @@ RowItem, SplitFirstRowsResponse, create_truncated_row_items, + get_hf_fs, + get_hf_parquet_uris, get_json_size, get_previous_step_or_raise, to_features_list, @@ -72,30 +71,6 @@ def transform_rows( ] -@lru_cache(maxsize=128) -def get_hf_fs(hf_token: Optional[str]) -> HfFileSystem: - """Get the Hugging Face filesystem. - - Args: - hf_token (Optional[str]): The token to access the filesystem. - Returns: - HfFileSystem: The Hugging Face filesystem. - """ - return HfFileSystem(token=hf_token) - - -def get_hf_parquet_uris(paths: List[str], dataset: str) -> List[str]: - """Get the Hugging Face URIs from the Parquet branch of the dataset repository (see PARQUET_REVISION). - - Args: - paths (List[str]): List of paths. - dataset (str): The dataset name. - Returns: - List[str]: List of Parquet URIs. 
- """ - return [f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{path}" for path in paths] - - def compute_first_rows_response( dataset: str, config: str, diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 69aaba10c9..580c73d2a2 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -7,6 +7,7 @@ import time import warnings from dataclasses import dataclass, field +from functools import lru_cache from http import HTTPStatus from typing import ( Any, @@ -28,6 +29,9 @@ IterableDataset, load_dataset, ) +from huggingface_hub import HfFileSystem +from huggingface_hub.hf_file_system import safe_quote +from libcommon.constants import PARQUET_REVISION from libcommon.exceptions import NormalRowsError, StreamingRowsError from libcommon.simple_cache import BestResponse, CachedArtifactError, get_best_response from libcommon.utils import orjson_dumps @@ -133,7 +137,7 @@ class ImageUrlColumnsResponse(TypedDict): class IndexRowsResponse(TypedDict): - duckdb_db_name: str + duckdb_index_filename: str Row = Mapping[str, Any] @@ -421,3 +425,27 @@ def get_previous_step_or_raise( cache_entry_with_details=best_response.response, ) return best_response + + +@lru_cache(maxsize=128) +def get_hf_fs(hf_token: Optional[str]) -> HfFileSystem: + """Get the Hugging Face filesystem. + + Args: + hf_token (Optional[str]): The token to access the filesystem. + Returns: + HfFileSystem: The Hugging Face filesystem. + """ + return HfFileSystem(token=hf_token) + + +def get_hf_parquet_uris(paths: List[str], dataset: str) -> List[str]: + """Get the Hugging Face URIs from the Parquet branch of the dataset repository (see PARQUET_REVISION). + + Args: + paths (List[str]): List of paths. + dataset (str): The dataset name. + Returns: + List[str]: List of Parquet URIs. 
+ """ + return [f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{path}" for path in paths] diff --git a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py index 46a92c80d5..8ab70191ea 100644 --- a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py +++ b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py @@ -107,9 +107,9 @@ def test_compute( http_status=HTTPStatus.OK, ) - with patch("worker.job_runners.split.first_rows_from_parquet.get_hf_fs") as mock_read: + with patch("worker.utils.get_hf_fs") as mock_read: with patch( - "worker.job_runners.split.first_rows_from_parquet.get_hf_parquet_uris", + "worker.utils.get_hf_parquet_uris", side_effect=mock_get_hf_parquet_uris, ): initial_location = os.getcwd() From f0794a8f2fd35d9689c3818d97826aba10d59e71 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 6 Jun 2023 13:42:59 -0400 Subject: [PATCH 10/52] Fix libcommon test --- libs/libcommon/tests/test_processing_graph.py | 23 ++++++++++++++++-- .../worker/job_runners/split/duckdb_index.py | 24 +++++++------------ 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/libs/libcommon/tests/test_processing_graph.py b/libs/libcommon/tests/test_processing_graph.py index a4861594a4..c5946ab36c 100644 --- a/libs/libcommon/tests/test_processing_graph.py +++ b/libs/libcommon/tests/test_processing_graph.py @@ -93,13 +93,19 @@ def graph() -> ProcessingGraph: "config-opt-in-out-urls-count", "split-first-rows-from-streaming", "dataset-split-names", + "split-duckdb-index", ], ["config-info"], ["dataset-config-names", "config-parquet-and-info", "config-info"], ), ( "config-split-names-from-streaming", - ["split-first-rows-from-streaming", "dataset-split-names", "config-opt-in-out-urls-count"], + [ + "split-first-rows-from-streaming", + "dataset-split-names", + "config-opt-in-out-urls-count", + "split-duckdb-index", + ], ["dataset-config-names"], ["dataset-config-names"], ), @@ -142,7 +148,7 @@ def graph() -> ProcessingGraph: ), ( "config-parquet", - ["config-parquet-metadata", "split-first-rows-from-parquet", "dataset-parquet"], + ["config-parquet-metadata", "split-first-rows-from-parquet", "dataset-parquet", "split-duckdb-index"], ["config-parquet-and-info"], ["dataset-config-names", "config-parquet-and-info"], ), @@ -287,6 +293,19 @@ def graph() -> ProcessingGraph: "split-image-url-columns", ], ), + ( + "split-duckdb-index", + [], + ["config-parquet", "config-split-names-from-streaming", "config-split-names-from-info"], + [ + "config-split-names-from-streaming", + "config-split-names-from-info", + "config-parquet-and-info", + "config-info", + "config-parquet", + "dataset-config-names", + ], + ), ], ) def test_default_graph_steps( diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 0001f9609b..2704413e97 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -13,8 +13,8 @@ NoIndexableColumnsError, ParquetResponseEmptyError, PreviousStepFormatError, - UnsupportedIndexableColumnsError, SplitNotFoundError, + UnsupportedIndexableColumnsError, ) from libcommon.processing_graph import ProcessingStep from libcommon.storage import StrPath @@ -38,8 +38,7 @@ DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] CREATE_SEQUENCE_COMMAND 
= "CREATE OR REPLACE SEQUENCE serial START 1;" -DROP_INDEX_COMMAND = "PRAGMA drop_fts_index('data');" -CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*');" +CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" @@ -98,23 +97,19 @@ def compute_index_rows( features = Features.from_arrow_schema(parquet_files[0].schema.to_arrow_schema()) # look for string columns using the first rows - string_columns = [ - feature["name"] - for feature in features - if "dtype" in feature["type"] - and "_type" in feature["type"] - and feature["type"]["dtype"] == STRING_FEATURE_DTYPE - and feature["type"]["_type"] == VALUE_FEATURE_TYPE - ] + string_columns = [column for column, feature in features.items() if STRING_FEATURE_DTYPE in str(feature)] if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") # look for image, audio and binary columns, if present, raise exeception do not supported yet and index everything if any( - feature["name"] - for feature in features - if "_type" in feature["type"] and feature["type"]["_type"] in UNSUPPORTED_FEATURES_MAGIC_STRINGS + feature + for feature in features.values() + if next( + (feature_type for feature_type in UNSUPPORTED_FEATURES_MAGIC_STRINGS if feature_type in str(feature)), None + ) + is not None ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") @@ -143,7 +138,6 @@ def compute_index_rows( con = duckdb.connect(str(db_location)) con.sql(CREATE_SEQUENCE_COMMAND) con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") - con.sql(DROP_INDEX_COMMAND) # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter From 05d33624b602ab8f268d6e9bcccaa6d4ad305470 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 6 Jun 2023 16:50:46 -0400 Subject: [PATCH 11/52] Send index file to dedicated branch --- libs/libcommon/src/libcommon/config.py | 12 ++ .../worker/job_runners/split/duckdb_index.py | 125 +++++++++++++++++- services/worker/src/worker/utils.py | 7 +- 3 files changed, 139 insertions(+), 5 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index b925e58bc5..caa34d6e84 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -106,11 +106,19 @@ def from_env(cls) -> "ParquetMetadataConfig": DUCKDB_INDEX_STORAGE_DIRECTORY = None +DUCKDB_INDEX_COMMIT_MESSAGE = "Update duckdb index file" +DUCKDB_INDEX_COMMITTER_HF_TOKEN = None +DUCKDB_INDEX_TARGET_REVISION = "duckdb/index" +DUCKDB_INDEX_URL_TEMPLATE = "/datasets/%s/resolve/%s/%s" @dataclass(frozen=True) class DuckDbIndexConfig: storage_directory: Optional[str] = DUCKDB_INDEX_STORAGE_DIRECTORY + commit_message: str = DUCKDB_INDEX_COMMIT_MESSAGE + committer_hf_token: Optional[str] = DUCKDB_INDEX_COMMITTER_HF_TOKEN + target_revision: str = DUCKDB_INDEX_TARGET_REVISION + url_template: str = DUCKDB_INDEX_URL_TEMPLATE @classmethod def from_env(cls) -> "DuckDbIndexConfig": @@ -118,6 +126,10 @@ def from_env(cls) -> "DuckDbIndexConfig": with env.prefixed("DUCKDB_INDEX_"): return cls( storage_directory=env.str(name="STORAGE_DIRECTORY", 
default=DUCKDB_INDEX_STORAGE_DIRECTORY), + commit_message=env.str(name="COMMIT_MESSAGE", default=DUCKDB_INDEX_COMMIT_MESSAGE), + committer_hf_token=env.str(name="COMMITTER_HF_TOKEN", default=DUCKDB_INDEX_COMMITTER_HF_TOKEN), + target_revision=env.str(name="TARGET_REVISION", default=DUCKDB_INDEX_TARGET_REVISION), + url_template=env.str(name="URL_TEMPLATE", default=DUCKDB_INDEX_URL_TEMPLATE), ) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 2704413e97..827875cd05 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -3,12 +3,23 @@ import logging from functools import partial -from typing import List, Optional +from pathlib import Path +from typing import List, Optional, Set +from urllib.parse import quote import duckdb from datasets import Features +from huggingface_hub._commit_api import ( + CommitOperation, + CommitOperationAdd, + CommitOperationDelete, +) +from huggingface_hub.hf_api import HfApi, RepoFile +from huggingface_hub.utils._errors import RepositoryNotFoundError +from libcommon.config import DuckDbIndexConfig from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( + DatasetNotFoundError, FileSystemError, NoIndexableColumnsError, ParquetResponseEmptyError, @@ -17,7 +28,7 @@ UnsupportedIndexableColumnsError, ) from libcommon.processing_graph import ProcessingStep -from libcommon.storage import StrPath +from libcommon.storage import StrPath, remove_dir from libcommon.utils import JobInfo from libcommon.viewer_utils.index_utils import create_index_dir_split from pyarrow.parquet import ParquetFile @@ -33,6 +44,7 @@ get_previous_step_or_raise, ) +DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" @@ -45,12 +57,48 @@ # TODO: What if __id field already exist? 
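
The compute_index_rows changes that follow no longer keep the .db file only on local storage: they commit it to a dedicated duckdb/index branch of the dataset repository, the target revision configured above. A condensed sketch of that huggingface_hub flow, assuming a dataset repo and a token with write access (repo id, token and paths below are placeholders):

    from huggingface_hub import CommitOperationAdd, CommitOperationDelete, HfApi

    dataset = "user/some-dataset"                # placeholder repo id
    target_revision = "duckdb/index"             # dedicated branch, as configured above
    path_in_repo = "default/train-index.db"      # placeholder path inside the branch
    local_db = "/tmp/duckdb-index/index.db"      # placeholder local index file
    hf_api = HfApi(endpoint="https://huggingface.co", token="hf_xxx")  # placeholder token

    # Create the branch if it does not exist yet.
    refs = hf_api.list_repo_refs(repo_id=dataset, repo_type="dataset")
    if target_revision not in {branch.name for branch in refs.branches}:
        hf_api.create_branch(repo_id=dataset, branch=target_revision, repo_type="dataset")

    # Replace any previous index file and upload the fresh one in a single commit.
    target_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False)
    operations = []
    if any(sibling.rfilename == path_in_repo for sibling in target_info.siblings):
        operations.append(CommitOperationDelete(path_in_repo=path_in_repo))
    operations.append(CommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=local_db))
    hf_api.create_commit(
        repo_id=dataset,
        repo_type="dataset",
        revision=target_revision,
        operations=operations,
        commit_message="Update duckdb index file",
        parent_commit=target_info.sha,
    )
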
+def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url_template: str) -> str: + return (hf_endpoint + url_template) % (repo_id, quote(revision, safe=""), filename) + + +def create_index_item( + repo_file: RepoFile, + dataset: str, + config: str, + split: str, + hf_endpoint: str, + target_revision: str, + url_template: str, +) -> IndexRowsResponse: + if repo_file.size is None: + raise ValueError(f"Cannot get size of {repo_file.rfilename}") + return { + "dataset": dataset, + "config": config, + "split": split, + "url": hf_hub_url( + repo_id=dataset, + filename=repo_file.rfilename, + hf_endpoint=hf_endpoint, + revision=target_revision, + url_template=url_template, + ), + "filename": Path(repo_file.rfilename).name, + "size": repo_file.size, + } + + def compute_index_rows( dataset: str, config: str, split: str, duckdb_index_directory: StrPath, + target_revision: str, + hf_endpoint: str, + commit_message: str, + url_template: str, hf_token: Optional[str], + committer_hf_token: Optional[str], ) -> IndexRowsResponse: logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") @@ -125,7 +173,6 @@ def compute_index_rows( split_path, dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) - duckdb_index_filename = f"{split_path}/{DUCKDB_DEFAULT_INDEX_FILENAME}" db_location = dir_path / DUCKDB_DEFAULT_INDEX_FILENAME # configure duckdb extensions @@ -143,10 +190,74 @@ def compute_index_rows( # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter con.sql(CREATE_INDEX_COMMAND) - return IndexRowsResponse(duckdb_index_filename=duckdb_index_filename) + # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) + hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) + committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) + + try: + refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) + if all(ref.ref != target_revision for ref in refs.converts): + initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id + committer_hf_api.create_branch( + repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit + ) + except RepositoryNotFoundError as err: + raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err + except Exception as e: + # TODO: improve error handling + logging.error(str(e)) + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) + all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} + previous_index = f"{config}/{split}-{DUCKDB_DEFAULT_INDEX_FILENAME}" + delete_operations: List[CommitOperation] = [] + if previous_index in all_repo_files: + delete_operations.append(CommitOperationDelete(path_in_repo=previous_index)) + logging.debug(f"{delete_operations=}") + + # send the files to the target revision + add_operations: List[CommitOperation] = [ + CommitOperationAdd( + path_in_repo=f"{config}/{split}-{DUCKDB_DEFAULT_INDEX_FILENAME}", path_or_fileobj=db_location + ) + ] + logging.debug(f"{add_operations=}") + + # TODO: Delete local index file + committer_hf_api.create_commit( + repo_id=dataset, + repo_type=DATASET_TYPE, + revision=target_revision, + operations=delete_operations + add_operations, + commit_message=commit_message, + parent_commit=target_dataset_info.sha, + ) + + # call the 
API again to get the list of parquet files + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) + repo_files = [ + repo_file + for repo_file in target_dataset_info.siblings + if repo_file.rfilename.startswith(f"{config}/{split}") and repo_file.rfilename.endswith(".db") + ] + if len(repo_files) != 1: + # TODO: improve exception type + raise Exception("NO FILE WAS UPLOADED TO BRANCH") + index_file = repo_files[0] + + remove_dir(dir_path) + return create_index_item( + repo_file=index_file, + dataset=dataset, + config=config, + split=split, + hf_endpoint=hf_endpoint, + target_revision=target_revision, + url_template=url_template, + ) class SplitDuckDbIndexJobRunner(SplitJobRunner): + duckdb_index_config: DuckDbIndexConfig duckdb_index_directory: StrPath def __init__( @@ -162,6 +273,7 @@ def __init__( processing_step=processing_step, ) self.duckdb_index_directory = duckdb_index_directory + self.duckdb_index_config = app_config.duckdb_index @staticmethod def get_job_type() -> str: @@ -179,5 +291,10 @@ def compute(self) -> CompleteJobResult: split=self.split, duckdb_index_directory=self.duckdb_index_directory, hf_token=self.app_config.common.hf_token, + url_template=self.duckdb_index_config.url_template, + commit_message=self.duckdb_index_config.commit_message, + committer_hf_token=self.duckdb_index_config.committer_hf_token, + hf_endpoint=self.app_config.common.hf_endpoint, + target_revision=self.duckdb_index_config.target_revision, ) ) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 580c73d2a2..8e849055bc 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -137,7 +137,12 @@ class ImageUrlColumnsResponse(TypedDict): class IndexRowsResponse(TypedDict): - duckdb_index_filename: str + dataset: str + config: str + split: str + url: str + filename: str + size: int Row = Mapping[str, Any] From cec74e3ab98e4e5817244744256c8eafd56f3274 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 7 Jun 2023 12:50:55 -0400 Subject: [PATCH 12/52] Fix test in first parquet --- .../split/first_rows_from_parquet.py | 5 +- services/worker/tests/conftest.py | 2 +- services/worker/tests/fixtures/fsspec.py | 119 ++++++++++++++++++ .../split/test_first_rows_from_parquet.py | 43 +++++-- 4 files changed, 155 insertions(+), 14 deletions(-) create mode 100644 services/worker/tests/fixtures/fsspec.py diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py index 7fe3f6e9e5..d1b0678333 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py @@ -111,7 +111,10 @@ def compute_first_rows_response( partial(ParquetFile, filesystem=fs), source_uris, desc=desc, unit="pq", disable=True ) except Exception as e: - raise FileSystemError(f"Could not read the parquet files: {e}") from e + raise e + # print(f"ERROR") + # print(str(e)) + # raise FileSystemError(f"Could not read the parquet files: {e}") from e # get the features features = Features.from_arrow_schema(parquet_files[0].schema.to_arrow_schema()) diff --git a/services/worker/tests/conftest.py b/services/worker/tests/conftest.py index d2a80b3055..6092d5babf 100644 --- a/services/worker/tests/conftest.py +++ b/services/worker/tests/conftest.py @@ -150,4 +150,4 @@ def another_processing_step(test_processing_graph: ProcessingGraph) -> 
Processin # Import fixture modules as plugins -pytest_plugins = ["tests.fixtures.datasets", "tests.fixtures.files", "tests.fixtures.hub"] +pytest_plugins = ["tests.fixtures.datasets", "tests.fixtures.files", "tests.fixtures.hub", "tests.fixtures.fsspec"] diff --git a/services/worker/tests/fixtures/fsspec.py b/services/worker/tests/fixtures/fsspec.py new file mode 100644 index 0000000000..848dceb54a --- /dev/null +++ b/services/worker/tests/fixtures/fsspec.py @@ -0,0 +1,119 @@ +# type: ignore +import posixpath +import shutil +from pathlib import Path +from unittest.mock import patch + +import fsspec +import pytest +from fsspec.implementations.local import ( + AbstractFileSystem, + LocalFileSystem, + stringify_path, +) + + +class MockFileSystem(AbstractFileSystem): + protocol = "mock" + + def __init__(self, *args, local_root_dir, **kwargs): + super().__init__() + self._fs = LocalFileSystem(*args, **kwargs) + self.local_root_dir = Path(local_root_dir).resolve().as_posix() + "/" + + def mkdir(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.mkdir(path, *args, **kwargs) + + def makedirs(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.makedirs(path, *args, **kwargs) + + def rmdir(self, path): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.rmdir(path) + + def ls(self, path, detail=True, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + out = self._fs.ls(path, detail=detail, *args, **kwargs) + if detail: + return [{**info, "name": info["name"][len(self.local_root_dir) :]} for info in out] # noqa: E203 + else: + return [name[len(self.local_root_dir) :] for name in out] # noqa: E203 + + def info(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + out = dict(self._fs.info(path, *args, **kwargs)) + out["name"] = out["name"][len(self.local_root_dir) :] # noqa: E203 + return out + + def cp_file(self, path1, path2, *args, **kwargs): + path1 = posixpath.join(self.local_root_dir, self._strip_protocol(path1)) + path2 = posixpath.join(self.local_root_dir, self._strip_protocol(path2)) + return self._fs.cp_file(path1, path2, *args, **kwargs) + + def rm_file(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.rm_file(path, *args, **kwargs) + + def rm(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.rm(path, *args, **kwargs) + + def _open(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs._open(path, *args, **kwargs) + + def created(self, path): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.created(path) + + def modified(self, path): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.modified(path) + + @classmethod + def _strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("mock://"): + path = path[7:] + return path + + +class TmpDirFileSystem(MockFileSystem): + protocol = "tmp" + tmp_dir = None + + def __init__(self, *args, **kwargs): + assert self.tmp_dir is not None, "TmpDirFileSystem.tmp_dir is not set" + super().__init__(*args, **kwargs, local_root_dir=self.tmp_dir, auto_mkdir=True) + + @classmethod + def 
_strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("tmp://"): + path = path[6:] + return path + + +@pytest.fixture +def mock_fsspec(): + original_registry = fsspec.registry.copy() + fsspec.register_implementation("mock", MockFileSystem) + fsspec.register_implementation("tmp", TmpDirFileSystem) + yield + fsspec.registry = original_registry + + +@pytest.fixture +def mockfs(tmp_path_factory, mock_fsspec): + local_fs_dir = tmp_path_factory.mktemp("mockfs") + return MockFileSystem(local_root_dir=local_fs_dir, auto_mkdir=True) + + +@pytest.fixture +def tmpfs(tmp_path_factory, mock_fsspec): + tmp_fs_dir = tmp_path_factory.mktemp("tmpfs") + with patch.object(TmpDirFileSystem, "tmp_dir", tmp_fs_dir): + yield TmpDirFileSystem() + shutil.rmtree(tmp_fs_dir) diff --git a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py index 8ab70191ea..591557f74a 100644 --- a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py +++ b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py @@ -4,9 +4,9 @@ import os from dataclasses import replace from http import HTTPStatus -from typing import Callable, List +from typing import Callable, List, Generator from unittest.mock import patch - +from datasets import Dataset import pytest from libcommon.exceptions import CustomError from libcommon.processing_graph import ProcessingGraph @@ -14,7 +14,7 @@ from libcommon.simple_cache import upsert_response from libcommon.storage import StrPath from libcommon.utils import Priority -from pyarrow.fs import LocalFileSystem +from fsspec import AbstractFileSystem from worker.config import AppConfig from worker.job_runners.split.first_rows_from_parquet import ( @@ -70,20 +70,38 @@ def _get_job_runner( def mock_get_hf_parquet_uris(paths: List[str], dataset: str) -> List[str]: + print("----------------------------->>>") return paths +@pytest.fixture +def ds() -> Dataset: + return Dataset.from_dict({"text": ["Hello there", "General Kenobi"]}) + +@pytest.fixture +def ds_fs(ds: Dataset, tmpfs: AbstractFileSystem) -> Generator[AbstractFileSystem, None, None]: + with tmpfs.open("config/dataset-split.parquet", "wb") as f: + print("---->AAAA") + try: + ds.to_parquet(f) + except Exception as e: + print("-------------->CCCCCCCCCCCC") + print(str(e)) + print("---->BBBB") + yield tmpfs + @pytest.mark.parametrize( "rows_max_bytes,columns_max_number,error_code", [ (0, 10, "TooBigContentError"), # too small limit, even with truncation - (1_000, 1, "TooManyColumnsError"), # too small columns limit - (1_000, 10, None), + # (1_000, 1, "TooManyColumnsError"), # too small columns limit + # (1_000, 10, None), ], ) def test_compute( get_job_runner: GetJobRunner, app_config: AppConfig, + ds_fs: AbstractFileSystem, rows_max_bytes: int, columns_max_number: int, error_code: str, @@ -99,6 +117,7 @@ def test_compute( "dataset": dataset, "config": config, "split": split, + "url": f"https://fake.huggingface.co/datasets/ds/resolve/refs%2Fconvert%2Fparquet/{config}/{dataset}-{split}.parquet", # noqa: E501 "filename": f"{dataset}-{split}.parquet", "size": 1000, } @@ -107,16 +126,16 @@ def test_compute( http_status=HTTPStatus.OK, ) - with patch("worker.utils.get_hf_fs") as mock_read: + with patch("worker.utils.get_hf_fs", return_value=ds_fs): with patch( "worker.utils.get_hf_parquet_uris", side_effect=mock_get_hf_parquet_uris, ): - initial_location = os.getcwd() - os.chdir("tests/job_runners/split") - # TODO: Make 
localsystem by relative path - fs = LocalFileSystem() - mock_read.return_value = fs + # initial_location = os.getcwd() + # os.chdir("tests/job_runners/split") + # # TODO: Make localsystem by relative path + # fs = LocalFileSystem() + # mock_read.return_value = fs # ^ Mocking file system with local file job_runner = get_job_runner( dataset, @@ -167,4 +186,4 @@ def test_compute( assert response["rows"][2]["row_idx"] == 2 assert response["rows"][2]["truncated_cells"] == [] assert response["rows"][2]["row"] == {"col1": 3, "col2": "c"} - os.chdir(initial_location) + # os.chdir(initial_location) From 8679ce9183d3821d12459ff88c5d4df70f63b1f2 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 9 Jun 2023 14:16:34 -0400 Subject: [PATCH 13/52] Fix merge hanges --- .../worker/job_runners/split/duckdb_index.py | 10 +++------- .../split/first_rows_from_parquet.py | 2 -- services/worker/src/worker/utils.py | 3 --- .../split/test_first_rows_from_parquet.py | 18 ++++-------------- 4 files changed, 7 insertions(+), 26 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 827875cd05..69e662af0c 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -27,7 +27,9 @@ SplitNotFoundError, UnsupportedIndexableColumnsError, ) +from libcommon.parquet_utils import get_hf_fs, get_hf_parquet_uris from libcommon.processing_graph import ProcessingStep +from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath, remove_dir from libcommon.utils import JobInfo from libcommon.viewer_utils.index_utils import create_index_dir_split @@ -36,13 +38,7 @@ from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitJobRunner -from worker.utils import ( - CompleteJobResult, - IndexRowsResponse, - get_hf_fs, - get_hf_parquet_uris, - get_previous_step_or_raise, -) +from worker.utils import CompleteJobResult, IndexRowsResponse DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py index b79083c3be..64112c28b4 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py @@ -29,8 +29,6 @@ RowItem, SplitFirstRowsResponse, create_truncated_row_items, - get_hf_fs, - get_hf_parquet_uris, get_json_size, to_features_list, ) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 916c9f6bea..b28c4193c2 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -27,9 +27,6 @@ IterableDataset, load_dataset, ) -from huggingface_hub import HfFileSystem -from huggingface_hub.hf_file_system import safe_quote -from libcommon.constants import PARQUET_REVISION from libcommon.exceptions import NormalRowsError, StreamingRowsError from libcommon.utils import orjson_dumps diff --git a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py index df6253cf7c..3dcb37c15b 100644 --- a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py +++ b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py @@ -5,7 +5,7 @@ from http import HTTPStatus from 
typing import Callable, Generator, List from unittest.mock import patch -from datasets import Dataset + import pytest from datasets import Dataset from fsspec import AbstractFileSystem @@ -91,29 +91,19 @@ def ds_fs(ds: Dataset, tmpfs: AbstractFileSystem) -> Generator[AbstractFileSyste def mock_get_hf_parquet_uris(paths: List[str], dataset: str) -> List[str]: return paths -@pytest.fixture -def ds() -> Dataset: - return Dataset.from_dict({"text": ["Hello there", "General Kenobi"]}) - -@pytest.fixture -def ds_fs(ds: Dataset, tmpfs: AbstractFileSystem) -> Generator[AbstractFileSystem, None, None]: - with tmpfs.open("config/dataset-split.parquet", "wb") as f: - ds.to_parquet(f) - yield tmpfs - @pytest.mark.parametrize( "rows_max_bytes,columns_max_number,error_code", [ (0, 10, "TooBigContentError"), # too small limit, even with truncation - # (1_000, 1, "TooManyColumnsError"), # too small columns limit - # (1_000, 10, None), + (1_000, 1, "TooManyColumnsError"), # too small columns limit + (1_000, 10, None), ], ) def test_compute( + ds_fs: AbstractFileSystem, get_job_runner: GetJobRunner, app_config: AppConfig, - ds_fs: AbstractFileSystem, rows_max_bytes: int, columns_max_number: int, error_code: str, From 163928e51354cb621ad2261b4c7cdfdd41b84457 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 9 Jun 2023 14:28:31 -0400 Subject: [PATCH 14/52] Fix poetry files --- services/api/poetry.lock | 1 + services/worker/poetry.lock | 43 +++------------------------------- services/worker/pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 41 deletions(-) diff --git a/services/api/poetry.lock b/services/api/poetry.lock index ef10527085..a4b0eaf999 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -2867,6 +2867,7 @@ files = [ {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, + {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index c82510a896..59cb048ebc 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. 
[[package]] name = "absl-py" @@ -1051,44 +1051,6 @@ files = [ {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, ] -[[package]] -name = "elastic-transport" -version = "8.4.0" -description = "Transport classes and utilities shared among Python Elastic client libraries" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, - {file = "elastic_transport-8.4.0-py3-none-any.whl", hash = "sha256:19db271ab79c9f70f8c43f8f5b5111408781a6176b54ab2e54d713b6d9ceb815"}, -] - -[package.dependencies] -certifi = "*" -urllib3 = ">=1.26.2,<2" - -[package.extras] -develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "trustme"] - -[[package]] -name = "elasticsearch" -version = "8.8.0" -description = "Python client for Elasticsearch" -category = "main" -optional = false -python-versions = ">=3.6, <4" -files = [ - {file = "elasticsearch-8.8.0-py3-none-any.whl", hash = "sha256:2223ee9daaa3c80c25b28ec3f7c48e66fce6b767a338333d9a81886046a07df6"}, - {file = "elasticsearch-8.8.0.tar.gz", hash = "sha256:6878313cd598c7c90079fed1d4be72e198da35cba57f4083e6bee91f9c70b0eb"}, -] - -[package.dependencies] -elastic-transport = ">=8,<9" - -[package.extras] -async = ["aiohttp (>=3,<4)"] -requests = ["requests (>=2.4.0,<3.0.0)"] - [[package]] name = "environs" version = "9.5.0" @@ -2956,6 +2918,7 @@ optional = false python-versions = "*" files = [ {file = "pdf2image-1.16.3-py3-none-any.whl", hash = "sha256:b6154164af3677211c22cbb38b2bd778b43aca02758e962fe1e231f6d3b0e380"}, + {file = "pdf2image-1.16.3.tar.gz", hash = "sha256:74208810c2cef4d9e347769b8e62a52303982ddb4f2dfd744c7ab4b940ae287e"}, ] [package.dependencies] @@ -5692,4 +5655,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "ad3c5c34e9ea75e4cb4394930bb35c5afdff65fe03c5f51b8cbebbea37f62f1d" +content-hash = "732285314a1b756206bdba83a83ee9e97635117f5fd9a6fd8d2b92d8f51e6679" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index b2cf6d1701..3215758fb7 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -45,7 +45,7 @@ trec-car-tools = { path = "vendors/trec-car-tools/python3" } typer = "^0.4.2" wget = "^3.2" mirakuru = "^2.4.2" -duckdb = "0.8.0" +duckdb = "^0.8.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" From b1238f503aa69b0686c76cb1359f377e407746d5 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 12 Jun 2023 16:28:47 -0400 Subject: [PATCH 15/52] Adding happy path test --- libs/libcommon/src/libcommon/exceptions.py | 8 + .../src/libcommon/viewer_utils/index_utils.py | 7 +- .../src/worker/job_runners/config/parquet.py | 3 +- .../job_runners/config/parquet_and_info.py | 18 +- .../job_runners/config/parquet_metadata.py | 9 +- .../src/worker/job_runners/dataset/parquet.py | 3 +- .../worker/job_runners/split/duckdb_index.py | 76 ++++---- services/worker/src/worker/utils.py | 24 +++ services/worker/tests/conftest.py | 1 + services/worker/tests/fixtures/datasets.py | 14 ++ services/worker/tests/fixtures/hub.py | 17 ++ .../tests/job_runners/config/test_parquet.py | 6 +- .../config/test_parquet_metadata.py | 2 +- .../tests/job_runners/dataset/test_parquet.py | 2 +- .../job_runners/split/test_duckdb_index.py | 163 ++++++++++++++++++ 15 files changed, 271 insertions(+), 82 
deletions(-) create mode 100644 services/worker/tests/job_runners/split/test_duckdb_index.py diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index c9ac5d5058..cdf8a3d95c 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -101,6 +101,7 @@ def as_response(self) -> ErrorResponse: "MissingSpawningTokenError", "NoIndexableColumnsError", "NormalRowsError", + "NotAvailableIndexFileError", "ParameterMissingError", "ParquetResponseEmptyError", "PreviousStepFormatError", @@ -495,3 +496,10 @@ class UnsupportedIndexableColumnsError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedIndexableColumnsError", cause, True) + + +class NotAvailableIndexFileError(CacheableError): + """Raised when no duckdb index file was found for split.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NotAvailableIndexFileError", cause, False) diff --git a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py index d00b4754cc..5beb9f6a31 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py +++ b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -# Copyright 2022 The HuggingFace Authors. +# Copyright 2023 The HuggingFace Authors. from os import makedirs from pathlib import Path -from typing import Tuple from libcommon.storage import StrPath @@ -11,8 +10,8 @@ INDEX_DIR_MODE = 0o755 -def create_index_dir_split(dataset: str, config: str, split: str, index_directory: StrPath) -> Tuple[str, Path]: +def create_index_dir_split(dataset: str, config: str, split: str, index_directory: StrPath) -> Path: split_path = f"{dataset}/{DATASET_SEPARATOR}/{config}/{split}" dir_path = Path(index_directory).resolve() / split_path makedirs(dir_path, INDEX_DIR_MODE, exist_ok=True) - return split_path, dir_path + return dir_path diff --git a/services/worker/src/worker/job_runners/config/parquet.py b/services/worker/src/worker/job_runners/config/parquet.py index eae1d780c2..4fa17de2ed 100644 --- a/services/worker/src/worker/job_runners/config/parquet.py +++ b/services/worker/src/worker/job_runners/config/parquet.py @@ -9,8 +9,7 @@ from libcommon.simple_cache import get_previous_step_or_raise from worker.job_runners.config.config_job_runner import ConfigJobRunner -from worker.job_runners.config.parquet_and_info import ParquetFileItem -from worker.utils import CompleteJobResult +from worker.utils import CompleteJobResult, ParquetFileItem class ConfigParquetResponse(TypedDict): diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 91de381e19..318ff270da 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -8,7 +8,6 @@ from multiprocessing.pool import ThreadPool from pathlib import Path from typing import Any, Dict, List, Optional, Set, Tuple, TypedDict -from urllib.parse import quote import datasets import datasets.config @@ -72,16 +71,7 @@ from worker.config import AppConfig, ParquetAndInfoConfig from worker.job_runners.config.config_job_runner import ConfigCachedJobRunner -from worker.utils import CompleteJobResult 
- - -class ParquetFileItem(TypedDict): - dataset: str - config: str - split: str - url: str - filename: str - size: int +from worker.utils import CompleteJobResult, ParquetFileItem, hf_hub_url class ConfigParquetAndInfoResponse(TypedDict): @@ -106,12 +96,6 @@ def path_in_repo(self) -> str: return f'{self.config}/{self.local_file.removeprefix(f"{self.local_dir}/")}' -# TODO: use huggingface_hub's hf_hub_url after -# https://github.com/huggingface/huggingface_hub/issues/1082 -def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url_template: str) -> str: - return (hf_endpoint + url_template) % (repo_id, quote(revision, safe=""), filename) - - p = re.compile(r"(?P[\w-]+?)-(?P\w+(\.\w+)*?)(-[0-9]{5}-of-[0-9]{5})?.parquet") diff --git a/services/worker/src/worker/job_runners/config/parquet_metadata.py b/services/worker/src/worker/job_runners/config/parquet_metadata.py index e1cb6ef575..6c361168f9 100644 --- a/services/worker/src/worker/job_runners/config/parquet_metadata.py +++ b/services/worker/src/worker/job_runners/config/parquet_metadata.py @@ -5,7 +5,6 @@ from functools import partial from typing import List, Optional, TypedDict -from datasets.utils.file_utils import get_authentication_headers_for_url from fsspec.implementations.http import HTTPFileSystem from libcommon.constants import PROCESSING_STEP_CONFIG_PARQUET_METADATA_VERSION from libcommon.exceptions import ( @@ -23,8 +22,7 @@ from worker.config import AppConfig from worker.job_runners.config.config_job_runner import ConfigJobRunner -from worker.job_runners.config.parquet_and_info import ParquetFileItem -from worker.utils import CompleteJobResult +from worker.utils import CompleteJobResult, ParquetFileItem, get_parquet_file class ParquetFileMetadataItem(TypedDict): @@ -42,11 +40,6 @@ class ConfigParquetMetadataResponse(TypedDict): parquet_files_metadata: List[ParquetFileMetadataItem] -def get_parquet_file(url: str, fs: HTTPFileSystem, hf_token: Optional[str]) -> ParquetFile: - headers = get_authentication_headers_for_url(url, use_auth_token=hf_token) - return ParquetFile(fs.open(url, headers=headers)) - - def compute_parquet_metadata_response( dataset: str, config: str, hf_token: Optional[str], parquet_metadata_directory: StrPath ) -> ConfigParquetMetadataResponse: diff --git a/services/worker/src/worker/job_runners/dataset/parquet.py b/services/worker/src/worker/job_runners/dataset/parquet.py index 4a3609505d..7c6bb4a82c 100644 --- a/services/worker/src/worker/job_runners/dataset/parquet.py +++ b/services/worker/src/worker/job_runners/dataset/parquet.py @@ -14,9 +14,8 @@ ) from worker.job_runners.config.parquet import ConfigParquetResponse -from worker.job_runners.config.parquet_and_info import ParquetFileItem from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner -from worker.utils import JobResult, PreviousJob +from worker.utils import JobResult, ParquetFileItem, PreviousJob class DatasetParquetResponse(TypedDict): diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 69e662af0c..29fdacfdba 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -5,10 +5,10 @@ from functools import partial from pathlib import Path from typing import List, Optional, Set -from urllib.parse import quote import duckdb from datasets import Features +from fsspec.implementations.http import HTTPFileSystem from 
huggingface_hub._commit_api import ( CommitOperation, CommitOperationAdd, @@ -22,12 +22,12 @@ DatasetNotFoundError, FileSystemError, NoIndexableColumnsError, + NotAvailableIndexFileError, ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, UnsupportedIndexableColumnsError, ) -from libcommon.parquet_utils import get_hf_fs, get_hf_parquet_uris from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath, remove_dir @@ -38,7 +38,13 @@ from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitJobRunner -from worker.utils import CompleteJobResult, IndexRowsResponse +from worker.utils import ( + CompleteJobResult, + IndexRowsResponse, + ParquetFileItem, + get_parquet_file, + hf_hub_url, +) DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" @@ -53,10 +59,6 @@ # TODO: What if __id field already exist? -def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url_template: str) -> str: - return (hf_endpoint + url_template) % (repo_id, quote(revision, safe=""), filename) - - def create_index_item( repo_file: RepoFile, dataset: str, @@ -96,7 +98,7 @@ def compute_index_rows( hf_token: Optional[str], committer_hf_token: Optional[str], ) -> IndexRowsResponse: - logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") + logging.info(f"get split-duckdb-index for dataset={dataset} config={config} split={split}") # validate split split_names_best_response = get_previous_step_or_raise( @@ -112,27 +114,22 @@ def compute_index_rows( # get parquet content config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) - try: parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] - sources = sorted( - f"{config}/{parquet_file['filename']}" - for parquet_file in parquet_files_content - if parquet_file["split"] == split and parquet_file["config"] == config - ) - if not sources: + parquet_file_items: List[ParquetFileItem] = [ + parquet_file_item for parquet_file_item in parquet_files_content if parquet_file_item["config"] == config + ] + if not parquet_file_items: raise ParquetResponseEmptyError("No parquet files found.") except Exception as e: raise PreviousStepFormatError("Previous step did not return the expected content.") from e - logging.debug(f"Found {len(sources)} parquet files for {dataset=}, {config=}, {split=}: {sources}") - - fs = get_hf_fs(hf_token=hf_token) - source_uris = get_hf_parquet_uris(sources, dataset=dataset) - desc = f"{dataset}/{config}/{split}" + fs = HTTPFileSystem() + source_urls = [parquet_file_item["url"] for parquet_file_item in parquet_file_items] + desc = f"{dataset}/{config}" try: parquet_files: List[ParquetFile] = thread_map( - partial(ParquetFile, filesystem=fs), source_uris, desc=desc, unit="pq", disable=True + partial(get_parquet_file, fs=fs, hf_token=hf_token), source_urls, desc=desc, unit="pq", disable=True ) except Exception as e: raise FileSystemError(f"Could not read the parquet files: {e}") from e @@ -166,7 +163,7 @@ def compute_index_rows( raise PreviousStepFormatError("Previous step did not return the expected content.") from e # create duckdb index location - split_path, dir_path = create_index_dir_split( + dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) db_location = dir_path / DUCKDB_DEFAULT_INDEX_FILENAME @@ 
-189,36 +186,28 @@ def compute_index_rows( # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) - + index_file_location = f"{config}/{dataset}-{split}.db" try: refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) if all(ref.ref != target_revision for ref in refs.converts): initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id committer_hf_api.create_branch( - repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit + repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True ) except RepositoryNotFoundError as err: raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err - except Exception as e: - # TODO: improve error handling - logging.error(str(e)) + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} - previous_index = f"{config}/{split}-{DUCKDB_DEFAULT_INDEX_FILENAME}" delete_operations: List[CommitOperation] = [] - if previous_index in all_repo_files: - delete_operations.append(CommitOperationDelete(path_in_repo=previous_index)) - logging.debug(f"{delete_operations=}") + if index_file_location in all_repo_files: + delete_operations.append(CommitOperationDelete(path_in_repo=index_file_location)) # send the files to the target revision add_operations: List[CommitOperation] = [ - CommitOperationAdd( - path_in_repo=f"{config}/{split}-{DUCKDB_DEFAULT_INDEX_FILENAME}", path_or_fileobj=db_location - ) + CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) ] - logging.debug(f"{add_operations=}") - # TODO: Delete local index file committer_hf_api.create_commit( repo_id=dataset, repo_type=DATASET_TYPE, @@ -231,18 +220,19 @@ def compute_index_rows( # call the API again to get the list of parquet files target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) repo_files = [ - repo_file - for repo_file in target_dataset_info.siblings - if repo_file.rfilename.startswith(f"{config}/{split}") and repo_file.rfilename.endswith(".db") + repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location ] + + if not repo_files: + raise NotAvailableIndexFileError("No index file was found") + if len(repo_files) != 1: - # TODO: improve exception type - raise Exception("NO FILE WAS UPLOADED TO BRANCH") - index_file = repo_files[0] + logging.warning(f"Found {len(repo_files)} index files, should be only 1") remove_dir(dir_path) + # remove index file since it is no more used and is stored in NFS return create_index_item( - repo_file=index_file, + repo_file=repo_files[0], dataset=dataset, config=config, split=split, diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index b28c4193c2..e414e34d70 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -18,6 +18,7 @@ Union, cast, ) +from urllib.parse import quote from datasets import ( Dataset, @@ -27,8 +28,11 @@ IterableDataset, load_dataset, ) +from datasets.utils.file_utils import get_authentication_headers_for_url +from fsspec.implementations.http import HTTPFileSystem from libcommon.exceptions import 
NormalRowsError, StreamingRowsError from libcommon.utils import orjson_dumps +from pyarrow.parquet import ParquetFile class JobRunnerInfo(TypedDict): @@ -147,6 +151,15 @@ class RowsContent(TypedDict): all_fetched: bool +class ParquetFileItem(TypedDict): + dataset: str + config: str + split: str + url: str + filename: str + size: int + + # TODO: separate functions from common classes and named dicts otherwise this file will continue growing @@ -407,3 +420,14 @@ def get_rows_or_raise( "Cannot load the dataset split (in normal download mode) to extract the first rows.", cause=err, ) from err + + +# TODO: use huggingface_hub's hf_hub_url after +# https://github.com/huggingface/huggingface_hub/issues/1082 +def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url_template: str) -> str: + return (hf_endpoint + url_template) % (repo_id, quote(revision, safe=""), filename) + + +def get_parquet_file(url: str, fs: HTTPFileSystem, hf_token: Optional[str]) -> ParquetFile: + headers = get_authentication_headers_for_url(url, use_auth_token=hf_token) + return ParquetFile(fs.open(url, headers=headers)) diff --git a/services/worker/tests/conftest.py b/services/worker/tests/conftest.py index 6092d5babf..a3f9cbef54 100644 --- a/services/worker/tests/conftest.py +++ b/services/worker/tests/conftest.py @@ -70,6 +70,7 @@ def set_env_vars( mp.setenv("PARQUET_AND_INFO_MAX_DATASET_SIZE", "10_000") mp.setenv("PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES", "10") mp.setenv("PARQUET_AND_INFO_COMMITTER_HF_TOKEN", CI_PARQUET_CONVERTER_APP_TOKEN) + mp.setenv("DUCKDB_INDEX_COMMITTER_HF_TOKEN", CI_PARQUET_CONVERTER_APP_TOKEN) mp.setenv("DATASETS_BASED_HF_DATASETS_CACHE", str(datasets_cache_directory)) mp.setenv("HF_MODULES_CACHE", str(modules_cache_directory)) mp.setenv("WORKER_CONTENT_MAX_BYTES", "10_000_000") diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index 18edaad1f1..fe6413489d 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -143,4 +143,18 @@ def datasets() -> Mapping[str, Dataset]: dtype=pd.StringDtype(storage="python"), ) ), + "duckdb_index": Dataset.from_pandas( + pd.DataFrame( + { + "text": [ + "foo", + "bar", + "foobar", + "- Hello there !", + "- General Kenobi !", + ] + }, + dtype=pd.StringDtype(storage="python"), + ) + ), } diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index 110dba75d2..db01d0fb3a 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -274,6 +274,13 @@ def hub_public_spawning_opt_in_out(datasets: Mapping[str, Dataset]) -> Iterator[ delete_hub_dataset_repo(repo_id=repo_id) +@pytest.fixture(scope="session") +def hub_public_duckdb_index(datasets: Mapping[str, Dataset]) -> Iterator[str]: + repo_id = create_hub_dataset_repo(prefix="duckdb_index", dataset=datasets["duckdb_index"]) + yield repo_id + delete_hub_dataset_repo(repo_id=repo_id) + + class HubDatasetTest(TypedDict): name: str config_names_response: Any @@ -588,6 +595,7 @@ def hub_datasets( hub_public_big_csv: str, hub_public_external_files: str, hub_public_spawning_opt_in_out: str, + hub_public_duckdb_index: str, ) -> HubDatasets: return { "does_not_exist": { @@ -714,4 +722,13 @@ def hub_datasets( ), "parquet_and_info_response": None, }, + "duckdb_index": { + "name": hub_public_duckdb_index, + "config_names_response": create_config_names_response(hub_public_duckdb_index), + "splits_response": 
create_splits_response(hub_public_duckdb_index), + "first_rows_response": create_first_rows_response(hub_public_duckdb_index, TEXT_cols, TEXT_rows), + "parquet_and_info_response": create_parquet_and_info_response( + dataset=hub_public_duckdb_index, data_type="csv" + ), + }, } diff --git a/services/worker/tests/job_runners/config/test_parquet.py b/services/worker/tests/job_runners/config/test_parquet.py index 4b58e5f7a1..eeecda8bfd 100644 --- a/services/worker/tests/job_runners/config/test_parquet.py +++ b/services/worker/tests/job_runners/config/test_parquet.py @@ -16,10 +16,8 @@ ConfigParquetJobRunner, ConfigParquetResponse, ) -from worker.job_runners.config.parquet_and_info import ( - ConfigParquetAndInfoResponse, - ParquetFileItem, -) +from worker.job_runners.config.parquet_and_info import ConfigParquetAndInfoResponse +from worker.utils import ParquetFileItem @pytest.fixture(autouse=True) diff --git a/services/worker/tests/job_runners/config/test_parquet_metadata.py b/services/worker/tests/job_runners/config/test_parquet_metadata.py index fadd6e711b..dbc9eb62d9 100644 --- a/services/worker/tests/job_runners/config/test_parquet_metadata.py +++ b/services/worker/tests/job_runners/config/test_parquet_metadata.py @@ -20,12 +20,12 @@ from worker.config import AppConfig from worker.job_runners.config.parquet import ConfigParquetResponse -from worker.job_runners.config.parquet_and_info import ParquetFileItem from worker.job_runners.config.parquet_metadata import ( ConfigParquetMetadataJobRunner, ConfigParquetMetadataResponse, ParquetFileMetadataItem, ) +from worker.utils import ParquetFileItem @pytest.fixture(autouse=True) diff --git a/services/worker/tests/job_runners/dataset/test_parquet.py b/services/worker/tests/job_runners/dataset/test_parquet.py index c698848c32..ba377eb65a 100644 --- a/services/worker/tests/job_runners/dataset/test_parquet.py +++ b/services/worker/tests/job_runners/dataset/test_parquet.py @@ -13,11 +13,11 @@ from worker.config import AppConfig from worker.job_runners.config.parquet import ConfigParquetResponse -from worker.job_runners.config.parquet_and_info import ParquetFileItem from worker.job_runners.dataset.parquet import ( DatasetParquetJobRunner, DatasetParquetResponse, ) +from worker.utils import ParquetFileItem from ..utils import UpstreamResponse diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py new file mode 100644 index 0000000000..595e12d022 --- /dev/null +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. 
+ +from http import HTTPStatus +from typing import Callable + +import pytest +from libcommon.processing_graph import ProcessingGraph +from libcommon.resources import CacheMongoResource, QueueMongoResource +from libcommon.simple_cache import upsert_response +from libcommon.storage import StrPath +from libcommon.utils import Priority + +from worker.config import AppConfig +from worker.job_runners.config.parquet_and_info import ConfigParquetAndInfoJobRunner +from worker.job_runners.split.duckdb_index import SplitDuckDbIndexJobRunner +from worker.resources import LibrariesResource + +from ...fixtures.hub import HubDatasets + +GetJobRunner = Callable[[str, str, str, AppConfig], SplitDuckDbIndexJobRunner] + +GetParquetJobRunner = Callable[[str, str, AppConfig], ConfigParquetAndInfoJobRunner] + + +@pytest.fixture +def get_parquet_job_runner( + libraries_resource: LibrariesResource, + cache_mongo_resource: CacheMongoResource, + queue_mongo_resource: QueueMongoResource, +) -> GetParquetJobRunner: + def _get_job_runner( + dataset: str, + config: str, + app_config: AppConfig, + ) -> ConfigParquetAndInfoJobRunner: + processing_step_name = ConfigParquetAndInfoJobRunner.get_job_type() + processing_graph = ProcessingGraph( + { + "dataset-level": {"input_type": "dataset"}, + processing_step_name: { + "input_type": "dataset", + "job_runner_version": ConfigParquetAndInfoJobRunner.get_job_runner_version(), + "triggered_by": "dataset-level", + }, + } + ) + return ConfigParquetAndInfoJobRunner( + job_info={ + "type": ConfigParquetAndInfoJobRunner.get_job_type(), + "params": { + "dataset": dataset, + "revision": "revision", + "config": config, + "split": None, + }, + "job_id": "job_id", + "priority": Priority.NORMAL, + }, + app_config=app_config, + processing_step=processing_graph.get_processing_step(processing_step_name), + hf_datasets_cache=libraries_resource.hf_datasets_cache, + ) + + return _get_job_runner + + +@pytest.fixture +def get_job_runner( + duckdb_index_directory: StrPath, + cache_mongo_resource: CacheMongoResource, + queue_mongo_resource: QueueMongoResource, +) -> GetJobRunner: + def _get_job_runner( + dataset: str, + config: str, + split: str, + app_config: AppConfig, + ) -> SplitDuckDbIndexJobRunner: + processing_step_name = SplitDuckDbIndexJobRunner.get_job_type() + processing_graph = ProcessingGraph( + { + "dataset-step": {"input_type": "dataset"}, + "config-parquet": { + "input_type": "config", + "triggered_by": "dataset-step", + "provides_config_parquet": True, + }, + "config-split-names-from-streaming": { + "input_type": "config", + "triggered_by": "dataset-step", + }, + processing_step_name: { + "input_type": "dataset", + "job_runner_version": SplitDuckDbIndexJobRunner.get_job_runner_version(), + "triggered_by": ["config-parquet", "config-split-names-from-streaming"], + }, + } + ) + return SplitDuckDbIndexJobRunner( + job_info={ + "type": SplitDuckDbIndexJobRunner.get_job_type(), + "params": { + "dataset": dataset, + "revision": "revision", + "config": config, + "split": split, + }, + "job_id": "job_id", + "priority": Priority.NORMAL, + }, + app_config=app_config, + processing_step=processing_graph.get_processing_step(processing_step_name), + duckdb_index_directory=duckdb_index_directory, + ) + + return _get_job_runner + + +def test_compute( + get_parquet_job_runner: GetParquetJobRunner, + get_job_runner: GetJobRunner, + app_config: AppConfig, + hub_datasets: HubDatasets, +) -> None: + hub_duckdb_index = "duckdb_index" + dataset = hub_datasets[hub_duckdb_index]["name"] + config_names = 
hub_datasets[hub_duckdb_index]["config_names_response"] + config = hub_datasets[hub_duckdb_index]["config_names_response"]["config_names"][0]["config"] + splits_response = hub_datasets[hub_duckdb_index]["splits_response"] + split = "train" + + upsert_response( + "dataset-config-names", + dataset=dataset, + http_status=HTTPStatus.OK, + content=config_names, + ) + + upsert_response( + "config-split-names-from-streaming", + dataset=dataset, + config=config, + http_status=HTTPStatus.OK, + content=splits_response, + ) + + parquet_job_runner = get_parquet_job_runner(dataset, config, app_config) + parquet_response = parquet_job_runner.compute() + config_parquet = parquet_response.content + + upsert_response( + "config-parquet", + dataset=dataset, + config=config, + http_status=HTTPStatus.OK, + content=config_parquet, + ) + + assert parquet_response + job_runner = get_job_runner(dataset, config, split, app_config) + response = job_runner.compute() + assert response From fd298befa4b9d371a59bbd65ac167d9e26bc7879 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 12 Jun 2023 17:25:55 -0400 Subject: [PATCH 16/52] Adding other test scenarios --- .../worker/job_runners/split/duckdb_index.py | 20 ++-- services/worker/tests/fixtures/datasets.py | 7 ++ services/worker/tests/fixtures/hub.py | 37 +++++++ .../job_runners/split/test_duckdb_index.py | 102 ++++++++++-------- 4 files changed, 111 insertions(+), 55 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 29fdacfdba..2897b6449f 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -117,7 +117,9 @@ def compute_index_rows( try: parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] parquet_file_items: List[ParquetFileItem] = [ - parquet_file_item for parquet_file_item in parquet_files_content if parquet_file_item["config"] == config + parquet_file_item + for parquet_file_item in parquet_files_content + if parquet_file_item["config"] == config and parquet_file_item["split"] == split ] if not parquet_file_items: raise ParquetResponseEmptyError("No parquet files found.") @@ -125,11 +127,11 @@ def compute_index_rows( raise PreviousStepFormatError("Previous step did not return the expected content.") from e fs = HTTPFileSystem() - source_urls = [parquet_file_item["url"] for parquet_file_item in parquet_file_items] + parquet_urls = [parquet_file_item["url"] for parquet_file_item in parquet_file_items] desc = f"{dataset}/{config}" try: parquet_files: List[ParquetFile] = thread_map( - partial(get_parquet_file, fs=fs, hf_token=hf_token), source_urls, desc=desc, unit="pq", disable=True + partial(get_parquet_file, fs=fs, hf_token=hf_token), parquet_urls, desc=desc, unit="pq", disable=True ) except Exception as e: raise FileSystemError(f"Could not read the parquet files: {e}") from e @@ -143,7 +145,7 @@ def compute_index_rows( if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") - # look for image, audio and binary columns, if present, raise exeception do not supported yet and index everything + # look for image, audio and binary columns, if present, raise exeception (not supported yet) if any( feature for feature in features.values() @@ -154,14 +156,6 @@ def compute_index_rows( ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") - try: - parquet_urls = 
[content["url"] for content in parquet_files_content if content["split"] == split] - - if not parquet_urls: - raise ParquetResponseEmptyError("No parquet files found.") - except Exception as e: - raise PreviousStepFormatError("Previous step did not return the expected content.") from e - # create duckdb index location dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory @@ -174,7 +168,7 @@ def compute_index_rows( duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="fts")) duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) - # index + # index all columns con = duckdb.connect(str(db_location)) con.sql(CREATE_SEQUENCE_COMMAND) con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index fe6413489d..357b91e0ed 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -157,4 +157,11 @@ def datasets() -> Mapping[str, Dataset]: dtype=pd.StringDtype(storage="python"), ) ), + "text_image": other( + { + "col": str(Path(__file__).resolve().parent / "data" / "test_image_rgb.jpg"), + "text": "This is a text", + }, + {"col": Image(), "text": Value(dtype="string")}, + ), } diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index db01d0fb3a..4ab34e7277 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -281,6 +281,13 @@ def hub_public_duckdb_index(datasets: Mapping[str, Dataset]) -> Iterator[str]: delete_hub_dataset_repo(repo_id=repo_id) +@pytest.fixture(scope="session") +def hub_public_text_image(datasets: Mapping[str, Dataset]) -> Iterator[str]: + repo_id = create_hub_dataset_repo(prefix="text_image", dataset=datasets["text_image"]) + yield repo_id + delete_hub_dataset_repo(repo_id=repo_id) + + class HubDatasetTest(TypedDict): name: str config_names_response: Any @@ -514,6 +521,26 @@ def get_IMAGE_rows(dataset: str) -> Any: ] +TEXT_IMAGE_cols = { + "col": {"_type": "Image"}, + "text": {"_type": "Value", "dtype": "string"}, +} + + +def get_TEXT_IMAGE_rows(dataset: str) -> Any: + dataset, config, split = get_default_config_split(dataset) + return [ + { + "col": { + "src": f"http://localhost/assets/{dataset}/--/{config}/{split}/0/col/image.jpg", + "height": 480, + "width": 640, + }, + "text": "This is a text", + } + ] + + IMAGES_LIST_cols = { "col": [{"_type": "Image"}], } @@ -596,6 +623,7 @@ def hub_datasets( hub_public_external_files: str, hub_public_spawning_opt_in_out: str, hub_public_duckdb_index: str, + hub_public_text_image: str, ) -> HubDatasets: return { "does_not_exist": { @@ -731,4 +759,13 @@ def hub_datasets( dataset=hub_public_duckdb_index, data_type="csv" ), }, + "text_image": { + "name": hub_public_text_image, + "config_names_response": create_config_names_response(hub_public_text_image), + "splits_response": create_splits_response(hub_public_text_image), + "first_rows_response": create_first_rows_response( + hub_public_text_image, TEXT_IMAGE_cols, get_TEXT_IMAGE_rows(hub_public_text_image) + ), + "parquet_and_info_response": None, + }, } diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 595e12d022..ab89bc4b11 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -24,110 +24,119 @@ 
@pytest.fixture -def get_parquet_job_runner( - libraries_resource: LibrariesResource, +def get_job_runner( + duckdb_index_directory: StrPath, cache_mongo_resource: CacheMongoResource, queue_mongo_resource: QueueMongoResource, -) -> GetParquetJobRunner: +) -> GetJobRunner: def _get_job_runner( dataset: str, config: str, + split: str, app_config: AppConfig, - ) -> ConfigParquetAndInfoJobRunner: - processing_step_name = ConfigParquetAndInfoJobRunner.get_job_type() + ) -> SplitDuckDbIndexJobRunner: + processing_step_name = SplitDuckDbIndexJobRunner.get_job_type() processing_graph = ProcessingGraph( { - "dataset-level": {"input_type": "dataset"}, + "dataset-step": {"input_type": "dataset"}, + "config-parquet": { + "input_type": "config", + "triggered_by": "dataset-step", + "provides_config_parquet": True, + }, + "config-split-names-from-streaming": { + "input_type": "config", + "triggered_by": "dataset-step", + }, processing_step_name: { "input_type": "dataset", - "job_runner_version": ConfigParquetAndInfoJobRunner.get_job_runner_version(), - "triggered_by": "dataset-level", + "job_runner_version": SplitDuckDbIndexJobRunner.get_job_runner_version(), + "triggered_by": ["config-parquet", "config-split-names-from-streaming"], }, } ) - return ConfigParquetAndInfoJobRunner( + return SplitDuckDbIndexJobRunner( job_info={ - "type": ConfigParquetAndInfoJobRunner.get_job_type(), + "type": SplitDuckDbIndexJobRunner.get_job_type(), "params": { "dataset": dataset, "revision": "revision", "config": config, - "split": None, + "split": split, }, "job_id": "job_id", "priority": Priority.NORMAL, }, app_config=app_config, processing_step=processing_graph.get_processing_step(processing_step_name), - hf_datasets_cache=libraries_resource.hf_datasets_cache, + duckdb_index_directory=duckdb_index_directory, ) return _get_job_runner @pytest.fixture -def get_job_runner( - duckdb_index_directory: StrPath, +def get_parquet_job_runner( + libraries_resource: LibrariesResource, cache_mongo_resource: CacheMongoResource, queue_mongo_resource: QueueMongoResource, -) -> GetJobRunner: +) -> GetParquetJobRunner: def _get_job_runner( dataset: str, config: str, - split: str, app_config: AppConfig, - ) -> SplitDuckDbIndexJobRunner: - processing_step_name = SplitDuckDbIndexJobRunner.get_job_type() + ) -> ConfigParquetAndInfoJobRunner: + processing_step_name = ConfigParquetAndInfoJobRunner.get_job_type() processing_graph = ProcessingGraph( { - "dataset-step": {"input_type": "dataset"}, - "config-parquet": { - "input_type": "config", - "triggered_by": "dataset-step", - "provides_config_parquet": True, - }, - "config-split-names-from-streaming": { - "input_type": "config", - "triggered_by": "dataset-step", - }, + "dataset-level": {"input_type": "dataset"}, processing_step_name: { - "input_type": "dataset", - "job_runner_version": SplitDuckDbIndexJobRunner.get_job_runner_version(), - "triggered_by": ["config-parquet", "config-split-names-from-streaming"], + "input_type": "config", + "job_runner_version": ConfigParquetAndInfoJobRunner.get_job_runner_version(), + "triggered_by": "dataset-level", }, } ) - return SplitDuckDbIndexJobRunner( + return ConfigParquetAndInfoJobRunner( job_info={ - "type": SplitDuckDbIndexJobRunner.get_job_type(), + "type": ConfigParquetAndInfoJobRunner.get_job_type(), "params": { "dataset": dataset, "revision": "revision", "config": config, - "split": split, + "split": None, }, "job_id": "job_id", "priority": Priority.NORMAL, }, app_config=app_config, 
processing_step=processing_graph.get_processing_step(processing_step_name), - duckdb_index_directory=duckdb_index_directory, + hf_datasets_cache=libraries_resource.hf_datasets_cache, ) return _get_job_runner +@pytest.mark.parametrize( + "hub_dataset_name,expected_error_code", + [ + ("duckdb_index", None), + ("text_image", "UnsupportedIndexableColumnsError"), + ("public", "NoIndexableColumnsError"), + ], +) def test_compute( get_parquet_job_runner: GetParquetJobRunner, get_job_runner: GetJobRunner, app_config: AppConfig, hub_datasets: HubDatasets, + hub_dataset_name: str, + expected_error_code: str, ) -> None: - hub_duckdb_index = "duckdb_index" - dataset = hub_datasets[hub_duckdb_index]["name"] - config_names = hub_datasets[hub_duckdb_index]["config_names_response"] - config = hub_datasets[hub_duckdb_index]["config_names_response"]["config_names"][0]["config"] - splits_response = hub_datasets[hub_duckdb_index]["splits_response"] + dataset = hub_datasets[hub_dataset_name]["name"] + config_names = hub_datasets[hub_dataset_name]["config_names_response"] + config = hub_datasets[hub_dataset_name]["config_names_response"]["config_names"][0]["config"] + splits_response = hub_datasets[hub_dataset_name]["splits_response"] split = "train" upsert_response( @@ -159,5 +168,14 @@ def test_compute( assert parquet_response job_runner = get_job_runner(dataset, config, split, app_config) - response = job_runner.compute() - assert response + + if expected_error_code: + with pytest.raises(Exception) as e: + job_runner.compute() + assert e.typename == expected_error_code + else: + response = job_runner.compute() + assert response + content = response.content + assert content["url"] is not None + assert content["filename"] is not None From 2afe9f3ee075d8f36243354a8f76d813ddc8d208 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 12 Jun 2023 17:38:58 -0400 Subject: [PATCH 17/52] Adding chart configuration --- chart/templates/_envWorker.tpl | 18 +++++++++++++++++- chart/values.yaml | 7 +++++++ .../worker/job_runners/split/duckdb_index.py | 2 +- tools/docker-compose-datasets-server.yml | 4 ++++ tools/docker-compose-dev-datasets-server.yml | 4 ++++ 5 files changed, 33 insertions(+), 2 deletions(-) diff --git a/chart/templates/_envWorker.tpl b/chart/templates/_envWorker.tpl index ffbec5ae8c..4292090634 100644 --- a/chart/templates/_envWorker.tpl +++ b/chart/templates/_envWorker.tpl @@ -84,5 +84,21 @@ value: {{ .Values.optInOutUrlsScan.urlsNumberPerBatch | quote }} - name: OPT_IN_OUT_URLS_SCAN_SPAWNING_URL value: {{ .Values.optInOutUrlsScan.spawningUrl | quote }} - +# specific to 'split-duckdb-index' job runner +- name: DUCKDB_INDEX_COMMIT_MESSAGE + value: {{ .Values.duckDBIndex.commitMessage | quote }} +- name: DUCKDB_INDEX_COMMITTER_HF_TOKEN + {{- if .Values.secrets.appParquetConverterHfToken.fromSecret }} + valueFrom: + secretKeyRef: + name: {{ .Values.secrets.appParquetConverterHfToken.secretName | quote }} + key: HF_TOKEN + optional: false + {{- else }} + value: {{ .Values.secrets.appParquetConverterHfToken.value }} + {{- end }} +- name: DUCKDB_INDEX_TARGET_REVISION + value: {{ .Values.duckDBIndex.targetRevision | quote }} +- name: DUCKDB_INDEX_URL_TEMPLATE + value: {{ .Values.duckDBIndex.urlTemplate | quote }} {{- end -}} diff --git a/chart/values.yaml b/chart/values.yaml index 2d9754ed57..f45ecbdf87 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -208,6 +208,13 @@ parquetMetadata: duckDBIndex: # Directory on the shared storage (duckdb db files used for datasets indexing) storageDirectory: 
"/duckdb-index" + # the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`. + commitMessage: "Update duckdb index files" + # the git revision of the dataset where to store the duckdb index file. Defaults to `duckdb/index`. + targetRevision: "duckdb/index" + # the URL template to build the duckdb index file URL. Defaults to `/datasets/%s/resolve/%s/%s`. + urlTemplate: "/datasets/%s/resolve/%s/%s" + # Directory where the cache data will be stored cacheDirectory: "/datasets-server-cache" diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 2897b6449f..becfec0b94 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -54,9 +54,9 @@ CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" +# TODO: What if __id field already exist? INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" -# TODO: What if __id field already exist? def create_index_item( diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 953a1ca1db..5e85f4de62 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -113,6 +113,10 @@ services: PARQUET_AND_INFO_URL_TEMPLATE: ${PARQUET_AND_INFO_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} + DUCKDB_INDEX_COMMIT_MESSAGE: ${DUCKDB_INDEX_COMMIT_MESSAGE-Update duckdb index file} + DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} + DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} + DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index 9b0ce15a25..59e2b90195 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -117,6 +117,10 @@ services: PARQUET_AND_INFO_URL_TEMPLATE: ${PARQUET_AND_INFO_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} + DUCKDB_INDEX_COMMIT_MESSAGE: ${DUCKDB_INDEX_COMMIT_MESSAGE-Update duckdb index files} + DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} + DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} + DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} From 0bfcb62f2507e70da5f045556f14acfecf865dee Mon Sep 17 00:00:00 2001 
From: Andrea Francis Soria Jimenez Date: Tue, 13 Jun 2023 07:58:55 -0400 Subject: [PATCH 18/52] Apply suggestions from code review Co-authored-by: Sylvain Lesage --- services/worker/src/worker/job_runners/split/duckdb_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index becfec0b94..74ab919a14 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -145,7 +145,7 @@ def compute_index_rows( if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") - # look for image, audio and binary columns, if present, raise exeception (not supported yet) + # look for image, audio and binary columns, if present, raise exception (not supported yet) if any( feature for feature in features.values() @@ -174,7 +174,7 @@ def compute_index_rows( con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future - # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter + # see https://duckdb.org/docs/extensions/full_text_search.html for more details about 'stemmer' parameter con.sql(CREATE_INDEX_COMMAND) # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) From 2ff4f916d01b2b17317fd86d92ecf90e5037e8cb Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 07:59:13 -0400 Subject: [PATCH 19/52] Change ParquetFileItem to SplitHubFile --- chart/static-files/openapi.json | 4 +- libs/libcommon/src/libcommon/parquet_utils.py | 12 +--- libs/libcommon/src/libcommon/utils.py | 9 +++ .../src/worker/job_runners/config/parquet.py | 5 +- .../job_runners/config/parquet_and_info.py | 8 +-- .../job_runners/config/parquet_metadata.py | 6 +- .../src/worker/job_runners/dataset/parquet.py | 7 ++- .../worker/job_runners/split/duckdb_index.py | 62 ++++++------------- services/worker/src/worker/utils.py | 18 ------ .../tests/job_runners/config/test_parquet.py | 11 ++-- .../config/test_parquet_metadata.py | 7 +-- .../tests/job_runners/dataset/test_parquet.py | 11 ++-- 12 files changed, 59 insertions(+), 101 deletions(-) diff --git a/chart/static-files/openapi.json b/chart/static-files/openapi.json index 11b6b1fedb..59a058d956 100644 --- a/chart/static-files/openapi.json +++ b/chart/static-files/openapi.json @@ -925,11 +925,11 @@ "properties": { "parquet_files": { "type": "array", - "items": { "$ref": "#/components/schemas/ParquetFileItem" } + "items": { "$ref": "#/components/schemas/SplitHubFile" } } } }, - "ParquetFileItem": { + "SplitHubFile": { "type": "object", "required": ["dataset", "config", "split", "url", "filename", "size"], "properties": { diff --git a/libs/libcommon/src/libcommon/parquet_utils.py b/libs/libcommon/src/libcommon/parquet_utils.py index 3a4988889a..2fc92032d1 100644 --- a/libs/libcommon/src/libcommon/parquet_utils.py +++ b/libs/libcommon/src/libcommon/parquet_utils.py @@ -21,6 +21,7 @@ from libcommon.processing_graph import ProcessingGraph from libcommon.prometheus import StepProfiler from libcommon.simple_cache import get_previous_step_or_raise +from libcommon.utils import SplitHubFile StrPath = Union[str, PathLike[str]] @@ -37,15 +38,6 @@ class FileSystemError(Exception): pass -class ParquetFileItem(TypedDict): - 
dataset: str - config: str - split: str - url: str - filename: str - size: int - - class ParquetFileMetadataItem(TypedDict): dataset: str config: str @@ -157,7 +149,7 @@ def query(self, offset: int, length: int) -> pa.Table: @staticmethod def from_parquet_file_items( - parquet_file_items: List[ParquetFileItem], + parquet_file_items: List[SplitHubFile], dataset: str, config: str, split: str, diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index 301a760956..b921ea787b 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -65,6 +65,15 @@ class JobResult(TypedDict): output: Optional[JobOutput] +class SplitHubFile(TypedDict): + dataset: str + config: str + split: str + url: str + filename: str + size: int + + # orjson is used to get rid of errors with datetime (see allenai/c4) def orjson_default(obj: Any) -> Any: if isinstance(obj, bytes): diff --git a/services/worker/src/worker/job_runners/config/parquet.py b/services/worker/src/worker/job_runners/config/parquet.py index 4fa17de2ed..2b50e2aeb9 100644 --- a/services/worker/src/worker/job_runners/config/parquet.py +++ b/services/worker/src/worker/job_runners/config/parquet.py @@ -7,13 +7,14 @@ from libcommon.constants import PROCESSING_STEP_CONFIG_PARQUET_VERSION from libcommon.exceptions import PreviousStepFormatError from libcommon.simple_cache import get_previous_step_or_raise +from libcommon.utils import SplitHubFile from worker.job_runners.config.config_job_runner import ConfigJobRunner -from worker.utils import CompleteJobResult, ParquetFileItem +from worker.utils import CompleteJobResult class ConfigParquetResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] def compute_parquet_response(dataset: str, config: str) -> ConfigParquetResponse: diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index daf9ed701c..24ee3c9f3f 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -66,16 +66,16 @@ ) from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise -from libcommon.utils import JobInfo +from libcommon.utils import JobInfo, SplitHubFile from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig, ParquetAndInfoConfig from worker.job_runners.config.config_job_runner import ConfigCachedJobRunner -from worker.utils import CompleteJobResult, ParquetFileItem, hf_hub_url +from worker.utils import CompleteJobResult, hf_hub_url class ConfigParquetAndInfoResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] dataset_info: Dict[str, Any] @@ -118,7 +118,7 @@ def create_parquet_file_item( hf_endpoint: str, target_revision: str, url_template: str, -) -> ParquetFileItem: +) -> SplitHubFile: if repo_file.size is None: raise ValueError(f"Cannot get size of {repo_file.rfilename}") _, split = parse_repo_filename(repo_file.rfilename) diff --git a/services/worker/src/worker/job_runners/config/parquet_metadata.py b/services/worker/src/worker/job_runners/config/parquet_metadata.py index 6c361168f9..ba58cfad48 100644 --- a/services/worker/src/worker/job_runners/config/parquet_metadata.py +++ b/services/worker/src/worker/job_runners/config/parquet_metadata.py @@ -15,14 +15,14 @@ from libcommon.processing_graph import 
ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath -from libcommon.utils import JobInfo +from libcommon.utils import JobInfo, SplitHubFile from libcommon.viewer_utils.parquet_metadata import create_parquet_metadata_file from pyarrow.parquet import ParquetFile from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig from worker.job_runners.config.config_job_runner import ConfigJobRunner -from worker.utils import CompleteJobResult, ParquetFileItem, get_parquet_file +from worker.utils import CompleteJobResult, get_parquet_file class ParquetFileMetadataItem(TypedDict): @@ -74,7 +74,7 @@ def compute_parquet_metadata_response( config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) try: parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] - parquet_file_items: List[ParquetFileItem] = [ + parquet_file_items: List[SplitHubFile] = [ parquet_file_item for parquet_file_item in parquet_files_content if parquet_file_item["config"] == config ] if not parquet_file_items: diff --git a/services/worker/src/worker/job_runners/dataset/parquet.py b/services/worker/src/worker/job_runners/dataset/parquet.py index 7c6bb4a82c..4dc81bc5f5 100644 --- a/services/worker/src/worker/job_runners/dataset/parquet.py +++ b/services/worker/src/worker/job_runners/dataset/parquet.py @@ -12,14 +12,15 @@ get_previous_step_or_raise, get_response, ) +from libcommon.utils import SplitHubFile from worker.job_runners.config.parquet import ConfigParquetResponse from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner -from worker.utils import JobResult, ParquetFileItem, PreviousJob +from worker.utils import JobResult, PreviousJob class DatasetParquetResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] pending: list[PreviousJob] failed: list[PreviousJob] @@ -47,7 +48,7 @@ def compute_sizes_response(dataset: str) -> Tuple[DatasetParquetResponse, float] raise PreviousStepFormatError("Previous step did not return the expected content: 'config_names'.") try: - parquet_files: list[ParquetFileItem] = [] + parquet_files: list[SplitHubFile] = [] total = 0 pending = [] failed = [] diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 74ab919a14..3dff3b9936 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -14,7 +14,7 @@ CommitOperationAdd, CommitOperationDelete, ) -from huggingface_hub.hf_api import HfApi, RepoFile +from huggingface_hub.hf_api import HfApi from huggingface_hub.utils._errors import RepositoryNotFoundError from libcommon.config import DuckDbIndexConfig from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION @@ -31,20 +31,14 @@ from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath, remove_dir -from libcommon.utils import JobInfo +from libcommon.utils import JobInfo, SplitHubFile from libcommon.viewer_utils.index_utils import create_index_dir_split from pyarrow.parquet import ParquetFile from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitJobRunner -from worker.utils import ( - CompleteJobResult, - 
IndexRowsResponse, - ParquetFileItem, - get_parquet_file, - hf_hub_url, -) +from worker.utils import CompleteJobResult, get_parquet_file, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" @@ -59,33 +53,6 @@ LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" -def create_index_item( - repo_file: RepoFile, - dataset: str, - config: str, - split: str, - hf_endpoint: str, - target_revision: str, - url_template: str, -) -> IndexRowsResponse: - if repo_file.size is None: - raise ValueError(f"Cannot get size of {repo_file.rfilename}") - return { - "dataset": dataset, - "config": config, - "split": split, - "url": hf_hub_url( - repo_id=dataset, - filename=repo_file.rfilename, - hf_endpoint=hf_endpoint, - revision=target_revision, - url_template=url_template, - ), - "filename": Path(repo_file.rfilename).name, - "size": repo_file.size, - } - - def compute_index_rows( dataset: str, config: str, @@ -97,7 +64,7 @@ def compute_index_rows( url_template: str, hf_token: Optional[str], committer_hf_token: Optional[str], -) -> IndexRowsResponse: +) -> SplitHubFile: logging.info(f"get split-duckdb-index for dataset={dataset} config={config} split={split}") # validate split @@ -116,7 +83,7 @@ def compute_index_rows( config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) try: parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] - parquet_file_items: List[ParquetFileItem] = [ + parquet_file_items: List[SplitHubFile] = [ parquet_file_item for parquet_file_item in parquet_files_content if parquet_file_item["config"] == config and parquet_file_item["split"] == split @@ -225,14 +192,23 @@ def compute_index_rows( remove_dir(dir_path) # remove index file since it is no more used and is stored in NFS - return create_index_item( - repo_file=repo_files[0], + + repo_file = repo_files[0] + if repo_file.size is None: + raise ValueError(f"Cannot get size of {repo_file.rfilename}") + return SplitHubFile( dataset=dataset, config=config, split=split, - hf_endpoint=hf_endpoint, - target_revision=target_revision, - url_template=url_template, + url=hf_hub_url( + repo_id=dataset, + filename=repo_file.rfilename, + hf_endpoint=hf_endpoint, + revision=target_revision, + url_template=url_template, + ), + filename=Path(repo_file.rfilename).name, + size=repo_file.size, ) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index e414e34d70..58a27c25fb 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -134,15 +134,6 @@ class ImageUrlColumnsResponse(TypedDict): columns: List[str] -class IndexRowsResponse(TypedDict): - dataset: str - config: str - split: str - url: str - filename: str - size: int - - Row = Mapping[str, Any] @@ -151,15 +142,6 @@ class RowsContent(TypedDict): all_fetched: bool -class ParquetFileItem(TypedDict): - dataset: str - config: str - split: str - url: str - filename: str - size: int - - # TODO: separate functions from common classes and named dicts otherwise this file will continue growing diff --git a/services/worker/tests/job_runners/config/test_parquet.py b/services/worker/tests/job_runners/config/test_parquet.py index eeecda8bfd..677e49715a 100644 --- a/services/worker/tests/job_runners/config/test_parquet.py +++ b/services/worker/tests/job_runners/config/test_parquet.py @@ -9,7 +9,7 @@ from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource from 
libcommon.simple_cache import CachedArtifactError, upsert_response -from libcommon.utils import Priority +from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig from worker.job_runners.config.parquet import ( @@ -17,7 +17,6 @@ ConfigParquetResponse, ) from worker.job_runners.config.parquet_and_info import ConfigParquetAndInfoResponse -from worker.utils import ParquetFileItem @pytest.fixture(autouse=True) @@ -78,10 +77,10 @@ def _get_job_runner( HTTPStatus.OK, ConfigParquetAndInfoResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0 ), - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url2", filename="filename2", size=0 ), ], @@ -90,10 +89,10 @@ def _get_job_runner( None, ConfigParquetResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0 ), - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url2", filename="filename2", size=0 ), ] diff --git a/services/worker/tests/job_runners/config/test_parquet_metadata.py b/services/worker/tests/job_runners/config/test_parquet_metadata.py index dbc9eb62d9..875e1e13bb 100644 --- a/services/worker/tests/job_runners/config/test_parquet_metadata.py +++ b/services/worker/tests/job_runners/config/test_parquet_metadata.py @@ -16,7 +16,7 @@ from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import CachedArtifactError, upsert_response from libcommon.storage import StrPath -from libcommon.utils import Priority +from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig from worker.job_runners.config.parquet import ConfigParquetResponse @@ -25,7 +25,6 @@ ConfigParquetMetadataResponse, ParquetFileMetadataItem, ) -from worker.utils import ParquetFileItem @pytest.fixture(autouse=True) @@ -91,10 +90,10 @@ def _get_job_runner( HTTPStatus.OK, ConfigParquetResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0 ), - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url2", filename="filename2", size=0 ), ], diff --git a/services/worker/tests/job_runners/dataset/test_parquet.py b/services/worker/tests/job_runners/dataset/test_parquet.py index ba377eb65a..c5257d01a8 100644 --- a/services/worker/tests/job_runners/dataset/test_parquet.py +++ b/services/worker/tests/job_runners/dataset/test_parquet.py @@ -9,7 +9,7 @@ from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import CachedArtifactError, upsert_response -from libcommon.utils import Priority +from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig from worker.job_runners.config.parquet import ConfigParquetResponse @@ -17,7 +17,6 @@ DatasetParquetJobRunner, DatasetParquetResponse, ) -from worker.utils import ParquetFileItem from ..utils import UpstreamResponse @@ -93,7 +92,7 @@ def _get_job_runner( http_status=HTTPStatus.OK, content=ConfigParquetResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", @@ -111,7 +110,7 @@ def _get_job_runner( http_status=HTTPStatus.OK, content=ConfigParquetResponse( parquet_files=[ - ParquetFileItem( + 
SplitHubFile( dataset="ok", config="config_2", split="train", @@ -126,10 +125,10 @@ def _get_job_runner( None, DatasetParquetResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0 ), - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_2", split="train", url="url2", filename="filename2", size=0 ), ], From 3c9b4eef5f394e3f8dc0b60f7c740cfbdfb666b6 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 14:01:58 -0400 Subject: [PATCH 20/52] Inherit from SplitCachedJobRunner --- .../src/libcommon/viewer_utils/index_utils.py | 17 ------------- .../job_runners/_datasets_based_job_runner.py | 1 - .../worker/job_runners/split/duckdb_index.py | 24 ++++++------------- .../job_runners/split/test_duckdb_index.py | 2 ++ 4 files changed, 9 insertions(+), 35 deletions(-) delete mode 100644 libs/libcommon/src/libcommon/viewer_utils/index_utils.py diff --git a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py deleted file mode 100644 index 5beb9f6a31..0000000000 --- a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2023 The HuggingFace Authors. - -from os import makedirs -from pathlib import Path - -from libcommon.storage import StrPath - -DATASET_SEPARATOR = "--" -INDEX_DIR_MODE = 0o755 - - -def create_index_dir_split(dataset: str, config: str, split: str, index_directory: StrPath) -> Path: - split_path = f"{dataset}/{DATASET_SEPARATOR}/{config}/{split}" - dir_path = Path(index_directory).resolve() / split_path - makedirs(dir_path, INDEX_DIR_MODE, exist_ok=True) - return dir_path diff --git a/services/worker/src/worker/job_runners/_datasets_based_job_runner.py b/services/worker/src/worker/job_runners/_datasets_based_job_runner.py index 310b4bde64..703e4a6353 100644 --- a/services/worker/src/worker/job_runners/_datasets_based_job_runner.py +++ b/services/worker/src/worker/job_runners/_datasets_based_job_runner.py @@ -23,7 +23,6 @@ class DatasetsBasedJobRunner(JobRunner): datasets_based_config: DatasetsBasedConfig base_datasets_cache: Path - # the datasets library cache directories (for data, downloads, extraction, NOT for modules) # the job runner should have only one running job at the same time, then it should # be safe to use a global variable (and to set the datasets cache globally) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 3dff3b9936..9ad20ea238 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -30,14 +30,13 @@ ) from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise -from libcommon.storage import StrPath, remove_dir +from libcommon.storage import StrPath from libcommon.utils import JobInfo, SplitHubFile -from libcommon.viewer_utils.index_utils import create_index_dir_split from pyarrow.parquet import ParquetFile from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig -from worker.job_runners.split.split_job_runner import SplitJobRunner +from worker.job_runners.split.split_job_runner import SplitCachedJobRunner from worker.utils import CompleteJobResult, get_parquet_file, hf_hub_url DATASET_TYPE = "dataset" @@ -57,7 +56,7 @@ def compute_index_rows( 
dataset: str, config: str, split: str, - duckdb_index_directory: StrPath, + duckdb_index_file_directory: StrPath, target_revision: str, hf_endpoint: str, commit_message: str, @@ -123,12 +122,6 @@ def compute_index_rows( ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") - # create duckdb index location - dir_path = create_index_dir_split( - dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory - ) - db_location = dir_path / DUCKDB_DEFAULT_INDEX_FILENAME - # configure duckdb extensions duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="httpfs")) @@ -136,6 +129,7 @@ def compute_index_rows( duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) # index all columns + db_location = f"{duckdb_index_file_directory}/{DUCKDB_DEFAULT_INDEX_FILENAME}" con = duckdb.connect(str(db_location)) con.sql(CREATE_SEQUENCE_COMMAND) con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") @@ -190,9 +184,6 @@ def compute_index_rows( if len(repo_files) != 1: logging.warning(f"Found {len(repo_files)} index files, should be only 1") - remove_dir(dir_path) - # remove index file since it is no more used and is stored in NFS - repo_file = repo_files[0] if repo_file.size is None: raise ValueError(f"Cannot get size of {repo_file.rfilename}") @@ -212,9 +203,8 @@ def compute_index_rows( ) -class SplitDuckDbIndexJobRunner(SplitJobRunner): +class SplitDuckDbIndexJobRunner(SplitCachedJobRunner): duckdb_index_config: DuckDbIndexConfig - duckdb_index_directory: StrPath def __init__( self, @@ -227,8 +217,8 @@ def __init__( job_info=job_info, app_config=app_config, processing_step=processing_step, + hf_datasets_cache=Path(duckdb_index_directory).resolve(), ) - self.duckdb_index_directory = duckdb_index_directory self.duckdb_index_config = app_config.duckdb_index @staticmethod @@ -245,7 +235,7 @@ def compute(self) -> CompleteJobResult: dataset=self.dataset, config=self.config, split=self.split, - duckdb_index_directory=self.duckdb_index_directory, + duckdb_index_file_directory=self.datasets_cache, hf_token=self.app_config.common.hf_token, url_template=self.duckdb_index_config.url_template, commit_message=self.duckdb_index_config.commit_message, diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index ab89bc4b11..01d862140b 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -174,8 +174,10 @@ def test_compute( job_runner.compute() assert e.typename == expected_error_code else: + job_runner.pre_compute() response = job_runner.compute() assert response content = response.content assert content["url"] is not None assert content["filename"] is not None + job_runner.post_compute() From c78e99ae9e7b731130400795c1169e65bea72693 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 14:32:15 -0400 Subject: [PATCH 21/52] Fix style --- chart/templates/_envWorker.tpl | 1 + .../worker/src/worker/job_runners/_datasets_based_job_runner.py | 1 + services/worker/src/worker/job_runners/split/duckdb_index.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/chart/templates/_envWorker.tpl b/chart/templates/_envWorker.tpl index 4292090634..b3add1de06 100644 --- a/chart/templates/_envWorker.tpl +++ b/chart/templates/_envWorker.tpl @@ -84,6 +84,7 @@ value: {{ 
.Values.optInOutUrlsScan.urlsNumberPerBatch | quote }} - name: OPT_IN_OUT_URLS_SCAN_SPAWNING_URL value: {{ .Values.optInOutUrlsScan.spawningUrl | quote }} + # specific to 'split-duckdb-index' job runner - name: DUCKDB_INDEX_COMMIT_MESSAGE value: {{ .Values.duckDBIndex.commitMessage | quote }} diff --git a/services/worker/src/worker/job_runners/_datasets_based_job_runner.py b/services/worker/src/worker/job_runners/_datasets_based_job_runner.py index 703e4a6353..310b4bde64 100644 --- a/services/worker/src/worker/job_runners/_datasets_based_job_runner.py +++ b/services/worker/src/worker/job_runners/_datasets_based_job_runner.py @@ -23,6 +23,7 @@ class DatasetsBasedJobRunner(JobRunner): datasets_based_config: DatasetsBasedConfig base_datasets_cache: Path + # the datasets library cache directories (for data, downloads, extraction, NOT for modules) # the job runner should have only one running job at the same time, then it should # be safe to use a global variable (and to set the datasets cache globally) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 9ad20ea238..fb4c75b802 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -56,7 +56,7 @@ def compute_index_rows( dataset: str, config: str, split: str, - duckdb_index_file_directory: StrPath, + duckdb_index_file_directory: Optional[Path], target_revision: str, hf_endpoint: str, commit_message: str, From 6eba4d9f00a12e29b9a355c6d9ffaea8e13d0f36 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 16:01:29 -0400 Subject: [PATCH 22/52] Depends on info features instead of parquet schema --- libs/libcommon/src/libcommon/config.py | 2 +- libs/libcommon/src/libcommon/exceptions.py | 8 -- .../worker/job_runners/split/duckdb_index.py | 81 +++++++++---------- services/worker/tests/fixtures/datasets.py | 7 -- services/worker/tests/fixtures/hub.py | 17 ---- .../job_runners/split/test_duckdb_index.py | 5 +- 6 files changed, 39 insertions(+), 81 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 114286cddc..7f6ef16377 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -353,7 +353,7 @@ class ProcessingGraphConfig: "triggered_by": [ "config-split-names-from-info", "config-split-names-from-streaming", - "config-parquet", + "config-parquet-and-info", ], "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, }, diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index cdf8a3d95c..64e77be5a3 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -116,7 +116,6 @@ def as_response(self) -> ErrorResponse: "TooManyColumnsError", "UnexpectedError", "UnsupportedExternalFilesError", - "UnsupportedIndexableColumnsError", ] @@ -491,13 +490,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) -class UnsupportedIndexableColumnsError(CacheableError): - """Raised when some unsupported indexable columns present.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedIndexableColumnsError", cause, True) - - class NotAvailableIndexFileError(CacheableError):
"""Raised when no duckdb index file was found for split.""" diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index fb4c75b802..cc1c320998 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -2,13 +2,10 @@ # Copyright 2023 The HuggingFace Authors. import logging -from functools import partial from pathlib import Path from typing import List, Optional, Set import duckdb -from datasets import Features -from fsspec.implementations.http import HTTPFileSystem from huggingface_hub._commit_api import ( CommitOperation, CommitOperationAdd, @@ -20,30 +17,25 @@ from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( DatasetNotFoundError, - FileSystemError, NoIndexableColumnsError, NotAvailableIndexFileError, ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, - UnsupportedIndexableColumnsError, ) from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath from libcommon.utils import JobInfo, SplitHubFile -from pyarrow.parquet import ParquetFile -from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitCachedJobRunner -from worker.utils import CompleteJobResult, get_parquet_file, hf_hub_url +from worker.utils import CompleteJobResult, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" -UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" @@ -78,50 +70,48 @@ def compute_index_rows( if split not in [split_item["split"] for split_item in splits_content]: raise SplitNotFoundError(f"The split '{split}' does not exist for the config '{config}' of the dataset.") - # get parquet content - config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) - try: - parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] - parquet_file_items: List[SplitHubFile] = [ - parquet_file_item - for parquet_file_item in parquet_files_content - if parquet_file_item["config"] == config and parquet_file_item["split"] == split - ] - if not parquet_file_items: - raise ParquetResponseEmptyError("No parquet files found.") - except Exception as e: - raise PreviousStepFormatError("Previous step did not return the expected content.") from e + # get parquet urls and dataset_info + config_parquet_and_info_step = "config-parquet-and-info" + parquet_and_info_best_response = get_previous_step_or_raise( + kinds=[config_parquet_and_info_step], + dataset=dataset, + config=config, + ) + content_parquet_and_info = parquet_and_info_best_response.response["content"] + if "parquet_files" not in content_parquet_and_info: + raise PreviousStepFormatError( + f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'parquet_files'" + ) - fs = HTTPFileSystem() - parquet_urls = [parquet_file_item["url"] for parquet_file_item in parquet_file_items] - desc = 
f"{dataset}/{config}" - try: - parquet_files: List[ParquetFile] = thread_map( - partial(get_parquet_file, fs=fs, hf_token=hf_token), parquet_urls, desc=desc, unit="pq", disable=True + if "dataset_info" not in content_parquet_and_info: + raise PreviousStepFormatError( + f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'dataset_info'" ) - except Exception as e: - raise FileSystemError(f"Could not read the parquet files: {e}") from e + + parquet_urls = [ + parquet_file["url"] + for parquet_file in content_parquet_and_info["parquet_files"] + if parquet_file["config"] == config and parquet_file["split"] == split + ] + + if not parquet_urls: + raise ParquetResponseEmptyError("No parquet files found.") # get the features - features = Features.from_arrow_schema(parquet_files[0].schema.to_arrow_schema()) + features = content_parquet_and_info["dataset_info"].get("features", []) # look for string columns using the first rows - string_columns = [column for column, feature in features.items() if STRING_FEATURE_DTYPE in str(feature)] - + string_columns = [ + column + for column, feature in features.items() + if "dtype" in feature + and "_type" in feature + and feature["dtype"] == STRING_FEATURE_DTYPE + and feature["_type"] == VALUE_FEATURE_TYPE + ] if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") - # look for image, audio and binary columns, if present, raise exception (not supported yet) - if any( - feature - for feature in features.values() - if next( - (feature_type for feature_type in UNSUPPORTED_FEATURES_MAGIC_STRINGS if feature_type in str(feature)), None - ) - is not None - ): - raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") - # configure duckdb extensions duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="httpfs")) @@ -172,7 +162,7 @@ def compute_index_rows( parent_commit=target_dataset_info.sha, ) - # call the API again to get the list of parquet files + # call the API again to get the index file target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) repo_files = [ repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location @@ -187,6 +177,7 @@ def compute_index_rows( repo_file = repo_files[0] if repo_file.size is None: raise ValueError(f"Cannot get size of {repo_file.rfilename}") + return SplitHubFile( dataset=dataset, config=config, diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index 357b91e0ed..fe6413489d 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -157,11 +157,4 @@ def datasets() -> Mapping[str, Dataset]: dtype=pd.StringDtype(storage="python"), ) ), - "text_image": other( - { - "col": str(Path(__file__).resolve().parent / "data" / "test_image_rgb.jpg"), - "text": "This is a text", - }, - {"col": Image(), "text": Value(dtype="string")}, - ), } diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index 4ab34e7277..bcb7f03840 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -281,13 +281,6 @@ def hub_public_duckdb_index(datasets: Mapping[str, Dataset]) -> Iterator[str]: delete_hub_dataset_repo(repo_id=repo_id) -@pytest.fixture(scope="session") -def hub_public_text_image(datasets: Mapping[str, Dataset]) -> Iterator[str]: - 
repo_id = create_hub_dataset_repo(prefix="text_image", dataset=datasets["text_image"]) - yield repo_id - delete_hub_dataset_repo(repo_id=repo_id) - - class HubDatasetTest(TypedDict): name: str config_names_response: Any @@ -623,7 +616,6 @@ def hub_datasets( hub_public_external_files: str, hub_public_spawning_opt_in_out: str, hub_public_duckdb_index: str, - hub_public_text_image: str, ) -> HubDatasets: return { "does_not_exist": { @@ -759,13 +751,4 @@ def hub_datasets( dataset=hub_public_duckdb_index, data_type="csv" ), }, - "text_image": { - "name": hub_public_text_image, - "config_names_response": create_config_names_response(hub_public_text_image), - "splits_response": create_splits_response(hub_public_text_image), - "first_rows_response": create_first_rows_response( - hub_public_text_image, TEXT_IMAGE_cols, get_TEXT_IMAGE_rows(hub_public_text_image) - ), - "parquet_and_info_response": None, - }, } diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 01d862140b..b6053d7ea4 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -121,8 +121,7 @@ def _get_job_runner( "hub_dataset_name,expected_error_code", [ ("duckdb_index", None), - ("text_image", "UnsupportedIndexableColumnsError"), - ("public", "NoIndexableColumnsError"), + ("public", "NoIndexableColumnsError"), # dataset does not have string columns to index ], ) def test_compute( @@ -159,7 +158,7 @@ def test_compute( config_parquet = parquet_response.content upsert_response( - "config-parquet", + "config-parquet-and-info", dataset=dataset, config=config, http_status=HTTPStatus.OK, From 39e7ded0272ec2f7ef4bd4bc9f779d3d64a403b5 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 16:51:45 -0400 Subject: [PATCH 23/52] Fix libcommon test --- libs/libcommon/tests/test_processing_graph.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/libcommon/tests/test_processing_graph.py b/libs/libcommon/tests/test_processing_graph.py index 938a86b5c0..1d0933479a 100644 --- a/libs/libcommon/tests/test_processing_graph.py +++ b/libs/libcommon/tests/test_processing_graph.py @@ -83,6 +83,7 @@ def graph() -> ProcessingGraph: "config-parquet", "config-info", "config-size", + "split-duckdb-index", ], ["dataset-config-names"], ["dataset-config-names"], @@ -148,7 +149,7 @@ def graph() -> ProcessingGraph: ), ( "config-parquet", - ["config-parquet-metadata", "split-first-rows-from-parquet", "dataset-parquet", "split-duckdb-index"], + ["config-parquet-metadata", "split-first-rows-from-parquet", "dataset-parquet"], ["config-parquet-and-info"], ["dataset-config-names", "config-parquet-and-info"], ), @@ -296,13 +297,12 @@ def graph() -> ProcessingGraph: ( "split-duckdb-index", [], - ["config-parquet", "config-split-names-from-streaming", "config-split-names-from-info"], + ["config-parquet-and-info", "config-split-names-from-streaming", "config-split-names-from-info"], [ "config-split-names-from-streaming", "config-split-names-from-info", "config-parquet-and-info", "config-info", - "config-parquet", "dataset-config-names", ], ), From e94e1d4f33f637bd6f5cc6d4eccb1e719e54c3bd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 14 Jun 2023 08:46:56 -0400 Subject: [PATCH 24/52] Apply code review suggestions --- chart/env/prod.yaml | 2 ++ chart/templates/_envWorker.tpl | 2 ++ chart/values.yaml | 7 ++++--- libs/libcommon/src/libcommon/config.py | 7 
++++++- libs/libcommon/src/libcommon/exceptions.py | 8 +++++++ .../worker/job_runners/split/duckdb_index.py | 21 +++++++++++++++---- tools/docker-compose-datasets-server.yml | 1 + tools/docker-compose-dev-datasets-server.yml | 1 + 8 files changed, 41 insertions(+), 8 deletions(-) diff --git a/chart/env/prod.yaml b/chart/env/prod.yaml index 163a431de8..90f5b3ef48 100644 --- a/chart/env/prod.yaml +++ b/chart/env/prod.yaml @@ -97,6 +97,8 @@ optInOutUrlsScan: rowsMaxNumber: 100_000 urlsNumberPerBatch: 1000 +duckDBIndex: + maxParquetSizeBytes: "5_000_000_000" # --- jobs (pre-install/upgrade hooks) --- diff --git a/chart/templates/_envWorker.tpl b/chart/templates/_envWorker.tpl index b3add1de06..ba7d58eb7f 100644 --- a/chart/templates/_envWorker.tpl +++ b/chart/templates/_envWorker.tpl @@ -102,4 +102,6 @@ value: {{ .Values.duckDBIndex.targetRevision | quote }} - name: DUCKDB_INDEX_URL_TEMPLATE value: {{ .Values.duckDBIndex.urlTemplate | quote }} +- name: DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES + value: {{ .Values.duckDBIndex.maxParquetSizeBytes | quote }} {{- end -}} diff --git a/chart/values.yaml b/chart/values.yaml index f45ecbdf87..d23748aa32 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -210,11 +210,12 @@ duckDBIndex: storageDirectory: "/duckdb-index" # the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`. commitMessage: "Update duckdb index files" - # the git revision of the dataset where to store the duckdb index file. Defaults to `duckdb/index`. - targetRevision: "duckdb/index" + # the git revision of the dataset where to store the duckdb index file. Defaults to `refs/convert/parquet`. + targetRevision: "refs/convert/parquet" # the URL template to build the duckdb index file URL. Defaults to `/datasets/%s/resolve/%s/%s`. urlTemplate: "/datasets/%s/resolve/%s/%s" - + # the maximum size of the split parquets. 
+ maxParquetSizeBytes: "100_000_000" # Directory where the cache data will be stored cacheDirectory: "/datasets-server-cache" diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 7f6ef16377..141e70e6f4 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -108,7 +108,8 @@ def from_env(cls) -> "ParquetMetadataConfig": DUCKDB_INDEX_STORAGE_DIRECTORY = None DUCKDB_INDEX_COMMIT_MESSAGE = "Update duckdb index file" DUCKDB_INDEX_COMMITTER_HF_TOKEN = None -DUCKDB_INDEX_TARGET_REVISION = "duckdb/index" +DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES = 100_000_000 +DUCKDB_INDEX_TARGET_REVISION = "refs/convert/parquet" DUCKDB_INDEX_URL_TEMPLATE = "/datasets/%s/resolve/%s/%s" @@ -119,6 +120,7 @@ class DuckDbIndexConfig: committer_hf_token: Optional[str] = DUCKDB_INDEX_COMMITTER_HF_TOKEN target_revision: str = DUCKDB_INDEX_TARGET_REVISION url_template: str = DUCKDB_INDEX_URL_TEMPLATE + max_parquet_size_bytes: int = DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES @classmethod def from_env(cls) -> "DuckDbIndexConfig": @@ -130,6 +132,9 @@ def from_env(cls) -> "DuckDbIndexConfig": committer_hf_token=env.str(name="COMMITTER_HF_TOKEN", default=DUCKDB_INDEX_COMMITTER_HF_TOKEN), target_revision=env.str(name="TARGET_REVISION", default=DUCKDB_INDEX_TARGET_REVISION), url_template=env.str(name="URL_TEMPLATE", default=DUCKDB_INDEX_URL_TEMPLATE), + max_parquet_size_bytes=env.int( + name="MAX_PARQUET_SIZE_BYTES", default=DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES + ), ) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 64e77be5a3..e241d8f3a7 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -111,6 +111,7 @@ def as_response(self) -> ErrorResponse: "SplitsNamesError", "SplitNamesFromStreamingError", "SplitNotFoundError", + "SplitWithTooBigParquetError", "StreamingRowsError", "TooBigContentError", "TooManyColumnsError", @@ -495,3 +496,10 @@ class NotAvailableIndexFileError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NotAvailableIndexFileError", cause, False) + + +class SplitWithTooBigParquetError(CacheableError): + """Raised when the split parquet size (sum of parquet sizes given) is too big.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index cc1c320998..0ef42c2c65 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -22,6 +22,7 @@ ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, + SplitWithTooBigParquetError, ) from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise @@ -35,7 +36,7 @@ DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" -DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" +DUCKDB_DEFAULT_INDEX_FILENAME = "duckdb_index.db" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS 
__id, * FROM" @@ -54,6 +55,7 @@ def compute_index_rows( commit_message: str, url_template: str, hf_token: Optional[str], + max_parquet_size_bytes: int, committer_hf_token: Optional[str], ) -> SplitHubFile: logging.info(f"get split-duckdb-index for dataset={dataset} config={config} split={split}") @@ -88,12 +90,22 @@ def compute_index_rows( f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'dataset_info'" ) - parquet_urls = [ - parquet_file["url"] + split_parquet_files = [ + parquet_file for parquet_file in content_parquet_and_info["parquet_files"] if parquet_file["config"] == config and parquet_file["split"] == split ] + split_parquets_size = sum(parquet_file["size"] for parquet_file in split_parquet_files) + + if split_parquets_size > max_parquet_size_bytes: + raise SplitWithTooBigParquetError( + f"The indexing is limited to split parquets under {max_parquet_size_bytes} bytes. " + f"Current size of sum of split parquets is {split_parquets_size} bytes." + ) + + parquet_urls = [parquet_file["url"] for parquet_file in split_parquet_files] + if not parquet_urls: raise ParquetResponseEmptyError("No parquet files found.") @@ -131,7 +143,7 @@ def compute_index_rows( # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) - index_file_location = f"{config}/{dataset}-{split}.db" + index_file_location = f"{config}/{split}/{DUCKDB_DEFAULT_INDEX_FILENAME}" try: refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) if all(ref.ref != target_revision for ref in refs.converts): @@ -233,5 +245,6 @@ def compute(self) -> CompleteJobResult: committer_hf_token=self.duckdb_index_config.committer_hf_token, hf_endpoint=self.app_config.common.hf_endpoint, target_revision=self.duckdb_index_config.target_revision, + max_parquet_size_bytes=self.duckdb_index_config.max_parquet_size_bytes, ) ) diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 5e85f4de62..a5562832ca 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -117,6 +117,7 @@ services: DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} + DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES: ${DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES-100_000_000} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index 59e2b90195..4f1c5c393a 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -121,6 +121,7 @@ services: DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} + DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES: ${DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES-100_000_000} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here 
OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} From e28142fd575f09c59201b46c9fea2446f7451980 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 15 Jun 2023 12:32:01 -0400 Subject: [PATCH 25/52] Some details --- chart/env/prod.yaml | 3 --- services/worker/tests/fixtures/hub.py | 20 -------------------- 2 files changed, 23 deletions(-) diff --git a/chart/env/prod.yaml b/chart/env/prod.yaml index 90f5b3ef48..8b98ecbedc 100644 --- a/chart/env/prod.yaml +++ b/chart/env/prod.yaml @@ -97,9 +97,6 @@ optInOutUrlsScan: rowsMaxNumber: 100_000 urlsNumberPerBatch: 1000 -duckDBIndex: - maxParquetSizeBytes: "5_000_000_000" - # --- jobs (pre-install/upgrade hooks) --- mongodbMigration: diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index bcb7f03840..db01d0fb3a 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -514,26 +514,6 @@ def get_IMAGE_rows(dataset: str) -> Any: ] -TEXT_IMAGE_cols = { - "col": {"_type": "Image"}, - "text": {"_type": "Value", "dtype": "string"}, -} - - -def get_TEXT_IMAGE_rows(dataset: str) -> Any: - dataset, config, split = get_default_config_split(dataset) - return [ - { - "col": { - "src": f"http://localhost/assets/{dataset}/--/{config}/{split}/0/col/image.jpg", - "height": 480, - "width": 640, - }, - "text": "This is a text", - } - ] - - IMAGES_LIST_cols = { "col": [{"_type": "Image"}], } From a51d7d32cbb480311b314b6a414189be86dc04aa Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 15 Jun 2023 12:34:10 -0400 Subject: [PATCH 26/52] Fix style --- libs/libcommon/src/libcommon/exceptions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 8fd06b4c17..05ceda6a48 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -504,6 +504,8 @@ class SplitWithTooBigParquetError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) + + class DatasetWithTooManyConfigsError(CacheableError): """Raised when the number of configs of a dataset exceeded the limit.""" From edd120d66d922e98e51455c206aea6156aaf6894 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 16 Jun 2023 09:36:34 -0400 Subject: [PATCH 27/52] Fix test --- services/worker/tests/fixtures/hub.py | 13 +++++++++++++ .../tests/job_runners/split/test_duckdb_index.py | 6 ++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index f35d327ea0..2dd5d02f38 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -764,3 +764,16 @@ def hub_reponses_spawning_opt_in_out(hub_public_spawning_opt_in_out: str) -> Hub ), "parquet_and_info_response": None, } + + +@pytest.fixture +def hub_reponses_duckdb_index(hub_public_duckdb_index: str) -> HubDatasetTest: + return { + "name": hub_public_duckdb_index, + "config_names_response": create_config_names_response(hub_public_duckdb_index), + "splits_response": create_splits_response(hub_public_duckdb_index), + "first_rows_response": create_first_rows_response(hub_public_duckdb_index, TEXT_cols, TEXT_rows), + "parquet_and_info_response": create_parquet_and_info_response( + dataset=hub_public_duckdb_index, data_type="csv" + ), + } diff --git 
a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index b6053d7ea4..1c10f37aca 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -16,7 +16,7 @@ from worker.job_runners.split.duckdb_index import SplitDuckDbIndexJobRunner from worker.resources import LibrariesResource -from ...fixtures.hub import HubDatasets +from ...fixtures.hub import HubDatasetTest GetJobRunner = Callable[[str, str, str, AppConfig], SplitDuckDbIndexJobRunner] @@ -128,10 +128,12 @@ def test_compute( get_parquet_job_runner: GetParquetJobRunner, get_job_runner: GetJobRunner, app_config: AppConfig, - hub_datasets: HubDatasets, + hub_reponses_public: HubDatasetTest, + hub_reponses_duckdb_index: HubDatasetTest, hub_dataset_name: str, expected_error_code: str, ) -> None: + hub_datasets = {"public": hub_reponses_public, "duckdb_index": hub_reponses_duckdb_index} dataset = hub_datasets[hub_dataset_name]["name"] config_names = hub_datasets[hub_dataset_name]["config_names_response"] config = hub_datasets[hub_dataset_name]["config_names_response"]["config_names"][0]["config"] From 059c632a57d9d2bd111165967ad803303be51c94 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 16 Jun 2023 09:43:47 -0400 Subject: [PATCH 28/52] Apply code review suggestions --- .../job_runners/config/parquet_and_info.py | 2 +- .../worker/job_runners/split/duckdb_index.py | 25 +++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 8e3cfa4a67..84c78a879f 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -73,7 +73,7 @@ from worker.config import AppConfig, ParquetAndInfoConfig from worker.job_runners.config.config_job_runner import ConfigCachedJobRunner -from worker.utils import CompleteJobResult, hf_hub_url +from worker.utils import CompleteJobResult, hf_hub_url, retry class ConfigParquetAndInfoResponse(TypedDict): diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 0ef42c2c65..3e776673fe 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -17,6 +17,7 @@ from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( DatasetNotFoundError, + LockedDatasetTimeoutError, NoIndexableColumnsError, NotAvailableIndexFileError, ParquetResponseEmptyError, @@ -25,6 +26,7 @@ SplitWithTooBigParquetError, ) from libcommon.processing_graph import ProcessingStep +from libcommon.queue import lock from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath from libcommon.utils import JobInfo, SplitHubFile @@ -46,6 +48,7 @@ def compute_index_rows( + job_id: str, dataset: str, config: str, split: str, @@ -165,14 +168,19 @@ def compute_index_rows( CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) ] - committer_hf_api.create_commit( - repo_id=dataset, - repo_type=DATASET_TYPE, - revision=target_revision, - operations=delete_operations + add_operations, - commit_message=commit_message, - parent_commit=target_dataset_info.sha, - ) + try: + sleeps = [1, 
1, 1, 10, 10, 100, 100, 100, 300] + with lock.git_branch(dataset=dataset, branch=target_revision, job_id=job_id, sleeps=sleeps): + committer_hf_api.create_commit( + repo_id=dataset, + repo_type=DATASET_TYPE, + revision=target_revision, + operations=delete_operations + add_operations, + commit_message=commit_message, + parent_commit=target_dataset_info.sha, + ) + except TimeoutError as err: + raise LockedDatasetTimeoutError("the dataset is currently locked, please try again later.") from err # call the API again to get the index file target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) @@ -235,6 +243,7 @@ def get_job_runner_version() -> int: def compute(self) -> CompleteJobResult: return CompleteJobResult( compute_index_rows( + job_id=self.job_info["job_id"], dataset=self.dataset, config=self.config, split=self.split, From 9ecf9233c694ebfc3a9b5ee6c3a28646319469a4 Mon Sep 17 00:00:00 2001 From: Andrea Francis Soria Jimenez Date: Mon, 19 Jun 2023 18:48:37 -0400 Subject: [PATCH 29/52] Update chart/values.yaml Co-authored-by: Sylvain Lesage --- chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/values.yaml b/chart/values.yaml index 15160d4dc1..ccac40bfdf 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -210,7 +210,7 @@ parquetMetadata: storageDirectory: "/parquet-metadata" duckDBIndex: - # Directory on the shared storage (duckdb db files used for datasets indexing) + # Directory on the shared storage (used temporarily to prepare the duckdb indexes before sending to the Hub) storageDirectory: "/duckdb-index" # the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`. commitMessage: "Update duckdb index files" From 874fabd126cc6d3570b543095cb7272f9adc75ab Mon Sep 17 00:00:00 2001 From: Andrea Francis Soria Jimenez Date: Mon, 19 Jun 2023 18:53:49 -0400 Subject: [PATCH 30/52] Apply suggestions from code review Co-authored-by: Sylvain Lesage --- libs/libcommon/src/libcommon/config.py | 2 -- libs/libcommon/src/libcommon/exceptions.py | 2 +- services/worker/src/worker/job_runners/split/duckdb_index.py | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 141e70e6f4..231ec1f55f 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -357,8 +357,6 @@ class ProcessingGraphConfig: "input_type": "split", "triggered_by": [ "config-split-names-from-info", - "config-split-names-from-streaming", - "config-parquet-and-info", ], "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, }, diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 6c22581a22..57276b2c5e 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -500,7 +500,7 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) -class NotAvailableIndexFileError(CacheableError): +class DuckDBIndexFileNotFoundError(CacheableError): """Raised when no duckdb index file was found for split.""" def __init__(self, message: str, cause: Optional[BaseException] = None): diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 3e776673fe..c461c0324c 100644 
--- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -38,7 +38,7 @@ DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" -DUCKDB_DEFAULT_INDEX_FILENAME = "duckdb_index.db" +DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" @@ -115,7 +115,7 @@ def compute_index_rows( # get the features features = content_parquet_and_info["dataset_info"].get("features", []) - # look for string columns using the first rows + # look for string columns string_columns = [ column for column, feature in features.items() From c36202fd55d61b552ffd58329cfa7a2e3502ca98 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 19 Jun 2023 08:08:41 -0400 Subject: [PATCH 31/52] Apply code review suggestions --- chart/templates/_envDuckDbIndex.tpl | 7 ---- chart/templates/_envWorker.tpl | 2 + chart/templates/worker/_container.tpl | 1 - .../job_runners/config/parquet_and_info.py | 13 ++---- .../worker/job_runners/split/duckdb_index.py | 42 +++++++++---------- services/worker/src/worker/utils.py | 22 +++++++++- 6 files changed, 47 insertions(+), 40 deletions(-) delete mode 100644 chart/templates/_envDuckDbIndex.tpl diff --git a/chart/templates/_envDuckDbIndex.tpl b/chart/templates/_envDuckDbIndex.tpl deleted file mode 100644 index a0a12059bb..0000000000 --- a/chart/templates/_envDuckDbIndex.tpl +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2023 The HuggingFace Authors. - -{{- define "envDuckDBIndex" -}} -- name: DUCKDB_INDEX_STORAGE_DIRECTORY - value: {{ .Values.duckDBIndex.storageDirectory | quote }} -{{- end -}} diff --git a/chart/templates/_envWorker.tpl b/chart/templates/_envWorker.tpl index 7dbf16f1d0..0395558c18 100644 --- a/chart/templates/_envWorker.tpl +++ b/chart/templates/_envWorker.tpl @@ -106,4 +106,6 @@ value: {{ .Values.duckDBIndex.urlTemplate | quote }} - name: DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES value: {{ .Values.duckDBIndex.maxParquetSizeBytes | quote }} +- name: DUCKDB_INDEX_STORAGE_DIRECTORY + value: {{ .Values.duckDBIndex.storageDirectory | quote }} {{- end -}} diff --git a/chart/templates/worker/_container.tpl b/chart/templates/worker/_container.tpl index 1b8e2ddfe4..f9b86817a9 100644 --- a/chart/templates/worker/_container.tpl +++ b/chart/templates/worker/_container.tpl @@ -9,7 +9,6 @@ {{ include "envAssets" . | nindent 2 }} {{ include "envCache" . | nindent 2 }} {{ include "envParquetMetadata" . | nindent 2 }} - {{ include "envDuckDBIndex" . | nindent 2 }} {{ include "envQueue" . | nindent 2 }} {{ include "envCommon" . | nindent 2 }} {{ include "envLog" . 
| nindent 2 }} diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 84c78a879f..c8174138af 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -73,7 +73,7 @@ from worker.config import AppConfig, ParquetAndInfoConfig from worker.job_runners.config.config_job_runner import ConfigCachedJobRunner -from worker.utils import CompleteJobResult, hf_hub_url, retry +from worker.utils import CompleteJobResult, create_branch, hf_hub_url, retry class ConfigParquetAndInfoResponse(TypedDict): @@ -991,14 +991,9 @@ def compute_config_parquet_and_info_response( # create the target revision if we managed to get the parquet files and it does not exist yet # (clone from initial commit to avoid cloning all repo's files) - try: - if all(ref.ref != target_revision for ref in refs.converts): - initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id - committer_hf_api.create_branch( - repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True - ) - except RepositoryNotFoundError as err: - raise DatasetNotFoundError("The dataset does not exist on the Hub (was deleted during job).") from err + create_branch( + dataset=dataset, target_revision=target_revision, refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api + ) try: sleeps = [1, 1, 1, 10, 10, 100, 100, 100, 300] diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index c461c0324c..05aabfb0c5 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -33,7 +33,7 @@ from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitCachedJobRunner -from worker.utils import CompleteJobResult, hf_hub_url +from worker.utils import CompleteJobResult, create_branch, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" @@ -149,28 +149,27 @@ def compute_index_rows( index_file_location = f"{config}/{split}/{DUCKDB_DEFAULT_INDEX_FILENAME}" try: refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) - if all(ref.ref != target_revision for ref in refs.converts): - initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id - committer_hf_api.create_branch( - repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True - ) except RepositoryNotFoundError as err: raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err - target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) - all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} - delete_operations: List[CommitOperation] = [] - if index_file_location in all_repo_files: - delete_operations.append(CommitOperationDelete(path_in_repo=index_file_location)) - - # send the files to the target revision - add_operations: List[CommitOperation] = [ - CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) - ] + create_branch( + dataset=dataset, target_revision=target_revision, refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api + ) try: sleeps = [1, 1, 1, 10, 10, 100, 100, 100, 300] with 
lock.git_branch(dataset=dataset, branch=target_revision, job_id=job_id, sleeps=sleeps): + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) + all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} + delete_operations: List[CommitOperation] = [] + if index_file_location in all_repo_files: + delete_operations.append(CommitOperationDelete(path_in_repo=index_file_location)) + + # send the files to the target revision + add_operations: List[CommitOperation] = [ + CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) + ] + committer_hf_api.create_commit( repo_id=dataset, repo_type=DATASET_TYPE, @@ -179,20 +178,19 @@ def compute_index_rows( commit_message=commit_message, parent_commit=target_dataset_info.sha, ) + + # call the API again to get the index file + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) except TimeoutError as err: raise LockedDatasetTimeoutError("the dataset is currently locked, please try again later.") from err - # call the API again to get the index file - target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) repo_files = [ repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location ] - if not repo_files: - raise NotAvailableIndexFileError("No index file was found") - - if len(repo_files) != 1: + if not repo_files or len(repo_files) != 1: logging.warning(f"Found {len(repo_files)} index files, should be only 1") + raise NotAvailableIndexFileError("No index file was found") repo_file = repo_files[0] if repo_file.size is None: diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 46335ff5b2..7bb45547d4 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -33,7 +33,13 @@ ) from datasets.utils.file_utils import get_authentication_headers_for_url from fsspec.implementations.http import HTTPFileSystem -from libcommon.exceptions import NormalRowsError, StreamingRowsError +from huggingface_hub.hf_api import GitRefs, HfApi +from huggingface_hub.utils._errors import RepositoryNotFoundError +from libcommon.exceptions import ( + DatasetNotFoundError, + NormalRowsError, + StreamingRowsError, +) from libcommon.utils import orjson_dumps from pyarrow.parquet import ParquetFile @@ -421,3 +427,17 @@ def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url def get_parquet_file(url: str, fs: HTTPFileSystem, hf_token: Optional[str]) -> ParquetFile: headers = get_authentication_headers_for_url(url, use_auth_token=hf_token) return ParquetFile(fs.open(url, headers=headers)) + + +DATASET_TYPE = "dataset" + + +def create_branch(dataset: str, target_revision: str, refs: GitRefs, hf_api: HfApi, committer_hf_api: HfApi) -> None: + try: + if all(ref.ref != target_revision for ref in refs.converts): + initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id + committer_hf_api.create_branch( + repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True + ) + except RepositoryNotFoundError as err: + raise DatasetNotFoundError("The dataset does not exist on the Hub (was deleted during job).") from err From 9b82a66b2af08d0bb2024bef3b5b6cdf29b57a83 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 16 Jun 2023 09:10:04 
-0700 Subject: [PATCH 32/52] [docs] Improvements (#1376) * add end-to-end example * apply feedback --- docs/source/_toctree.yml | 2 + docs/source/analyze_data.mdx | 63 +++++++++++++++++++++ docs/source/quick_start.mdx | 103 +++++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+) create mode 100644 docs/source/analyze_data.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 0767f5ab65..b4dbd1298a 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -4,6 +4,8 @@ title: 🤗 Datasets server - local: quick_start title: Quickstart + - local: analyze_data + title: Analyze a dataset on the Hub - title: Guides sections: - local: valid diff --git a/docs/source/analyze_data.mdx b/docs/source/analyze_data.mdx new file mode 100644 index 0000000000..e6c1a67313 --- /dev/null +++ b/docs/source/analyze_data.mdx @@ -0,0 +1,63 @@ +# Analyze a dataset on the Hub + +[[open-in-colab]] + +In the Quickstart, you were introduced to various endpoints for interacting with datasets on the Hub. One of the most useful ones is the `/parquet` endpoint, which allows you to get a dataset stored on the Hub and analyze it. This is a great way to explore the dataset, and get a better understanding of it's contents. + +To demonstrate, this guide will show you an end-to-end example of how to retrieve a dataset from the Hub and do some basic data analysis with the Pandas library. + +## Get a dataset + +The [Hub](https://huggingface.co/datasets) is home to more than 40,000 datasets across a wide variety of tasks, sizes, and languages. For this example, you'll use the [`codeparrot/codecomplex`](https://huggingface.co/datasets/codeparrot/codecomplex) dataset, but feel free to explore and find another dataset that interests you! The dataset contains Java code from programming competitions, and the time complexity of the code is labeled by a group of algorithm experts. + +Let's say you're interested in the average length of the submitted code as it relates to the time complexity. Here's how you can get started. + +Use the `/parquet` endpoint to convert the dataset to a Parquet file and return the URL to it: + +```py +import requests +API_URL = "https://datasets-server.huggingface.co/parquet?dataset=codeparrot/codecomplex" +def query(): + response = requests.get(API_URL) + return response.json() +data = query() +print(data) +{'parquet_files': + [ + {'dataset': 'codeparrot/codecomplex', 'config': 'codeparrot--codecomplex', 'split': 'train', 'url': 'https://huggingface.co/datasets/codeparrot/codecomplex/resolve/refs%2Fconvert%2Fparquet/codeparrot--codecomplex/json-train.parquet', 'filename': 'json-train.parquet', 'size': 4115908} + ], + 'pending': [], 'failed': [] +} +``` + +## Read dataset with Pandas + +With the URL, you can read the Parquet file into a Pandas DataFrame: + +```py +import pandas as pd + +url = "https://huggingface.co/datasets/codeparrot/codecomplex/resolve/refs%2Fconvert%2Fparquet/codeparrot--codecomplex/json-train.parquet" +df = pd.read_parquet(url) +df.head(5) +``` + +| src | complexity | problem | from | +|--------------------------------------------------:|-----------:|--------------------------------:|-----------:| +| import java.io.*;\nimport java.math.BigInteger... | quadratic | 1179_B. Tolik and His Uncle | CODEFORCES | +| import java.util.Scanner;\n \npublic class pil... | linear | 1197_B. Pillars | CODEFORCES | +| import java.io.BufferedReader;\nimport java.io... | linear | 1059_C. 
Sequence Transformation | CODEFORCES | +| import java.util.*;\n\nimport java.io.*;\npubl... | linear | 1011_A. Stages | CODEFORCES | +| import java.io.OutputStream;\nimport java.io.I... | linear | 1190_C. Tokitsukaze and Duel | CODEFORCES | + +## Calculate mean code length by time complexity + +Pandas is a powerful library for data analysis; group the dataset by time complexity, apply a function to calculate the average length of the code snippet, and plot the results: + +```py +df.groupby('complexity')['src'].apply(lambda x: x.str.len().mean()).sort_values(ascending=False).plot.barh(color="orange") +``` + +
+ [figure: horizontal bar chart of mean code length by time complexity]
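For readers who want to reproduce the analysis described in this new guide, here is a minimal end-to-end sketch that strings the steps together. It assumes the `/parquet` endpoint and the `codeparrot/codecomplex` dataset used above, and it prints the aggregate instead of plotting so matplotlib is not required.

```py
# Minimal sketch: fetch the Parquet URL from the /parquet endpoint,
# load it with pandas, and compute mean code length per complexity class.
import requests
import pandas as pd

API_URL = "https://datasets-server.huggingface.co/parquet?dataset=codeparrot/codecomplex"

parquet_files = requests.get(API_URL).json()["parquet_files"]
train_url = next(f["url"] for f in parquet_files if f["split"] == "train")

df = pd.read_parquet(train_url)

mean_length = (
    df.groupby("complexity")["src"]
    .apply(lambda x: x.str.len().mean())
    .sort_values(ascending=False)
)
print(mean_length)
```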
\ No newline at end of file diff --git a/docs/source/quick_start.mdx b/docs/source/quick_start.mdx index d784bb2060..b798b8e8e4 100644 --- a/docs/source/quick_start.mdx +++ b/docs/source/quick_start.mdx @@ -1,5 +1,7 @@ # Quickstart +[[open-in-colab]] + In this quickstart, you'll learn how to use the Datasets Server's REST API to: - Check whether a dataset on the Hub is functional. @@ -87,6 +89,13 @@ curl https://datasets-server.huggingface.co/is-valid?dataset=rotten_tomatoes \ +You'll see the following error if you're trying to access a gated dataset without providing your user token: + +```py +print(data) +{'error': 'The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication.'} +``` + ## Check dataset validity The `/valid` endpoint returns a JSON list of datasets stored on the Hub that load without any errors: @@ -128,6 +137,22 @@ curl https://datasets-server.huggingface.co/valid \ +This returns a list of all the datasets that load without an error: + +```py +print(data) +{ + "valid": [ + "0n1xus/codexglue", + "0n1xus/pytorrent-standalone", + "0x7194633/rupile", + "51la5/keyword-extraction", + ..., + ..., + ] +} +``` + To check whether a specific dataset is valid, for example, [Rotten Tomatoes](https://huggingface.co/datasets/rotten_tomatoes), use the `/is-valid` endpoint instead: @@ -167,6 +192,13 @@ curl https://datasets-server.huggingface.co/is-valid?dataset=rotten_tomatoes \ +This returns whether the `valid` key is `true` or `false`: + +```py +print(data) +{'valid': True} +``` + ## List configurations and splits The `/splits` endpoint returns a JSON list of the splits in a dataset: @@ -208,6 +240,21 @@ curl https://datasets-server.huggingface.co/splits?dataset=rotten_tomatoes \ +This returns the available configuration and splits in the dataset: + +```py +print(data) +{'splits': + [ + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'train'}, + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'validation'}, + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'test'} + ], + 'pending': [], + 'failed': [] +} +``` + ## Preview a dataset The `/first-rows` endpoint returns a JSON list of the first 100 rows of a dataset. It also returns the types of data features ("columns" data types). You should specify the dataset name, configuration name (you can find out the configuration name from the `/splits` endpoint), and split name of the dataset you'd like to preview: @@ -249,6 +296,26 @@ curl https://datasets-server.huggingface.co/first-rows?dataset=rotten_tomatoes&c +This returns the first 100 rows of the dataset: + +```py +print(data) +{'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'train', + 'features': + [ + {'feature_idx': 0, 'name': 'text', 'type': {'dtype': 'string', '_type': 'Value'}}, + {'feature_idx': 1, 'name': 'label', 'type': {'names': ['neg', 'pos'], '_type': 'ClassLabel'}} + ], + 'rows': + [ + {'row_idx': 0, 'row': {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}, 'truncated_cells': []}, + {'row_idx': 1, 'row': {'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . 
tolkien\'s middle-earth .', 'label': 1}, 'truncated_cells': []} + ..., + ..., + ], +} +``` + ## Download slices of a dataset The `/rows` endpoint returns a JSON list of a slice of rows of a dataset at any given location (offset). @@ -294,6 +361,27 @@ curl https://datasets-server.huggingface.co/rows?dataset=rotten_tomatoes&config= You can download slices of 100 rows maximum at a time. +The response looks like: + +```py +print(data) +{'features': + [ + {'feature_idx': 0, 'name': 'text', 'type': {'dtype': 'string', '_type': 'Value'}}, + {'feature_idx': 1, 'name': 'label', 'type': {'names': ['neg', 'pos'], '_type': 'ClassLabel'}}], + 'rows': + [ + {'row_idx': 150, 'row': {'text': 'enormously likable , partly because it is aware of its own grasp of the absurd .', 'label': 1}, 'truncated_cells': []}, + {'row_idx': 151, 'row': {'text': "here's a british flick gleefully unconcerned with plausibility , yet just as determined to entertain you .", 'label': 1}, 'truncated_cells': []}, + {'row_idx': 152, 'row': {'text': "it's an old story , but a lively script , sharp acting and partially animated interludes make just a kiss seem minty fresh .", 'label': 1}, 'truncated_cells': []}, + {'row_idx': 153, 'row': {'text': 'must be seen to be believed .', 'label': 1}, 'truncated_cells': []}, + {'row_idx': 154, 'row': {'text': "ray liotta and jason patric do some of their best work in their underwritten roles , but don't be fooled : nobody deserves any prizes here .", 'label': 1}, 'truncated_cells': []}, + ..., + ..., + ] +} +``` + ## Access Parquet files Datasets Server converts every public dataset on the Hub to the [Parquet](https://parquet.apache.org/) format. The `/parquet` endpoint returns a JSON list of the Parquet URLs for a dataset: @@ -334,3 +422,18 @@ curl https://datasets-server.huggingface.co/parquet?dataset=rotten_tomatoes \ ``` + +This returns a URL to the Parquet file for each split: + +```py +print(data) +{'parquet_files': + [ + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'test', 'url': 'https://huggingface.co/datasets/rotten_tomatoes/resolve/refs%2Fconvert%2Fparquet/default/rotten_tomatoes-test.parquet', 'filename': 'rotten_tomatoes-test.parquet', 'size': 92206}, + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'train', 'url': 'https://huggingface.co/datasets/rotten_tomatoes/resolve/refs%2Fconvert%2Fparquet/default/rotten_tomatoes-train.parquet', 'filename': 'rotten_tomatoes-train.parquet', 'size': 698845}, + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'validation', 'url': 'https://huggingface.co/datasets/rotten_tomatoes/resolve/refs%2Fconvert%2Fparquet/default/rotten_tomatoes-validation.parquet', 'filename': 'rotten_tomatoes-validation.parquet', 'size': 90001} + ], + 'pending': [], + 'failed': [] +} +``` \ No newline at end of file From 33260144a83641c18c511f77274dde2f5073939a Mon Sep 17 00:00:00 2001 From: Bas Krahmer Date: Mon, 19 Jun 2023 10:51:24 +0200 Subject: [PATCH 33/52] Fix closing brackets and GH action link (#1389) --- DEVELOPER_GUIDE.md | 4 ++-- services/reverse-proxy/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index d446e6d012..e2101dc5d9 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -51,7 +51,7 @@ If you use VSCode, it might be useful to use the ["monorepo" workspace](./.vscod ## Architecture -The repository is structured as a monorepo, with Python libraries and applications in [jobs](./jobs)), [libs](./libs) and [services](./services): +The 
repository is structured as a monorepo, with Python libraries and applications in [jobs](./jobs), [libs](./libs) and [services](./services): - [jobs](./jobs) contains the one-time jobs run by Helm before deploying the pods. For now, the only job migrates the databases when needed. - [libs](./libs) contains the Python libraries used by the services and workers. For now, the only library is [libcommon](./libs/libcommon), which contains the common code for the services and workers. @@ -97,7 +97,7 @@ The following environments contain all the modules: reverse proxy, API server, a ## Quality -The CI checks the quality of the code through a [GitHub action](./.github/workflows/quality.yml). To manually format the code of a job, library, service or worker: +The CI checks the quality of the code through a [GitHub action](./.github/workflows/_quality-python.yml). To manually format the code of a job, library, service or worker: ```bash make style diff --git a/services/reverse-proxy/README.md b/services/reverse-proxy/README.md index 2801bf4441..d322fda21c 100644 --- a/services/reverse-proxy/README.md +++ b/services/reverse-proxy/README.md @@ -8,7 +8,7 @@ Note that the template configuration is located in [chart/nginx-templates/](../. The reverse proxy uses nginx: -- it serves the static assets directly (the API also serves them if required, but it's unnecessary to go through starlette for this, and it generates errors in Safari, see [1](https://github.com/encode/starlette/issues/950) and [2](https://developer.apple.com/library/archive/documentation/AppleApplications/Reference/SafariWebContent/CreatingVideoforSafarioniPhone/CreatingVideoforSafarioniPhone.html#//apple_ref/doc/uid/TP40006514-SW6) +- it serves the static assets directly (the API also serves them if required, but it's unnecessary to go through starlette for this, and it generates errors in Safari, see [1](https://github.com/encode/starlette/issues/950) and [2](https://developer.apple.com/library/archive/documentation/AppleApplications/Reference/SafariWebContent/CreatingVideoforSafarioniPhone/CreatingVideoforSafarioniPhone.html#//apple_ref/doc/uid/TP40006514-SW6)) - it serves the OpenAPI specification - it proxies the other requests to the API From 14107376fa21c33fbcc3ec3b9730af1a205850a7 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 19 Jun 2023 10:55:12 +0200 Subject: [PATCH 34/52] Fix typo in erro rmessage (#1391) --- services/worker/src/worker/job_runners/dataset/config_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/src/worker/job_runners/dataset/config_names.py b/services/worker/src/worker/job_runners/dataset/config_names.py index b1045c31a6..a3a803541d 100644 --- a/services/worker/src/worker/job_runners/dataset/config_names.py +++ b/services/worker/src/worker/job_runners/dataset/config_names.py @@ -74,7 +74,7 @@ def compute_config_names_response( number_of_configs = len(config_name_items) if number_of_configs > max_number: raise DatasetWithTooManyConfigsError( - f"The maximun number of configs allowed is {max_number}, dataset has {number_of_configs} configs." + f"The maximum number of configs allowed is {max_number}, dataset has {number_of_configs} configs." 
) return DatasetConfigNamesResponse(config_names=config_name_items) From 1d9574e4f712eae2f8c66c848652e0be83784384 Mon Sep 17 00:00:00 2001 From: Bas Krahmer Date: Mon, 19 Jun 2023 12:39:35 +0200 Subject: [PATCH 35/52] Add docker internal to extra_hosts (#1390) --- tools/docker-compose-dev-base.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/docker-compose-dev-base.yml b/tools/docker-compose-dev-base.yml index 90a1e9cb1a..b9ca8e9ed8 100644 --- a/tools/docker-compose-dev-base.yml +++ b/tools/docker-compose-dev-base.yml @@ -25,6 +25,8 @@ services: WORKER_MAX_LOAD_PCT: ${WORKER_MAX_LOAD_PCT-70} WORKER_MAX_MEMORY_PCT: ${WORKER_MAX_MEMORY_PCT-80} WORKER_SLEEP_SECONDS: ${WORKER_SLEEP_SECONDS-15} + extra_hosts: + - "host.docker.internal:host-gateway" # volumes to local source directory for development volumes: - ../libs/libcommon/src:/src/libs/libcommon/src From 7971b346f3496501752f2a6166d1c4451562c840 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 19 Jun 2023 13:21:39 +0200 Subject: [PATCH 36/52] =?UTF-8?q?fix:=20=F0=9F=90=9B=20support=20bigger=20?= =?UTF-8?q?images=20(#1387)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: 🐛 support bigger images fixes https://github.com/huggingface/datasets-server/issues/1361 * style: 💄 fix style * style: 💄 add types for Pillow --- services/worker/poetry.lock | 14 ++++++++++++-- services/worker/pyproject.toml | 5 +++-- services/worker/src/worker/utils.py | 5 +++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index c7a99d8ea4..cab3feb395 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -4749,8 +4749,6 @@ python-versions = ">=3.8" files = [ {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:db464c88e10e927725997f9b872a21c9d057789d3b7e9a26e4ef1af41d0bcc8c"}, {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:172277c33cb1ae0da19f98c5bcd4946149cfa73c8ea05c6ba18365d58dd3c6f2"}, - {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:9c9b14fbb73ec4cb0f209722a1489020fd8614c92ae22589f2309c48cefdf21f"}, - {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6a54539bd076746f69ae8bef7282f981674fe4dbf59c3a84c4af86ae6bae9d5c"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e3fa53e63672fd71998bbd71cc5478c74dbe5a2d9291d1801c575358c28403c2"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:5499312c21ed3ed47cc6b4cf861896e9564c2c32d8d3c2ef1437c5ca31adfc73"}, {file = "tensorflow_macos-2.12.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:84cb873c90be63efabfecca53fdc48b734a037d0750532b55cb7ce7c343b5cac"}, @@ -5067,6 +5065,18 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2 doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"] test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +[[package]] +name = "types-pillow" +version = "9.5.0.4" +description = "Typing stubs for Pillow" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "types-Pillow-9.5.0.4.tar.gz", hash = 
"sha256:f1b6af47abd151847ee25911ffeba784899bc7dc7f9eba8ca6a5aac522b012ef"}, + {file = "types_Pillow-9.5.0.4-py3-none-any.whl", hash = "sha256:69427d9fa4320ff6e30f00fb9c0dd71185dc0a16de4757774220104759483466"}, +] + [[package]] name = "types-psutil" version = "5.9.5.13" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 3215758fb7..f5ac325bc2 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -20,13 +20,14 @@ kss = "^2.6.0" libcommon = {path = "../../libs/libcommon", develop = true} lm-dataformat = "^0.0.20" lxml = "^4.9.2" +mirakuru = "^2.4.2" nlp = "^0.4.0" nltk = "^3.8.1" numpy = "~1.22.4" openpyxl = "^3.1.1" pdf2image = "^1.16.2" -pyarrow = "^11.0.0" py7zr = "^0.20.4" +pyarrow = "^11.0.0" pydub = "^0.25.1" pypdf2 = "^3.0.1" python = "3.9.15" @@ -44,7 +45,6 @@ transformers = "^4.30.0" trec-car-tools = { path = "vendors/trec-car-tools/python3" } typer = "^0.4.2" wget = "^3.2" -mirakuru = "^2.4.2" duckdb = "^0.8.0" [tool.poetry.group.dev.dependencies] @@ -58,6 +58,7 @@ pip-audit = "^2.5.4" pytest = "^7.2.1" pytest-asyncio = "^0.21.0" pytest-cov = "^2.12.1" +types-pillow = "^9.5.0.4" types-psutil = "^5.9.5" types-requests = "^2.28.11" diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 7bb45547d4..a989f27776 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -23,6 +23,7 @@ ) from urllib.parse import quote +import PIL from datasets import ( Dataset, DatasetInfo, @@ -43,6 +44,9 @@ from libcommon.utils import orjson_dumps from pyarrow.parquet import ParquetFile +MAX_IMAGE_PIXELS = 1_000_000_000 +# ^ see https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS + class JobRunnerInfo(TypedDict): job_type: str @@ -341,6 +345,7 @@ def get_rows( column_names: Optional[List[str]] = None, ) -> RowsContent: download_config = DownloadConfig(delete_extracted=True) + PIL.Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS ds = load_dataset( dataset, name=config, From 431163d0f0ec6cc84a1fcc7e2304b2cc41b8f3ea Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 19 Jun 2023 14:12:18 +0200 Subject: [PATCH 37/52] Rename dev to staging, and use staging mongodb cluster (#1383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: 🤖 remove makefile targets since we use ArgoCD now * feat: 🎸 align dev on prod, and use secret for mongo url * feat: 🎸 rename dev to staging * ci: 🎡 change dev to staging in ci --- .github/workflows/cd.yml | 6 ++--- .github/workflows/chart-pr.yml | 4 +-- chart/Makefile | 38 ---------------------------- chart/README.md | 19 +------------- chart/env/{dev.yaml => staging.yaml} | 29 ++++++++++++--------- 5 files changed, 23 insertions(+), 73 deletions(-) rename chart/env/{dev.yaml => staging.yaml} (91%) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 878e3e3bee..d139460112 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -76,14 +76,14 @@ jobs: - name: Lint chart with default values run: helm lint working-directory: chart - - name: Lint chart with dev values - run: helm lint --values env/dev.yaml + - name: Lint chart with staging values + run: helm lint --values env/staging.yaml working-directory: chart - name: Lint chart with prod values run: helm lint --values env/prod.yaml working-directory: chart - deploy-dev-and-prod: + deploy-staging-and-prod: if: ${{ endsWith(github.ref, '/main') }} runs-on: ubuntu-latest needs: 
[build-and-push-images] diff --git a/.github/workflows/chart-pr.yml b/.github/workflows/chart-pr.yml index cd5789bcdd..93880b2376 100644 --- a/.github/workflows/chart-pr.yml +++ b/.github/workflows/chart-pr.yml @@ -19,8 +19,8 @@ jobs: - name: Lint chart with default values run: helm lint working-directory: chart - - name: Lint chart with dev values - run: helm lint --values env/dev.yaml + - name: Lint chart with staging values + run: helm lint --values env/staging.yaml working-directory: chart - name: Lint chart with prod values run: helm lint --values env/prod.yaml diff --git a/chart/Makefile b/chart/Makefile index 555a89f3f8..57716f53a9 100644 --- a/chart/Makefile +++ b/chart/Makefile @@ -1,45 +1,7 @@ -K8S_NAMESPACE := datasets-server - .PHONY: init init: helm dependency update . -.PHONY: uninstall -uninstall: - helm uninstall $(ENV) -n $(K8S_NAMESPACE) - -.PHONY: diff -diff: - helm diff upgrade --install $(ENV) . --values env/$(ENV).yaml -n $(K8S_NAMESPACE) - -.PHONY: upgrade -upgrade: - helm upgrade --install $(ENV) . --values env/$(ENV).yaml -n $(K8S_NAMESPACE) - -.PHONY: diff-dev -diff-dev: - @make diff ENV=dev - -.PHONY: uninstall-dev -uninstall-dev: - @make uninstall ENV=dev - -.PHONY: upgrade-dev -upgrade-dev: - @make upgrade ENV=dev - -.PHONY: diff-prod -diff-prod: - @make diff ENV=prod - -.PHONY: uninstall-prod -uninstall-prod: - @make uninstall ENV=prod - -.PHONY: upgrade-prod -upgrade-prod: - @make upgrade ENV=prod - .PHONY: quality quality: helm lint diff --git a/chart/README.md b/chart/README.md index 26865e85ea..aa3782102f 100644 --- a/chart/README.md +++ b/chart/README.md @@ -13,21 +13,4 @@ Note that this Helm chart is used to manage the deployment of the `datasets-serv ## Deploy -To deploy to the `hub-ephemeral` Kubernetes cluster, ensure to first: - -- install the tools (aws, kubectl, helm) -- authenticate with AWS -- select the `hub-ephemeral` cluster - -Dry run: - -```shell -make init -make diff-dev -``` - -Deploy: - -```shell -make upgrade-dev -``` +To deploy, go to https://cd.internal.huggingface.tech/applications. diff --git a/chart/env/dev.yaml b/chart/env/staging.yaml similarity index 91% rename from chart/env/dev.yaml rename to chart/env/staging.yaml index b66d3f886e..54263cb876 100644 --- a/chart/env/dev.yaml +++ b/chart/env/staging.yaml @@ -4,12 +4,9 @@ # --- common parameters --- global: huggingface: - imageRegistry: "" - imagePullSecrets: [] - privateHub: - enabled: false ingress: domain: us.dev.moon.huggingface.tech + # ^ the domain contains "dev", not "staging". We don't change for now. 
subdomains: datasetsServer: datasets-server @@ -51,7 +48,7 @@ images: secrets: mongoUrl: - fromSecret: false + fromSecret: true secretName: "mongo-url" value: mongo:// appHfToken: @@ -75,7 +72,7 @@ monitoring: enabled: false mongodb: - enabled: true + enabled: false common: # URL of the HuggingFace Hub @@ -85,6 +82,9 @@ log: # Log level level: "DEBUG" +firstRows: + maxBytes: "200_000" + parquetAndInfo: maxDatasetSize: "500_000_000" @@ -102,6 +102,10 @@ mongodbMigration: cacheMaintenance: action: "skip" # ^ allowed values are {skip,backfill,upgrade} + log: + level: "debug" + backfill: + error_codes_to_retry: "" resources: requests: cpu: 100m @@ -114,14 +118,15 @@ backfill: metricsCollector: action: "collect-metrics" - schedule: "*/5 * * * *" - # every five minutes + schedule: "*/2 * * * *" + # every two minutes nodeSelector: {} resources: requests: - cpu: 0 + cpu: 1 limits: - cpu: 0 + cpu: 1 + memory: "512Mi" tolerations: [] # --- storage admin (to manually inspect the storage, in /data) --- @@ -160,8 +165,8 @@ ingress: annotations: # Link to Route53 - we could set any subdomain to us.dev.moon.huggingface.tech (common zone to the k8s cluster) external-dns.alpha.kubernetes.io/hostname: "datasets-server.us.dev.moon.huggingface.tech" - alb.ingress.kubernetes.io/load-balancer-name: "hub-datasets-server-dev" - alb.ingress.kubernetes.io/tags: "Env=dev,Project=datasets-server,Terraform=true" + alb.ingress.kubernetes.io/load-balancer-name: "hub-datasets-server-staging" + alb.ingress.kubernetes.io/tags: "Env=staging,Project=datasets-server,Terraform=true" alb.ingress.kubernetes.io/healthcheck-path: "/healthcheck" alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80, "HTTPS": 443}]' alb.ingress.kubernetes.io/scheme: "internet-facing" From 80c7b5d60e0bd92e71988a097407357289c95823 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 19 Jun 2023 14:35:59 +0200 Subject: [PATCH 38/52] =?UTF-8?q?feat:=20=F0=9F=8E=B8=2010x=20the=20size?= =?UTF-8?q?=20of=20supported=20images=20(#1392)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/worker/src/worker/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index a989f27776..f5842a08dd 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -44,7 +44,7 @@ from libcommon.utils import orjson_dumps from pyarrow.parquet import ParquetFile -MAX_IMAGE_PIXELS = 1_000_000_000 +MAX_IMAGE_PIXELS = 10_000_000_000 # ^ see https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS From b599b1038585ef0604484ed7aa00222d0fdd6c7b Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 19 Jun 2023 18:58:48 -0400 Subject: [PATCH 39/52] Fix exception --- libs/libcommon/src/libcommon/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 57276b2c5e..6c22581a22 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -500,7 +500,7 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) -class DuckDBIndexFileNotFoundError(CacheableError): +class NotAvailableIndexFileError(CacheableError): """Raised when no duckdb index file was found for split.""" def __init__(self, message: str, cause: 
Optional[BaseException] = None): From 187d7b656ab234dc7681986e53e56d85f39f011a Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 19 Jun 2023 19:08:24 -0400 Subject: [PATCH 40/52] Fix test in libcommon --- chart/env/prod.yaml | 1 + libs/libcommon/tests/test_processing_graph.py | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/chart/env/prod.yaml b/chart/env/prod.yaml index 8b98ecbedc..163a431de8 100644 --- a/chart/env/prod.yaml +++ b/chart/env/prod.yaml @@ -97,6 +97,7 @@ optInOutUrlsScan: rowsMaxNumber: 100_000 urlsNumberPerBatch: 1000 + # --- jobs (pre-install/upgrade hooks) --- mongodbMigration: diff --git a/libs/libcommon/tests/test_processing_graph.py b/libs/libcommon/tests/test_processing_graph.py index 1d0933479a..c75d2104c2 100644 --- a/libs/libcommon/tests/test_processing_graph.py +++ b/libs/libcommon/tests/test_processing_graph.py @@ -83,7 +83,6 @@ def graph() -> ProcessingGraph: "config-parquet", "config-info", "config-size", - "split-duckdb-index", ], ["dataset-config-names"], ["dataset-config-names"], @@ -105,7 +104,6 @@ def graph() -> ProcessingGraph: "split-first-rows-from-streaming", "dataset-split-names", "config-opt-in-out-urls-count", - "split-duckdb-index", ], ["dataset-config-names"], ["dataset-config-names"], @@ -297,9 +295,8 @@ def graph() -> ProcessingGraph: ( "split-duckdb-index", [], - ["config-parquet-and-info", "config-split-names-from-streaming", "config-split-names-from-info"], + ["config-split-names-from-info"], [ - "config-split-names-from-streaming", "config-split-names-from-info", "config-parquet-and-info", "config-info", From 5c9639e2f5876cb39dbaf2d0989bc5f773a03e43 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 20 Jun 2023 16:14:50 -0400 Subject: [PATCH 41/52] Apply some code review suggestions --- libs/libcommon/src/libcommon/exceptions.py | 8 ++ .../worker/job_runners/split/duckdb_index.py | 99 ++++++++++--------- 2 files changed, 58 insertions(+), 49 deletions(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index a711a49300..2030a5aaa0 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -73,6 +73,7 @@ def as_response(self) -> ErrorResponse: CacheableErrorCode = Literal[ + "CachedDirectoryNotInitializedError", "ConfigNamesError", "CreateCommitError", "DatasetInBlockListError", @@ -527,3 +528,10 @@ class DatasetWithTooManyConfigsError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, True) + + +class CachedDirectoryNotInitializedError(CacheableError): + """Raised when the cached directory has not been initialized before job compute.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "CachedDirectoryNotInitializedError", cause, True) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 05aabfb0c5..5571a8911c 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -16,6 +16,7 @@ from libcommon.config import DuckDbIndexConfig from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( + CachedDirectoryNotInitializedError, DatasetNotFoundError, LockedDatasetTimeoutError, 
NoIndexableColumnsError, @@ -32,7 +33,7 @@ from libcommon.utils import JobInfo, SplitHubFile from worker.config import AppConfig -from worker.job_runners.split.split_job_runner import SplitCachedJobRunner +from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache from worker.utils import CompleteJobResult, create_branch, hf_hub_url DATASET_TYPE = "dataset" @@ -40,8 +41,8 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" -CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" -CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" +CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', {columns}, overwrite=1);" +CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, {columns} FROM" # TODO: What if __id field already exist? INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" @@ -83,49 +84,46 @@ def compute_index_rows( config=config, ) content_parquet_and_info = parquet_and_info_best_response.response["content"] - if "parquet_files" not in content_parquet_and_info: - raise PreviousStepFormatError( - f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'parquet_files'" - ) - - if "dataset_info" not in content_parquet_and_info: - raise PreviousStepFormatError( - f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'dataset_info'" - ) - - split_parquet_files = [ - parquet_file - for parquet_file in content_parquet_and_info["parquet_files"] - if parquet_file["config"] == config and parquet_file["split"] == split - ] - - split_parquets_size = sum(parquet_file["size"] for parquet_file in split_parquet_files) + try: + split_parquet_files = [ + parquet_file + for parquet_file in content_parquet_and_info["parquet_files"] + if parquet_file["config"] == config and parquet_file["split"] == split + ] + + split_parquets_size = sum(parquet_file["size"] for parquet_file in split_parquet_files) + + if split_parquets_size > max_parquet_size_bytes: + raise SplitWithTooBigParquetError( + f"The indexing is limited to split parquets under {max_parquet_size_bytes} bytes. " + f"Current size of sum of split parquets is {split_parquets_size} bytes." + ) - if split_parquets_size > max_parquet_size_bytes: - raise SplitWithTooBigParquetError( - f"The indexing is limited to split parquets under {max_parquet_size_bytes} bytes. " - f"Current size of sum of split parquets is {split_parquets_size} bytes." 
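As an aside on the hunk above, here is a small self-contained sketch of the checks being moved inside the `try` block: filtering the parquet files of the requested split, enforcing a size limit, and keeping only string columns. The payload, the limit, and the plain `ValueError` are made-up stand-ins for the previous step's response and the worker's own exception classes.

```py
# Made-up payload mimicking the shape of the config-parquet-and-info response.
MAX_PARQUET_SIZE_BYTES = 100_000_000
config, split = "default", "train"

content_parquet_and_info = {
    "parquet_files": [
        {"config": "default", "split": "train", "url": "https://example.invalid/0.parquet", "size": 1_000},
        {"config": "default", "split": "test", "url": "https://example.invalid/1.parquet", "size": 2_000},
    ],
    "dataset_info": {
        "features": {
            "text": {"dtype": "string", "_type": "Value"},
            "label": {"dtype": "int64", "_type": "Value"},
        }
    },
}

# keep only the parquet files of the requested config/split and check their total size
split_parquet_files = [
    f
    for f in content_parquet_and_info["parquet_files"]
    if f["config"] == config and f["split"] == split
]
split_parquets_size = sum(f["size"] for f in split_parquet_files)
if split_parquets_size > MAX_PARQUET_SIZE_BYTES:
    # the worker raises SplitWithTooBigParquetError here
    raise ValueError(f"split too big to index: {split_parquets_size} bytes")

# only Value features with dtype "string" are indexable
features = content_parquet_and_info["dataset_info"]["features"]
string_columns = [
    column
    for column, feature in features.items()
    if feature.get("dtype") == "string" and feature.get("_type") == "Value"
]
print(string_columns)  # ['text']
```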
- ) + parquet_urls = [parquet_file["url"] for parquet_file in split_parquet_files] - parquet_urls = [parquet_file["url"] for parquet_file in split_parquet_files] + if not parquet_urls: + raise ParquetResponseEmptyError("No parquet files found.") - if not parquet_urls: - raise ParquetResponseEmptyError("No parquet files found.") + # get the features + features = content_parquet_and_info["dataset_info"]["features"] + column_names = ",".join(list(features.keys())) - # get the features - features = content_parquet_and_info["dataset_info"].get("features", []) + # look for string columns + string_columns = [ + column + for column, feature in features.items() + if "dtype" in feature + and "_type" in feature + and feature["dtype"] == STRING_FEATURE_DTYPE + and feature["_type"] == VALUE_FEATURE_TYPE + ] + if not string_columns: + raise NoIndexableColumnsError("No string columns available to index.") - # look for string columns - string_columns = [ - column - for column, feature in features.items() - if "dtype" in feature - and "_type" in feature - and feature["dtype"] == STRING_FEATURE_DTYPE - and feature["_type"] == VALUE_FEATURE_TYPE - ] - if not string_columns: - raise NoIndexableColumnsError("No string columns available to index.") + except KeyError as e: + raise PreviousStepFormatError( + f"Previous step '{config_parquet_and_info_step}' did not return the expected content.", e + ) from e # configure duckdb extensions duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) @@ -134,14 +132,17 @@ def compute_index_rows( duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) # index all columns - db_location = f"{duckdb_index_file_directory}/{DUCKDB_DEFAULT_INDEX_FILENAME}" - con = duckdb.connect(str(db_location)) + if duckdb_index_file_directory is None: + raise CachedDirectoryNotInitializedError("Cache directory has not been initialized.") + db_path = Path(duckdb_index_file_directory.resolve() / DUCKDB_DEFAULT_INDEX_FILENAME) + + con = duckdb.connect(str(db_path.resolve())) con.sql(CREATE_SEQUENCE_COMMAND) - con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") + con.sql(f"{CREATE_TABLE_COMMAND.format(columns=column_names)} read_parquet({parquet_urls});") # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more details about 'stemmer' parameter - con.sql(CREATE_INDEX_COMMAND) + con.sql(CREATE_INDEX_COMMAND.format(columns=column_names)) # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) @@ -167,7 +168,7 @@ def compute_index_rows( # send the files to the target revision add_operations: List[CommitOperation] = [ - CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) + CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_path) ] committer_hf_api.create_commit( @@ -212,7 +213,7 @@ def compute_index_rows( ) -class SplitDuckDbIndexJobRunner(SplitCachedJobRunner): +class SplitDuckDbIndexJobRunner(SplitJobRunnerWithCache): duckdb_index_config: DuckDbIndexConfig def __init__( @@ -226,7 +227,7 @@ def __init__( job_info=job_info, app_config=app_config, processing_step=processing_step, - hf_datasets_cache=Path(duckdb_index_directory).resolve(), + cache_directory=Path(duckdb_index_directory), ) self.duckdb_index_config = app_config.duckdb_index @@ -245,7 +246,7 @@ def compute(self) -> 
CompleteJobResult: dataset=self.dataset, config=self.config, split=self.split, - duckdb_index_file_directory=self.datasets_cache, + duckdb_index_file_directory=self.cache_subdirectory, hf_token=self.app_config.common.hf_token, url_template=self.duckdb_index_config.url_template, commit_message=self.duckdb_index_config.commit_message, From ce4163a564e865a0a56f97cd2bbb648c3b5e28c9 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 20 Jun 2023 16:27:44 -0400 Subject: [PATCH 42/52] Apply code review suggestions --- libs/libcommon/src/libcommon/storage.py | 2 +- services/worker/src/worker/job_runner_factory.py | 4 ++-- .../src/worker/job_runners/split/duckdb_index.py | 9 ++++----- services/worker/src/worker/main.py | 6 +++--- services/worker/src/worker/start_worker_loop.py | 6 +++--- services/worker/tests/conftest.py | 6 +++--- services/worker/tests/fixtures/hub.py | 2 +- .../tests/job_runners/split/test_duckdb_index.py | 10 +++++----- services/worker/tests/test_executor.py | 4 ++-- services/worker/tests/test_job_runner_factory.py | 4 ++-- 10 files changed, 26 insertions(+), 27 deletions(-) diff --git a/libs/libcommon/src/libcommon/storage.py b/libs/libcommon/src/libcommon/storage.py index 5a8230107e..63d9c10853 100644 --- a/libs/libcommon/src/libcommon/storage.py +++ b/libs/libcommon/src/libcommon/storage.py @@ -82,7 +82,7 @@ def init_parquet_metadata_dir(directory: Optional[StrPath] = None) -> StrPath: return init_dir(directory, appname=PARQUET_METADATA_CACHE_APPNAME) -def init_duckdb_index_dir(directory: Optional[StrPath] = None) -> StrPath: +def init_duckdb_index_cache_dir(directory: Optional[StrPath] = None) -> StrPath: """Initialize the duckdb index directory. If directory is None, it will be set to the default duckdb index location on the machine. diff --git a/services/worker/src/worker/job_runner_factory.py b/services/worker/src/worker/job_runner_factory.py index f04146a292..87c48d9019 100644 --- a/services/worker/src/worker/job_runner_factory.py +++ b/services/worker/src/worker/job_runner_factory.py @@ -74,7 +74,7 @@ class JobRunnerFactory(BaseJobRunnerFactory): hf_datasets_cache: Path assets_directory: StrPath parquet_metadata_directory: StrPath - duckdb_index_directory: StrPath + duckdb_index_cache_directory: StrPath def _create_job_runner(self, job_info: JobInfo) -> JobRunner: job_type = job_info["type"] @@ -222,7 +222,7 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner: job_info=job_info, app_config=self.app_config, processing_step=processing_step, - duckdb_index_directory=self.duckdb_index_directory, + duckdb_index_cache_directory=self.duckdb_index_cache_directory, ) supported_job_types = [ diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 5571a8911c..5ffd6e9e88 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -41,9 +41,8 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" -CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', {columns}, overwrite=1);" -CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, {columns} FROM" -# TODO: What if __id field already exist? 
+CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__hf_index_id', {columns}, overwrite=1);" +CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __hf_index_id, {columns} FROM" INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" @@ -221,13 +220,13 @@ def __init__( job_info: JobInfo, app_config: AppConfig, processing_step: ProcessingStep, - duckdb_index_directory: StrPath, + duckdb_index_cache_directory: StrPath, ) -> None: super().__init__( job_info=job_info, app_config=app_config, processing_step=processing_step, - cache_directory=Path(duckdb_index_directory), + cache_directory=Path(duckdb_index_cache_directory), ) self.duckdb_index_config = app_config.duckdb_index diff --git a/services/worker/src/worker/main.py b/services/worker/src/worker/main.py index 5a866aa74f..4d207280e5 100644 --- a/services/worker/src/worker/main.py +++ b/services/worker/src/worker/main.py @@ -8,7 +8,7 @@ from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.storage import ( init_assets_dir, - init_duckdb_index_dir, + init_duckdb_index_cache_dir, init_parquet_metadata_dir, ) @@ -31,7 +31,7 @@ # ^ set first to have logs as soon as possible assets_directory = init_assets_dir(directory=app_config.assets.storage_directory) parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory) - duckdb_index_directory = init_duckdb_index_dir(directory=app_config.duckdb_index.storage_directory) + duckdb_index_cache_directory = init_duckdb_index_cache_dir(directory=app_config.duckdb_index.storage_directory) processing_graph = ProcessingGraph(app_config.processing_graph.specification) @@ -59,7 +59,7 @@ hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) worker_executor = WorkerExecutor( app_config=app_config, diff --git a/services/worker/src/worker/start_worker_loop.py b/services/worker/src/worker/start_worker_loop.py index ff498dffa2..d5e69a829c 100644 --- a/services/worker/src/worker/start_worker_loop.py +++ b/services/worker/src/worker/start_worker_loop.py @@ -8,7 +8,7 @@ from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.storage import ( init_assets_dir, - init_duckdb_index_dir, + init_duckdb_index_cache_dir, init_parquet_metadata_dir, ) @@ -30,7 +30,7 @@ # ^ set first to have logs as soon as possible assets_directory = init_assets_dir(directory=app_config.assets.storage_directory) parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory) - duckdb_index_directory = init_duckdb_index_dir(directory=app_config.duckdb_index.storage_directory) + duckdb_index_cache_directory = init_duckdb_index_cache_dir(directory=app_config.duckdb_index.storage_directory) processing_graph = ProcessingGraph(app_config.processing_graph.specification) @@ -58,7 +58,7 @@ hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) loop = Loop( library_cache_paths=libraries_resource.storage_paths, diff --git a/services/worker/tests/conftest.py b/services/worker/tests/conftest.py index 
a3f9cbef54..fab2648725 100644 --- a/services/worker/tests/conftest.py +++ b/services/worker/tests/conftest.py @@ -11,7 +11,7 @@ from libcommon.storage import ( StrPath, init_assets_dir, - init_duckdb_index_dir, + init_duckdb_index_cache_dir, init_parquet_metadata_dir, ) from pytest import MonkeyPatch, fixture @@ -126,8 +126,8 @@ def parquet_metadata_directory(app_config: AppConfig) -> StrPath: @fixture -def duckdb_index_directory(app_config: AppConfig) -> StrPath: - return init_duckdb_index_dir(app_config.duckdb_index.storage_directory) +def duckdb_index_cache_directory(app_config: AppConfig) -> StrPath: + return init_duckdb_index_cache_dir(app_config.duckdb_index.storage_directory) @fixture diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index 2dd5d02f38..e194ff17e6 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -767,7 +767,7 @@ def hub_reponses_spawning_opt_in_out(hub_public_spawning_opt_in_out: str) -> Hub @pytest.fixture -def hub_reponses_duckdb_index(hub_public_duckdb_index: str) -> HubDatasetTest: +def hub_responses_duckdb_index(hub_public_duckdb_index: str) -> HubDatasetTest: return { "name": hub_public_duckdb_index, "config_names_response": create_config_names_response(hub_public_duckdb_index), diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 1c10f37aca..2a8e4b8ad5 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -25,7 +25,7 @@ @pytest.fixture def get_job_runner( - duckdb_index_directory: StrPath, + duckdb_index_cache_directory: StrPath, cache_mongo_resource: CacheMongoResource, queue_mongo_resource: QueueMongoResource, ) -> GetJobRunner: @@ -69,7 +69,7 @@ def _get_job_runner( }, app_config=app_config, processing_step=processing_graph.get_processing_step(processing_step_name), - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) return _get_job_runner @@ -128,12 +128,12 @@ def test_compute( get_parquet_job_runner: GetParquetJobRunner, get_job_runner: GetJobRunner, app_config: AppConfig, - hub_reponses_public: HubDatasetTest, - hub_reponses_duckdb_index: HubDatasetTest, + hub_responses_public: HubDatasetTest, + hub_responses_duckdb_index: HubDatasetTest, hub_dataset_name: str, expected_error_code: str, ) -> None: - hub_datasets = {"public": hub_reponses_public, "duckdb_index": hub_reponses_duckdb_index} + hub_datasets = {"public": hub_responses_public, "duckdb_index": hub_responses_duckdb_index} dataset = hub_datasets[hub_dataset_name]["name"] config_names = hub_datasets[hub_dataset_name]["config_names_response"] config = hub_datasets[hub_dataset_name]["config_names_response"]["config_names"][0]["config"] diff --git a/services/worker/tests/test_executor.py b/services/worker/tests/test_executor.py index 4dc2c47862..1a4dc47687 100644 --- a/services/worker/tests/test_executor.py +++ b/services/worker/tests/test_executor.py @@ -199,7 +199,7 @@ def job_runner_factory( libraries_resource: LibrariesResource, assets_directory: StrPath, parquet_metadata_directory: StrPath, - duckdb_index_directory: StrPath, + duckdb_index_cache_directory: StrPath, ) -> JobRunnerFactory: processing_graph = ProcessingGraph(app_config.processing_graph.specification) return JobRunnerFactory( @@ -208,7 +208,7 @@ def job_runner_factory( 
hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) diff --git a/services/worker/tests/test_job_runner_factory.py b/services/worker/tests/test_job_runner_factory.py index 982c0ae2a5..e10bc8c0f6 100644 --- a/services/worker/tests/test_job_runner_factory.py +++ b/services/worker/tests/test_job_runner_factory.py @@ -39,7 +39,7 @@ def test_create_job_runner( libraries_resource: LibrariesResource, assets_directory: StrPath, parquet_metadata_directory: StrPath, - duckdb_index_directory: StrPath, + duckdb_index_cache_directory: StrPath, job_type: str, expected_job_runner: Optional[str], ) -> None: @@ -49,7 +49,7 @@ def test_create_job_runner( hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) job_info: JobInfo = { "type": job_type, From 9e9e25a8f5bdc113fabf14071172228fe36cfdc9 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 20 Jun 2023 17:25:50 -0400 Subject: [PATCH 43/52] Adding close connection --- .../src/worker/job_runners/split/duckdb_index.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 5ffd6e9e88..7c8fb40846 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -41,7 +41,7 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" -CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__hf_index_id', {columns}, overwrite=1);" +CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__hf_index_id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __hf_index_id, {columns} FROM" INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" @@ -136,13 +136,19 @@ def compute_index_rows( db_path = Path(duckdb_index_file_directory.resolve() / DUCKDB_DEFAULT_INDEX_FILENAME) con = duckdb.connect(str(db_path.resolve())) + logging.debug(CREATE_SEQUENCE_COMMAND) con.sql(CREATE_SEQUENCE_COMMAND) - con.sql(f"{CREATE_TABLE_COMMAND.format(columns=column_names)} read_parquet({parquet_urls});") + + create_command_sql = f"{CREATE_TABLE_COMMAND.format(columns=column_names)} read_parquet({parquet_urls});" + logging.debug(create_command_sql) + con.sql(create_command_sql) # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more details about 'stemmer' parameter - con.sql(CREATE_INDEX_COMMAND.format(columns=column_names)) - + logging.debug(CREATE_INDEX_COMMAND) + con.sql(CREATE_INDEX_COMMAND) + con.close() + # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) @@ -167,7 +173,7 @@ def compute_index_rows( # send the files to the target revision add_operations: 
List[CommitOperation] = [ - CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_path) + CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_path.resolve()) ] committer_hf_api.create_commit( From b807613552f42d836c8bd71a8b1bee76c9ca8b4a Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 08:13:36 -0400 Subject: [PATCH 44/52] Upgrade duckdb version --- services/worker/poetry.lock | 106 +++++++++++++++++---------------- services/worker/pyproject.toml | 2 +- 2 files changed, 57 insertions(+), 51 deletions(-) diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index cab3feb395..8c61c00451 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -996,59 +996,64 @@ idna = ["idna (>=2.1)"] [[package]] name = "duckdb" -version = "0.8.0" +version = "0.8.1" description = "DuckDB embedded database" category = "main" optional = false python-versions = "*" files = [ - {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6455aee00af30770c20f4a8c5e4347918cf59b578f49ee996a13807b12911871"}, - {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8cf0622ae7f86d4ce72791f8928af4357a46824aadf1b6879c7936b3db65344"}, - {file = "duckdb-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6132e8183ca3ae08a593e43c97cb189794077dedd48546e27ce43bd6a51a9c33"}, - {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe29e5343fa2a95f2cde4519a4f4533f4fd551a48d2d9a8ab5220d40ebf53610"}, - {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:945165987ca87c097dc0e578dcf47a100cad77e1c29f5dd8443d53ce159dc22e"}, - {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:673c60daf7ada1d9a8518286a6893ec45efabb64602954af5f3d98f42912fda6"}, - {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5075fe1ff97ae62331ca5c61e3597e6e9f7682a6fdd418c23ba5c4873ed5cd1"}, - {file = "duckdb-0.8.0-cp310-cp310-win32.whl", hash = "sha256:001f5102f45d3d67f389fa8520046c8f55a99e2c6d43b8e68b38ea93261c5395"}, - {file = "duckdb-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb00800f2e1e865584b13221e0121fce9341bb3a39a93e569d563eaed281f528"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2707096d6df4321044fcde2c9f04da632d11a8be60957fd09d49a42fae71a29"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b27df1b70ae74d2c88efb5ffca8490954fdc678099509a9c4404ca30acc53426"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75a97c800271b52dd0f37696d074c50576dcb4b2750b6115932a98696a268070"}, - {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:804cac261a5e016506a6d67838a65d19b06a237f7949f1704f0e800eb708286a"}, - {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6b9abca7fa6713e1d031c18485343b4de99742c7e1b85c10718aa2f31a4e2c6"}, - {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:51aa6d606d49072abcfeb3be209eb559ac94c1b5e70f58ac3adbb94aca9cd69f"}, - {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7c8dc769aaf2be0a1c57995ca657e5b92c1c56fc8437edb720ca6cab571adf14"}, - {file = "duckdb-0.8.0-cp311-cp311-win32.whl", hash = "sha256:c4207d18b42387c4a035846d8878eb967070198be8ac26fd77797ce320d1a400"}, - {file = "duckdb-0.8.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:0c392257547c20794c3072fcbca99a49ef0a49974005d755e93893e2b4875267"}, - {file = "duckdb-0.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2832379e122020814dbe869af7b9ddf3c9f21474cf345531145b099c63ffe17e"}, - {file = "duckdb-0.8.0-cp36-cp36m-win32.whl", hash = "sha256:914896526f7caba86b170f2c4f17f11fd06540325deeb0000cb4fb24ec732966"}, - {file = "duckdb-0.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:022ebda86d0e3204cdc206e4af45aa9f0ae0668b34c2c68cf88e08355af4a372"}, - {file = "duckdb-0.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:96a31c0f3f4ccbf0f5b18f94319f37691205d82f80aae48c6fe04860d743eb2c"}, - {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a07c73c6e6a8cf4ce1a634625e0d1b17e5b817242a8a530d26ed84508dfbdc26"}, - {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424acbd6e857531b06448d757d7c2557938dbddbff0632092090efbf413b4699"}, - {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c83cfd2a868f1acb0692b9c3fd5ef1d7da8faa1348c6eabf421fbf5d8c2f3eb8"}, - {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5c6f6b2d8db56936f662c649539df81856b5a8cb769a31f9544edf18af2a11ff"}, - {file = "duckdb-0.8.0-cp37-cp37m-win32.whl", hash = "sha256:0bd6376b40a512172eaf4aa816813b1b9d68994292ca436ce626ccd5f77f8184"}, - {file = "duckdb-0.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:931221885bcf1e7dfce2400f11fd048a7beef566b775f1453bb1db89b828e810"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:42e7853d963d68e72403ea208bcf806b0f28c7b44db0aa85ce49bb124d56c133"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcc338399175be3d43366576600aef7d72e82114d415992a7a95aded98a0f3fd"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03dd08a4624d6b581a59f9f9dbfd34902416398d16795ad19f92361cf21fd9b5"}, - {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c7c24ea0c9d8563dbd5ad49ccb54b7a9a3c7b8c2833d35e5d32a08549cacea5"}, - {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb58f6505cc0f34b4e976154302d26563d2e5d16b206758daaa04b65e55d9dd8"}, - {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef37ac7880100c4b3f913c8483a29a13f8289313b9a07df019fadfa8e7427544"}, - {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2a4f5ee913ca8a6a069c78f8944b9934ffdbc71fd935f9576fdcea2a6f476f1"}, - {file = "duckdb-0.8.0-cp38-cp38-win32.whl", hash = "sha256:73831c6d7aefcb5f4072cd677b9efebecbf6c578946d21710791e10a1fc41b9a"}, - {file = "duckdb-0.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:faa36d2854734364d234f37d7ef4f3d763b73cd6b0f799cbc2a0e3b7e2575450"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50a31ec237ed619e50f9ab79eb0ec5111eb9697d4475da6e0ab22c08495ce26b"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:351abb4cc2d229d043920c4bc2a4c29ca31a79fef7d7ef8f6011cf4331f297bf"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:568550a163aca6a787bef8313e358590254de3f4019025a8d68c3a61253fedc1"}, - {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b82617f0e7f9fc080eda217090d82b42d4fad083bc9f6d58dfda9cecb7e3b29"}, - {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d01c9be34d272532b75e8faedda0ff77fa76d1034cde60b8f5768ae85680d6d3"}, - {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8549d6a6bf5f00c012b6916f605416226507e733a3ffc57451682afd6e674d1b"}, - {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d145c6d51e55743c3ed1a74cffa109d9e72f82b07e203b436cfa453c925313a"}, - {file = "duckdb-0.8.0-cp39-cp39-win32.whl", hash = "sha256:f8610dfd21e90d7b04e8598b244bf3ad68599fd6ba0daad3428c03cbfd74dced"}, - {file = "duckdb-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:d0f0f104d30418808bafbe9bccdcd238588a07bd246b3cff13842d60bfd8e8ba"}, - {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, + {file = "duckdb-0.8.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:14781d21580ee72aba1f5dcae7734674c9b6c078dd60470a08b2b420d15b996d"}, + {file = "duckdb-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f13bf7ab0e56ddd2014ef762ae4ee5ea4df5a69545ce1191b8d7df8118ba3167"}, + {file = "duckdb-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4032042d8363e55365bbca3faafc6dc336ed2aad088f10ae1a534ebc5bcc181"}, + {file = "duckdb-0.8.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a71bd8f0b0ca77c27fa89b99349ef22599ffefe1e7684ae2e1aa2904a08684"}, + {file = "duckdb-0.8.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24568d6e48f3dbbf4a933109e323507a46b9399ed24c5d4388c4987ddc694fd0"}, + {file = "duckdb-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:297226c0dadaa07f7c5ae7cbdb9adba9567db7b16693dbd1b406b739ce0d7924"}, + {file = "duckdb-0.8.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5792cf777ece2c0591194006b4d3e531f720186102492872cb32ddb9363919cf"}, + {file = "duckdb-0.8.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:12803f9f41582b68921d6b21f95ba7a51e1d8f36832b7d8006186f58c3d1b344"}, + {file = "duckdb-0.8.1-cp310-cp310-win32.whl", hash = "sha256:d0953d5a2355ddc49095e7aef1392b7f59c5be5cec8cdc98b9d9dc1f01e7ce2b"}, + {file = "duckdb-0.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:6e6583c98a7d6637e83bcadfbd86e1f183917ea539f23b6b41178f32f813a5eb"}, + {file = "duckdb-0.8.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fad7ed0d4415f633d955ac24717fa13a500012b600751d4edb050b75fb940c25"}, + {file = "duckdb-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:81ae602f34d38d9c48dd60f94b89f28df3ef346830978441b83c5b4eae131d08"}, + {file = "duckdb-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d75cfe563aaa058d3b4ccaaa371c6271e00e3070df5de72361fd161b2fe6780"}, + {file = "duckdb-0.8.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dbb55e7a3336f2462e5e916fc128c47fe1c03b6208d6bd413ac11ed95132aa0"}, + {file = "duckdb-0.8.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a6df53efd63b6fdf04657385a791a4e3c4fb94bfd5db181c4843e2c46b04fef5"}, + {file = "duckdb-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b188b80b70d1159b17c9baaf541c1799c1ce8b2af4add179a9eed8e2616be96"}, + {file = "duckdb-0.8.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5ad481ee353f31250b45d64b4a104e53b21415577943aa8f84d0af266dc9af85"}, + {file = "duckdb-0.8.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d1d1b1729993611b1892509d21c21628917625cdbe824a61ce891baadf684b32"}, + {file = "duckdb-0.8.1-cp311-cp311-win32.whl", hash = 
"sha256:2d8f9cc301e8455a4f89aa1088b8a2d628f0c1f158d4cf9bc78971ed88d82eea"}, + {file = "duckdb-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:07457a43605223f62d93d2a5a66b3f97731f79bbbe81fdd5b79954306122f612"}, + {file = "duckdb-0.8.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d2c8062c3e978dbcd80d712ca3e307de8a06bd4f343aa457d7dd7294692a3842"}, + {file = "duckdb-0.8.1-cp36-cp36m-win32.whl", hash = "sha256:fad486c65ae944eae2de0d590a0a4fb91a9893df98411d66cab03359f9cba39b"}, + {file = "duckdb-0.8.1-cp36-cp36m-win_amd64.whl", hash = "sha256:86fa4506622c52d2df93089c8e7075f1c4d0ba56f4bf27faebde8725355edf32"}, + {file = "duckdb-0.8.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:60e07a62782f88420046e30cc0e3de842d0901c4fd5b8e4d28b73826ec0c3f5e"}, + {file = "duckdb-0.8.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f18563675977f8cbf03748efee0165b4c8ef64e0cbe48366f78e2914d82138bb"}, + {file = "duckdb-0.8.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16e179443832bea8439ae4dff93cf1e42c545144ead7a4ef5f473e373eea925a"}, + {file = "duckdb-0.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a413d5267cb41a1afe69d30dd6d4842c588256a6fed7554c7e07dad251ede095"}, + {file = "duckdb-0.8.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3784680df59eadd683b0a4c2375d451a64470ca54bd171c01e36951962b1d332"}, + {file = "duckdb-0.8.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:67a1725c2b01f9b53571ecf3f92959b652f60156c1c48fb35798302e39b3c1a2"}, + {file = "duckdb-0.8.1-cp37-cp37m-win32.whl", hash = "sha256:197d37e2588c5ad063e79819054eedb7550d43bf1a557d03ba8f8f67f71acc42"}, + {file = "duckdb-0.8.1-cp37-cp37m-win_amd64.whl", hash = "sha256:3843feb79edf100800f5037c32d5d5a5474fb94b32ace66c707b96605e7c16b2"}, + {file = "duckdb-0.8.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:624c889b0f2d656794757b3cc4fc58030d5e285f5ad2ef9fba1ea34a01dab7fb"}, + {file = "duckdb-0.8.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcbe3742d77eb5add2d617d487266d825e663270ef90253366137a47eaab9448"}, + {file = "duckdb-0.8.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:47516c9299d09e9dbba097b9fb339b389313c4941da5c54109df01df0f05e78c"}, + {file = "duckdb-0.8.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf1ba718b7522d34399446ebd5d4b9fcac0b56b6ac07bfebf618fd190ec37c1d"}, + {file = "duckdb-0.8.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e36e35d38a9ae798fe8cf6a839e81494d5b634af89f4ec9483f4d0a313fc6bdb"}, + {file = "duckdb-0.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23493313f88ce6e708a512daacad13e83e6d1ea0be204b175df1348f7fc78671"}, + {file = "duckdb-0.8.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1fb9bf0b6f63616c8a4b9a6a32789045e98c108df100e6bac783dc1e36073737"}, + {file = "duckdb-0.8.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:12fc13ecd5eddd28b203b9e3999040d3a7374a8f4b833b04bd26b8c5685c2635"}, + {file = "duckdb-0.8.1-cp38-cp38-win32.whl", hash = "sha256:a12bf4b18306c9cb2c9ba50520317e6cf2de861f121d6f0678505fa83468c627"}, + {file = "duckdb-0.8.1-cp38-cp38-win_amd64.whl", hash = "sha256:e4e809358b9559c00caac4233e0e2014f3f55cd753a31c4bcbbd1b55ad0d35e4"}, + {file = "duckdb-0.8.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7acedfc00d97fbdb8c3d120418c41ef3cb86ef59367f3a9a30dff24470d38680"}, + {file = "duckdb-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:99bfe264059cdc1e318769103f656f98e819cd4e231cd76c1d1a0327f3e5cef8"}, + {file = "duckdb-0.8.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:538b225f361066231bc6cd66c04a5561de3eea56115a5dd773e99e5d47eb1b89"}, + {file = "duckdb-0.8.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae0be3f71a18cd8492d05d0fc1bc67d01d5a9457b04822d025b0fc8ee6efe32e"}, + {file = "duckdb-0.8.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd82ba63b58672e46c8ec60bc9946aa4dd7b77f21c1ba09633d8847ad9eb0d7b"}, + {file = "duckdb-0.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:780a34559aaec8354e83aa4b7b31b3555f1b2cf75728bf5ce11b89a950f5cdd9"}, + {file = "duckdb-0.8.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:01f0d4e9f7103523672bda8d3f77f440b3e0155dd3b2f24997bc0c77f8deb460"}, + {file = "duckdb-0.8.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:31f692decb98c2d57891da27180201d9e93bb470a3051fcf413e8da65bca37a5"}, + {file = "duckdb-0.8.1-cp39-cp39-win32.whl", hash = "sha256:e7fe93449cd309bbc67d1bf6f6392a6118e94a9a4479ab8a80518742e855370a"}, + {file = "duckdb-0.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:81d670bc6807672f038332d9bf587037aabdd741b0810de191984325ed307abd"}, + {file = "duckdb-0.8.1.tar.gz", hash = "sha256:a54d37f4abc2afc4f92314aaa56ecf215a411f40af4bffe1e86bd25e62aceee9"}, ] [[package]] @@ -4455,7 +4460,6 @@ files = [ {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, - {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, @@ -4749,6 +4753,8 @@ python-versions = ">=3.8" files = [ {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:db464c88e10e927725997f9b872a21c9d057789d3b7e9a26e4ef1af41d0bcc8c"}, {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:172277c33cb1ae0da19f98c5bcd4946149cfa73c8ea05c6ba18365d58dd3c6f2"}, + {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:9c9b14fbb73ec4cb0f209722a1489020fd8614c92ae22589f2309c48cefdf21f"}, + {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6a54539bd076746f69ae8bef7282f981674fe4dbf59c3a84c4af86ae6bae9d5c"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e3fa53e63672fd71998bbd71cc5478c74dbe5a2d9291d1801c575358c28403c2"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:5499312c21ed3ed47cc6b4cf861896e9564c2c32d8d3c2ef1437c5ca31adfc73"}, {file = "tensorflow_macos-2.12.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:84cb873c90be63efabfecca53fdc48b734a037d0750532b55cb7ce7c343b5cac"}, @@ 
-5648,4 +5654,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "732285314a1b756206bdba83a83ee9e97635117f5fd9a6fd8d2b92d8f51e6679" +content-hash = "3aa60ce2866418d5594a71e79a63dbd8e2bd3991c079c53bc055a7c584b3f69e" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index f5ac325bc2..1be29673b9 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -45,7 +45,7 @@ transformers = "^4.30.0" trec-car-tools = { path = "vendors/trec-car-tools/python3" } typer = "^0.4.2" wget = "^3.2" -duckdb = "^0.8.0" +duckdb = "^0.8.1" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" From e77b6b47e099f5e56438aff451ad1eecad1b5637 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 08:13:42 -0400 Subject: [PATCH 45/52] Apply code review suggestions --- libs/libcommon/src/libcommon/exceptions.py | 6 +++--- .../src/worker/job_runners/config/parquet_and_info.py | 6 +++++- .../worker/src/worker/job_runners/split/duckdb_index.py | 6 +++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 2030a5aaa0..aa470371ef 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -90,6 +90,7 @@ def as_response(self) -> ErrorResponse: "DatasetWithTooManyConfigsError", "DatasetWithTooManyParquetFilesError", "DisabledViewerError", + "DuckDBIndexFileNotFoundError", "EmptyDatasetError", "ExternalFilesSizeRequestConnectionError", "ExternalFilesSizeRequestError", @@ -105,7 +106,6 @@ def as_response(self) -> ErrorResponse: "MissingSpawningTokenError", "NoIndexableColumnsError", "NormalRowsError", - "NotAvailableIndexFileError", "ParameterMissingError", "ParquetResponseEmptyError", "PreviousStepFormatError", @@ -509,11 +509,11 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) -class NotAvailableIndexFileError(CacheableError): +class DuckDBIndexFileNotFoundError(CacheableError): """Raised when no duckdb index file was found for split.""" def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NotAvailableIndexFileError", cause, False) + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DuckDBIndexFileNotFoundError", cause, False) class SplitWithTooBigParquetError(CacheableError): diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 2c9eff9636..ad5f4200e0 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -1037,7 +1037,11 @@ def compute_config_parquet_and_info_response( repo_id=dataset, repo_type=DATASET_TYPE ) create_branch( - dataset=dataset, target_revision=target_revision, refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api + dataset=dataset, + target_revision=target_revision, + refs=refs, + hf_api=hf_api, + committer_hf_api=committer_hf_api, ) # commit the parquet files diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 7c8fb40846..1810fa7446 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ 
b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -20,7 +20,7 @@ DatasetNotFoundError, LockedDatasetTimeoutError, NoIndexableColumnsError, - NotAvailableIndexFileError, + DuckDBIndexFileNotFoundError, ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, @@ -148,7 +148,7 @@ def compute_index_rows( logging.debug(CREATE_INDEX_COMMAND) con.sql(CREATE_INDEX_COMMAND) con.close() - + # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) @@ -196,7 +196,7 @@ def compute_index_rows( if not repo_files or len(repo_files) != 1: logging.warning(f"Found {len(repo_files)} index files, should be only 1") - raise NotAvailableIndexFileError("No index file was found") + raise DuckDBIndexFileNotFoundError("No index file was found") repo_file = repo_files[0] if repo_file.size is None: From 3005e2eac7b047175085d899f7c7fac09a0a67e7 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 08:25:38 -0400 Subject: [PATCH 46/52] Fix style --- services/worker/src/worker/job_runners/split/duckdb_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 1810fa7446..68b2229535 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -18,9 +18,9 @@ from libcommon.exceptions import ( CachedDirectoryNotInitializedError, DatasetNotFoundError, + DuckDBIndexFileNotFoundError, LockedDatasetTimeoutError, NoIndexableColumnsError, - DuckDBIndexFileNotFoundError, ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, From 84687e0e7f92dafacea190bfa99d485bc95c5609 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 11:43:45 -0400 Subject: [PATCH 47/52] Adding some test cases --- services/worker/tests/fixtures/datasets.py | 13 ++-- .../job_runners/split/test_duckdb_index.py | 62 +++++++++++++++++-- 2 files changed, 64 insertions(+), 11 deletions(-) diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index fe6413489d..6e987e20e5 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -147,11 +147,14 @@ def datasets() -> Mapping[str, Dataset]: pd.DataFrame( { "text": [ - "foo", - "bar", - "foobar", - "- Hello there !", - "- General Kenobi !", + ( + "Grand Moff Tarkin and Lord Vader are interrupted in their discussion by the buzz of the" + " comlink" + ), + "There goes another one.", + "Vader turns round and round in circles as his ship spins into space.", + "We count thirty Rebel ships, Lord Vader.", + "The wingman spots the pirateship coming at him and warns the Dark Lord", ] }, dtype=pd.StringDtype(storage="python"), diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 2a8e4b8ad5..5004d9f446 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -1,10 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2023 The HuggingFace Authors. 
+import os +from dataclasses import replace from http import HTTPStatus -from typing import Callable +from typing import Callable, Optional +import duckdb import pytest +import requests from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import upsert_response @@ -118,10 +122,11 @@ def _get_job_runner( @pytest.mark.parametrize( - "hub_dataset_name,expected_error_code", + "hub_dataset_name,max_parquet_size_bytes,expected_error_code", [ - ("duckdb_index", None), - ("public", "NoIndexableColumnsError"), # dataset does not have string columns to index + ("duckdb_index", None, None), + ("duckdb_index", 1_000, "SplitWithTooBigParquetError"), # parquet size is 2812 + ("public", None, "NoIndexableColumnsError"), # dataset does not have string columns to index ], ) def test_compute( @@ -131,6 +136,7 @@ def test_compute( hub_responses_public: HubDatasetTest, hub_responses_duckdb_index: HubDatasetTest, hub_dataset_name: str, + max_parquet_size_bytes: Optional[int], expected_error_code: str, ) -> None: hub_datasets = {"public": hub_responses_public, "duckdb_index": hub_responses_duckdb_index} @@ -155,10 +161,22 @@ def test_compute( content=splits_response, ) + app_config = ( + app_config + if max_parquet_size_bytes is None + else replace( + app_config, duckdb_index=replace(app_config.duckdb_index, max_parquet_size_bytes=max_parquet_size_bytes) + ) + ) + parquet_job_runner = get_parquet_job_runner(dataset, config, app_config) parquet_response = parquet_job_runner.compute() config_parquet = parquet_response.content + # simulate more than one parquet file to index + extra_parquet_file = config_parquet["parquet_files"][0] + config_parquet["parquet_files"].append(extra_parquet_file) + upsert_response( "config-parquet-and-info", dataset=dataset, @@ -179,6 +197,38 @@ def test_compute( response = job_runner.compute() assert response content = response.content - assert content["url"] is not None - assert content["filename"] is not None + url = content["url"] + file_name = content["filename"] + assert url is not None + assert file_name is not None job_runner.post_compute() + + # download locally duckdb index file + duckdb_file = requests.get(url) + with open(file_name, "wb") as f: + f.write(duckdb_file.content) + + duckdb.execute("INSTALL 'fts';") + duckdb.execute("LOAD 'fts';") + con = duckdb.connect(file_name) + + # validate number of inserted records + record_count = con.sql("SELECT COUNT(*) FROM data;").fetchall() + assert record_count is not None + assert isinstance(record_count, list) + assert record_count[0] == (10,) # dataset has 5 rows but since parquet file was duplicate it is 10 + + # perform a search to validate fts feature + query = "Lord Vader" + result = con.execute( + ( + "SELECT fts_main_data.match_bm25(__hf_index_id, ?) 
AS score, text FROM data WHERE score IS NOT NULL" + " ORDER BY score DESC;" + ), + [query], + ) + rows = result.df() + assert rows is not None + + con.close() + os.remove(file_name) From 021ea34b7efd0bcf8fa577377d8abe917db84cdb Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 11:52:13 -0400 Subject: [PATCH 48/52] Remove duplicate code by merge --- services/worker/src/worker/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index d3e66167ef..5045313f3c 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -47,9 +47,6 @@ MAX_IMAGE_PIXELS = 10_000_000_000 # ^ see https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS -MAX_IMAGE_PIXELS = 10_000_000_000 -# ^ see https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS - class JobRunnerInfo(TypedDict): job_type: str From 80a3c214df043abf7bb683ad5a8fe3776fd9186f Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 26 Jun 2023 07:25:45 -0400 Subject: [PATCH 49/52] Fix imports --- services/worker/src/worker/dtos.py | 14 +++++--------- .../src/worker/job_runners/config/parquet.py | 1 - .../worker/job_runners/config/parquet_and_info.py | 6 +++--- .../worker/job_runners/config/parquet_metadata.py | 1 + .../src/worker/job_runners/dataset/parquet.py | 3 ++- .../src/worker/job_runners/split/duckdb_index.py | 3 ++- .../tests/job_runners/config/test_parquet.py | 6 +----- .../job_runners/config/test_parquet_metadata.py | 1 - .../tests/job_runners/dataset/test_parquet.py | 2 +- 9 files changed, 15 insertions(+), 22 deletions(-) diff --git a/services/worker/src/worker/dtos.py b/services/worker/src/worker/dtos.py index ef4a4cc0b3..5eb630d1b3 100644 --- a/services/worker/src/worker/dtos.py +++ b/services/worker/src/worker/dtos.py @@ -4,6 +4,8 @@ from dataclasses import dataclass, field from typing import Any, Dict, List, Mapping, Optional, TypedDict +from libcommon.utils import SplitHubFile + class JobRunnerInfo(TypedDict): job_type: str @@ -110,14 +112,8 @@ class ConfigInfoResponse(TypedDict): dataset_info: Dict[str, Any] -class ParquetFileItem(SplitItem): - url: str - filename: str - size: int - - class ConfigParquetAndInfoResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] dataset_info: Dict[str, Any] @@ -134,7 +130,7 @@ class ConfigParquetMetadataResponse(TypedDict): class ConfigParquetResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] class ConfigSize(TypedDict): @@ -183,7 +179,7 @@ class DatasetIsValidResponse(TypedDict): class DatasetParquetResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] pending: list[PreviousJob] failed: list[PreviousJob] diff --git a/services/worker/src/worker/job_runners/config/parquet.py b/services/worker/src/worker/job_runners/config/parquet.py index 6bba5ca6c0..572df22cd9 100644 --- a/services/worker/src/worker/job_runners/config/parquet.py +++ b/services/worker/src/worker/job_runners/config/parquet.py @@ -6,7 +6,6 @@ from libcommon.constants import PROCESSING_STEP_CONFIG_PARQUET_VERSION from libcommon.exceptions import PreviousStepFormatError from libcommon.simple_cache import get_previous_step_or_raise -from libcommon.utils import SplitHubFile from worker.dtos import CompleteJobResult, ConfigParquetResponse from worker.job_runners.config.config_job_runner import ConfigJobRunner diff --git 
a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index b90b922095..9d6fa16271 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -7,7 +7,7 @@ from functools import partial from multiprocessing.pool import ThreadPool from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Tuple, TypedDict +from typing import Any, List, Optional, Set, Tuple import datasets import datasets.config @@ -74,9 +74,9 @@ from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig, ParquetAndInfoConfig -from worker.dtos import CompleteJobResult, ConfigParquetAndInfoResponse, ParquetFileItem +from worker.dtos import CompleteJobResult, ConfigParquetAndInfoResponse from worker.job_runners.config.config_job_runner import ConfigJobRunnerWithDatasetsCache -from worker.utils import retry, create_branch +from worker.utils import create_branch, hf_hub_url, retry DATASET_TYPE = "dataset" MAX_FILES_PER_DIRECTORY = 10_000 # hf hub limitation diff --git a/services/worker/src/worker/job_runners/config/parquet_metadata.py b/services/worker/src/worker/job_runners/config/parquet_metadata.py index 002e043ee1..55de2fe3da 100644 --- a/services/worker/src/worker/job_runners/config/parquet_metadata.py +++ b/services/worker/src/worker/job_runners/config/parquet_metadata.py @@ -27,6 +27,7 @@ ParquetFileMetadataItem, ) from worker.job_runners.config.config_job_runner import ConfigJobRunner +from worker.utils import get_parquet_file def compute_parquet_metadata_response( diff --git a/services/worker/src/worker/job_runners/dataset/parquet.py b/services/worker/src/worker/job_runners/dataset/parquet.py index 68663da1bf..1f4f839949 100644 --- a/services/worker/src/worker/job_runners/dataset/parquet.py +++ b/services/worker/src/worker/job_runners/dataset/parquet.py @@ -12,12 +12,13 @@ get_previous_step_or_raise, get_response, ) +from libcommon.utils import SplitHubFile + from worker.dtos import ( ConfigParquetResponse, DatasetParquetResponse, JobResult, PreviousJob, - SplitHubFile, ) from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 68b2229535..6ab5d47eae 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -33,8 +33,9 @@ from libcommon.utils import JobInfo, SplitHubFile from worker.config import AppConfig +from worker.dtos import CompleteJobResult from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache -from worker.utils import CompleteJobResult, create_branch, hf_hub_url +from worker.utils import create_branch, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" diff --git a/services/worker/tests/job_runners/config/test_parquet.py b/services/worker/tests/job_runners/config/test_parquet.py index 09202118ce..e314000ac1 100644 --- a/services/worker/tests/job_runners/config/test_parquet.py +++ b/services/worker/tests/job_runners/config/test_parquet.py @@ -12,11 +12,7 @@ from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig -from worker.dtos import ( - ConfigParquetAndInfoResponse, - ConfigParquetResponse, - SplitHubFile, -) +from worker.dtos import ConfigParquetAndInfoResponse, 
ConfigParquetResponse from worker.job_runners.config.parquet import ConfigParquetJobRunner diff --git a/services/worker/tests/job_runners/config/test_parquet_metadata.py b/services/worker/tests/job_runners/config/test_parquet_metadata.py index c1ba370911..21b9ce01e2 100644 --- a/services/worker/tests/job_runners/config/test_parquet_metadata.py +++ b/services/worker/tests/job_runners/config/test_parquet_metadata.py @@ -22,7 +22,6 @@ from worker.dtos import ( ConfigParquetMetadataResponse, ConfigParquetResponse, - SplitHubFile, ParquetFileMetadataItem, ) from worker.job_runners.config.parquet_metadata import ConfigParquetMetadataJobRunner diff --git a/services/worker/tests/job_runners/dataset/test_parquet.py b/services/worker/tests/job_runners/dataset/test_parquet.py index 53d8343b81..8f63b188b0 100644 --- a/services/worker/tests/job_runners/dataset/test_parquet.py +++ b/services/worker/tests/job_runners/dataset/test_parquet.py @@ -12,7 +12,7 @@ from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig -from worker.dtos import ConfigParquetResponse, DatasetParquetResponse, SplitHubFile +from worker.dtos import ConfigParquetResponse, DatasetParquetResponse from worker.job_runners.dataset.parquet import DatasetParquetJobRunner from ..utils import UpstreamResponse From b6f3bd991782cf49840d0bd800f3c942966adff8 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 26 Jun 2023 08:32:22 -0400 Subject: [PATCH 50/52] Apply code review suggestions --- libs/libcommon/src/libcommon/exceptions.py | 78 +++++++++---------- services/worker/pyproject.toml | 2 +- .../job_runners/config/parquet_and_info.py | 11 +-- .../worker/job_runners/split/duckdb_index.py | 35 +++++---- services/worker/src/worker/utils.py | 11 ++- .../job_runners/split/test_duckdb_index.py | 2 + 6 files changed, 73 insertions(+), 66 deletions(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index aa470371ef..46e66228de 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -73,7 +73,7 @@ def as_response(self) -> ErrorResponse: CacheableErrorCode = Literal[ - "CachedDirectoryNotInitializedError", + "CacheDirectoryNotInitializedError", "ConfigNamesError", "CreateCommitError", "DatasetInBlockListError", @@ -140,6 +140,13 @@ def __init__( ) +class CacheDirectoryNotInitializedError(CacheableError): + """Raised when the cache directory has not been initialized before job compute.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "CacheDirectoryNotInitializedError", cause, True) + + class ConfigNamesError(CacheableError): """Raised when the config names could not be fetched.""" @@ -236,6 +243,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooBigExternalFilesError", cause, True) +class DatasetWithTooManyConfigsError(CacheableError): + """Raised when the number of configs of a dataset exceeded the limit.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, True) + + class DatasetWithTooManyExternalFilesError(CacheableError): """Raised when the number of external data files of a dataset is too big.""" @@ -250,11 +264,11 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): 
super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyParquetFilesError", cause, True) -class LockedDatasetTimeoutError(CacheableError): - """Raised when a dataset is locked by another job.""" +class DuckDBIndexFileNotFoundError(CacheableError): + """Raised when no duckdb index file was found for split.""" def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "LockedDatasetTimeoutError", cause, True) + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DuckDBIndexFileNotFoundError", cause, False) class DisabledViewerError(CacheableError): @@ -359,6 +373,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): ) +class LockedDatasetTimeoutError(CacheableError): + """Raised when a dataset is locked by another job.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "LockedDatasetTimeoutError", cause, True) + + class MissingSpawningTokenError(CacheableError): """Raised when the spawning.ai token is not set.""" @@ -373,6 +394,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NormalRowsError", cause, True) +class NoIndexableColumnsError(CacheableError): + """Raised when split does not have string columns to index.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) + + class ParameterMissingError(CacheableError): """Raised when request is missing some parameter.""" @@ -454,6 +482,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): ) +class SplitWithTooBigParquetError(CacheableError): + """Raised when the split parquet size (sum of parquet sizes given) is too big.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) + + class StreamingRowsError(CacheableError): """Raised when the rows could not be fetched in streaming mode.""" @@ -500,38 +535,3 @@ class UnsupportedExternalFilesError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, True) - - -class NoIndexableColumnsError(CacheableError): - """Raised when split does not have string columns to index.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) - - -class DuckDBIndexFileNotFoundError(CacheableError): - """Raised when no duckdb index file was found for split.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DuckDBIndexFileNotFoundError", cause, False) - - -class SplitWithTooBigParquetError(CacheableError): - """Raised when the split parquet size (sum of parquet sizes given) is too big.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) - - -class DatasetWithTooManyConfigsError(CacheableError): - """Raised when the number of configs of a dataset exceeded the limit.""" - - def 
__init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, True) - - -class CachedDirectoryNotInitializedError(CacheableError): - """Raised when the cached directory has not been initialized before job compute.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "CachedDirectoryNotInitializedError", cause, True) diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 1be29673b9..a2e6034dfc 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -12,6 +12,7 @@ aiohttp = "^3.8.4" aiolimiter = "^1.0.0" bs4 = "^0.0.1" conllu = "^4.5.2" +duckdb = "^0.8.1" environs = "^9.5.0" gdown = "^4.6.3" huggingface-hub = { git = "https://github.com/huggingface/huggingface_hub", rev = "1055a56b2d2723b55ba4fdf1f3296e04cfd8d6db" } @@ -45,7 +46,6 @@ transformers = "^4.30.0" trec-car-tools = { path = "vendors/trec-car-tools/python3" } typer = "^0.4.2" wget = "^3.2" -duckdb = "^0.8.1" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 9d6fa16271..77a598abcd 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -76,7 +76,7 @@ from worker.config import AppConfig, ParquetAndInfoConfig from worker.dtos import CompleteJobResult, ConfigParquetAndInfoResponse from worker.job_runners.config.config_job_runner import ConfigJobRunnerWithDatasetsCache -from worker.utils import create_branch, hf_hub_url, retry +from worker.utils import LOCK_GIT_BRANCH_RETRY_SLEEPS, create_branch, hf_hub_url, retry DATASET_TYPE = "dataset" MAX_FILES_PER_DIRECTORY = 10_000 # hf hub limitation @@ -1025,18 +1025,15 @@ def compute_config_parquet_and_info_response( parquet_operations = convert_to_parquet(builder) try: - sleeps = [1, 1, 1, 1, 1, 10, 10, 10, 10, 100] * 3 # ^ timeouts after ~7 minutes - with lock.git_branch(dataset=dataset, branch=target_revision, job_id=job_id, sleeps=sleeps): + with lock.git_branch( + dataset=dataset, branch=target_revision, job_id=job_id, sleeps=LOCK_GIT_BRANCH_RETRY_SLEEPS + ): # create the target revision if we managed to get the parquet files and it does not exist yet # (clone from initial commit to avoid cloning all repo's files) - refs = retry(on=[requests.exceptions.ConnectionError], sleeps=[1, 1, 1, 10, 10])(hf_api.list_repo_refs)( - repo_id=dataset, repo_type=DATASET_TYPE - ) create_branch( dataset=dataset, target_revision=target_revision, - refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api, ) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 6ab5d47eae..32abddf675 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -16,7 +16,7 @@ from libcommon.config import DuckDbIndexConfig from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( - CachedDirectoryNotInitializedError, + CacheDirectoryNotInitializedError, DatasetNotFoundError, DuckDBIndexFileNotFoundError, LockedDatasetTimeoutError, @@ -35,7 +35,7 @@ from worker.config import AppConfig from worker.dtos import 
CompleteJobResult from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache -from worker.utils import create_branch, hf_hub_url +from worker.utils import LOCK_GIT_BRANCH_RETRY_SLEEPS, create_branch, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" @@ -53,7 +53,7 @@ def compute_index_rows( dataset: str, config: str, split: str, - duckdb_index_file_directory: Optional[Path], + duckdb_index_file_directory: Path, target_revision: str, hf_endpoint: str, commit_message: str, @@ -132,9 +132,7 @@ def compute_index_rows( duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) # index all columns - if duckdb_index_file_directory is None: - raise CachedDirectoryNotInitializedError("Cache directory has not been initialized.") - db_path = Path(duckdb_index_file_directory.resolve() / DUCKDB_DEFAULT_INDEX_FILENAME) + db_path = duckdb_index_file_directory.resolve() / DUCKDB_DEFAULT_INDEX_FILENAME con = duckdb.connect(str(db_path.resolve())) logging.debug(CREATE_SEQUENCE_COMMAND) @@ -150,22 +148,21 @@ def compute_index_rows( con.sql(CREATE_INDEX_COMMAND) con.close() - # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) index_file_location = f"{config}/{split}/{DUCKDB_DEFAULT_INDEX_FILENAME}" - try: - refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) - except RepositoryNotFoundError as err: - raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err - - create_branch( - dataset=dataset, target_revision=target_revision, refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api - ) try: - sleeps = [1, 1, 1, 10, 10, 100, 100, 100, 300] - with lock.git_branch(dataset=dataset, branch=target_revision, job_id=job_id, sleeps=sleeps): + with lock.git_branch( + dataset=dataset, branch=target_revision, job_id=job_id, sleeps=LOCK_GIT_BRANCH_RETRY_SLEEPS + ): + create_branch( + dataset=dataset, + target_revision=target_revision, + hf_api=hf_api, + committer_hf_api=committer_hf_api, + ) + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} delete_operations: List[CommitOperation] = [] @@ -190,6 +187,8 @@ def compute_index_rows( target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) except TimeoutError as err: raise LockedDatasetTimeoutError("the dataset is currently locked, please try again later.") from err + except RepositoryNotFoundError as err: + raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err repo_files = [ repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location @@ -246,6 +245,8 @@ def get_job_runner_version() -> int: return PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION def compute(self) -> CompleteJobResult: + if self.cache_subdirectory is None: + raise CacheDirectoryNotInitializedError("Cache directory has not been initialized.") return CompleteJobResult( compute_index_rows( job_id=self.job_info["job_id"], diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 84bb009ee9..c3b985d8b9 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -21,6 +21,7 @@ from urllib.parse import quote import PIL +import requests 
from datasets import ( Dataset, DatasetInfo, @@ -31,7 +32,7 @@ ) from datasets.utils.file_utils import get_authentication_headers_for_url from fsspec.implementations.http import HTTPFileSystem -from huggingface_hub.hf_api import GitRefs, HfApi +from huggingface_hub.hf_api import HfApi from huggingface_hub.utils._errors import RepositoryNotFoundError from libcommon.exceptions import ( DatasetNotFoundError, @@ -335,9 +336,15 @@ def get_parquet_file(url: str, fs: HTTPFileSystem, hf_token: Optional[str]) -> P DATASET_TYPE = "dataset" +LIST_REPO_REFS_RETRY_SLEEPS = [1, 1, 1, 10, 10] +LOCK_GIT_BRANCH_RETRY_SLEEPS = [1, 1, 1, 1, 1, 10, 10, 10, 10, 100] * 3 -def create_branch(dataset: str, target_revision: str, refs: GitRefs, hf_api: HfApi, committer_hf_api: HfApi) -> None: + +def create_branch(dataset: str, target_revision: str, hf_api: HfApi, committer_hf_api: HfApi) -> None: try: + refs = retry(on=[requests.exceptions.ConnectionError], sleeps=LIST_REPO_REFS_RETRY_SLEEPS)( + hf_api.list_repo_refs + )(repo_id=dataset, repo_type=DATASET_TYPE) if all(ref.ref != target_revision for ref in refs.converts): initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id committer_hf_api.create_branch( diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 5004d9f446..94263f4241 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -187,6 +187,7 @@ def test_compute( assert parquet_response job_runner = get_job_runner(dataset, config, split, app_config) + job_runner.pre_compute() if expected_error_code: with pytest.raises(Exception) as e: @@ -232,3 +233,4 @@ def test_compute( con.close() os.remove(file_name) + job_runner.post_compute() \ No newline at end of file From 550f1183c691572429546cf5ec85f2295a60791a Mon Sep 17 00:00:00 2001 From: Andrea Francis Soria Jimenez Date: Mon, 26 Jun 2023 08:33:13 -0400 Subject: [PATCH 51/52] Apply suggestions from code review Co-authored-by: Sylvain Lesage --- tools/docker-compose-datasets-server.yml | 2 +- tools/docker-compose-dev-datasets-server.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 19ce53754e..37b1c87d1f 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -116,7 +116,7 @@ services: DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} DUCKDB_INDEX_COMMIT_MESSAGE: ${DUCKDB_INDEX_COMMIT_MESSAGE-Update duckdb index file} DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} - DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} + DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-refs/convert/parquet} DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES: ${DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES-100_000_000} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index aa74c5d5f4..233e90f253 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -120,7 +120,7 @@ services: DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} DUCKDB_INDEX_COMMIT_MESSAGE: 
${DUCKDB_INDEX_COMMIT_MESSAGE-Update duckdb index files} DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} - DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} + DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-refs/convert/parquet} DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES: ${DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES-100_000_000} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} From 930f6c0d12f04d12cc3dc353148f0551dce41ca2 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 26 Jun 2023 12:03:24 -0400 Subject: [PATCH 52/52] Add test --- .../tests/job_runners/split/test_duckdb_index.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 94263f4241..a8fe40cfc3 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -222,15 +222,21 @@ def test_compute( # perform a search to validate fts feature query = "Lord Vader" result = con.execute( - ( - "SELECT fts_main_data.match_bm25(__hf_index_id, ?) AS score, text FROM data WHERE score IS NOT NULL" - " ORDER BY score DESC;" - ), + "SELECT text FROM data WHERE fts_main_data.match_bm25(__hf_index_id, ?) IS NOT NULL;", [query], ) rows = result.df() assert rows is not None + assert (rows["text"].eq("Vader turns round and round in circles as his ship spins into space.")).any() + assert (rows["text"].eq("The wingman spots the pirateship coming at him and warns the Dark Lord")).any() + assert (rows["text"].eq("We count thirty Rebel ships, Lord Vader.")).any() + assert ( + rows["text"].eq( + "Grand Moff Tarkin and Lord Vader are interrupted in their discussion by the buzz of the comlink" + ) + ).any() + assert not (rows["text"].eq("There goes another one.")).any() con.close() os.remove(file_name) - job_runner.post_compute() \ No newline at end of file + job_runner.post_compute()
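Taken together, these patches have the split-duckdb-index job build an index.duckdb file keyed by an __hf_index_id sequence, create a full-text-search index over all columns, and have the test query it through fts_main_data.match_bm25. A minimal sketch of that indexing and query flow, using two hard-coded rows in place of the split's parquet files and skipping the job's Hub branch/commit step, could look like this (the file name and rows here are illustrative only):

import duckdb

# Sketch only: two in-line rows stand in for the rows read from the split's parquet files.
con = duckdb.connect("index.duckdb")  # mirrors DUCKDB_DEFAULT_INDEX_FILENAME
con.execute("INSTALL 'fts';")
con.execute("LOAD 'fts';")

# Same shape of commands the job runner issues: a sequence for the __hf_index_id key,
# a "data" table built from the source rows, then an FTS index over all columns.
con.sql("CREATE OR REPLACE SEQUENCE serial START 1;")
con.sql(
    "CREATE OR REPLACE TABLE data AS "
    "SELECT nextval('serial') AS __hf_index_id, * "
    "FROM (VALUES ('We count thirty Rebel ships, Lord Vader.'), "
    "('There goes another one.')) AS t(text);"
)
con.sql("PRAGMA create_fts_index('data', '__hf_index_id', '*', overwrite=1);")

# Query it the way the updated test does: matching rows have a non-NULL BM25 score.
rows = con.execute(
    "SELECT text FROM data WHERE fts_main_data.match_bm25(__hf_index_id, ?) IS NOT NULL;",
    ["Lord Vader"],
).fetchall()
print(rows)  # only the sentence that mentions Lord Vader should come back
con.close()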