From 1e419640ef977e12b6489adc0cb4d13b875fbedd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 31 May 2023 07:51:01 -0400 Subject: [PATCH 01/52] Draft files --- services/worker/poetry.lock | 101 +++++++++++++++++- services/worker/pyproject.toml | 2 + .../job_runners/split/index_elasticsearch.py | 20 ++++ .../worker/job_runners/split/index_parquet.py | 52 +++++++++ .../worker/job_runners/split/read_index.py | 22 ++++ 5 files changed, 194 insertions(+), 3 deletions(-) create mode 100644 services/worker/src/worker/job_runners/split/index_elasticsearch.py create mode 100644 services/worker/src/worker/job_runners/split/index_parquet.py create mode 100644 services/worker/src/worker/job_runners/split/read_index.py diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index 3e747ec224..9563703a02 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -986,6 +986,101 @@ files = [ dnssec = ["ecdsa (>=0.13)", "pycryptodome"] idna = ["idna (>=2.1)"] +[[package]] +name = "duckdb" +version = "0.8.0" +description = "DuckDB embedded database" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6455aee00af30770c20f4a8c5e4347918cf59b578f49ee996a13807b12911871"}, + {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8cf0622ae7f86d4ce72791f8928af4357a46824aadf1b6879c7936b3db65344"}, + {file = "duckdb-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6132e8183ca3ae08a593e43c97cb189794077dedd48546e27ce43bd6a51a9c33"}, + {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe29e5343fa2a95f2cde4519a4f4533f4fd551a48d2d9a8ab5220d40ebf53610"}, + {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:945165987ca87c097dc0e578dcf47a100cad77e1c29f5dd8443d53ce159dc22e"}, + {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:673c60daf7ada1d9a8518286a6893ec45efabb64602954af5f3d98f42912fda6"}, + {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5075fe1ff97ae62331ca5c61e3597e6e9f7682a6fdd418c23ba5c4873ed5cd1"}, + {file = "duckdb-0.8.0-cp310-cp310-win32.whl", hash = "sha256:001f5102f45d3d67f389fa8520046c8f55a99e2c6d43b8e68b38ea93261c5395"}, + {file = "duckdb-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb00800f2e1e865584b13221e0121fce9341bb3a39a93e569d563eaed281f528"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2707096d6df4321044fcde2c9f04da632d11a8be60957fd09d49a42fae71a29"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b27df1b70ae74d2c88efb5ffca8490954fdc678099509a9c4404ca30acc53426"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75a97c800271b52dd0f37696d074c50576dcb4b2750b6115932a98696a268070"}, + {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:804cac261a5e016506a6d67838a65d19b06a237f7949f1704f0e800eb708286a"}, + {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6b9abca7fa6713e1d031c18485343b4de99742c7e1b85c10718aa2f31a4e2c6"}, + {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:51aa6d606d49072abcfeb3be209eb559ac94c1b5e70f58ac3adbb94aca9cd69f"}, + {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7c8dc769aaf2be0a1c57995ca657e5b92c1c56fc8437edb720ca6cab571adf14"}, + {file = 
"duckdb-0.8.0-cp311-cp311-win32.whl", hash = "sha256:c4207d18b42387c4a035846d8878eb967070198be8ac26fd77797ce320d1a400"}, + {file = "duckdb-0.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:0c392257547c20794c3072fcbca99a49ef0a49974005d755e93893e2b4875267"}, + {file = "duckdb-0.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2832379e122020814dbe869af7b9ddf3c9f21474cf345531145b099c63ffe17e"}, + {file = "duckdb-0.8.0-cp36-cp36m-win32.whl", hash = "sha256:914896526f7caba86b170f2c4f17f11fd06540325deeb0000cb4fb24ec732966"}, + {file = "duckdb-0.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:022ebda86d0e3204cdc206e4af45aa9f0ae0668b34c2c68cf88e08355af4a372"}, + {file = "duckdb-0.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:96a31c0f3f4ccbf0f5b18f94319f37691205d82f80aae48c6fe04860d743eb2c"}, + {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a07c73c6e6a8cf4ce1a634625e0d1b17e5b817242a8a530d26ed84508dfbdc26"}, + {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424acbd6e857531b06448d757d7c2557938dbddbff0632092090efbf413b4699"}, + {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c83cfd2a868f1acb0692b9c3fd5ef1d7da8faa1348c6eabf421fbf5d8c2f3eb8"}, + {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5c6f6b2d8db56936f662c649539df81856b5a8cb769a31f9544edf18af2a11ff"}, + {file = "duckdb-0.8.0-cp37-cp37m-win32.whl", hash = "sha256:0bd6376b40a512172eaf4aa816813b1b9d68994292ca436ce626ccd5f77f8184"}, + {file = "duckdb-0.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:931221885bcf1e7dfce2400f11fd048a7beef566b775f1453bb1db89b828e810"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:42e7853d963d68e72403ea208bcf806b0f28c7b44db0aa85ce49bb124d56c133"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcc338399175be3d43366576600aef7d72e82114d415992a7a95aded98a0f3fd"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03dd08a4624d6b581a59f9f9dbfd34902416398d16795ad19f92361cf21fd9b5"}, + {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c7c24ea0c9d8563dbd5ad49ccb54b7a9a3c7b8c2833d35e5d32a08549cacea5"}, + {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb58f6505cc0f34b4e976154302d26563d2e5d16b206758daaa04b65e55d9dd8"}, + {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef37ac7880100c4b3f913c8483a29a13f8289313b9a07df019fadfa8e7427544"}, + {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2a4f5ee913ca8a6a069c78f8944b9934ffdbc71fd935f9576fdcea2a6f476f1"}, + {file = "duckdb-0.8.0-cp38-cp38-win32.whl", hash = "sha256:73831c6d7aefcb5f4072cd677b9efebecbf6c578946d21710791e10a1fc41b9a"}, + {file = "duckdb-0.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:faa36d2854734364d234f37d7ef4f3d763b73cd6b0f799cbc2a0e3b7e2575450"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50a31ec237ed619e50f9ab79eb0ec5111eb9697d4475da6e0ab22c08495ce26b"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:351abb4cc2d229d043920c4bc2a4c29ca31a79fef7d7ef8f6011cf4331f297bf"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:568550a163aca6a787bef8313e358590254de3f4019025a8d68c3a61253fedc1"}, + {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:2b82617f0e7f9fc080eda217090d82b42d4fad083bc9f6d58dfda9cecb7e3b29"}, + {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01c9be34d272532b75e8faedda0ff77fa76d1034cde60b8f5768ae85680d6d3"}, + {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8549d6a6bf5f00c012b6916f605416226507e733a3ffc57451682afd6e674d1b"}, + {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d145c6d51e55743c3ed1a74cffa109d9e72f82b07e203b436cfa453c925313a"}, + {file = "duckdb-0.8.0-cp39-cp39-win32.whl", hash = "sha256:f8610dfd21e90d7b04e8598b244bf3ad68599fd6ba0daad3428c03cbfd74dced"}, + {file = "duckdb-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:d0f0f104d30418808bafbe9bccdcd238588a07bd246b3cff13842d60bfd8e8ba"}, + {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, +] + +[[package]] +name = "elastic-transport" +version = "8.4.0" +description = "Transport classes and utilities shared among Python Elastic client libraries" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, + {file = "elastic_transport-8.4.0-py3-none-any.whl", hash = "sha256:19db271ab79c9f70f8c43f8f5b5111408781a6176b54ab2e54d713b6d9ceb815"}, +] + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.26.2,<2" + +[package.extras] +develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "trustme"] + +[[package]] +name = "elasticsearch" +version = "8.8.0" +description = "Python client for Elasticsearch" +category = "main" +optional = false +python-versions = ">=3.6, <4" +files = [ + {file = "elasticsearch-8.8.0-py3-none-any.whl", hash = "sha256:2223ee9daaa3c80c25b28ec3f7c48e66fce6b767a338333d9a81886046a07df6"}, + {file = "elasticsearch-8.8.0.tar.gz", hash = "sha256:6878313cd598c7c90079fed1d4be72e198da35cba57f4083e6bee91f9c70b0eb"}, +] + +[package.dependencies] +elastic-transport = ">=8,<9" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +requests = ["requests (>=2.4.0,<3.0.0)"] + [[package]] name = "environs" version = "9.5.0" @@ -1740,7 +1835,6 @@ optional = false python-versions = "*" files = [ {file = "libclang-15.0.6.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:8621795e07b87e17fc7aac9f071bc7fe6b52ed6110c0a96a9975d8113c8c2527"}, - {file = "libclang-15.0.6.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:0bf192c48a8d2992fc5034393ddc99e772ac30e105df84927d62fc88ef8a659f"}, {file = "libclang-15.0.6.1-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:69b01a23ab543908a661532595daa23cf88bd96d80e41f58ba0eaa6a378fe0d8"}, {file = "libclang-15.0.6.1-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:4a5188184b937132c198ee9de9a8a2316d5fdd1a825398d5ad1a8f5e06f9b40e"}, {file = "libclang-15.0.6.1-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:f7ffa02ac5e586cfffde039dcccc439d88d0feac7d77bf9426d9ba7543d16545"}, @@ -4343,7 +4437,6 @@ files = [ {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, - {file = 
"soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, @@ -4642,6 +4735,8 @@ python-versions = ">=3.8" files = [ {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:db464c88e10e927725997f9b872a21c9d057789d3b7e9a26e4ef1af41d0bcc8c"}, {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:172277c33cb1ae0da19f98c5bcd4946149cfa73c8ea05c6ba18365d58dd3c6f2"}, + {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:9c9b14fbb73ec4cb0f209722a1489020fd8614c92ae22589f2309c48cefdf21f"}, + {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6a54539bd076746f69ae8bef7282f981674fe4dbf59c3a84c4af86ae6bae9d5c"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e3fa53e63672fd71998bbd71cc5478c74dbe5a2d9291d1801c575358c28403c2"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:5499312c21ed3ed47cc6b4cf861896e9564c2c32d8d3c2ef1437c5ca31adfc73"}, {file = "tensorflow_macos-2.12.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:84cb873c90be63efabfecca53fdc48b734a037d0750532b55cb7ce7c343b5cac"}, @@ -5551,4 +5646,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "2a3dd73c87ace648b1ae56a4b2139c6f658a095b4cb24f1d8bf96a5c5f748903" +content-hash = "64b8f4a53abc1ae8c6bed9553eff820bab0171629c43a35d2b61acb3985f7920" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 0e9ff2ef5d..0a14102caf 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -47,6 +47,8 @@ typer = "^0.4.2" wget = "^3.2" mirakuru = "^2.4.2" pytest-asyncio = "^0.21.0" +duckdb = "0.8.0" +elasticsearch = "^8.8.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" diff --git a/services/worker/src/worker/job_runners/split/index_elasticsearch.py b/services/worker/src/worker/job_runners/split/index_elasticsearch.py new file mode 100644 index 0000000000..d788c5993a --- /dev/null +++ b/services/worker/src/worker/job_runners/split/index_elasticsearch.py @@ -0,0 +1,20 @@ +from datasets import load_dataset +from elasticsearch import Elasticsearch +from datetime import datetime + +duorc = load_dataset("LLMs/Alpaca-ShareGPT", split="train") +es = Elasticsearch("http://localhost:9200") +start_time = datetime.now() + +for i, row in enumerate(duorc): + doc = { + "config": "LLMs--Alpaca-ShareGPT", + "split": "train", + "index": i, + "row": row, + } + + es.index(index="LLMs--Alpaca-ShareGPT".lower(), id=i, document=doc) + print(f"indexed row {i}") +end_time = datetime.now() +print(f"Duration: {end_time - start_time}") \ No newline at end of file diff --git a/services/worker/src/worker/job_runners/split/index_parquet.py b/services/worker/src/worker/job_runners/split/index_parquet.py new file mode 100644 index 0000000000..16b32e2bb6 --- /dev/null +++ b/services/worker/src/worker/job_runners/split/index_parquet.py @@ -0,0 +1,52 @@ + +from typing import List +import duckdb 
+import pandas as pd +import requests +from datetime import datetime + +DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" +PARQUET_REVISION="refs/convert/parquet" + +EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT" + +con = duckdb.connect('datasets-server.db') + +def get_parquet_urls(dataset: str) -> List[str]: + splits = requests.get(f"{DATASETS_SERVER_ENDPOINT}/splits?dataset={dataset}", timeout=60).json().get("splits") + split = splits[0] + response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={split['config']}", timeout=60) + if response.status_code != 200: + raise Exception(response) + + response = response.json() + parquet_files = response["parquet_files"] + urls = [content["url"] for content in parquet_files if content["split"] == split["split"]] + if len(urls) == 0: + raise Exception("No parquet files found for dataset") + return urls + +def import_data(): + start_time = datetime.now() + + duckdb.execute("INSTALL 'httpfs';") + duckdb.execute("LOAD 'httpfs';") + duckdb.execute("INSTALL 'fts';") + duckdb.execute("LOAD 'fts';") + # duckdb.sql("select * from duckdb_extensions();").show() + + # Import data + index + parquet_url = get_parquet_urls(EXAMPLE_DATASET_NAME)[0] + print("parquet_url", parquet_url) + con.sql("CREATE SEQUENCE serial START 1;") + # We need a sequence id column for Full text search + # I'm very rusty in SQL so it's very possible there are simpler ways. + + con.sql(f"CREATE TABLE data AS SELECT nextval('serial') AS id, * FROM '{parquet_url}';") + con.sql("PRAGMA create_fts_index('data', 'id', '*');") + + con.sql("DESCRIBE SELECT * FROM data").show() + end_time = datetime.now() + print(f"Duration: {end_time - start_time}") + +import_data() \ No newline at end of file diff --git a/services/worker/src/worker/job_runners/split/read_index.py b/services/worker/src/worker/job_runners/split/read_index.py new file mode 100644 index 0000000000..d4a2e49a54 --- /dev/null +++ b/services/worker/src/worker/job_runners/split/read_index.py @@ -0,0 +1,22 @@ +import duckdb +import pandas as pd + +DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" +PARQUET_REVISION="refs/convert/parquet" + +EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT" + +con = duckdb.connect('datasets-server.db') + +def run_command(query: str) -> pd.DataFrame: + try: + result = con.execute("SELECT fts_main_data.match_bm25(id, ?) 
AS score, id, instruction, input, output FROM data WHERE score IS NOT NULL ORDER BY score DESC;", [query]) + print("Ok") + except Exception as error: + print(f"Error: {str(error)}") + return pd.DataFrame({"Error": [f"❌ {str(error)}"]}) + print(result) + return result.df() + +result = run_command("Jonny Walker") +print(result) \ No newline at end of file From f37a829bba707e862edf8ccbb8ac7956dd357517 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 08:11:34 -0400 Subject: [PATCH 02/52] Adding duckdb index job runner --- libs/libcommon/src/libcommon/config.py | 24 ++++ libs/libcommon/src/libcommon/constants.py | 2 + libs/libcommon/src/libcommon/exceptions.py | 7 + libs/libcommon/src/libcommon/storage.py | 15 ++ .../src/libcommon/viewer_utils/index_utils.py | 20 +++ services/api/poetry.lock | 60 +++++++- services/api/pyproject.toml | 1 + services/worker/src/worker/config.py | 3 + .../worker/src/worker/job_runner_factory.py | 12 +- .../worker/job_runners/split/duckdb_index.py | 129 ++++++++++++++++++ .../job_runners/split/index_elasticsearch.py | 20 --- .../worker/job_runners/split/index_parquet.py | 52 ------- .../worker/job_runners/split/read_index.py | 22 --- services/worker/src/worker/main.py | 4 +- .../worker/src/worker/start_worker_loop.py | 4 +- services/worker/src/worker/utils.py | 4 + 16 files changed, 280 insertions(+), 99 deletions(-) create mode 100644 libs/libcommon/src/libcommon/viewer_utils/index_utils.py create mode 100644 services/worker/src/worker/job_runners/split/duckdb_index.py delete mode 100644 services/worker/src/worker/job_runners/split/index_elasticsearch.py delete mode 100644 services/worker/src/worker/job_runners/split/index_parquet.py delete mode 100644 services/worker/src/worker/job_runners/split/read_index.py diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 716a82ce11..a5de43748b 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -29,6 +29,7 @@ PROCESSING_STEP_SPLIT_IMAGE_URL_COLUMNS_VERSION, PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_COUNT_VERSION, PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_SCAN_VERSION, + PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, ) from libcommon.processing_graph import ProcessingGraphSpecification @@ -104,6 +105,22 @@ def from_env(cls) -> "ParquetMetadataConfig": ) +DUCKDB_INDEX_STORAGE_DIRECTORY = None + + +@dataclass(frozen=True) +class DuckDbIndexConfig: + storage_directory: Optional[str] = DUCKDB_INDEX_STORAGE_DIRECTORY + + @classmethod + def from_env(cls) -> "ParquetMetadataConfig": + env = Env(expand_vars=True) + with env.prefixed("DUCKDB_INDEX_"): + return cls( + storage_directory=env.str(name="STORAGE_DIRECTORY", default=DUCKDB_INDEX_STORAGE_DIRECTORY), + ) + + COMMON_HF_ENDPOINT = "https://huggingface.co" COMMON_HF_TOKEN = None @@ -320,6 +337,13 @@ class ProcessingGraphConfig: "triggered_by": ["dataset-config-names", "config-opt-in-out-urls-count"], "job_runner_version": PROCESSING_STEP_DATASET_OPT_IN_OUT_URLS_COUNT_VERSION, }, + "split-duckdb-index": { + "input_type": "split", + "triggered_by": [ + "split-first-rows-from-streaming", "split-first-rows-from-parquet", "config-parquet", + ], + "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, + } } ) diff --git a/libs/libcommon/src/libcommon/constants.py b/libs/libcommon/src/libcommon/constants.py index 0a3a549420..a9dcbf92f7 100644 --- a/libs/libcommon/src/libcommon/constants.py +++ b/libs/libcommon/src/libcommon/constants.py @@ -6,6 +6,7 @@ CACHE_MONGOENGINE_ALIAS 
= "cache" CACHED_ASSETS_CACHE_APPNAME = "datasets_server_cached_assets" PARQUET_METADATA_CACHE_APPNAME = "datasets_server_parquet_metadata" +DUCKDB_INDEX_CACHE_APPNAME="datasets_server_duckdb_index" METRICS_COLLECTION_CACHE_TOTAL_METRIC = "cacheTotalMetric" METRICS_COLLECTION_JOB_TOTAL_METRIC = "jobTotalMetric" METRICS_MONGOENGINE_ALIAS = "metrics" @@ -35,6 +36,7 @@ PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_COUNT_VERSION = 2 PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_SCAN_VERSION = 4 PROCESSING_STEP_SPLIT_IMAGE_URL_COLUMNS_VERSION = 1 +PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION = 1 PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = 100 PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = 100 diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 0f97f5c699..06721e9e47 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -505,3 +505,10 @@ class UnsupportedExternalFilesError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, True) + + +class NoIndexableColumnsError(CacheableError): + """Raised when split does not have string columns to index.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) diff --git a/libs/libcommon/src/libcommon/storage.py b/libs/libcommon/src/libcommon/storage.py index bbef1442be..46f402df18 100644 --- a/libs/libcommon/src/libcommon/storage.py +++ b/libs/libcommon/src/libcommon/storage.py @@ -13,6 +13,7 @@ ASSETS_CACHE_APPNAME, CACHED_ASSETS_CACHE_APPNAME, PARQUET_METADATA_CACHE_APPNAME, + DUCKDB_INDEX_CACHE_APPNAME, ) StrPath = Union[str, PathLike[str]] @@ -81,6 +82,20 @@ def init_parquet_metadata_dir(directory: Optional[StrPath] = None) -> StrPath: return init_dir(directory, appname=PARQUET_METADATA_CACHE_APPNAME) +def init_duckdb_index_dir(directory: Optional[StrPath] = None) -> StrPath: + """Initialize the duckdb index directory. + + If directory is None, it will be set to the default duckdb index location on the machine. + + Args: + directory (Optional[Union[str, PathLike[str]]], optional): The directory to initialize. Defaults to None. + + Returns: + Union[str, PathLike[str]]: The directory. + """ + return init_dir(directory, appname=DUCKDB_INDEX_CACHE_APPNAME) + + def exists(path: StrPath) -> bool: """Check if a path exists. diff --git a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py new file mode 100644 index 0000000000..22b346a2c9 --- /dev/null +++ b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2022 The HuggingFace Authors. 
+ +from os import makedirs +from pathlib import Path +from libcommon.storage import StrPath +from typing import Tuple + +DATASET_SEPARATOR = "--" +INDEX_DIR_MODE = 0o755 + + +def create_index_dir_split( + dataset: str, config: str, split: str, index_directory: StrPath +) -> Tuple[str, str]: + split_path = dataset / DATASET_SEPARATOR / config / split + dir_path = Path(index_directory).resolve() / split_path + makedirs(dir_path, INDEX_DIR_MODE, exist_ok=True) + return split_path, dir_path + diff --git a/services/api/poetry.lock b/services/api/poetry.lock index a906e4f0b3..9c64231e25 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -711,6 +711,63 @@ files = [ dnssec = ["ecdsa (>=0.13)", "pycryptodome"] idna = ["idna (>=2.1)"] +[[package]] +name = "duckdb" +version = "0.8.0" +description = "DuckDB embedded database" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6455aee00af30770c20f4a8c5e4347918cf59b578f49ee996a13807b12911871"}, + {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8cf0622ae7f86d4ce72791f8928af4357a46824aadf1b6879c7936b3db65344"}, + {file = "duckdb-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6132e8183ca3ae08a593e43c97cb189794077dedd48546e27ce43bd6a51a9c33"}, + {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe29e5343fa2a95f2cde4519a4f4533f4fd551a48d2d9a8ab5220d40ebf53610"}, + {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:945165987ca87c097dc0e578dcf47a100cad77e1c29f5dd8443d53ce159dc22e"}, + {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:673c60daf7ada1d9a8518286a6893ec45efabb64602954af5f3d98f42912fda6"}, + {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5075fe1ff97ae62331ca5c61e3597e6e9f7682a6fdd418c23ba5c4873ed5cd1"}, + {file = "duckdb-0.8.0-cp310-cp310-win32.whl", hash = "sha256:001f5102f45d3d67f389fa8520046c8f55a99e2c6d43b8e68b38ea93261c5395"}, + {file = "duckdb-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb00800f2e1e865584b13221e0121fce9341bb3a39a93e569d563eaed281f528"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2707096d6df4321044fcde2c9f04da632d11a8be60957fd09d49a42fae71a29"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b27df1b70ae74d2c88efb5ffca8490954fdc678099509a9c4404ca30acc53426"}, + {file = "duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75a97c800271b52dd0f37696d074c50576dcb4b2750b6115932a98696a268070"}, + {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:804cac261a5e016506a6d67838a65d19b06a237f7949f1704f0e800eb708286a"}, + {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6b9abca7fa6713e1d031c18485343b4de99742c7e1b85c10718aa2f31a4e2c6"}, + {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:51aa6d606d49072abcfeb3be209eb559ac94c1b5e70f58ac3adbb94aca9cd69f"}, + {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7c8dc769aaf2be0a1c57995ca657e5b92c1c56fc8437edb720ca6cab571adf14"}, + {file = "duckdb-0.8.0-cp311-cp311-win32.whl", hash = "sha256:c4207d18b42387c4a035846d8878eb967070198be8ac26fd77797ce320d1a400"}, + {file = "duckdb-0.8.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:0c392257547c20794c3072fcbca99a49ef0a49974005d755e93893e2b4875267"}, + {file = "duckdb-0.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2832379e122020814dbe869af7b9ddf3c9f21474cf345531145b099c63ffe17e"}, + {file = "duckdb-0.8.0-cp36-cp36m-win32.whl", hash = "sha256:914896526f7caba86b170f2c4f17f11fd06540325deeb0000cb4fb24ec732966"}, + {file = "duckdb-0.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:022ebda86d0e3204cdc206e4af45aa9f0ae0668b34c2c68cf88e08355af4a372"}, + {file = "duckdb-0.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:96a31c0f3f4ccbf0f5b18f94319f37691205d82f80aae48c6fe04860d743eb2c"}, + {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a07c73c6e6a8cf4ce1a634625e0d1b17e5b817242a8a530d26ed84508dfbdc26"}, + {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424acbd6e857531b06448d757d7c2557938dbddbff0632092090efbf413b4699"}, + {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c83cfd2a868f1acb0692b9c3fd5ef1d7da8faa1348c6eabf421fbf5d8c2f3eb8"}, + {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5c6f6b2d8db56936f662c649539df81856b5a8cb769a31f9544edf18af2a11ff"}, + {file = "duckdb-0.8.0-cp37-cp37m-win32.whl", hash = "sha256:0bd6376b40a512172eaf4aa816813b1b9d68994292ca436ce626ccd5f77f8184"}, + {file = "duckdb-0.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:931221885bcf1e7dfce2400f11fd048a7beef566b775f1453bb1db89b828e810"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:42e7853d963d68e72403ea208bcf806b0f28c7b44db0aa85ce49bb124d56c133"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcc338399175be3d43366576600aef7d72e82114d415992a7a95aded98a0f3fd"}, + {file = "duckdb-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03dd08a4624d6b581a59f9f9dbfd34902416398d16795ad19f92361cf21fd9b5"}, + {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c7c24ea0c9d8563dbd5ad49ccb54b7a9a3c7b8c2833d35e5d32a08549cacea5"}, + {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb58f6505cc0f34b4e976154302d26563d2e5d16b206758daaa04b65e55d9dd8"}, + {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef37ac7880100c4b3f913c8483a29a13f8289313b9a07df019fadfa8e7427544"}, + {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2a4f5ee913ca8a6a069c78f8944b9934ffdbc71fd935f9576fdcea2a6f476f1"}, + {file = "duckdb-0.8.0-cp38-cp38-win32.whl", hash = "sha256:73831c6d7aefcb5f4072cd677b9efebecbf6c578946d21710791e10a1fc41b9a"}, + {file = "duckdb-0.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:faa36d2854734364d234f37d7ef4f3d763b73cd6b0f799cbc2a0e3b7e2575450"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50a31ec237ed619e50f9ab79eb0ec5111eb9697d4475da6e0ab22c08495ce26b"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:351abb4cc2d229d043920c4bc2a4c29ca31a79fef7d7ef8f6011cf4331f297bf"}, + {file = "duckdb-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:568550a163aca6a787bef8313e358590254de3f4019025a8d68c3a61253fedc1"}, + {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b82617f0e7f9fc080eda217090d82b42d4fad083bc9f6d58dfda9cecb7e3b29"}, + {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d01c9be34d272532b75e8faedda0ff77fa76d1034cde60b8f5768ae85680d6d3"}, + {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8549d6a6bf5f00c012b6916f605416226507e733a3ffc57451682afd6e674d1b"}, + {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d145c6d51e55743c3ed1a74cffa109d9e72f82b07e203b436cfa453c925313a"}, + {file = "duckdb-0.8.0-cp39-cp39-win32.whl", hash = "sha256:f8610dfd21e90d7b04e8598b244bf3ad68599fd6ba0daad3428c03cbfd74dced"}, + {file = "duckdb-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:d0f0f104d30418808bafbe9bccdcd238588a07bd246b3cff13842d60bfd8e8ba"}, + {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, +] + [[package]] name = "environs" version = "9.5.0" @@ -2880,7 +2937,6 @@ files = [ {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, - {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, @@ -3441,4 +3497,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "4e76b1586360769e88d2439840cbbd3cb91c8b1087d4b17b0e4246d465cc163c" +content-hash = "1cbdff67ee9555ae24c1f162b595c50a5fa9fa2e37c2d3784728b01ebdb5a278" diff --git a/services/api/pyproject.toml b/services/api/pyproject.toml index 16a6fd6f3d..a16ca6c02f 100644 --- a/services/api/pyproject.toml +++ b/services/api/pyproject.toml @@ -19,6 +19,7 @@ starlette = "^0.27.0" starlette-prometheus = "^0.9.0" uvicorn = "^0.20.0" watchdog = { extras = ["watchmedo"], version = "^2.2.1" } +duckdb = "^0.8.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" diff --git a/services/worker/src/worker/config.py b/services/worker/src/worker/config.py index b2c2ecf99c..cc27936f41 100644 --- a/services/worker/src/worker/config.py +++ b/services/worker/src/worker/config.py @@ -13,6 +13,7 @@ ParquetMetadataConfig, ProcessingGraphConfig, QueueConfig, + DuckDbIndexConfig, ) WORKER_CONTENT_MAX_BYTES = 10_000_000 @@ -232,6 +233,7 @@ class AppConfig: worker: WorkerConfig = field(default_factory=WorkerConfig) urls_scan: OptInOutUrlsScanConfig = field(default_factory=OptInOutUrlsScanConfig) parquet_metadata: ParquetMetadataConfig = field(default_factory=ParquetMetadataConfig) + duckdb_index: DuckDbIndexConfig = field(default_factory=DuckDbIndexConfig) @classmethod def from_env(cls) -> "AppConfig": @@ -249,4 +251,5 @@ def from_env(cls) -> "AppConfig": worker=WorkerConfig.from_env(), urls_scan=OptInOutUrlsScanConfig.from_env(), parquet_metadata=ParquetMetadataConfig.from_env(), + duckdb_index=DuckDbIndexConfig.from_env(), ) diff --git a/services/worker/src/worker/job_runner_factory.py 
b/services/worker/src/worker/job_runner_factory.py index 0e4bfe8259..82b071b7e5 100644 --- a/services/worker/src/worker/job_runner_factory.py +++ b/services/worker/src/worker/job_runner_factory.py @@ -47,7 +47,7 @@ from worker.job_runners.split.opt_in_out_urls_scan_from_streaming import ( SplitOptInOutUrlsScanJobRunner, ) - +from worker.job_runners.split.duckdb_index import SplitDuckDbIndexJobRunner class BaseJobRunnerFactory(ABC): """ @@ -73,6 +73,7 @@ class JobRunnerFactory(BaseJobRunnerFactory): hf_datasets_cache: Path assets_directory: StrPath parquet_metadata_directory: StrPath + duckdb_index_directory: StrPath def _create_job_runner(self, job_info: JobInfo) -> JobRunner: job_type = job_info["type"] @@ -213,6 +214,14 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner: processing_step=processing_step, ) + if job_type == SplitDuckDbIndexJobRunner.get_job_type(): + return SplitDuckDbIndexJobRunner( + job_info=job_info, + app_config=self.app_config, + processing_step=processing_step, + duckdb_index_directory=self.duckdb_index_directory, + ) + supported_job_types = [ DatasetConfigNamesJobRunner.get_job_type(), ConfigSplitNamesFromStreamingJobRunner.get_job_type(), @@ -232,5 +241,6 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner: SplitOptInOutUrlsCountJobRunner.get_job_type(), ConfigOptInOutUrlsCountJobRunner.get_job_type(), DatasetOptInOutUrlsCountJobRunner.get_job_type(), + SplitDuckDbIndexJobRunner.get_job_type(), ] raise ValueError(f"Unsupported job type: '{job_type}'. The supported job types are: {supported_job_types}") diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py new file mode 100644 index 0000000000..45586c3ff1 --- /dev/null +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. 
+ +import logging + +import duckdb +from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION +from libcommon.exceptions import ParquetResponseEmptyError, PreviousStepFormatError, NoIndexableColumnsError +from libcommon.processing_graph import ProcessingStep +from libcommon.storage import StrPath +from libcommon.utils import JobInfo +from libcommon.viewer_utils.index_utils import create_index_dir_split + +from worker.config import AppConfig +from worker.job_runners.split.split_job_runner import SplitJobRunner +from worker.utils import ( + CompleteJobResult, + IndexRowsResponse, + get_previous_step_or_raise, +) + +STRING_FEATURE_DTYPE = "string" +VALUE_FEATURE_TYPE = "Value" +DUCKDB_DEFAULT_DB_NAME = "index.db" + +def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: + logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") + + # get the first rows from previous job + upstream_response = get_previous_step_or_raise( + kinds=["split-first-rows-from-streaming", "split-first-rows-from-parquet"], + dataset=dataset, + config=config, + split=split, + ) + try: + first_rows = upstream_response.response["content"] + features = first_rows["features"] + except KeyError as e: + raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e + + # look for string columns using the first rows + string_columns = [ + feature["name"] + for feature in features + if "dtype" in feature["type"] + and "_type" in feature["type"] + and feature["type"]["dtype"] == STRING_FEATURE_DTYPE + and feature["type"]["_type"] == VALUE_FEATURE_TYPE + ] + + if not string_columns: + raise NoIndexableColumnsError("No string columns available to index.") + + # get list of parquet urls + config_parquet = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) + try: + parquet_files = config_parquet.response["content"]["parquet_files"] + parquet_urls = [content["url"] for content in parquet_files if content["split"] == split] + + if not parquet_urls: + raise ParquetResponseEmptyError("No parquet files found.") + except Exception as e: + raise PreviousStepFormatError("Previous step did not return the expected content.") from e + + # create duckdb index location + # TODO: Need to manage re index, maybe delete folder/file or perform a table drop/delete? + split_path, dir_path = create_index_dir_split( + dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory + ) + duck_db_name = split_path / DUCKDB_DEFAULT_DB_NAME + db_location = dir_path / DUCKDB_DEFAULT_DB_NAME + + # configure duckdb extensions + duckdb.execute("INSTALL 'httpfs';") + duckdb.execute("LOAD 'httpfs';") + duckdb.execute("INSTALL 'fts';") + duckdb.execute("LOAD 'fts';") + logging.info(str(db_location)) + + # index + con = duckdb.connect(str(db_location)) + con.sql("CREATE SEQUENCE serial START 1;") + # TODO: We need a sequence id column for Full text search, maybe there is a better way + filter_columns = ",".join(string_columns) # TODO: What if already exists an id? 
need to create an identity column + con.sql( + f"CREATE TABLE data AS SELECT nextval('serial') AS id, {filter_columns} FROM read_parquet({parquet_urls});" + ) + con.sql("PRAGMA create_fts_index('data', 'id', '*');") + + return IndexRowsResponse( + duckdb_db_name=str(duck_db_name) + ) + + +class SplitDuckDbIndexJobRunner(SplitJobRunner): + duckdb_index_directory: StrPath + + def __init__( + self, + job_info: JobInfo, + app_config: AppConfig, + processing_step: ProcessingStep, + duckdb_index_directory: StrPath, + ) -> None: + super().__init__( + job_info=job_info, + app_config=app_config, + processing_step=processing_step, + ) + self.duckdb_index_directory = duckdb_index_directory + + @staticmethod + def get_job_type() -> str: + return "split-duckdb-index" + + @staticmethod + def get_job_runner_version() -> int: + return PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION + + def compute(self) -> CompleteJobResult: + return CompleteJobResult( + compute_index_rows( + dataset=self.dataset, + config=self.config, + split=self.split, + assets_directory=self.assets_directory, + ) + ) diff --git a/services/worker/src/worker/job_runners/split/index_elasticsearch.py b/services/worker/src/worker/job_runners/split/index_elasticsearch.py deleted file mode 100644 index d788c5993a..0000000000 --- a/services/worker/src/worker/job_runners/split/index_elasticsearch.py +++ /dev/null @@ -1,20 +0,0 @@ -from datasets import load_dataset -from elasticsearch import Elasticsearch -from datetime import datetime - -duorc = load_dataset("LLMs/Alpaca-ShareGPT", split="train") -es = Elasticsearch("http://localhost:9200") -start_time = datetime.now() - -for i, row in enumerate(duorc): - doc = { - "config": "LLMs--Alpaca-ShareGPT", - "split": "train", - "index": i, - "row": row, - } - - es.index(index="LLMs--Alpaca-ShareGPT".lower(), id=i, document=doc) - print(f"indexed row {i}") -end_time = datetime.now() -print(f"Duration: {end_time - start_time}") \ No newline at end of file diff --git a/services/worker/src/worker/job_runners/split/index_parquet.py b/services/worker/src/worker/job_runners/split/index_parquet.py deleted file mode 100644 index 16b32e2bb6..0000000000 --- a/services/worker/src/worker/job_runners/split/index_parquet.py +++ /dev/null @@ -1,52 +0,0 @@ - -from typing import List -import duckdb -import pandas as pd -import requests -from datetime import datetime - -DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" -PARQUET_REVISION="refs/convert/parquet" - -EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT" - -con = duckdb.connect('datasets-server.db') - -def get_parquet_urls(dataset: str) -> List[str]: - splits = requests.get(f"{DATASETS_SERVER_ENDPOINT}/splits?dataset={dataset}", timeout=60).json().get("splits") - split = splits[0] - response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={split['config']}", timeout=60) - if response.status_code != 200: - raise Exception(response) - - response = response.json() - parquet_files = response["parquet_files"] - urls = [content["url"] for content in parquet_files if content["split"] == split["split"]] - if len(urls) == 0: - raise Exception("No parquet files found for dataset") - return urls - -def import_data(): - start_time = datetime.now() - - duckdb.execute("INSTALL 'httpfs';") - duckdb.execute("LOAD 'httpfs';") - duckdb.execute("INSTALL 'fts';") - duckdb.execute("LOAD 'fts';") - # duckdb.sql("select * from duckdb_extensions();").show() - - # Import data + index - parquet_url = get_parquet_urls(EXAMPLE_DATASET_NAME)[0] - 
print("parquet_url", parquet_url) - con.sql("CREATE SEQUENCE serial START 1;") - # We need a sequence id column for Full text search - # I'm very rusty in SQL so it's very possible there are simpler ways. - - con.sql(f"CREATE TABLE data AS SELECT nextval('serial') AS id, * FROM '{parquet_url}';") - con.sql("PRAGMA create_fts_index('data', 'id', '*');") - - con.sql("DESCRIBE SELECT * FROM data").show() - end_time = datetime.now() - print(f"Duration: {end_time - start_time}") - -import_data() \ No newline at end of file diff --git a/services/worker/src/worker/job_runners/split/read_index.py b/services/worker/src/worker/job_runners/split/read_index.py deleted file mode 100644 index d4a2e49a54..0000000000 --- a/services/worker/src/worker/job_runners/split/read_index.py +++ /dev/null @@ -1,22 +0,0 @@ -import duckdb -import pandas as pd - -DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" -PARQUET_REVISION="refs/convert/parquet" - -EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT" - -con = duckdb.connect('datasets-server.db') - -def run_command(query: str) -> pd.DataFrame: - try: - result = con.execute("SELECT fts_main_data.match_bm25(id, ?) AS score, id, instruction, input, output FROM data WHERE score IS NOT NULL ORDER BY score DESC;", [query]) - print("Ok") - except Exception as error: - print(f"Error: {str(error)}") - return pd.DataFrame({"Error": [f"❌ {str(error)}"]}) - print(result) - return result.df() - -result = run_command("Jonny Walker") -print(result) \ No newline at end of file diff --git a/services/worker/src/worker/main.py b/services/worker/src/worker/main.py index da297ccc67..31d6686956 100644 --- a/services/worker/src/worker/main.py +++ b/services/worker/src/worker/main.py @@ -6,7 +6,7 @@ from libcommon.log import init_logging from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource -from libcommon.storage import init_assets_dir, init_parquet_metadata_dir +from libcommon.storage import init_assets_dir, init_parquet_metadata_dir, init_duckdb_index_dir from worker.config import AppConfig from worker.executor import WorkerExecutor @@ -27,6 +27,7 @@ # ^ set first to have logs as soon as possible assets_directory = init_assets_dir(directory=app_config.assets.storage_directory) parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory) + duckdb_index_directory = init_duckdb_index_dir(directory=app_config.duckdb_index.storage_directory) processing_graph = ProcessingGraph(app_config.processing_graph.specification) @@ -54,6 +55,7 @@ hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, + duckdb_index_directory=duckdb_index_directory, ) worker_executor = WorkerExecutor( app_config=app_config, diff --git a/services/worker/src/worker/start_worker_loop.py b/services/worker/src/worker/start_worker_loop.py index 92be5d69bd..3e66ea1ed5 100644 --- a/services/worker/src/worker/start_worker_loop.py +++ b/services/worker/src/worker/start_worker_loop.py @@ -6,7 +6,7 @@ from libcommon.log import init_logging from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource -from libcommon.storage import init_assets_dir, init_parquet_metadata_dir +from libcommon.storage import init_assets_dir, init_parquet_metadata_dir, init_duckdb_index_dir from worker.config import AppConfig from worker.job_runner_factory import 
JobRunnerFactory @@ -26,6 +26,7 @@ # ^ set first to have logs as soon as possible assets_directory = init_assets_dir(directory=app_config.assets.storage_directory) parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory) + duckdb_index_directory = init_duckdb_index_dir(directory=app_config.duckdb_index.storage_directory) processing_graph = ProcessingGraph(app_config.processing_graph.specification) @@ -53,6 +54,7 @@ hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, + duckdb_index_directory=duckdb_index_directory, ) loop = Loop( library_cache_paths=libraries_resource.storage_paths, diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 17ccd75b33..69aaba10c9 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -132,6 +132,10 @@ class ImageUrlColumnsResponse(TypedDict): columns: List[str] +class IndexRowsResponse(TypedDict): + duckdb_db_name: str + + Row = Mapping[str, Any] From 340d85edf017c38cce0033cce09ff36b18a309eb Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 08:21:01 -0400 Subject: [PATCH 03/52] Fix style --- libs/libcommon/src/libcommon/config.py | 10 ++++++---- libs/libcommon/src/libcommon/constants.py | 2 +- libs/libcommon/src/libcommon/exceptions.py | 1 + libs/libcommon/src/libcommon/storage.py | 2 +- .../src/libcommon/viewer_utils/index_utils.py | 10 ++++------ services/worker/src/worker/config.py | 2 +- services/worker/src/worker/job_runner_factory.py | 3 ++- .../src/worker/job_runners/split/duckdb_index.py | 16 +++++++++------- services/worker/src/worker/main.py | 6 +++++- services/worker/src/worker/start_worker_loop.py | 6 +++++- services/worker/tests/conftest.py | 12 +++++++++++- services/worker/tests/test_executor.py | 2 ++ services/worker/tests/test_job_runner_factory.py | 2 ++ 13 files changed, 50 insertions(+), 24 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index a5de43748b..47031b0ab4 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -24,12 +24,12 @@ PROCESSING_STEP_DATASET_PARQUET_VERSION, PROCESSING_STEP_DATASET_SIZE_VERSION, PROCESSING_STEP_DATASET_SPLIT_NAMES_VERSION, + PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION, PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION, PROCESSING_STEP_SPLIT_IMAGE_URL_COLUMNS_VERSION, PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_COUNT_VERSION, PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_SCAN_VERSION, - PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, ) from libcommon.processing_graph import ProcessingGraphSpecification @@ -113,7 +113,7 @@ class DuckDbIndexConfig: storage_directory: Optional[str] = DUCKDB_INDEX_STORAGE_DIRECTORY @classmethod - def from_env(cls) -> "ParquetMetadataConfig": + def from_env(cls) -> "DuckDbIndexConfig": env = Env(expand_vars=True) with env.prefixed("DUCKDB_INDEX_"): return cls( @@ -340,10 +340,12 @@ class ProcessingGraphConfig: "split-duckdb-index": { "input_type": "split", "triggered_by": [ - "split-first-rows-from-streaming", "split-first-rows-from-parquet", "config-parquet", + "split-first-rows-from-streaming", + "split-first-rows-from-parquet", + "config-parquet", ], "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, - } + }, } ) diff --git a/libs/libcommon/src/libcommon/constants.py 
b/libs/libcommon/src/libcommon/constants.py index a9dcbf92f7..26f089f970 100644 --- a/libs/libcommon/src/libcommon/constants.py +++ b/libs/libcommon/src/libcommon/constants.py @@ -6,7 +6,7 @@ CACHE_MONGOENGINE_ALIAS = "cache" CACHED_ASSETS_CACHE_APPNAME = "datasets_server_cached_assets" PARQUET_METADATA_CACHE_APPNAME = "datasets_server_parquet_metadata" -DUCKDB_INDEX_CACHE_APPNAME="datasets_server_duckdb_index" +DUCKDB_INDEX_CACHE_APPNAME = "datasets_server_duckdb_index" METRICS_COLLECTION_CACHE_TOTAL_METRIC = "cacheTotalMetric" METRICS_COLLECTION_JOB_TOTAL_METRIC = "jobTotalMetric" METRICS_MONGOENGINE_ALIAS = "metrics" diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 06721e9e47..d9cead1daa 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -100,6 +100,7 @@ def as_response(self) -> ErrorResponse: "JobManagerCrashedError", "JobManagerExceededMaximumDurationError", "MissingSpawningTokenError", + "NoIndexableColumnsError", "NormalRowsError", "ParameterMissingError", "ParquetResponseEmptyError", diff --git a/libs/libcommon/src/libcommon/storage.py b/libs/libcommon/src/libcommon/storage.py index 46f402df18..5a8230107e 100644 --- a/libs/libcommon/src/libcommon/storage.py +++ b/libs/libcommon/src/libcommon/storage.py @@ -12,8 +12,8 @@ from libcommon.constants import ( ASSETS_CACHE_APPNAME, CACHED_ASSETS_CACHE_APPNAME, - PARQUET_METADATA_CACHE_APPNAME, DUCKDB_INDEX_CACHE_APPNAME, + PARQUET_METADATA_CACHE_APPNAME, ) StrPath = Union[str, PathLike[str]] diff --git a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py index 22b346a2c9..d00b4754cc 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py +++ b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py @@ -3,18 +3,16 @@ from os import makedirs from pathlib import Path -from libcommon.storage import StrPath from typing import Tuple +from libcommon.storage import StrPath + DATASET_SEPARATOR = "--" INDEX_DIR_MODE = 0o755 -def create_index_dir_split( - dataset: str, config: str, split: str, index_directory: StrPath -) -> Tuple[str, str]: - split_path = dataset / DATASET_SEPARATOR / config / split +def create_index_dir_split(dataset: str, config: str, split: str, index_directory: StrPath) -> Tuple[str, Path]: + split_path = f"{dataset}/{DATASET_SEPARATOR}/{config}/{split}" dir_path = Path(index_directory).resolve() / split_path makedirs(dir_path, INDEX_DIR_MODE, exist_ok=True) return split_path, dir_path - diff --git a/services/worker/src/worker/config.py b/services/worker/src/worker/config.py index cc27936f41..7a0697d2e2 100644 --- a/services/worker/src/worker/config.py +++ b/services/worker/src/worker/config.py @@ -9,11 +9,11 @@ AssetsConfig, CacheConfig, CommonConfig, + DuckDbIndexConfig, LogConfig, ParquetMetadataConfig, ProcessingGraphConfig, QueueConfig, - DuckDbIndexConfig, ) WORKER_CONTENT_MAX_BYTES = 10_000_000 diff --git a/services/worker/src/worker/job_runner_factory.py b/services/worker/src/worker/job_runner_factory.py index 82b071b7e5..bd73c19c46 100644 --- a/services/worker/src/worker/job_runner_factory.py +++ b/services/worker/src/worker/job_runner_factory.py @@ -34,6 +34,7 @@ from worker.job_runners.dataset.parquet import DatasetParquetJobRunner from worker.job_runners.dataset.size import DatasetSizeJobRunner from worker.job_runners.dataset.split_names import DatasetSplitNamesJobRunner +from worker.job_runners.split.duckdb_index import 
SplitDuckDbIndexJobRunner from worker.job_runners.split.first_rows_from_parquet import ( SplitFirstRowsFromParquetJobRunner, ) @@ -47,7 +48,7 @@ from worker.job_runners.split.opt_in_out_urls_scan_from_streaming import ( SplitOptInOutUrlsScanJobRunner, ) -from worker.job_runners.split.duckdb_index import SplitDuckDbIndexJobRunner + class BaseJobRunnerFactory(ABC): """ diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 45586c3ff1..19f0b0e368 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -5,7 +5,11 @@ import duckdb from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION -from libcommon.exceptions import ParquetResponseEmptyError, PreviousStepFormatError, NoIndexableColumnsError +from libcommon.exceptions import ( + NoIndexableColumnsError, + ParquetResponseEmptyError, + PreviousStepFormatError, +) from libcommon.processing_graph import ProcessingStep from libcommon.storage import StrPath from libcommon.utils import JobInfo @@ -23,6 +27,7 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_DB_NAME = "index.db" + def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") @@ -68,7 +73,7 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc split_path, dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) - duck_db_name = split_path / DUCKDB_DEFAULT_DB_NAME + duck_db_name = f"{split_path}/{DUCKDB_DEFAULT_DB_NAME}" db_location = dir_path / DUCKDB_DEFAULT_DB_NAME # configure duckdb extensions @@ -76,7 +81,6 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc duckdb.execute("LOAD 'httpfs';") duckdb.execute("INSTALL 'fts';") duckdb.execute("LOAD 'fts';") - logging.info(str(db_location)) # index con = duckdb.connect(str(db_location)) @@ -88,9 +92,7 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc ) con.sql("PRAGMA create_fts_index('data', 'id', '*');") - return IndexRowsResponse( - duckdb_db_name=str(duck_db_name) - ) + return IndexRowsResponse(duckdb_db_name=duck_db_name) class SplitDuckDbIndexJobRunner(SplitJobRunner): @@ -124,6 +126,6 @@ def compute(self) -> CompleteJobResult: dataset=self.dataset, config=self.config, split=self.split, - assets_directory=self.assets_directory, + duckdb_index_directory=self.duckdb_index_directory, ) ) diff --git a/services/worker/src/worker/main.py b/services/worker/src/worker/main.py index 31d6686956..5a866aa74f 100644 --- a/services/worker/src/worker/main.py +++ b/services/worker/src/worker/main.py @@ -6,7 +6,11 @@ from libcommon.log import init_logging from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource -from libcommon.storage import init_assets_dir, init_parquet_metadata_dir, init_duckdb_index_dir +from libcommon.storage import ( + init_assets_dir, + init_duckdb_index_dir, + init_parquet_metadata_dir, +) from worker.config import AppConfig from worker.executor import WorkerExecutor diff --git a/services/worker/src/worker/start_worker_loop.py b/services/worker/src/worker/start_worker_loop.py index 3e66ea1ed5..039f69811a 100644 --- a/services/worker/src/worker/start_worker_loop.py +++ 
b/services/worker/src/worker/start_worker_loop.py @@ -6,7 +6,11 @@ from libcommon.log import init_logging from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource -from libcommon.storage import init_assets_dir, init_parquet_metadata_dir, init_duckdb_index_dir +from libcommon.storage import ( + init_assets_dir, + init_duckdb_index_dir, + init_parquet_metadata_dir, +) from worker.config import AppConfig from worker.job_runner_factory import JobRunnerFactory diff --git a/services/worker/tests/conftest.py b/services/worker/tests/conftest.py index 987c2b0d57..5b32a89726 100644 --- a/services/worker/tests/conftest.py +++ b/services/worker/tests/conftest.py @@ -8,7 +8,12 @@ from libcommon.queue import _clean_queue_database from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import _clean_cache_database -from libcommon.storage import StrPath, init_assets_dir, init_parquet_metadata_dir +from libcommon.storage import ( + StrPath, + init_assets_dir, + init_duckdb_index_dir, + init_parquet_metadata_dir, +) from pytest import MonkeyPatch, fixture from worker.config import AppConfig @@ -114,6 +119,11 @@ def parquet_metadata_directory(app_config: AppConfig) -> StrPath: return init_parquet_metadata_dir(app_config.parquet_metadata.storage_directory) +@fixture +def duckdb_index_directory(app_config: AppConfig) -> StrPath: + return init_duckdb_index_dir(app_config.duckdb_index.storage_directory) + + @fixture def test_processing_graph() -> ProcessingGraph: return ProcessingGraph( diff --git a/services/worker/tests/test_executor.py b/services/worker/tests/test_executor.py index a34fa1f3aa..4dc2c47862 100644 --- a/services/worker/tests/test_executor.py +++ b/services/worker/tests/test_executor.py @@ -199,6 +199,7 @@ def job_runner_factory( libraries_resource: LibrariesResource, assets_directory: StrPath, parquet_metadata_directory: StrPath, + duckdb_index_directory: StrPath, ) -> JobRunnerFactory: processing_graph = ProcessingGraph(app_config.processing_graph.specification) return JobRunnerFactory( @@ -207,6 +208,7 @@ def job_runner_factory( hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, + duckdb_index_directory=duckdb_index_directory, ) diff --git a/services/worker/tests/test_job_runner_factory.py b/services/worker/tests/test_job_runner_factory.py index 3ed3b0e7e6..982c0ae2a5 100644 --- a/services/worker/tests/test_job_runner_factory.py +++ b/services/worker/tests/test_job_runner_factory.py @@ -39,6 +39,7 @@ def test_create_job_runner( libraries_resource: LibrariesResource, assets_directory: StrPath, parquet_metadata_directory: StrPath, + duckdb_index_directory: StrPath, job_type: str, expected_job_runner: Optional[str], ) -> None: @@ -48,6 +49,7 @@ def test_create_job_runner( hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, + duckdb_index_directory=duckdb_index_directory, ) job_info: JobInfo = { "type": job_type, From c53af5f6a8f80e165ec678937630073bce22f42e Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 08:33:23 -0400 Subject: [PATCH 04/52] WIP adding fts on API --- services/api/src/api/app.py | 12 ++++ services/api/src/api/routes/fts.py | 95 ++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 services/api/src/api/routes/fts.py diff --git 
a/services/api/src/api/app.py b/services/api/src/api/app.py index 13479c33ad..8b291f56f1 100644 --- a/services/api/src/api/app.py +++ b/services/api/src/api/app.py @@ -16,6 +16,7 @@ from api.config import AppConfig, EndpointConfig, UvicornConfig from api.jwt_token import fetch_jwt_public_key from api.routes.endpoint import EndpointsDefinition, create_endpoint +from api.routes.fts import create_fts_endpoint from api.routes.healthcheck import healthcheck_endpoint from api.routes.metrics import create_metrics_endpoint from api.routes.rows import create_rows_endpoint @@ -121,6 +122,17 @@ def create_app_with_config(app_config: AppConfig, endpoint_config: EndpointConfi max_age_short=app_config.api.max_age_short, ), ), + Route( + "/fts", + endpoint=create_fts_endpoint( + processing_graph=processing_graph, + cached_assets_directory=cached_assets_directory, + hf_endpoint=app_config.common.hf_endpoint, + hf_token=app_config.common.hf_token, + max_age_long=app_config.api.max_age_long, + max_age_short=app_config.api.max_age_short, + ), + ), ] return Starlette(routes=routes, middleware=middleware, on_shutdown=[resource.release for resource in resources]) diff --git a/services/api/src/api/routes/fts.py b/services/api/src/api/routes/fts.py new file mode 100644 index 0000000000..833766f824 --- /dev/null +++ b/services/api/src/api/routes/fts.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2022 The HuggingFace Authors. + +import logging +from os import PathLike +from typing import List, Optional, Set, Union + +import duckdb +from libcommon.processing_graph import ProcessingGraph +from libcommon.prometheus import StepProfiler +from libcommon.simple_cache import get_valid_datasets +from starlette.requests import Request +from starlette.responses import Response + +from api.routes.endpoint import get_cache_entry_from_steps +from api.utils import ( + Endpoint, + MissingRequiredParameterError, + UnexpectedError, + are_valid_parameters, + get_json_api_error_response, + get_json_ok_response, +) + + +def get_valid(processing_graph: ProcessingGraph) -> List[str]: + # a dataset is considered valid if at least one response for PROCESSING_STEPS_FOR_VALID + # is valid. 
+ datasets: Optional[Set[str]] = None + for processing_step in processing_graph.get_processing_steps_required_by_dataset_viewer(): + kind_datasets = get_valid_datasets(kind=processing_step.cache_kind) + if datasets is None: + # first iteration fills the set of datasets + datasets = kind_datasets + else: + # next iterations remove the datasets that miss a required processing step + datasets.intersection_update(kind_datasets) + # note that the list is sorted alphabetically for consistency + return [] if datasets is None else sorted(datasets) + + +StrPath = Union[str, PathLike[str]] + + +def create_fts_endpoint( + processing_graph: ProcessingGraph, + cached_assets_directory: StrPath, + hf_endpoint: str, + max_age_long: int = 0, + max_age_short: int = 0, + hf_token: Optional[str] = None, +) -> Endpoint: + async def fts_endpoint(request: Request) -> Response: + with StepProfiler(method="fts_endpoint", step="all"): + try: + logging.info("/fts") + # processing_step = processing_graph.get_processing_step("split-duckdb-index") + dataset = request.query_params.get("dataset") + config = request.query_params.get("config") + split = request.query_params.get("split") + query = request.query_params.get("query") + if not dataset or not config or not split or not are_valid_parameters([dataset, config, split]): + raise MissingRequiredParameterError("Parameter 'dataset', 'config' and 'split' are required") + if not query: + raise MissingRequiredParameterError("Parameter 'query' is required") + # upstream_result = get_cache_entry_from_steps( + # processing_steps=[processing_step], + # dataset=dataset, + # config=config, + # split=split, + # processing_graph=processing_graph, + # hf_endpoint=hf_endpoint, + # hf_token=hf_token, + # ) + # content = result["content"] + # duck_db_name = content["duckdb_db_name"] + + except Exception as e: + with StepProfiler(method="fts_endpoint", step="generate API error response"): + return get_json_api_error_response(UnexpectedError("Unexpected error.", e), max_age=max_age_short) + duckdb.execute("INSTALL 'fts';") + duckdb.execute("LOAD 'fts';") + # db_location = cached_assets_directory / duck_db_name + db_location = "/tmp/asoria/openfire/--/default/train/index.db" + con = duckdb.connect(str(db_location)) + result = con.execute( + ( + "SELECT fts_main_data.match_bm25(id, ?) 
AS score, * FROM data WHERE score IS NOT NULL ORDER BY" + " score DESC;" + ), + [query], + ).df() + return get_json_ok_response({"result": result.to_json()}, max_age=max_age_long) + + return fts_endpoint From 8cac1c54efe99d9441a2b1a336d98de0036c78fd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 16:01:33 -0400 Subject: [PATCH 05/52] Remove non used code --- libs/libcommon/src/libcommon/exceptions.py | 8 ++ services/api/src/api/app.py | 12 --- services/api/src/api/routes/fts.py | 95 ------------------- .../worker/job_runners/split/duckdb_index.py | 28 ++++-- 4 files changed, 28 insertions(+), 115 deletions(-) delete mode 100644 services/api/src/api/routes/fts.py diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index d9cead1daa..45dcbbcce7 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -116,6 +116,7 @@ def as_response(self) -> ErrorResponse: "TooManyColumnsError", "UnexpectedError", "UnsupportedExternalFilesError", + "UnsupportedIndexableColumnsError", ] @@ -513,3 +514,10 @@ class NoIndexableColumnsError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) + + +class UnsupportedIndexableColumnsError(CacheableError): + """Raised when some unsupported indexable columns present.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedIndexableColumnsError", cause, True) diff --git a/services/api/src/api/app.py b/services/api/src/api/app.py index 8b291f56f1..13479c33ad 100644 --- a/services/api/src/api/app.py +++ b/services/api/src/api/app.py @@ -16,7 +16,6 @@ from api.config import AppConfig, EndpointConfig, UvicornConfig from api.jwt_token import fetch_jwt_public_key from api.routes.endpoint import EndpointsDefinition, create_endpoint -from api.routes.fts import create_fts_endpoint from api.routes.healthcheck import healthcheck_endpoint from api.routes.metrics import create_metrics_endpoint from api.routes.rows import create_rows_endpoint @@ -122,17 +121,6 @@ def create_app_with_config(app_config: AppConfig, endpoint_config: EndpointConfi max_age_short=app_config.api.max_age_short, ), ), - Route( - "/fts", - endpoint=create_fts_endpoint( - processing_graph=processing_graph, - cached_assets_directory=cached_assets_directory, - hf_endpoint=app_config.common.hf_endpoint, - hf_token=app_config.common.hf_token, - max_age_long=app_config.api.max_age_long, - max_age_short=app_config.api.max_age_short, - ), - ), ] return Starlette(routes=routes, middleware=middleware, on_shutdown=[resource.release for resource in resources]) diff --git a/services/api/src/api/routes/fts.py b/services/api/src/api/routes/fts.py deleted file mode 100644 index 833766f824..0000000000 --- a/services/api/src/api/routes/fts.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2022 The HuggingFace Authors. 
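
For reference, the prototype /fts route above boils down to a single BM25 query against an index file produced by the split-duckdb-index job runner. A minimal standalone sketch of that query, assuming an index built with the row id column named id as in the prototype (later patches rename it to __id); the database path and search string are placeholders, not values from the codebase:

    import duckdb

    # Placeholders: a local index file written by the job runner, and a search term.
    db_location = "/tmp/duckdb-index/some-dataset/default/train/index.db"
    search_query = "fire"

    con = duckdb.connect(db_location)
    con.execute("INSTALL 'fts';")
    con.execute("LOAD 'fts';")
    # BM25 ranking over every indexed column of the 'data' table, best matches first.
    results = con.execute(
        "SELECT fts_main_data.match_bm25(id, ?) AS score, * "
        "FROM data WHERE score IS NOT NULL ORDER BY score DESC;",
        [search_query],
    ).df()  # .df() needs pandas; .fetchall() works without it
    print(results.head())
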
- -import logging -from os import PathLike -from typing import List, Optional, Set, Union - -import duckdb -from libcommon.processing_graph import ProcessingGraph -from libcommon.prometheus import StepProfiler -from libcommon.simple_cache import get_valid_datasets -from starlette.requests import Request -from starlette.responses import Response - -from api.routes.endpoint import get_cache_entry_from_steps -from api.utils import ( - Endpoint, - MissingRequiredParameterError, - UnexpectedError, - are_valid_parameters, - get_json_api_error_response, - get_json_ok_response, -) - - -def get_valid(processing_graph: ProcessingGraph) -> List[str]: - # a dataset is considered valid if at least one response for PROCESSING_STEPS_FOR_VALID - # is valid. - datasets: Optional[Set[str]] = None - for processing_step in processing_graph.get_processing_steps_required_by_dataset_viewer(): - kind_datasets = get_valid_datasets(kind=processing_step.cache_kind) - if datasets is None: - # first iteration fills the set of datasets - datasets = kind_datasets - else: - # next iterations remove the datasets that miss a required processing step - datasets.intersection_update(kind_datasets) - # note that the list is sorted alphabetically for consistency - return [] if datasets is None else sorted(datasets) - - -StrPath = Union[str, PathLike[str]] - - -def create_fts_endpoint( - processing_graph: ProcessingGraph, - cached_assets_directory: StrPath, - hf_endpoint: str, - max_age_long: int = 0, - max_age_short: int = 0, - hf_token: Optional[str] = None, -) -> Endpoint: - async def fts_endpoint(request: Request) -> Response: - with StepProfiler(method="fts_endpoint", step="all"): - try: - logging.info("/fts") - # processing_step = processing_graph.get_processing_step("split-duckdb-index") - dataset = request.query_params.get("dataset") - config = request.query_params.get("config") - split = request.query_params.get("split") - query = request.query_params.get("query") - if not dataset or not config or not split or not are_valid_parameters([dataset, config, split]): - raise MissingRequiredParameterError("Parameter 'dataset', 'config' and 'split' are required") - if not query: - raise MissingRequiredParameterError("Parameter 'query' is required") - # upstream_result = get_cache_entry_from_steps( - # processing_steps=[processing_step], - # dataset=dataset, - # config=config, - # split=split, - # processing_graph=processing_graph, - # hf_endpoint=hf_endpoint, - # hf_token=hf_token, - # ) - # content = result["content"] - # duck_db_name = content["duckdb_db_name"] - - except Exception as e: - with StepProfiler(method="fts_endpoint", step="generate API error response"): - return get_json_api_error_response(UnexpectedError("Unexpected error.", e), max_age=max_age_short) - duckdb.execute("INSTALL 'fts';") - duckdb.execute("LOAD 'fts';") - # db_location = cached_assets_directory / duck_db_name - db_location = "/tmp/asoria/openfire/--/default/train/index.db" - con = duckdb.connect(str(db_location)) - result = con.execute( - ( - "SELECT fts_main_data.match_bm25(id, ?) 
AS score, * FROM data WHERE score IS NOT NULL ORDER BY" - " score DESC;" - ), - [query], - ).df() - return get_json_ok_response({"result": result.to_json()}, max_age=max_age_long) - - return fts_endpoint diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 19f0b0e368..696ed71964 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -9,6 +9,7 @@ NoIndexableColumnsError, ParquetResponseEmptyError, PreviousStepFormatError, + UnsupportedIndexableColumnsError, ) from libcommon.processing_graph import ProcessingStep from libcommon.storage import StrPath @@ -26,6 +27,7 @@ STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_DB_NAME = "index.db" +UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: @@ -57,6 +59,15 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") + # look for image, audio and binary columns, if present, raise exeception do not supported yet and index everything + if any( + feature["name"] + for feature in features + if "_type" in feature["type"] + and feature["type"]["_type"] in UNSUPPORTED_FEATURES_MAGIC_STRINGS + ): + raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") + # get list of parquet urls config_parquet = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) try: @@ -69,7 +80,6 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc raise PreviousStepFormatError("Previous step did not return the expected content.") from e # create duckdb index location - # TODO: Need to manage re index, maybe delete folder/file or perform a table drop/delete? split_path, dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) @@ -84,13 +94,15 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc # index con = duckdb.connect(str(db_location)) - con.sql("CREATE SEQUENCE serial START 1;") - # TODO: We need a sequence id column for Full text search, maybe there is a better way - filter_columns = ",".join(string_columns) # TODO: What if already exists an id? need to create an identity column - con.sql( - f"CREATE TABLE data AS SELECT nextval('serial') AS id, {filter_columns} FROM read_parquet({parquet_urls});" - ) - con.sql("PRAGMA create_fts_index('data', 'id', '*');") + con.sql("CREATE OR REPLACE SEQUENCE serial START 1;") + + # TODO: What if already exists an __id field? need to create an identity column, maybe some random name? 
+ con.sql(f"CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM read_parquet({parquet_urls});") + con.sql("PRAGMA drop_fts_index('data');") + + # TODO: by default, 'porter' stemmer is being used, we might need to use a specific one by dataset language in the future + # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter + con.sql("PRAGMA create_fts_index('data', '__id', '*');") return IndexRowsResponse(duckdb_db_name=duck_db_name) From 23ce3eee617eb6af6efb125863f02a09f4d8e02b Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 16:05:51 -0400 Subject: [PATCH 06/52] Fix style --- .../worker/src/worker/job_runners/split/duckdb_index.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 696ed71964..a146694da1 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -63,8 +63,7 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc if any( feature["name"] for feature in features - if "_type" in feature["type"] - and feature["type"]["_type"] in UNSUPPORTED_FEATURES_MAGIC_STRINGS + if "_type" in feature["type"] and feature["type"]["_type"] in UNSUPPORTED_FEATURES_MAGIC_STRINGS ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") @@ -99,8 +98,8 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc # TODO: What if already exists an __id field? need to create an identity column, maybe some random name? con.sql(f"CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM read_parquet({parquet_urls});") con.sql("PRAGMA drop_fts_index('data');") - - # TODO: by default, 'porter' stemmer is being used, we might need to use a specific one by dataset language in the future + + # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter con.sql("PRAGMA create_fts_index('data', '__id', '*');") From ac0a2d9af7f78d4e34c73bc600b2f772dd3131dd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 16:48:57 -0400 Subject: [PATCH 07/52] Adding chart objects --- chart/Chart.yaml | 2 +- chart/templates/_envDuckDbIndex.tpl | 7 ++++++ chart/templates/_helpers.tpl | 9 +++++++ chart/templates/_initContainerDuckDBIndex.tpl | 21 ++++++++++++++++ chart/templates/_volumeMountDuckDBIndex.tpl | 10 ++++++++ chart/templates/worker/_container.tpl | 2 ++ chart/templates/worker/_deployment.yaml | 1 + chart/values.yaml | 3 +++ .../worker/job_runners/split/duckdb_index.py | 25 +++++++++++-------- tools/docker-compose-datasets-server.yml | 3 +++ tools/docker-compose-dev-datasets-server.yml | 3 +++ 11 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 chart/templates/_envDuckDbIndex.tpl create mode 100644 chart/templates/_initContainerDuckDBIndex.tpl create mode 100644 chart/templates/_volumeMountDuckDBIndex.tpl diff --git a/chart/Chart.yaml b/chart/Chart.yaml index a315e64960..9a4a0a379f 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -18,7 +18,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. 
# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.12.4 +version: 1.13.4 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/chart/templates/_envDuckDbIndex.tpl b/chart/templates/_envDuckDbIndex.tpl new file mode 100644 index 0000000000..a0a12059bb --- /dev/null +++ b/chart/templates/_envDuckDbIndex.tpl @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. + +{{- define "envDuckDBIndex" -}} +- name: DUCKDB_INDEX_STORAGE_DIRECTORY + value: {{ .Values.duckDBIndex.storageDirectory | quote }} +{{- end -}} diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl index 12aeec4c7d..ac9370e664 100644 --- a/chart/templates/_helpers.tpl +++ b/chart/templates/_helpers.tpl @@ -169,6 +169,15 @@ The parquet-metadata/ subpath in the NFS {{- printf "%s/%s/%s/" .Chart.Name .Release.Name "parquet-metadata" }} {{- end }} +{{/* +The duckdb-index/ subpath in the NFS +- in a subdirectory named as the chart (datasets-server/), and below it, +- in a subdirectory named as the Release, so that Releases will not share the same dir +*/}} +{{- define "duckDBIndex.subpath" -}} +{{- printf "%s/%s/%s/" .Chart.Name .Release.Name "duckdb-index" }} +{{- end }} + {{/* The datasets library will use this directory as a cache - in a subdirectory named as the chart (datasets-server/), and below it, diff --git a/chart/templates/_initContainerDuckDBIndex.tpl b/chart/templates/_initContainerDuckDBIndex.tpl new file mode 100644 index 0000000000..ed7cb43bc3 --- /dev/null +++ b/chart/templates/_initContainerDuckDBIndex.tpl @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. + +{{- define "initContainerDuckDBIndex" -}} +- name: prepare-duckdb-index + image: ubuntu:focal + imagePullPolicy: {{ .Values.images.pullPolicy }} + command: ["/bin/sh", "-c"] + args: + - chown {{ .Values.uid }}:{{ .Values.gid }} /mounted-path; + volumeMounts: + - mountPath: /mounted-path + mountPropagation: None + name: data + subPath: "{{ include "duckDBIndex.subpath" . }}" + readOnly: false + securityContext: + runAsNonRoot: false + runAsUser: 0 + runAsGroup: 0 +{{- end -}} diff --git a/chart/templates/_volumeMountDuckDBIndex.tpl b/chart/templates/_volumeMountDuckDBIndex.tpl new file mode 100644 index 0000000000..01c37b8919 --- /dev/null +++ b/chart/templates/_volumeMountDuckDBIndex.tpl @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. + +{{- define "volumeMountDuckDBIndexRW" -}} +- mountPath: {{ .Values.duckDBIndex.storageDirectory | quote }} + mountPropagation: None + name: data + subPath: "{{ include "duckDBIndex.subpath" . }}" + readOnly: false +{{- end -}} diff --git a/chart/templates/worker/_container.tpl b/chart/templates/worker/_container.tpl index 899c8c5800..d5c1ed9527 100644 --- a/chart/templates/worker/_container.tpl +++ b/chart/templates/worker/_container.tpl @@ -9,6 +9,7 @@ {{ include "envAssets" . | nindent 2 }} {{ include "envCache" . | nindent 2 }} {{ include "envParquetMetadata" . | nindent 2 }} + {{ include "envDuckDBIndex" . | nindent 2 }} {{ include "envQueue" . | nindent 2 }} {{ include "envCommon" . | nindent 2 }} {{ include "envLog" . | nindent 2 }} @@ -26,6 +27,7 @@ {{ include "volumeMountAssetsRW" . | nindent 2 }} {{ include "volumeMountCache" . | nindent 2 }} {{ include "volumeMountParquetMetadataRW" . 
| nindent 2 }} + {{ include "volumeMountDuckDBIndexRW" . | nindent 2 }} securityContext: allowPrivilegeEscalation: false resources: {{ toYaml .workerValues.resources | nindent 4 }} diff --git a/chart/templates/worker/_deployment.yaml b/chart/templates/worker/_deployment.yaml index e06d319c65..03a70646ae 100644 --- a/chart/templates/worker/_deployment.yaml +++ b/chart/templates/worker/_deployment.yaml @@ -26,6 +26,7 @@ spec: {{ include "initContainerAssets" . | nindent 8 }} {{ include "initContainerCache" . | nindent 8 }} {{ include "initContainerParquetMetadata" . | nindent 8 }} + {{ include "initContainerDuckDBIndex" . | nindent 8 }} containers: {{ include "containerWorker" . | nindent 8 }} nodeSelector: {{ toYaml .workerValues.nodeSelector | nindent 8 }} tolerations: {{ toYaml .workerValues.tolerations | nindent 8 }} diff --git a/chart/values.yaml b/chart/values.yaml index 0afac4b83e..d378e3d472 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -207,6 +207,9 @@ parquetMetadata: # Directory on the shared storage (parquet metadata files used for random access in /rows) storageDirectory: "/parquet-metadata" +duckDBIndex: + # Directory on the shared storage (duckdb db files used for datasets indexing) + storageDirectory: "/duckdb-index" # Directory where the cache data will be stored cacheDirectory: "/datasets-server-cache" diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index a146694da1..0f2a996465 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -28,6 +28,13 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_DB_NAME = "index.db" UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] +CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" +DROP_INDEX_COMMAND = "PRAGMA drop_fts_index('data');" +CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*');" +CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" +INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" +LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" +# TODO: What if __id field already exist? def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: @@ -86,22 +93,20 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc db_location = dir_path / DUCKDB_DEFAULT_DB_NAME # configure duckdb extensions - duckdb.execute("INSTALL 'httpfs';") - duckdb.execute("LOAD 'httpfs';") - duckdb.execute("INSTALL 'fts';") - duckdb.execute("LOAD 'fts';") + duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) + duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="httpfs")) + duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="fts")) + duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) # index con = duckdb.connect(str(db_location)) - con.sql("CREATE OR REPLACE SEQUENCE serial START 1;") - - # TODO: What if already exists an __id field? need to create an identity column, maybe some random name? 
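
Put together, the command constants introduced in this patch describe a small, self-contained build script. A minimal sketch of the same indexing flow run outside the job runner, assuming a publicly reachable list of parquet files and a writable local target (both placeholders); it mirrors the __id sequence column and FTS pragma used here:

    import duckdb

    # Placeholders: parquet shards to index and the local database file to create.
    parquet_urls = ["https://example.com/train-00000-of-00001.parquet"]
    db_location = "/tmp/duckdb-index/index.db"

    con = duckdb.connect(db_location)
    # httpfs lets read_parquet() fetch remote files; fts provides the index pragma.
    for extension in ("httpfs", "fts"):
        con.execute(f"INSTALL '{extension}';")
        con.execute(f"LOAD '{extension}';")

    # Monotonic row id used as the full-text-search document identifier.
    con.sql("CREATE OR REPLACE SEQUENCE serial START 1;")
    con.sql(
        "CREATE OR REPLACE TABLE data AS "
        f"SELECT nextval('serial') AS __id, * FROM read_parquet({parquet_urls});"
    )
    # Index every column of 'data', keyed on __id; DuckDB defaults to the 'porter' stemmer.
    con.sql("PRAGMA create_fts_index('data', '__id', '*');")
    con.close()
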
- con.sql(f"CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM read_parquet({parquet_urls});") - con.sql("PRAGMA drop_fts_index('data');") + con.sql(CREATE_SEQUENCE_COMMAND) + con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") + con.sql(DROP_INDEX_COMMAND) # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter - con.sql("PRAGMA create_fts_index('data', '__id', '*');") + con.sql(CREATE_INDEX_COMMAND) return IndexRowsResponse(duckdb_db_name=duck_db_name) diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index cead78fa98..de36efd79c 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -90,6 +90,7 @@ services: volumes: - assets:${ASSETS_STORAGE_DIRECTORY-/assets}:rw - parquet-metadata:${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata}:rw + - duckdb-index:${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index}:rw extends: file: docker-compose-base.yml service: datasets-worker @@ -110,6 +111,7 @@ services: PARQUET_AND_INFO_TARGET_REVISION: ${PARQUET_AND_INFO_TARGET_REVISION-refs/convert/parquet} PARQUET_AND_INFO_URL_TEMPLATE: ${PARQUET_AND_INFO_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} + DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} @@ -143,3 +145,4 @@ volumes: parquet-modules-cache: parquet-numba-cache: parquet-metadata: + duckdb-index: diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index 6489bc4155..ad6c44f3f8 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -94,6 +94,7 @@ services: volumes: - assets:${ASSETS_STORAGE_DIRECTORY-/assets}:rw - parquet-metadata:${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata}:rw + - duckdb-index:${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index}:rw extends: file: docker-compose-dev-base.yml service: datasets-worker @@ -114,6 +115,7 @@ services: PARQUET_AND_INFO_TARGET_REVISION: ${PARQUET_AND_INFO_TARGET_REVISION-refs/convert/parquet} PARQUET_AND_INFO_URL_TEMPLATE: ${PARQUET_AND_INFO_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} + DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} @@ -145,3 +147,4 @@ volumes: parquet-modules-cache: parquet-numba-cache: parquet-metadata: + duckdb-index: From dff50cffa5cb7963dd8509311e99443652e9b6d8 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 2 Jun 2023 16:51:11 -0400 Subject: [PATCH 08/52] Rollback dependency in API --- services/api/poetry.lock | 59 +------------------------------------ services/api/pyproject.toml | 1 - 2 files changed, 1 insertion(+), 59 deletions(-) diff --git a/services/api/poetry.lock b/services/api/poetry.lock index 
a1e1e4ec2b..e5c9e22603 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -711,63 +711,6 @@ files = [ dnssec = ["ecdsa (>=0.13)", "pycryptodome"] idna = ["idna (>=2.1)"] -[[package]] -name = "duckdb" -version = "0.8.0" -description = "DuckDB embedded database" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6455aee00af30770c20f4a8c5e4347918cf59b578f49ee996a13807b12911871"}, - {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8cf0622ae7f86d4ce72791f8928af4357a46824aadf1b6879c7936b3db65344"}, - {file = "duckdb-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6132e8183ca3ae08a593e43c97cb189794077dedd48546e27ce43bd6a51a9c33"}, - {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe29e5343fa2a95f2cde4519a4f4533f4fd551a48d2d9a8ab5220d40ebf53610"}, - {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:945165987ca87c097dc0e578dcf47a100cad77e1c29f5dd8443d53ce159dc22e"}, - {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:673c60daf7ada1d9a8518286a6893ec45efabb64602954af5f3d98f42912fda6"}, - {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5075fe1ff97ae62331ca5c61e3597e6e9f7682a6fdd418c23ba5c4873ed5cd1"}, - {file = "duckdb-0.8.0-cp310-cp310-win32.whl", hash = "sha256:001f5102f45d3d67f389fa8520046c8f55a99e2c6d43b8e68b38ea93261c5395"}, - {file = "duckdb-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb00800f2e1e865584b13221e0121fce9341bb3a39a93e569d563eaed281f528"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2707096d6df4321044fcde2c9f04da632d11a8be60957fd09d49a42fae71a29"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b27df1b70ae74d2c88efb5ffca8490954fdc678099509a9c4404ca30acc53426"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75a97c800271b52dd0f37696d074c50576dcb4b2750b6115932a98696a268070"}, - {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:804cac261a5e016506a6d67838a65d19b06a237f7949f1704f0e800eb708286a"}, - {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6b9abca7fa6713e1d031c18485343b4de99742c7e1b85c10718aa2f31a4e2c6"}, - {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:51aa6d606d49072abcfeb3be209eb559ac94c1b5e70f58ac3adbb94aca9cd69f"}, - {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7c8dc769aaf2be0a1c57995ca657e5b92c1c56fc8437edb720ca6cab571adf14"}, - {file = "duckdb-0.8.0-cp311-cp311-win32.whl", hash = "sha256:c4207d18b42387c4a035846d8878eb967070198be8ac26fd77797ce320d1a400"}, - {file = "duckdb-0.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:0c392257547c20794c3072fcbca99a49ef0a49974005d755e93893e2b4875267"}, - {file = "duckdb-0.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2832379e122020814dbe869af7b9ddf3c9f21474cf345531145b099c63ffe17e"}, - {file = "duckdb-0.8.0-cp36-cp36m-win32.whl", hash = "sha256:914896526f7caba86b170f2c4f17f11fd06540325deeb0000cb4fb24ec732966"}, - {file = "duckdb-0.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:022ebda86d0e3204cdc206e4af45aa9f0ae0668b34c2c68cf88e08355af4a372"}, - {file = "duckdb-0.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:96a31c0f3f4ccbf0f5b18f94319f37691205d82f80aae48c6fe04860d743eb2c"}, - 
{file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a07c73c6e6a8cf4ce1a634625e0d1b17e5b817242a8a530d26ed84508dfbdc26"}, - {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424acbd6e857531b06448d757d7c2557938dbddbff0632092090efbf413b4699"}, - {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c83cfd2a868f1acb0692b9c3fd5ef1d7da8faa1348c6eabf421fbf5d8c2f3eb8"}, - {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5c6f6b2d8db56936f662c649539df81856b5a8cb769a31f9544edf18af2a11ff"}, - {file = "duckdb-0.8.0-cp37-cp37m-win32.whl", hash = "sha256:0bd6376b40a512172eaf4aa816813b1b9d68994292ca436ce626ccd5f77f8184"}, - {file = "duckdb-0.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:931221885bcf1e7dfce2400f11fd048a7beef566b775f1453bb1db89b828e810"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:42e7853d963d68e72403ea208bcf806b0f28c7b44db0aa85ce49bb124d56c133"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcc338399175be3d43366576600aef7d72e82114d415992a7a95aded98a0f3fd"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03dd08a4624d6b581a59f9f9dbfd34902416398d16795ad19f92361cf21fd9b5"}, - {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c7c24ea0c9d8563dbd5ad49ccb54b7a9a3c7b8c2833d35e5d32a08549cacea5"}, - {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb58f6505cc0f34b4e976154302d26563d2e5d16b206758daaa04b65e55d9dd8"}, - {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef37ac7880100c4b3f913c8483a29a13f8289313b9a07df019fadfa8e7427544"}, - {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2a4f5ee913ca8a6a069c78f8944b9934ffdbc71fd935f9576fdcea2a6f476f1"}, - {file = "duckdb-0.8.0-cp38-cp38-win32.whl", hash = "sha256:73831c6d7aefcb5f4072cd677b9efebecbf6c578946d21710791e10a1fc41b9a"}, - {file = "duckdb-0.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:faa36d2854734364d234f37d7ef4f3d763b73cd6b0f799cbc2a0e3b7e2575450"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50a31ec237ed619e50f9ab79eb0ec5111eb9697d4475da6e0ab22c08495ce26b"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:351abb4cc2d229d043920c4bc2a4c29ca31a79fef7d7ef8f6011cf4331f297bf"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:568550a163aca6a787bef8313e358590254de3f4019025a8d68c3a61253fedc1"}, - {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b82617f0e7f9fc080eda217090d82b42d4fad083bc9f6d58dfda9cecb7e3b29"}, - {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01c9be34d272532b75e8faedda0ff77fa76d1034cde60b8f5768ae85680d6d3"}, - {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8549d6a6bf5f00c012b6916f605416226507e733a3ffc57451682afd6e674d1b"}, - {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d145c6d51e55743c3ed1a74cffa109d9e72f82b07e203b436cfa453c925313a"}, - {file = "duckdb-0.8.0-cp39-cp39-win32.whl", hash = "sha256:f8610dfd21e90d7b04e8598b244bf3ad68599fd6ba0daad3428c03cbfd74dced"}, - {file = "duckdb-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:d0f0f104d30418808bafbe9bccdcd238588a07bd246b3cff13842d60bfd8e8ba"}, - {file = "duckdb-0.8.0.tar.gz", hash = 
"sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, -] - [[package]] name = "environs" version = "9.5.0" @@ -3497,4 +3440,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "1cbdff67ee9555ae24c1f162b595c50a5fa9fa2e37c2d3784728b01ebdb5a278" +content-hash = "4e76b1586360769e88d2439840cbbd3cb91c8b1087d4b17b0e4246d465cc163c" diff --git a/services/api/pyproject.toml b/services/api/pyproject.toml index a16ca6c02f..16a6fd6f3d 100644 --- a/services/api/pyproject.toml +++ b/services/api/pyproject.toml @@ -19,7 +19,6 @@ starlette = "^0.27.0" starlette-prometheus = "^0.9.0" uvicorn = "^0.20.0" watchdog = { extras = ["watchmedo"], version = "^2.2.1" } -duckdb = "^0.8.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" From 4659117a71fa27c3ce2148f7ee690001eae111d5 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 6 Jun 2023 10:50:35 -0400 Subject: [PATCH 09/52] Depend on parquet an split --- libs/libcommon/src/libcommon/config.py | 4 +- .../worker/job_runners/split/duckdb_index.py | 78 ++++++++++++++----- .../split/first_rows_from_parquet.py | 31 +------- services/worker/src/worker/utils.py | 30 ++++++- .../split/test_first_rows_from_parquet.py | 4 +- 5 files changed, 96 insertions(+), 51 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 5e01500cbf..b925e58bc5 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -341,8 +341,8 @@ class ProcessingGraphConfig: "split-duckdb-index": { "input_type": "split", "triggered_by": [ - "split-first-rows-from-streaming", - "split-first-rows-from-parquet", + "config-split-names-from-info", + "config-split-names-from-streaming", "config-parquet", ], "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 0f2a996465..0001f9609b 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -2,31 +2,40 @@ # Copyright 2023 The HuggingFace Authors. import logging +from functools import partial +from typing import List, Optional import duckdb +from datasets import Features from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( + FileSystemError, NoIndexableColumnsError, ParquetResponseEmptyError, PreviousStepFormatError, UnsupportedIndexableColumnsError, + SplitNotFoundError, ) from libcommon.processing_graph import ProcessingStep from libcommon.storage import StrPath from libcommon.utils import JobInfo from libcommon.viewer_utils.index_utils import create_index_dir_split +from pyarrow.parquet import ParquetFile +from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitJobRunner from worker.utils import ( CompleteJobResult, IndexRowsResponse, + get_hf_fs, + get_hf_parquet_uris, get_previous_step_or_raise, ) STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" -DUCKDB_DEFAULT_DB_NAME = "index.db" +DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" DROP_INDEX_COMMAND = "PRAGMA drop_fts_index('data');" @@ -37,22 +46,57 @@ # TODO: What if __id field already exist? 
-def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_directory: StrPath) -> IndexRowsResponse: +def compute_index_rows( + dataset: str, + config: str, + split: str, + duckdb_index_directory: StrPath, + hf_token: Optional[str], +) -> IndexRowsResponse: logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") - # get the first rows from previous job - upstream_response = get_previous_step_or_raise( - kinds=["split-first-rows-from-streaming", "split-first-rows-from-parquet"], - dataset=dataset, - config=config, - split=split, + # validate split + split_names_best_response = get_previous_step_or_raise( + kinds=["config-split-names-from-streaming", "config-split-names-from-info"], dataset=dataset, config=config ) try: - first_rows = upstream_response.response["content"] - features = first_rows["features"] - except KeyError as e: + splits_content = split_names_best_response.response["content"]["splits"] + except Exception as e: raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e + if split not in [split_item["split"] for split_item in splits_content]: + raise SplitNotFoundError(f"The split '{split}' does not exist for the config '{config}' of the dataset.") + + # get parquet content + config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) + + try: + parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] + sources = sorted( + f"{config}/{parquet_file['filename']}" + for parquet_file in parquet_files_content + if parquet_file["split"] == split and parquet_file["config"] == config + ) + if not sources: + raise ParquetResponseEmptyError("No parquet files found.") + except Exception as e: + raise PreviousStepFormatError("Previous step did not return the expected content.") from e + + logging.debug(f"Found {len(sources)} parquet files for {dataset=}, {config=}, {split=}: {sources}") + + fs = get_hf_fs(hf_token=hf_token) + source_uris = get_hf_parquet_uris(sources, dataset=dataset) + desc = f"{dataset}/{config}/{split}" + try: + parquet_files: List[ParquetFile] = thread_map( + partial(ParquetFile, filesystem=fs), source_uris, desc=desc, unit="pq", disable=True + ) + except Exception as e: + raise FileSystemError(f"Could not read the parquet files: {e}") from e + + # get the features + features = Features.from_arrow_schema(parquet_files[0].schema.to_arrow_schema()) + # look for string columns using the first rows string_columns = [ feature["name"] @@ -74,11 +118,8 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") - # get list of parquet urls - config_parquet = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) try: - parquet_files = config_parquet.response["content"]["parquet_files"] - parquet_urls = [content["url"] for content in parquet_files if content["split"] == split] + parquet_urls = [content["url"] for content in parquet_files_content if content["split"] == split] if not parquet_urls: raise ParquetResponseEmptyError("No parquet files found.") @@ -89,8 +130,8 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc split_path, dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) - duck_db_name = f"{split_path}/{DUCKDB_DEFAULT_DB_NAME}" - db_location = dir_path / 
DUCKDB_DEFAULT_DB_NAME + duckdb_index_filename = f"{split_path}/{DUCKDB_DEFAULT_INDEX_FILENAME}" + db_location = dir_path / DUCKDB_DEFAULT_INDEX_FILENAME # configure duckdb extensions duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) @@ -108,7 +149,7 @@ def compute_index_rows(dataset: str, config: str, split: str, duckdb_index_direc # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter con.sql(CREATE_INDEX_COMMAND) - return IndexRowsResponse(duckdb_db_name=duck_db_name) + return IndexRowsResponse(duckdb_index_filename=duckdb_index_filename) class SplitDuckDbIndexJobRunner(SplitJobRunner): @@ -143,5 +184,6 @@ def compute(self) -> CompleteJobResult: config=self.config, split=self.split, duckdb_index_directory=self.duckdb_index_directory, + hf_token=self.app_config.common.hf_token, ) ) diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py index c8685b7a6f..e39ac4ff34 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py @@ -2,15 +2,12 @@ # Copyright 2022 The HuggingFace Authors. import logging -from functools import lru_cache, partial +from functools import partial from typing import List, Optional import pyarrow as pa from datasets import Features -from huggingface_hub import HfFileSystem -from huggingface_hub.hf_file_system import safe_quote from libcommon.constants import ( - PARQUET_REVISION, PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION, PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION, ) @@ -38,6 +35,8 @@ RowItem, SplitFirstRowsResponse, create_truncated_row_items, + get_hf_fs, + get_hf_parquet_uris, get_json_size, get_previous_step_or_raise, to_features_list, @@ -72,30 +71,6 @@ def transform_rows( ] -@lru_cache(maxsize=128) -def get_hf_fs(hf_token: Optional[str]) -> HfFileSystem: - """Get the Hugging Face filesystem. - - Args: - hf_token (Optional[str]): The token to access the filesystem. - Returns: - HfFileSystem: The Hugging Face filesystem. - """ - return HfFileSystem(token=hf_token) - - -def get_hf_parquet_uris(paths: List[str], dataset: str) -> List[str]: - """Get the Hugging Face URIs from the Parquet branch of the dataset repository (see PARQUET_REVISION). - - Args: - paths (List[str]): List of paths. - dataset (str): The dataset name. - Returns: - List[str]: List of Parquet URIs. 
- """ - return [f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{path}" for path in paths] - - def compute_first_rows_response( dataset: str, config: str, diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 69aaba10c9..580c73d2a2 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -7,6 +7,7 @@ import time import warnings from dataclasses import dataclass, field +from functools import lru_cache from http import HTTPStatus from typing import ( Any, @@ -28,6 +29,9 @@ IterableDataset, load_dataset, ) +from huggingface_hub import HfFileSystem +from huggingface_hub.hf_file_system import safe_quote +from libcommon.constants import PARQUET_REVISION from libcommon.exceptions import NormalRowsError, StreamingRowsError from libcommon.simple_cache import BestResponse, CachedArtifactError, get_best_response from libcommon.utils import orjson_dumps @@ -133,7 +137,7 @@ class ImageUrlColumnsResponse(TypedDict): class IndexRowsResponse(TypedDict): - duckdb_db_name: str + duckdb_index_filename: str Row = Mapping[str, Any] @@ -421,3 +425,27 @@ def get_previous_step_or_raise( cache_entry_with_details=best_response.response, ) return best_response + + +@lru_cache(maxsize=128) +def get_hf_fs(hf_token: Optional[str]) -> HfFileSystem: + """Get the Hugging Face filesystem. + + Args: + hf_token (Optional[str]): The token to access the filesystem. + Returns: + HfFileSystem: The Hugging Face filesystem. + """ + return HfFileSystem(token=hf_token) + + +def get_hf_parquet_uris(paths: List[str], dataset: str) -> List[str]: + """Get the Hugging Face URIs from the Parquet branch of the dataset repository (see PARQUET_REVISION). + + Args: + paths (List[str]): List of paths. + dataset (str): The dataset name. + Returns: + List[str]: List of Parquet URIs. 
+ """ + return [f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{path}" for path in paths] diff --git a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py index 46a92c80d5..8ab70191ea 100644 --- a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py +++ b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py @@ -107,9 +107,9 @@ def test_compute( http_status=HTTPStatus.OK, ) - with patch("worker.job_runners.split.first_rows_from_parquet.get_hf_fs") as mock_read: + with patch("worker.utils.get_hf_fs") as mock_read: with patch( - "worker.job_runners.split.first_rows_from_parquet.get_hf_parquet_uris", + "worker.utils.get_hf_parquet_uris", side_effect=mock_get_hf_parquet_uris, ): initial_location = os.getcwd() From f0794a8f2fd35d9689c3818d97826aba10d59e71 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 6 Jun 2023 13:42:59 -0400 Subject: [PATCH 10/52] Fix libcommon test --- libs/libcommon/tests/test_processing_graph.py | 23 ++++++++++++++++-- .../worker/job_runners/split/duckdb_index.py | 24 +++++++------------ 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/libs/libcommon/tests/test_processing_graph.py b/libs/libcommon/tests/test_processing_graph.py index a4861594a4..c5946ab36c 100644 --- a/libs/libcommon/tests/test_processing_graph.py +++ b/libs/libcommon/tests/test_processing_graph.py @@ -93,13 +93,19 @@ def graph() -> ProcessingGraph: "config-opt-in-out-urls-count", "split-first-rows-from-streaming", "dataset-split-names", + "split-duckdb-index", ], ["config-info"], ["dataset-config-names", "config-parquet-and-info", "config-info"], ), ( "config-split-names-from-streaming", - ["split-first-rows-from-streaming", "dataset-split-names", "config-opt-in-out-urls-count"], + [ + "split-first-rows-from-streaming", + "dataset-split-names", + "config-opt-in-out-urls-count", + "split-duckdb-index", + ], ["dataset-config-names"], ["dataset-config-names"], ), @@ -142,7 +148,7 @@ def graph() -> ProcessingGraph: ), ( "config-parquet", - ["config-parquet-metadata", "split-first-rows-from-parquet", "dataset-parquet"], + ["config-parquet-metadata", "split-first-rows-from-parquet", "dataset-parquet", "split-duckdb-index"], ["config-parquet-and-info"], ["dataset-config-names", "config-parquet-and-info"], ), @@ -287,6 +293,19 @@ def graph() -> ProcessingGraph: "split-image-url-columns", ], ), + ( + "split-duckdb-index", + [], + ["config-parquet", "config-split-names-from-streaming", "config-split-names-from-info"], + [ + "config-split-names-from-streaming", + "config-split-names-from-info", + "config-parquet-and-info", + "config-info", + "config-parquet", + "dataset-config-names", + ], + ), ], ) def test_default_graph_steps( diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 0001f9609b..2704413e97 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -13,8 +13,8 @@ NoIndexableColumnsError, ParquetResponseEmptyError, PreviousStepFormatError, - UnsupportedIndexableColumnsError, SplitNotFoundError, + UnsupportedIndexableColumnsError, ) from libcommon.processing_graph import ProcessingStep from libcommon.storage import StrPath @@ -38,8 +38,7 @@ DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] CREATE_SEQUENCE_COMMAND 
= "CREATE OR REPLACE SEQUENCE serial START 1;" -DROP_INDEX_COMMAND = "PRAGMA drop_fts_index('data');" -CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*');" +CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" @@ -98,23 +97,19 @@ def compute_index_rows( features = Features.from_arrow_schema(parquet_files[0].schema.to_arrow_schema()) # look for string columns using the first rows - string_columns = [ - feature["name"] - for feature in features - if "dtype" in feature["type"] - and "_type" in feature["type"] - and feature["type"]["dtype"] == STRING_FEATURE_DTYPE - and feature["type"]["_type"] == VALUE_FEATURE_TYPE - ] + string_columns = [column for column, feature in features.items() if STRING_FEATURE_DTYPE in str(feature)] if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") # look for image, audio and binary columns, if present, raise exeception do not supported yet and index everything if any( - feature["name"] - for feature in features - if "_type" in feature["type"] and feature["type"]["_type"] in UNSUPPORTED_FEATURES_MAGIC_STRINGS + feature + for feature in features.values() + if next( + (feature_type for feature_type in UNSUPPORTED_FEATURES_MAGIC_STRINGS if feature_type in str(feature)), None + ) + is not None ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") @@ -143,7 +138,6 @@ def compute_index_rows( con = duckdb.connect(str(db_location)) con.sql(CREATE_SEQUENCE_COMMAND) con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") - con.sql(DROP_INDEX_COMMAND) # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter From 05d33624b602ab8f268d6e9bcccaa6d4ad305470 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 6 Jun 2023 16:50:46 -0400 Subject: [PATCH 11/52] Send index file to dedicated branch --- libs/libcommon/src/libcommon/config.py | 12 ++ .../worker/job_runners/split/duckdb_index.py | 125 +++++++++++++++++- services/worker/src/worker/utils.py | 7 +- 3 files changed, 139 insertions(+), 5 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index b925e58bc5..caa34d6e84 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -106,11 +106,19 @@ def from_env(cls) -> "ParquetMetadataConfig": DUCKDB_INDEX_STORAGE_DIRECTORY = None +DUCKDB_INDEX_COMMIT_MESSAGE = "Update duckdb index file" +DUCKDB_INDEX_COMMITTER_HF_TOKEN = None +DUCKDB_INDEX_TARGET_REVISION = "duckdb/index" +DUCKDB_INDEX_URL_TEMPLATE = "/datasets/%s/resolve/%s/%s" @dataclass(frozen=True) class DuckDbIndexConfig: storage_directory: Optional[str] = DUCKDB_INDEX_STORAGE_DIRECTORY + commit_message: str = DUCKDB_INDEX_COMMIT_MESSAGE + committer_hf_token: Optional[str] = DUCKDB_INDEX_COMMITTER_HF_TOKEN + target_revision: str = DUCKDB_INDEX_TARGET_REVISION + url_template: str = DUCKDB_INDEX_URL_TEMPLATE @classmethod def from_env(cls) -> "DuckDbIndexConfig": @@ -118,6 +126,10 @@ def from_env(cls) -> "DuckDbIndexConfig": with env.prefixed("DUCKDB_INDEX_"): return cls( storage_directory=env.str(name="STORAGE_DIRECTORY", 
default=DUCKDB_INDEX_STORAGE_DIRECTORY), + commit_message=env.str(name="COMMIT_MESSAGE", default=DUCKDB_INDEX_COMMIT_MESSAGE), + committer_hf_token=env.str(name="COMMITTER_HF_TOKEN", default=DUCKDB_INDEX_COMMITTER_HF_TOKEN), + target_revision=env.str(name="TARGET_REVISION", default=DUCKDB_INDEX_TARGET_REVISION), + url_template=env.str(name="URL_TEMPLATE", default=DUCKDB_INDEX_URL_TEMPLATE), ) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 2704413e97..827875cd05 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -3,12 +3,23 @@ import logging from functools import partial -from typing import List, Optional +from pathlib import Path +from typing import List, Optional, Set +from urllib.parse import quote import duckdb from datasets import Features +from huggingface_hub._commit_api import ( + CommitOperation, + CommitOperationAdd, + CommitOperationDelete, +) +from huggingface_hub.hf_api import HfApi, RepoFile +from huggingface_hub.utils._errors import RepositoryNotFoundError +from libcommon.config import DuckDbIndexConfig from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( + DatasetNotFoundError, FileSystemError, NoIndexableColumnsError, ParquetResponseEmptyError, @@ -17,7 +28,7 @@ UnsupportedIndexableColumnsError, ) from libcommon.processing_graph import ProcessingStep -from libcommon.storage import StrPath +from libcommon.storage import StrPath, remove_dir from libcommon.utils import JobInfo from libcommon.viewer_utils.index_utils import create_index_dir_split from pyarrow.parquet import ParquetFile @@ -33,6 +44,7 @@ get_previous_step_or_raise, ) +DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" @@ -45,12 +57,48 @@ # TODO: What if __id field already exist? 
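
The compute_index_rows changes that follow no longer keep the .db file only on local storage: they commit it to a dedicated duckdb/index branch of the dataset repository, the target revision configured above. A condensed sketch of that huggingface_hub flow, assuming a dataset repo and a token with write access (repo id, token and paths below are placeholders):

    from huggingface_hub import CommitOperationAdd, CommitOperationDelete, HfApi

    dataset = "user/some-dataset"                # placeholder repo id
    target_revision = "duckdb/index"             # dedicated branch, as configured above
    path_in_repo = "default/train-index.db"      # placeholder path inside the branch
    local_db = "/tmp/duckdb-index/index.db"      # placeholder local index file
    hf_api = HfApi(endpoint="https://huggingface.co", token="hf_xxx")  # placeholder token

    # Create the branch if it does not exist yet.
    refs = hf_api.list_repo_refs(repo_id=dataset, repo_type="dataset")
    if target_revision not in {branch.name for branch in refs.branches}:
        hf_api.create_branch(repo_id=dataset, branch=target_revision, repo_type="dataset")

    # Replace any previous index file and upload the fresh one in a single commit.
    target_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False)
    operations = []
    if any(sibling.rfilename == path_in_repo for sibling in target_info.siblings):
        operations.append(CommitOperationDelete(path_in_repo=path_in_repo))
    operations.append(CommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=local_db))
    hf_api.create_commit(
        repo_id=dataset,
        repo_type="dataset",
        revision=target_revision,
        operations=operations,
        commit_message="Update duckdb index file",
        parent_commit=target_info.sha,
    )
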
+def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url_template: str) -> str: + return (hf_endpoint + url_template) % (repo_id, quote(revision, safe=""), filename) + + +def create_index_item( + repo_file: RepoFile, + dataset: str, + config: str, + split: str, + hf_endpoint: str, + target_revision: str, + url_template: str, +) -> IndexRowsResponse: + if repo_file.size is None: + raise ValueError(f"Cannot get size of {repo_file.rfilename}") + return { + "dataset": dataset, + "config": config, + "split": split, + "url": hf_hub_url( + repo_id=dataset, + filename=repo_file.rfilename, + hf_endpoint=hf_endpoint, + revision=target_revision, + url_template=url_template, + ), + "filename": Path(repo_file.rfilename).name, + "size": repo_file.size, + } + + def compute_index_rows( dataset: str, config: str, split: str, duckdb_index_directory: StrPath, + target_revision: str, + hf_endpoint: str, + commit_message: str, + url_template: str, hf_token: Optional[str], + committer_hf_token: Optional[str], ) -> IndexRowsResponse: logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") @@ -125,7 +173,6 @@ def compute_index_rows( split_path, dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) - duckdb_index_filename = f"{split_path}/{DUCKDB_DEFAULT_INDEX_FILENAME}" db_location = dir_path / DUCKDB_DEFAULT_INDEX_FILENAME # configure duckdb extensions @@ -143,10 +190,74 @@ def compute_index_rows( # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter con.sql(CREATE_INDEX_COMMAND) - return IndexRowsResponse(duckdb_index_filename=duckdb_index_filename) + # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) + hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) + committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) + + try: + refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) + if all(ref.ref != target_revision for ref in refs.converts): + initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id + committer_hf_api.create_branch( + repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit + ) + except RepositoryNotFoundError as err: + raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err + except Exception as e: + # TODO: improve error handling + logging.error(str(e)) + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) + all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} + previous_index = f"{config}/{split}-{DUCKDB_DEFAULT_INDEX_FILENAME}" + delete_operations: List[CommitOperation] = [] + if previous_index in all_repo_files: + delete_operations.append(CommitOperationDelete(path_in_repo=previous_index)) + logging.debug(f"{delete_operations=}") + + # send the files to the target revision + add_operations: List[CommitOperation] = [ + CommitOperationAdd( + path_in_repo=f"{config}/{split}-{DUCKDB_DEFAULT_INDEX_FILENAME}", path_or_fileobj=db_location + ) + ] + logging.debug(f"{add_operations=}") + + # TODO: Delete local index file + committer_hf_api.create_commit( + repo_id=dataset, + repo_type=DATASET_TYPE, + revision=target_revision, + operations=delete_operations + add_operations, + commit_message=commit_message, + parent_commit=target_dataset_info.sha, + ) + + # call the 
API again to get the list of parquet files + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) + repo_files = [ + repo_file + for repo_file in target_dataset_info.siblings + if repo_file.rfilename.startswith(f"{config}/{split}") and repo_file.rfilename.endswith(".db") + ] + if len(repo_files) != 1: + # TODO: improve exception type + raise Exception("NO FILE WAS UPLOADED TO BRANCH") + index_file = repo_files[0] + + remove_dir(dir_path) + return create_index_item( + repo_file=index_file, + dataset=dataset, + config=config, + split=split, + hf_endpoint=hf_endpoint, + target_revision=target_revision, + url_template=url_template, + ) class SplitDuckDbIndexJobRunner(SplitJobRunner): + duckdb_index_config: DuckDbIndexConfig duckdb_index_directory: StrPath def __init__( @@ -162,6 +273,7 @@ def __init__( processing_step=processing_step, ) self.duckdb_index_directory = duckdb_index_directory + self.duckdb_index_config = app_config.duckdb_index @staticmethod def get_job_type() -> str: @@ -179,5 +291,10 @@ def compute(self) -> CompleteJobResult: split=self.split, duckdb_index_directory=self.duckdb_index_directory, hf_token=self.app_config.common.hf_token, + url_template=self.duckdb_index_config.url_template, + commit_message=self.duckdb_index_config.commit_message, + committer_hf_token=self.duckdb_index_config.committer_hf_token, + hf_endpoint=self.app_config.common.hf_endpoint, + target_revision=self.duckdb_index_config.target_revision, ) ) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 580c73d2a2..8e849055bc 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -137,7 +137,12 @@ class ImageUrlColumnsResponse(TypedDict): class IndexRowsResponse(TypedDict): - duckdb_index_filename: str + dataset: str + config: str + split: str + url: str + filename: str + size: int Row = Mapping[str, Any] From cec74e3ab98e4e5817244744256c8eafd56f3274 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 7 Jun 2023 12:50:55 -0400 Subject: [PATCH 12/52] Fix test in first parquet --- .../split/first_rows_from_parquet.py | 5 +- services/worker/tests/conftest.py | 2 +- services/worker/tests/fixtures/fsspec.py | 119 ++++++++++++++++++ .../split/test_first_rows_from_parquet.py | 43 +++++-- 4 files changed, 155 insertions(+), 14 deletions(-) create mode 100644 services/worker/tests/fixtures/fsspec.py diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py index 7fe3f6e9e5..d1b0678333 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py @@ -111,7 +111,10 @@ def compute_first_rows_response( partial(ParquetFile, filesystem=fs), source_uris, desc=desc, unit="pq", disable=True ) except Exception as e: - raise FileSystemError(f"Could not read the parquet files: {e}") from e + raise e + # print(f"ERROR") + # print(str(e)) + # raise FileSystemError(f"Could not read the parquet files: {e}") from e # get the features features = Features.from_arrow_schema(parquet_files[0].schema.to_arrow_schema()) diff --git a/services/worker/tests/conftest.py b/services/worker/tests/conftest.py index d2a80b3055..6092d5babf 100644 --- a/services/worker/tests/conftest.py +++ b/services/worker/tests/conftest.py @@ -150,4 +150,4 @@ def another_processing_step(test_processing_graph: ProcessingGraph) -> 
Processin # Import fixture modules as plugins -pytest_plugins = ["tests.fixtures.datasets", "tests.fixtures.files", "tests.fixtures.hub"] +pytest_plugins = ["tests.fixtures.datasets", "tests.fixtures.files", "tests.fixtures.hub", "tests.fixtures.fsspec"] diff --git a/services/worker/tests/fixtures/fsspec.py b/services/worker/tests/fixtures/fsspec.py new file mode 100644 index 0000000000..848dceb54a --- /dev/null +++ b/services/worker/tests/fixtures/fsspec.py @@ -0,0 +1,119 @@ +# type: ignore +import posixpath +import shutil +from pathlib import Path +from unittest.mock import patch + +import fsspec +import pytest +from fsspec.implementations.local import ( + AbstractFileSystem, + LocalFileSystem, + stringify_path, +) + + +class MockFileSystem(AbstractFileSystem): + protocol = "mock" + + def __init__(self, *args, local_root_dir, **kwargs): + super().__init__() + self._fs = LocalFileSystem(*args, **kwargs) + self.local_root_dir = Path(local_root_dir).resolve().as_posix() + "/" + + def mkdir(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.mkdir(path, *args, **kwargs) + + def makedirs(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.makedirs(path, *args, **kwargs) + + def rmdir(self, path): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.rmdir(path) + + def ls(self, path, detail=True, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + out = self._fs.ls(path, detail=detail, *args, **kwargs) + if detail: + return [{**info, "name": info["name"][len(self.local_root_dir) :]} for info in out] # noqa: E203 + else: + return [name[len(self.local_root_dir) :] for name in out] # noqa: E203 + + def info(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + out = dict(self._fs.info(path, *args, **kwargs)) + out["name"] = out["name"][len(self.local_root_dir) :] # noqa: E203 + return out + + def cp_file(self, path1, path2, *args, **kwargs): + path1 = posixpath.join(self.local_root_dir, self._strip_protocol(path1)) + path2 = posixpath.join(self.local_root_dir, self._strip_protocol(path2)) + return self._fs.cp_file(path1, path2, *args, **kwargs) + + def rm_file(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.rm_file(path, *args, **kwargs) + + def rm(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.rm(path, *args, **kwargs) + + def _open(self, path, *args, **kwargs): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs._open(path, *args, **kwargs) + + def created(self, path): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.created(path) + + def modified(self, path): + path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) + return self._fs.modified(path) + + @classmethod + def _strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("mock://"): + path = path[7:] + return path + + +class TmpDirFileSystem(MockFileSystem): + protocol = "tmp" + tmp_dir = None + + def __init__(self, *args, **kwargs): + assert self.tmp_dir is not None, "TmpDirFileSystem.tmp_dir is not set" + super().__init__(*args, **kwargs, local_root_dir=self.tmp_dir, auto_mkdir=True) + + @classmethod + def 
_strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("tmp://"): + path = path[6:] + return path + + +@pytest.fixture +def mock_fsspec(): + original_registry = fsspec.registry.copy() + fsspec.register_implementation("mock", MockFileSystem) + fsspec.register_implementation("tmp", TmpDirFileSystem) + yield + fsspec.registry = original_registry + + +@pytest.fixture +def mockfs(tmp_path_factory, mock_fsspec): + local_fs_dir = tmp_path_factory.mktemp("mockfs") + return MockFileSystem(local_root_dir=local_fs_dir, auto_mkdir=True) + + +@pytest.fixture +def tmpfs(tmp_path_factory, mock_fsspec): + tmp_fs_dir = tmp_path_factory.mktemp("tmpfs") + with patch.object(TmpDirFileSystem, "tmp_dir", tmp_fs_dir): + yield TmpDirFileSystem() + shutil.rmtree(tmp_fs_dir) diff --git a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py index 8ab70191ea..591557f74a 100644 --- a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py +++ b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py @@ -4,9 +4,9 @@ import os from dataclasses import replace from http import HTTPStatus -from typing import Callable, List +from typing import Callable, List, Generator from unittest.mock import patch - +from datasets import Dataset import pytest from libcommon.exceptions import CustomError from libcommon.processing_graph import ProcessingGraph @@ -14,7 +14,7 @@ from libcommon.simple_cache import upsert_response from libcommon.storage import StrPath from libcommon.utils import Priority -from pyarrow.fs import LocalFileSystem +from fsspec import AbstractFileSystem from worker.config import AppConfig from worker.job_runners.split.first_rows_from_parquet import ( @@ -70,20 +70,38 @@ def _get_job_runner( def mock_get_hf_parquet_uris(paths: List[str], dataset: str) -> List[str]: + print("----------------------------->>>") return paths +@pytest.fixture +def ds() -> Dataset: + return Dataset.from_dict({"text": ["Hello there", "General Kenobi"]}) + +@pytest.fixture +def ds_fs(ds: Dataset, tmpfs: AbstractFileSystem) -> Generator[AbstractFileSystem, None, None]: + with tmpfs.open("config/dataset-split.parquet", "wb") as f: + print("---->AAAA") + try: + ds.to_parquet(f) + except Exception as e: + print("-------------->CCCCCCCCCCCC") + print(str(e)) + print("---->BBBB") + yield tmpfs + @pytest.mark.parametrize( "rows_max_bytes,columns_max_number,error_code", [ (0, 10, "TooBigContentError"), # too small limit, even with truncation - (1_000, 1, "TooManyColumnsError"), # too small columns limit - (1_000, 10, None), + # (1_000, 1, "TooManyColumnsError"), # too small columns limit + # (1_000, 10, None), ], ) def test_compute( get_job_runner: GetJobRunner, app_config: AppConfig, + ds_fs: AbstractFileSystem, rows_max_bytes: int, columns_max_number: int, error_code: str, @@ -99,6 +117,7 @@ def test_compute( "dataset": dataset, "config": config, "split": split, + "url": f"https://fake.huggingface.co/datasets/ds/resolve/refs%2Fconvert%2Fparquet/{config}/{dataset}-{split}.parquet", # noqa: E501 "filename": f"{dataset}-{split}.parquet", "size": 1000, } @@ -107,16 +126,16 @@ def test_compute( http_status=HTTPStatus.OK, ) - with patch("worker.utils.get_hf_fs") as mock_read: + with patch("worker.utils.get_hf_fs", return_value=ds_fs): with patch( "worker.utils.get_hf_parquet_uris", side_effect=mock_get_hf_parquet_uris, ): - initial_location = os.getcwd() - os.chdir("tests/job_runners/split") - # TODO: Make 
localsystem by relative path - fs = LocalFileSystem() - mock_read.return_value = fs + # initial_location = os.getcwd() + # os.chdir("tests/job_runners/split") + # # TODO: Make localsystem by relative path + # fs = LocalFileSystem() + # mock_read.return_value = fs # ^ Mocking file system with local file job_runner = get_job_runner( dataset, @@ -167,4 +186,4 @@ def test_compute( assert response["rows"][2]["row_idx"] == 2 assert response["rows"][2]["truncated_cells"] == [] assert response["rows"][2]["row"] == {"col1": 3, "col2": "c"} - os.chdir(initial_location) + # os.chdir(initial_location) From 8679ce9183d3821d12459ff88c5d4df70f63b1f2 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 9 Jun 2023 14:16:34 -0400 Subject: [PATCH 13/52] Fix merge hanges --- .../worker/job_runners/split/duckdb_index.py | 10 +++------- .../split/first_rows_from_parquet.py | 2 -- services/worker/src/worker/utils.py | 3 --- .../split/test_first_rows_from_parquet.py | 18 ++++-------------- 4 files changed, 7 insertions(+), 26 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 827875cd05..69e662af0c 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -27,7 +27,9 @@ SplitNotFoundError, UnsupportedIndexableColumnsError, ) +from libcommon.parquet_utils import get_hf_fs, get_hf_parquet_uris from libcommon.processing_graph import ProcessingStep +from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath, remove_dir from libcommon.utils import JobInfo from libcommon.viewer_utils.index_utils import create_index_dir_split @@ -36,13 +38,7 @@ from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitJobRunner -from worker.utils import ( - CompleteJobResult, - IndexRowsResponse, - get_hf_fs, - get_hf_parquet_uris, - get_previous_step_or_raise, -) +from worker.utils import CompleteJobResult, IndexRowsResponse DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py index b79083c3be..64112c28b4 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py @@ -29,8 +29,6 @@ RowItem, SplitFirstRowsResponse, create_truncated_row_items, - get_hf_fs, - get_hf_parquet_uris, get_json_size, to_features_list, ) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 916c9f6bea..b28c4193c2 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -27,9 +27,6 @@ IterableDataset, load_dataset, ) -from huggingface_hub import HfFileSystem -from huggingface_hub.hf_file_system import safe_quote -from libcommon.constants import PARQUET_REVISION from libcommon.exceptions import NormalRowsError, StreamingRowsError from libcommon.utils import orjson_dumps diff --git a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py index df6253cf7c..3dcb37c15b 100644 --- a/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py +++ b/services/worker/tests/job_runners/split/test_first_rows_from_parquet.py @@ -5,7 +5,7 @@ from http import HTTPStatus from 
typing import Callable, Generator, List from unittest.mock import patch -from datasets import Dataset + import pytest from datasets import Dataset from fsspec import AbstractFileSystem @@ -91,29 +91,19 @@ def ds_fs(ds: Dataset, tmpfs: AbstractFileSystem) -> Generator[AbstractFileSyste def mock_get_hf_parquet_uris(paths: List[str], dataset: str) -> List[str]: return paths -@pytest.fixture -def ds() -> Dataset: - return Dataset.from_dict({"text": ["Hello there", "General Kenobi"]}) - -@pytest.fixture -def ds_fs(ds: Dataset, tmpfs: AbstractFileSystem) -> Generator[AbstractFileSystem, None, None]: - with tmpfs.open("config/dataset-split.parquet", "wb") as f: - ds.to_parquet(f) - yield tmpfs - @pytest.mark.parametrize( "rows_max_bytes,columns_max_number,error_code", [ (0, 10, "TooBigContentError"), # too small limit, even with truncation - # (1_000, 1, "TooManyColumnsError"), # too small columns limit - # (1_000, 10, None), + (1_000, 1, "TooManyColumnsError"), # too small columns limit + (1_000, 10, None), ], ) def test_compute( + ds_fs: AbstractFileSystem, get_job_runner: GetJobRunner, app_config: AppConfig, - ds_fs: AbstractFileSystem, rows_max_bytes: int, columns_max_number: int, error_code: str, From 163928e51354cb621ad2261b4c7cdfdd41b84457 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 9 Jun 2023 14:28:31 -0400 Subject: [PATCH 14/52] Fix poetry files --- services/api/poetry.lock | 1 + services/worker/poetry.lock | 43 +++------------------------------- services/worker/pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 41 deletions(-) diff --git a/services/api/poetry.lock b/services/api/poetry.lock index ef10527085..a4b0eaf999 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -2867,6 +2867,7 @@ files = [ {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, + {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index c82510a896..59cb048ebc 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. 
[[package]] name = "absl-py" @@ -1051,44 +1051,6 @@ files = [ {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, ] -[[package]] -name = "elastic-transport" -version = "8.4.0" -description = "Transport classes and utilities shared among Python Elastic client libraries" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, - {file = "elastic_transport-8.4.0-py3-none-any.whl", hash = "sha256:19db271ab79c9f70f8c43f8f5b5111408781a6176b54ab2e54d713b6d9ceb815"}, -] - -[package.dependencies] -certifi = "*" -urllib3 = ">=1.26.2,<2" - -[package.extras] -develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "trustme"] - -[[package]] -name = "elasticsearch" -version = "8.8.0" -description = "Python client for Elasticsearch" -category = "main" -optional = false -python-versions = ">=3.6, <4" -files = [ - {file = "elasticsearch-8.8.0-py3-none-any.whl", hash = "sha256:2223ee9daaa3c80c25b28ec3f7c48e66fce6b767a338333d9a81886046a07df6"}, - {file = "elasticsearch-8.8.0.tar.gz", hash = "sha256:6878313cd598c7c90079fed1d4be72e198da35cba57f4083e6bee91f9c70b0eb"}, -] - -[package.dependencies] -elastic-transport = ">=8,<9" - -[package.extras] -async = ["aiohttp (>=3,<4)"] -requests = ["requests (>=2.4.0,<3.0.0)"] - [[package]] name = "environs" version = "9.5.0" @@ -2956,6 +2918,7 @@ optional = false python-versions = "*" files = [ {file = "pdf2image-1.16.3-py3-none-any.whl", hash = "sha256:b6154164af3677211c22cbb38b2bd778b43aca02758e962fe1e231f6d3b0e380"}, + {file = "pdf2image-1.16.3.tar.gz", hash = "sha256:74208810c2cef4d9e347769b8e62a52303982ddb4f2dfd744c7ab4b940ae287e"}, ] [package.dependencies] @@ -5692,4 +5655,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "ad3c5c34e9ea75e4cb4394930bb35c5afdff65fe03c5f51b8cbebbea37f62f1d" +content-hash = "732285314a1b756206bdba83a83ee9e97635117f5fd9a6fd8d2b92d8f51e6679" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index b2cf6d1701..3215758fb7 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -45,7 +45,7 @@ trec-car-tools = { path = "vendors/trec-car-tools/python3" } typer = "^0.4.2" wget = "^3.2" mirakuru = "^2.4.2" -duckdb = "0.8.0" +duckdb = "^0.8.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" From b1238f503aa69b0686c76cb1359f377e407746d5 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 12 Jun 2023 16:28:47 -0400 Subject: [PATCH 15/52] Adding happy path test --- libs/libcommon/src/libcommon/exceptions.py | 8 + .../src/libcommon/viewer_utils/index_utils.py | 7 +- .../src/worker/job_runners/config/parquet.py | 3 +- .../job_runners/config/parquet_and_info.py | 18 +- .../job_runners/config/parquet_metadata.py | 9 +- .../src/worker/job_runners/dataset/parquet.py | 3 +- .../worker/job_runners/split/duckdb_index.py | 76 ++++---- services/worker/src/worker/utils.py | 24 +++ services/worker/tests/conftest.py | 1 + services/worker/tests/fixtures/datasets.py | 14 ++ services/worker/tests/fixtures/hub.py | 17 ++ .../tests/job_runners/config/test_parquet.py | 6 +- .../config/test_parquet_metadata.py | 2 +- .../tests/job_runners/dataset/test_parquet.py | 2 +- .../job_runners/split/test_duckdb_index.py | 163 ++++++++++++++++++ 15 files changed, 271 insertions(+), 82 
deletions(-) create mode 100644 services/worker/tests/job_runners/split/test_duckdb_index.py diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index c9ac5d5058..cdf8a3d95c 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -101,6 +101,7 @@ def as_response(self) -> ErrorResponse: "MissingSpawningTokenError", "NoIndexableColumnsError", "NormalRowsError", + "NotAvailableIndexFileError", "ParameterMissingError", "ParquetResponseEmptyError", "PreviousStepFormatError", @@ -495,3 +496,10 @@ class UnsupportedIndexableColumnsError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedIndexableColumnsError", cause, True) + + +class NotAvailableIndexFileError(CacheableError): + """Raised when no duckdb index file was found for split.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NotAvailableIndexFileError", cause, False) diff --git a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py index d00b4754cc..5beb9f6a31 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py +++ b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -# Copyright 2022 The HuggingFace Authors. +# Copyright 2023 The HuggingFace Authors. from os import makedirs from pathlib import Path -from typing import Tuple from libcommon.storage import StrPath @@ -11,8 +10,8 @@ INDEX_DIR_MODE = 0o755 -def create_index_dir_split(dataset: str, config: str, split: str, index_directory: StrPath) -> Tuple[str, Path]: +def create_index_dir_split(dataset: str, config: str, split: str, index_directory: StrPath) -> Path: split_path = f"{dataset}/{DATASET_SEPARATOR}/{config}/{split}" dir_path = Path(index_directory).resolve() / split_path makedirs(dir_path, INDEX_DIR_MODE, exist_ok=True) - return split_path, dir_path + return dir_path diff --git a/services/worker/src/worker/job_runners/config/parquet.py b/services/worker/src/worker/job_runners/config/parquet.py index eae1d780c2..4fa17de2ed 100644 --- a/services/worker/src/worker/job_runners/config/parquet.py +++ b/services/worker/src/worker/job_runners/config/parquet.py @@ -9,8 +9,7 @@ from libcommon.simple_cache import get_previous_step_or_raise from worker.job_runners.config.config_job_runner import ConfigJobRunner -from worker.job_runners.config.parquet_and_info import ParquetFileItem -from worker.utils import CompleteJobResult +from worker.utils import CompleteJobResult, ParquetFileItem class ConfigParquetResponse(TypedDict): diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 91de381e19..318ff270da 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -8,7 +8,6 @@ from multiprocessing.pool import ThreadPool from pathlib import Path from typing import Any, Dict, List, Optional, Set, Tuple, TypedDict -from urllib.parse import quote import datasets import datasets.config @@ -72,16 +71,7 @@ from worker.config import AppConfig, ParquetAndInfoConfig from worker.job_runners.config.config_job_runner import ConfigCachedJobRunner -from worker.utils import CompleteJobResult 
- - -class ParquetFileItem(TypedDict): - dataset: str - config: str - split: str - url: str - filename: str - size: int +from worker.utils import CompleteJobResult, ParquetFileItem, hf_hub_url class ConfigParquetAndInfoResponse(TypedDict): @@ -106,12 +96,6 @@ def path_in_repo(self) -> str: return f'{self.config}/{self.local_file.removeprefix(f"{self.local_dir}/")}' -# TODO: use huggingface_hub's hf_hub_url after -# https://github.com/huggingface/huggingface_hub/issues/1082 -def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url_template: str) -> str: - return (hf_endpoint + url_template) % (repo_id, quote(revision, safe=""), filename) - - p = re.compile(r"(?P[\w-]+?)-(?P\w+(\.\w+)*?)(-[0-9]{5}-of-[0-9]{5})?.parquet") diff --git a/services/worker/src/worker/job_runners/config/parquet_metadata.py b/services/worker/src/worker/job_runners/config/parquet_metadata.py index e1cb6ef575..6c361168f9 100644 --- a/services/worker/src/worker/job_runners/config/parquet_metadata.py +++ b/services/worker/src/worker/job_runners/config/parquet_metadata.py @@ -5,7 +5,6 @@ from functools import partial from typing import List, Optional, TypedDict -from datasets.utils.file_utils import get_authentication_headers_for_url from fsspec.implementations.http import HTTPFileSystem from libcommon.constants import PROCESSING_STEP_CONFIG_PARQUET_METADATA_VERSION from libcommon.exceptions import ( @@ -23,8 +22,7 @@ from worker.config import AppConfig from worker.job_runners.config.config_job_runner import ConfigJobRunner -from worker.job_runners.config.parquet_and_info import ParquetFileItem -from worker.utils import CompleteJobResult +from worker.utils import CompleteJobResult, ParquetFileItem, get_parquet_file class ParquetFileMetadataItem(TypedDict): @@ -42,11 +40,6 @@ class ConfigParquetMetadataResponse(TypedDict): parquet_files_metadata: List[ParquetFileMetadataItem] -def get_parquet_file(url: str, fs: HTTPFileSystem, hf_token: Optional[str]) -> ParquetFile: - headers = get_authentication_headers_for_url(url, use_auth_token=hf_token) - return ParquetFile(fs.open(url, headers=headers)) - - def compute_parquet_metadata_response( dataset: str, config: str, hf_token: Optional[str], parquet_metadata_directory: StrPath ) -> ConfigParquetMetadataResponse: diff --git a/services/worker/src/worker/job_runners/dataset/parquet.py b/services/worker/src/worker/job_runners/dataset/parquet.py index 4a3609505d..7c6bb4a82c 100644 --- a/services/worker/src/worker/job_runners/dataset/parquet.py +++ b/services/worker/src/worker/job_runners/dataset/parquet.py @@ -14,9 +14,8 @@ ) from worker.job_runners.config.parquet import ConfigParquetResponse -from worker.job_runners.config.parquet_and_info import ParquetFileItem from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner -from worker.utils import JobResult, PreviousJob +from worker.utils import JobResult, ParquetFileItem, PreviousJob class DatasetParquetResponse(TypedDict): diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 69e662af0c..29fdacfdba 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -5,10 +5,10 @@ from functools import partial from pathlib import Path from typing import List, Optional, Set -from urllib.parse import quote import duckdb from datasets import Features +from fsspec.implementations.http import HTTPFileSystem from 
huggingface_hub._commit_api import ( CommitOperation, CommitOperationAdd, @@ -22,12 +22,12 @@ DatasetNotFoundError, FileSystemError, NoIndexableColumnsError, + NotAvailableIndexFileError, ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, UnsupportedIndexableColumnsError, ) -from libcommon.parquet_utils import get_hf_fs, get_hf_parquet_uris from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath, remove_dir @@ -38,7 +38,13 @@ from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitJobRunner -from worker.utils import CompleteJobResult, IndexRowsResponse +from worker.utils import ( + CompleteJobResult, + IndexRowsResponse, + ParquetFileItem, + get_parquet_file, + hf_hub_url, +) DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" @@ -53,10 +59,6 @@ # TODO: What if __id field already exist? -def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url_template: str) -> str: - return (hf_endpoint + url_template) % (repo_id, quote(revision, safe=""), filename) - - def create_index_item( repo_file: RepoFile, dataset: str, @@ -96,7 +98,7 @@ def compute_index_rows( hf_token: Optional[str], committer_hf_token: Optional[str], ) -> IndexRowsResponse: - logging.info(f"get index-rows for dataset={dataset} config={config} split={split}") + logging.info(f"get split-duckdb-index for dataset={dataset} config={config} split={split}") # validate split split_names_best_response = get_previous_step_or_raise( @@ -112,27 +114,22 @@ def compute_index_rows( # get parquet content config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) - try: parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] - sources = sorted( - f"{config}/{parquet_file['filename']}" - for parquet_file in parquet_files_content - if parquet_file["split"] == split and parquet_file["config"] == config - ) - if not sources: + parquet_file_items: List[ParquetFileItem] = [ + parquet_file_item for parquet_file_item in parquet_files_content if parquet_file_item["config"] == config + ] + if not parquet_file_items: raise ParquetResponseEmptyError("No parquet files found.") except Exception as e: raise PreviousStepFormatError("Previous step did not return the expected content.") from e - logging.debug(f"Found {len(sources)} parquet files for {dataset=}, {config=}, {split=}: {sources}") - - fs = get_hf_fs(hf_token=hf_token) - source_uris = get_hf_parquet_uris(sources, dataset=dataset) - desc = f"{dataset}/{config}/{split}" + fs = HTTPFileSystem() + source_urls = [parquet_file_item["url"] for parquet_file_item in parquet_file_items] + desc = f"{dataset}/{config}" try: parquet_files: List[ParquetFile] = thread_map( - partial(ParquetFile, filesystem=fs), source_uris, desc=desc, unit="pq", disable=True + partial(get_parquet_file, fs=fs, hf_token=hf_token), source_urls, desc=desc, unit="pq", disable=True ) except Exception as e: raise FileSystemError(f"Could not read the parquet files: {e}") from e @@ -166,7 +163,7 @@ def compute_index_rows( raise PreviousStepFormatError("Previous step did not return the expected content.") from e # create duckdb index location - split_path, dir_path = create_index_dir_split( + dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory ) db_location = dir_path / DUCKDB_DEFAULT_INDEX_FILENAME @@ 
-189,36 +186,28 @@ def compute_index_rows( # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) - + index_file_location = f"{config}/{dataset}-{split}.db" try: refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) if all(ref.ref != target_revision for ref in refs.converts): initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id committer_hf_api.create_branch( - repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit + repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True ) except RepositoryNotFoundError as err: raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err - except Exception as e: - # TODO: improve error handling - logging.error(str(e)) + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} - previous_index = f"{config}/{split}-{DUCKDB_DEFAULT_INDEX_FILENAME}" delete_operations: List[CommitOperation] = [] - if previous_index in all_repo_files: - delete_operations.append(CommitOperationDelete(path_in_repo=previous_index)) - logging.debug(f"{delete_operations=}") + if index_file_location in all_repo_files: + delete_operations.append(CommitOperationDelete(path_in_repo=index_file_location)) # send the files to the target revision add_operations: List[CommitOperation] = [ - CommitOperationAdd( - path_in_repo=f"{config}/{split}-{DUCKDB_DEFAULT_INDEX_FILENAME}", path_or_fileobj=db_location - ) + CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) ] - logging.debug(f"{add_operations=}") - # TODO: Delete local index file committer_hf_api.create_commit( repo_id=dataset, repo_type=DATASET_TYPE, @@ -231,18 +220,19 @@ def compute_index_rows( # call the API again to get the list of parquet files target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) repo_files = [ - repo_file - for repo_file in target_dataset_info.siblings - if repo_file.rfilename.startswith(f"{config}/{split}") and repo_file.rfilename.endswith(".db") + repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location ] + + if not repo_files: + raise NotAvailableIndexFileError("No index file was found") + if len(repo_files) != 1: - # TODO: improve exception type - raise Exception("NO FILE WAS UPLOADED TO BRANCH") - index_file = repo_files[0] + logging.warning(f"Found {len(repo_files)} index files, should be only 1") remove_dir(dir_path) + # remove index file since it is no more used and is stored in NFS return create_index_item( - repo_file=index_file, + repo_file=repo_files[0], dataset=dataset, config=config, split=split, diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index b28c4193c2..e414e34d70 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -18,6 +18,7 @@ Union, cast, ) +from urllib.parse import quote from datasets import ( Dataset, @@ -27,8 +28,11 @@ IterableDataset, load_dataset, ) +from datasets.utils.file_utils import get_authentication_headers_for_url +from fsspec.implementations.http import HTTPFileSystem from libcommon.exceptions import 
NormalRowsError, StreamingRowsError from libcommon.utils import orjson_dumps +from pyarrow.parquet import ParquetFile class JobRunnerInfo(TypedDict): @@ -147,6 +151,15 @@ class RowsContent(TypedDict): all_fetched: bool +class ParquetFileItem(TypedDict): + dataset: str + config: str + split: str + url: str + filename: str + size: int + + # TODO: separate functions from common classes and named dicts otherwise this file will continue growing @@ -407,3 +420,14 @@ def get_rows_or_raise( "Cannot load the dataset split (in normal download mode) to extract the first rows.", cause=err, ) from err + + +# TODO: use huggingface_hub's hf_hub_url after +# https://github.com/huggingface/huggingface_hub/issues/1082 +def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url_template: str) -> str: + return (hf_endpoint + url_template) % (repo_id, quote(revision, safe=""), filename) + + +def get_parquet_file(url: str, fs: HTTPFileSystem, hf_token: Optional[str]) -> ParquetFile: + headers = get_authentication_headers_for_url(url, use_auth_token=hf_token) + return ParquetFile(fs.open(url, headers=headers)) diff --git a/services/worker/tests/conftest.py b/services/worker/tests/conftest.py index 6092d5babf..a3f9cbef54 100644 --- a/services/worker/tests/conftest.py +++ b/services/worker/tests/conftest.py @@ -70,6 +70,7 @@ def set_env_vars( mp.setenv("PARQUET_AND_INFO_MAX_DATASET_SIZE", "10_000") mp.setenv("PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES", "10") mp.setenv("PARQUET_AND_INFO_COMMITTER_HF_TOKEN", CI_PARQUET_CONVERTER_APP_TOKEN) + mp.setenv("DUCKDB_INDEX_COMMITTER_HF_TOKEN", CI_PARQUET_CONVERTER_APP_TOKEN) mp.setenv("DATASETS_BASED_HF_DATASETS_CACHE", str(datasets_cache_directory)) mp.setenv("HF_MODULES_CACHE", str(modules_cache_directory)) mp.setenv("WORKER_CONTENT_MAX_BYTES", "10_000_000") diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index 18edaad1f1..fe6413489d 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -143,4 +143,18 @@ def datasets() -> Mapping[str, Dataset]: dtype=pd.StringDtype(storage="python"), ) ), + "duckdb_index": Dataset.from_pandas( + pd.DataFrame( + { + "text": [ + "foo", + "bar", + "foobar", + "- Hello there !", + "- General Kenobi !", + ] + }, + dtype=pd.StringDtype(storage="python"), + ) + ), } diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index 110dba75d2..db01d0fb3a 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -274,6 +274,13 @@ def hub_public_spawning_opt_in_out(datasets: Mapping[str, Dataset]) -> Iterator[ delete_hub_dataset_repo(repo_id=repo_id) +@pytest.fixture(scope="session") +def hub_public_duckdb_index(datasets: Mapping[str, Dataset]) -> Iterator[str]: + repo_id = create_hub_dataset_repo(prefix="duckdb_index", dataset=datasets["duckdb_index"]) + yield repo_id + delete_hub_dataset_repo(repo_id=repo_id) + + class HubDatasetTest(TypedDict): name: str config_names_response: Any @@ -588,6 +595,7 @@ def hub_datasets( hub_public_big_csv: str, hub_public_external_files: str, hub_public_spawning_opt_in_out: str, + hub_public_duckdb_index: str, ) -> HubDatasets: return { "does_not_exist": { @@ -714,4 +722,13 @@ def hub_datasets( ), "parquet_and_info_response": None, }, + "duckdb_index": { + "name": hub_public_duckdb_index, + "config_names_response": create_config_names_response(hub_public_duckdb_index), + "splits_response": 
create_splits_response(hub_public_duckdb_index), + "first_rows_response": create_first_rows_response(hub_public_duckdb_index, TEXT_cols, TEXT_rows), + "parquet_and_info_response": create_parquet_and_info_response( + dataset=hub_public_duckdb_index, data_type="csv" + ), + }, } diff --git a/services/worker/tests/job_runners/config/test_parquet.py b/services/worker/tests/job_runners/config/test_parquet.py index 4b58e5f7a1..eeecda8bfd 100644 --- a/services/worker/tests/job_runners/config/test_parquet.py +++ b/services/worker/tests/job_runners/config/test_parquet.py @@ -16,10 +16,8 @@ ConfigParquetJobRunner, ConfigParquetResponse, ) -from worker.job_runners.config.parquet_and_info import ( - ConfigParquetAndInfoResponse, - ParquetFileItem, -) +from worker.job_runners.config.parquet_and_info import ConfigParquetAndInfoResponse +from worker.utils import ParquetFileItem @pytest.fixture(autouse=True) diff --git a/services/worker/tests/job_runners/config/test_parquet_metadata.py b/services/worker/tests/job_runners/config/test_parquet_metadata.py index fadd6e711b..dbc9eb62d9 100644 --- a/services/worker/tests/job_runners/config/test_parquet_metadata.py +++ b/services/worker/tests/job_runners/config/test_parquet_metadata.py @@ -20,12 +20,12 @@ from worker.config import AppConfig from worker.job_runners.config.parquet import ConfigParquetResponse -from worker.job_runners.config.parquet_and_info import ParquetFileItem from worker.job_runners.config.parquet_metadata import ( ConfigParquetMetadataJobRunner, ConfigParquetMetadataResponse, ParquetFileMetadataItem, ) +from worker.utils import ParquetFileItem @pytest.fixture(autouse=True) diff --git a/services/worker/tests/job_runners/dataset/test_parquet.py b/services/worker/tests/job_runners/dataset/test_parquet.py index c698848c32..ba377eb65a 100644 --- a/services/worker/tests/job_runners/dataset/test_parquet.py +++ b/services/worker/tests/job_runners/dataset/test_parquet.py @@ -13,11 +13,11 @@ from worker.config import AppConfig from worker.job_runners.config.parquet import ConfigParquetResponse -from worker.job_runners.config.parquet_and_info import ParquetFileItem from worker.job_runners.dataset.parquet import ( DatasetParquetJobRunner, DatasetParquetResponse, ) +from worker.utils import ParquetFileItem from ..utils import UpstreamResponse diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py new file mode 100644 index 0000000000..595e12d022 --- /dev/null +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2023 The HuggingFace Authors. 
+ +from http import HTTPStatus +from typing import Callable + +import pytest +from libcommon.processing_graph import ProcessingGraph +from libcommon.resources import CacheMongoResource, QueueMongoResource +from libcommon.simple_cache import upsert_response +from libcommon.storage import StrPath +from libcommon.utils import Priority + +from worker.config import AppConfig +from worker.job_runners.config.parquet_and_info import ConfigParquetAndInfoJobRunner +from worker.job_runners.split.duckdb_index import SplitDuckDbIndexJobRunner +from worker.resources import LibrariesResource + +from ...fixtures.hub import HubDatasets + +GetJobRunner = Callable[[str, str, str, AppConfig], SplitDuckDbIndexJobRunner] + +GetParquetJobRunner = Callable[[str, str, AppConfig], ConfigParquetAndInfoJobRunner] + + +@pytest.fixture +def get_parquet_job_runner( + libraries_resource: LibrariesResource, + cache_mongo_resource: CacheMongoResource, + queue_mongo_resource: QueueMongoResource, +) -> GetParquetJobRunner: + def _get_job_runner( + dataset: str, + config: str, + app_config: AppConfig, + ) -> ConfigParquetAndInfoJobRunner: + processing_step_name = ConfigParquetAndInfoJobRunner.get_job_type() + processing_graph = ProcessingGraph( + { + "dataset-level": {"input_type": "dataset"}, + processing_step_name: { + "input_type": "dataset", + "job_runner_version": ConfigParquetAndInfoJobRunner.get_job_runner_version(), + "triggered_by": "dataset-level", + }, + } + ) + return ConfigParquetAndInfoJobRunner( + job_info={ + "type": ConfigParquetAndInfoJobRunner.get_job_type(), + "params": { + "dataset": dataset, + "revision": "revision", + "config": config, + "split": None, + }, + "job_id": "job_id", + "priority": Priority.NORMAL, + }, + app_config=app_config, + processing_step=processing_graph.get_processing_step(processing_step_name), + hf_datasets_cache=libraries_resource.hf_datasets_cache, + ) + + return _get_job_runner + + +@pytest.fixture +def get_job_runner( + duckdb_index_directory: StrPath, + cache_mongo_resource: CacheMongoResource, + queue_mongo_resource: QueueMongoResource, +) -> GetJobRunner: + def _get_job_runner( + dataset: str, + config: str, + split: str, + app_config: AppConfig, + ) -> SplitDuckDbIndexJobRunner: + processing_step_name = SplitDuckDbIndexJobRunner.get_job_type() + processing_graph = ProcessingGraph( + { + "dataset-step": {"input_type": "dataset"}, + "config-parquet": { + "input_type": "config", + "triggered_by": "dataset-step", + "provides_config_parquet": True, + }, + "config-split-names-from-streaming": { + "input_type": "config", + "triggered_by": "dataset-step", + }, + processing_step_name: { + "input_type": "dataset", + "job_runner_version": SplitDuckDbIndexJobRunner.get_job_runner_version(), + "triggered_by": ["config-parquet", "config-split-names-from-streaming"], + }, + } + ) + return SplitDuckDbIndexJobRunner( + job_info={ + "type": SplitDuckDbIndexJobRunner.get_job_type(), + "params": { + "dataset": dataset, + "revision": "revision", + "config": config, + "split": split, + }, + "job_id": "job_id", + "priority": Priority.NORMAL, + }, + app_config=app_config, + processing_step=processing_graph.get_processing_step(processing_step_name), + duckdb_index_directory=duckdb_index_directory, + ) + + return _get_job_runner + + +def test_compute( + get_parquet_job_runner: GetParquetJobRunner, + get_job_runner: GetJobRunner, + app_config: AppConfig, + hub_datasets: HubDatasets, +) -> None: + hub_duckdb_index = "duckdb_index" + dataset = hub_datasets[hub_duckdb_index]["name"] + config_names = 
hub_datasets[hub_duckdb_index]["config_names_response"] + config = hub_datasets[hub_duckdb_index]["config_names_response"]["config_names"][0]["config"] + splits_response = hub_datasets[hub_duckdb_index]["splits_response"] + split = "train" + + upsert_response( + "dataset-config-names", + dataset=dataset, + http_status=HTTPStatus.OK, + content=config_names, + ) + + upsert_response( + "config-split-names-from-streaming", + dataset=dataset, + config=config, + http_status=HTTPStatus.OK, + content=splits_response, + ) + + parquet_job_runner = get_parquet_job_runner(dataset, config, app_config) + parquet_response = parquet_job_runner.compute() + config_parquet = parquet_response.content + + upsert_response( + "config-parquet", + dataset=dataset, + config=config, + http_status=HTTPStatus.OK, + content=config_parquet, + ) + + assert parquet_response + job_runner = get_job_runner(dataset, config, split, app_config) + response = job_runner.compute() + assert response From fd298befa4b9d371a59bbd65ac167d9e26bc7879 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 12 Jun 2023 17:25:55 -0400 Subject: [PATCH 16/52] Adding other test scenarios --- .../worker/job_runners/split/duckdb_index.py | 20 ++-- services/worker/tests/fixtures/datasets.py | 7 ++ services/worker/tests/fixtures/hub.py | 37 +++++++ .../job_runners/split/test_duckdb_index.py | 102 ++++++++++-------- 4 files changed, 111 insertions(+), 55 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 29fdacfdba..2897b6449f 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -117,7 +117,9 @@ def compute_index_rows( try: parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] parquet_file_items: List[ParquetFileItem] = [ - parquet_file_item for parquet_file_item in parquet_files_content if parquet_file_item["config"] == config + parquet_file_item + for parquet_file_item in parquet_files_content + if parquet_file_item["config"] == config and parquet_file_item["split"] == split ] if not parquet_file_items: raise ParquetResponseEmptyError("No parquet files found.") @@ -125,11 +127,11 @@ def compute_index_rows( raise PreviousStepFormatError("Previous step did not return the expected content.") from e fs = HTTPFileSystem() - source_urls = [parquet_file_item["url"] for parquet_file_item in parquet_file_items] + parquet_urls = [parquet_file_item["url"] for parquet_file_item in parquet_file_items] desc = f"{dataset}/{config}" try: parquet_files: List[ParquetFile] = thread_map( - partial(get_parquet_file, fs=fs, hf_token=hf_token), source_urls, desc=desc, unit="pq", disable=True + partial(get_parquet_file, fs=fs, hf_token=hf_token), parquet_urls, desc=desc, unit="pq", disable=True ) except Exception as e: raise FileSystemError(f"Could not read the parquet files: {e}") from e @@ -143,7 +145,7 @@ def compute_index_rows( if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") - # look for image, audio and binary columns, if present, raise exeception do not supported yet and index everything + # look for image, audio and binary columns, if present, raise exeception (not supported yet) if any( feature for feature in features.values() @@ -154,14 +156,6 @@ def compute_index_rows( ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") - try: - parquet_urls = 
[content["url"] for content in parquet_files_content if content["split"] == split] - - if not parquet_urls: - raise ParquetResponseEmptyError("No parquet files found.") - except Exception as e: - raise PreviousStepFormatError("Previous step did not return the expected content.") from e - # create duckdb index location dir_path = create_index_dir_split( dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory @@ -174,7 +168,7 @@ def compute_index_rows( duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="fts")) duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) - # index + # index all columns con = duckdb.connect(str(db_location)) con.sql(CREATE_SEQUENCE_COMMAND) con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index fe6413489d..357b91e0ed 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -157,4 +157,11 @@ def datasets() -> Mapping[str, Dataset]: dtype=pd.StringDtype(storage="python"), ) ), + "text_image": other( + { + "col": str(Path(__file__).resolve().parent / "data" / "test_image_rgb.jpg"), + "text": "This is a text", + }, + {"col": Image(), "text": Value(dtype="string")}, + ), } diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index db01d0fb3a..4ab34e7277 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -281,6 +281,13 @@ def hub_public_duckdb_index(datasets: Mapping[str, Dataset]) -> Iterator[str]: delete_hub_dataset_repo(repo_id=repo_id) +@pytest.fixture(scope="session") +def hub_public_text_image(datasets: Mapping[str, Dataset]) -> Iterator[str]: + repo_id = create_hub_dataset_repo(prefix="text_image", dataset=datasets["text_image"]) + yield repo_id + delete_hub_dataset_repo(repo_id=repo_id) + + class HubDatasetTest(TypedDict): name: str config_names_response: Any @@ -514,6 +521,26 @@ def get_IMAGE_rows(dataset: str) -> Any: ] +TEXT_IMAGE_cols = { + "col": {"_type": "Image"}, + "text": {"_type": "Value", "dtype": "string"}, +} + + +def get_TEXT_IMAGE_rows(dataset: str) -> Any: + dataset, config, split = get_default_config_split(dataset) + return [ + { + "col": { + "src": f"http://localhost/assets/{dataset}/--/{config}/{split}/0/col/image.jpg", + "height": 480, + "width": 640, + }, + "text": "This is a text", + } + ] + + IMAGES_LIST_cols = { "col": [{"_type": "Image"}], } @@ -596,6 +623,7 @@ def hub_datasets( hub_public_external_files: str, hub_public_spawning_opt_in_out: str, hub_public_duckdb_index: str, + hub_public_text_image: str, ) -> HubDatasets: return { "does_not_exist": { @@ -731,4 +759,13 @@ def hub_datasets( dataset=hub_public_duckdb_index, data_type="csv" ), }, + "text_image": { + "name": hub_public_text_image, + "config_names_response": create_config_names_response(hub_public_text_image), + "splits_response": create_splits_response(hub_public_text_image), + "first_rows_response": create_first_rows_response( + hub_public_text_image, TEXT_IMAGE_cols, get_TEXT_IMAGE_rows(hub_public_text_image) + ), + "parquet_and_info_response": None, + }, } diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 595e12d022..ab89bc4b11 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -24,110 +24,119 @@ 
@pytest.fixture -def get_parquet_job_runner( - libraries_resource: LibrariesResource, +def get_job_runner( + duckdb_index_directory: StrPath, cache_mongo_resource: CacheMongoResource, queue_mongo_resource: QueueMongoResource, -) -> GetParquetJobRunner: +) -> GetJobRunner: def _get_job_runner( dataset: str, config: str, + split: str, app_config: AppConfig, - ) -> ConfigParquetAndInfoJobRunner: - processing_step_name = ConfigParquetAndInfoJobRunner.get_job_type() + ) -> SplitDuckDbIndexJobRunner: + processing_step_name = SplitDuckDbIndexJobRunner.get_job_type() processing_graph = ProcessingGraph( { - "dataset-level": {"input_type": "dataset"}, + "dataset-step": {"input_type": "dataset"}, + "config-parquet": { + "input_type": "config", + "triggered_by": "dataset-step", + "provides_config_parquet": True, + }, + "config-split-names-from-streaming": { + "input_type": "config", + "triggered_by": "dataset-step", + }, processing_step_name: { "input_type": "dataset", - "job_runner_version": ConfigParquetAndInfoJobRunner.get_job_runner_version(), - "triggered_by": "dataset-level", + "job_runner_version": SplitDuckDbIndexJobRunner.get_job_runner_version(), + "triggered_by": ["config-parquet", "config-split-names-from-streaming"], }, } ) - return ConfigParquetAndInfoJobRunner( + return SplitDuckDbIndexJobRunner( job_info={ - "type": ConfigParquetAndInfoJobRunner.get_job_type(), + "type": SplitDuckDbIndexJobRunner.get_job_type(), "params": { "dataset": dataset, "revision": "revision", "config": config, - "split": None, + "split": split, }, "job_id": "job_id", "priority": Priority.NORMAL, }, app_config=app_config, processing_step=processing_graph.get_processing_step(processing_step_name), - hf_datasets_cache=libraries_resource.hf_datasets_cache, + duckdb_index_directory=duckdb_index_directory, ) return _get_job_runner @pytest.fixture -def get_job_runner( - duckdb_index_directory: StrPath, +def get_parquet_job_runner( + libraries_resource: LibrariesResource, cache_mongo_resource: CacheMongoResource, queue_mongo_resource: QueueMongoResource, -) -> GetJobRunner: +) -> GetParquetJobRunner: def _get_job_runner( dataset: str, config: str, - split: str, app_config: AppConfig, - ) -> SplitDuckDbIndexJobRunner: - processing_step_name = SplitDuckDbIndexJobRunner.get_job_type() + ) -> ConfigParquetAndInfoJobRunner: + processing_step_name = ConfigParquetAndInfoJobRunner.get_job_type() processing_graph = ProcessingGraph( { - "dataset-step": {"input_type": "dataset"}, - "config-parquet": { - "input_type": "config", - "triggered_by": "dataset-step", - "provides_config_parquet": True, - }, - "config-split-names-from-streaming": { - "input_type": "config", - "triggered_by": "dataset-step", - }, + "dataset-level": {"input_type": "dataset"}, processing_step_name: { - "input_type": "dataset", - "job_runner_version": SplitDuckDbIndexJobRunner.get_job_runner_version(), - "triggered_by": ["config-parquet", "config-split-names-from-streaming"], + "input_type": "config", + "job_runner_version": ConfigParquetAndInfoJobRunner.get_job_runner_version(), + "triggered_by": "dataset-level", }, } ) - return SplitDuckDbIndexJobRunner( + return ConfigParquetAndInfoJobRunner( job_info={ - "type": SplitDuckDbIndexJobRunner.get_job_type(), + "type": ConfigParquetAndInfoJobRunner.get_job_type(), "params": { "dataset": dataset, "revision": "revision", "config": config, - "split": split, + "split": None, }, "job_id": "job_id", "priority": Priority.NORMAL, }, app_config=app_config, 
processing_step=processing_graph.get_processing_step(processing_step_name), - duckdb_index_directory=duckdb_index_directory, + hf_datasets_cache=libraries_resource.hf_datasets_cache, ) return _get_job_runner +@pytest.mark.parametrize( + "hub_dataset_name,expected_error_code", + [ + ("duckdb_index", None), + ("text_image", "UnsupportedIndexableColumnsError"), + ("public", "NoIndexableColumnsError"), + ], +) def test_compute( get_parquet_job_runner: GetParquetJobRunner, get_job_runner: GetJobRunner, app_config: AppConfig, hub_datasets: HubDatasets, + hub_dataset_name: str, + expected_error_code: str, ) -> None: - hub_duckdb_index = "duckdb_index" - dataset = hub_datasets[hub_duckdb_index]["name"] - config_names = hub_datasets[hub_duckdb_index]["config_names_response"] - config = hub_datasets[hub_duckdb_index]["config_names_response"]["config_names"][0]["config"] - splits_response = hub_datasets[hub_duckdb_index]["splits_response"] + dataset = hub_datasets[hub_dataset_name]["name"] + config_names = hub_datasets[hub_dataset_name]["config_names_response"] + config = hub_datasets[hub_dataset_name]["config_names_response"]["config_names"][0]["config"] + splits_response = hub_datasets[hub_dataset_name]["splits_response"] split = "train" upsert_response( @@ -159,5 +168,14 @@ def test_compute( assert parquet_response job_runner = get_job_runner(dataset, config, split, app_config) - response = job_runner.compute() - assert response + + if expected_error_code: + with pytest.raises(Exception) as e: + job_runner.compute() + assert e.typename == expected_error_code + else: + response = job_runner.compute() + assert response + content = response.content + assert content["url"] is not None + assert content["filename"] is not None From 2afe9f3ee075d8f36243354a8f76d813ddc8d208 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 12 Jun 2023 17:38:58 -0400 Subject: [PATCH 17/52] Adding chart configuration --- chart/templates/_envWorker.tpl | 18 +++++++++++++++++- chart/values.yaml | 7 +++++++ .../worker/job_runners/split/duckdb_index.py | 2 +- tools/docker-compose-datasets-server.yml | 4 ++++ tools/docker-compose-dev-datasets-server.yml | 4 ++++ 5 files changed, 33 insertions(+), 2 deletions(-) diff --git a/chart/templates/_envWorker.tpl b/chart/templates/_envWorker.tpl index ffbec5ae8c..4292090634 100644 --- a/chart/templates/_envWorker.tpl +++ b/chart/templates/_envWorker.tpl @@ -84,5 +84,21 @@ value: {{ .Values.optInOutUrlsScan.urlsNumberPerBatch | quote }} - name: OPT_IN_OUT_URLS_SCAN_SPAWNING_URL value: {{ .Values.optInOutUrlsScan.spawningUrl | quote }} - +# specific to 'split-duckdb-index' job runner +- name: DUCKDB_INDEX_COMMIT_MESSAGE + value: {{ .Values.duckDBIndex.commitMessage | quote }} +- name: DUCKDB_INDEX_COMMITTER_HF_TOKEN + {{- if .Values.secrets.appParquetConverterHfToken.fromSecret }} + valueFrom: + secretKeyRef: + name: {{ .Values.secrets.appParquetConverterHfToken.secretName | quote }} + key: HF_TOKEN + optional: false + {{- else }} + value: {{ .Values.secrets.appParquetConverterHfToken.value }} + {{- end }} +- name: DUCKDB_INDEX_TARGET_REVISION + value: {{ .Values.duckDBIndex.targetRevision | quote }} +- name: DUCKDB_INDEX_URL_TEMPLATE + value: {{ .Values.duckDBIndex.urlTemplate | quote }} {{- end -}} diff --git a/chart/values.yaml b/chart/values.yaml index 2d9754ed57..f45ecbdf87 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -208,6 +208,13 @@ parquetMetadata: duckDBIndex: # Directory on the shared storage (duckdb db files used for datasets indexing) storageDirectory: 
"/duckdb-index" + # the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`. + commitMessage: "Update duckdb index files" + # the git revision of the dataset where to store the duckdb index file. Defaults to `duckdb/index`. + targetRevision: "duckdb/index" + # the URL template to build the duckdb index file URL. Defaults to `/datasets/%s/resolve/%s/%s`. + urlTemplate: "/datasets/%s/resolve/%s/%s" + # Directory where the cache data will be stored cacheDirectory: "/datasets-server-cache" diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 2897b6449f..becfec0b94 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -54,9 +54,9 @@ CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" +# TODO: What if __id field already exist? INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" -# TODO: What if __id field already exist? def create_index_item( diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 953a1ca1db..5e85f4de62 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -113,6 +113,10 @@ services: PARQUET_AND_INFO_URL_TEMPLATE: ${PARQUET_AND_INFO_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} + DUCKDB_INDEX_COMMIT_MESSAGE: ${DUCKDB_INDEX_COMMIT_MESSAGE-Update duckdb index file} + DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} + DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} + DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index 9b0ce15a25..59e2b90195 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -117,6 +117,10 @@ services: PARQUET_AND_INFO_URL_TEMPLATE: ${PARQUET_AND_INFO_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} + DUCKDB_INDEX_COMMIT_MESSAGE: ${DUCKDB_INDEX_COMMIT_MESSAGE-Update duckdb index files} + DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} + DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} + DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} From 0bfcb62f2507e70da5f045556f14acfecf865dee Mon Sep 17 00:00:00 2001 
From: Andrea Francis Soria Jimenez Date: Tue, 13 Jun 2023 07:58:55 -0400 Subject: [PATCH 18/52] Apply suggestions from code review Co-authored-by: Sylvain Lesage --- services/worker/src/worker/job_runners/split/duckdb_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index becfec0b94..74ab919a14 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -145,7 +145,7 @@ def compute_index_rows( if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") - # look for image, audio and binary columns, if present, raise exeception (not supported yet) + # look for image, audio and binary columns, if present, raise exception (not supported yet) if any( feature for feature in features.values() @@ -174,7 +174,7 @@ def compute_index_rows( con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future - # see https://duckdb.org/docs/extensions/full_text_search.html for more deails about 'stemmer' parameter + # see https://duckdb.org/docs/extensions/full_text_search.html for more details about 'stemmer' parameter con.sql(CREATE_INDEX_COMMAND) # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) From 2ff4f916d01b2b17317fd86d92ecf90e5037e8cb Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 07:59:13 -0400 Subject: [PATCH 19/52] Change ParquetFileItem to SplitHubFile --- chart/static-files/openapi.json | 4 +- libs/libcommon/src/libcommon/parquet_utils.py | 12 +--- libs/libcommon/src/libcommon/utils.py | 9 +++ .../src/worker/job_runners/config/parquet.py | 5 +- .../job_runners/config/parquet_and_info.py | 8 +-- .../job_runners/config/parquet_metadata.py | 6 +- .../src/worker/job_runners/dataset/parquet.py | 7 ++- .../worker/job_runners/split/duckdb_index.py | 62 ++++++------------- services/worker/src/worker/utils.py | 18 ------ .../tests/job_runners/config/test_parquet.py | 11 ++-- .../config/test_parquet_metadata.py | 7 +-- .../tests/job_runners/dataset/test_parquet.py | 11 ++-- 12 files changed, 59 insertions(+), 101 deletions(-) diff --git a/chart/static-files/openapi.json b/chart/static-files/openapi.json index 11b6b1fedb..59a058d956 100644 --- a/chart/static-files/openapi.json +++ b/chart/static-files/openapi.json @@ -925,11 +925,11 @@ "properties": { "parquet_files": { "type": "array", - "items": { "$ref": "#/components/schemas/ParquetFileItem" } + "items": { "$ref": "#/components/schemas/SplitHubFile" } } } }, - "ParquetFileItem": { + "SplitHubFile": { "type": "object", "required": ["dataset", "config", "split", "url", "filename", "size"], "properties": { diff --git a/libs/libcommon/src/libcommon/parquet_utils.py b/libs/libcommon/src/libcommon/parquet_utils.py index 3a4988889a..2fc92032d1 100644 --- a/libs/libcommon/src/libcommon/parquet_utils.py +++ b/libs/libcommon/src/libcommon/parquet_utils.py @@ -21,6 +21,7 @@ from libcommon.processing_graph import ProcessingGraph from libcommon.prometheus import StepProfiler from libcommon.simple_cache import get_previous_step_or_raise +from libcommon.utils import SplitHubFile StrPath = Union[str, PathLike[str]] @@ -37,15 +38,6 @@ class FileSystemError(Exception): pass -class ParquetFileItem(TypedDict): - 
dataset: str - config: str - split: str - url: str - filename: str - size: int - - class ParquetFileMetadataItem(TypedDict): dataset: str config: str @@ -157,7 +149,7 @@ def query(self, offset: int, length: int) -> pa.Table: @staticmethod def from_parquet_file_items( - parquet_file_items: List[ParquetFileItem], + parquet_file_items: List[SplitHubFile], dataset: str, config: str, split: str, diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index 301a760956..b921ea787b 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -65,6 +65,15 @@ class JobResult(TypedDict): output: Optional[JobOutput] +class SplitHubFile(TypedDict): + dataset: str + config: str + split: str + url: str + filename: str + size: int + + # orjson is used to get rid of errors with datetime (see allenai/c4) def orjson_default(obj: Any) -> Any: if isinstance(obj, bytes): diff --git a/services/worker/src/worker/job_runners/config/parquet.py b/services/worker/src/worker/job_runners/config/parquet.py index 4fa17de2ed..2b50e2aeb9 100644 --- a/services/worker/src/worker/job_runners/config/parquet.py +++ b/services/worker/src/worker/job_runners/config/parquet.py @@ -7,13 +7,14 @@ from libcommon.constants import PROCESSING_STEP_CONFIG_PARQUET_VERSION from libcommon.exceptions import PreviousStepFormatError from libcommon.simple_cache import get_previous_step_or_raise +from libcommon.utils import SplitHubFile from worker.job_runners.config.config_job_runner import ConfigJobRunner -from worker.utils import CompleteJobResult, ParquetFileItem +from worker.utils import CompleteJobResult class ConfigParquetResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] def compute_parquet_response(dataset: str, config: str) -> ConfigParquetResponse: diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index daf9ed701c..24ee3c9f3f 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -66,16 +66,16 @@ ) from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise -from libcommon.utils import JobInfo +from libcommon.utils import JobInfo, SplitHubFile from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig, ParquetAndInfoConfig from worker.job_runners.config.config_job_runner import ConfigCachedJobRunner -from worker.utils import CompleteJobResult, ParquetFileItem, hf_hub_url +from worker.utils import CompleteJobResult, hf_hub_url class ConfigParquetAndInfoResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] dataset_info: Dict[str, Any] @@ -118,7 +118,7 @@ def create_parquet_file_item( hf_endpoint: str, target_revision: str, url_template: str, -) -> ParquetFileItem: +) -> SplitHubFile: if repo_file.size is None: raise ValueError(f"Cannot get size of {repo_file.rfilename}") _, split = parse_repo_filename(repo_file.rfilename) diff --git a/services/worker/src/worker/job_runners/config/parquet_metadata.py b/services/worker/src/worker/job_runners/config/parquet_metadata.py index 6c361168f9..ba58cfad48 100644 --- a/services/worker/src/worker/job_runners/config/parquet_metadata.py +++ b/services/worker/src/worker/job_runners/config/parquet_metadata.py @@ -15,14 +15,14 @@ from libcommon.processing_graph import 
ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath -from libcommon.utils import JobInfo +from libcommon.utils import JobInfo, SplitHubFile from libcommon.viewer_utils.parquet_metadata import create_parquet_metadata_file from pyarrow.parquet import ParquetFile from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig from worker.job_runners.config.config_job_runner import ConfigJobRunner -from worker.utils import CompleteJobResult, ParquetFileItem, get_parquet_file +from worker.utils import CompleteJobResult, get_parquet_file class ParquetFileMetadataItem(TypedDict): @@ -74,7 +74,7 @@ def compute_parquet_metadata_response( config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) try: parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] - parquet_file_items: List[ParquetFileItem] = [ + parquet_file_items: List[SplitHubFile] = [ parquet_file_item for parquet_file_item in parquet_files_content if parquet_file_item["config"] == config ] if not parquet_file_items: diff --git a/services/worker/src/worker/job_runners/dataset/parquet.py b/services/worker/src/worker/job_runners/dataset/parquet.py index 7c6bb4a82c..4dc81bc5f5 100644 --- a/services/worker/src/worker/job_runners/dataset/parquet.py +++ b/services/worker/src/worker/job_runners/dataset/parquet.py @@ -12,14 +12,15 @@ get_previous_step_or_raise, get_response, ) +from libcommon.utils import SplitHubFile from worker.job_runners.config.parquet import ConfigParquetResponse from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner -from worker.utils import JobResult, ParquetFileItem, PreviousJob +from worker.utils import JobResult, PreviousJob class DatasetParquetResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] pending: list[PreviousJob] failed: list[PreviousJob] @@ -47,7 +48,7 @@ def compute_sizes_response(dataset: str) -> Tuple[DatasetParquetResponse, float] raise PreviousStepFormatError("Previous step did not return the expected content: 'config_names'.") try: - parquet_files: list[ParquetFileItem] = [] + parquet_files: list[SplitHubFile] = [] total = 0 pending = [] failed = [] diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 74ab919a14..3dff3b9936 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -14,7 +14,7 @@ CommitOperationAdd, CommitOperationDelete, ) -from huggingface_hub.hf_api import HfApi, RepoFile +from huggingface_hub.hf_api import HfApi from huggingface_hub.utils._errors import RepositoryNotFoundError from libcommon.config import DuckDbIndexConfig from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION @@ -31,20 +31,14 @@ from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath, remove_dir -from libcommon.utils import JobInfo +from libcommon.utils import JobInfo, SplitHubFile from libcommon.viewer_utils.index_utils import create_index_dir_split from pyarrow.parquet import ParquetFile from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitJobRunner -from worker.utils import ( - CompleteJobResult, - 
IndexRowsResponse, - ParquetFileItem, - get_parquet_file, - hf_hub_url, -) +from worker.utils import CompleteJobResult, get_parquet_file, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" @@ -59,33 +53,6 @@ LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" -def create_index_item( - repo_file: RepoFile, - dataset: str, - config: str, - split: str, - hf_endpoint: str, - target_revision: str, - url_template: str, -) -> IndexRowsResponse: - if repo_file.size is None: - raise ValueError(f"Cannot get size of {repo_file.rfilename}") - return { - "dataset": dataset, - "config": config, - "split": split, - "url": hf_hub_url( - repo_id=dataset, - filename=repo_file.rfilename, - hf_endpoint=hf_endpoint, - revision=target_revision, - url_template=url_template, - ), - "filename": Path(repo_file.rfilename).name, - "size": repo_file.size, - } - - def compute_index_rows( dataset: str, config: str, @@ -97,7 +64,7 @@ def compute_index_rows( url_template: str, hf_token: Optional[str], committer_hf_token: Optional[str], -) -> IndexRowsResponse: +) -> SplitHubFile: logging.info(f"get split-duckdb-index for dataset={dataset} config={config} split={split}") # validate split @@ -116,7 +83,7 @@ def compute_index_rows( config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) try: parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] - parquet_file_items: List[ParquetFileItem] = [ + parquet_file_items: List[SplitHubFile] = [ parquet_file_item for parquet_file_item in parquet_files_content if parquet_file_item["config"] == config and parquet_file_item["split"] == split @@ -225,14 +192,23 @@ def compute_index_rows( remove_dir(dir_path) # remove index file since it is no more used and is stored in NFS - return create_index_item( - repo_file=repo_files[0], + + repo_file = repo_files[0] + if repo_file.size is None: + raise ValueError(f"Cannot get size of {repo_file.rfilename}") + return SplitHubFile( dataset=dataset, config=config, split=split, - hf_endpoint=hf_endpoint, - target_revision=target_revision, - url_template=url_template, + url=hf_hub_url( + repo_id=dataset, + filename=repo_file.rfilename, + hf_endpoint=hf_endpoint, + revision=target_revision, + url_template=url_template, + ), + filename=Path(repo_file.rfilename).name, + size=repo_file.size, ) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index e414e34d70..58a27c25fb 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -134,15 +134,6 @@ class ImageUrlColumnsResponse(TypedDict): columns: List[str] -class IndexRowsResponse(TypedDict): - dataset: str - config: str - split: str - url: str - filename: str - size: int - - Row = Mapping[str, Any] @@ -151,15 +142,6 @@ class RowsContent(TypedDict): all_fetched: bool -class ParquetFileItem(TypedDict): - dataset: str - config: str - split: str - url: str - filename: str - size: int - - # TODO: separate functions from common classes and named dicts otherwise this file will continue growing diff --git a/services/worker/tests/job_runners/config/test_parquet.py b/services/worker/tests/job_runners/config/test_parquet.py index eeecda8bfd..677e49715a 100644 --- a/services/worker/tests/job_runners/config/test_parquet.py +++ b/services/worker/tests/job_runners/config/test_parquet.py @@ -9,7 +9,7 @@ from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource from 
libcommon.simple_cache import CachedArtifactError, upsert_response -from libcommon.utils import Priority +from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig from worker.job_runners.config.parquet import ( @@ -17,7 +17,6 @@ ConfigParquetResponse, ) from worker.job_runners.config.parquet_and_info import ConfigParquetAndInfoResponse -from worker.utils import ParquetFileItem @pytest.fixture(autouse=True) @@ -78,10 +77,10 @@ def _get_job_runner( HTTPStatus.OK, ConfigParquetAndInfoResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0 ), - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url2", filename="filename2", size=0 ), ], @@ -90,10 +89,10 @@ def _get_job_runner( None, ConfigParquetResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0 ), - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url2", filename="filename2", size=0 ), ] diff --git a/services/worker/tests/job_runners/config/test_parquet_metadata.py b/services/worker/tests/job_runners/config/test_parquet_metadata.py index dbc9eb62d9..875e1e13bb 100644 --- a/services/worker/tests/job_runners/config/test_parquet_metadata.py +++ b/services/worker/tests/job_runners/config/test_parquet_metadata.py @@ -16,7 +16,7 @@ from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import CachedArtifactError, upsert_response from libcommon.storage import StrPath -from libcommon.utils import Priority +from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig from worker.job_runners.config.parquet import ConfigParquetResponse @@ -25,7 +25,6 @@ ConfigParquetMetadataResponse, ParquetFileMetadataItem, ) -from worker.utils import ParquetFileItem @pytest.fixture(autouse=True) @@ -91,10 +90,10 @@ def _get_job_runner( HTTPStatus.OK, ConfigParquetResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0 ), - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url2", filename="filename2", size=0 ), ], diff --git a/services/worker/tests/job_runners/dataset/test_parquet.py b/services/worker/tests/job_runners/dataset/test_parquet.py index ba377eb65a..c5257d01a8 100644 --- a/services/worker/tests/job_runners/dataset/test_parquet.py +++ b/services/worker/tests/job_runners/dataset/test_parquet.py @@ -9,7 +9,7 @@ from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import CachedArtifactError, upsert_response -from libcommon.utils import Priority +from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig from worker.job_runners.config.parquet import ConfigParquetResponse @@ -17,7 +17,6 @@ DatasetParquetJobRunner, DatasetParquetResponse, ) -from worker.utils import ParquetFileItem from ..utils import UpstreamResponse @@ -93,7 +92,7 @@ def _get_job_runner( http_status=HTTPStatus.OK, content=ConfigParquetResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", @@ -111,7 +110,7 @@ def _get_job_runner( http_status=HTTPStatus.OK, content=ConfigParquetResponse( parquet_files=[ - ParquetFileItem( + 
SplitHubFile( dataset="ok", config="config_2", split="train", @@ -126,10 +125,10 @@ def _get_job_runner( None, DatasetParquetResponse( parquet_files=[ - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0 ), - ParquetFileItem( + SplitHubFile( dataset="ok", config="config_2", split="train", url="url2", filename="filename2", size=0 ), ], From 3c9b4eef5f394e3f8dc0b60f7c740cfbdfb666b6 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 14:01:58 -0400 Subject: [PATCH 20/52] Inherit from SplitCachedJobRunner --- .../src/libcommon/viewer_utils/index_utils.py | 17 ------------- .../job_runners/_datasets_based_job_runner.py | 1 - .../worker/job_runners/split/duckdb_index.py | 24 ++++++------------- .../job_runners/split/test_duckdb_index.py | 2 ++ 4 files changed, 9 insertions(+), 35 deletions(-) delete mode 100644 libs/libcommon/src/libcommon/viewer_utils/index_utils.py diff --git a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py b/libs/libcommon/src/libcommon/viewer_utils/index_utils.py deleted file mode 100644 index 5beb9f6a31..0000000000 --- a/libs/libcommon/src/libcommon/viewer_utils/index_utils.py +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2023 The HuggingFace Authors. - -from os import makedirs -from pathlib import Path - -from libcommon.storage import StrPath - -DATASET_SEPARATOR = "--" -INDEX_DIR_MODE = 0o755 - - -def create_index_dir_split(dataset: str, config: str, split: str, index_directory: StrPath) -> Path: - split_path = f"{dataset}/{DATASET_SEPARATOR}/{config}/{split}" - dir_path = Path(index_directory).resolve() / split_path - makedirs(dir_path, INDEX_DIR_MODE, exist_ok=True) - return dir_path diff --git a/services/worker/src/worker/job_runners/_datasets_based_job_runner.py b/services/worker/src/worker/job_runners/_datasets_based_job_runner.py index 310b4bde64..703e4a6353 100644 --- a/services/worker/src/worker/job_runners/_datasets_based_job_runner.py +++ b/services/worker/src/worker/job_runners/_datasets_based_job_runner.py @@ -23,7 +23,6 @@ class DatasetsBasedJobRunner(JobRunner): datasets_based_config: DatasetsBasedConfig base_datasets_cache: Path - # the datasets library cache directories (for data, downloads, extraction, NOT for modules) # the job runner should have only one running job at the same time, then it should # be safe to use a global variable (and to set the datasets cache globally) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 3dff3b9936..9ad20ea238 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -30,14 +30,13 @@ ) from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise -from libcommon.storage import StrPath, remove_dir +from libcommon.storage import StrPath from libcommon.utils import JobInfo, SplitHubFile -from libcommon.viewer_utils.index_utils import create_index_dir_split from pyarrow.parquet import ParquetFile from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig -from worker.job_runners.split.split_job_runner import SplitJobRunner +from worker.job_runners.split.split_job_runner import SplitCachedJobRunner from worker.utils import CompleteJobResult, get_parquet_file, hf_hub_url DATASET_TYPE = "dataset" @@ -57,7 +56,7 @@ def compute_index_rows( 
dataset: str, config: str, split: str, - duckdb_index_directory: StrPath, + duckdb_index_file_directory: StrPath, target_revision: str, hf_endpoint: str, commit_message: str, @@ -123,12 +122,6 @@ def compute_index_rows( ): raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") - # create duckdb index location - dir_path = create_index_dir_split( - dataset=dataset, config=config, split=split, index_directory=duckdb_index_directory - ) - db_location = dir_path / DUCKDB_DEFAULT_INDEX_FILENAME - # configure duckdb extensions duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="httpfs")) @@ -136,6 +129,7 @@ def compute_index_rows( duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) # index all columns + db_location = f"{duckdb_index_file_directory}/{DUCKDB_DEFAULT_INDEX_FILENAME}" con = duckdb.connect(str(db_location)) con.sql(CREATE_SEQUENCE_COMMAND) con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") @@ -190,9 +184,6 @@ def compute_index_rows( if len(repo_files) != 1: logging.warning(f"Found {len(repo_files)} index files, should be only 1") - remove_dir(dir_path) - # remove index file since it is no more used and is stored in NFS - repo_file = repo_files[0] if repo_file.size is None: raise ValueError(f"Cannot get size of {repo_file.rfilename}") @@ -212,9 +203,8 @@ def compute_index_rows( ) -class SplitDuckDbIndexJobRunner(SplitJobRunner): +class SplitDuckDbIndexJobRunner(SplitCachedJobRunner): duckdb_index_config: DuckDbIndexConfig - duckdb_index_directory: StrPath def __init__( self, @@ -227,8 +217,8 @@ def __init__( job_info=job_info, app_config=app_config, processing_step=processing_step, + hf_datasets_cache=Path(duckdb_index_directory).resolve(), ) - self.duckdb_index_directory = duckdb_index_directory self.duckdb_index_config = app_config.duckdb_index @staticmethod @@ -245,7 +235,7 @@ def compute(self) -> CompleteJobResult: dataset=self.dataset, config=self.config, split=self.split, - duckdb_index_directory=self.duckdb_index_directory, + duckdb_index_file_directory=self.datasets_cache, hf_token=self.app_config.common.hf_token, url_template=self.duckdb_index_config.url_template, commit_message=self.duckdb_index_config.commit_message, diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index ab89bc4b11..01d862140b 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -174,8 +174,10 @@ def test_compute( job_runner.compute() assert e.typename == expected_error_code else: + job_runner.pre_compute() response = job_runner.compute() assert response content = response.content assert content["url"] is not None assert content["filename"] is not None + job_runner.post_compute() From c78e99ae9e7b731130400795c1169e65bea72693 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 14:32:15 -0400 Subject: [PATCH 21/52] Fix style --- chart/templates/_envWorker.tpl | 1 + .../worker/src/worker/job_runners/_datasets_based_job_runner.py | 1 + services/worker/src/worker/job_runners/split/duckdb_index.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/chart/templates/_envWorker.tpl b/chart/templates/_envWorker.tpl index 4292090634..b3add1de06 100644 --- a/chart/templates/_envWorker.tpl +++ b/chart/templates/_envWorker.tpl @@ -84,6 +84,7 @@ value: {{ 
.Values.optInOutUrlsScan.urlsNumberPerBatch | quote }} - name: OPT_IN_OUT_URLS_SCAN_SPAWNING_URL value: {{ .Values.optInOutUrlsScan.spawningUrl | quote }} + # specific to 'split-duckdb-index' job runner - name: DUCKDB_INDEX_COMMIT_MESSAGE value: {{ .Values.duckDBIndex.commitMessage | quote }} diff --git a/services/worker/src/worker/job_runners/_datasets_based_job_runner.py b/services/worker/src/worker/job_runners/_datasets_based_job_runner.py index 703e4a6353..310b4bde64 100644 --- a/services/worker/src/worker/job_runners/_datasets_based_job_runner.py +++ b/services/worker/src/worker/job_runners/_datasets_based_job_runner.py @@ -23,6 +23,7 @@ class DatasetsBasedJobRunner(JobRunner): datasets_based_config: DatasetsBasedConfig base_datasets_cache: Path + # the datasets library cache directories (for data, downloads, extraction, NOT for modules) # the job runner should have only one running job at the same time, then it should # be safe to use a global variable (and to set the datasets cache globally) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 9ad20ea238..fb4c75b802 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -56,7 +56,7 @@ def compute_index_rows( dataset: str, config: str, split: str, - duckdb_index_file_directory: StrPath, + duckdb_index_file_directory: Optional[Path], target_revision: str, hf_endpoint: str, commit_message: str, From 6eba4d9f00a12e29b9a355c6d9ffaea8e13d0f36 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 16:01:29 -0400 Subject: [PATCH 22/52] Depends on info features instead of parquet schema --- libs/libcommon/src/libcommon/config.py | 2 +- libs/libcommon/src/libcommon/exceptions.py | 8 -- .../worker/job_runners/split/duckdb_index.py | 81 +++++++++---------- services/worker/tests/fixtures/datasets.py | 7 -- services/worker/tests/fixtures/hub.py | 17 ---- .../job_runners/split/test_duckdb_index.py | 5 +- 6 files changed, 39 insertions(+), 81 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 114286cddc..7f6ef16377 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -353,7 +353,7 @@ class ProcessingGraphConfig: "triggered_by": [ "config-split-names-from-info", "config-split-names-from-streaming", - "config-parquet", + "config-parquet-and-info", ], "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, }, diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index cdf8a3d95c..64e77be5a3 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -116,7 +116,6 @@ def as_response(self) -> ErrorResponse: "TooManyColumnsError", "UnexpectedError", "UnsupportedExternalFilesError", - "UnsupportedIndexableColumnsError", ] @@ -491,13 +490,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) -class UnsupportedIndexableColumnsError(CacheableError): - """Raised when some unsupported indexable columns present.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedIndexableColumnsError", cause, True) - - class NotAvailableIndexFileError(CacheableError):
"""Raised when no duckdb index file was found for split.""" diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index fb4c75b802..cc1c320998 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -2,13 +2,10 @@ # Copyright 2023 The HuggingFace Authors. import logging -from functools import partial from pathlib import Path from typing import List, Optional, Set import duckdb -from datasets import Features -from fsspec.implementations.http import HTTPFileSystem from huggingface_hub._commit_api import ( CommitOperation, CommitOperationAdd, @@ -20,30 +17,25 @@ from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( DatasetNotFoundError, - FileSystemError, NoIndexableColumnsError, NotAvailableIndexFileError, ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, - UnsupportedIndexableColumnsError, ) from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath from libcommon.utils import JobInfo, SplitHubFile -from pyarrow.parquet import ParquetFile -from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitCachedJobRunner -from worker.utils import CompleteJobResult, get_parquet_file, hf_hub_url +from worker.utils import CompleteJobResult, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" -UNSUPPORTED_FEATURES_MAGIC_STRINGS = ["'binary'", "Audio", "Image"] CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" @@ -78,50 +70,48 @@ def compute_index_rows( if split not in [split_item["split"] for split_item in splits_content]: raise SplitNotFoundError(f"The split '{split}' does not exist for the config '{config}' of the dataset.") - # get parquet content - config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config) - try: - parquet_files_content = config_parquet_best_response.response["content"]["parquet_files"] - parquet_file_items: List[SplitHubFile] = [ - parquet_file_item - for parquet_file_item in parquet_files_content - if parquet_file_item["config"] == config and parquet_file_item["split"] == split - ] - if not parquet_file_items: - raise ParquetResponseEmptyError("No parquet files found.") - except Exception as e: - raise PreviousStepFormatError("Previous step did not return the expected content.") from e + # get parquet urls and dataset_info + config_parquet_and_info_step = "config-parquet-and-info" + parquet_and_info_best_response = get_previous_step_or_raise( + kinds=[config_parquet_and_info_step], + dataset=dataset, + config=config, + ) + content_parquet_and_info = parquet_and_info_best_response.response["content"] + if "parquet_files" not in content_parquet_and_info: + raise PreviousStepFormatError( + f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'parquet_files'" + ) - fs = HTTPFileSystem() - parquet_urls = [parquet_file_item["url"] for parquet_file_item in parquet_file_items] - desc = 
f"{dataset}/{config}" - try: - parquet_files: List[ParquetFile] = thread_map( - partial(get_parquet_file, fs=fs, hf_token=hf_token), parquet_urls, desc=desc, unit="pq", disable=True + if "dataset_info" not in content_parquet_and_info: + raise PreviousStepFormatError( + f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'dataset_info'" ) - except Exception as e: - raise FileSystemError(f"Could not read the parquet files: {e}") from e + + parquet_urls = [ + parquet_file["url"] + for parquet_file in content_parquet_and_info["parquet_files"] + if parquet_file["config"] == config and parquet_file["split"] == split + ] + + if not parquet_urls: + raise ParquetResponseEmptyError("No parquet files found.") # get the features - features = Features.from_arrow_schema(parquet_files[0].schema.to_arrow_schema()) + features = content_parquet_and_info["dataset_info"].get("features", []) # look for string columns using the first rows - string_columns = [column for column, feature in features.items() if STRING_FEATURE_DTYPE in str(feature)] - + string_columns = [ + column + for column, feature in features.items() + if "dtype" in feature + and "_type" in feature + and feature["dtype"] == STRING_FEATURE_DTYPE + and feature["_type"] == VALUE_FEATURE_TYPE + ] if not string_columns: raise NoIndexableColumnsError("No string columns available to index.") - # look for image, audio and binary columns, if present, raise exception (not supported yet) - if any( - feature - for feature in features.values() - if next( - (feature_type for feature_type in UNSUPPORTED_FEATURES_MAGIC_STRINGS if feature_type in str(feature)), None - ) - is not None - ): - raise UnsupportedIndexableColumnsError("Unsupported feature types for indexing.") - # configure duckdb extensions duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="httpfs")) @@ -172,7 +162,7 @@ def compute_index_rows( parent_commit=target_dataset_info.sha, ) - # call the API again to get the list of parquet files + # call the API again to get the index file target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) repo_files = [ repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location @@ -187,6 +177,7 @@ def compute_index_rows( repo_file = repo_files[0] if repo_file.size is None: raise ValueError(f"Cannot get size of {repo_file.rfilename}") + return SplitHubFile( dataset=dataset, config=config, diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index 357b91e0ed..fe6413489d 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -157,11 +157,4 @@ def datasets() -> Mapping[str, Dataset]: dtype=pd.StringDtype(storage="python"), ) ), - "text_image": other( - { - "col": str(Path(__file__).resolve().parent / "data" / "test_image_rgb.jpg"), - "text": "This is a text", - }, - {"col": Image(), "text": Value(dtype="string")}, - ), } diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index 4ab34e7277..bcb7f03840 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -281,13 +281,6 @@ def hub_public_duckdb_index(datasets: Mapping[str, Dataset]) -> Iterator[str]: delete_hub_dataset_repo(repo_id=repo_id) -@pytest.fixture(scope="session") -def hub_public_text_image(datasets: Mapping[str, Dataset]) -> Iterator[str]: - 
repo_id = create_hub_dataset_repo(prefix="text_image", dataset=datasets["text_image"]) - yield repo_id - delete_hub_dataset_repo(repo_id=repo_id) - - class HubDatasetTest(TypedDict): name: str config_names_response: Any @@ -623,7 +616,6 @@ def hub_datasets( hub_public_external_files: str, hub_public_spawning_opt_in_out: str, hub_public_duckdb_index: str, - hub_public_text_image: str, ) -> HubDatasets: return { "does_not_exist": { @@ -759,13 +751,4 @@ def hub_datasets( dataset=hub_public_duckdb_index, data_type="csv" ), }, - "text_image": { - "name": hub_public_text_image, - "config_names_response": create_config_names_response(hub_public_text_image), - "splits_response": create_splits_response(hub_public_text_image), - "first_rows_response": create_first_rows_response( - hub_public_text_image, TEXT_IMAGE_cols, get_TEXT_IMAGE_rows(hub_public_text_image) - ), - "parquet_and_info_response": None, - }, } diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 01d862140b..b6053d7ea4 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -121,8 +121,7 @@ def _get_job_runner( "hub_dataset_name,expected_error_code", [ ("duckdb_index", None), - ("text_image", "UnsupportedIndexableColumnsError"), - ("public", "NoIndexableColumnsError"), + ("public", "NoIndexableColumnsError"), # dataset does not have string columns to index ], ) def test_compute( @@ -159,7 +158,7 @@ def test_compute( config_parquet = parquet_response.content upsert_response( - "config-parquet", + "config-parquet-and-info", dataset=dataset, config=config, http_status=HTTPStatus.OK, From 39e7ded0272ec2f7ef4bd4bc9f779d3d64a403b5 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 13 Jun 2023 16:51:45 -0400 Subject: [PATCH 23/52] Fix libcommon test --- libs/libcommon/tests/test_processing_graph.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/libcommon/tests/test_processing_graph.py b/libs/libcommon/tests/test_processing_graph.py index 938a86b5c0..1d0933479a 100644 --- a/libs/libcommon/tests/test_processing_graph.py +++ b/libs/libcommon/tests/test_processing_graph.py @@ -83,6 +83,7 @@ def graph() -> ProcessingGraph: "config-parquet", "config-info", "config-size", + "split-duckdb-index", ], ["dataset-config-names"], ["dataset-config-names"], @@ -148,7 +149,7 @@ def graph() -> ProcessingGraph: ), ( "config-parquet", - ["config-parquet-metadata", "split-first-rows-from-parquet", "dataset-parquet", "split-duckdb-index"], + ["config-parquet-metadata", "split-first-rows-from-parquet", "dataset-parquet"], ["config-parquet-and-info"], ["dataset-config-names", "config-parquet-and-info"], ), @@ -296,13 +297,12 @@ def graph() -> ProcessingGraph: ( "split-duckdb-index", [], - ["config-parquet", "config-split-names-from-streaming", "config-split-names-from-info"], + ["config-parquet-and-info", "config-split-names-from-streaming", "config-split-names-from-info"], [ "config-split-names-from-streaming", "config-split-names-from-info", "config-parquet-and-info", "config-info", - "config-parquet", "dataset-config-names", ], ), From e94e1d4f33f637bd6f5cc6d4eccb1e719e54c3bd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 14 Jun 2023 08:46:56 -0400 Subject: [PATCH 24/52] Apply code review suggestions --- chart/env/prod.yaml | 2 ++ chart/templates/_envWorker.tpl | 2 ++ chart/values.yaml | 7 ++++--- libs/libcommon/src/libcommon/config.py | 7 
++++++- libs/libcommon/src/libcommon/exceptions.py | 8 +++++++ .../worker/job_runners/split/duckdb_index.py | 21 +++++++++++++++---- tools/docker-compose-datasets-server.yml | 1 + tools/docker-compose-dev-datasets-server.yml | 1 + 8 files changed, 41 insertions(+), 8 deletions(-) diff --git a/chart/env/prod.yaml b/chart/env/prod.yaml index 163a431de8..90f5b3ef48 100644 --- a/chart/env/prod.yaml +++ b/chart/env/prod.yaml @@ -97,6 +97,8 @@ optInOutUrlsScan: rowsMaxNumber: 100_000 urlsNumberPerBatch: 1000 +duckDBIndex: + maxParquetSizeBytes: "5_000_000_000" # --- jobs (pre-install/upgrade hooks) --- diff --git a/chart/templates/_envWorker.tpl b/chart/templates/_envWorker.tpl index b3add1de06..ba7d58eb7f 100644 --- a/chart/templates/_envWorker.tpl +++ b/chart/templates/_envWorker.tpl @@ -102,4 +102,6 @@ value: {{ .Values.duckDBIndex.targetRevision | quote }} - name: DUCKDB_INDEX_URL_TEMPLATE value: {{ .Values.duckDBIndex.urlTemplate | quote }} +- name: DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES + value: {{ .Values.duckDBIndex.maxParquetSizeBytes | quote }} {{- end -}} diff --git a/chart/values.yaml b/chart/values.yaml index f45ecbdf87..d23748aa32 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -210,11 +210,12 @@ duckDBIndex: storageDirectory: "/duckdb-index" # the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`. commitMessage: "Update duckdb index files" - # the git revision of the dataset where to store the duckdb index file. Defaults to `duckdb/index`. - targetRevision: "duckdb/index" + # the git revision of the dataset where to store the duckdb index file. Defaults to `refs/convert/parquet`. + targetRevision: "refs/convert/parquet" # the URL template to build the duckdb index file URL. Defaults to `/datasets/%s/resolve/%s/%s`. urlTemplate: "/datasets/%s/resolve/%s/%s" - + # the maximum size of the split parquets. 
+ maxParquetSizeBytes: "100_000_000" # Directory where the cache data will be stored cacheDirectory: "/datasets-server-cache" diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 7f6ef16377..141e70e6f4 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -108,7 +108,8 @@ def from_env(cls) -> "ParquetMetadataConfig": DUCKDB_INDEX_STORAGE_DIRECTORY = None DUCKDB_INDEX_COMMIT_MESSAGE = "Update duckdb index file" DUCKDB_INDEX_COMMITTER_HF_TOKEN = None -DUCKDB_INDEX_TARGET_REVISION = "duckdb/index" +DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES = 100_000_000 +DUCKDB_INDEX_TARGET_REVISION = "refs/convert/parquet" DUCKDB_INDEX_URL_TEMPLATE = "/datasets/%s/resolve/%s/%s" @@ -119,6 +120,7 @@ class DuckDbIndexConfig: committer_hf_token: Optional[str] = DUCKDB_INDEX_COMMITTER_HF_TOKEN target_revision: str = DUCKDB_INDEX_TARGET_REVISION url_template: str = DUCKDB_INDEX_URL_TEMPLATE + max_parquet_size_bytes: int = DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES @classmethod def from_env(cls) -> "DuckDbIndexConfig": @@ -130,6 +132,9 @@ def from_env(cls) -> "DuckDbIndexConfig": committer_hf_token=env.str(name="COMMITTER_HF_TOKEN", default=DUCKDB_INDEX_COMMITTER_HF_TOKEN), target_revision=env.str(name="TARGET_REVISION", default=DUCKDB_INDEX_TARGET_REVISION), url_template=env.str(name="URL_TEMPLATE", default=DUCKDB_INDEX_URL_TEMPLATE), + max_parquet_size_bytes=env.int( + name="MAX_PARQUET_SIZE_BYTES", default=DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES + ), ) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 64e77be5a3..e241d8f3a7 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -111,6 +111,7 @@ def as_response(self) -> ErrorResponse: "SplitsNamesError", "SplitNamesFromStreamingError", "SplitNotFoundError", + "SplitWithTooBigParquetError", "StreamingRowsError", "TooBigContentError", "TooManyColumnsError", @@ -495,3 +496,10 @@ class NotAvailableIndexFileError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NotAvailableIndexFileError", cause, False) + + +class SplitWithTooBigParquetError(CacheableError): + """Raised when the split parquet size (sum of parquet sizes given) is too big.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index cc1c320998..0ef42c2c65 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -22,6 +22,7 @@ ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, + SplitWithTooBigParquetError, ) from libcommon.processing_graph import ProcessingStep from libcommon.simple_cache import get_previous_step_or_raise @@ -35,7 +36,7 @@ DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" -DUCKDB_DEFAULT_INDEX_FILENAME = "index.db" +DUCKDB_DEFAULT_INDEX_FILENAME = "duckdb_index.db" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS 
__id, * FROM" @@ -54,6 +55,7 @@ def compute_index_rows( commit_message: str, url_template: str, hf_token: Optional[str], + max_parquet_size_bytes: int, committer_hf_token: Optional[str], ) -> SplitHubFile: logging.info(f"get split-duckdb-index for dataset={dataset} config={config} split={split}") @@ -88,12 +90,22 @@ def compute_index_rows( f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'dataset_info'" ) - parquet_urls = [ - parquet_file["url"] + split_parquet_files = [ + parquet_file for parquet_file in content_parquet_and_info["parquet_files"] if parquet_file["config"] == config and parquet_file["split"] == split ] + split_parquets_size = sum(parquet_file["size"] for parquet_file in split_parquet_files) + + if split_parquets_size > max_parquet_size_bytes: + raise SplitWithTooBigParquetError( + f"The indexing is limited to split parquets under {max_parquet_size_bytes} bytes. " + f"Current size of sum of split parquets is {split_parquets_size} bytes." + ) + + parquet_urls = [parquet_file["url"] for parquet_file in split_parquet_files] + if not parquet_urls: raise ParquetResponseEmptyError("No parquet files found.") @@ -131,7 +143,7 @@ def compute_index_rows( # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) - index_file_location = f"{config}/{dataset}-{split}.db" + index_file_location = f"{config}/{split}/{DUCKDB_DEFAULT_INDEX_FILENAME}" try: refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) if all(ref.ref != target_revision for ref in refs.converts): @@ -233,5 +245,6 @@ def compute(self) -> CompleteJobResult: committer_hf_token=self.duckdb_index_config.committer_hf_token, hf_endpoint=self.app_config.common.hf_endpoint, target_revision=self.duckdb_index_config.target_revision, + max_parquet_size_bytes=self.duckdb_index_config.max_parquet_size_bytes, ) ) diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 5e85f4de62..a5562832ca 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -117,6 +117,7 @@ services: DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} + DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES: ${DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES-100_000_000} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index 59e2b90195..4f1c5c393a 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -121,6 +121,7 @@ services: DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} + DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES: ${DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES-100_000_000} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} # ^ note: the datasets cache is automatically added, so no need to add it here 
OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} From e28142fd575f09c59201b46c9fea2446f7451980 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 15 Jun 2023 12:32:01 -0400 Subject: [PATCH 25/52] Some details --- chart/env/prod.yaml | 3 --- services/worker/tests/fixtures/hub.py | 20 -------------------- 2 files changed, 23 deletions(-) diff --git a/chart/env/prod.yaml b/chart/env/prod.yaml index 90f5b3ef48..8b98ecbedc 100644 --- a/chart/env/prod.yaml +++ b/chart/env/prod.yaml @@ -97,9 +97,6 @@ optInOutUrlsScan: rowsMaxNumber: 100_000 urlsNumberPerBatch: 1000 -duckDBIndex: - maxParquetSizeBytes: "5_000_000_000" - # --- jobs (pre-install/upgrade hooks) --- mongodbMigration: diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index bcb7f03840..db01d0fb3a 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -514,26 +514,6 @@ def get_IMAGE_rows(dataset: str) -> Any: ] -TEXT_IMAGE_cols = { - "col": {"_type": "Image"}, - "text": {"_type": "Value", "dtype": "string"}, -} - - -def get_TEXT_IMAGE_rows(dataset: str) -> Any: - dataset, config, split = get_default_config_split(dataset) - return [ - { - "col": { - "src": f"http://localhost/assets/{dataset}/--/{config}/{split}/0/col/image.jpg", - "height": 480, - "width": 640, - }, - "text": "This is a text", - } - ] - - IMAGES_LIST_cols = { "col": [{"_type": "Image"}], } From a51d7d32cbb480311b314b6a414189be86dc04aa Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 15 Jun 2023 12:34:10 -0400 Subject: [PATCH 26/52] Fix style --- libs/libcommon/src/libcommon/exceptions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 8fd06b4c17..05ceda6a48 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -504,6 +504,8 @@ class SplitWithTooBigParquetError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) + + class DatasetWithTooManyConfigsError(CacheableError): """Raised when the number of configs of a dataset exceeded the limit.""" From edd120d66d922e98e51455c206aea6156aaf6894 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 16 Jun 2023 09:36:34 -0400 Subject: [PATCH 27/52] Fix test --- services/worker/tests/fixtures/hub.py | 13 +++++++++++++ .../tests/job_runners/split/test_duckdb_index.py | 6 ++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index f35d327ea0..2dd5d02f38 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -764,3 +764,16 @@ def hub_reponses_spawning_opt_in_out(hub_public_spawning_opt_in_out: str) -> Hub ), "parquet_and_info_response": None, } + + +@pytest.fixture +def hub_reponses_duckdb_index(hub_public_duckdb_index: str) -> HubDatasetTest: + return { + "name": hub_public_duckdb_index, + "config_names_response": create_config_names_response(hub_public_duckdb_index), + "splits_response": create_splits_response(hub_public_duckdb_index), + "first_rows_response": create_first_rows_response(hub_public_duckdb_index, TEXT_cols, TEXT_rows), + "parquet_and_info_response": create_parquet_and_info_response( + dataset=hub_public_duckdb_index, data_type="csv" + ), + } diff --git 
a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index b6053d7ea4..1c10f37aca 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -16,7 +16,7 @@ from worker.job_runners.split.duckdb_index import SplitDuckDbIndexJobRunner from worker.resources import LibrariesResource -from ...fixtures.hub import HubDatasets +from ...fixtures.hub import HubDatasetTest GetJobRunner = Callable[[str, str, str, AppConfig], SplitDuckDbIndexJobRunner] @@ -128,10 +128,12 @@ def test_compute( get_parquet_job_runner: GetParquetJobRunner, get_job_runner: GetJobRunner, app_config: AppConfig, - hub_datasets: HubDatasets, + hub_reponses_public: HubDatasetTest, + hub_reponses_duckdb_index: HubDatasetTest, hub_dataset_name: str, expected_error_code: str, ) -> None: + hub_datasets = {"public": hub_reponses_public, "duckdb_index": hub_reponses_duckdb_index} dataset = hub_datasets[hub_dataset_name]["name"] config_names = hub_datasets[hub_dataset_name]["config_names_response"] config = hub_datasets[hub_dataset_name]["config_names_response"]["config_names"][0]["config"] From 059c632a57d9d2bd111165967ad803303be51c94 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Fri, 16 Jun 2023 09:43:47 -0400 Subject: [PATCH 28/52] Apply code review suggestions --- .../job_runners/config/parquet_and_info.py | 2 +- .../worker/job_runners/split/duckdb_index.py | 25 +++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 8e3cfa4a67..84c78a879f 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -73,7 +73,7 @@ from worker.config import AppConfig, ParquetAndInfoConfig from worker.job_runners.config.config_job_runner import ConfigCachedJobRunner -from worker.utils import CompleteJobResult, hf_hub_url +from worker.utils import CompleteJobResult, hf_hub_url, retry class ConfigParquetAndInfoResponse(TypedDict): diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 0ef42c2c65..3e776673fe 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -17,6 +17,7 @@ from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( DatasetNotFoundError, + LockedDatasetTimeoutError, NoIndexableColumnsError, NotAvailableIndexFileError, ParquetResponseEmptyError, @@ -25,6 +26,7 @@ SplitWithTooBigParquetError, ) from libcommon.processing_graph import ProcessingStep +from libcommon.queue import lock from libcommon.simple_cache import get_previous_step_or_raise from libcommon.storage import StrPath from libcommon.utils import JobInfo, SplitHubFile @@ -46,6 +48,7 @@ def compute_index_rows( + job_id: str, dataset: str, config: str, split: str, @@ -165,14 +168,19 @@ def compute_index_rows( CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) ] - committer_hf_api.create_commit( - repo_id=dataset, - repo_type=DATASET_TYPE, - revision=target_revision, - operations=delete_operations + add_operations, - commit_message=commit_message, - parent_commit=target_dataset_info.sha, - ) + try: + sleeps = [1, 
1, 1, 10, 10, 100, 100, 100, 300] + with lock.git_branch(dataset=dataset, branch=target_revision, job_id=job_id, sleeps=sleeps): + committer_hf_api.create_commit( + repo_id=dataset, + repo_type=DATASET_TYPE, + revision=target_revision, + operations=delete_operations + add_operations, + commit_message=commit_message, + parent_commit=target_dataset_info.sha, + ) + except TimeoutError as err: + raise LockedDatasetTimeoutError("the dataset is currently locked, please try again later.") from err # call the API again to get the index file target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) @@ -235,6 +243,7 @@ def get_job_runner_version() -> int: def compute(self) -> CompleteJobResult: return CompleteJobResult( compute_index_rows( + job_id=self.job_info["job_id"], dataset=self.dataset, config=self.config, split=self.split, From 9ecf9233c694ebfc3a9b5ee6c3a28646319469a4 Mon Sep 17 00:00:00 2001 From: Andrea Francis Soria Jimenez Date: Mon, 19 Jun 2023 18:48:37 -0400 Subject: [PATCH 29/52] Update chart/values.yaml Co-authored-by: Sylvain Lesage --- chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/values.yaml b/chart/values.yaml index 15160d4dc1..ccac40bfdf 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -210,7 +210,7 @@ parquetMetadata: storageDirectory: "/parquet-metadata" duckDBIndex: - # Directory on the shared storage (duckdb db files used for datasets indexing) + # Directory on the shared storage (used temporarily to prepare the duckdb indexes before sending to the Hub) storageDirectory: "/duckdb-index" # the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`. commitMessage: "Update duckdb index files" From 874fabd126cc6d3570b543095cb7272f9adc75ab Mon Sep 17 00:00:00 2001 From: Andrea Francis Soria Jimenez Date: Mon, 19 Jun 2023 18:53:49 -0400 Subject: [PATCH 30/52] Apply suggestions from code review Co-authored-by: Sylvain Lesage --- libs/libcommon/src/libcommon/config.py | 2 -- libs/libcommon/src/libcommon/exceptions.py | 2 +- services/worker/src/worker/job_runners/split/duckdb_index.py | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index 141e70e6f4..231ec1f55f 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -357,8 +357,6 @@ class ProcessingGraphConfig: "input_type": "split", "triggered_by": [ "config-split-names-from-info", - "config-split-names-from-streaming", - "config-parquet-and-info", ], "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION, }, diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 6c22581a22..57276b2c5e 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -500,7 +500,7 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) -class NotAvailableIndexFileError(CacheableError): +class DuckDBIndexFileNotFoundError(CacheableError): """Raised when no duckdb index file was found for split.""" def __init__(self, message: str, cause: Optional[BaseException] = None): diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 3e776673fe..c461c0324c 100644 
--- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -38,7 +38,7 @@ DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" VALUE_FEATURE_TYPE = "Value" -DUCKDB_DEFAULT_INDEX_FILENAME = "duckdb_index.db" +DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" @@ -115,7 +115,7 @@ def compute_index_rows( # get the features features = content_parquet_and_info["dataset_info"].get("features", []) - # look for string columns using the first rows + # look for string columns string_columns = [ column for column, feature in features.items() From c36202fd55d61b552ffd58329cfa7a2e3502ca98 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 19 Jun 2023 08:08:41 -0400 Subject: [PATCH 31/52] Apply code review suggestions --- chart/templates/_envDuckDbIndex.tpl | 7 ---- chart/templates/_envWorker.tpl | 2 + chart/templates/worker/_container.tpl | 1 - .../job_runners/config/parquet_and_info.py | 13 ++---- .../worker/job_runners/split/duckdb_index.py | 42 +++++++++---------- services/worker/src/worker/utils.py | 22 +++++++++- 6 files changed, 47 insertions(+), 40 deletions(-) delete mode 100644 chart/templates/_envDuckDbIndex.tpl diff --git a/chart/templates/_envDuckDbIndex.tpl b/chart/templates/_envDuckDbIndex.tpl deleted file mode 100644 index a0a12059bb..0000000000 --- a/chart/templates/_envDuckDbIndex.tpl +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2023 The HuggingFace Authors. - -{{- define "envDuckDBIndex" -}} -- name: DUCKDB_INDEX_STORAGE_DIRECTORY - value: {{ .Values.duckDBIndex.storageDirectory | quote }} -{{- end -}} diff --git a/chart/templates/_envWorker.tpl b/chart/templates/_envWorker.tpl index 7dbf16f1d0..0395558c18 100644 --- a/chart/templates/_envWorker.tpl +++ b/chart/templates/_envWorker.tpl @@ -106,4 +106,6 @@ value: {{ .Values.duckDBIndex.urlTemplate | quote }} - name: DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES value: {{ .Values.duckDBIndex.maxParquetSizeBytes | quote }} +- name: DUCKDB_INDEX_STORAGE_DIRECTORY + value: {{ .Values.duckDBIndex.storageDirectory | quote }} {{- end -}} diff --git a/chart/templates/worker/_container.tpl b/chart/templates/worker/_container.tpl index 1b8e2ddfe4..f9b86817a9 100644 --- a/chart/templates/worker/_container.tpl +++ b/chart/templates/worker/_container.tpl @@ -9,7 +9,6 @@ {{ include "envAssets" . | nindent 2 }} {{ include "envCache" . | nindent 2 }} {{ include "envParquetMetadata" . | nindent 2 }} - {{ include "envDuckDBIndex" . | nindent 2 }} {{ include "envQueue" . | nindent 2 }} {{ include "envCommon" . | nindent 2 }} {{ include "envLog" . 
| nindent 2 }} diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 84c78a879f..c8174138af 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -73,7 +73,7 @@ from worker.config import AppConfig, ParquetAndInfoConfig from worker.job_runners.config.config_job_runner import ConfigCachedJobRunner -from worker.utils import CompleteJobResult, hf_hub_url, retry +from worker.utils import CompleteJobResult, create_branch, hf_hub_url, retry class ConfigParquetAndInfoResponse(TypedDict): @@ -991,14 +991,9 @@ def compute_config_parquet_and_info_response( # create the target revision if we managed to get the parquet files and it does not exist yet # (clone from initial commit to avoid cloning all repo's files) - try: - if all(ref.ref != target_revision for ref in refs.converts): - initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id - committer_hf_api.create_branch( - repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True - ) - except RepositoryNotFoundError as err: - raise DatasetNotFoundError("The dataset does not exist on the Hub (was deleted during job).") from err + create_branch( + dataset=dataset, target_revision=target_revision, refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api + ) try: sleeps = [1, 1, 1, 10, 10, 100, 100, 100, 300] diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index c461c0324c..05aabfb0c5 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -33,7 +33,7 @@ from worker.config import AppConfig from worker.job_runners.split.split_job_runner import SplitCachedJobRunner -from worker.utils import CompleteJobResult, hf_hub_url +from worker.utils import CompleteJobResult, create_branch, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" @@ -149,28 +149,27 @@ def compute_index_rows( index_file_location = f"{config}/{split}/{DUCKDB_DEFAULT_INDEX_FILENAME}" try: refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) - if all(ref.ref != target_revision for ref in refs.converts): - initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id - committer_hf_api.create_branch( - repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True - ) except RepositoryNotFoundError as err: raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err - target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) - all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} - delete_operations: List[CommitOperation] = [] - if index_file_location in all_repo_files: - delete_operations.append(CommitOperationDelete(path_in_repo=index_file_location)) - - # send the files to the target revision - add_operations: List[CommitOperation] = [ - CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) - ] + create_branch( + dataset=dataset, target_revision=target_revision, refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api + ) try: sleeps = [1, 1, 1, 10, 10, 100, 100, 100, 300] with 
lock.git_branch(dataset=dataset, branch=target_revision, job_id=job_id, sleeps=sleeps): + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) + all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} + delete_operations: List[CommitOperation] = [] + if index_file_location in all_repo_files: + delete_operations.append(CommitOperationDelete(path_in_repo=index_file_location)) + + # send the files to the target revision + add_operations: List[CommitOperation] = [ + CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) + ] + committer_hf_api.create_commit( repo_id=dataset, repo_type=DATASET_TYPE, @@ -179,20 +178,19 @@ def compute_index_rows( commit_message=commit_message, parent_commit=target_dataset_info.sha, ) + + # call the API again to get the index file + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) except TimeoutError as err: raise LockedDatasetTimeoutError("the dataset is currently locked, please try again later.") from err - # call the API again to get the index file - target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) repo_files = [ repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location ] - if not repo_files: - raise NotAvailableIndexFileError("No index file was found") - - if len(repo_files) != 1: + if not repo_files or len(repo_files) != 1: logging.warning(f"Found {len(repo_files)} index files, should be only 1") + raise NotAvailableIndexFileError("No index file was found") repo_file = repo_files[0] if repo_file.size is None: diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 46335ff5b2..7bb45547d4 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -33,7 +33,13 @@ ) from datasets.utils.file_utils import get_authentication_headers_for_url from fsspec.implementations.http import HTTPFileSystem -from libcommon.exceptions import NormalRowsError, StreamingRowsError +from huggingface_hub.hf_api import GitRefs, HfApi +from huggingface_hub.utils._errors import RepositoryNotFoundError +from libcommon.exceptions import ( + DatasetNotFoundError, + NormalRowsError, + StreamingRowsError, +) from libcommon.utils import orjson_dumps from pyarrow.parquet import ParquetFile @@ -421,3 +427,17 @@ def hf_hub_url(repo_id: str, filename: str, hf_endpoint: str, revision: str, url def get_parquet_file(url: str, fs: HTTPFileSystem, hf_token: Optional[str]) -> ParquetFile: headers = get_authentication_headers_for_url(url, use_auth_token=hf_token) return ParquetFile(fs.open(url, headers=headers)) + + +DATASET_TYPE = "dataset" + + +def create_branch(dataset: str, target_revision: str, refs: GitRefs, hf_api: HfApi, committer_hf_api: HfApi) -> None: + try: + if all(ref.ref != target_revision for ref in refs.converts): + initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id + committer_hf_api.create_branch( + repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True + ) + except RepositoryNotFoundError as err: + raise DatasetNotFoundError("The dataset does not exist on the Hub (was deleted during job).") from err From 9b82a66b2af08d0bb2024bef3b5b6cdf29b57a83 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 16 Jun 2023 09:10:04 
-0700 Subject: [PATCH 32/52] [docs] Improvements (#1376) * add end-to-end example * apply feedback --- docs/source/_toctree.yml | 2 + docs/source/analyze_data.mdx | 63 +++++++++++++++++++++ docs/source/quick_start.mdx | 103 +++++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+) create mode 100644 docs/source/analyze_data.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 0767f5ab65..b4dbd1298a 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -4,6 +4,8 @@ title: 🤗 Datasets server - local: quick_start title: Quickstart + - local: analyze_data + title: Analyze a dataset on the Hub - title: Guides sections: - local: valid diff --git a/docs/source/analyze_data.mdx b/docs/source/analyze_data.mdx new file mode 100644 index 0000000000..e6c1a67313 --- /dev/null +++ b/docs/source/analyze_data.mdx @@ -0,0 +1,63 @@ +# Analyze a dataset on the Hub + +[[open-in-colab]] + +In the Quickstart, you were introduced to various endpoints for interacting with datasets on the Hub. One of the most useful ones is the `/parquet` endpoint, which allows you to get a dataset stored on the Hub and analyze it. This is a great way to explore the dataset, and get a better understanding of it's contents. + +To demonstrate, this guide will show you an end-to-end example of how to retrieve a dataset from the Hub and do some basic data analysis with the Pandas library. + +## Get a dataset + +The [Hub](https://huggingface.co/datasets) is home to more than 40,000 datasets across a wide variety of tasks, sizes, and languages. For this example, you'll use the [`codeparrot/codecomplex`](https://huggingface.co/datasets/codeparrot/codecomplex) dataset, but feel free to explore and find another dataset that interests you! The dataset contains Java code from programming competitions, and the time complexity of the code is labeled by a group of algorithm experts. + +Let's say you're interested in the average length of the submitted code as it relates to the time complexity. Here's how you can get started. + +Use the `/parquet` endpoint to convert the dataset to a Parquet file and return the URL to it: + +```py +import requests +API_URL = "https://datasets-server.huggingface.co/parquet?dataset=codeparrot/codecomplex" +def query(): + response = requests.get(API_URL) + return response.json() +data = query() +print(data) +{'parquet_files': + [ + {'dataset': 'codeparrot/codecomplex', 'config': 'codeparrot--codecomplex', 'split': 'train', 'url': 'https://huggingface.co/datasets/codeparrot/codecomplex/resolve/refs%2Fconvert%2Fparquet/codeparrot--codecomplex/json-train.parquet', 'filename': 'json-train.parquet', 'size': 4115908} + ], + 'pending': [], 'failed': [] +} +``` + +## Read dataset with Pandas + +With the URL, you can read the Parquet file into a Pandas DataFrame: + +```py +import pandas as pd + +url = "https://huggingface.co/datasets/codeparrot/codecomplex/resolve/refs%2Fconvert%2Fparquet/codeparrot--codecomplex/json-train.parquet" +df = pd.read_parquet(url) +df.head(5) +``` + +| src | complexity | problem | from | +|--------------------------------------------------:|-----------:|--------------------------------:|-----------:| +| import java.io.*;\nimport java.math.BigInteger... | quadratic | 1179_B. Tolik and His Uncle | CODEFORCES | +| import java.util.Scanner;\n \npublic class pil... | linear | 1197_B. Pillars | CODEFORCES | +| import java.io.BufferedReader;\nimport java.io... | linear | 1059_C. 
Sequence Transformation | CODEFORCES | +| import java.util.*;\n\nimport java.io.*;\npubl... | linear | 1011_A. Stages | CODEFORCES | +| import java.io.OutputStream;\nimport java.io.I... | linear | 1190_C. Tokitsukaze and Duel | CODEFORCES | + +## Calculate mean code length by time complexity + +Pandas is a powerful library for data analysis; group the dataset by time complexity, apply a function to calculate the average length of the code snippet, and plot the results: + +```py +df.groupby('complexity')['src'].apply(lambda x: x.str.len().mean()).sort_values(ascending=False).plot.barh(color="orange") +``` + +
+ [figure: horizontal bar chart of mean code length by time complexity]
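For readers who want to reproduce the analysis described in this new guide, here is a minimal end-to-end sketch that strings the steps together. It assumes the `/parquet` endpoint and the `codeparrot/codecomplex` dataset used above, and it prints the aggregate instead of plotting so matplotlib is not required.

```py
# Minimal sketch: fetch the Parquet URL from the /parquet endpoint,
# load it with pandas, and compute mean code length per complexity class.
import requests
import pandas as pd

API_URL = "https://datasets-server.huggingface.co/parquet?dataset=codeparrot/codecomplex"

parquet_files = requests.get(API_URL).json()["parquet_files"]
train_url = next(f["url"] for f in parquet_files if f["split"] == "train")

df = pd.read_parquet(train_url)

mean_length = (
    df.groupby("complexity")["src"]
    .apply(lambda x: x.str.len().mean())
    .sort_values(ascending=False)
)
print(mean_length)
```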
\ No newline at end of file diff --git a/docs/source/quick_start.mdx b/docs/source/quick_start.mdx index d784bb2060..b798b8e8e4 100644 --- a/docs/source/quick_start.mdx +++ b/docs/source/quick_start.mdx @@ -1,5 +1,7 @@ # Quickstart +[[open-in-colab]] + In this quickstart, you'll learn how to use the Datasets Server's REST API to: - Check whether a dataset on the Hub is functional. @@ -87,6 +89,13 @@ curl https://datasets-server.huggingface.co/is-valid?dataset=rotten_tomatoes \ +You'll see the following error if you're trying to access a gated dataset without providing your user token: + +```py +print(data) +{'error': 'The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication.'} +``` + ## Check dataset validity The `/valid` endpoint returns a JSON list of datasets stored on the Hub that load without any errors: @@ -128,6 +137,22 @@ curl https://datasets-server.huggingface.co/valid \ +This returns a list of all the datasets that load without an error: + +```py +print(data) +{ + "valid": [ + "0n1xus/codexglue", + "0n1xus/pytorrent-standalone", + "0x7194633/rupile", + "51la5/keyword-extraction", + ..., + ..., + ] +} +``` + To check whether a specific dataset is valid, for example, [Rotten Tomatoes](https://huggingface.co/datasets/rotten_tomatoes), use the `/is-valid` endpoint instead: @@ -167,6 +192,13 @@ curl https://datasets-server.huggingface.co/is-valid?dataset=rotten_tomatoes \ +This returns whether the `valid` key is `true` or `false`: + +```py +print(data) +{'valid': True} +``` + ## List configurations and splits The `/splits` endpoint returns a JSON list of the splits in a dataset: @@ -208,6 +240,21 @@ curl https://datasets-server.huggingface.co/splits?dataset=rotten_tomatoes \ +This returns the available configuration and splits in the dataset: + +```py +print(data) +{'splits': + [ + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'train'}, + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'validation'}, + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'test'} + ], + 'pending': [], + 'failed': [] +} +``` + ## Preview a dataset The `/first-rows` endpoint returns a JSON list of the first 100 rows of a dataset. It also returns the types of data features ("columns" data types). You should specify the dataset name, configuration name (you can find out the configuration name from the `/splits` endpoint), and split name of the dataset you'd like to preview: @@ -249,6 +296,26 @@ curl https://datasets-server.huggingface.co/first-rows?dataset=rotten_tomatoes&c +This returns the first 100 rows of the dataset: + +```py +print(data) +{'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'train', + 'features': + [ + {'feature_idx': 0, 'name': 'text', 'type': {'dtype': 'string', '_type': 'Value'}}, + {'feature_idx': 1, 'name': 'label', 'type': {'names': ['neg', 'pos'], '_type': 'ClassLabel'}} + ], + 'rows': + [ + {'row_idx': 0, 'row': {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}, 'truncated_cells': []}, + {'row_idx': 1, 'row': {'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . 
tolkien\'s middle-earth .', 'label': 1}, 'truncated_cells': []} + ..., + ..., + ], +} +``` + ## Download slices of a dataset The `/rows` endpoint returns a JSON list of a slice of rows of a dataset at any given location (offset). @@ -294,6 +361,27 @@ curl https://datasets-server.huggingface.co/rows?dataset=rotten_tomatoes&config= You can download slices of 100 rows maximum at a time. +The response looks like: + +```py +print(data) +{'features': + [ + {'feature_idx': 0, 'name': 'text', 'type': {'dtype': 'string', '_type': 'Value'}}, + {'feature_idx': 1, 'name': 'label', 'type': {'names': ['neg', 'pos'], '_type': 'ClassLabel'}}], + 'rows': + [ + {'row_idx': 150, 'row': {'text': 'enormously likable , partly because it is aware of its own grasp of the absurd .', 'label': 1}, 'truncated_cells': []}, + {'row_idx': 151, 'row': {'text': "here's a british flick gleefully unconcerned with plausibility , yet just as determined to entertain you .", 'label': 1}, 'truncated_cells': []}, + {'row_idx': 152, 'row': {'text': "it's an old story , but a lively script , sharp acting and partially animated interludes make just a kiss seem minty fresh .", 'label': 1}, 'truncated_cells': []}, + {'row_idx': 153, 'row': {'text': 'must be seen to be believed .', 'label': 1}, 'truncated_cells': []}, + {'row_idx': 154, 'row': {'text': "ray liotta and jason patric do some of their best work in their underwritten roles , but don't be fooled : nobody deserves any prizes here .", 'label': 1}, 'truncated_cells': []}, + ..., + ..., + ] +} +``` + ## Access Parquet files Datasets Server converts every public dataset on the Hub to the [Parquet](https://parquet.apache.org/) format. The `/parquet` endpoint returns a JSON list of the Parquet URLs for a dataset: @@ -334,3 +422,18 @@ curl https://datasets-server.huggingface.co/parquet?dataset=rotten_tomatoes \ ``` + +This returns a URL to the Parquet file for each split: + +```py +print(data) +{'parquet_files': + [ + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'test', 'url': 'https://huggingface.co/datasets/rotten_tomatoes/resolve/refs%2Fconvert%2Fparquet/default/rotten_tomatoes-test.parquet', 'filename': 'rotten_tomatoes-test.parquet', 'size': 92206}, + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'train', 'url': 'https://huggingface.co/datasets/rotten_tomatoes/resolve/refs%2Fconvert%2Fparquet/default/rotten_tomatoes-train.parquet', 'filename': 'rotten_tomatoes-train.parquet', 'size': 698845}, + {'dataset': 'rotten_tomatoes', 'config': 'default', 'split': 'validation', 'url': 'https://huggingface.co/datasets/rotten_tomatoes/resolve/refs%2Fconvert%2Fparquet/default/rotten_tomatoes-validation.parquet', 'filename': 'rotten_tomatoes-validation.parquet', 'size': 90001} + ], + 'pending': [], + 'failed': [] +} +``` \ No newline at end of file From 33260144a83641c18c511f77274dde2f5073939a Mon Sep 17 00:00:00 2001 From: Bas Krahmer Date: Mon, 19 Jun 2023 10:51:24 +0200 Subject: [PATCH 33/52] Fix closing brackets and GH action link (#1389) --- DEVELOPER_GUIDE.md | 4 ++-- services/reverse-proxy/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index d446e6d012..e2101dc5d9 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -51,7 +51,7 @@ If you use VSCode, it might be useful to use the ["monorepo" workspace](./.vscod ## Architecture -The repository is structured as a monorepo, with Python libraries and applications in [jobs](./jobs)), [libs](./libs) and [services](./services): +The 
repository is structured as a monorepo, with Python libraries and applications in [jobs](./jobs), [libs](./libs) and [services](./services): - [jobs](./jobs) contains the one-time jobs run by Helm before deploying the pods. For now, the only job migrates the databases when needed. - [libs](./libs) contains the Python libraries used by the services and workers. For now, the only library is [libcommon](./libs/libcommon), which contains the common code for the services and workers. @@ -97,7 +97,7 @@ The following environments contain all the modules: reverse proxy, API server, a ## Quality -The CI checks the quality of the code through a [GitHub action](./.github/workflows/quality.yml). To manually format the code of a job, library, service or worker: +The CI checks the quality of the code through a [GitHub action](./.github/workflows/_quality-python.yml). To manually format the code of a job, library, service or worker: ```bash make style diff --git a/services/reverse-proxy/README.md b/services/reverse-proxy/README.md index 2801bf4441..d322fda21c 100644 --- a/services/reverse-proxy/README.md +++ b/services/reverse-proxy/README.md @@ -8,7 +8,7 @@ Note that the template configuration is located in [chart/nginx-templates/](../. The reverse proxy uses nginx: -- it serves the static assets directly (the API also serves them if required, but it's unnecessary to go through starlette for this, and it generates errors in Safari, see [1](https://github.com/encode/starlette/issues/950) and [2](https://developer.apple.com/library/archive/documentation/AppleApplications/Reference/SafariWebContent/CreatingVideoforSafarioniPhone/CreatingVideoforSafarioniPhone.html#//apple_ref/doc/uid/TP40006514-SW6) +- it serves the static assets directly (the API also serves them if required, but it's unnecessary to go through starlette for this, and it generates errors in Safari, see [1](https://github.com/encode/starlette/issues/950) and [2](https://developer.apple.com/library/archive/documentation/AppleApplications/Reference/SafariWebContent/CreatingVideoforSafarioniPhone/CreatingVideoforSafarioniPhone.html#//apple_ref/doc/uid/TP40006514-SW6)) - it serves the OpenAPI specification - it proxies the other requests to the API From 14107376fa21c33fbcc3ec3b9730af1a205850a7 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 19 Jun 2023 10:55:12 +0200 Subject: [PATCH 34/52] Fix typo in erro rmessage (#1391) --- services/worker/src/worker/job_runners/dataset/config_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/src/worker/job_runners/dataset/config_names.py b/services/worker/src/worker/job_runners/dataset/config_names.py index b1045c31a6..a3a803541d 100644 --- a/services/worker/src/worker/job_runners/dataset/config_names.py +++ b/services/worker/src/worker/job_runners/dataset/config_names.py @@ -74,7 +74,7 @@ def compute_config_names_response( number_of_configs = len(config_name_items) if number_of_configs > max_number: raise DatasetWithTooManyConfigsError( - f"The maximun number of configs allowed is {max_number}, dataset has {number_of_configs} configs." + f"The maximum number of configs allowed is {max_number}, dataset has {number_of_configs} configs." 
) return DatasetConfigNamesResponse(config_names=config_name_items) From 1d9574e4f712eae2f8c66c848652e0be83784384 Mon Sep 17 00:00:00 2001 From: Bas Krahmer Date: Mon, 19 Jun 2023 12:39:35 +0200 Subject: [PATCH 35/52] Add docker internal to extra_hosts (#1390) --- tools/docker-compose-dev-base.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/docker-compose-dev-base.yml b/tools/docker-compose-dev-base.yml index 90a1e9cb1a..b9ca8e9ed8 100644 --- a/tools/docker-compose-dev-base.yml +++ b/tools/docker-compose-dev-base.yml @@ -25,6 +25,8 @@ services: WORKER_MAX_LOAD_PCT: ${WORKER_MAX_LOAD_PCT-70} WORKER_MAX_MEMORY_PCT: ${WORKER_MAX_MEMORY_PCT-80} WORKER_SLEEP_SECONDS: ${WORKER_SLEEP_SECONDS-15} + extra_hosts: + - "host.docker.internal:host-gateway" # volumes to local source directory for development volumes: - ../libs/libcommon/src:/src/libs/libcommon/src From 7971b346f3496501752f2a6166d1c4451562c840 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 19 Jun 2023 13:21:39 +0200 Subject: [PATCH 36/52] =?UTF-8?q?fix:=20=F0=9F=90=9B=20support=20bigger=20?= =?UTF-8?q?images=20(#1387)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: 🐛 support bigger images fixes https://github.com/huggingface/datasets-server/issues/1361 * style: 💄 fix style * style: 💄 add types for Pillow --- services/worker/poetry.lock | 14 ++++++++++++-- services/worker/pyproject.toml | 5 +++-- services/worker/src/worker/utils.py | 5 +++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index c7a99d8ea4..cab3feb395 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -4749,8 +4749,6 @@ python-versions = ">=3.8" files = [ {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:db464c88e10e927725997f9b872a21c9d057789d3b7e9a26e4ef1af41d0bcc8c"}, {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:172277c33cb1ae0da19f98c5bcd4946149cfa73c8ea05c6ba18365d58dd3c6f2"}, - {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:9c9b14fbb73ec4cb0f209722a1489020fd8614c92ae22589f2309c48cefdf21f"}, - {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6a54539bd076746f69ae8bef7282f981674fe4dbf59c3a84c4af86ae6bae9d5c"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e3fa53e63672fd71998bbd71cc5478c74dbe5a2d9291d1801c575358c28403c2"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:5499312c21ed3ed47cc6b4cf861896e9564c2c32d8d3c2ef1437c5ca31adfc73"}, {file = "tensorflow_macos-2.12.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:84cb873c90be63efabfecca53fdc48b734a037d0750532b55cb7ce7c343b5cac"}, @@ -5067,6 +5065,18 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2 doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"] test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +[[package]] +name = "types-pillow" +version = "9.5.0.4" +description = "Typing stubs for Pillow" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "types-Pillow-9.5.0.4.tar.gz", hash = 
"sha256:f1b6af47abd151847ee25911ffeba784899bc7dc7f9eba8ca6a5aac522b012ef"}, + {file = "types_Pillow-9.5.0.4-py3-none-any.whl", hash = "sha256:69427d9fa4320ff6e30f00fb9c0dd71185dc0a16de4757774220104759483466"}, +] + [[package]] name = "types-psutil" version = "5.9.5.13" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 3215758fb7..f5ac325bc2 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -20,13 +20,14 @@ kss = "^2.6.0" libcommon = {path = "../../libs/libcommon", develop = true} lm-dataformat = "^0.0.20" lxml = "^4.9.2" +mirakuru = "^2.4.2" nlp = "^0.4.0" nltk = "^3.8.1" numpy = "~1.22.4" openpyxl = "^3.1.1" pdf2image = "^1.16.2" -pyarrow = "^11.0.0" py7zr = "^0.20.4" +pyarrow = "^11.0.0" pydub = "^0.25.1" pypdf2 = "^3.0.1" python = "3.9.15" @@ -44,7 +45,6 @@ transformers = "^4.30.0" trec-car-tools = { path = "vendors/trec-car-tools/python3" } typer = "^0.4.2" wget = "^3.2" -mirakuru = "^2.4.2" duckdb = "^0.8.0" [tool.poetry.group.dev.dependencies] @@ -58,6 +58,7 @@ pip-audit = "^2.5.4" pytest = "^7.2.1" pytest-asyncio = "^0.21.0" pytest-cov = "^2.12.1" +types-pillow = "^9.5.0.4" types-psutil = "^5.9.5" types-requests = "^2.28.11" diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 7bb45547d4..a989f27776 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -23,6 +23,7 @@ ) from urllib.parse import quote +import PIL from datasets import ( Dataset, DatasetInfo, @@ -43,6 +44,9 @@ from libcommon.utils import orjson_dumps from pyarrow.parquet import ParquetFile +MAX_IMAGE_PIXELS = 1_000_000_000 +# ^ see https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS + class JobRunnerInfo(TypedDict): job_type: str @@ -341,6 +345,7 @@ def get_rows( column_names: Optional[List[str]] = None, ) -> RowsContent: download_config = DownloadConfig(delete_extracted=True) + PIL.Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS ds = load_dataset( dataset, name=config, From 431163d0f0ec6cc84a1fcc7e2304b2cc41b8f3ea Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 19 Jun 2023 14:12:18 +0200 Subject: [PATCH 37/52] Rename dev to staging, and use staging mongodb cluster (#1383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: 🤖 remove makefile targets since we use ArgoCD now * feat: 🎸 align dev on prod, and use secret for mongo url * feat: 🎸 rename dev to staging * ci: 🎡 change dev to staging in ci --- .github/workflows/cd.yml | 6 ++--- .github/workflows/chart-pr.yml | 4 +-- chart/Makefile | 38 ---------------------------- chart/README.md | 19 +------------- chart/env/{dev.yaml => staging.yaml} | 29 ++++++++++++--------- 5 files changed, 23 insertions(+), 73 deletions(-) rename chart/env/{dev.yaml => staging.yaml} (91%) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 878e3e3bee..d139460112 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -76,14 +76,14 @@ jobs: - name: Lint chart with default values run: helm lint working-directory: chart - - name: Lint chart with dev values - run: helm lint --values env/dev.yaml + - name: Lint chart with staging values + run: helm lint --values env/staging.yaml working-directory: chart - name: Lint chart with prod values run: helm lint --values env/prod.yaml working-directory: chart - deploy-dev-and-prod: + deploy-staging-and-prod: if: ${{ endsWith(github.ref, '/main') }} runs-on: ubuntu-latest needs: 
[build-and-push-images] diff --git a/.github/workflows/chart-pr.yml b/.github/workflows/chart-pr.yml index cd5789bcdd..93880b2376 100644 --- a/.github/workflows/chart-pr.yml +++ b/.github/workflows/chart-pr.yml @@ -19,8 +19,8 @@ jobs: - name: Lint chart with default values run: helm lint working-directory: chart - - name: Lint chart with dev values - run: helm lint --values env/dev.yaml + - name: Lint chart with staging values + run: helm lint --values env/staging.yaml working-directory: chart - name: Lint chart with prod values run: helm lint --values env/prod.yaml diff --git a/chart/Makefile b/chart/Makefile index 555a89f3f8..57716f53a9 100644 --- a/chart/Makefile +++ b/chart/Makefile @@ -1,45 +1,7 @@ -K8S_NAMESPACE := datasets-server - .PHONY: init init: helm dependency update . -.PHONY: uninstall -uninstall: - helm uninstall $(ENV) -n $(K8S_NAMESPACE) - -.PHONY: diff -diff: - helm diff upgrade --install $(ENV) . --values env/$(ENV).yaml -n $(K8S_NAMESPACE) - -.PHONY: upgrade -upgrade: - helm upgrade --install $(ENV) . --values env/$(ENV).yaml -n $(K8S_NAMESPACE) - -.PHONY: diff-dev -diff-dev: - @make diff ENV=dev - -.PHONY: uninstall-dev -uninstall-dev: - @make uninstall ENV=dev - -.PHONY: upgrade-dev -upgrade-dev: - @make upgrade ENV=dev - -.PHONY: diff-prod -diff-prod: - @make diff ENV=prod - -.PHONY: uninstall-prod -uninstall-prod: - @make uninstall ENV=prod - -.PHONY: upgrade-prod -upgrade-prod: - @make upgrade ENV=prod - .PHONY: quality quality: helm lint diff --git a/chart/README.md b/chart/README.md index 26865e85ea..aa3782102f 100644 --- a/chart/README.md +++ b/chart/README.md @@ -13,21 +13,4 @@ Note that this Helm chart is used to manage the deployment of the `datasets-serv ## Deploy -To deploy to the `hub-ephemeral` Kubernetes cluster, ensure to first: - -- install the tools (aws, kubectl, helm) -- authenticate with AWS -- select the `hub-ephemeral` cluster - -Dry run: - -```shell -make init -make diff-dev -``` - -Deploy: - -```shell -make upgrade-dev -``` +To deploy, go to https://cd.internal.huggingface.tech/applications. diff --git a/chart/env/dev.yaml b/chart/env/staging.yaml similarity index 91% rename from chart/env/dev.yaml rename to chart/env/staging.yaml index b66d3f886e..54263cb876 100644 --- a/chart/env/dev.yaml +++ b/chart/env/staging.yaml @@ -4,12 +4,9 @@ # --- common parameters --- global: huggingface: - imageRegistry: "" - imagePullSecrets: [] - privateHub: - enabled: false ingress: domain: us.dev.moon.huggingface.tech + # ^ the domain contains "dev", not "staging". We don't change for now. 
subdomains: datasetsServer: datasets-server @@ -51,7 +48,7 @@ images: secrets: mongoUrl: - fromSecret: false + fromSecret: true secretName: "mongo-url" value: mongo:// appHfToken: @@ -75,7 +72,7 @@ monitoring: enabled: false mongodb: - enabled: true + enabled: false common: # URL of the HuggingFace Hub @@ -85,6 +82,9 @@ log: # Log level level: "DEBUG" +firstRows: + maxBytes: "200_000" + parquetAndInfo: maxDatasetSize: "500_000_000" @@ -102,6 +102,10 @@ mongodbMigration: cacheMaintenance: action: "skip" # ^ allowed values are {skip,backfill,upgrade} + log: + level: "debug" + backfill: + error_codes_to_retry: "" resources: requests: cpu: 100m @@ -114,14 +118,15 @@ backfill: metricsCollector: action: "collect-metrics" - schedule: "*/5 * * * *" - # every five minutes + schedule: "*/2 * * * *" + # every two minutes nodeSelector: {} resources: requests: - cpu: 0 + cpu: 1 limits: - cpu: 0 + cpu: 1 + memory: "512Mi" tolerations: [] # --- storage admin (to manually inspect the storage, in /data) --- @@ -160,8 +165,8 @@ ingress: annotations: # Link to Route53 - we could set any subdomain to us.dev.moon.huggingface.tech (common zone to the k8s cluster) external-dns.alpha.kubernetes.io/hostname: "datasets-server.us.dev.moon.huggingface.tech" - alb.ingress.kubernetes.io/load-balancer-name: "hub-datasets-server-dev" - alb.ingress.kubernetes.io/tags: "Env=dev,Project=datasets-server,Terraform=true" + alb.ingress.kubernetes.io/load-balancer-name: "hub-datasets-server-staging" + alb.ingress.kubernetes.io/tags: "Env=staging,Project=datasets-server,Terraform=true" alb.ingress.kubernetes.io/healthcheck-path: "/healthcheck" alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80, "HTTPS": 443}]' alb.ingress.kubernetes.io/scheme: "internet-facing" From 80c7b5d60e0bd92e71988a097407357289c95823 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 19 Jun 2023 14:35:59 +0200 Subject: [PATCH 38/52] =?UTF-8?q?feat:=20=F0=9F=8E=B8=2010x=20the=20size?= =?UTF-8?q?=20of=20supported=20images=20(#1392)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/worker/src/worker/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index a989f27776..f5842a08dd 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -44,7 +44,7 @@ from libcommon.utils import orjson_dumps from pyarrow.parquet import ParquetFile -MAX_IMAGE_PIXELS = 1_000_000_000 +MAX_IMAGE_PIXELS = 10_000_000_000 # ^ see https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS From b599b1038585ef0604484ed7aa00222d0fdd6c7b Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 19 Jun 2023 18:58:48 -0400 Subject: [PATCH 39/52] Fix exception --- libs/libcommon/src/libcommon/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 57276b2c5e..6c22581a22 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -500,7 +500,7 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) -class DuckDBIndexFileNotFoundError(CacheableError): +class NotAvailableIndexFileError(CacheableError): """Raised when no duckdb index file was found for split.""" def __init__(self, message: str, cause: 
Optional[BaseException] = None): From 187d7b656ab234dc7681986e53e56d85f39f011a Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 19 Jun 2023 19:08:24 -0400 Subject: [PATCH 40/52] Fix test in libcommon --- chart/env/prod.yaml | 1 + libs/libcommon/tests/test_processing_graph.py | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/chart/env/prod.yaml b/chart/env/prod.yaml index 8b98ecbedc..163a431de8 100644 --- a/chart/env/prod.yaml +++ b/chart/env/prod.yaml @@ -97,6 +97,7 @@ optInOutUrlsScan: rowsMaxNumber: 100_000 urlsNumberPerBatch: 1000 + # --- jobs (pre-install/upgrade hooks) --- mongodbMigration: diff --git a/libs/libcommon/tests/test_processing_graph.py b/libs/libcommon/tests/test_processing_graph.py index 1d0933479a..c75d2104c2 100644 --- a/libs/libcommon/tests/test_processing_graph.py +++ b/libs/libcommon/tests/test_processing_graph.py @@ -83,7 +83,6 @@ def graph() -> ProcessingGraph: "config-parquet", "config-info", "config-size", - "split-duckdb-index", ], ["dataset-config-names"], ["dataset-config-names"], @@ -105,7 +104,6 @@ def graph() -> ProcessingGraph: "split-first-rows-from-streaming", "dataset-split-names", "config-opt-in-out-urls-count", - "split-duckdb-index", ], ["dataset-config-names"], ["dataset-config-names"], @@ -297,9 +295,8 @@ def graph() -> ProcessingGraph: ( "split-duckdb-index", [], - ["config-parquet-and-info", "config-split-names-from-streaming", "config-split-names-from-info"], + ["config-split-names-from-info"], [ - "config-split-names-from-streaming", "config-split-names-from-info", "config-parquet-and-info", "config-info", From 5c9639e2f5876cb39dbaf2d0989bc5f773a03e43 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 20 Jun 2023 16:14:50 -0400 Subject: [PATCH 41/52] Apply some code review suggestions --- libs/libcommon/src/libcommon/exceptions.py | 8 ++ .../worker/job_runners/split/duckdb_index.py | 99 ++++++++++--------- 2 files changed, 58 insertions(+), 49 deletions(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index a711a49300..2030a5aaa0 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -73,6 +73,7 @@ def as_response(self) -> ErrorResponse: CacheableErrorCode = Literal[ + "CachedDirectoryNotInitializedError", "ConfigNamesError", "CreateCommitError", "DatasetInBlockListError", @@ -527,3 +528,10 @@ class DatasetWithTooManyConfigsError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, True) + + +class CachedDirectoryNotInitializedError(CacheableError): + """Raised when the cached directory has not been initialized before job compute.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "CachedDirectoryNotInitializedError", cause, True) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 05aabfb0c5..5571a8911c 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -16,6 +16,7 @@ from libcommon.config import DuckDbIndexConfig from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( + CachedDirectoryNotInitializedError, DatasetNotFoundError, LockedDatasetTimeoutError, 
NoIndexableColumnsError, @@ -32,7 +33,7 @@ from libcommon.utils import JobInfo, SplitHubFile from worker.config import AppConfig -from worker.job_runners.split.split_job_runner import SplitCachedJobRunner +from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache from worker.utils import CompleteJobResult, create_branch, hf_hub_url DATASET_TYPE = "dataset" @@ -40,8 +41,8 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" -CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', '*', overwrite=1);" -CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, * FROM" +CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', {columns}, overwrite=1);" +CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, {columns} FROM" # TODO: What if __id field already exist? INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" @@ -83,49 +84,46 @@ def compute_index_rows( config=config, ) content_parquet_and_info = parquet_and_info_best_response.response["content"] - if "parquet_files" not in content_parquet_and_info: - raise PreviousStepFormatError( - f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'parquet_files'" - ) - - if "dataset_info" not in content_parquet_and_info: - raise PreviousStepFormatError( - f"previous step '{config_parquet_and_info_step} doesn't return expected field: 'dataset_info'" - ) - - split_parquet_files = [ - parquet_file - for parquet_file in content_parquet_and_info["parquet_files"] - if parquet_file["config"] == config and parquet_file["split"] == split - ] - - split_parquets_size = sum(parquet_file["size"] for parquet_file in split_parquet_files) + try: + split_parquet_files = [ + parquet_file + for parquet_file in content_parquet_and_info["parquet_files"] + if parquet_file["config"] == config and parquet_file["split"] == split + ] + + split_parquets_size = sum(parquet_file["size"] for parquet_file in split_parquet_files) + + if split_parquets_size > max_parquet_size_bytes: + raise SplitWithTooBigParquetError( + f"The indexing is limited to split parquets under {max_parquet_size_bytes} bytes. " + f"Current size of sum of split parquets is {split_parquets_size} bytes." + ) - if split_parquets_size > max_parquet_size_bytes: - raise SplitWithTooBigParquetError( - f"The indexing is limited to split parquets under {max_parquet_size_bytes} bytes. " - f"Current size of sum of split parquets is {split_parquets_size} bytes." 
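As an aside on the hunk above, here is a small self-contained sketch of the checks being moved inside the `try` block: filtering the parquet files of the requested split, enforcing a size limit, and keeping only string columns. The payload, the limit, and the plain `ValueError` are made-up stand-ins for the previous step's response and the worker's own exception classes.

```py
# Made-up payload mimicking the shape of the config-parquet-and-info response.
MAX_PARQUET_SIZE_BYTES = 100_000_000
config, split = "default", "train"

content_parquet_and_info = {
    "parquet_files": [
        {"config": "default", "split": "train", "url": "https://example.invalid/0.parquet", "size": 1_000},
        {"config": "default", "split": "test", "url": "https://example.invalid/1.parquet", "size": 2_000},
    ],
    "dataset_info": {
        "features": {
            "text": {"dtype": "string", "_type": "Value"},
            "label": {"dtype": "int64", "_type": "Value"},
        }
    },
}

# keep only the parquet files of the requested config/split and check their total size
split_parquet_files = [
    f
    for f in content_parquet_and_info["parquet_files"]
    if f["config"] == config and f["split"] == split
]
split_parquets_size = sum(f["size"] for f in split_parquet_files)
if split_parquets_size > MAX_PARQUET_SIZE_BYTES:
    # the worker raises SplitWithTooBigParquetError here
    raise ValueError(f"split too big to index: {split_parquets_size} bytes")

# only Value features with dtype "string" are indexable
features = content_parquet_and_info["dataset_info"]["features"]
string_columns = [
    column
    for column, feature in features.items()
    if feature.get("dtype") == "string" and feature.get("_type") == "Value"
]
print(string_columns)  # ['text']
```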
- ) + parquet_urls = [parquet_file["url"] for parquet_file in split_parquet_files] - parquet_urls = [parquet_file["url"] for parquet_file in split_parquet_files] + if not parquet_urls: + raise ParquetResponseEmptyError("No parquet files found.") - if not parquet_urls: - raise ParquetResponseEmptyError("No parquet files found.") + # get the features + features = content_parquet_and_info["dataset_info"]["features"] + column_names = ",".join(list(features.keys())) - # get the features - features = content_parquet_and_info["dataset_info"].get("features", []) + # look for string columns + string_columns = [ + column + for column, feature in features.items() + if "dtype" in feature + and "_type" in feature + and feature["dtype"] == STRING_FEATURE_DTYPE + and feature["_type"] == VALUE_FEATURE_TYPE + ] + if not string_columns: + raise NoIndexableColumnsError("No string columns available to index.") - # look for string columns - string_columns = [ - column - for column, feature in features.items() - if "dtype" in feature - and "_type" in feature - and feature["dtype"] == STRING_FEATURE_DTYPE - and feature["_type"] == VALUE_FEATURE_TYPE - ] - if not string_columns: - raise NoIndexableColumnsError("No string columns available to index.") + except KeyError as e: + raise PreviousStepFormatError( + f"Previous step '{config_parquet_and_info_step}' did not return the expected content.", e + ) from e # configure duckdb extensions duckdb.execute(INSTALL_EXTENSION_COMMAND.format(extension="httpfs")) @@ -134,14 +132,17 @@ def compute_index_rows( duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) # index all columns - db_location = f"{duckdb_index_file_directory}/{DUCKDB_DEFAULT_INDEX_FILENAME}" - con = duckdb.connect(str(db_location)) + if duckdb_index_file_directory is None: + raise CachedDirectoryNotInitializedError("Cache directory has not been initialized.") + db_path = Path(duckdb_index_file_directory.resolve() / DUCKDB_DEFAULT_INDEX_FILENAME) + + con = duckdb.connect(str(db_path.resolve())) con.sql(CREATE_SEQUENCE_COMMAND) - con.sql(f"{CREATE_TABLE_COMMAND} read_parquet({parquet_urls});") + con.sql(f"{CREATE_TABLE_COMMAND.format(columns=column_names)} read_parquet({parquet_urls});") # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more details about 'stemmer' parameter - con.sql(CREATE_INDEX_COMMAND) + con.sql(CREATE_INDEX_COMMAND.format(columns=column_names)) # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) @@ -167,7 +168,7 @@ def compute_index_rows( # send the files to the target revision add_operations: List[CommitOperation] = [ - CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_location) + CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_path) ] committer_hf_api.create_commit( @@ -212,7 +213,7 @@ def compute_index_rows( ) -class SplitDuckDbIndexJobRunner(SplitCachedJobRunner): +class SplitDuckDbIndexJobRunner(SplitJobRunnerWithCache): duckdb_index_config: DuckDbIndexConfig def __init__( @@ -226,7 +227,7 @@ def __init__( job_info=job_info, app_config=app_config, processing_step=processing_step, - hf_datasets_cache=Path(duckdb_index_directory).resolve(), + cache_directory=Path(duckdb_index_directory), ) self.duckdb_index_config = app_config.duckdb_index @@ -245,7 +246,7 @@ def compute(self) -> 
CompleteJobResult: dataset=self.dataset, config=self.config, split=self.split, - duckdb_index_file_directory=self.datasets_cache, + duckdb_index_file_directory=self.cache_subdirectory, hf_token=self.app_config.common.hf_token, url_template=self.duckdb_index_config.url_template, commit_message=self.duckdb_index_config.commit_message, From ce4163a564e865a0a56f97cd2bbb648c3b5e28c9 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 20 Jun 2023 16:27:44 -0400 Subject: [PATCH 42/52] Apply code review suggestions --- libs/libcommon/src/libcommon/storage.py | 2 +- services/worker/src/worker/job_runner_factory.py | 4 ++-- .../src/worker/job_runners/split/duckdb_index.py | 9 ++++----- services/worker/src/worker/main.py | 6 +++--- services/worker/src/worker/start_worker_loop.py | 6 +++--- services/worker/tests/conftest.py | 6 +++--- services/worker/tests/fixtures/hub.py | 2 +- .../tests/job_runners/split/test_duckdb_index.py | 10 +++++----- services/worker/tests/test_executor.py | 4 ++-- services/worker/tests/test_job_runner_factory.py | 4 ++-- 10 files changed, 26 insertions(+), 27 deletions(-) diff --git a/libs/libcommon/src/libcommon/storage.py b/libs/libcommon/src/libcommon/storage.py index 5a8230107e..63d9c10853 100644 --- a/libs/libcommon/src/libcommon/storage.py +++ b/libs/libcommon/src/libcommon/storage.py @@ -82,7 +82,7 @@ def init_parquet_metadata_dir(directory: Optional[StrPath] = None) -> StrPath: return init_dir(directory, appname=PARQUET_METADATA_CACHE_APPNAME) -def init_duckdb_index_dir(directory: Optional[StrPath] = None) -> StrPath: +def init_duckdb_index_cache_dir(directory: Optional[StrPath] = None) -> StrPath: """Initialize the duckdb index directory. If directory is None, it will be set to the default duckdb index location on the machine. diff --git a/services/worker/src/worker/job_runner_factory.py b/services/worker/src/worker/job_runner_factory.py index f04146a292..87c48d9019 100644 --- a/services/worker/src/worker/job_runner_factory.py +++ b/services/worker/src/worker/job_runner_factory.py @@ -74,7 +74,7 @@ class JobRunnerFactory(BaseJobRunnerFactory): hf_datasets_cache: Path assets_directory: StrPath parquet_metadata_directory: StrPath - duckdb_index_directory: StrPath + duckdb_index_cache_directory: StrPath def _create_job_runner(self, job_info: JobInfo) -> JobRunner: job_type = job_info["type"] @@ -222,7 +222,7 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner: job_info=job_info, app_config=self.app_config, processing_step=processing_step, - duckdb_index_directory=self.duckdb_index_directory, + duckdb_index_cache_directory=self.duckdb_index_cache_directory, ) supported_job_types = [ diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 5571a8911c..5ffd6e9e88 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -41,9 +41,8 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" -CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__id', {columns}, overwrite=1);" -CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __id, {columns} FROM" -# TODO: What if __id field already exist? 
+CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__hf_index_id', {columns}, overwrite=1);" +CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __hf_index_id, {columns} FROM" INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" @@ -221,13 +220,13 @@ def __init__( job_info: JobInfo, app_config: AppConfig, processing_step: ProcessingStep, - duckdb_index_directory: StrPath, + duckdb_index_cache_directory: StrPath, ) -> None: super().__init__( job_info=job_info, app_config=app_config, processing_step=processing_step, - cache_directory=Path(duckdb_index_directory), + cache_directory=Path(duckdb_index_cache_directory), ) self.duckdb_index_config = app_config.duckdb_index diff --git a/services/worker/src/worker/main.py b/services/worker/src/worker/main.py index 5a866aa74f..4d207280e5 100644 --- a/services/worker/src/worker/main.py +++ b/services/worker/src/worker/main.py @@ -8,7 +8,7 @@ from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.storage import ( init_assets_dir, - init_duckdb_index_dir, + init_duckdb_index_cache_dir, init_parquet_metadata_dir, ) @@ -31,7 +31,7 @@ # ^ set first to have logs as soon as possible assets_directory = init_assets_dir(directory=app_config.assets.storage_directory) parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory) - duckdb_index_directory = init_duckdb_index_dir(directory=app_config.duckdb_index.storage_directory) + duckdb_index_cache_directory = init_duckdb_index_cache_dir(directory=app_config.duckdb_index.storage_directory) processing_graph = ProcessingGraph(app_config.processing_graph.specification) @@ -59,7 +59,7 @@ hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) worker_executor = WorkerExecutor( app_config=app_config, diff --git a/services/worker/src/worker/start_worker_loop.py b/services/worker/src/worker/start_worker_loop.py index ff498dffa2..d5e69a829c 100644 --- a/services/worker/src/worker/start_worker_loop.py +++ b/services/worker/src/worker/start_worker_loop.py @@ -8,7 +8,7 @@ from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.storage import ( init_assets_dir, - init_duckdb_index_dir, + init_duckdb_index_cache_dir, init_parquet_metadata_dir, ) @@ -30,7 +30,7 @@ # ^ set first to have logs as soon as possible assets_directory = init_assets_dir(directory=app_config.assets.storage_directory) parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory) - duckdb_index_directory = init_duckdb_index_dir(directory=app_config.duckdb_index.storage_directory) + duckdb_index_cache_directory = init_duckdb_index_cache_dir(directory=app_config.duckdb_index.storage_directory) processing_graph = ProcessingGraph(app_config.processing_graph.specification) @@ -58,7 +58,7 @@ hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) loop = Loop( library_cache_paths=libraries_resource.storage_paths, diff --git a/services/worker/tests/conftest.py b/services/worker/tests/conftest.py index 
a3f9cbef54..fab2648725 100644 --- a/services/worker/tests/conftest.py +++ b/services/worker/tests/conftest.py @@ -11,7 +11,7 @@ from libcommon.storage import ( StrPath, init_assets_dir, - init_duckdb_index_dir, + init_duckdb_index_cache_dir, init_parquet_metadata_dir, ) from pytest import MonkeyPatch, fixture @@ -126,8 +126,8 @@ def parquet_metadata_directory(app_config: AppConfig) -> StrPath: @fixture -def duckdb_index_directory(app_config: AppConfig) -> StrPath: - return init_duckdb_index_dir(app_config.duckdb_index.storage_directory) +def duckdb_index_cache_directory(app_config: AppConfig) -> StrPath: + return init_duckdb_index_cache_dir(app_config.duckdb_index.storage_directory) @fixture diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index 2dd5d02f38..e194ff17e6 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -767,7 +767,7 @@ def hub_reponses_spawning_opt_in_out(hub_public_spawning_opt_in_out: str) -> Hub @pytest.fixture -def hub_reponses_duckdb_index(hub_public_duckdb_index: str) -> HubDatasetTest: +def hub_responses_duckdb_index(hub_public_duckdb_index: str) -> HubDatasetTest: return { "name": hub_public_duckdb_index, "config_names_response": create_config_names_response(hub_public_duckdb_index), diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 1c10f37aca..2a8e4b8ad5 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -25,7 +25,7 @@ @pytest.fixture def get_job_runner( - duckdb_index_directory: StrPath, + duckdb_index_cache_directory: StrPath, cache_mongo_resource: CacheMongoResource, queue_mongo_resource: QueueMongoResource, ) -> GetJobRunner: @@ -69,7 +69,7 @@ def _get_job_runner( }, app_config=app_config, processing_step=processing_graph.get_processing_step(processing_step_name), - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) return _get_job_runner @@ -128,12 +128,12 @@ def test_compute( get_parquet_job_runner: GetParquetJobRunner, get_job_runner: GetJobRunner, app_config: AppConfig, - hub_reponses_public: HubDatasetTest, - hub_reponses_duckdb_index: HubDatasetTest, + hub_responses_public: HubDatasetTest, + hub_responses_duckdb_index: HubDatasetTest, hub_dataset_name: str, expected_error_code: str, ) -> None: - hub_datasets = {"public": hub_reponses_public, "duckdb_index": hub_reponses_duckdb_index} + hub_datasets = {"public": hub_responses_public, "duckdb_index": hub_responses_duckdb_index} dataset = hub_datasets[hub_dataset_name]["name"] config_names = hub_datasets[hub_dataset_name]["config_names_response"] config = hub_datasets[hub_dataset_name]["config_names_response"]["config_names"][0]["config"] diff --git a/services/worker/tests/test_executor.py b/services/worker/tests/test_executor.py index 4dc2c47862..1a4dc47687 100644 --- a/services/worker/tests/test_executor.py +++ b/services/worker/tests/test_executor.py @@ -199,7 +199,7 @@ def job_runner_factory( libraries_resource: LibrariesResource, assets_directory: StrPath, parquet_metadata_directory: StrPath, - duckdb_index_directory: StrPath, + duckdb_index_cache_directory: StrPath, ) -> JobRunnerFactory: processing_graph = ProcessingGraph(app_config.processing_graph.specification) return JobRunnerFactory( @@ -208,7 +208,7 @@ def job_runner_factory( 
hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) diff --git a/services/worker/tests/test_job_runner_factory.py b/services/worker/tests/test_job_runner_factory.py index 982c0ae2a5..e10bc8c0f6 100644 --- a/services/worker/tests/test_job_runner_factory.py +++ b/services/worker/tests/test_job_runner_factory.py @@ -39,7 +39,7 @@ def test_create_job_runner( libraries_resource: LibrariesResource, assets_directory: StrPath, parquet_metadata_directory: StrPath, - duckdb_index_directory: StrPath, + duckdb_index_cache_directory: StrPath, job_type: str, expected_job_runner: Optional[str], ) -> None: @@ -49,7 +49,7 @@ def test_create_job_runner( hf_datasets_cache=libraries_resource.hf_datasets_cache, assets_directory=assets_directory, parquet_metadata_directory=parquet_metadata_directory, - duckdb_index_directory=duckdb_index_directory, + duckdb_index_cache_directory=duckdb_index_cache_directory, ) job_info: JobInfo = { "type": job_type, From 9e9e25a8f5bdc113fabf14071172228fe36cfdc9 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 20 Jun 2023 17:25:50 -0400 Subject: [PATCH 43/52] Adding close connection --- .../src/worker/job_runners/split/duckdb_index.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 5ffd6e9e88..7c8fb40846 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -41,7 +41,7 @@ VALUE_FEATURE_TYPE = "Value" DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb" CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 1;" -CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__hf_index_id', {columns}, overwrite=1);" +CREATE_INDEX_COMMAND = "PRAGMA create_fts_index('data', '__hf_index_id', '*', overwrite=1);" CREATE_TABLE_COMMAND = "CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS __hf_index_id, {columns} FROM" INSTALL_EXTENSION_COMMAND = "INSTALL '{extension}';" LOAD_EXTENSION_COMMAND = "LOAD '{extension}';" @@ -136,13 +136,19 @@ def compute_index_rows( db_path = Path(duckdb_index_file_directory.resolve() / DUCKDB_DEFAULT_INDEX_FILENAME) con = duckdb.connect(str(db_path.resolve())) + logging.debug(CREATE_SEQUENCE_COMMAND) con.sql(CREATE_SEQUENCE_COMMAND) - con.sql(f"{CREATE_TABLE_COMMAND.format(columns=column_names)} read_parquet({parquet_urls});") + + create_command_sql = f"{CREATE_TABLE_COMMAND.format(columns=column_names)} read_parquet({parquet_urls});" + logging.debug(create_command_sql) + con.sql(create_command_sql) # TODO: by default, 'porter' stemmer is being used, use a specific one by dataset language in the future # see https://duckdb.org/docs/extensions/full_text_search.html for more details about 'stemmer' parameter - con.sql(CREATE_INDEX_COMMAND.format(columns=column_names)) - + logging.debug(CREATE_INDEX_COMMAND) + con.sql(CREATE_INDEX_COMMAND) + con.close() + # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) @@ -167,7 +173,7 @@ def compute_index_rows( # send the files to the target revision add_operations: 
List[CommitOperation] = [ - CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_path) + CommitOperationAdd(path_in_repo=index_file_location, path_or_fileobj=db_path.resolve()) ] committer_hf_api.create_commit( From b807613552f42d836c8bd71a8b1bee76c9ca8b4a Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 08:13:36 -0400 Subject: [PATCH 44/52] Upgrade duckdb version --- services/worker/poetry.lock | 106 +++++++++++++++++---------------- services/worker/pyproject.toml | 2 +- 2 files changed, 57 insertions(+), 51 deletions(-) diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index cab3feb395..8c61c00451 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -996,59 +996,64 @@ idna = ["idna (>=2.1)"] [[package]] name = "duckdb" -version = "0.8.0" +version = "0.8.1" description = "DuckDB embedded database" category = "main" optional = false python-versions = "*" files = [ - {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6455aee00af30770c20f4a8c5e4347918cf59b578f49ee996a13807b12911871"}, - {file = "duckdb-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8cf0622ae7f86d4ce72791f8928af4357a46824aadf1b6879c7936b3db65344"}, - {file = "duckdb-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6132e8183ca3ae08a593e43c97cb189794077dedd48546e27ce43bd6a51a9c33"}, - {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe29e5343fa2a95f2cde4519a4f4533f4fd551a48d2d9a8ab5220d40ebf53610"}, - {file = "duckdb-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:945165987ca87c097dc0e578dcf47a100cad77e1c29f5dd8443d53ce159dc22e"}, - {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:673c60daf7ada1d9a8518286a6893ec45efabb64602954af5f3d98f42912fda6"}, - {file = "duckdb-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5075fe1ff97ae62331ca5c61e3597e6e9f7682a6fdd418c23ba5c4873ed5cd1"}, - {file = "duckdb-0.8.0-cp310-cp310-win32.whl", hash = "sha256:001f5102f45d3d67f389fa8520046c8f55a99e2c6d43b8e68b38ea93261c5395"}, - {file = "duckdb-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb00800f2e1e865584b13221e0121fce9341bb3a39a93e569d563eaed281f528"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2707096d6df4321044fcde2c9f04da632d11a8be60957fd09d49a42fae71a29"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b27df1b70ae74d2c88efb5ffca8490954fdc678099509a9c4404ca30acc53426"}, - {file = "duckdb-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75a97c800271b52dd0f37696d074c50576dcb4b2750b6115932a98696a268070"}, - {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:804cac261a5e016506a6d67838a65d19b06a237f7949f1704f0e800eb708286a"}, - {file = "duckdb-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6b9abca7fa6713e1d031c18485343b4de99742c7e1b85c10718aa2f31a4e2c6"}, - {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:51aa6d606d49072abcfeb3be209eb559ac94c1b5e70f58ac3adbb94aca9cd69f"}, - {file = "duckdb-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7c8dc769aaf2be0a1c57995ca657e5b92c1c56fc8437edb720ca6cab571adf14"}, - {file = "duckdb-0.8.0-cp311-cp311-win32.whl", hash = "sha256:c4207d18b42387c4a035846d8878eb967070198be8ac26fd77797ce320d1a400"}, - {file = "duckdb-0.8.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:0c392257547c20794c3072fcbca99a49ef0a49974005d755e93893e2b4875267"}, - {file = "duckdb-0.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2832379e122020814dbe869af7b9ddf3c9f21474cf345531145b099c63ffe17e"}, - {file = "duckdb-0.8.0-cp36-cp36m-win32.whl", hash = "sha256:914896526f7caba86b170f2c4f17f11fd06540325deeb0000cb4fb24ec732966"}, - {file = "duckdb-0.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:022ebda86d0e3204cdc206e4af45aa9f0ae0668b34c2c68cf88e08355af4a372"}, - {file = "duckdb-0.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:96a31c0f3f4ccbf0f5b18f94319f37691205d82f80aae48c6fe04860d743eb2c"}, - {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a07c73c6e6a8cf4ce1a634625e0d1b17e5b817242a8a530d26ed84508dfbdc26"}, - {file = "duckdb-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424acbd6e857531b06448d757d7c2557938dbddbff0632092090efbf413b4699"}, - {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c83cfd2a868f1acb0692b9c3fd5ef1d7da8faa1348c6eabf421fbf5d8c2f3eb8"}, - {file = "duckdb-0.8.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5c6f6b2d8db56936f662c649539df81856b5a8cb769a31f9544edf18af2a11ff"}, - {file = "duckdb-0.8.0-cp37-cp37m-win32.whl", hash = "sha256:0bd6376b40a512172eaf4aa816813b1b9d68994292ca436ce626ccd5f77f8184"}, - {file = "duckdb-0.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:931221885bcf1e7dfce2400f11fd048a7beef566b775f1453bb1db89b828e810"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:42e7853d963d68e72403ea208bcf806b0f28c7b44db0aa85ce49bb124d56c133"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcc338399175be3d43366576600aef7d72e82114d415992a7a95aded98a0f3fd"}, - {file = "duckdb-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03dd08a4624d6b581a59f9f9dbfd34902416398d16795ad19f92361cf21fd9b5"}, - {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c7c24ea0c9d8563dbd5ad49ccb54b7a9a3c7b8c2833d35e5d32a08549cacea5"}, - {file = "duckdb-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb58f6505cc0f34b4e976154302d26563d2e5d16b206758daaa04b65e55d9dd8"}, - {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef37ac7880100c4b3f913c8483a29a13f8289313b9a07df019fadfa8e7427544"}, - {file = "duckdb-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2a4f5ee913ca8a6a069c78f8944b9934ffdbc71fd935f9576fdcea2a6f476f1"}, - {file = "duckdb-0.8.0-cp38-cp38-win32.whl", hash = "sha256:73831c6d7aefcb5f4072cd677b9efebecbf6c578946d21710791e10a1fc41b9a"}, - {file = "duckdb-0.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:faa36d2854734364d234f37d7ef4f3d763b73cd6b0f799cbc2a0e3b7e2575450"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50a31ec237ed619e50f9ab79eb0ec5111eb9697d4475da6e0ab22c08495ce26b"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:351abb4cc2d229d043920c4bc2a4c29ca31a79fef7d7ef8f6011cf4331f297bf"}, - {file = "duckdb-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:568550a163aca6a787bef8313e358590254de3f4019025a8d68c3a61253fedc1"}, - {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b82617f0e7f9fc080eda217090d82b42d4fad083bc9f6d58dfda9cecb7e3b29"}, - {file = "duckdb-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d01c9be34d272532b75e8faedda0ff77fa76d1034cde60b8f5768ae85680d6d3"}, - {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8549d6a6bf5f00c012b6916f605416226507e733a3ffc57451682afd6e674d1b"}, - {file = "duckdb-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d145c6d51e55743c3ed1a74cffa109d9e72f82b07e203b436cfa453c925313a"}, - {file = "duckdb-0.8.0-cp39-cp39-win32.whl", hash = "sha256:f8610dfd21e90d7b04e8598b244bf3ad68599fd6ba0daad3428c03cbfd74dced"}, - {file = "duckdb-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:d0f0f104d30418808bafbe9bccdcd238588a07bd246b3cff13842d60bfd8e8ba"}, - {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, + {file = "duckdb-0.8.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:14781d21580ee72aba1f5dcae7734674c9b6c078dd60470a08b2b420d15b996d"}, + {file = "duckdb-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f13bf7ab0e56ddd2014ef762ae4ee5ea4df5a69545ce1191b8d7df8118ba3167"}, + {file = "duckdb-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4032042d8363e55365bbca3faafc6dc336ed2aad088f10ae1a534ebc5bcc181"}, + {file = "duckdb-0.8.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a71bd8f0b0ca77c27fa89b99349ef22599ffefe1e7684ae2e1aa2904a08684"}, + {file = "duckdb-0.8.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24568d6e48f3dbbf4a933109e323507a46b9399ed24c5d4388c4987ddc694fd0"}, + {file = "duckdb-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:297226c0dadaa07f7c5ae7cbdb9adba9567db7b16693dbd1b406b739ce0d7924"}, + {file = "duckdb-0.8.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5792cf777ece2c0591194006b4d3e531f720186102492872cb32ddb9363919cf"}, + {file = "duckdb-0.8.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:12803f9f41582b68921d6b21f95ba7a51e1d8f36832b7d8006186f58c3d1b344"}, + {file = "duckdb-0.8.1-cp310-cp310-win32.whl", hash = "sha256:d0953d5a2355ddc49095e7aef1392b7f59c5be5cec8cdc98b9d9dc1f01e7ce2b"}, + {file = "duckdb-0.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:6e6583c98a7d6637e83bcadfbd86e1f183917ea539f23b6b41178f32f813a5eb"}, + {file = "duckdb-0.8.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fad7ed0d4415f633d955ac24717fa13a500012b600751d4edb050b75fb940c25"}, + {file = "duckdb-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:81ae602f34d38d9c48dd60f94b89f28df3ef346830978441b83c5b4eae131d08"}, + {file = "duckdb-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d75cfe563aaa058d3b4ccaaa371c6271e00e3070df5de72361fd161b2fe6780"}, + {file = "duckdb-0.8.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dbb55e7a3336f2462e5e916fc128c47fe1c03b6208d6bd413ac11ed95132aa0"}, + {file = "duckdb-0.8.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a6df53efd63b6fdf04657385a791a4e3c4fb94bfd5db181c4843e2c46b04fef5"}, + {file = "duckdb-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b188b80b70d1159b17c9baaf541c1799c1ce8b2af4add179a9eed8e2616be96"}, + {file = "duckdb-0.8.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5ad481ee353f31250b45d64b4a104e53b21415577943aa8f84d0af266dc9af85"}, + {file = "duckdb-0.8.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d1d1b1729993611b1892509d21c21628917625cdbe824a61ce891baadf684b32"}, + {file = "duckdb-0.8.1-cp311-cp311-win32.whl", hash = 
"sha256:2d8f9cc301e8455a4f89aa1088b8a2d628f0c1f158d4cf9bc78971ed88d82eea"}, + {file = "duckdb-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:07457a43605223f62d93d2a5a66b3f97731f79bbbe81fdd5b79954306122f612"}, + {file = "duckdb-0.8.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d2c8062c3e978dbcd80d712ca3e307de8a06bd4f343aa457d7dd7294692a3842"}, + {file = "duckdb-0.8.1-cp36-cp36m-win32.whl", hash = "sha256:fad486c65ae944eae2de0d590a0a4fb91a9893df98411d66cab03359f9cba39b"}, + {file = "duckdb-0.8.1-cp36-cp36m-win_amd64.whl", hash = "sha256:86fa4506622c52d2df93089c8e7075f1c4d0ba56f4bf27faebde8725355edf32"}, + {file = "duckdb-0.8.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:60e07a62782f88420046e30cc0e3de842d0901c4fd5b8e4d28b73826ec0c3f5e"}, + {file = "duckdb-0.8.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f18563675977f8cbf03748efee0165b4c8ef64e0cbe48366f78e2914d82138bb"}, + {file = "duckdb-0.8.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16e179443832bea8439ae4dff93cf1e42c545144ead7a4ef5f473e373eea925a"}, + {file = "duckdb-0.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a413d5267cb41a1afe69d30dd6d4842c588256a6fed7554c7e07dad251ede095"}, + {file = "duckdb-0.8.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3784680df59eadd683b0a4c2375d451a64470ca54bd171c01e36951962b1d332"}, + {file = "duckdb-0.8.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:67a1725c2b01f9b53571ecf3f92959b652f60156c1c48fb35798302e39b3c1a2"}, + {file = "duckdb-0.8.1-cp37-cp37m-win32.whl", hash = "sha256:197d37e2588c5ad063e79819054eedb7550d43bf1a557d03ba8f8f67f71acc42"}, + {file = "duckdb-0.8.1-cp37-cp37m-win_amd64.whl", hash = "sha256:3843feb79edf100800f5037c32d5d5a5474fb94b32ace66c707b96605e7c16b2"}, + {file = "duckdb-0.8.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:624c889b0f2d656794757b3cc4fc58030d5e285f5ad2ef9fba1ea34a01dab7fb"}, + {file = "duckdb-0.8.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fcbe3742d77eb5add2d617d487266d825e663270ef90253366137a47eaab9448"}, + {file = "duckdb-0.8.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:47516c9299d09e9dbba097b9fb339b389313c4941da5c54109df01df0f05e78c"}, + {file = "duckdb-0.8.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf1ba718b7522d34399446ebd5d4b9fcac0b56b6ac07bfebf618fd190ec37c1d"}, + {file = "duckdb-0.8.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e36e35d38a9ae798fe8cf6a839e81494d5b634af89f4ec9483f4d0a313fc6bdb"}, + {file = "duckdb-0.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23493313f88ce6e708a512daacad13e83e6d1ea0be204b175df1348f7fc78671"}, + {file = "duckdb-0.8.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1fb9bf0b6f63616c8a4b9a6a32789045e98c108df100e6bac783dc1e36073737"}, + {file = "duckdb-0.8.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:12fc13ecd5eddd28b203b9e3999040d3a7374a8f4b833b04bd26b8c5685c2635"}, + {file = "duckdb-0.8.1-cp38-cp38-win32.whl", hash = "sha256:a12bf4b18306c9cb2c9ba50520317e6cf2de861f121d6f0678505fa83468c627"}, + {file = "duckdb-0.8.1-cp38-cp38-win_amd64.whl", hash = "sha256:e4e809358b9559c00caac4233e0e2014f3f55cd753a31c4bcbbd1b55ad0d35e4"}, + {file = "duckdb-0.8.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7acedfc00d97fbdb8c3d120418c41ef3cb86ef59367f3a9a30dff24470d38680"}, + {file = "duckdb-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:99bfe264059cdc1e318769103f656f98e819cd4e231cd76c1d1a0327f3e5cef8"}, + {file = "duckdb-0.8.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:538b225f361066231bc6cd66c04a5561de3eea56115a5dd773e99e5d47eb1b89"}, + {file = "duckdb-0.8.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae0be3f71a18cd8492d05d0fc1bc67d01d5a9457b04822d025b0fc8ee6efe32e"}, + {file = "duckdb-0.8.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd82ba63b58672e46c8ec60bc9946aa4dd7b77f21c1ba09633d8847ad9eb0d7b"}, + {file = "duckdb-0.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:780a34559aaec8354e83aa4b7b31b3555f1b2cf75728bf5ce11b89a950f5cdd9"}, + {file = "duckdb-0.8.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:01f0d4e9f7103523672bda8d3f77f440b3e0155dd3b2f24997bc0c77f8deb460"}, + {file = "duckdb-0.8.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:31f692decb98c2d57891da27180201d9e93bb470a3051fcf413e8da65bca37a5"}, + {file = "duckdb-0.8.1-cp39-cp39-win32.whl", hash = "sha256:e7fe93449cd309bbc67d1bf6f6392a6118e94a9a4479ab8a80518742e855370a"}, + {file = "duckdb-0.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:81d670bc6807672f038332d9bf587037aabdd741b0810de191984325ed307abd"}, + {file = "duckdb-0.8.1.tar.gz", hash = "sha256:a54d37f4abc2afc4f92314aaa56ecf215a411f40af4bffe1e86bd25e62aceee9"}, ] [[package]] @@ -4455,7 +4460,6 @@ files = [ {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, - {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, @@ -4749,6 +4753,8 @@ python-versions = ">=3.8" files = [ {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:db464c88e10e927725997f9b872a21c9d057789d3b7e9a26e4ef1af41d0bcc8c"}, {file = "tensorflow_macos-2.12.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:172277c33cb1ae0da19f98c5bcd4946149cfa73c8ea05c6ba18365d58dd3c6f2"}, + {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:9c9b14fbb73ec4cb0f209722a1489020fd8614c92ae22589f2309c48cefdf21f"}, + {file = "tensorflow_macos-2.12.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6a54539bd076746f69ae8bef7282f981674fe4dbf59c3a84c4af86ae6bae9d5c"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e3fa53e63672fd71998bbd71cc5478c74dbe5a2d9291d1801c575358c28403c2"}, {file = "tensorflow_macos-2.12.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:5499312c21ed3ed47cc6b4cf861896e9564c2c32d8d3c2ef1437c5ca31adfc73"}, {file = "tensorflow_macos-2.12.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:84cb873c90be63efabfecca53fdc48b734a037d0750532b55cb7ce7c343b5cac"}, @@ 
-5648,4 +5654,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "732285314a1b756206bdba83a83ee9e97635117f5fd9a6fd8d2b92d8f51e6679" +content-hash = "3aa60ce2866418d5594a71e79a63dbd8e2bd3991c079c53bc055a7c584b3f69e" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index f5ac325bc2..1be29673b9 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -45,7 +45,7 @@ transformers = "^4.30.0" trec-car-tools = { path = "vendors/trec-car-tools/python3" } typer = "^0.4.2" wget = "^3.2" -duckdb = "^0.8.0" +duckdb = "^0.8.1" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" From e77b6b47e099f5e56438aff451ad1eecad1b5637 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 08:13:42 -0400 Subject: [PATCH 45/52] Apply code review suggestions --- libs/libcommon/src/libcommon/exceptions.py | 6 +++--- .../src/worker/job_runners/config/parquet_and_info.py | 6 +++++- .../worker/src/worker/job_runners/split/duckdb_index.py | 6 +++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 2030a5aaa0..aa470371ef 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -90,6 +90,7 @@ def as_response(self) -> ErrorResponse: "DatasetWithTooManyConfigsError", "DatasetWithTooManyParquetFilesError", "DisabledViewerError", + "DuckDBIndexFileNotFoundError", "EmptyDatasetError", "ExternalFilesSizeRequestConnectionError", "ExternalFilesSizeRequestError", @@ -105,7 +106,6 @@ def as_response(self) -> ErrorResponse: "MissingSpawningTokenError", "NoIndexableColumnsError", "NormalRowsError", - "NotAvailableIndexFileError", "ParameterMissingError", "ParquetResponseEmptyError", "PreviousStepFormatError", @@ -509,11 +509,11 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) -class NotAvailableIndexFileError(CacheableError): +class DuckDBIndexFileNotFoundError(CacheableError): """Raised when no duckdb index file was found for split.""" def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NotAvailableIndexFileError", cause, False) + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DuckDBIndexFileNotFoundError", cause, False) class SplitWithTooBigParquetError(CacheableError): diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 2c9eff9636..ad5f4200e0 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -1037,7 +1037,11 @@ def compute_config_parquet_and_info_response( repo_id=dataset, repo_type=DATASET_TYPE ) create_branch( - dataset=dataset, target_revision=target_revision, refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api + dataset=dataset, + target_revision=target_revision, + refs=refs, + hf_api=hf_api, + committer_hf_api=committer_hf_api, ) # commit the parquet files diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 7c8fb40846..1810fa7446 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ 
b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -20,7 +20,7 @@ DatasetNotFoundError, LockedDatasetTimeoutError, NoIndexableColumnsError, - NotAvailableIndexFileError, + DuckDBIndexFileNotFoundError, ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, @@ -148,7 +148,7 @@ def compute_index_rows( logging.debug(CREATE_INDEX_COMMAND) con.sql(CREATE_INDEX_COMMAND) con.close() - + # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) @@ -196,7 +196,7 @@ def compute_index_rows( if not repo_files or len(repo_files) != 1: logging.warning(f"Found {len(repo_files)} index files, should be only 1") - raise NotAvailableIndexFileError("No index file was found") + raise DuckDBIndexFileNotFoundError("No index file was found") repo_file = repo_files[0] if repo_file.size is None: From 3005e2eac7b047175085d899f7c7fac09a0a67e7 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 08:25:38 -0400 Subject: [PATCH 46/52] Fix style --- services/worker/src/worker/job_runners/split/duckdb_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 1810fa7446..68b2229535 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -18,9 +18,9 @@ from libcommon.exceptions import ( CachedDirectoryNotInitializedError, DatasetNotFoundError, + DuckDBIndexFileNotFoundError, LockedDatasetTimeoutError, NoIndexableColumnsError, - DuckDBIndexFileNotFoundError, ParquetResponseEmptyError, PreviousStepFormatError, SplitNotFoundError, From 84687e0e7f92dafacea190bfa99d485bc95c5609 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 11:43:45 -0400 Subject: [PATCH 47/52] Adding some test cases --- services/worker/tests/fixtures/datasets.py | 13 ++-- .../job_runners/split/test_duckdb_index.py | 62 +++++++++++++++++-- 2 files changed, 64 insertions(+), 11 deletions(-) diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index fe6413489d..6e987e20e5 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -147,11 +147,14 @@ def datasets() -> Mapping[str, Dataset]: pd.DataFrame( { "text": [ - "foo", - "bar", - "foobar", - "- Hello there !", - "- General Kenobi !", + ( + "Grand Moff Tarkin and Lord Vader are interrupted in their discussion by the buzz of the" + " comlink" + ), + "There goes another one.", + "Vader turns round and round in circles as his ship spins into space.", + "We count thirty Rebel ships, Lord Vader.", + "The wingman spots the pirateship coming at him and warns the Dark Lord", ] }, dtype=pd.StringDtype(storage="python"), diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 2a8e4b8ad5..5004d9f446 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -1,10 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2023 The HuggingFace Authors. 
+import os +from dataclasses import replace from http import HTTPStatus -from typing import Callable +from typing import Callable, Optional +import duckdb import pytest +import requests from libcommon.processing_graph import ProcessingGraph from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import upsert_response @@ -118,10 +122,11 @@ def _get_job_runner( @pytest.mark.parametrize( - "hub_dataset_name,expected_error_code", + "hub_dataset_name,max_parquet_size_bytes,expected_error_code", [ - ("duckdb_index", None), - ("public", "NoIndexableColumnsError"), # dataset does not have string columns to index + ("duckdb_index", None, None), + ("duckdb_index", 1_000, "SplitWithTooBigParquetError"), # parquet size is 2812 + ("public", None, "NoIndexableColumnsError"), # dataset does not have string columns to index ], ) def test_compute( @@ -131,6 +136,7 @@ def test_compute( hub_responses_public: HubDatasetTest, hub_responses_duckdb_index: HubDatasetTest, hub_dataset_name: str, + max_parquet_size_bytes: Optional[int], expected_error_code: str, ) -> None: hub_datasets = {"public": hub_responses_public, "duckdb_index": hub_responses_duckdb_index} @@ -155,10 +161,22 @@ def test_compute( content=splits_response, ) + app_config = ( + app_config + if max_parquet_size_bytes is None + else replace( + app_config, duckdb_index=replace(app_config.duckdb_index, max_parquet_size_bytes=max_parquet_size_bytes) + ) + ) + parquet_job_runner = get_parquet_job_runner(dataset, config, app_config) parquet_response = parquet_job_runner.compute() config_parquet = parquet_response.content + # simulate more than one parquet file to index + extra_parquet_file = config_parquet["parquet_files"][0] + config_parquet["parquet_files"].append(extra_parquet_file) + upsert_response( "config-parquet-and-info", dataset=dataset, @@ -179,6 +197,38 @@ def test_compute( response = job_runner.compute() assert response content = response.content - assert content["url"] is not None - assert content["filename"] is not None + url = content["url"] + file_name = content["filename"] + assert url is not None + assert file_name is not None job_runner.post_compute() + + # download locally duckdb index file + duckdb_file = requests.get(url) + with open(file_name, "wb") as f: + f.write(duckdb_file.content) + + duckdb.execute("INSTALL 'fts';") + duckdb.execute("LOAD 'fts';") + con = duckdb.connect(file_name) + + # validate number of inserted records + record_count = con.sql("SELECT COUNT(*) FROM data;").fetchall() + assert record_count is not None + assert isinstance(record_count, list) + assert record_count[0] == (10,) # dataset has 5 rows but since parquet file was duplicate it is 10 + + # perform a search to validate fts feature + query = "Lord Vader" + result = con.execute( + ( + "SELECT fts_main_data.match_bm25(__hf_index_id, ?) 
AS score, text FROM data WHERE score IS NOT NULL" + " ORDER BY score DESC;" + ), + [query], + ) + rows = result.df() + assert rows is not None + + con.close() + os.remove(file_name) From 021ea34b7efd0bcf8fa577377d8abe917db84cdb Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 22 Jun 2023 11:52:13 -0400 Subject: [PATCH 48/52] Remove duplicate code by merge --- services/worker/src/worker/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index d3e66167ef..5045313f3c 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -47,9 +47,6 @@ MAX_IMAGE_PIXELS = 10_000_000_000 # ^ see https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS -MAX_IMAGE_PIXELS = 10_000_000_000 -# ^ see https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS - class JobRunnerInfo(TypedDict): job_type: str From 80a3c214df043abf7bb683ad5a8fe3776fd9186f Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 26 Jun 2023 07:25:45 -0400 Subject: [PATCH 49/52] Fix imports --- services/worker/src/worker/dtos.py | 14 +++++--------- .../src/worker/job_runners/config/parquet.py | 1 - .../worker/job_runners/config/parquet_and_info.py | 6 +++--- .../worker/job_runners/config/parquet_metadata.py | 1 + .../src/worker/job_runners/dataset/parquet.py | 3 ++- .../src/worker/job_runners/split/duckdb_index.py | 3 ++- .../tests/job_runners/config/test_parquet.py | 6 +----- .../job_runners/config/test_parquet_metadata.py | 1 - .../tests/job_runners/dataset/test_parquet.py | 2 +- 9 files changed, 15 insertions(+), 22 deletions(-) diff --git a/services/worker/src/worker/dtos.py b/services/worker/src/worker/dtos.py index ef4a4cc0b3..5eb630d1b3 100644 --- a/services/worker/src/worker/dtos.py +++ b/services/worker/src/worker/dtos.py @@ -4,6 +4,8 @@ from dataclasses import dataclass, field from typing import Any, Dict, List, Mapping, Optional, TypedDict +from libcommon.utils import SplitHubFile + class JobRunnerInfo(TypedDict): job_type: str @@ -110,14 +112,8 @@ class ConfigInfoResponse(TypedDict): dataset_info: Dict[str, Any] -class ParquetFileItem(SplitItem): - url: str - filename: str - size: int - - class ConfigParquetAndInfoResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] dataset_info: Dict[str, Any] @@ -134,7 +130,7 @@ class ConfigParquetMetadataResponse(TypedDict): class ConfigParquetResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] class ConfigSize(TypedDict): @@ -183,7 +179,7 @@ class DatasetIsValidResponse(TypedDict): class DatasetParquetResponse(TypedDict): - parquet_files: List[ParquetFileItem] + parquet_files: List[SplitHubFile] pending: list[PreviousJob] failed: list[PreviousJob] diff --git a/services/worker/src/worker/job_runners/config/parquet.py b/services/worker/src/worker/job_runners/config/parquet.py index 6bba5ca6c0..572df22cd9 100644 --- a/services/worker/src/worker/job_runners/config/parquet.py +++ b/services/worker/src/worker/job_runners/config/parquet.py @@ -6,7 +6,6 @@ from libcommon.constants import PROCESSING_STEP_CONFIG_PARQUET_VERSION from libcommon.exceptions import PreviousStepFormatError from libcommon.simple_cache import get_previous_step_or_raise -from libcommon.utils import SplitHubFile from worker.dtos import CompleteJobResult, ConfigParquetResponse from worker.job_runners.config.config_job_runner import ConfigJobRunner diff --git 
a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index b90b922095..9d6fa16271 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -7,7 +7,7 @@ from functools import partial from multiprocessing.pool import ThreadPool from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Tuple, TypedDict +from typing import Any, List, Optional, Set, Tuple import datasets import datasets.config @@ -74,9 +74,9 @@ from tqdm.contrib.concurrent import thread_map from worker.config import AppConfig, ParquetAndInfoConfig -from worker.dtos import CompleteJobResult, ConfigParquetAndInfoResponse, ParquetFileItem +from worker.dtos import CompleteJobResult, ConfigParquetAndInfoResponse from worker.job_runners.config.config_job_runner import ConfigJobRunnerWithDatasetsCache -from worker.utils import retry, create_branch +from worker.utils import create_branch, hf_hub_url, retry DATASET_TYPE = "dataset" MAX_FILES_PER_DIRECTORY = 10_000 # hf hub limitation diff --git a/services/worker/src/worker/job_runners/config/parquet_metadata.py b/services/worker/src/worker/job_runners/config/parquet_metadata.py index 002e043ee1..55de2fe3da 100644 --- a/services/worker/src/worker/job_runners/config/parquet_metadata.py +++ b/services/worker/src/worker/job_runners/config/parquet_metadata.py @@ -27,6 +27,7 @@ ParquetFileMetadataItem, ) from worker.job_runners.config.config_job_runner import ConfigJobRunner +from worker.utils import get_parquet_file def compute_parquet_metadata_response( diff --git a/services/worker/src/worker/job_runners/dataset/parquet.py b/services/worker/src/worker/job_runners/dataset/parquet.py index 68663da1bf..1f4f839949 100644 --- a/services/worker/src/worker/job_runners/dataset/parquet.py +++ b/services/worker/src/worker/job_runners/dataset/parquet.py @@ -12,12 +12,13 @@ get_previous_step_or_raise, get_response, ) +from libcommon.utils import SplitHubFile + from worker.dtos import ( ConfigParquetResponse, DatasetParquetResponse, JobResult, PreviousJob, - SplitHubFile, ) from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 68b2229535..6ab5d47eae 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -33,8 +33,9 @@ from libcommon.utils import JobInfo, SplitHubFile from worker.config import AppConfig +from worker.dtos import CompleteJobResult from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache -from worker.utils import CompleteJobResult, create_branch, hf_hub_url +from worker.utils import create_branch, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" diff --git a/services/worker/tests/job_runners/config/test_parquet.py b/services/worker/tests/job_runners/config/test_parquet.py index 09202118ce..e314000ac1 100644 --- a/services/worker/tests/job_runners/config/test_parquet.py +++ b/services/worker/tests/job_runners/config/test_parquet.py @@ -12,11 +12,7 @@ from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig -from worker.dtos import ( - ConfigParquetAndInfoResponse, - ConfigParquetResponse, - SplitHubFile, -) +from worker.dtos import ConfigParquetAndInfoResponse, 
ConfigParquetResponse from worker.job_runners.config.parquet import ConfigParquetJobRunner diff --git a/services/worker/tests/job_runners/config/test_parquet_metadata.py b/services/worker/tests/job_runners/config/test_parquet_metadata.py index c1ba370911..21b9ce01e2 100644 --- a/services/worker/tests/job_runners/config/test_parquet_metadata.py +++ b/services/worker/tests/job_runners/config/test_parquet_metadata.py @@ -22,7 +22,6 @@ from worker.dtos import ( ConfigParquetMetadataResponse, ConfigParquetResponse, - SplitHubFile, ParquetFileMetadataItem, ) from worker.job_runners.config.parquet_metadata import ConfigParquetMetadataJobRunner diff --git a/services/worker/tests/job_runners/dataset/test_parquet.py b/services/worker/tests/job_runners/dataset/test_parquet.py index 53d8343b81..8f63b188b0 100644 --- a/services/worker/tests/job_runners/dataset/test_parquet.py +++ b/services/worker/tests/job_runners/dataset/test_parquet.py @@ -12,7 +12,7 @@ from libcommon.utils import Priority, SplitHubFile from worker.config import AppConfig -from worker.dtos import ConfigParquetResponse, DatasetParquetResponse, SplitHubFile +from worker.dtos import ConfigParquetResponse, DatasetParquetResponse from worker.job_runners.dataset.parquet import DatasetParquetJobRunner from ..utils import UpstreamResponse From b6f3bd991782cf49840d0bd800f3c942966adff8 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 26 Jun 2023 08:32:22 -0400 Subject: [PATCH 50/52] Apply code review suggestions --- libs/libcommon/src/libcommon/exceptions.py | 78 +++++++++---------- services/worker/pyproject.toml | 2 +- .../job_runners/config/parquet_and_info.py | 11 +-- .../worker/job_runners/split/duckdb_index.py | 35 +++++---- services/worker/src/worker/utils.py | 11 ++- .../job_runners/split/test_duckdb_index.py | 2 + 6 files changed, 73 insertions(+), 66 deletions(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index aa470371ef..46e66228de 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -73,7 +73,7 @@ def as_response(self) -> ErrorResponse: CacheableErrorCode = Literal[ - "CachedDirectoryNotInitializedError", + "CacheDirectoryNotInitializedError", "ConfigNamesError", "CreateCommitError", "DatasetInBlockListError", @@ -140,6 +140,13 @@ def __init__( ) +class CacheDirectoryNotInitializedError(CacheableError): + """Raised when the cache directory has not been initialized before job compute.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "CacheDirectoryNotInitializedError", cause, True) + + class ConfigNamesError(CacheableError): """Raised when the config names could not be fetched.""" @@ -236,6 +243,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooBigExternalFilesError", cause, True) +class DatasetWithTooManyConfigsError(CacheableError): + """Raised when the number of configs of a dataset exceeded the limit.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, True) + + class DatasetWithTooManyExternalFilesError(CacheableError): """Raised when the number of external data files of a dataset is too big.""" @@ -250,11 +264,11 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): 
super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyParquetFilesError", cause, True) -class LockedDatasetTimeoutError(CacheableError): - """Raised when a dataset is locked by another job.""" +class DuckDBIndexFileNotFoundError(CacheableError): + """Raised when no duckdb index file was found for split.""" def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "LockedDatasetTimeoutError", cause, True) + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DuckDBIndexFileNotFoundError", cause, False) class DisabledViewerError(CacheableError): @@ -359,6 +373,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): ) +class LockedDatasetTimeoutError(CacheableError): + """Raised when a dataset is locked by another job.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "LockedDatasetTimeoutError", cause, True) + + class MissingSpawningTokenError(CacheableError): """Raised when the spawning.ai token is not set.""" @@ -373,6 +394,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NormalRowsError", cause, True) +class NoIndexableColumnsError(CacheableError): + """Raised when split does not have string columns to index.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) + + class ParameterMissingError(CacheableError): """Raised when request is missing some parameter.""" @@ -454,6 +482,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): ) +class SplitWithTooBigParquetError(CacheableError): + """Raised when the split parquet size (sum of parquet sizes given) is too big.""" + + def __init__(self, message: str, cause: Optional[BaseException] = None): + super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) + + class StreamingRowsError(CacheableError): """Raised when the rows could not be fetched in streaming mode.""" @@ -500,38 +535,3 @@ class UnsupportedExternalFilesError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, True) - - -class NoIndexableColumnsError(CacheableError): - """Raised when split does not have string columns to index.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) - - -class DuckDBIndexFileNotFoundError(CacheableError): - """Raised when no duckdb index file was found for split.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DuckDBIndexFileNotFoundError", cause, False) - - -class SplitWithTooBigParquetError(CacheableError): - """Raised when the split parquet size (sum of parquet sizes given) is too big.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) - - -class DatasetWithTooManyConfigsError(CacheableError): - """Raised when the number of configs of a dataset exceeded the limit.""" - - def 
__init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, True) - - -class CachedDirectoryNotInitializedError(CacheableError): - """Raised when the cached directory has not been initialized before job compute.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "CachedDirectoryNotInitializedError", cause, True) diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 1be29673b9..a2e6034dfc 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -12,6 +12,7 @@ aiohttp = "^3.8.4" aiolimiter = "^1.0.0" bs4 = "^0.0.1" conllu = "^4.5.2" +duckdb = "^0.8.1" environs = "^9.5.0" gdown = "^4.6.3" huggingface-hub = { git = "https://github.com/huggingface/huggingface_hub", rev = "1055a56b2d2723b55ba4fdf1f3296e04cfd8d6db" } @@ -45,7 +46,6 @@ transformers = "^4.30.0" trec-car-tools = { path = "vendors/trec-car-tools/python3" } typer = "^0.4.2" wget = "^3.2" -duckdb = "^0.8.1" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 9d6fa16271..77a598abcd 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -76,7 +76,7 @@ from worker.config import AppConfig, ParquetAndInfoConfig from worker.dtos import CompleteJobResult, ConfigParquetAndInfoResponse from worker.job_runners.config.config_job_runner import ConfigJobRunnerWithDatasetsCache -from worker.utils import create_branch, hf_hub_url, retry +from worker.utils import LOCK_GIT_BRANCH_RETRY_SLEEPS, create_branch, hf_hub_url, retry DATASET_TYPE = "dataset" MAX_FILES_PER_DIRECTORY = 10_000 # hf hub limitation @@ -1025,18 +1025,15 @@ def compute_config_parquet_and_info_response( parquet_operations = convert_to_parquet(builder) try: - sleeps = [1, 1, 1, 1, 1, 10, 10, 10, 10, 100] * 3 # ^ timeouts after ~7 minutes - with lock.git_branch(dataset=dataset, branch=target_revision, job_id=job_id, sleeps=sleeps): + with lock.git_branch( + dataset=dataset, branch=target_revision, job_id=job_id, sleeps=LOCK_GIT_BRANCH_RETRY_SLEEPS + ): # create the target revision if we managed to get the parquet files and it does not exist yet # (clone from initial commit to avoid cloning all repo's files) - refs = retry(on=[requests.exceptions.ConnectionError], sleeps=[1, 1, 1, 10, 10])(hf_api.list_repo_refs)( - repo_id=dataset, repo_type=DATASET_TYPE - ) create_branch( dataset=dataset, target_revision=target_revision, - refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api, ) diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 6ab5d47eae..32abddf675 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -16,7 +16,7 @@ from libcommon.config import DuckDbIndexConfig from libcommon.constants import PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION from libcommon.exceptions import ( - CachedDirectoryNotInitializedError, + CacheDirectoryNotInitializedError, DatasetNotFoundError, DuckDBIndexFileNotFoundError, LockedDatasetTimeoutError, @@ -35,7 +35,7 @@ from worker.config import AppConfig from worker.dtos import 
CompleteJobResult from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache -from worker.utils import create_branch, hf_hub_url +from worker.utils import LOCK_GIT_BRANCH_RETRY_SLEEPS, create_branch, hf_hub_url DATASET_TYPE = "dataset" STRING_FEATURE_DTYPE = "string" @@ -53,7 +53,7 @@ def compute_index_rows( dataset: str, config: str, split: str, - duckdb_index_file_directory: Optional[Path], + duckdb_index_file_directory: Path, target_revision: str, hf_endpoint: str, commit_message: str, @@ -132,9 +132,7 @@ def compute_index_rows( duckdb.execute(LOAD_EXTENSION_COMMAND.format(extension="fts")) # index all columns - if duckdb_index_file_directory is None: - raise CachedDirectoryNotInitializedError("Cache directory has not been initialized.") - db_path = Path(duckdb_index_file_directory.resolve() / DUCKDB_DEFAULT_INDEX_FILENAME) + db_path = duckdb_index_file_directory.resolve() / DUCKDB_DEFAULT_INDEX_FILENAME con = duckdb.connect(str(db_path.resolve())) logging.debug(CREATE_SEQUENCE_COMMAND) @@ -150,22 +148,21 @@ def compute_index_rows( con.sql(CREATE_INDEX_COMMAND) con.close() - # create the target revision if it does not exist yet (clone from initial commit to avoid cloning all repo's files) hf_api = HfApi(endpoint=hf_endpoint, token=hf_token) committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token) index_file_location = f"{config}/{split}/{DUCKDB_DEFAULT_INDEX_FILENAME}" - try: - refs = hf_api.list_repo_refs(repo_id=dataset, repo_type=DATASET_TYPE) - except RepositoryNotFoundError as err: - raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err - - create_branch( - dataset=dataset, target_revision=target_revision, refs=refs, hf_api=hf_api, committer_hf_api=committer_hf_api - ) try: - sleeps = [1, 1, 1, 10, 10, 100, 100, 100, 300] - with lock.git_branch(dataset=dataset, branch=target_revision, job_id=job_id, sleeps=sleeps): + with lock.git_branch( + dataset=dataset, branch=target_revision, job_id=job_id, sleeps=LOCK_GIT_BRANCH_RETRY_SLEEPS + ): + create_branch( + dataset=dataset, + target_revision=target_revision, + hf_api=hf_api, + committer_hf_api=committer_hf_api, + ) + target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False) all_repo_files: Set[str] = {f.rfilename for f in target_dataset_info.siblings} delete_operations: List[CommitOperation] = [] @@ -190,6 +187,8 @@ def compute_index_rows( target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=True) except TimeoutError as err: raise LockedDatasetTimeoutError("the dataset is currently locked, please try again later.") from err + except RepositoryNotFoundError as err: + raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err repo_files = [ repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location @@ -246,6 +245,8 @@ def get_job_runner_version() -> int: return PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION def compute(self) -> CompleteJobResult: + if self.cache_subdirectory is None: + raise CacheDirectoryNotInitializedError("Cache directory has not been initialized.") return CompleteJobResult( compute_index_rows( job_id=self.job_info["job_id"], diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py index 84bb009ee9..c3b985d8b9 100644 --- a/services/worker/src/worker/utils.py +++ b/services/worker/src/worker/utils.py @@ -21,6 +21,7 @@ from urllib.parse import quote import PIL +import requests 
from datasets import ( Dataset, DatasetInfo, @@ -31,7 +32,7 @@ ) from datasets.utils.file_utils import get_authentication_headers_for_url from fsspec.implementations.http import HTTPFileSystem -from huggingface_hub.hf_api import GitRefs, HfApi +from huggingface_hub.hf_api import HfApi from huggingface_hub.utils._errors import RepositoryNotFoundError from libcommon.exceptions import ( DatasetNotFoundError, @@ -335,9 +336,15 @@ def get_parquet_file(url: str, fs: HTTPFileSystem, hf_token: Optional[str]) -> P DATASET_TYPE = "dataset" +LIST_REPO_REFS_RETRY_SLEEPS = [1, 1, 1, 10, 10] +LOCK_GIT_BRANCH_RETRY_SLEEPS = [1, 1, 1, 1, 1, 10, 10, 10, 10, 100] * 3 -def create_branch(dataset: str, target_revision: str, refs: GitRefs, hf_api: HfApi, committer_hf_api: HfApi) -> None: + +def create_branch(dataset: str, target_revision: str, hf_api: HfApi, committer_hf_api: HfApi) -> None: try: + refs = retry(on=[requests.exceptions.ConnectionError], sleeps=LIST_REPO_REFS_RETRY_SLEEPS)( + hf_api.list_repo_refs + )(repo_id=dataset, repo_type=DATASET_TYPE) if all(ref.ref != target_revision for ref in refs.converts): initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id committer_hf_api.create_branch( diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 5004d9f446..94263f4241 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -187,6 +187,7 @@ def test_compute( assert parquet_response job_runner = get_job_runner(dataset, config, split, app_config) + job_runner.pre_compute() if expected_error_code: with pytest.raises(Exception) as e: @@ -232,3 +233,4 @@ def test_compute( con.close() os.remove(file_name) + job_runner.post_compute() \ No newline at end of file From 550f1183c691572429546cf5ec85f2295a60791a Mon Sep 17 00:00:00 2001 From: Andrea Francis Soria Jimenez Date: Mon, 26 Jun 2023 08:33:13 -0400 Subject: [PATCH 51/52] Apply suggestions from code review Co-authored-by: Sylvain Lesage --- tools/docker-compose-datasets-server.yml | 2 +- tools/docker-compose-dev-datasets-server.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 19ce53754e..37b1c87d1f 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -116,7 +116,7 @@ services: DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} DUCKDB_INDEX_COMMIT_MESSAGE: ${DUCKDB_INDEX_COMMIT_MESSAGE-Update duckdb index file} DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} - DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} + DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-refs/convert/parquet} DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES: ${DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES-100_000_000} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index aa74c5d5f4..233e90f253 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -120,7 +120,7 @@ services: DUCKDB_INDEX_STORAGE_DIRECTORY: ${DUCKDB_INDEX_STORAGE_DIRECTORY-/duckdb-index} DUCKDB_INDEX_COMMIT_MESSAGE: 
${DUCKDB_INDEX_COMMIT_MESSAGE-Update duckdb index files} DUCKDB_INDEX_COMMITTER_HF_TOKEN: ${DUCKDB_INDEX_COMMITTER_HF_TOKEN-} - DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-duckdb/index} + DUCKDB_INDEX_TARGET_REVISION: ${DUCKDB_INDEX_TARGET_REVISION-refs/convert/parquet} DUCKDB_INDEX_URL_TEMPLATE: ${DUCKDB_INDEX_URL_TEMPLATE-/datasets/%s/resolve/%s/%s} DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES: ${DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES-100_000_000} WORKER_STORAGE_PATHS: ${ASSETS_STORAGE_DIRECTORY-/assets} From 930f6c0d12f04d12cc3dc353148f0551dce41ca2 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 26 Jun 2023 12:03:24 -0400 Subject: [PATCH 52/52] Add test --- .../tests/job_runners/split/test_duckdb_index.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/services/worker/tests/job_runners/split/test_duckdb_index.py b/services/worker/tests/job_runners/split/test_duckdb_index.py index 94263f4241..a8fe40cfc3 100644 --- a/services/worker/tests/job_runners/split/test_duckdb_index.py +++ b/services/worker/tests/job_runners/split/test_duckdb_index.py @@ -222,15 +222,21 @@ def test_compute( # perform a search to validate fts feature query = "Lord Vader" result = con.execute( - ( - "SELECT fts_main_data.match_bm25(__hf_index_id, ?) AS score, text FROM data WHERE score IS NOT NULL" - " ORDER BY score DESC;" - ), + "SELECT text FROM data WHERE fts_main_data.match_bm25(__hf_index_id, ?) IS NOT NULL;", [query], ) rows = result.df() assert rows is not None + assert (rows["text"].eq("Vader turns round and round in circles as his ship spins into space.")).any() + assert (rows["text"].eq("The wingman spots the pirateship coming at him and warns the Dark Lord")).any() + assert (rows["text"].eq("We count thirty Rebel ships, Lord Vader.")).any() + assert ( + rows["text"].eq( + "Grand Moff Tarkin and Lord Vader are interrupted in their discussion by the buzz of the comlink" + ) + ).any() + assert not (rows["text"].eq("There goes another one.")).any() con.close() os.remove(file_name) - job_runner.post_compute() \ No newline at end of file + job_runner.post_compute()
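Taken together, these patches have the split-duckdb-index job build an index.duckdb file keyed by an __hf_index_id sequence, create a full-text-search index over all columns, and have the test query it through fts_main_data.match_bm25. A minimal sketch of that indexing and query flow, using two hard-coded rows in place of the split's parquet files and skipping the job's Hub branch/commit step, could look like this (the file name and rows here are illustrative only):

import duckdb

# Sketch only: two in-line rows stand in for the rows read from the split's parquet files.
con = duckdb.connect("index.duckdb")  # mirrors DUCKDB_DEFAULT_INDEX_FILENAME
con.execute("INSTALL 'fts';")
con.execute("LOAD 'fts';")

# Same shape of commands the job runner issues: a sequence for the __hf_index_id key,
# a "data" table built from the source rows, then an FTS index over all columns.
con.sql("CREATE OR REPLACE SEQUENCE serial START 1;")
con.sql(
    "CREATE OR REPLACE TABLE data AS "
    "SELECT nextval('serial') AS __hf_index_id, * "
    "FROM (VALUES ('We count thirty Rebel ships, Lord Vader.'), "
    "('There goes another one.')) AS t(text);"
)
con.sql("PRAGMA create_fts_index('data', '__hf_index_id', '*', overwrite=1);")

# Query it the way the updated test does: matching rows have a non-NULL BM25 score.
rows = con.execute(
    "SELECT text FROM data WHERE fts_main_data.match_bm25(__hf_index_id, ?) IS NOT NULL;",
    ["Lord Vader"],
).fetchall()
print(rows)  # only the sentence that mentions Lord Vader should come back
con.close()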