feat: Index the (text) datasets contents to enable full-text search - DuckDB #1296

Merged · 71 commits merged from duckdb-index into main on Jun 27, 2023
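This PR adds a `split-duckdb-index` processing step that builds a DuckDB full-text-search index over a split's parquet files and pushes the resulting index file to the dataset's `refs/convert/parquet` branch on the Hub. As a rough illustration of the technique (a minimal sketch, not the PR's actual job-runner code; the file, table, and column names are assumptions), DuckDB's `fts` extension can index all string columns of a table and answer BM25 queries:

```python
# Minimal sketch of DuckDB full-text search (hypothetical names, not PR code).
import duckdb

con = duckdb.connect("index.duckdb")  # a single-file database, easy to upload
con.execute("INSTALL fts;")
con.execute("LOAD fts;")

# Materialize the split with a stable row id to serve as the FTS document key.
con.execute(
    "CREATE OR REPLACE TABLE data AS "
    "SELECT row_number() OVER () AS __id, * FROM 'split.parquet'"
)

# Build the index over all columns ('*'), keyed by __id.
con.execute("PRAGMA create_fts_index('data', '__id', '*')")

# BM25 full-text search: rows that do not match get a NULL score.
rows = con.execute(
    """
    SELECT * FROM (
        SELECT *, fts_main_data.match_bm25(__id, 'some words') AS score FROM data
    ) WHERE score IS NOT NULL ORDER BY score DESC LIMIT 10
    """
).fetchall()
con.close()
```

Because the whole index lives in one `.duckdb` file, uploading it to a dedicated branch (see the `DUCKDB_INDEX_TARGET_REVISION` settings below) is enough for a later service to download and query it.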
Commits (changes from all 71 commits)
- 1e41964 Draft files (AndreaFrancis, May 31, 2023)
- f37a829 Adding duckdb index job runner (AndreaFrancis, Jun 2, 2023)
- 340d85e Fix style (AndreaFrancis, Jun 2, 2023)
- c53af5f WIP adding fts on API (AndreaFrancis, Jun 2, 2023)
- 8cac1c5 Remove non used code (AndreaFrancis, Jun 2, 2023)
- 31387ba Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 2, 2023)
- 23ce3ee Fix style (AndreaFrancis, Jun 2, 2023)
- ac0a2d9 Adding chart objects (AndreaFrancis, Jun 2, 2023)
- dff50cf Rollback dependency in API (AndreaFrancis, Jun 2, 2023)
- 132d4ca Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 6, 2023)
- 4659117 Depend on parquet an split (AndreaFrancis, Jun 6, 2023)
- f0794a8 Fix libcommon test (AndreaFrancis, Jun 6, 2023)
- ddad27a Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 6, 2023)
- 05d3362 Send index file to dedicated branch (AndreaFrancis, Jun 6, 2023)
- cec74e3 Fix test in first parquet (AndreaFrancis, Jun 7, 2023)
- 96587d8 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 9, 2023)
- b02fa17 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 9, 2023)
- 8679ce9 Fix merge hanges (AndreaFrancis, Jun 9, 2023)
- 163928e Fix poetry files (AndreaFrancis, Jun 9, 2023)
- b1238f5 Adding happy path test (AndreaFrancis, Jun 12, 2023)
- 08e784f Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 12, 2023)
- fd298be Adding other test scenarios (AndreaFrancis, Jun 12, 2023)
- 2afe9f3 Adding chart configuration (AndreaFrancis, Jun 12, 2023)
- 0bfcb62 Apply suggestions from code review (AndreaFrancis, Jun 13, 2023)
- 2ff4f91 Change ParquetFileItem to SplitHubFile (AndreaFrancis, Jun 13, 2023)
- 3c9b4ee Inherit from SplitCachedJobRunner (AndreaFrancis, Jun 13, 2023)
- c78e99a Fix style (AndreaFrancis, Jun 13, 2023)
- 6eba4d9 Depends on info featues instead of parquet schema (AndreaFrancis, Jun 13, 2023)
- 39e7ded Fix libcommon test (AndreaFrancis, Jun 13, 2023)
- e94e1d4 Apply code review suggestions (AndreaFrancis, Jun 14, 2023)
- 4daf93d Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 14, 2023)
- aa68660 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 15, 2023)
- e28142f Some details (AndreaFrancis, Jun 15, 2023)
- a51d7d3 Fix style (AndreaFrancis, Jun 15, 2023)
- 129b8c4 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 16, 2023)
- edd120d Fix test (AndreaFrancis, Jun 16, 2023)
- a65e8dd Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 16, 2023)
- 059c632 Apply code review suggestions (AndreaFrancis, Jun 16, 2023)
- 9ecf923 Update chart/values.yaml (AndreaFrancis, Jun 19, 2023)
- 874fabd Apply suggestions from code review (AndreaFrancis, Jun 19, 2023)
- c36202f Apply code review suggestions (AndreaFrancis, Jun 19, 2023)
- 9b82a66 [docs] Improvements (#1376) (stevhliu, Jun 16, 2023)
- 3326014 Fix closing brackets and GH action link (#1389) (baskrahmer, Jun 19, 2023)
- 1410737 Fix typo in erro rmessage (#1391) (albertvillanova, Jun 19, 2023)
- 1d9574e Add docker internal to extra_hosts (#1390) (baskrahmer, Jun 19, 2023)
- 7971b34 fix: 🐛 support bigger images (#1387) (severo, Jun 19, 2023)
- 431163d Rename dev to staging, and use staging mongodb cluster (#1383) (severo, Jun 19, 2023)
- 80c7b5d feat: 🎸 10x the size of supported images (#1392) (severo, Jun 19, 2023)
- b599b10 Fix exception (AndreaFrancis, Jun 19, 2023)
- a1b3d8e Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 19, 2023)
- 187d7b6 Fix test in libcommon (AndreaFrancis, Jun 19, 2023)
- fd01ec6 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 20, 2023)
- ff4a833 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 20, 2023)
- 5c9639e Apply some code review suggestions (AndreaFrancis, Jun 20, 2023)
- ce4163a Apply code review suggestions (AndreaFrancis, Jun 20, 2023)
- 67e801f Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 20, 2023)
- 9e9e25a Adding close connection (AndreaFrancis, Jun 20, 2023)
- 517a479 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 22, 2023)
- b807613 Upgrade duckdb version (AndreaFrancis, Jun 22, 2023)
- e77b6b4 Apply code review suggestions (AndreaFrancis, Jun 22, 2023)
- 3005e2e Fix style (AndreaFrancis, Jun 22, 2023)
- 84687e0 Adding some test cases (AndreaFrancis, Jun 22, 2023)
- 27743d5 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 22, 2023)
- 021ea34 Remove duplicate code by merge (AndreaFrancis, Jun 22, 2023)
- 2d6b21c Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 26, 2023)
- 80a3c21 Fix imports (AndreaFrancis, Jun 26, 2023)
- b6f3bd9 Apply code review suggestions (AndreaFrancis, Jun 26, 2023)
- 550f118 Apply suggestions from code review (AndreaFrancis, Jun 26, 2023)
- 930f6c0 Add test (AndreaFrancis, Jun 26, 2023)
- ecfa8c5 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 26, 2023)
- a92fe90 Merge branch 'main' into duckdb-index (AndreaFrancis, Jun 27, 2023)
Files changed
4 changes: 2 additions & 2 deletions chart/static-files/openapi.json
@@ -925,11 +925,11 @@
       "properties": {
         "parquet_files": {
           "type": "array",
-          "items": { "$ref": "#/components/schemas/ParquetFileItem" }
+          "items": { "$ref": "#/components/schemas/SplitHubFile" }
         }
       }
     },
-    "ParquetFileItem": {
+    "SplitHubFile": {
       "type": "object",
       "required": ["dataset", "config", "split", "url", "filename", "size"],
       "properties": {
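This schema rename tracks the code change further down in this PR: the `ParquetFileItem` TypedDict is dropped from `parquet_utils.py` in favor of the shared `SplitHubFile` type from `libcommon.utils` (commit 2ff4f91).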
21 changes: 21 additions & 0 deletions chart/templates/_envWorker.tpl
@@ -87,4 +87,25 @@
 - name: CONFIG_NAMES_MAX_NUMBER
   value: {{ .Values.configNames.maxNumber | quote }}

+# specific to 'split-duckdb-index' job runner
+- name: DUCKDB_INDEX_COMMIT_MESSAGE
+  value: {{ .Values.duckDBIndex.commitMessage | quote }}
+- name: DUCKDB_INDEX_COMMITTER_HF_TOKEN
+  {{- if .Values.secrets.appParquetConverterHfToken.fromSecret }}
+  valueFrom:
+    secretKeyRef:
+      name: {{ .Values.secrets.appParquetConverterHfToken.secretName | quote }}
+      key: HF_TOKEN
+      optional: false
+  {{- else }}
+  value: {{ .Values.secrets.appParquetConverterHfToken.value }}
+  {{- end }}
+- name: DUCKDB_INDEX_TARGET_REVISION
+  value: {{ .Values.duckDBIndex.targetRevision | quote }}
+- name: DUCKDB_INDEX_URL_TEMPLATE
+  value: {{ .Values.duckDBIndex.urlTemplate | quote }}
+- name: DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES
+  value: {{ .Values.duckDBIndex.maxParquetSizeBytes | quote }}
+- name: DUCKDB_INDEX_STORAGE_DIRECTORY
+  value: {{ .Values.duckDBIndex.storageDirectory | quote }}
 {{- end -}}
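These `DUCKDB_INDEX_*` variables are the same ones that `DuckDbIndexConfig.from_env()` reads back (under the `DUCKDB_INDEX_` prefix) in `libs/libcommon/src/libcommon/config.py`, further down in this diff.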
9 changes: 9 additions & 0 deletions chart/templates/_helpers.tpl
@@ -169,6 +169,15 @@ The parquet-metadata/ subpath in the NFS
 {{- printf "%s/%s/%s/" .Chart.Name .Release.Name "parquet-metadata" }}
 {{- end }}

+{{/*
+The duckdb-index/ subpath in the NFS
+- in a subdirectory named as the chart (datasets-server/), and below it,
+- in a subdirectory named as the Release, so that Releases will not share the same dir
+*/}}
+{{- define "duckDBIndex.subpath" -}}
+{{- printf "%s/%s/%s/" .Chart.Name .Release.Name "duckdb-index" }}
+{{- end }}
+
 {{/*
 The datasets library will use this directory as a cache
 - in a subdirectory named as the chart (datasets-server/), and below it,
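For example, with the chart name `datasets-server` and a hypothetical release named `prod`, `duckDBIndex.subpath` renders to `datasets-server/prod/duckdb-index/`, so two releases never share the same directory.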
21 changes: 21 additions & 0 deletions chart/templates/_initContainerDuckDBIndex.tpl
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2023 The HuggingFace Authors.
+
+{{- define "initContainerDuckDBIndex" -}}
+- name: prepare-duckdb-index
+  image: ubuntu:focal
+  imagePullPolicy: {{ .Values.images.pullPolicy }}
+  command: ["/bin/sh", "-c"]
+  args:
+    - chown {{ .Values.uid }}:{{ .Values.gid }} /mounted-path;
+  volumeMounts:
+    - mountPath: /mounted-path
+      mountPropagation: None
+      name: data
+      subPath: "{{ include "duckDBIndex.subpath" . }}"
+      readOnly: false
+  securityContext:
+    runAsNonRoot: false
+    runAsUser: 0
+    runAsGroup: 0
+{{- end -}}
10 changes: 10 additions & 0 deletions chart/templates/_volumeMountDuckDBIndex.tpl
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2023 The HuggingFace Authors.
+
+{{- define "volumeMountDuckDBIndexRW" -}}
+- mountPath: {{ .Values.duckDBIndex.storageDirectory | quote }}
+  mountPropagation: None
+  name: data
+  subPath: "{{ include "duckDBIndex.subpath" . }}"
+  readOnly: false
+{{- end -}}
1 change: 1 addition & 0 deletions chart/templates/worker/_container.tpl
@@ -24,6 +24,7 @@
 {{ include "volumeMountAssetsRW" . | nindent 2 }}
 {{ include "volumeMountCache" . | nindent 2 }}
 {{ include "volumeMountParquetMetadataRW" . | nindent 2 }}
+{{ include "volumeMountDuckDBIndexRW" . | nindent 2 }}
 securityContext:
   allowPrivilegeEscalation: false
 resources: {{ toYaml .workerValues.resources | nindent 4 }}
1 change: 1 addition & 0 deletions chart/templates/worker/_deployment.yaml
@@ -26,6 +26,7 @@ spec:
 {{ include "initContainerAssets" . | nindent 8 }}
 {{ include "initContainerCache" . | nindent 8 }}
 {{ include "initContainerParquetMetadata" . | nindent 8 }}
+{{ include "initContainerDuckDBIndex" . | nindent 8 }}
 containers: {{ include "containerWorker" . | nindent 8 }}
 nodeSelector: {{ toYaml .workerValues.nodeSelector | nindent 8 }}
 tolerations: {{ toYaml .workerValues.tolerations | nindent 8 }}
11 changes: 11 additions & 0 deletions chart/values.yaml
@@ -214,6 +214,17 @@ parquetMetadata:
   # Directory on the shared storage (parquet metadata files used for random access in /rows)
   storageDirectory: "/parquet-metadata"

+duckDBIndex:
+  # Directory on the shared storage (used temporarily to prepare the duckdb indexes before sending to the Hub)
+  storageDirectory: "/duckdb-index"
[Review comment, Member] Using a non-shared temporary directory would also have worked, no? I'm fine with using shared storage though.
[Reply, Collaborator] I think it's due to the size: the pods have nearly no space, while the shared storage has as much space as we need (EFS).
[Reply, Collaborator] (even if, by default, the local duckdb will be small)
+  # the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`.
+  commitMessage: "Update duckdb index files"
+  # the git revision of the dataset where to store the duckdb index file. Defaults to `refs/convert/parquet`.
+  targetRevision: "refs/convert/parquet"
+  # the URL template to build the duckdb index file URL. Defaults to `/datasets/%s/resolve/%s/%s`.
+  urlTemplate: "/datasets/%s/resolve/%s/%s"
+  # the maximum total size in bytes of the split's parquet files; splits with a bigger parquet size are not indexed.
+  maxParquetSizeBytes: "100_000_000"
[Review comment, Member] If we want to increase this, we'll need to know how much time it takes to index datasets and to query them. We can see that later by adding some profiling.

# Directory where the cache data will be stored
cacheDirectory: "/datasets-server-cache"
41 changes: 41 additions & 0 deletions libs/libcommon/src/libcommon/config.py
@@ -24,6 +24,7 @@
     PROCESSING_STEP_DATASET_PARQUET_VERSION,
     PROCESSING_STEP_DATASET_SIZE_VERSION,
     PROCESSING_STEP_DATASET_SPLIT_NAMES_VERSION,
+    PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION,
     PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION,
     PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION,
     PROCESSING_STEP_SPLIT_IMAGE_URL_COLUMNS_VERSION,
@@ -104,6 +105,39 @@ def from_env(cls) -> "ParquetMetadataConfig":
         )


+DUCKDB_INDEX_STORAGE_DIRECTORY = None
+DUCKDB_INDEX_COMMIT_MESSAGE = "Update duckdb index file"
+DUCKDB_INDEX_COMMITTER_HF_TOKEN = None
+DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES = 100_000_000
+DUCKDB_INDEX_TARGET_REVISION = "refs/convert/parquet"
+DUCKDB_INDEX_URL_TEMPLATE = "/datasets/%s/resolve/%s/%s"
+
+
+@dataclass(frozen=True)
+class DuckDbIndexConfig:
+    storage_directory: Optional[str] = DUCKDB_INDEX_STORAGE_DIRECTORY
+    commit_message: str = DUCKDB_INDEX_COMMIT_MESSAGE
+    committer_hf_token: Optional[str] = DUCKDB_INDEX_COMMITTER_HF_TOKEN
+    target_revision: str = DUCKDB_INDEX_TARGET_REVISION
+    url_template: str = DUCKDB_INDEX_URL_TEMPLATE
+    max_parquet_size_bytes: int = DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES
+
+    @classmethod
+    def from_env(cls) -> "DuckDbIndexConfig":
+        env = Env(expand_vars=True)
+        with env.prefixed("DUCKDB_INDEX_"):
+            return cls(
+                storage_directory=env.str(name="STORAGE_DIRECTORY", default=DUCKDB_INDEX_STORAGE_DIRECTORY),
+                commit_message=env.str(name="COMMIT_MESSAGE", default=DUCKDB_INDEX_COMMIT_MESSAGE),
+                committer_hf_token=env.str(name="COMMITTER_HF_TOKEN", default=DUCKDB_INDEX_COMMITTER_HF_TOKEN),
+                target_revision=env.str(name="TARGET_REVISION", default=DUCKDB_INDEX_TARGET_REVISION),
+                url_template=env.str(name="URL_TEMPLATE", default=DUCKDB_INDEX_URL_TEMPLATE),
+                max_parquet_size_bytes=env.int(
+                    name="MAX_PARQUET_SIZE_BYTES", default=DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES
+                ),
+            )
+
+
 COMMON_HF_ENDPOINT = "https://huggingface.co"
 COMMON_HF_TOKEN = None

@@ -319,6 +353,13 @@ class ProcessingGraphConfig:
                 "triggered_by": ["dataset-config-names", "config-opt-in-out-urls-count"],
                 "job_runner_version": PROCESSING_STEP_DATASET_OPT_IN_OUT_URLS_COUNT_VERSION,
             },
+            "split-duckdb-index": {
+                "input_type": "split",
+                "triggered_by": [
+                    "config-split-names-from-info",
+                ],
+                "job_runner_version": PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION,
+            },
         }
     )
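A short usage sketch (assumed code, not part of the PR) showing how the env vars set by the chart drive this config, and how the URL template could be filled in:

```python
# Sketch (assumed, not PR code): env-driven configuration and URL composition.
import os

os.environ["DUCKDB_INDEX_MAX_PARQUET_SIZE_BYTES"] = "50_000_000"

config = DuckDbIndexConfig.from_env()
assert config.max_parquet_size_bytes == 50_000_000        # overridden by env
assert config.target_revision == "refs/convert/parquet"   # default kept

# url_template takes (dataset, revision, filename); revision quoting is elided
# in this hypothetical composition:
url = "https://huggingface.co" + config.url_template % (
    "user/dataset", config.target_revision, "index.duckdb"
)
```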
2 changes: 2 additions & 0 deletions libs/libcommon/src/libcommon/constants.py
@@ -6,6 +6,7 @@
 CACHE_MONGOENGINE_ALIAS = "cache"
 CACHED_ASSETS_CACHE_APPNAME = "datasets_server_cached_assets"
 PARQUET_METADATA_CACHE_APPNAME = "datasets_server_parquet_metadata"
+DUCKDB_INDEX_CACHE_APPNAME = "datasets_server_duckdb_index"
 METRICS_COLLECTION_CACHE_TOTAL_METRIC = "cacheTotalMetric"
 METRICS_COLLECTION_JOB_TOTAL_METRIC = "jobTotalMetric"
 METRICS_MONGOENGINE_ALIAS = "metrics"
@@ -36,6 +37,7 @@
 PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_COUNT_VERSION = 2
 PROCESSING_STEP_SPLIT_OPT_IN_OUT_URLS_SCAN_VERSION = 4
 PROCESSING_STEP_SPLIT_IMAGE_URL_COLUMNS_VERSION = 1
+PROCESSING_STEP_SPLIT_DUCKDB_INDEX_VERSION = 1

 PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = 100
 PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = 100
52 changes: 42 additions & 10 deletions libs/libcommon/src/libcommon/exceptions.py
@@ -73,6 +73,7 @@ def as_response(self) -> ErrorResponse:


 CacheableErrorCode = Literal[
+    "CacheDirectoryNotInitializedError",
     "ConfigNamesError",
     "CreateCommitError",
     "DatasetInBlockListError",
@@ -89,6 +90,7 @@
     "DatasetWithTooManyConfigsError",
     "DatasetWithTooManyParquetFilesError",
     "DisabledViewerError",
+    "DuckDBIndexFileNotFoundError",
     "EmptyDatasetError",
     "ExternalFilesSizeRequestConnectionError",
     "ExternalFilesSizeRequestError",
@@ -102,6 +104,7 @@
     "JobManagerExceededMaximumDurationError",
     "LockedDatasetTimeoutError",
     "MissingSpawningTokenError",
+    "NoIndexableColumnsError",
     "NormalRowsError",
     "ParameterMissingError",
     "ParquetResponseEmptyError",
@@ -112,6 +115,7 @@
     "SplitsNamesError",
     "SplitNamesFromStreamingError",
     "SplitNotFoundError",
+    "SplitWithTooBigParquetError",
     "StreamingRowsError",
     "TooBigContentError",
     "TooManyColumnsError",
@@ -136,6 +140,13 @@ def __init__(
         )


+class CacheDirectoryNotInitializedError(CacheableError):
+    """Raised when the cache directory has not been initialized before job compute."""
+
+    def __init__(self, message: str, cause: Optional[BaseException] = None):
+        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "CacheDirectoryNotInitializedError", cause, True)
+
+
 class ConfigNamesError(CacheableError):
     """Raised when the config names could not be fetched."""

@@ -232,6 +243,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooBigExternalFilesError", cause, True)


+class DatasetWithTooManyConfigsError(CacheableError):
+    """Raised when the number of configs of a dataset exceeded the limit."""
+
+    def __init__(self, message: str, cause: Optional[BaseException] = None):
+        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, True)
+
+
 class DatasetWithTooManyExternalFilesError(CacheableError):
     """Raised when the number of external data files of a dataset is too big."""

@@ -246,11 +264,11 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyParquetFilesError", cause, True)


-class LockedDatasetTimeoutError(CacheableError):
-    """Raised when a dataset is locked by another job."""
+class DuckDBIndexFileNotFoundError(CacheableError):
+    """Raised when no duckdb index file was found for split."""

     def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "LockedDatasetTimeoutError", cause, True)
+        super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DuckDBIndexFileNotFoundError", cause, False)


 class DisabledViewerError(CacheableError):
@@ -355,6 +373,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         )


+class LockedDatasetTimeoutError(CacheableError):
+    """Raised when a dataset is locked by another job."""
+
+    def __init__(self, message: str, cause: Optional[BaseException] = None):
+        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "LockedDatasetTimeoutError", cause, True)
+
+
 class MissingSpawningTokenError(CacheableError):
     """Raised when the spawning.ai token is not set."""

@@ -369,6 +394,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NormalRowsError", cause, True)


+class NoIndexableColumnsError(CacheableError):
+    """Raised when split does not have string columns to index."""
+
+    def __init__(self, message: str, cause: Optional[BaseException] = None):
+        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True)
+
+
 class ParameterMissingError(CacheableError):
     """Raised when request is missing some parameter."""

@@ -450,6 +482,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         )


+class SplitWithTooBigParquetError(CacheableError):
+    """Raised when the split parquet size (sum of parquet sizes given) is too big."""
+
+    def __init__(self, message: str, cause: Optional[BaseException] = None):
+        super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False)
+
+
 class StreamingRowsError(CacheableError):
     """Raised when the rows could not be fetched in streaming mode."""

@@ -496,10 +535,3 @@ class UnsupportedExternalFilesError(CacheableError):

     def __init__(self, message: str, cause: Optional[BaseException] = None):
         super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, True)
-
-
-class DatasetWithTooManyConfigsError(CacheableError):
-    """Raised when the number of configs of a dataset exceeded the limit."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, True)
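The new split-level errors hint at the validations the `split-duckdb-index` job runner performs before indexing. A minimal sketch under assumed signatures (the runner itself is not part of this diff):

```python
# Sketch (assumed helper, not in this diff): guard clauses using the new errors.
from typing import List

def validate_split(
    string_columns: List[str], parquet_sizes: List[int], max_parquet_size_bytes: int
) -> None:
    if not string_columns:
        # Full-text search needs text: no string columns means nothing to index.
        raise NoIndexableColumnsError("No string columns available to index.")
    total = sum(parquet_sizes)
    if total > max_parquet_size_bytes:
        # Sum of the split's parquet file sizes exceeds the configured limit.
        raise SplitWithTooBigParquetError(
            f"Split parquet size {total} exceeds the limit of {max_parquet_size_bytes} bytes."
        )
```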
12 changes: 2 additions & 10 deletions libs/libcommon/src/libcommon/parquet_utils.py
@@ -20,6 +20,7 @@
 from libcommon.processing_graph import ProcessingGraph
 from libcommon.prometheus import StepProfiler
 from libcommon.simple_cache import get_previous_step_or_raise
+from libcommon.utils import SplitHubFile

 StrPath = Union[str, PathLike[str]]

@@ -36,15 +37,6 @@ class FileSystemError(Exception):
     pass


-class ParquetFileItem(TypedDict):
-    dataset: str
-    config: str
-    split: str
-    url: str
-    filename: str
-    size: int
-
-
 class ParquetFileMetadataItem(TypedDict):
     dataset: str
     config: str
@@ -134,7 +126,7 @@ def query(self, offset: int, length: int) -> pa.Table:

     @staticmethod
     def from_parquet_file_items(
-        parquet_file_items: List[ParquetFileItem],
+        parquet_file_items: List[SplitHubFile],
         dataset: str,
         config: str,
         split: str,
15 changes: 15 additions & 0 deletions libs/libcommon/src/libcommon/storage.py
@@ -12,6 +12,7 @@
 from libcommon.constants import (
     ASSETS_CACHE_APPNAME,
     CACHED_ASSETS_CACHE_APPNAME,
+    DUCKDB_INDEX_CACHE_APPNAME,
     PARQUET_METADATA_CACHE_APPNAME,
 )

@@ -81,6 +82,20 @@ def init_parquet_metadata_dir(directory: Optional[StrPath] = None) -> StrPath:
     return init_dir(directory, appname=PARQUET_METADATA_CACHE_APPNAME)


+def init_duckdb_index_cache_dir(directory: Optional[StrPath] = None) -> StrPath:
+    """Initialize the duckdb index directory.
+
+    If directory is None, it will be set to the default duckdb index location on the machine.
+
+    Args:
+        directory (Optional[Union[str, PathLike[str]]], optional): The directory to initialize. Defaults to None.
+
+    Returns:
+        Union[str, PathLike[str]]: The directory.
+    """
+    return init_dir(directory, appname=DUCKDB_INDEX_CACHE_APPNAME)
+
+
 def exists(path: StrPath) -> bool:
     """Check if a path exists.
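Presumably the worker wires this up the same way as the existing parquet-metadata directory; a sketch under that assumption:

```python
# Sketch (assumed wiring, mirroring the init_parquet_metadata_dir pattern).
duckdb_index_config = DuckDbIndexConfig.from_env()
duckdb_index_cache_directory = init_duckdb_index_cache_dir(
    directory=duckdb_index_config.storage_directory
)
```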