From dc7c9708b686a749ce2d55117507583bfdd5dbb2 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Fri, 5 Apr 2024 14:56:27 +0200 Subject: [PATCH] Remove pipeline references (#923) Cleaning up the core code, removing pipeline naming in doc strings and documentation. Also updated the readme generation of component to remove the pipeline name there. --- scripts/component_readme/readme_template.md | 10 +- src/fondant/cli.py | 121 +++++++++--------- src/fondant/component/executor.py | 4 +- .../components/caption_images/README.md | 8 +- src/fondant/components/chunk_text/README.md | 8 +- src/fondant/components/crop_images/README.md | 8 +- .../components/download_images/README.md | 8 +- src/fondant/components/embed_images/README.md | 8 +- src/fondant/components/embed_text/README.md | 8 +- .../extract_image_resolution/README.md | 8 +- .../filter_image_resolution/README.md | 8 +- .../components/filter_language/README.md | 8 +- .../components/filter_text_length/README.md | 8 +- .../components/generate_minhash/README.md | 8 +- .../components/index_aws_opensearch/README.md | 8 +- src/fondant/components/index_qdrant/README.md | 8 +- .../components/index_weaviate/README.md | 35 +---- .../index_weaviate/fondant_component.yaml | 27 +--- .../components/load_from_csv/README.md | 10 +- .../load_from_csv/fondant_component.yaml | 2 +- .../components/load_from_files/README.md | 8 +- .../components/load_from_hf_hub/README.md | 8 +- .../components/load_from_parquet/README.md | 8 +- .../components/load_from_pdf/README.md | 8 +- .../components/resize_images/README.md | 8 +- .../README.md | 8 +- .../retrieve_from_faiss_by_prompt/README.md | 8 +- .../retrieve_laion_by_embedding/README.md | 8 +- .../retrieve_laion_by_prompt/README.md | 8 +- .../components/segment_images/README.md | 8 +- .../components/write_to_file/README.md | 8 +- .../components/write_to_hf_hub/README.md | 8 +- src/fondant/core/manifest.py | 2 +- src/fondant/core/schema.py | 2 +- src/fondant/dataset/compiler.py | 13 +- src/fondant/dataset/dataset.py | 26 ++-- src/fondant/dataset/runner.py | 24 ++-- src/fondant/explore.py | 4 +- src/fondant/testing.py | 54 ++++---- tests/pipeline/test_compiler.py | 6 +- tests/test_cli.py | 8 +- 41 files changed, 229 insertions(+), 319 deletions(-) diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md index 42244bab0..eb6657c78 100644 --- a/scripts/component_readme/readme_template.md +++ b/scripts/component_readme/readme_template.md @@ -73,18 +73,16 @@ This component takes no arguments. ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - {% if "Data loading" in tags %} -dataset = pipeline.read( +dataset = Dataset.create( {% else %} -dataset = pipeline.read(...) +dataset = Dataset.read(...) {% if "Data writing" not in tags %} dataset = dataset.apply( diff --git a/src/fondant/cli.py b/src/fondant/cli.py index c4203dc1e..35734e0ad 100644 --- a/src/fondant/cli.py +++ b/src/fondant/cli.py @@ -51,14 +51,15 @@ def entrypoint(): formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent( """ - Fondant is an Open-Source framework for building and running data pipelines. + Fondant is an Open-Source framework for collaborative building of datasets. 
You can read more about fondant here: https://github.com/ml6team/fondant - This CLI is used to interact with fondant pipelines like compiling and running your pipelines. + This CLI is used to interact with fondant datasets like compiling and running workflows to + materialize datasets. Example: - fondant compile my_project.my_pipeline.py + fondant run local my_dataset.py """, ), epilog=textwrap.dedent( @@ -95,9 +96,9 @@ def register_explore(parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent( """ - Explore and visualize the data produced by a fondant pipeline. + Explore and visualize the data of a Fondant dataset. - This will spin up a docker container that hosts a web application that allows you to explore the data produced by a fondant pipeline. + This will spin up a docker container that hosts a web application that allows you to explore the dataset. The default address is http://localhost:8501. You can choose both a local and remote base path to explore. If the data that you want to explore is stored remotely, you should use the --extra-volumes flag to specify credentials or local files you need to mount. @@ -118,7 +119,7 @@ def register_explore(parent_parser): "--base_path", "-b", type=str, - help="""Base path that contains the data produced by a Fondant pipeline (local or remote) + help="""Base path that contains the dataset (local or remote) .""", ) start_parser.add_argument( @@ -159,7 +160,7 @@ def register_explore(parent_parser): start_parser.add_argument( "--extra-volumes", help="""Extra volumes to mount in containers. You can use the --extra-volumes flag to specify extra volumes to mount in the containers this can be used: - - to mount data directories to be used by the pipeline (note that if your pipeline's base_path is local it will already be mounted for you). + - to mount data directories to be used by the dataset (note that if your datasets base_path is local it will already be mounted for you). - to mount cloud credentials""", nargs="+", ) @@ -288,17 +289,17 @@ def register_compile(parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent( """ - Compile a fondant pipeline into pipeline specification file file. + Compile a fondant dataset into workflow specification file. - The pipeline argument is a formatstring. The compiler will try to import the pipeline from the module specified in the formatstring. + The dataset argument is a formatstring. The compiler will try to import the dataset from the module specified in the formatstring. (NOTE: path is patched to include the current working directory so you can do relative imports) You can use different modes for fondant runners. Current existing modes are local and kubeflow. 
Examples of compiling component: - fondant compile local --extra-volumes $HOME/.aws/credentials:/root/.aws/credentials my_project.my_pipeline.py + fondant compile local --extra-volumes $HOME/.aws/credentials:/root/.aws/credentials my_project.my_dataset.py - fondant compile kubeflow --extra-volumes $HOME/.config/gcloud/application_default_credentials.json:/root/.config/gcloud/application_default_credentials.json my_project.my_pipeline.py + fondant compile kubeflow --extra-volumes $HOME/.config/gcloud/application_default_credentials.json:/root/.config/gcloud/application_default_credentials.json my_project.my_dataset.py """, ), ) @@ -324,20 +325,20 @@ def register_compile(parent_parser): # Local runner parser local_parser.add_argument( "ref", - help="""Reference to the pipeline to run, a path to a to a module containing - the pipeline instance that will be compiled (e.g. my-project/pipeline.py)""", + help="""Reference to the dataset to materialize, a path to a to a module containing + the dataset instance that will be compiled (e.g. my-project/dataset.py)""", action="store", ) local_parser.add_argument( "--output-path", "-o", - help="Output path of compiled pipeline", + help="Output path of compiled workflow", default="docker-compose.yml", ) local_parser.add_argument( "--extra-volumes", help="""Extra volumes to mount in containers. You can use the --extra-volumes flag to specify extra volumes to mount in the containers this can be used: - - to mount data directories to be used by the pipeline (note that if your pipeline's base_path is local it will already be mounted for you). + - to mount data directories to be used by the workflow (note that if your dataset working directory is local it will already be mounted for you). - to mount cloud credentials""", nargs="+", ) @@ -358,42 +359,42 @@ def register_compile(parent_parser): # Kubeflow parser kubeflow_parser.add_argument( "ref", - help="""Reference to the pipeline to run, a path to a to a module containing - the pipeline instance that will be compiled (e.g. my-project/pipeline.py)""", + help="""Reference to the dataset to materialize, a path to a to a module containing + the dataset instance that will be compiled (e.g. my-project/dataset.py)""", action="store", ) kubeflow_parser.add_argument( "--output-path", "-o", - help="Output path of compiled pipeline", + help="Output path of compiled dataset workflow", default="kubeflow-pipeline.yaml", ) # vertex parser vertex_parser.add_argument( "ref", - help="""Reference to the pipeline to run, a path to a to a module containing - the pipeline instance that will be compiled (e.g. my-project/pipeline.py)""", + help="""Reference to the dataset to materialize, a path to a to a module containing + the dataset instance that will be compiled (e.g. my-project/dataset.py)""", action="store", ) vertex_parser.add_argument( "--output-path", "-o", - help="Output path of compiled pipeline", + help="Output path of compiled workflow", default="vertex-pipeline.yml", ) # sagemaker parser sagemaker_parser.add_argument( "ref", - help="""Reference to the pipeline to run, a path to a to a module containing - the pipeline instance that will be compiled (e.g. my-project/pipeline.py)""", + help="""Reference to the dataset to materialize, a path to a to a module containing + the dataset instance that will be compiled (e.g. 
my-project/dataset.py)""", action="store", ) sagemaker_parser.add_argument( "--output-path", "-o", - help="Output path of compiled pipeline", + help="Output path of compiled workflow", default=".fondant/sagemaker_pipeline.json", ) sagemaker_parser.add_argument( @@ -416,10 +417,10 @@ def compile_local(args): if args.extra_volumes: extra_volumes.extend(args.extra_volumes) - pipeline = dataset_from_string(args.ref) + dataset = dataset_from_string(args.ref) compiler = DockerCompiler() compiler.compile( - pipeline=pipeline, + dataset=dataset, extra_volumes=extra_volumes, output_path=args.output_path, build_args=args.build_arg, @@ -430,26 +431,26 @@ def compile_local(args): def compile_kfp(args): from fondant.dataset.compiler import KubeFlowCompiler - pipeline = dataset_from_string(args.ref) + dataset = dataset_from_string(args.ref) compiler = KubeFlowCompiler() - compiler.compile(pipeline=pipeline, output_path=args.output_path) + compiler.compile(dataset=dataset, output_path=args.output_path) def compile_vertex(args): from fondant.dataset.compiler import VertexCompiler - pipeline = dataset_from_string(args.ref) + dataset = dataset_from_string(args.ref) compiler = VertexCompiler() - compiler.compile(pipeline=pipeline, output_path=args.output_path) + compiler.compile(dataset=dataset, output_path=args.output_path) def compile_sagemaker(args): from fondant.dataset.compiler import SagemakerCompiler - pipeline = dataset_from_string(args.ref) + dataset = dataset_from_string(args.ref) compiler = SagemakerCompiler() compiler.compile( - pipeline=pipeline, + dataset=dataset, output_path=args.output_path, role_arn=args.role_arn, ) @@ -461,16 +462,16 @@ def register_run(parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent( """ - Run a fondant pipeline locally or on kubeflow. The run command excepts a reference to an already compiled - pipeline (see fondant compile --help for more info) - OR a path to a spec file in which case it will compile the pipeline first and then run it. + Run a fondant dataset workflow locally or remote. The run command excepts a reference to an already compiled + workflow (see fondant compile --help for more info) + OR a path to a spec file in which case it will compile the dataset first and then run it. You can use different modes for fondant runners. Current existing modes are `local` and `kubeflow`. You can run `fondant --help` to find out more about the specific arguments for each mode. Examples of running component: fondant run local --auth-gcp - fondant run kubeflow ./my_compiled_kubeflow_pipeline.tgz + fondant run kubeflow ./my_compiled_kubeflow_dataset.tgz """, ), ) @@ -495,8 +496,8 @@ def register_run(parent_parser): # Local runner parser local_parser.add_argument( "ref", - help="""Reference to the pipeline to run, can be a path to a spec file or - a module containing the pipeline instance that will be compiled first (e.g. pipeline.py) + help="""Reference to the dataset to materialize, can be a path to a spec file or + a module containing the dataset instance that will be compiled first (e.g. dataset.py) """, action="store", ) @@ -504,13 +505,13 @@ def register_run(parent_parser): "--extra-volumes", nargs="+", help="""Extra volumes to mount in containers. You can use the --extra-volumes flag to specify extra volumes to mount in the containers this can be used: - - to mount data directories to be used by the pipeline (note that if your pipeline's base_path is local it will already be mounted for you). 
+ - to mount data directories to be used by the dataset (note that if your datasets working directory is local it will already be mounted for you). - to mount cloud credentials""", ) local_parser.add_argument( "--working-directory", - help="""Working directory where the pipeline will be executed.""", + help="""Working directory where the dataset workflow will be executed.""", ) local_parser.add_argument( @@ -529,56 +530,56 @@ def register_run(parent_parser): # kubeflow runner parser kubeflow_parser.add_argument( "ref", - help="""Reference to the pipeline to run, can be a path to a spec file or - a module containing the pipeline instance that will be compiled first (e.g. pipeline.py) + help="""Reference to the dataset to materialize, can be a path to a spec file or + a module containing the dataset instance that will be compiled first (e.g. dataset.py) """, action="store", ) kubeflow_parser.add_argument( "--working-directory", - help="""Working directory where the pipeline will be executed.""", + help="""Working directory where the dataset workflow will be executed.""", required=True, ) kubeflow_parser.add_argument( "--output-path", "-o", - help="Output path of compiled pipeline", + help="Output path of compiled dataset workflow", default="kubeflow-pipeline.yaml", ) kubeflow_parser.add_argument( "--host", - help="KubeFlow pipeline host url", + help="KubeFlow host url", required=True, ) # Vertex runner parser vertex_parser.add_argument( "ref", - help="""Reference to the pipeline to run, can be a path to a spec file or - a module containing the pipeline instance that will be compiled first (e.g. pipeline.py) + help="""Reference to the dataset to materialize, can be a path to a spec file or + a module containing the dataset instance that will be compiled first (e.g. dataset.py) """, action="store", ) vertex_parser.add_argument( "--working-directory", - help="""Working directory where the pipeline will be executed.""", + help="""Working directory where the dataset workflow will be executed.""", required=True, ) vertex_parser.add_argument( "--project-id", - help="""The project id of the GCP project used to submit the pipeline""", + help="""The project id of the GCP project used to submit the workflow""", ) vertex_parser.add_argument( "--region", - help="The region where to run the pipeline", + help="The region where to run the workflow", ) vertex_parser.add_argument( "--output-path", "-o", - help="Output path of compiled pipeline", + help="Output path of compiled dataset", default="vertex-pipeline.yaml", ) @@ -598,14 +599,14 @@ def register_run(parent_parser): # sagemaker runner parser sagemaker_parser.add_argument( "ref", - help="""Reference to the pipeline to run, can be a path to a spec file or - a module containing the pipeline instance that will be compiled first (e.g. pipeline.py) + help="""Reference to the dataset to materialize, can be a path to a spec file or + a module containing the dataset instance that will be compiled first (e.g. 
dataset.py) """, action="store", ) sagemaker_parser.add_argument( "--working-directory", - help="""Working directory where the pipeline will be executed.""", + help="""Working directory where the dataset workflow will be executed.""", required=True, ) sagemaker_parser.add_argument( @@ -699,7 +700,7 @@ def run_sagemaker(args): runner.run( dataset=ref, - pipeline_name=args.pipeline_name, + pipeline_name=args.dataset_name, role_arn=args.role_arn, working_directory=args.working_directory, ) @@ -711,7 +712,7 @@ def register_execute(parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent( """ - Execute a Fondant component using specified pipeline parameters. + Execute a Fondant component using specified dataset parameters. This command is intended to be included in the entrypoint of a component's Dockerfile. The provided argument to this command should indicate the module where the component's implementation resides. @@ -805,12 +806,12 @@ def dataset_from_string(string_ref: str) -> Dataset: # noqa: PLR0912 https://github.com/pallets/flask/blob/d611989/src/flask/cli.py#L112 Args: - string_ref: String reference describing the pipeline in the format {module}:{attribute}. + string_ref: String reference describing the dataset in the format {module}:{attribute}. The attribute can also be a function call, optionally including arguments: {module}:{function} or {module}:{function(args)}. Returns: - The pipeline obtained from the provided string + The dataset obtained from the provided string """ if ":" not in string_ref: return dataset_from_module(string_ref) @@ -819,7 +820,7 @@ def dataset_from_string(string_ref: str) -> Dataset: # noqa: PLR0912 module = get_module(module_str) - # Parse `pipeline_str` as a single expression to determine if it's a valid + # Parse `dataset_str` as a single expression to determine if it's a valid # attribute name or function call. try: expr = ast.parse(dataset_str.strip(), mode="eval").body @@ -869,7 +870,7 @@ def dataset_from_string(string_ref: str) -> Dataset: # noqa: PLR0912 ) from e # If the attribute is a function, call it with any args and kwargs - # to get the real pipeline. + # to get the real dataset. if inspect.isfunction(attr): try: app = attr(*args, **kwargs) # type: ignore diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index 2a79053d0..81c31fcdb 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -259,7 +259,7 @@ def _is_previous_cached(self, input_manifest: Manifest) -> bool: This function compares the run ID of the input manifest (representing the previous component) with the run ID of the current component metadata. If the run IDs are different, it indicates that the previous component's output belongs to - another pipeline run, implying that it is cached. Otherwise, if the run IDs match, it + another workflow run, implying that it is cached. Otherwise, if the run IDs match, it suggests that the previous component was not cached and had to execute to produce the current output. @@ -280,7 +280,7 @@ def _is_previous_cached(self, input_manifest: Manifest) -> bool: logger.info( f"Previous component `{previous_component_id}` run was cached. 
" - f"Cached pipeline id: {input_manifest.run_id}", + f"Cached workflow id: {input_manifest.run_id}", ) return True diff --git a/src/fondant/components/caption_images/README.md b/src/fondant/components/caption_images/README.md index 2158b173b..9c21b29b0 100644 --- a/src/fondant/components/caption_images/README.md +++ b/src/fondant/components/caption_images/README.md @@ -38,15 +38,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "caption_images", diff --git a/src/fondant/components/chunk_text/README.md b/src/fondant/components/chunk_text/README.md index 490f9e4e2..1aaff6e83 100644 --- a/src/fondant/components/chunk_text/README.md +++ b/src/fondant/components/chunk_text/README.md @@ -50,15 +50,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "chunk_text", diff --git a/src/fondant/components/crop_images/README.md b/src/fondant/components/crop_images/README.md index 33c2bfd4c..0c7ad2e8a 100644 --- a/src/fondant/components/crop_images/README.md +++ b/src/fondant/components/crop_images/README.md @@ -54,15 +54,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "crop_images", diff --git a/src/fondant/components/download_images/README.md b/src/fondant/components/download_images/README.md index 222ac62ae..67ef2585e 100644 --- a/src/fondant/components/download_images/README.md +++ b/src/fondant/components/download_images/README.md @@ -52,15 +52,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "download_images", diff --git a/src/fondant/components/embed_images/README.md b/src/fondant/components/embed_images/README.md index 8b59e56cf..00de8b5e9 100644 --- a/src/fondant/components/embed_images/README.md +++ b/src/fondant/components/embed_images/README.md @@ -37,15 +37,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) 
+dataset = Dataset.read(...) dataset = dataset.apply( "embed_images", diff --git a/src/fondant/components/embed_text/README.md b/src/fondant/components/embed_text/README.md index 030a7fc91..61f483b3a 100644 --- a/src/fondant/components/embed_text/README.md +++ b/src/fondant/components/embed_text/README.md @@ -39,15 +39,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "embed_text", diff --git a/src/fondant/components/extract_image_resolution/README.md b/src/fondant/components/extract_image_resolution/README.md index 521374d85..6b738f30e 100644 --- a/src/fondant/components/extract_image_resolution/README.md +++ b/src/fondant/components/extract_image_resolution/README.md @@ -34,15 +34,13 @@ This component takes no arguments. ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "extract_image_resolution", diff --git a/src/fondant/components/filter_image_resolution/README.md b/src/fondant/components/filter_image_resolution/README.md index 6c10b6025..cd89e699c 100644 --- a/src/fondant/components/filter_image_resolution/README.md +++ b/src/fondant/components/filter_image_resolution/README.md @@ -36,15 +36,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "filter_image_resolution", diff --git a/src/fondant/components/filter_language/README.md b/src/fondant/components/filter_language/README.md index 369efc8e8..d75015b99 100644 --- a/src/fondant/components/filter_language/README.md +++ b/src/fondant/components/filter_language/README.md @@ -34,15 +34,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) 
dataset = dataset.apply( "filter_language", diff --git a/src/fondant/components/filter_text_length/README.md b/src/fondant/components/filter_text_length/README.md index fa2088de6..850561ced 100644 --- a/src/fondant/components/filter_text_length/README.md +++ b/src/fondant/components/filter_text_length/README.md @@ -35,15 +35,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "filter_text_length", diff --git a/src/fondant/components/generate_minhash/README.md b/src/fondant/components/generate_minhash/README.md index 0efe19f29..ca533f3b0 100644 --- a/src/fondant/components/generate_minhash/README.md +++ b/src/fondant/components/generate_minhash/README.md @@ -36,15 +36,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "generate_minhash", diff --git a/src/fondant/components/index_aws_opensearch/README.md b/src/fondant/components/index_aws_opensearch/README.md index 3eb0c5b7a..f3ca1be88 100644 --- a/src/fondant/components/index_aws_opensearch/README.md +++ b/src/fondant/components/index_aws_opensearch/README.md @@ -42,15 +42,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply(...) diff --git a/src/fondant/components/index_qdrant/README.md b/src/fondant/components/index_qdrant/README.md index 6fc43b30a..45b19e46c 100644 --- a/src/fondant/components/index_qdrant/README.md +++ b/src/fondant/components/index_qdrant/README.md @@ -49,15 +49,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply(...) 
diff --git a/src/fondant/components/index_weaviate/README.md b/src/fondant/components/index_weaviate/README.md index 8e49242f1..4c6419e1f 100644 --- a/src/fondant/components/index_weaviate/README.md +++ b/src/fondant/components/index_weaviate/README.md @@ -10,19 +10,9 @@ To run the component with text snippets as input, the component needs to be conn ```python import pyarrow as pa -from fondant.dataset import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline") - -dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/dataset.csv", - }, - produces={ - "text": pa.string(), - } -) +dataset = Dataset.read(...) dataset.write( "index_weaviate", @@ -44,19 +34,10 @@ dataset.write( ```python import pyarrow as pa -from fondant.dataset import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(name="my_pipeline",base_path="path/to/pipeline") -dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/dataset.csv", - }, - produces={ - "text": pa.string(), - } -) +dataset = Dataset.read(...) dataset = dataset.apply( "embed_text", @@ -123,15 +104,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline - +from fondant.dataset import Dataset -pipeline = Pipeline(...) -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply(...) diff --git a/src/fondant/components/index_weaviate/fondant_component.yaml b/src/fondant/components/index_weaviate/fondant_component.yaml index 4e3fe2d8f..46320d091 100644 --- a/src/fondant/components/index_weaviate/fondant_component.yaml +++ b/src/fondant/components/index_weaviate/fondant_component.yaml @@ -8,19 +8,9 @@ description: | ```python import pyarrow as pa - from fondant.dataset import Pipeline + from fondant.dataset import Dataset - pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline") - - dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/dataset.csv", - }, - produces={ - "text": pa.string(), - } - ) + dataset = Dataset.read(...) dataset.write( "index_weaviate", @@ -42,19 +32,10 @@ description: | ```python import pyarrow as pa - from fondant.dataset import Pipeline + from fondant.dataset import Dataset - pipeline = Pipeline(name="my_pipeline",base_path="path/to/pipeline") - dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/dataset.csv", - }, - produces={ - "text": pa.string(), - } - ) + dataset = Dataset.read(...) dataset = dataset.apply( "embed_text", diff --git a/src/fondant/components/load_from_csv/README.md b/src/fondant/components/load_from_csv/README.md index 82fa1e8d4..b82eaa175 100644 --- a/src/fondant/components/load_from_csv/README.md +++ b/src/fondant/components/load_from_csv/README.md @@ -34,21 +34,19 @@ The component takes the following arguments to alter its behavior: | dataset_uri | str | The remote path to the csv file(s) containing the dataset | / | | column_separator | str | Define the column separator of the csv file | / | | column_name_mapping | dict | Mapping of the consumed dataset | / | -| n_rows_to_load | int | Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale | / | +| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing dataset workflows on a small scale | / | | index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / | ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read( +dataset = Dataset.create( "load_from_csv", arguments={ # Add arguments diff --git a/src/fondant/components/load_from_csv/fondant_component.yaml b/src/fondant/components/load_from_csv/fondant_component.yaml index 4c27c6d6a..a145748e7 100644 --- a/src/fondant/components/load_from_csv/fondant_component.yaml +++ b/src/fondant/components/load_from_csv/fondant_component.yaml @@ -19,7 +19,7 @@ args: type: dict default: {} n_rows_to_load: - description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale + description: Optional argument that defines the number of rows to load. Useful for testing dataset workflows on a small scale type: int default: None index_column: diff --git a/src/fondant/components/load_from_files/README.md b/src/fondant/components/load_from_files/README.md index ee7934b38..6fd0d8060 100644 --- a/src/fondant/components/load_from_files/README.md +++ b/src/fondant/components/load_from_files/README.md @@ -37,15 +37,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read( +dataset = Dataset.create( "load_from_files", arguments={ # Add arguments diff --git a/src/fondant/components/load_from_hf_hub/README.md b/src/fondant/components/load_from_hf_hub/README.md index 41ad7ade6..9f0dd4acd 100644 --- a/src/fondant/components/load_from_hf_hub/README.md +++ b/src/fondant/components/load_from_hf_hub/README.md @@ -40,15 +40,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read( +dataset = Dataset.create( "load_from_hf_hub", arguments={ # Add arguments diff --git a/src/fondant/components/load_from_parquet/README.md b/src/fondant/components/load_from_parquet/README.md index 835ee2efb..d72c15490 100644 --- a/src/fondant/components/load_from_parquet/README.md +++ b/src/fondant/components/load_from_parquet/README.md @@ -39,15 +39,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) 
- -dataset = pipeline.read( +dataset = Dataset.create( "load_from_parquet", arguments={ # Add arguments diff --git a/src/fondant/components/load_from_pdf/README.md b/src/fondant/components/load_from_pdf/README.md index d257f3dc9..8409a2c17 100644 --- a/src/fondant/components/load_from_pdf/README.md +++ b/src/fondant/components/load_from_pdf/README.md @@ -40,15 +40,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read( +dataset = Dataset.create( "load_from_pdf", arguments={ # Add arguments diff --git a/src/fondant/components/resize_images/README.md b/src/fondant/components/resize_images/README.md index 98cd43890..18e85e3a6 100644 --- a/src/fondant/components/resize_images/README.md +++ b/src/fondant/components/resize_images/README.md @@ -37,15 +37,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "resize_images", diff --git a/src/fondant/components/retrieve_from_faiss_by_embedding/README.md b/src/fondant/components/retrieve_from_faiss_by_embedding/README.md index 2d3d2e673..3f365ec13 100644 --- a/src/fondant/components/retrieve_from_faiss_by_embedding/README.md +++ b/src/fondant/components/retrieve_from_faiss_by_embedding/README.md @@ -41,15 +41,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "retrieve_from_faiss_by_embedding", diff --git a/src/fondant/components/retrieve_from_faiss_by_prompt/README.md b/src/fondant/components/retrieve_from_faiss_by_prompt/README.md index 86fb418dc..d69329e8f 100644 --- a/src/fondant/components/retrieve_from_faiss_by_prompt/README.md +++ b/src/fondant/components/retrieve_from_faiss_by_prompt/README.md @@ -44,15 +44,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) 
dataset = dataset.apply( "retrieve_from_faiss_by_prompt", diff --git a/src/fondant/components/retrieve_laion_by_embedding/README.md b/src/fondant/components/retrieve_laion_by_embedding/README.md index 8a50e68f8..9795dcac8 100644 --- a/src/fondant/components/retrieve_laion_by_embedding/README.md +++ b/src/fondant/components/retrieve_laion_by_embedding/README.md @@ -41,15 +41,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "retrieve_laion_by_embedding", diff --git a/src/fondant/components/retrieve_laion_by_prompt/README.md b/src/fondant/components/retrieve_laion_by_prompt/README.md index 16988f1f1..33f6608f1 100644 --- a/src/fondant/components/retrieve_laion_by_prompt/README.md +++ b/src/fondant/components/retrieve_laion_by_prompt/README.md @@ -45,15 +45,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "retrieve_laion_by_prompt", diff --git a/src/fondant/components/segment_images/README.md b/src/fondant/components/segment_images/README.md index 2ba49c19a..03eb66862 100644 --- a/src/fondant/components/segment_images/README.md +++ b/src/fondant/components/segment_images/README.md @@ -37,15 +37,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply( "segment_images", diff --git a/src/fondant/components/write_to_file/README.md b/src/fondant/components/write_to_file/README.md index 9c4c26ab9..2807b3af9 100644 --- a/src/fondant/components/write_to_file/README.md +++ b/src/fondant/components/write_to_file/README.md @@ -40,15 +40,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply(...) 
diff --git a/src/fondant/components/write_to_hf_hub/README.md b/src/fondant/components/write_to_hf_hub/README.md index 327eafb8a..1b54cd284 100644 --- a/src/fondant/components/write_to_hf_hub/README.md +++ b/src/fondant/components/write_to_hf_hub/README.md @@ -43,15 +43,13 @@ The component takes the following arguments to alter its behavior: ## Usage -You can add this component to your pipeline using the following code: +You can apply this component to your dataset using the following code: ```python -from fondant.pipeline import Pipeline +from fondant.dataset import Dataset -pipeline = Pipeline(...) - -dataset = pipeline.read(...) +dataset = Dataset.read(...) dataset = dataset.apply(...) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 52e21476a..728b3937a 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -104,7 +104,7 @@ def create( Args: dataset_name: the name of the dataset - run_id: The id of the current pipeline run + run_id: The id of the current workflow run component_id: The id of the current component being executed cache_key: The component cache key manifest_location: location of the manifest.json file itself diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index 1606d3414..e6143ee5a 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -1,5 +1,5 @@ """This module defines common schemas and datatypes used to define Fondant manifests, components -and pipelines. +and datasets. """ import os diff --git a/src/fondant/dataset/compiler.py b/src/fondant/dataset/compiler.py index aa3a57941..fce63514d 100644 --- a/src/fondant/dataset/compiler.py +++ b/src/fondant/dataset/compiler.py @@ -86,7 +86,7 @@ def _build_entrypoint(image: Image) -> t.List[str]: class DockerCompiler(Compiler): - """Compiler that creates a docker-compose spec from a pipeline.""" + """Compiler that creates a docker-compose spec from a dataset.""" def compile( self, @@ -98,7 +98,7 @@ def compile( build_args: t.Optional[t.List[str]] = None, auth_provider: t.Optional[CloudCredentialsMount] = None, ) -> None: - """Compile a pipeline to docker-compose spec and save it to a specified output path. + """Compile a dataset workflow to docker-compose spec and save it to a specified output path. Args: dataset: the dataset to compile @@ -194,7 +194,7 @@ def _generate_spec( build_args: t.List[str], ) -> dict: """Generate a docker-compose spec as a python dictionary, - loops over the pipeline graph to create services and their dependencies. + loops over the dataset graph to create services and their dependencies. """ path, volume = self._patch_path(base_path=working_directory) run_id = dataset.manifest.run_id @@ -476,7 +476,7 @@ def __repr__(self) -> str: class KubeFlowCompiler(Compiler): - """Compiler that creates a Kubeflow pipeline spec from a pipeline.""" + """Compiler that creates a Kubeflow pipeline spec from a dataset.""" def __init__(self): self._resolve_imports() @@ -503,7 +503,8 @@ def compile( working_directory: str, output_path: str, ) -> None: - """Compile a pipeline to Kubeflow pipeline spec and save it to a specified output path. + """Compile a dataset workflow to Kubeflow pipeline spec and save it to a specified + output path. 
Args: dataset: the dataset to compile @@ -874,7 +875,7 @@ def compile( *, role_arn: t.Optional[str] = None, ) -> None: - """Compile a fondant pipeline to sagemaker pipeline spec and save it + """Compile a fondant dataset workflow to sagemaker pipeline spec and save it to a specified output path. Args: diff --git a/src/fondant/dataset/dataset.py b/src/fondant/dataset/dataset.py index 5aee821dd..6d630eab3 100644 --- a/src/fondant/dataset/dataset.py +++ b/src/fondant/dataset/dataset.py @@ -1,4 +1,4 @@ -"""This module defines classes to represent a Fondant Pipeline.""" +"""This module defines classes to represent a Fondant Dataset.""" import copy import datetime @@ -70,15 +70,16 @@ class Resources: """ Class representing the resources to assign to a Fondant Component operation in a Fondant - Pipeline. + Dataset. Arguments: number_of_accelerators: The number of accelerators to assign to the operation (GPU, TPU) accelerator_name: The name of the accelerator to assign. If you're using a cluster setup on GKE, select "GPU" for GPU or "TPU" for TPU. Make sure that you select a nodepool with the available hardware. If you're running the - pipeline on Vertex, then select one of the machines specified in the list of - accelerators here https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec. + dataset materilization workflow on Vertex, then select one of the machines specified + in the list of accelerators + here https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec. node_pool_label: The label of the node pool to which the operation will be assigned. node_pool_name: The name of the node pool to which the operation will be assigned. cache: Set to False to disable caching, True by default. @@ -119,8 +120,8 @@ def to_dict(self) -> t.Dict[str, t.Any]: class ComponentOp: """ - Class representing an operation for a Fondant Component in a Fondant Pipeline. An operation - is a representation of a function that will be executed as part of a pipeline. + Class representing an operation for a Fondant Component in a Fondant Dataset. An operation + is a representation of a function that will be executed as part of the workflow. Arguments: name_or_path: The name of a reusable component, or the path to the directory containing a @@ -558,7 +559,7 @@ def create( def sort_graph(self): """Sort the graph topologically based on task dependencies.""" - logger.info("Sorting pipeline component graph topologically.") + logger.info("Sorting workflow graph topologically.") sorted_graph = [] visited = set() @@ -581,7 +582,7 @@ def depth_first_traversal(node: str): self._graph = OrderedDict((node, self._graph[node]) for node in sorted_graph) def validate(self): - """Sort and run validation on the pipeline definition. + """Sort and run validation on the dataset definition. Args: run_id: run identifier @@ -603,10 +604,11 @@ def _validate_dataset_definition(self): """ run_id = self.manifest.run_id if len(self._graph.keys()) == 0: - logger.info("No components defined in the pipeline. Nothing to validate.") + logger.info( + "No components defined in the dataset workflow. 
Nothing to validate.", + ) return - # TODO: change later if we decide to run 2 fondant pipelines after each other load_component = True load_component_name = list(self._graph.keys())[0] @@ -663,10 +665,10 @@ def _validate_dataset_definition(self): ) load_component = False - logger.info("All pipeline component specifications match.") + logger.info("All workflow component specifications match.") def __repr__(self) -> str: - """Return a string representation of the FondantPipeline object.""" + """Return a string representation of the Fondant dataset object.""" return f"{self.__class__.__name__}({self._graph!r}" @property diff --git a/src/fondant/dataset/runner.py b/src/fondant/dataset/runner.py index 9fe4935f2..6b27764ff 100644 --- a/src/fondant/dataset/runner.py +++ b/src/fondant/dataset/runner.py @@ -42,14 +42,14 @@ def _run(self, input_spec: str, *args, **kwargs): "--remove-orphans", ] - print("Starting pipeline run...") + print("Starting workflow run...") # copy the current environment with the DOCKER_DEFAULT_PLATFORM argument subprocess.call( # nosec cmd, env=dict(os.environ, DOCKER_DEFAULT_PLATFORM="linux/amd64"), ) - print("Finished pipeline run.") + print("Finished workflow run.") def run( self, @@ -60,7 +60,7 @@ def run( build_args: t.Optional[t.List[str]] = None, auth_provider: t.Optional[CloudCredentialsMount] = None, ) -> None: - """Run a pipeline, either from a compiled docker-compose spec or from a fondant pipeline. + """Run a workflow, either from a compiled docker-compose spec or from a fondant dataset. Args: dataset: the dataset to compile or a path to an already compiled docker-compose spec @@ -78,7 +78,7 @@ def run( os.makedirs(".fondant", exist_ok=True) output_path = ".fondant/compose.yaml" logging.info( - "Found reference to un-compiled pipeline... compiling", + "Found reference to un-compiled workflow... compiling", ) compiler = DockerCompiler() compiler.compile( @@ -190,7 +190,7 @@ def run( *, experiment_name: str = "Default", ): - """Run a pipeline, either from a compiled kubeflow spec or from a fondant pipeline. + """Run a workflow, either from a compiled kubeflow spec or from a fondant dataset. Args: dataset: the dataset to compile or a path to an already compiled sagemaker spec @@ -201,7 +201,7 @@ def run( os.makedirs(".fondant", exist_ok=True) output_path = ".fondant/kubeflow-pipeline.yaml" logging.info( - "Found reference to un-compiled pipeline... compiling", + "Found reference to un-compiled workflow... compiling", ) compiler = KubeFlowCompiler() compiler.compile( @@ -237,8 +237,8 @@ def _run( pipeline_package_path=input_spec, ) - pipeline_url = f"{self.host}/#/runs/details/{runner.run_id}" - logger.info(f"Pipeline is running at: {pipeline_url}") + workflow_url = f"{self.host}/#/runs/details/{runner.run_id}" + logger.info(f"Pipeline is running at: {workflow_url}") def get_name_from_spec(self, input_spec: str): """Get the name of the pipeline from the spec.""" @@ -274,7 +274,7 @@ def run( dataset: t.Union[Dataset, str], working_directory: str, ): - """Run a pipeline, either from a compiled vertex spec or from a fondant pipeline. + """Run a workflow, either from a compiled vertex spec or from a fondant dataset. Args: dataset: the dataset to compile or a path to an already compiled sagemaker spec @@ -284,7 +284,7 @@ def run( os.makedirs(".fondant", exist_ok=True) output_path = ".fondant/vertex-pipeline.yaml" logging.info( - "Found reference to un-compiled pipeline... compiling", + "Found reference to un-compiled workflow... 
compiling", ) compiler = VertexCompiler() compiler.compile( @@ -344,7 +344,7 @@ def run( Args: dataset: the dataset to compile or a path to a already compiled sagemaker spec working_directory: path of the working directory - pipeline_name: the name of the pipeline to create + pipeline_name: the name of the workflow to create role_arn: the Amazon Resource Name role to use for the processing steps, if none provided the `sagemaker.get_execution_role()` role will be used. """ @@ -352,7 +352,7 @@ def run( os.makedirs(".fondant", exist_ok=True) output_path = ".fondant/sagemaker-pipeline.yaml" logging.info( - "Found reference to un-compiled pipeline... compiling", + "Found reference to un-compiled workflow... compiling", ) compiler = SagemakerCompiler() compiler.compile( diff --git a/src/fondant/explore.py b/src/fondant/explore.py index 0b6d87eae..2b8ca9119 100644 --- a/src/fondant/explore.py +++ b/src/fondant/explore.py @@ -123,8 +123,8 @@ def run_explorer_app( # type: ignore # noqa: PLR0913 tag: The tag/version of the Docker container. Default is "latest". extra_volumes: Extra volumes to mount in containers. You can use the --extra-volumes flag to specify extra volumes to mount in the containers this can be used: - - to mount data directories to be used by the pipeline (note that if your pipeline's - base_path is local it will already be mounted for you). + - to mount data directories to be used by the workflow (note that if your datasets + working directory is local it will already be mounted for you). - to mount cloud credentials auth_provider: The cloud provider to use for authentication. Default is None. """ diff --git a/src/fondant/testing.py b/src/fondant/testing.py index 9c7102a1c..2790b75de 100644 --- a/src/fondant/testing.py +++ b/src/fondant/testing.py @@ -80,45 +80,45 @@ class DockerComponentConfig(ComponentConfigs): @dataclass -class PipelineConfigs: +class DatasetConfigs: """ - Represents the configurations for a pipeline. + Represents the configurations for a dataset workflow. Args: - pipeline_name: Name of the pipeline. - pipeline_description: Description of the pipeline. + dataset_name: Name of the dataset. + dataset_description: Description of the dataset. """ - pipeline_name: str - pipeline_version: str - pipeline_description: t.Optional[str] = None + dataset_name: str + dataset_version: str + dataset_description: t.Optional[str] = None @classmethod @abstractmethod - def from_spec(cls, spec_path: str) -> "PipelineConfigs": - """Get pipeline configs from a pipeline specification.""" + def from_spec(cls, spec_path: str) -> "DatasetConfigs": + """Get dataset configs from a dataset workflow specification.""" @dataclass -class DockerComposeConfigs(PipelineConfigs): +class DockerComposeConfigs(DatasetConfigs): """ - Represents Docker-specific configurations for a pipeline. + Represents Docker-specific configurations for a dataset. Args: - component_configs: Dictionary of Docker component configurations for the pipeline. + component_configs: Dictionary of Docker component configurations for the dataset. 
""" component_configs: t.Optional[t.Dict[str, DockerComponentConfig]] = None @classmethod def from_spec(cls, spec_path: str) -> "DockerComposeConfigs": - """Get pipeline configs from a pipeline specification.""" + """Get dataset configs from a dataset workflow specification.""" with open(spec_path) as file_: specification = yaml.safe_load(file_) components_configs_dict = {} - pipeline_description = None + dataset_description = None # Iterate through each service for component_name, component_configs in specification["services"].items(): # Get arguments from command @@ -154,33 +154,33 @@ def from_spec(cls, spec_path: str) -> "DockerComposeConfigs": memory_limit=None, ) components_configs_dict[component_name] = component_config - pipeline_description = component_configs.get("labels", {}).get( - "pipeline_description", + dataset_description = component_configs.get("labels", {}).get( + "dataset_description", "No description provided", ) return cls( - pipeline_name=specification["name"], - pipeline_version=specification["version"], - pipeline_description=pipeline_description, + dataset_name=specification["name"], + dataset_version=specification["version"], + dataset_description=dataset_description, component_configs=components_configs_dict, ) @dataclass -class KubeflowPipelineConfigs(PipelineConfigs): +class KubeflowPipelineConfigs(DatasetConfigs): """ - Represents Kubeflow-specific configurations for a pipeline. + Represents Kubeflow-specific configurations for a dataset. Args: - component_configs: Dictionary of Kubeflow component configurations for the pipeline. + component_configs: Dictionary of Kubeflow component configurations for the dataset. """ component_configs: t.Optional[t.Dict[str, KubeflowComponentConfig]] = None @classmethod def from_spec(cls, spec_path: str) -> "KubeflowPipelineConfigs": - """Get pipeline configs from a pipeline specification.""" + """Get dataset configs from a dataset specification.""" # Two specs are present and loaded in the yaml file (component spec and k8s specs) k8_specification = {} specification = {} @@ -195,7 +195,7 @@ def from_spec(cls, spec_path: str) -> "KubeflowPipelineConfigs": ]["executors"] if not specification: - msg = "No component specification found in the pipeline specification" + msg = "No component specification found in the dataset specification" raise InvalidDatasetDefinition(msg) components_configs_dict = {} @@ -266,9 +266,9 @@ def from_spec(cls, spec_path: str) -> "KubeflowPipelineConfigs": pipeline_info = specification["pipelineInfo"] return cls( - pipeline_name=pipeline_info["name"], - pipeline_version=specification["sdkVersion"], - pipeline_description=pipeline_info.get("description", None), + dataset_name=pipeline_info["name"], + dataset_version=specification["sdkVersion"], + dataset_description=pipeline_info.get("description", None), component_configs=components_configs_dict, ) diff --git a/tests/pipeline/test_compiler.py b/tests/pipeline/test_compiler.py index f421a4721..2052ae7e0 100644 --- a/tests/pipeline/test_compiler.py +++ b/tests/pipeline/test_compiler.py @@ -179,7 +179,7 @@ def test_docker_compiler(setup_pipeline, tmp_path_factory): build_args=[], ) pipeline_configs = DockerComposeConfigs.from_spec(output_path) - assert pipeline_configs.pipeline_name == dataset.name + assert pipeline_configs.dataset_name == dataset.name for ( component_name, component_configs, @@ -488,7 +488,7 @@ def test_kubeflow_compiler(setup_pipeline, tmp_path_factory): output_path=output_path, ) pipeline_configs = 
KubeflowPipelineConfigs.from_spec(output_path) - assert pipeline_configs.pipeline_name == dataset.name + assert pipeline_configs.dataset_name == dataset.name for ( component_name, component_configs, @@ -596,7 +596,7 @@ def test_vertex_compiler(setup_pipeline, tmp_path_factory): output_path=output_path, ) pipeline_configs = VertexPipelineConfigs.from_spec(output_path) - assert pipeline_configs.pipeline_name == dataset.name + assert pipeline_configs.dataset_name == dataset.name for ( component_name, component_configs, diff --git a/tests/test_cli.py b/tests/test_cli.py index 7a23e9e26..bb784b40d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -201,7 +201,7 @@ def test_local_compile(tmp_path_factory): compile_local(args) mock_compiler.assert_called_once_with( - pipeline=TEST_DATASET, + dataset=TEST_DATASET, extra_volumes=[], output_path=str(fn / "docker-compose.yml"), build_args=[], @@ -222,7 +222,7 @@ def test_kfp_compile(tmp_path_factory): ) compile_kfp(args) mock_compiler.assert_called_once_with( - pipeline=TEST_DATASET, + dataset=TEST_DATASET, output_path=str(fn / "kubeflow_pipeline.yml"), ) @@ -240,7 +240,7 @@ def test_vertex_compile(tmp_path_factory): ) compile_vertex(args) mock_compiler.assert_called_once_with( - pipeline=TEST_DATASET, + dataset=TEST_DATASET, output_path=str(fn / "vertex_pipeline.yml"), ) @@ -260,7 +260,7 @@ def test_sagemaker_compile(tmp_path_factory): ) compile_sagemaker(args) mock_compiler.assert_called_once_with( - pipeline=TEST_DATASET, + dataset=TEST_DATASET, output_path=str(fn / "sagemaker_pipeline.json"), role_arn="some_role", )
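
---

For readers reviewing this change, below is a minimal sketch of the dataset-centric usage that the updated component READMEs now describe, assembled from the snippets touched in this patch (load, apply, write). The component names and argument values are illustrative placeholders, and the `Dataset.create` / `Dataset.read` calls may accept additional parameters not shown in the README templates.

```python
from fondant.dataset import Dataset

# Loading component: creates the initial dataset
# (usage pattern from the updated load_from_csv README).
dataset = Dataset.create(
    "load_from_csv",
    arguments={
        # component arguments, e.g. "dataset_uri": "path/to/dataset.csv"
    },
)

# Transform component: applied to an existing dataset
# (usage pattern from the updated chunk_text README).
dataset = dataset.apply(
    "chunk_text",
    arguments={
        # component arguments
    },
)

# Write component: materializes the dataset to an external store
# (usage pattern from the updated index_weaviate README; URL and class
# name are placeholder values).
dataset.write(
    "index_weaviate",
    arguments={
        "weaviate_url": "http://host.docker.internal:8080",
        "class_name": "my_class",
    },
)
```

The materialization workflow is then compiled and run with the renamed CLI entrypoints shown above, e.g. `fondant run local dataset.py`.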