Add defaults to components (#289)

PR that adds default arguments to the component YAML files. This makes it easier to get started with reusable components, since default values are provided.
ml6team · Jul 12, 2023 · 86867a5 · 86867a5
1 parent 711f317
commit 86867a5
Show file tree

Hide file tree

Showing 13 changed files with 43 additions and 21 deletions.
diff --git a/components/caption_images/fondant_component.yaml b/components/caption_images/fondant_component.yaml
@@ -18,9 +18,12 @@ args:
   model_id:
     description: id of the model on the Hugging Face hub
     type: str
+    default: "Salesforce/blip-image-captioning-base"
   batch_size:
     description: batch size to use
     type: int
+    default: 8
   max_new_tokens:
     description: maximum token length of each caption
-    type: int
+    type: int
+    default: 50
diff --git a/components/embedding_based_laion_retrieval/fondant_component.yaml b/components/embedding_based_laion_retrieval/fondant_component.yaml
@@ -24,6 +24,8 @@ args:
   aesthetic_score:
     description: Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier).
     type: int
+    default: 9
   aesthetic_weight:
     description: Weight of the aesthetic embedding when added to the query, between 0 and 1
-    type: float
+    type: float
+    default: 0.5
diff --git a/components/filter_comments/fondant_component.yaml b/components/filter_comments/fondant_component.yaml
@@ -13,6 +13,8 @@ args:
   min_comments_ratio:
     description: The minimum code to comment ratio
     type: float
+    default: 0.1
   max_comments_ratio:
     description: The maximum code to comment ratio
-    type: float
+    type: float
+    default: 0.9
diff --git a/components/image_cropping/fondant_component.yaml b/components/image_cropping/fondant_component.yaml
@@ -22,6 +22,8 @@ args:
   cropping_threshold:
     description: Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30
     type: int
+    default: -30
   padding:
     description: Padding for the image cropping. The padding is added to all borders of the image.
     type: int
+    default: 10
diff --git a/components/image_cropping/src/main.py b/components/image_cropping/src/main.py
@@ -34,8 +34,8 @@ def transform(
         self,
         *,
         dataframe: dd.DataFrame,
-        cropping_threshold: int = -30,
-        padding: int = 10,
+        cropping_threshold: int,
+        padding: int,
     ) -> dd.DataFrame:
         """
         Args:

diff --git a/components/image_embedding/fondant_component.yaml b/components/image_embedding/fondant_component.yaml
@@ -20,6 +20,8 @@ args:
   model_id:
     description: Model id on the Hugging Face hub  (e.g. "openai/clip-vit-large-patch14")
     type: str
+    default: "openai/clip-vit-large-patch14"
   batch_size:
     description: Batch size to use when embedding
-    type: int
+    type: int
+    default: 8
diff --git a/components/language_filter/fondant_component.yaml b/components/language_filter/fondant_component.yaml
@@ -11,4 +11,5 @@ consumes:
 args:
   language:
     description: A valid language code or identifier (e.g., "en", "fr", "de").
-    type: str
+    type: str
+    default: "en"
diff --git a/components/language_filter/src/main.py b/components/language_filter/src/main.py
@@ -7,21 +7,22 @@
 
 logger = logging.getLogger(__name__)
 
+MODEL_PATH = "lid.176.ftz"
+
 
 class LanguageIdentification:
     """A class for language detection using FastText."""
 
-    def __init__(self, language, model_path: str = "lid.176.ftz"):
+    def __init__(self,
+                 language: str):
         """
         Initializes the LanguageDetect class.
 
         Args:
            language (str): language to filter on
-           model_path (str): The path to the FastText language identification model.
         """
-        pretrained_lang_model_weight_path = model_path
         self.language = language
-        self.model = fasttext.load_model(pretrained_lang_model_weight_path)
+        self.model = fasttext.load_model(MODEL_PATH)
 
     def predict_lang(self, text: str):
         """
@@ -52,7 +53,6 @@ def setup(self, *, language):
         """
         self.lang_detector = LanguageIdentification(language)
 
-
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         """
         Args:

diff --git a/components/prompt_based_laion_retrieval/fondant_component.yaml b/components/prompt_based_laion_retrieval/fondant_component.yaml
@@ -22,10 +22,13 @@ args:
   aesthetic_score:
     description: Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier).
     type: int
+    default: 9
   aesthetic_weight:
     description: Weight of the aesthetic embedding when added to the query, between 0 and 1
     type: float
+    default: 0.5
   url:
     description: The url of the backend clip retrieval service, defaults to the public service
     type: str
-    default: https://knn.laion.ai/knn-service
+    default: https://knn.laion.ai/knn-service
+
diff --git a/components/segment_images/fondant_component.yaml b/components/segment_images/fondant_component.yaml
@@ -18,6 +18,8 @@ args:
   model_id:
     description: id of the model on the Hugging Face hub
     type: str
+    default: "openmmlab/upernet-convnext-small"
   batch_size:
     description: batch size to use
-    type: int
+    type: int
+    default: 8
diff --git a/components/segment_images/src/main.py b/components/segment_images/src/main.py
@@ -40,7 +40,10 @@ def convert_to_rgb(seg: np.array) -> bytes:
     return crop_bytes.getvalue()
 
 
-def process_image(image: bytes, *, processor: SegformerImageProcessor, device: str) -> torch.Tensor:
+def process_image(image: bytes,
+                  *,
+                  processor: SegformerImageProcessor,
+                  device: str) -> torch.Tensor:
     """
     Process the image to a tensor.
 
@@ -65,7 +68,9 @@ def transform(img: Image) -> BatchFeature:
 
 
 @torch.no_grad()
-def segment_image_batch(image_batch: pd.DataFrame, *, model: AutoModelForSemanticSegmentation,
+def segment_image_batch(image_batch: pd.DataFrame,
+                        *,
+                        model: AutoModelForSemanticSegmentation,
                         processor: SegformerImageProcessor) -> pd.Series:
     """Embed a batch of images."""
     input_batch = torch.cat(image_batch.tolist())

diff --git a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml
@@ -63,5 +63,4 @@ services:
       second_component:
         condition: service_completed_successfully
     volumes: []
-version: '3.8'
-
+version: '3.8'
diff --git a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml
@@ -34,13 +34,14 @@ services:
       "height": {"type": "int16"}}}}, "args": {"cropping_threshold": {"description":
       "Threshold parameter used for detecting borders. A lower (negative) parameter
       results in a more performant border detection, but can cause overcropping. Default
-      is -30", "type": "int"}, "padding": {"description": "Padding for the image cropping.
-      The padding is added to all borders of the image.", "type": "int"}}}'
+      is -30", "type": "int", "default": -30}, "padding": {"description": "Padding
+      for the image cropping. The padding is added to all borders of the image.",
+      "type": "int", "default": 10}}}'
     - --input_manifest_path
     - /foo/bar/first_component/manifest.json
     depends_on:
       first_component:
         condition: service_completed_successfully
     image: ghcr.io/ml6team/image_cropping:dev
     volumes: []
-version: '3.8'
+version: '3.8'