Skip to content

Commit

Permalink
Use PandasTransformComponent instead of DaskTransformComponent
Browse files Browse the repository at this point in the history
  • Loading branch information
mrchtr committed Jun 26, 2023
1 parent 65d4e28 commit 0f9159d
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 30 deletions.
2 changes: 1 addition & 1 deletion components/language_filter/fondant_component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ consumes:
args:
language:
description: A valid language code or identifier (e.g., "en", "fr", "de").
type: string
type: str
41 changes: 22 additions & 19 deletions components/language_filter/src/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""A component that filter a provided dataframe based on the language"""
"""A component that filter a provided dataframe based on the language."""
import logging
import dask.dataframe as dd
from fondant.component import DaskTransformComponent
from fondant.logger import configure_logging

import fasttext
import pandas as pd

from fondant.component import PandasTransformComponent
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)
Expand All @@ -14,11 +16,11 @@ class LanguageIdentification:

def __init__(self, model_path: str = "lid.176.ftz"):
    """Load the FastText language identification model.

    Args:
        model_path (str): The path to the FastText language identification model.
    """
    # The loaded model is kept on the instance so that later calls can reuse it.
    self.model = fasttext.load_model(model_path)

Expand All @@ -36,31 +38,32 @@ def predict_lang(self, text: str):
return predictions[0][0]

def is_language(self, row, language):
    """Predict if text of a row is written in the defined language.

    Args:
        row: Mapping-like row (for example a pandas Series) that has a
            "text" entry holding the passage to classify.
        language: Language code such as "en" or "de". A fastText-style
            identifier such as "__label__en" is accepted as well.

    Returns:
        True if the predicted language equals the requested one,
        False otherwise.
    """
    # NOTE(review): predict_lang is assumed to return a fastText label of the
    # form "__label__<code>" -- confirm against its implementation.
    label_prefix = "__label__"

    def strip_prefix(value):
        if value.startswith(label_prefix):
            return value[len(label_prefix):]
        return value

    # Compare the codes for equality instead of testing for a substring: the
    # label prefix itself contains "la", "el", "be" and "ab", so a substring
    # test would accept every text when one of those codes is requested.
    predicted = self.predict_lang(row["text"])
    return strip_prefix(predicted) == strip_prefix(language)


class LanguageFilterComponent(DaskTransformComponent):
"""Component that filter columns based on provided language"""
class LanguageFilterComponent(PandasTransformComponent):
"""Component that filter columns based on provided language."""

def setup(self, *args, **kwargs):
    """Create the language detector and store it on the component.

    Positional and keyword arguments are accepted but not used here.
    """
    detector = LanguageIdentification()
    self.lang_detector = detector

def transform(
self,
*,
dataframe: dd.DataFrame,
language: str,
) -> dd.DataFrame:
dataframe: pd.DataFrame
) -> pd.DataFrame:
"""
Args:
dataframe: Pandas dataframe.
language: Only keep text passages which are in the provided language
language: Only keep text passages which are in the provided language.
Returns:
Pandas dataframe
"""

lang_detector = LanguageIdentification()
mask = dataframe.map_partitions(
lambda df: df.apply(lambda row: lang_detector.is_language(row, language), axis=1),
meta=bool)
language = self.user_arguments["language"]
mask = dataframe.apply(
lambda row: self.lang_detector.is_language(row, language), axis=1)

return dataframe[mask]

Expand Down
Empty file.
42 changes: 32 additions & 10 deletions components/language_filter/tests/language_filter_component_test.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
"""Unit test for language filter component"""
import pandas as pd
from components.language_filter.src.main import LanguageFilterComponent
from fondant.component_spec import ComponentSpec
from dask.dataframe import from_pandas


def test_run_component_test():
"""Test language filter component"""

"""Test language filter component."""
# Given: Dataframe with text in different languages
data = [{"text": "Das hier ist ein Satz in deutscher Sprache"}, {"text": "This is a sentence in English"},
data = [{"text": "Das hier ist ein Satz in deutscher Sprache"},
{"text": "This is a sentence in English"},
{"text": "Dit is een zin in het Nederlands"}]
df = pd.DataFrame(data)
ddf = from_pandas(df, npartitions=1)
dataframe = pd.DataFrame(data)

# When: The language filter component processes the dataframe
# and filters out all entries which are not written in German
Expand All @@ -23,9 +22,32 @@ def test_run_component_test():
user_arguments={"language": "de"}
)

ddf = component.transform(dataframe=ddf, **component.user_arguments)
dataframe = component.transform(dataframe=dataframe)

# Then: dataframe only contains one German row
df = ddf.compute()
assert len(df) == 1
assert df.loc[0]["text"] == "Das hier ist ein Satz in deutscher Sprache"
dataframe = dataframe.compute()
assert len(dataframe) == 1
assert dataframe.loc[0]["text"] == "Das hier ist ein Satz in deutscher Sprache"

def test_run_component_test_filter_out_all():
    """Filtering on a language that no row is written in leaves no rows."""
    # Given: one German, one English and one Dutch sentence
    sentences = [
        "Das hier ist ein Satz in deutscher Sprache",
        "This is a sentence in English",
        "Dit is een zin in het Nederlands",
    ]
    dataframe = pd.DataFrame([{"text": sentence} for sentence in sentences])

    # When: the language filter component processes the dataframe
    # and keeps only entries written in French
    spec = ComponentSpec.from_file("../fondant_component.yaml")
    component = LanguageFilterComponent(
        spec,
        input_manifest_path="./dummy_input_manifest.json",
        output_manifest_path="./dummy_input_manifest.json",
        metadata={},
        user_arguments={"language": "fr"},
    )
    filtered = component.transform(dataframe=dataframe)

    # Then: the dataframe should contain no rows anymore
    assert len(filtered) == 0

0 comments on commit 0f9159d

Please sign in to comment.