-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into large-scale-controlnet
- Loading branch information
Showing
47 changed files
with
1,129 additions
and
311 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
name: Build Python 🐍 distributions 📦 and publish to TestPyPI | ||
on: | ||
push: | ||
tags: | ||
- '[0-9]+.[0-9]+.[0-9]+*' | ||
jobs: | ||
build-n-publish: | ||
name: Build Python 🐍 distributions 📦 and publish to TestPyPI | ||
runs-on: ubuntu-latest | ||
permissions: | ||
id-token: write | ||
packages: write | ||
steps: | ||
- uses: actions/checkout@master | ||
|
||
- name: Set up Python 3.9 | ||
uses: actions/setup-python@v1 | ||
with: | ||
python-version: 3.9 | ||
|
||
- name: Set up Docker Buildx | ||
uses: docker/setup-buildx-action@v2 | ||
|
||
- name: Set buildx alias | ||
run: docker buildx install | ||
|
||
- name: Login to GitHub Container Registry | ||
uses: docker/login-action@v2 | ||
with: | ||
registry: ghcr.io | ||
username: ${{ github.actor }} | ||
password: ${{ secrets.GITHUB_TOKEN }} | ||
|
||
- name: Build components | ||
run: ./scripts/build_components.sh -t $GITHUB_REF_NAME | ||
|
||
- name: Build data explorer | ||
run: ./scripts/build_explorer.sh -t $GITHUB_REF_NAME | ||
|
||
- name: Update version in pyproject.toml with tag version | ||
run: sed -i "s/^version = .*/version = '${{github.ref_name}}'/" pyproject.toml | ||
|
||
- name: Build a binary wheel and a source tarball | ||
run: | | ||
pip install poetry | ||
./scripts/pre-build.sh | ||
poetry build | ||
- name: Publish distribution 📦 to Test PyPI | ||
uses: pypa/gh-action-pypi-publish@v1.8.6 | ||
with: | ||
name: testpypi | ||
repository_url: https://test.pypi.org/legacy/ | ||
url: https://test.pypi.org/p/fondant |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
FROM --platform=linux/amd64 python:3.8-slim | ||
|
||
## System dependencies | ||
RUN apt-get update && \ | ||
apt-get upgrade -y && \ | ||
apt-get install git -y | ||
|
||
# install requirements | ||
COPY requirements.txt / | ||
RUN pip3 install --no-cache-dir -r requirements.txt | ||
|
||
# Set the working directory to the component folder | ||
WORKDIR /component/src | ||
|
||
# Copy over src-files | ||
COPY src/ . | ||
|
||
ENTRYPOINT ["python", "main.py"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Language filter | ||
|
||
## Description | ||
This component is based on the `TransformComponent` and is used to filter a dataframe based on language. | ||
It allows you to remove rows that do not match the provided language, thus providing a way to focus | ||
on specific languages within your data. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
name: Filter languages | ||
description: A component that filters text based on the language. | ||
image: ghcr.io/ml6team/filter_language:latest | ||
|
||
consumes: | ||
text: | ||
fields: | ||
data: | ||
type: string | ||
|
||
args: | ||
language: | ||
description: A valid language code or identifier (e.g., "en", "fr", "de"). | ||
type: str |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
git+https://github.com/ml6team/fondant@main | ||
pyarrow>=7.0 | ||
gcsfs==2023.4.00 | ||
fasttext-wheel==0.9.2 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
"""A component that filters text based on the language.""" | ||
import logging | ||
|
||
import fasttext | ||
import pandas as pd | ||
|
||
from fondant.component import PandasTransformComponent | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class LanguageIdentification: | ||
"""A class for language detection using FastText.""" | ||
|
||
def __init__(self, language, model_path: str = "lid.176.ftz"): | ||
""" | ||
Initializes the LanguageDetect class. | ||
Args: | ||
language (str): language to filter on | ||
model_path (str): The path to the FastText language identification model. | ||
""" | ||
pretrained_lang_model_weight_path = model_path | ||
self.language = language | ||
self.model = fasttext.load_model(pretrained_lang_model_weight_path) | ||
|
||
def predict_lang(self, text: str): | ||
""" | ||
Detects the language of a text sequence. | ||
Args: | ||
text (str): The text for language detection. | ||
Returns: | ||
str: The predicted language label. | ||
""" | ||
predictions = self.model.predict(text, k=1) | ||
return predictions[0][0] | ||
|
||
def is_language(self, row): | ||
"""Predict if text of a row is written in the defined language.""" | ||
return self.language in self.predict_lang(row["text"]) | ||
|
||
|
||
class LanguageFilterComponent(PandasTransformComponent): | ||
"""Component that filter columns based on provided language.""" | ||
|
||
def setup(self, *, language): | ||
"""Setup language filter component. | ||
Args: | ||
language: Only keep text passages which are in the provided language. | ||
""" | ||
self.lang_detector = LanguageIdentification(language) | ||
|
||
|
||
def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: | ||
""" | ||
Args: | ||
dataframe: Pandas dataframe. | ||
Returns: | ||
Pandas dataframe | ||
""" | ||
mask = dataframe.apply(self.lang_detector.is_language, axis=1) | ||
|
||
return dataframe[mask] | ||
|
||
|
||
if __name__ == "__main__": | ||
component = LanguageFilterComponent.from_args() | ||
component.run() |
54 changes: 54 additions & 0 deletions
54
components/language_filter/tests/language_filter_component_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
"""Unit test for language filter component.""" | ||
import pandas as pd | ||
|
||
from components.language_filter.src.main import LanguageFilterComponent | ||
from fondant.component_spec import ComponentSpec | ||
|
||
|
||
def test_run_component_test(): | ||
"""Test language filter component.""" | ||
# Given: Dataframe with text in different languages | ||
data = [{"text": "Das hier ist ein Satz in deutscher Sprache"}, | ||
{"text": "This is a sentence in English"}, | ||
{"text": "Dit is een zin in het Nederlands"}] | ||
dataframe = pd.DataFrame(data) | ||
|
||
# When: The language filter component proceed the dataframe | ||
# and filter out all entries which are not written in german | ||
spec = ComponentSpec.from_file("../fondant_component.yaml") | ||
|
||
component = LanguageFilterComponent(spec, input_manifest_path="./dummy_input_manifest.json", | ||
output_manifest_path="./dummy_input_manifest.json", | ||
metadata={}, | ||
user_arguments={"language": "de"}, | ||
) | ||
component.setup(language="de") | ||
dataframe = component.transform(dataframe=dataframe) | ||
|
||
# Then: dataframe only contains one german row | ||
assert len(dataframe) == 1 | ||
assert dataframe.loc[0]["text"] == "Das hier ist ein Satz in deutscher Sprache" | ||
|
||
|
||
def test_run_component_test_filter_out_all(): | ||
"""Test language filter component.""" | ||
# Given: Dataframe with text in different languages | ||
data = [{"text": "Das hier ist ein Satz in deutscher Sprache"}, | ||
{"text": "This is a sentence in English"}, | ||
{"text": "Dit is een zin in het Nederlands"}] | ||
dataframe = pd.DataFrame(data) | ||
|
||
# When: The language filter component proceed the dataframe | ||
# and filter out all entries which are not written in french | ||
spec = ComponentSpec.from_file("../fondant_component.yaml") | ||
|
||
component = LanguageFilterComponent(spec, input_manifest_path="./dummy_input_manifest.json", | ||
output_manifest_path="./dummy_input_manifest.json", | ||
metadata={}, | ||
user_arguments={"language": "fr"}, | ||
) | ||
component.setup() | ||
dataframe = component.transform(dataframe=dataframe) | ||
|
||
# Then: dataframe should contain no rows anymore | ||
assert len(dataframe) == 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
FROM --platform=linux/amd64 python:3.8-slim | ||
|
||
## System dependencies | ||
RUN apt-get update && \ | ||
apt-get upgrade -y && \ | ||
apt-get install git -y | ||
|
||
# install requirements | ||
COPY requirements.txt / | ||
RUN pip3 install --no-cache-dir -r requirements.txt | ||
|
||
# Set the working directory to the component folder | ||
WORKDIR /component/src | ||
|
||
# Copy over src-files | ||
COPY src/ . | ||
|
||
ENTRYPOINT ["python", "main.py"] |
Oops, something went wrong.