Add resource requirements to the retrieve from faiss component #905

Merged
4 changes: 2 additions & 2 deletions src/fondant/components/caption_images/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt
# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
RUN pip3 install fondant[aws,azure,gcp,gpu]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

# Set the working directory to the component folder
WORKDIR /component
@@ -25,4 +25,4 @@ RUN python -m pytest tests

FROM base
WORKDIR /component/src
ENTRYPOINT ["fondant", "execute", "main"]
ENTRYPOINT ["fondant", "execute", "main"]
13 changes: 13 additions & 0 deletions src/fondant/components/caption_images/src/main.py
@@ -4,9 +4,12 @@
import os
import typing as t

import dask
import numpy as np
import pandas as pd
import torch
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from fondant.component import PandasTransformComponent
from PIL import Image
from transformers import BatchEncoding, BlipForConditionalGeneration, BlipProcessor
@@ -90,6 +93,16 @@ def __init__(
self.batch_size = batch_size
self.max_new_tokens = max_new_tokens

def setup(self) -> Client:
"""Setup LocalCudaCluster if gpu is available."""
dask.config.set({"dataframe.convert-string": False})
dask.config.set({"distributed.worker.daemon": False})

if self.device == "cuda":
cluster = LocalCUDACluster()
return Client(cluster)
return super().setup()

def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
images = dataframe["image"]

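The caption_images and segment_images components gain the same GPU-aware setup: when the component runs on CUDA it starts a dask_cuda LocalCUDACluster so each visible GPU gets its own worker, otherwise it keeps the default setup. Below is a minimal standalone sketch of that pattern; the make_client helper and the fallback to a plain local Client are illustrative assumptions, not the component's exact behaviour.

import dask
import torch
from dask.distributed import Client


def make_client() -> Client:
    # Same Dask settings as the components: keep object columns as-is instead
    # of converting them to Arrow strings, and run workers as non-daemonic
    # processes so they may spawn subprocesses themselves.
    dask.config.set({"dataframe.convert-string": False})
    dask.config.set({"distributed.worker.daemon": False})

    if torch.cuda.is_available():
        # One worker per visible GPU.
        from dask_cuda import LocalCUDACluster

        return Client(LocalCUDACluster())

    # CPU fallback: a default local cluster.
    return Client()


if __name__ == "__main__":
    client = make_client()
    print(client)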
21 changes: 19 additions & 2 deletions src/fondant/components/retrieve_from_faiss_by_prompt/src/main.py
@@ -2,12 +2,13 @@
import os
import typing as t

import dask
import dask.dataframe as dd
import faiss
import fsspec
import pandas as pd
import torch
from dask.distributed import Client, get_worker
from dask.distributed import Client, LocalCluster, get_worker
from dask_cuda import LocalCUDACluster
from fondant.component import PandasTransformComponent
from transformers import AutoTokenizer, CLIPTextModelWithProjection
@@ -47,11 +48,27 @@ def __init__( # PLR0913

def setup(self) -> Client:
"""Setup LocalCudaCluster if gpu is available."""
dask.config.set({"dataframe.convert-string": False})
dask.config.set({"distributed.worker.daemon": False})

if self.device == "cuda":
cluster = LocalCUDACluster()
return Client(cluster)

return super().setup()
total_memory = (os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")) / (
1024**3
)
# We need at least 8 GiB of RAM for the DataComp small FAISS index.
# We should consider calculating the memory required for the index based on
# the FAISS index size.
cores_to_utilise = int(total_memory // 8)
cluster = LocalCluster(
processes=True,
n_workers=cores_to_utilise,
threads_per_worker=1,
memory_limit="8 GiB",
)
return Client(cluster)

def embed_prompt(self, prompt: str):
"""Embed prompt using CLIP model."""
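On the CPU path, the retrieval component sizes its cluster from host memory rather than core count: the comment above pegs the DataComp small FAISS index at roughly 8 GiB of RAM, so the number of workers is total RAM divided by 8 and each worker is capped at 8 GiB, e.g. a 32 GiB host yields 4 single-threaded workers. A standalone sketch of the same arithmetic follows; it assumes a Linux host (os.sysconf), and the GIB_PER_WORKER constant and make_cpu_client helper are illustrative names rather than the component's own.

import os

from dask.distributed import Client, LocalCluster

GIB_PER_WORKER = 8  # rough footprint of the DataComp small FAISS index


def make_cpu_client() -> Client:
    # Total physical RAM in GiB (Linux: page size * number of physical pages).
    total_gib = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES") / 1024**3

    # One single-threaded worker per 8 GiB of RAM, e.g. 32 GiB -> 4 workers.
    n_workers = max(1, int(total_gib // GIB_PER_WORKER))

    cluster = LocalCluster(
        processes=True,
        n_workers=n_workers,
        threads_per_worker=1,
        memory_limit=f"{GIB_PER_WORKER} GiB",
    )
    return Client(cluster)


if __name__ == "__main__":
    client = make_cpu_client()
    print(client)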
2 changes: 1 addition & 1 deletion src/fondant/components/segment_images/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt
# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
RUN pip3 install fondant[aws,azure,gcp,gpu]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

# Set the working directory to the component folder
WORKDIR /component/src
13 changes: 13 additions & 0 deletions src/fondant/components/segment_images/src/main.py
@@ -4,9 +4,12 @@
import os
import typing as t

import dask
import numpy as np
import pandas as pd
import torch
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from fondant.component import PandasTransformComponent
from palette import palette
from PIL import Image
@@ -127,6 +130,16 @@ def __init__(

self.batch_size = batch_size

def setup(self) -> Client:
"""Setup LocalCudaCluster if gpu is available."""
dask.config.set({"dataframe.convert-string": False})
dask.config.set({"distributed.worker.daemon": False})

if self.device == "cuda":
cluster = LocalCUDACluster()
return Client(cluster)
return super().setup()

def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
images = dataframe["image"]
