From 4248d21194892e80dc34d1f0456cf14bfd42cd07 Mon Sep 17 00:00:00 2001 From: Julio Date: Thu, 29 Jun 2023 15:35:30 -0400 Subject: [PATCH 1/2] fix faiss setup to take dataframe as input --- merlin/systems/dag/ops/faiss.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/merlin/systems/dag/ops/faiss.py b/merlin/systems/dag/ops/faiss.py index 2cb138be2..dce656c36 100644 --- a/merlin/systems/dag/ops/faiss.py +++ b/merlin/systems/dag/ops/faiss.py @@ -21,7 +21,7 @@ import numpy as np from merlin.core.dispatch import HAS_GPU -from merlin.core.protocols import Transformable +from merlin.core.protocols import DataFrameLike, Transformable from merlin.dag import ColumnSelector from merlin.schema import ColumnSchema, Schema from merlin.systems.dag.ops.operator import InferenceOperator @@ -189,7 +189,13 @@ def validate_schemas( ) -def setup_faiss(item_vector, output_path: str, metric=faiss.METRIC_INNER_PRODUCT): +def setup_faiss( + item_vector: DataFrameLike, + output_path: str, + metric=faiss.METRIC_INNER_PRODUCT, + item_id_column="item_id", + embedding_column="embedding", +): """ Utiltiy function that will create a Faiss index from a set of embedding vectors @@ -200,8 +206,10 @@ def setup_faiss(item_vector, output_path: str, metric=faiss.METRIC_INNER_PRODUCT output_path : string target output path """ - ids = item_vector[:, 0].astype(np.int64) - item_vectors = np.ascontiguousarray(item_vector[:, 1:].astype(np.float32)) + ids = item_vector[item_id_column].to_numpy().astype(np.int64) + item_vectors = np.ascontiguousarray( + np.stack(item_vector[embedding_column].to_numpy()).astype(np.float32) + ) index = faiss.index_factory(item_vectors.shape[1], "IVF32,Flat", metric) index.nprobe = 8 From 0762f75f8880a8f1a394fe19d4c65f61ee68a386 Mon Sep 17 00:00:00 2001 From: Julio Date: Fri, 30 Jun 2023 13:30:50 -0400 Subject: [PATCH 2/2] use make df in faiss test --- tests/unit/systems/ops/faiss/test_executor.py | 9 ++++++--- 1 file changed, 6 
insertions(+), 3 deletions(-) diff --git a/tests/unit/systems/ops/faiss/test_executor.py b/tests/unit/systems/ops/faiss/test_executor.py index 3fff19019..93f1358a9 100644 --- a/tests/unit/systems/ops/faiss/test_executor.py +++ b/tests/unit/systems/ops/faiss/test_executor.py @@ -18,6 +18,7 @@ import numpy as np import pytest +from merlin.core.dispatch import make_df from merlin.schema import ColumnSchema, Schema from merlin.systems.dag.ensemble import Ensemble from merlin.systems.dag.ops.faiss import QueryFaiss, setup_faiss @@ -57,9 +58,11 @@ def test_faiss_in_triton_executor_model(tmpdir): ) faiss_path = tmpdir / "faiss.index" - item_ids = np.arange(0, 100).reshape(-1, 1) - item_embeddings = np.ascontiguousarray(np.random.rand(100, 128)) - setup_faiss(np.concatenate((item_ids, item_embeddings), axis=1), faiss_path) + item_ids = np.arange(0, 100) + item_embeddings = np.random.rand(100, 128) + # A cudf list column cannot be converted directly to numpy, so build the frame on CPU (pandas) as a bridge + df = make_df({"item_id": item_ids, "embedding": item_embeddings.tolist()}, device="cpu") + setup_faiss(df, faiss_path) request_schema = Schema( [