This repository has been archived by the owner on Jul 16, 2024. It is now read-only.

Fix missing unique key when searching embeddings #226

Merged (5 commits) on Dec 7, 2023
Changes from 3 commits
77 changes: 49 additions & 28 deletions greenplumpython/experimental/embedding.py
@@ -3,7 +3,7 @@

import greenplumpython as gp
from greenplumpython.row import Row
from greenplumpython.type import TypeCast
from greenplumpython.type import TypeCast, _serialize_to_expr


@gp.create_function
@@ -39,7 +39,7 @@ class ObjectAddress(ctypes.Structure):


@gp.create_function
def _generate_embedding(content: str, model_name: str) -> gp.type_("vector"): # type: ignore reportUnknownParameterType
def create_embedding(content: str, model_name: str) -> gp.type_("vector"): # type: ignore reportUnknownParameterType
import sys

import sentence_transformers # type: ignore reportMissingImports
@@ -134,7 +134,7 @@ def create_index(
Callable[[gp.DataFrame], TypeCast],
# FIXME: Modifier must be adapted to all types of model.
# Can this be done with transformers.AutoConfig?
lambda t: gp.type_("vector", modifier=embedding_dimension)(_generate_embedding(t[column], model_name)), # type: ignore reportUnknownLambdaType
lambda t: gp.type_("vector", modifier=embedding_dimension)(create_embedding(t[column], model_name)), # type: ignore reportUnknownLambdaType
)
},
)[embedding_df_cols]
@@ -152,22 +152,36 @@
)
assert self._dataframe._db is not None
_record_dependency._create_in_db(self._dataframe._db)
query_col_names = _serialize_to_expr(
list(self._dataframe.unique_key) + [column], self._dataframe._db
)
sql_add_relationship = f"""
DO $$
BEGIN
SET LOCAL allow_system_table_mods TO ON;

WITH embedding_info AS (
SELECT '{embedding_df._qualified_table_name}'::regclass::oid AS embedding_relid, attnum, '{model_name}' AS model
FROM pg_attribute

WITH attnum_map AS (
SELECT attname, attnum FROM pg_attribute
WHERE
attrelid = '{self._dataframe._qualified_table_name}'::regclass::oid AND
attname = '{column}'
EXISTS (
SELECT FROM unnest({query_col_names}) AS query
WHERE attname = query
)
), embedding_info AS (
SELECT
'{embedding_df._qualified_table_name}'::regclass::oid AS embedding_relid,
attnum AS content_attnum,
{len(self._dataframe._unique_key) + 1} AS embedding_attnum,
'{model_name}' AS model,
ARRAY(SELECT attnum FROM attnum_map WHERE attname != '{column}') AS unique_key
FROM attnum_map
WHERE attname = '{column}'
)
UPDATE pg_class
SET reloptions = array_append(
reloptions,
format('_pygp_emb_%s=%s', attnum::text, to_json(embedding_info))
reloptions,
format('_pygp_emb_%s=%s', content_attnum::text, to_json(embedding_info))
)
FROM embedding_info
WHERE oid = '{self._dataframe._qualified_table_name}'::regclass::oid;
@@ -177,6 +191,7 @@ def create_index(
'{self._dataframe._qualified_table_name}'::regclass::oid,
'{embedding_df._qualified_table_name}'::regclass::oid
);

IF version() LIKE '%Greenplum%' THEN
PERFORM
{_record_dependency._qualified_name_str}(
@@ -210,9 +225,9 @@ def search(self, column: str, query: Any, top_k: int) -> gp.DataFrame:
embdedding_info = self._dataframe._db._execute(
f"""
WITH indexed_col_info AS (
SELECT attrelid, attnum
SELECT attrelid, attnum AS content_attnum
FROM pg_attribute
WHERE
WHERE
attrelid = '{self._dataframe._qualified_table_name}'::regclass::oid AND
attname = '{column}'
), reloptions AS (
@@ -222,39 +237,45 @@ def search(self, column: str, query: Any, top_k: int) -> gp.DataFrame:
), embedding_info_json AS (
SELECT split_part(option, '=', 2)::json AS val
FROM reloptions, indexed_col_info
WHERE option LIKE format('_pygp_emb_%s=%%', attnum)
WHERE option LIKE format('_pygp_emb_%s=%%', content_attnum)
), embedding_info AS (
SELECT *
FROM embedding_info_json, json_to_record(val) AS (attnum int4, embedding_relid oid, model text)
FROM embedding_info_json, json_to_record(val) AS (
embedding_attnum int4, embedding_relid oid, model text, unique_key int[]
)
), unique_key_names AS (
SELECT ARRAY(
SELECT attname FROM pg_attribute
WHERE attrelid = embedding_relid AND attnum = ANY(unique_key)
) AS val
FROM embedding_info
)
SELECT nspname, relname, attname, model
FROM embedding_info, pg_class, pg_namespace, pg_attribute
SELECT nspname, relname, attname, model, unique_key_names.val AS unique_key
FROM embedding_info, pg_class, pg_namespace, pg_attribute, unique_key_names
WHERE
pg_class.oid = embedding_relid AND
relnamespace = pg_namespace.oid AND
embedding_relid = attrelid AND
pg_attribute.attnum = 2;
embedding_attnum = attnum;
"""
)
row: Row = embdedding_info[0]
schema: str = row["nspname"]
embedding_table_name: str = row["relname"]
model = row["model"]
embedding_col_name = row["attname"]
row: Row = embdedding_info[0] # type: ignore reportUnknownVariableType
schema: str = row["nspname"] # type: ignore reportUnknownVariableType
embedding_table_name: str = row["relname"] # type: ignore reportUnknownVariableType
model = row["model"] # type: ignore reportUnknownVariableType
embedding_col_name = row["attname"] # type: ignore reportUnknownVariableType
embedding_df = self._dataframe._db.create_dataframe(
table_name=embedding_table_name, schema=schema
table_name=embedding_table_name, schema=schema # type: ignore reportUnknownArgumentType
)
unique_key: list[str] = row["unique_key"] # type: ignore reportUnknownVariableType
assert embedding_df is not None
assert self._dataframe.unique_key is not None
distance = gp.operator("<->") # L2 distance is the default operator class in pgvector
return self._dataframe.join(
embedding_df.assign(
distance=lambda t: distance(
embedding_df[embedding_col_name], _generate_embedding(query, model)
)
distance=lambda t: distance(t[embedding_col_name], create_embedding(query, model))
).order_by("distance")[:top_k],
how="inner",
on=self._dataframe.unique_key,
on=unique_key, # type: ignore reportUnknownArgumentType
self_columns={"*"},
other_columns={},
)
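For context, the substance of this change: create_index() now records the content column, the embedding column, the model name, and the attnums of the DataFrame's unique-key columns as a JSON object in pg_class.reloptions of the indexed table, and search() recovers the unique key from that metadata when joining. Below is a minimal usage sketch of the behavior this enables, reusing the table and column names from the tests that follow; it assumes an existing gp.Database connection named db (like the db fixture in the tests) and a pgvector-enabled server, and is an illustration rather than part of the diff.

import greenplumpython as gp
import greenplumpython.experimental.embedding as _  # imported for its side effect of enabling DataFrame.embedding()

# db: gp.Database, e.g. the `db` fixture used in the tests below

content = ["I have a dog.", "I like eating apples."]
t = (
    db.create_dataframe(columns={"id": range(len(content)), "content": content})
    .save_as(
        table_name="doc",
        temp=True,
        column_names=["id", "content"],
        distribution_key={"id"},
        distribution_type="hash",
        drop_if_exists=True,
        drop_cascade=True,
    )
    .check_unique(columns={"id"})
)
t.embedding().create_index(column="content", model_name="all-MiniLM-L6-v2")

# The unique key is now recovered from the stored metadata, so a DataFrame
# re-created from the table can be searched without calling check_unique() again.
reloaded = db.create_dataframe("doc", schema="pg_temp")
for row in reloaded.embedding().search(column="content", query="apple", top_k=1):
    print(row["content"])  # expected: "I like eating apples."
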
48 changes: 42 additions & 6 deletions tests/test_embedding.py
@@ -1,15 +1,24 @@
import pytest

import greenplumpython as gp
import greenplumpython.experimental.embedding as _
from tests import db


def search_embeddings(t: gp.DataFrame):
results = t.embedding().search(column="content", query="apple", top_k=1)
assert len(list(results)) == 1
for row in results:
assert row["content"] == "I like eating apples."
Contributor:

Since there is only one item here, is the for loop needed?

Contributor Author:

Thanks! Changed.



@pytest.mark.requires_pgvector
def test_embedding_query_string(db: gp.Database):
def test_embedding_query_text(db: gp.Database):
content = ["I have a dog.", "I like eating apples."]
t = (
db.create_dataframe(columns={"id": range(len(content)), "content": content})
.save_as(
table_name="doc",
temp=True,
column_names=["id", "content"],
distribution_key={"id"},
@@ -19,8 +28,35 @@ def test_embedding_query_string(db: gp.Database):
)
.check_unique(columns={"id"})
Collaborator:

  1. check_unique actually does more than *check*: it creates indexes, which is not obvious from the function name. Shall we consider renaming the function?
  2. We need tests with multiple columns for check_unique() and search(). Another PR will be fine since it is not relevant to this one.

Contributor Author:

The word "check" comes from SQL (https://www.postgresql.org/docs/current/ddl-constraints.html), as in

CREATE TABLE products (
    product_no integer,
    name text,
    price numeric CHECK (price > 0)
);

AFAIK, creating an index is the only way for the database to ensure that a set of columns contains only unique values.

Contributor Author:

I will add a test case for multi-column unique key.

Collaborator:

[5.4.1. Check Constraints](https://www.postgresql.org/docs/current/ddl-constraints.html#DDL-CONSTRAINTS-CHECK-CONSTRAINTS)
[5.4.2. Not-Null Constraints](https://www.postgresql.org/docs/current/ddl-constraints.html#DDL-CONSTRAINTS-NOT-NULL)
[5.4.3. Unique Constraints](https://www.postgresql.org/docs/current/ddl-constraints.html#DDL-CONSTRAINTS-UNIQUE-CONSTRAINTS)

5.4.1. Check Constraints

A check constraint is the most generic constraint type. It allows you to specify that the value in a certain column must satisfy a Boolean (truth-value) expression. For instance, to require positive product prices, you could use:

CREATE TABLE products (
    product_no integer,
    name text,
    price numeric CHECK (price > 0)
);

Doesn't this mean CHECK is one kind of constraint, and UNIQUE is another kind of constraint?

Is "check" in check_unique a verb? Or am I missing something?

Contributor Author:

I think "constraints" is the object of "check".

That is, "check" is used as in "check constraints". In this example, price > 0 is the constraint, and uniqueness is another type of constraint.

Therefore, I think it makes sense to call this function check_unique.

)
t = t.embedding().create_index(column="content", model_name="all-MiniLM-L6-v2")
df = t.embedding().search(column="content", query="apple", top_k=1)
assert len(list(df)) == 1
for row in df:
assert row["content"] == "I like eating apples."
t.embedding().create_index(column="content", model_name="all-MiniLM-L6-v2")
Contributor (@ruxuez, Dec 6, 2023):

We don't need to assign the result to t anymore?

Contributor Author:

Thanks, let me fix it.

search_embeddings(t)

# Ensure that a new DataFrame created from table in database can also be
# searched.
search_embeddings(db.create_dataframe("doc", schema="pg_temp"))


@pytest.mark.requires_pgvector
def test_embedding_multi_col_unique(db: gp.Database):
content = ["I have a dog.", "I like eating apples."]
columns = {"id": range(len(content)), "id2": [1] * len(content), "content": content}
t = (
db.create_dataframe(columns=columns)
.save_as(
temp=True,
column_names=list(columns.keys()),
distribution_key={"id"},
distribution_type="hash",
drop_if_exists=True,
drop_cascade=True,
)
.check_unique(columns={"id", "id2"})
)
t.embedding().create_index(column="content", model_name="all-MiniLM-L6-v2")
print(
"reloptions =",
db._execute(
f"SELECT reloptions FROM pg_class WHERE oid = '{t._qualified_table_name}'::regclass"
),
)
search_embeddings(t)
Contributor:

Does this test always pass? It seems we have no asserts here.

Contributor Author:

Asserts are in search_embeddings().
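
As a side note on the reloptions that test_embedding_multi_col_unique prints: the metadata written by create_index() has the form _pygp_emb_<content_attnum>=<json>, where the JSON object carries embedding_relid, content_attnum, embedding_attnum, model, and unique_key. Below is a small sketch of how that metadata could be decoded from Python for debugging; the helper name and the exact shape of the rows returned by db._execute() are assumptions, not part of the library.

import json

def decode_embedding_reloptions(db, qualified_table_name: str):
    # Illustrative helper: collect the embedding metadata JSON objects that
    # create_index() appends to pg_class.reloptions for the indexed table.
    rows = db._execute(
        f"""
        SELECT unnest(reloptions) AS option
        FROM pg_class
        WHERE oid = '{qualified_table_name}'::regclass
        """
    )
    metadata = []
    for row in rows:
        option: str = row["option"]
        if option.startswith("_pygp_emb_"):
            # Option format: _pygp_emb_<content_attnum>=<json>
            _, value = option.split("=", 1)
            metadata.append(json.loads(value))
    return metadata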
