timescale · Askir · Jan 11, 2025 · Jan 9, 2025 · Jan 10, 2025 · Jan 10, 2025
@@ -86,7 +86,8 @@ The `vectorizer_relationship` accepts the following parameters:
 Additional parameters are simply forwarded to the underlying [SQLAlchemy relationship](https://docs.sqlalchemy.org/en/20/orm/relationships.html) so you can configure it as you desire.
 
 Think of the `vectorizer_relationship` as a normal SQLAlchemy relationship, but with a preconfigured model instance under the hood.
-
+The relationship into the other direction is also automatically set, if you want to change it's configuration you can set the
+`parent_kwargs`parameter. E.g. `parent_kwargs={"lazy": "joined"}` to configure eager loading.
 
 ## Setting up the Vectorizer
 

@@ -8,7 +8,6 @@
     Mapper,
     Relationship,
     RelationshipProperty,
-    backref,
     mapped_column,
     relationship,
 )
@@ -68,12 +67,12 @@ def set_schemas_correctly(self, owner: type[DeclarativeBase]) -> None:
         )
 
     def create_embedding_class(
-        self, owner: type[DeclarativeBase]
+        self, owner: type[DeclarativeBase], parent_kwargs: dict[str, Any]
     ) -> type[EmbeddingModel[Any]]:
         assert self.name is not None
         table_name = self.target_table or f"{owner.__tablename__}_embedding_store"
         self.set_schemas_correctly(owner)
-        class_name = f"{to_pascal_case(self.name)}Embedding"
+        class_name = f"{owner.__name__}{to_pascal_case(self.name)}"
         registry_instance = owner.registry
         base: type[DeclarativeBase] = owner.__base__  # type: ignore
 
@@ -103,6 +102,10 @@ def create_embedding_class(
             "chunk": mapped_column(Text, nullable=False),
             "embedding": mapped_column(Vector(self.dimensions), nullable=False),
             "chunk_seq": mapped_column(Integer, nullable=False),
+            "parent": relationship(
+                owner,
+                **parent_kwargs,
+            ),
         }
 
         # Add primary key columns to the dictionary
@@ -133,38 +136,37 @@ def create_embedding_class(
 
     @overload
     def __get__(
-        self, obj: None, objtype: type[DeclarativeBase]
+        self, obj: None, owner: type[DeclarativeBase]
     ) -> type[EmbeddingModel[Any]]: ...
 
     @overload
     def __get__(
-        self, obj: DeclarativeBase, objtype: type[DeclarativeBase] | None = None
+        self, obj: DeclarativeBase, owner: type[DeclarativeBase]
     ) -> Relationship[EmbeddingModel[Any]]: ...
 
     def __get__(
-        self, obj: DeclarativeBase | None, objtype: type[DeclarativeBase] | None = None
+        self, obj: DeclarativeBase | None, owner: type[DeclarativeBase]
     ) -> Relationship[EmbeddingModel[Any]] | type[EmbeddingModel[Any]]:
         assert self.name is not None
         relationship_name = f"_{self.name}_relationship"
-        if not self._initialized and objtype is not None:
-            self._embedding_class = self.create_embedding_class(objtype)
+        if not self._initialized:
+            self._embedding_class = self.create_embedding_class(
+                owner, self.relationship_args.pop("parent_kwargs", {})
+            )
 
-            mapper = inspect(objtype)
+            mapper = inspect(owner)
             assert mapper is not None
             pk_cols = mapper.primary_key
-            if not hasattr(objtype, relationship_name):
+            if not hasattr(owner, relationship_name):
                 self.relationship_instance = relationship(
                     self._embedding_class,
                     foreign_keys=[
                         getattr(self._embedding_class, col.name) for col in pk_cols
                     ],
-                    backref=self.relationship_args.pop(
-                        "backref", backref("parent", lazy="select")
-                    ),
                     **self.relationship_args,
                 )
-                setattr(objtype, f"{self.name}_model", self._embedding_class)
-                setattr(objtype, relationship_name, self.relationship_instance)
+                setattr(owner, f"{self.name}_model", self._embedding_class)
+                setattr(owner, relationship_name, self.relationship_instance)
             self._initialized = True
         if obj is None and self._initialized:
             return self._embedding_class  # type: ignore

@@ -72,7 +72,7 @@ class BlogPost(Base):
 
     with Session(initialized_engine) as session:
         # Test 1: Access embedding class directly
-        assert BlogPost.content_embeddings.__name__ == "ContentEmbeddingsEmbedding"
+        assert BlogPost.content_embeddings.__name__ == "BlogPostContentEmbeddings"
 
         # Get all embeddings directly
         all_embeddings = session.query(BlogPost.content_embeddings).all()

@@ -66,7 +66,7 @@ def test_vectorizer_composite_key(
 
     # Verify embeddings were created
     with Session(initialized_engine) as session:
-        assert Author.bio_embeddings.__name__ == "BioEmbeddingsEmbedding"
+        assert Author.bio_embeddings.__name__ == "AuthorBioEmbeddings"
 
         # Check embeddings exist and have correct properties
         embedding = session.query(Author.bio_embeddings).first()

@@ -58,7 +58,7 @@ def test_vectorizer_embedding_creation(
     # Verify embeddings were created
     with Session(initialized_engine) as session:
         # Verify embedding class was created correctly
-        assert BlogPost.content_embeddings.__name__ == "ContentEmbeddingsEmbedding"
+        assert BlogPost.content_embeddings.__name__ == "BlogPostContentEmbeddings"
 
         # Check embeddings exist and have correct properties
         embedding = session.query(BlogPost.content_embeddings).first()

@@ -62,7 +62,6 @@ def test_joined_loading(
             )
             session.add(article)
             articles.append(article)
-            # _ = article.embeddings
         session.commit()
 
     # Run vectorizer worker for each vectorizer

@@ -1,13 +1,15 @@
 from typing import Any
 
 import numpy as np
-from sqlalchemy import Column, Engine, Integer, Text
-from sqlalchemy.orm import DeclarativeBase, Session
+from sqlalchemy import Column, Engine, Integer, Text, select
+from sqlalchemy.orm import DeclarativeBase, Session, joinedload
 from sqlalchemy.sql import text
 from testcontainers.postgres import PostgresContainer  # type: ignore
 
 from pgai.sqlalchemy import vectorizer_relationship
-from tests.vectorizer.extensions.utils import run_vectorizer_worker
+from tests.vectorizer.extensions.utils import (
+    run_vectorizer_worker,
+)
 
 
 class Base(DeclarativeBase):
@@ -19,9 +21,7 @@ class BlogPost(Base):
     id = Column(Integer, primary_key=True)
     title = Column(Text, nullable=False)
     content = Column(Text, nullable=False)
-    content_embeddings = vectorizer_relationship(
-        dimensions=768,
-    )
+    content_embeddings = vectorizer_relationship(dimensions=768, lazy="joined")
 
 
 def test_vectorizer_embedding_creation(
@@ -66,7 +66,7 @@ def test_vectorizer_embedding_creation(
         blog_post = session.query(BlogPost).first()
         assert blog_post is not None
         assert blog_post.content_embeddings is not None
-        assert BlogPost.content_embeddings.__name__ == "ContentEmbeddingsEmbedding"
+        assert BlogPost.content_embeddings.__name__ == "BlogPostContentEmbeddings"
 
         # Check embeddings exist and have correct properties
         embedding = session.query(BlogPost.content_embeddings).first()
@@ -88,3 +88,77 @@ def test_vectorizer_embedding_creation(
         assert embedding_entity is not None
         assert embedding_entity.chunk in blog_post.content
         assert embedding_entity.parent is not None
+
+
+def test_select_parent(
+    postgres_container: PostgresContainer, initialized_engine: Engine, vcr_: Any
+):
+    """Test basic data insertion and embedding generation with default relationship."""
+    db_url = postgres_container.get_connection_url()
+    # Create tables
+    metadata = BlogPost.metadata
+    metadata.create_all(initialized_engine, tables=[metadata.sorted_tables[0]])
+    with initialized_engine.connect() as conn:
+        conn.execute(
+            text("""
+                SELECT ai.create_vectorizer(
+                    'blog_posts'::regclass,
+                    embedding =>
+                    ai.embedding_openai('text-embedding-3-small', 768),
+                    chunking =>
+                    ai.chunking_recursive_character_text_splitter('content', 50, 10)
+                );
+            """)
+        )
+        conn.commit()
+
+    # Insert test data
+    with Session(initialized_engine) as session:
+        post = BlogPost(
+            title="Introduction to Machine Learning",
+            content="Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience.",  # noqa
+        )
+        session.add(post)
+        session.commit()
+
+    # Run vectorizer worker
+    with vcr_.use_cassette("test_vectorizer_embedding_creation_relationship.yaml"):
+        run_vectorizer_worker(db_url, 1)
+
+    # Verify embeddings were created
+    with Session(initialized_engine) as session:
+        # Check embeddings exist and have correct properties
+        embedding = (
+            session.execute(
+                select(BlogPost.content_embeddings).options(
+                    joinedload(BlogPost.content_embeddings.parent)  # type: ignore
+                )
+            )
+            .scalars()
+            .first()
+        )
+        assert embedding is not None
+        assert embedding.parent is not None
+
+
+class Document(Base):
+    __tablename__ = "documents"
+
+    id = Column(Integer, primary_key=True)
+    content = Column(Text())
+    content_embeddings = vectorizer_relationship(dimensions=768, lazy="joined")
+
+
+def test_can_build_select():
+    """
+    This is a very minimal test case that failed when doing
+    some development with the extension.
+    The nature of the vectorizer_relationship being a descriptor messes
+    with sqlalchemys relationship resolution.
+    It was previously using `backref` to propagate the parent field,
+    which is resolved later and an immediate access
+    to build queries like this would fail.
+    """
+    select(Document.content_embeddings).options(
+        joinedload(Document.content_embeddings.parent)  # type: ignore
+    )