Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: two usability issues with sqlalchemy #354

Merged
merged 5 commits into from
Jan 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/python-integration.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ The `vectorizer_relationship` accepts the following parameters:
Additional parameters are simply forwarded to the underlying [SQLAlchemy relationship](https://docs.sqlalchemy.org/en/20/orm/relationships.html) so you can configure it as you desire.

Think of the `vectorizer_relationship` as a normal SQLAlchemy relationship, but with a preconfigured model instance under the hood.

The relationship into the other direction is also automatically set, if you want to change it's configuration you can set the
`parent_kwargs`parameter. E.g. `parent_kwargs={"lazy": "joined"}` to configure eager loading.

## Setting up the Vectorizer

Expand Down
32 changes: 17 additions & 15 deletions projects/pgai/pgai/sqlalchemy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
Mapper,
Relationship,
RelationshipProperty,
backref,
mapped_column,
relationship,
)
Expand Down Expand Up @@ -68,12 +67,12 @@ def set_schemas_correctly(self, owner: type[DeclarativeBase]) -> None:
)

def create_embedding_class(
self, owner: type[DeclarativeBase]
self, owner: type[DeclarativeBase], parent_kwargs: dict[str, Any]
) -> type[EmbeddingModel[Any]]:
assert self.name is not None
table_name = self.target_table or f"{owner.__tablename__}_embedding_store"
self.set_schemas_correctly(owner)
class_name = f"{to_pascal_case(self.name)}Embedding"
class_name = f"{owner.__name__}{to_pascal_case(self.name)}"
registry_instance = owner.registry
base: type[DeclarativeBase] = owner.__base__ # type: ignore

Expand Down Expand Up @@ -103,6 +102,10 @@ def create_embedding_class(
"chunk": mapped_column(Text, nullable=False),
"embedding": mapped_column(Vector(self.dimensions), nullable=False),
"chunk_seq": mapped_column(Integer, nullable=False),
"parent": relationship(
owner,
**parent_kwargs,
),
}

# Add primary key columns to the dictionary
Expand Down Expand Up @@ -133,38 +136,37 @@ def create_embedding_class(

@overload
def __get__(
self, obj: None, objtype: type[DeclarativeBase]
self, obj: None, owner: type[DeclarativeBase]
) -> type[EmbeddingModel[Any]]: ...

@overload
def __get__(
self, obj: DeclarativeBase, objtype: type[DeclarativeBase] | None = None
self, obj: DeclarativeBase, owner: type[DeclarativeBase]
) -> Relationship[EmbeddingModel[Any]]: ...

def __get__(
self, obj: DeclarativeBase | None, objtype: type[DeclarativeBase] | None = None
self, obj: DeclarativeBase | None, owner: type[DeclarativeBase]
) -> Relationship[EmbeddingModel[Any]] | type[EmbeddingModel[Any]]:
assert self.name is not None
relationship_name = f"_{self.name}_relationship"
if not self._initialized and objtype is not None:
self._embedding_class = self.create_embedding_class(objtype)
if not self._initialized:
self._embedding_class = self.create_embedding_class(
owner, self.relationship_args.pop("parent_kwargs", {})
)

mapper = inspect(objtype)
mapper = inspect(owner)
assert mapper is not None
pk_cols = mapper.primary_key
if not hasattr(objtype, relationship_name):
if not hasattr(owner, relationship_name):
self.relationship_instance = relationship(
self._embedding_class,
foreign_keys=[
getattr(self._embedding_class, col.name) for col in pk_cols
],
backref=self.relationship_args.pop(
"backref", backref("parent", lazy="select")
),
**self.relationship_args,
)
setattr(objtype, f"{self.name}_model", self._embedding_class)
setattr(objtype, relationship_name, self.relationship_instance)
setattr(owner, f"{self.name}_model", self._embedding_class)
setattr(owner, relationship_name, self.relationship_instance)
self._initialized = True
if obj is None and self._initialized:
return self._embedding_class # type: ignore
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class BlogPost(Base):

with Session(initialized_engine) as session:
# Test 1: Access embedding class directly
assert BlogPost.content_embeddings.__name__ == "ContentEmbeddingsEmbedding"
assert BlogPost.content_embeddings.__name__ == "BlogPostContentEmbeddings"

# Get all embeddings directly
all_embeddings = session.query(BlogPost.content_embeddings).all()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def test_vectorizer_composite_key(

# Verify embeddings were created
with Session(initialized_engine) as session:
assert Author.bio_embeddings.__name__ == "BioEmbeddingsEmbedding"
assert Author.bio_embeddings.__name__ == "AuthorBioEmbeddings"

# Check embeddings exist and have correct properties
embedding = session.query(Author.bio_embeddings).first()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def test_vectorizer_embedding_creation(
# Verify embeddings were created
with Session(initialized_engine) as session:
# Verify embedding class was created correctly
assert BlogPost.content_embeddings.__name__ == "ContentEmbeddingsEmbedding"
assert BlogPost.content_embeddings.__name__ == "BlogPostContentEmbeddings"

# Check embeddings exist and have correct properties
embedding = session.query(BlogPost.content_embeddings).first()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ def test_joined_loading(
)
session.add(article)
articles.append(article)
# _ = article.embeddings
session.commit()

# Run vectorizer worker for each vectorizer
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from typing import Any

import numpy as np
from sqlalchemy import Column, Engine, Integer, Text
from sqlalchemy.orm import DeclarativeBase, Session
from sqlalchemy import Column, Engine, Integer, Text, select
from sqlalchemy.orm import DeclarativeBase, Session, joinedload
from sqlalchemy.sql import text
from testcontainers.postgres import PostgresContainer # type: ignore

from pgai.sqlalchemy import vectorizer_relationship
from tests.vectorizer.extensions.utils import run_vectorizer_worker
from tests.vectorizer.extensions.utils import (
run_vectorizer_worker,
)


class Base(DeclarativeBase):
Expand All @@ -19,9 +21,7 @@ class BlogPost(Base):
id = Column(Integer, primary_key=True)
title = Column(Text, nullable=False)
content = Column(Text, nullable=False)
content_embeddings = vectorizer_relationship(
dimensions=768,
)
content_embeddings = vectorizer_relationship(dimensions=768, lazy="joined")


def test_vectorizer_embedding_creation(
Expand Down Expand Up @@ -66,7 +66,7 @@ def test_vectorizer_embedding_creation(
blog_post = session.query(BlogPost).first()
assert blog_post is not None
assert blog_post.content_embeddings is not None
assert BlogPost.content_embeddings.__name__ == "ContentEmbeddingsEmbedding"
assert BlogPost.content_embeddings.__name__ == "BlogPostContentEmbeddings"

# Check embeddings exist and have correct properties
embedding = session.query(BlogPost.content_embeddings).first()
Expand All @@ -88,3 +88,77 @@ def test_vectorizer_embedding_creation(
assert embedding_entity is not None
assert embedding_entity.chunk in blog_post.content
assert embedding_entity.parent is not None


def test_select_parent(
postgres_container: PostgresContainer, initialized_engine: Engine, vcr_: Any
):
"""Test basic data insertion and embedding generation with default relationship."""
db_url = postgres_container.get_connection_url()
# Create tables
metadata = BlogPost.metadata
metadata.create_all(initialized_engine, tables=[metadata.sorted_tables[0]])
with initialized_engine.connect() as conn:
conn.execute(
text("""
SELECT ai.create_vectorizer(
'blog_posts'::regclass,
embedding =>
ai.embedding_openai('text-embedding-3-small', 768),
chunking =>
ai.chunking_recursive_character_text_splitter('content', 50, 10)
);
""")
)
conn.commit()

# Insert test data
with Session(initialized_engine) as session:
post = BlogPost(
title="Introduction to Machine Learning",
content="Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience.", # noqa
)
session.add(post)
session.commit()

# Run vectorizer worker
with vcr_.use_cassette("test_vectorizer_embedding_creation_relationship.yaml"):
run_vectorizer_worker(db_url, 1)

# Verify embeddings were created
with Session(initialized_engine) as session:
# Check embeddings exist and have correct properties
embedding = (
session.execute(
select(BlogPost.content_embeddings).options(
joinedload(BlogPost.content_embeddings.parent) # type: ignore
)
)
.scalars()
.first()
)
assert embedding is not None
assert embedding.parent is not None


class Document(Base):
__tablename__ = "documents"

id = Column(Integer, primary_key=True)
content = Column(Text())
content_embeddings = vectorizer_relationship(dimensions=768, lazy="joined")


def test_can_build_select():
"""
This is a very minimal test case that failed when doing
some development with the extension.
The nature of the vectorizer_relationship being a descriptor messes
with sqlalchemys relationship resolution.
It was previously using `backref` to propagate the parent field,
which is resolved later and an immediate access
to build queries like this would fail.
"""
select(Document.content_embeddings).options(
joinedload(Document.content_embeddings.parent) # type: ignore
)
Loading