diff --git a/backend/alembic/versions/2023_06_11_1544-d18213c629be_add_search_vector_update_date_indexes.py b/backend/alembic/versions/2023_06_11_1544-d18213c629be_add_search_vector_update_date_indexes.py new file mode 100644 index 0000000000..1b1f826acc --- /dev/null +++ b/backend/alembic/versions/2023_06_11_1544-d18213c629be_add_search_vector_update_date_indexes.py @@ -0,0 +1,33 @@ +"""Add search vector update date, indexes + +Revision ID: d18213c629be +Revises: c181661eba3a +Create Date: 2023-06-11 15:44:49.911455 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "d18213c629be" +down_revision = "c181661eba3a" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("message", sa.Column("search_vector_update_date", sa.DateTime(timezone=True), nullable=True)) + op.create_index( + op.f("ix_message_search_vector_update_date"), "message", ["search_vector_update_date"], unique=False + ) + op.create_index(op.f("ix_message_revision_created_date"), "message_revision", ["created_date"], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f("ix_message_revision_created_date"), table_name="message_revision") + op.drop_index(op.f("ix_message_search_vector_update_date"), table_name="message") + op.drop_column("message", "search_vector_update_date") + # ### end Alembic commands ### diff --git a/backend/oasst_backend/scheduled_tasks.py b/backend/oasst_backend/scheduled_tasks.py index 2fe2c44528..d3a90493fb 100644 --- a/backend/oasst_backend/scheduled_tasks.py +++ b/backend/oasst_backend/scheduled_tasks.py @@ -72,32 +72,42 @@ def update_search_vectors(batch_size: int) -> None: try: with default_session_factory() as session: while True: - to_update: list[Message] = ( - session.query(Message) - .outerjoin( - MessageRevision, + query = session.query(Message) + + # Subquery to obtain creation date of most recent revision for a message + latest_revision_date_subquery = ( + session.query(func.max(MessageRevision.created_date)) + .filter(MessageRevision.message_id == Message.id) + .correlate(Message) + .as_scalar() + ) + + # Outerjoin messages to their most recent revisions + query = query.outerjoin( + MessageRevision, + and_( + Message.id == MessageRevision.message_id, + MessageRevision.created_date == latest_revision_date_subquery, + ), + ) + + # Filter for only messages where we want to update the search vector + # The core components are when search vector is null, or there is a revision since last vector update + # We also add the case where there is a revision and no vector update date + # This accounts for messages where the vector was generated before vector update dates were added + query = query.filter( + or_( + Message.search_vector.is_(None), + MessageRevision.created_date > Message.search_vector_update_date, and_( - Message.id == MessageRevision.message_id, - MessageRevision.created_date - == session.query(func.max(MessageRevision.created_date)) - .filter(MessageRevision.message_id == Message.id) - .as_scalar(), + Message.search_vector_update_date.is_(None), +
MessageRevision.created_date.isnot(None), ), ) - .filter( - or_( - Message.search_vector.is_(None), - MessageRevision.created_date > Message.search_vector_update_date, - and_( - Message.search_vector_update_date.is_(None), - MessageRevision.created_date.isnot(None), - ), - ) - ) - .limit(batch_size) - .all() ) + to_update: list[Message] = query.limit(batch_size).all() + if not to_update: break