From 712f577b30717b7e85d85cf92568fc5543a29f11 Mon Sep 17 00:00:00 2001 From: Jannis Leidel Date: Tue, 17 Oct 2017 23:37:43 +0200 Subject: [PATCH] Add full text search for queries based on the Postgres tsvector type. (re #260) --- migrations/versions/5ec5c84ba61e_.py | 35 ++++++++++++++ migrations/versions/6b5be7e0a0ef_.py | 48 +++++++++++++++++++ redash/admin.py | 3 +- redash/handlers/queries.py | 7 ++- redash/models.py | 69 ++++++++++++++++++++-------- requirements.txt | 2 + tests/models/test_queries.py | 57 +++++++++++++++++++++++ 7 files changed, 198 insertions(+), 23 deletions(-) create mode 100644 migrations/versions/5ec5c84ba61e_.py create mode 100644 migrations/versions/6b5be7e0a0ef_.py diff --git a/migrations/versions/5ec5c84ba61e_.py b/migrations/versions/5ec5c84ba61e_.py new file mode 100644 index 0000000000..7a8bdeb3ce --- /dev/null +++ b/migrations/versions/5ec5c84ba61e_.py @@ -0,0 +1,35 @@ +"""empty message + +Revision ID: 5ec5c84ba61e +Revises: d1eae8b9893e +Create Date: 2017-10-17 18:21:00.174015 + +""" +from alembic import op +import sqlalchemy as sa +import sqlalchemy_utils as su +import sqlalchemy_searchable as ss + + +# revision identifiers, used by Alembic. +revision = '5ec5c84ba61e' +down_revision = '58f810489c47' +branch_labels = None +depends_on = None + + +def upgrade(): + conn = op.get_bind() + op.add_column('queries', sa.Column('search_vector', su.TSVectorType())) + op.create_index('ix_queries_search_vector', 'queries', ['search_vector'], + unique=False, postgresql_using='gin') + ss.sync_trigger(conn, 'queries', 'search_vector', + ['name', 'description', 'query']) + + +def downgrade(): + conn = op.get_bind() + + ss.drop_trigger(conn, 'queries', 'search_vector') + op.drop_index('ix_queries_search_vector', table_name='queries') + op.drop_column('queries', 'search_vector') diff --git a/migrations/versions/6b5be7e0a0ef_.py b/migrations/versions/6b5be7e0a0ef_.py new file mode 100644 index 0000000000..4765f07095 --- /dev/null +++ b/migrations/versions/6b5be7e0a0ef_.py @@ -0,0 +1,48 @@ +"""empty message + +Revision ID: 6b5be7e0a0ef +Revises: 5ec5c84ba61e +Create Date: 2017-11-02 20:42:13.356360 + +""" +from alembic import op +import sqlalchemy as sa +import sqlalchemy_searchable as ss + + +# revision identifiers, used by Alembic. +revision = '6b5be7e0a0ef' +down_revision = '5ec5c84ba61e' +branch_labels = None +depends_on = None + + +def upgrade(): + ss.vectorizer.clear() + + conn = op.get_bind() + + metadata = sa.MetaData(bind=conn) + queries = sa.Table('queries', metadata, autoload=True) + + @ss.vectorizer(queries.c.id) + def integer_vectorizer(column): + return sa.func.cast(column, sa.Text) + + ss.sync_trigger( + conn, + 'queries', + 'search_vector', + ['id', 'name', 'description', 'query'], + metadata=metadata + ) + + +def downgrade(): + conn = op.get_bind() + ss.drop_trigger(conn, 'queries', 'search_vector') + op.drop_index('ix_queries_search_vector', table_name='queries') + op.create_index('ix_queries_search_vector', 'queries', ['search_vector'], + unique=False, postgresql_using='gin') + ss.sync_trigger(conn, 'queries', 'search_vector', + ['name', 'description', 'query']) diff --git a/redash/admin.py b/redash/admin.py index f2a892e1ea..3173e90cd5 100644 --- a/redash/admin.py +++ b/redash/admin.py @@ -55,7 +55,8 @@ class QueryResultModelView(BaseModelView): class QueryModelView(BaseModelView): column_exclude_list = ('latest_query_data',) - form_excluded_columns = ('version', 'visualizations', 'alerts', 'org', 'created_at', 'updated_at', 'latest_query_data') + form_excluded_columns = ('version', 'visualizations', 'alerts', 'org', 'created_at', + 'updated_at', 'latest_query_data', 'search_vector') class DashboardModelView(BaseModelView): diff --git a/redash/handlers/queries.py b/redash/handlers/queries.py index df5a56d8a5..b6308846c4 100644 --- a/redash/handlers/queries.py +++ b/redash/handlers/queries.py @@ -37,7 +37,7 @@ class QuerySearchResource(BaseResource): @require_permission('view_query') def get(self): """ - Search query text, titles, and descriptions. + Search query text, names, and descriptions. :qparam string q: Search term @@ -50,7 +50,10 @@ def get(self): 'object_id': term, 'object_type': 'query', }) - return [q.to_dict(with_last_modified_by=False) for q in models.Query.search(term, self.current_user.group_ids, include_drafts=include_drafts)] + return [q.to_dict(with_last_modified_by=False) + for q in models.Query.search(term, + self.current_user.group_ids, + include_drafts=include_drafts)] class QueryRecentResource(BaseResource): diff --git a/redash/models.py b/redash/models.py index 02505dd219..e1e16bddf4 100644 --- a/redash/models.py +++ b/redash/models.py @@ -12,7 +12,7 @@ import xlsxwriter from flask_login import AnonymousUserMixin, UserMixin -from flask_sqlalchemy import SQLAlchemy +from flask_sqlalchemy import SQLAlchemy, BaseQuery from passlib.apps import custom_app_context as pwd_context from redash import settings, redis_connection, utils from redash.destinations import (get_configuration_schema_for_destination_type, @@ -30,11 +30,13 @@ from sqlalchemy.event import listens_for from sqlalchemy.ext.mutable import Mutable from sqlalchemy.inspection import inspect -from sqlalchemy.orm import backref, joinedload, object_session, subqueryload +from sqlalchemy.orm import backref, joinedload, object_session from sqlalchemy.orm.exc import NoResultFound # noqa: F401 from sqlalchemy.types import TypeDecorator from sqlalchemy.orm.attributes import flag_modified from functools import reduce +from sqlalchemy_searchable import SearchQueryMixin, make_searchable, vectorizer +from sqlalchemy_utils.types import TSVectorType class SQLAlchemyExt(SQLAlchemy): @@ -49,6 +51,21 @@ def apply_pool_defaults(self, app, options): db = SQLAlchemyExt(session_options={ 'expire_on_commit': False }) +# Make sure the SQLAlchemy mappers are all properly configured first. +# This is required by SQLAlchemy-Searchable as it adds DDL listeners +# on the configuration phase of models. +db.configure_mappers() + +# listen to a few database events to set up functions, trigger updates +# and indexes for the full text search +make_searchable(options={'regconfig': 'pg_catalog.simple'}) + + +class SearchBaseQuery(BaseQuery, SearchQueryMixin): + """ + The SQA query class to use when full text search is wanted. + """ + Column = functools.partial(db.Column, nullable=False) @@ -861,7 +878,14 @@ class Query(ChangeTrackingMixin, TimestampMixin, BelongsToOrgMixin, db.Model): schedule_until = Column(db.DateTime(True), nullable=True) visualizations = db.relationship("Visualization", cascade="all, delete-orphan") options = Column(MutableDict.as_mutable(PseudoJSON), default={}) - + search_vector = Column(TSVectorType('id', 'name', 'description', 'query', + weights={'name': 'A', + 'id': 'B', + 'description': 'C', + 'query': 'D'}), + nullable=True) + + query_class = SearchBaseQuery __tablename__ = 'queries' __mapper_args__ = { "version_id_col": version, @@ -987,27 +1011,24 @@ def outdated_queries(cls): return outdated_queries.values() @classmethod - def search(cls, term, group_ids, include_drafts=False): - # TODO: This is very naive implementation of search, to be replaced with PostgreSQL full-text-search solution. - where = (Query.name.ilike(u"%{}%".format(term)) | - Query.description.ilike(u"%{}%".format(term))) - - if term.isdigit(): - where |= Query.id == term - - where &= Query.is_archived == False + def search(cls, term, group_ids, include_drafts=False, limit=20): + where = cls.is_archived == False if not include_drafts: - where &= Query.is_draft == False + where &= cls.is_draft == False where &= DataSourceGroup.group_id.in_(group_ids) - query_ids = ( - db.session.query(Query.id).join( - DataSourceGroup, - Query.data_source_id == DataSourceGroup.data_source_id) - .filter(where)).distinct() - return Query.query.options(joinedload(Query.user)).filter(Query.id.in_(query_ids)) + return cls.query.join( + DataSourceGroup, + cls.data_source_id == DataSourceGroup.data_source_id + ).options( + joinedload(cls.user) + ).filter(where).search( + term, + # sort the result using the weight as defined in the search vector column + sort=True + ).distinct().limit(limit) @classmethod def recent(cls, group_ids, user_id=None, limit=20): @@ -1074,6 +1095,14 @@ def groups(self): def __unicode__(self): return unicode(self.id) + def __repr__(self): + return '' % (self.id, self.name or 'untitled') + + +@vectorizer(db.Integer) +def integer_vectorizer(column): + return db.func.cast(column, db.Text) + @listens_for(Query.query_text, 'set') def gen_query_hash(target, val, oldval, initiator): @@ -1382,7 +1411,7 @@ def all(cls, org, group_ids, user_id): @classmethod def search(cls, term, user_id, group_ids, include_drafts=False): - # limit_to_users_dashboards=False, + # limit_to_users_dashboards=False, # TODO: This is very naive implementation of search, to be replaced with PostgreSQL full-text-search solution. where = (Dashboard.name.ilike(u"%{}%".format(term))) diff --git a/requirements.txt b/requirements.txt index 94a853b29d..66aa3766b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,8 @@ redis==2.10.5 requests==2.11.1 six==1.10.0 SQLAlchemy==1.1.4 +SQLAlchemy-Searchable==0.10.6 +SQLAlchemy-Utils>=0.29.0 sqlparse==0.1.8 wsgiref==0.1.2 honcho==0.5.0 diff --git a/tests/models/test_queries.py b/tests/models/test_queries.py index f38f8b0712..299c5508d3 100644 --- a/tests/models/test_queries.py +++ b/tests/models/test_queries.py @@ -47,6 +47,13 @@ def test_search_by_id_returns_query(self): self.assertNotIn(q1, queries) self.assertNotIn(q2, queries) + def test_search_by_number(self): + q = self.factory.create_query(description="Testing search 12345") + db.session.flush() + queries = Query.search('12345', [self.factory.default_group.id]) + + self.assertIn(q, queries) + def test_search_respects_groups(self): other_group = Group(org=self.factory.org, name="Other Group") db.session.add(other_group) @@ -98,6 +105,56 @@ def test_search_is_case_insensitive(self): self.assertIn(q, Query.search('testing', [self.factory.default_group.id])) + def test_search_query_parser_or(self): + q1 = self.factory.create_query(name="Testing") + q2 = self.factory.create_query(name="search") + + queries = list(Query.search('testing or search', [self.factory.default_group.id])) + self.assertIn(q1, queries) + self.assertIn(q2, queries) + + def test_search_query_parser_negation(self): + q1 = self.factory.create_query(name="Testing") + q2 = self.factory.create_query(name="search") + + queries = list(Query.search('testing -search', [self.factory.default_group.id])) + self.assertIn(q1, queries) + self.assertNotIn(q2, queries) + + def test_search_query_parser_parenthesis(self): + q1 = self.factory.create_query(name="Testing search") + q2 = self.factory.create_query(name="Testing searching") + q3 = self.factory.create_query(name="Testing finding") + + queries = list(Query.search('(testing search) or finding', [self.factory.default_group.id])) + self.assertIn(q1, queries) + self.assertIn(q2, queries) + self.assertIn(q3, queries) + + def test_search_query_parser_hyphen(self): + q1 = self.factory.create_query(name="Testing search") + q2 = self.factory.create_query(name="Testing-search") + + queries = list(Query.search('testing search', [self.factory.default_group.id])) + self.assertIn(q1, queries) + self.assertIn(q2, queries) + + def test_search_query_parser_emails(self): + q1 = self.factory.create_query(name="janedoe@example.com") + q2 = self.factory.create_query(name="johndoe@example.com") + + queries = list(Query.search('example', [self.factory.default_group.id])) + self.assertIn(q1, queries) + self.assertIn(q2, queries) + + queries = list(Query.search('com', [self.factory.default_group.id])) + self.assertIn(q1, queries) + self.assertIn(q2, queries) + + queries = list(Query.search('johndoe', [self.factory.default_group.id])) + self.assertNotIn(q1, queries) + self.assertIn(q2, queries) + class QueryRecentTest(BaseTestCase): def test_global_recent(self):