|
1 | 1 | import logging
|
2 | 2 | import signal
|
3 | 3 | import time
|
| 4 | +import datetime |
4 | 5 |
|
5 | 6 | import redis
|
6 | 7 | from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded
|
7 | 8 | from celery.result import AsyncResult
|
8 | 9 | from celery.utils.log import get_task_logger
|
9 | 10 | from six import text_type
|
| 11 | +from sqlalchemy.orm import load_only |
10 | 12 |
|
11 |
| -from redash import models, redis_connection, settings, statsd_client |
| 13 | +from redash import models, redis_connection, settings, statsd_client, utils |
| 14 | +from redash.models import TableMetadata, ColumnMetadata, db |
12 | 15 | from redash.query_runner import InterruptException
|
13 | 16 | from redash.tasks.alerts import check_alerts_for_query
|
14 | 17 | from redash.utils import gen_query_hash, json_dumps, json_loads, utcnow, mustache_render
|
@@ -229,13 +232,143 @@ def cleanup_query_results():
|
229 | 232 | logger.info("Deleted %d unused query results.", deleted_count)
|
230 | 233 |
|
231 | 234 |
|
@celery.task(name="redash.tasks.get_table_sample_data")
def get_table_sample_data(data_source_id, table, table_id):
    """Fetch a sample row for ``table`` and persist one example value per column.

    Runs asynchronously (queued by refresh_schema on SCHEMAS_REFRESH_QUEUE).

    :param data_source_id: id of the models.DataSource to sample from.
    :param table: schema dict from the query runner's get_schema()
        (keys include 'name' and 'columns'; 'metadata' is optional).
    :param table_id: id of the persisted TableMetadata row for this table.
    """
    ds = models.DataSource.get_by_id(data_source_id)
    sample = ds.query_runner.get_table_sample(table['name'])
    if not sample:
        return

    # If a column exists, add a sample to it.
    for column in table['columns']:
        persisted_column = ColumnMetadata.query.filter(
            ColumnMetadata.name == column,
            ColumnMetadata.table_id == table_id,
        ).options(load_only('id')).first()

        if not persisted_column:
            continue

        value = sample.get(column)
        # Bug fix: str(None) is the truthy string "None" — keep a missing/NULL
        # sample value as NULL instead of persisting the literal "None".
        # text_type (six) instead of str() avoids UnicodeEncodeError on py2.
        column_example = None if value is None else text_type(value)

        # Truncate overly long examples so they stay storable/displayable.
        if column_example and len(column_example) > 4000:
            column_example = u'{}...'.format(column_example[:4000])

        ColumnMetadata.query.filter(
            ColumnMetadata.id == persisted_column.id,
        ).update({'example': column_example})

    # Single commit for the whole table instead of one commit per column.
    models.db.session.commit()
| 260 | + |
def cleanup_data_in_table(table_model):
    """Delete ``table_model`` rows flagged as no-longer-existing once they age past the TTL.

    Bug fix: the original computed ``is_old_data`` from
    settings.SCHEMA_METADATA_TTL_DAYS but never used it, so every row with
    exists == False was deleted unconditionally. The TTL now actually gates
    the deletion, giving recently-removed schema objects a grace period.

    :param table_model: TableMetadata or ColumnMetadata (both expose
        ``exists``, ``updated_at`` and an ``id`` primary key).
    """
    ttl = datetime.timedelta(days=settings.SCHEMA_METADATA_TTL_DAYS)
    now = utils.utcnow()

    removed_metadata = table_model.query.filter(
        table_model.exists == False,
    ).options(load_only('updated_at'))  # PK ('id') is always loaded by load_only

    for removed_metadata_row in removed_metadata:
        # Only purge rows that have been gone longer than the configured TTL.
        if (now - removed_metadata_row.updated_at) <= ttl:
            continue

        table_model.query.filter(
            table_model.id == removed_metadata_row.id,
        ).delete()

    db.session.commit()
| 276 | + |
@celery.task(name="redash.tasks.cleanup_schema_metadata")
def cleanup_schema_metadata():
    """Periodic task: purge stale (non-existing, past-TTL) schema metadata rows."""
    for metadata_model in (TableMetadata, ColumnMetadata):
        cleanup_data_in_table(metadata_model)
| 281 | + |
232 | 282 | @celery.task(name="redash.tasks.refresh_schema", time_limit=90, soft_time_limit=60)
|
233 | 283 | def refresh_schema(data_source_id):
|
234 | 284 | ds = models.DataSource.get_by_id(data_source_id)
|
235 | 285 | logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
|
236 | 286 | start_time = time.time()
|
| 287 | + |
237 | 288 | try:
|
238 |
| - ds.get_schema(refresh=True) |
| 289 | + existing_tables = set() |
| 290 | + schema = ds.query_runner.get_schema(get_stats=True) |
| 291 | + for table in schema: |
| 292 | + table_name = table['name'] |
| 293 | + existing_tables.add(table_name) |
| 294 | + |
| 295 | + # Assume that there will only exist 1 table with a given name for a given data source so we use first() |
| 296 | + persisted_table = TableMetadata.query.filter( |
| 297 | + TableMetadata.name == table_name, |
| 298 | + TableMetadata.data_source_id == ds.id, |
| 299 | + ).first() |
| 300 | + |
| 301 | + if persisted_table: |
| 302 | + TableMetadata.query.filter( |
| 303 | + TableMetadata.id == persisted_table.id, |
| 304 | + ).update({"exists": True}) |
| 305 | + else: |
| 306 | + metadata = 'metadata' in table |
| 307 | + persisted_table = TableMetadata( |
| 308 | + org_id=ds.org_id, |
| 309 | + name=table_name, |
| 310 | + data_source_id=ds.id, |
| 311 | + column_metadata=metadata |
| 312 | + ) |
| 313 | + models.db.session.add(persisted_table) |
| 314 | + models.db.session.flush() |
| 315 | + |
| 316 | + existing_columns = set() |
| 317 | + for i, column in enumerate(table['columns']): |
| 318 | + existing_columns.add(column) |
| 319 | + column_metadata = { |
| 320 | + 'org_id': ds.org_id, |
| 321 | + 'table_id': persisted_table.id, |
| 322 | + 'name': column, |
| 323 | + 'type': None, |
| 324 | + 'example': None, |
| 325 | + 'exists': True |
| 326 | + } |
| 327 | + if 'metadata' in table: |
| 328 | + column_metadata['type'] = table['metadata'][i]['type'] |
| 329 | + |
| 330 | + # If the column exists, update it, otherwise create a new one. |
| 331 | + persisted_column = ColumnMetadata.query.filter( |
| 332 | + ColumnMetadata.name == column, |
| 333 | + ColumnMetadata.table_id == persisted_table.id, |
| 334 | + ).options(load_only('id')).first() |
| 335 | + if persisted_column: |
| 336 | + ColumnMetadata.query.filter( |
| 337 | + ColumnMetadata.id == persisted_column.id, |
| 338 | + ).update(column_metadata) |
| 339 | + else: |
| 340 | + models.db.session.add(ColumnMetadata(**column_metadata)) |
| 341 | + models.db.session.commit() |
| 342 | + |
| 343 | + get_table_sample_data.apply_async( |
| 344 | + args=(data_source_id, table, persisted_table.id), |
| 345 | + queue=settings.SCHEMAS_REFRESH_QUEUE |
| 346 | + ) |
| 347 | + |
| 348 | + # If a column did not exist, set the 'column_exists' flag to false. |
| 349 | + existing_columns_list = tuple(existing_columns) |
| 350 | + ColumnMetadata.query.filter( |
| 351 | + ColumnMetadata.exists == True, |
| 352 | + ColumnMetadata.table_id == persisted_table.id, |
| 353 | + ~ColumnMetadata.name.in_(existing_columns_list), |
| 354 | + ).update({ |
| 355 | + "exists": False, |
| 356 | + "updated_at": db.func.now() |
| 357 | + }, synchronize_session='fetch') |
| 358 | + |
| 359 | + # If a table did not exist in the get_schema() response above, set the 'exists' flag to false. |
| 360 | + existing_tables_list = tuple(existing_tables) |
| 361 | + tables_to_update = TableMetadata.query.filter( |
| 362 | + TableMetadata.exists == True, |
| 363 | + TableMetadata.data_source_id == ds.id, |
| 364 | + ~TableMetadata.name.in_(existing_tables_list) |
| 365 | + ).update({ |
| 366 | + "exists": False, |
| 367 | + "updated_at": db.func.now() |
| 368 | + }, synchronize_session='fetch') |
| 369 | + |
| 370 | + models.db.session.commit() |
| 371 | + |
239 | 372 | logger.info(u"task=refresh_schema state=finished ds_id=%s runtime=%.2f", ds.id, time.time() - start_time)
|
240 | 373 | statsd_client.incr('refresh_schema.success')
|
241 | 374 | except SoftTimeLimitExceeded:
|
|
0 commit comments