2 changes: 2 additions & 0 deletions samples/custom-sql-statistics/.gitignore
@@ -0,0 +1,2 @@
*.csv
*.txt
21 changes: 21 additions & 0 deletions samples/custom-sql-statistics/README.md
@@ -0,0 +1,21 @@
# Overview

The custom-sql-statistics Python script uses the Metadata API to measure how much CustomSQL is used on a Tableau Site. It reports the percentage of Workbooks and Data Sources that use CustomSQL, and it also tracks how many CustomSQL queries were not accepted/supported by the Catalog lineage parser. Please refer to the [official documentation for Catalog's support for CustomSQL](https://help.tableau.com/current/pro/desktop/en-us/customsql.htm#tableau-catalog-support-for-custom-sql) for more information on what SQL Catalog is intended to support.

This script reports its statistics in two ways:
1. A brief summary of how much CustomSQL is on the site and how much of it was supported, written to `customSQL-stats-summary.txt`.
2. Row-level data written to `customSQL-stats.csv` that can be used for deeper analysis (the columns are listed below). **Note:** This file includes the actual SQL query text embedded in the Data Source or Workbook (it does not include any data returned by the query, just the SQL used).
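
The CSV contains one row per CustomSQL table per parent Workbook or Data Source; its columns (taken from the `columnHeaders` list in the script) are:
```
parent_content_type, parent_content_graph_id, custom_sql_graph_id, sql_failed_to_parse, query_string, database_type
```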

## Instructions for use
To run the script, use the latest version of Python 3 and invoke it like so:
```
python3 custom-sql-statistics.py --server <http://example.com> --username <your server/site username> --sitename <only needed for non-default sites>
```
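For example, with hypothetical values filled in (the script prompts for your password on startup):
```
python3 custom-sql-statistics.py --server https://tableau.example.com --username admin --sitename Marketing
```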
The script's results are written to two files: `customSQL-stats-summary.txt`, a summary of the data, and `customSQL-stats.csv`, which gives row-level data on the results. The latter can be opened in Tableau for more thorough analysis.

For best results, run this script as a site or server admin so that full query data can be returned. If run as a non-admin, the `query_string` values may be blank due to permissions.

Note that for a large server with a lot of CustomSQL usage, this script may take a long time to run, since it pages through every CustomSQL table on the server via the Metadata API (20 at a time, per the `first: 20` argument in the GraphQL query).

### Known limitations
This script does not include CustomSQL used from Prep flows.
231 changes: 231 additions & 0 deletions samples/custom-sql-statistics/custom-sql-statistics.py
@@ -0,0 +1,231 @@
####
# This script can be used for getting more information about CustomSQL prevalence on a Tableau Server/Site.
#
# This script was written for Python 3.7.6 and has not been tested on other versions of Python.
####


import argparse
import csv
import getpass
import logging

import tableauserverclient as TSC

# See more examples here: https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_examples.html



def main():
parser = argparse.ArgumentParser(description='Reports on CustomSQL statistics in the Catalog graph. Outputs data into CSV files for reporting.')
parser.add_argument('--server', '-s', required=True, help='server address (include "http(s)://")')
parser.add_argument('--username', '-u', required=True, help='username to sign into server')
parser.add_argument('--logging-level', '-l', choices=['debug', 'info', 'error'], default='error',
help='desired logging level (set to error by default)')
parser.add_argument('--sitename', '-n', help='Sitename to process CustomSQL Statistics for. This is optional and defaults to the `Default` site')

args = parser.parse_args()

password = getpass.getpass("Password: ")


# Set logging level based on user input, or error by default
logging_level = getattr(logging, args.logging_level.upper())
logging.basicConfig(level=logging_level)


tableau_auth = TSC.TableauAuth(args.username, password, args.sitename)
server = TSC.Server(args.server)
server.version = '3.3'

with server.auth.sign_in(tableau_auth):
logging.debug("Signed into Server")

# Query the Metadata API for CustomSQLTables, 20 per page. AFTER_TOKEN_SIGNAL
# is a placeholder replaced below with `null` for the first page, or with the
# endCursor token returned by the previous page.

query = """
{
customSQLTablesConnection(first: 20, after: AFTER_TOKEN_SIGNAL) {
nodes {
id

database {
connectionType
}

tables {
id
}

query

columns {
workbooks_directly_connected: referencedByFields {
datasource {
... on EmbeddedDatasource {
workbook {
name
id
}
}
}
}
datasources_directly_connected: referencedByFields {
datasource {
... on PublishedDatasource {
name
id
}
}
}
}
}
pageInfo {
hasNextPage
endCursor
}
}
}
"""
print("--------------------------\nBeginning to query information about CustomSQLTables on this site...")
resp = server.metadata.query(query.replace('AFTER_TOKEN_SIGNAL', 'null'))
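# workbooks/datasources map each parent's graph ID to the CustomSQL tables it uses;
# table_stats accumulates the site-wide counters reported in the summary.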
workbooks = {}
datasources = {}
table_stats = {'num_skipped': 0, 'num_tables_seen': 0, 'num_failed_parse': 0}
while True:
process_page(resp, workbooks, datasources, table_stats)

page_info = resp['data']['customSQLTablesConnection']['pageInfo']

if page_info['hasNextPage']:
resp = server.metadata.query(query.replace('AFTER_TOKEN_SIGNAL', '"' + page_info['endCursor'] + '"'))
else:
break

logging.debug("{} CustomSQLTables were skipped due to unexpected data".format(table_stats['num_skipped']))
totalCountsQuery = """
{
total_workbooks_count: workbooksConnection { totalCount }
total_datasources_count: publishedDatasourcesConnection { totalCount }
}
"""
resp = server.metadata.query(totalCountsQuery)
total_workbooks = resp['data']['total_workbooks_count']['totalCount']
total_datasources = resp['data']['total_datasources_count']['totalCount']

## Outputting summary to customSQL-stats-summary.txt file
with open("./customSQL-stats-summary.txt", 'w', newline='') as file:

print("--------------------------\nFinished processing CustomSQLTables on this site...", file=file)
print("Total # of CustomSQLTables on site={} and {} of them ({:.2f}%) were not parsed by Catalog".format(table_stats['num_tables_seen'], table_stats['num_failed_parse'], percentify(table_stats['num_failed_parse'] / table_stats['num_tables_seen'])), file=file)
print("Total # of Workbooks on Site={}".format(total_workbooks), file=file)
print("# of Workbooks using CustomSQL={} ({:.2f}% of total)".format(len(workbooks), percentify(len(workbooks) / total_workbooks)), file=file)

print("Total # of Published Data Sources on Site={}".format(total_datasources), file=file)
print("# of Published Data Sources using CustomSQL={} ({:.2f}% of total)".format(len(datasources), percentify(len(datasources) / total_datasources)), file=file)


## Outputting detailed data to CSV file
filename='./customSQL-stats.csv'
with open(filename, 'w', newline='') as file:
csv_writer = csv.writer(file)

columnHeaders = ['parent_content_type', 'parent_content_graph_id', 'custom_sql_graph_id', 'sql_failed_to_parse', 'query_string', 'database_type']
csv_writer.writerow(columnHeaders)

serialize_to_csv(csv_writer, workbooks, 'workbook')
serialize_to_csv(csv_writer, datasources, 'published datasource')



# Serializes info to a CSV file
def serialize_to_csv(writer, collection, content_type):
## Create a row for each CustomSQL table in each workbook or data source
for content_item_id in collection.keys():
for cust_sql_table_id in collection[content_item_id]['customSQLTables'].keys():
cust_sql_table = collection[content_item_id]['customSQLTables'][cust_sql_table_id]

new_row = [content_type]
new_row.append(content_item_id)
new_row.append(cust_sql_table_id)
new_row.append(cust_sql_table['sql_failed_to_parse'])
new_row.append(cust_sql_table['query_string'])
new_row.append(cust_sql_table['database_type'])

writer.writerow(new_row)



# Converts a numerator/denominator pair to a percentage. Guards against a zero
# denominator (e.g. a site with no workbooks) to avoid a ZeroDivisionError.
def percentify(numerator, denominator):
return (numerator / denominator) * 100 if denominator else 0.0


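# Processes one page of the customSQLTablesConnection response, updating the
# per-workbook and per-datasource collections and the site-wide table_stats counters.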
def process_page(response, workbooks, datasources, table_stats):
customSQLTables = response['data']['customSQLTablesConnection']['nodes']

for table in customSQLTables:
table_stats['num_tables_seen'] += 1
table_stats['num_failed_parse'] += 1 if has_failed_sql(table) else 0

if len(table['columns']) == 0:
logging.debug("Table {} has no columns and will be skipped".format(table['id']))
table_stats['num_skipped'] += 1
continue

if len(table['columns'][0]['workbooks_directly_connected']) == 0:
logging.debug("Table {} has nothing in `workbooks_directly_connected` and will be skipped".format(table['id']))
table_stats['num_skipped'] += 1
continue

## The `datasource` object is populated only when the EmbeddedDatasource fragment
## matched, i.e. this CustomSQLTable connects to a Workbook
if table['columns'][0]['workbooks_directly_connected'][0]['datasource']:
object_id = table['columns'][0]['workbooks_directly_connected'][0]['datasource']['workbook']['id']
process_table_for_collection(table, object_id, workbooks)

## Otherwise the fragment did not match, so this CustomSQLTable connects to a Published Data Source (PDS)
else:
object_id = table['columns'][0]['datasources_directly_connected'][0]['datasource']['id']
process_table_for_collection(table, object_id, datasources)


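# Records a CustomSQLTable under its parent workbook/data source entry,
# skipping tables that were already recorded for that parent.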
def process_table_for_collection(table, object_id, collection):

## This is the first time we've seen this workbook or data source
if object_id not in collection:
collection[object_id] = {}
collection[object_id]['customSQLTables'] = {}
collection[object_id]['customSQLTables'][table['id']] = {}
extract_sql_table_info(table, collection[object_id]['customSQLTables'][table['id']])
else:
if table['id'] in collection[object_id]['customSQLTables']:
logging.debug('Seeing same CustomSQLTable twice. Skipping adding to dictionary. Table ID: {}'.format(table['id']))
else:
collection[object_id]['customSQLTables'][table['id']] = {}
extract_sql_table_info(table, collection[object_id]['customSQLTables'][table['id']])

logging.info("Processed table id={} and added to collection".format(table['id']))


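# Copies the fields we report on (parse status, query text, database type)
# from the raw Metadata API node into our own per-table dictionary.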
def extract_sql_table_info(source_table_dict, dest_table_dict):
dest_table_dict['sql_failed_to_parse'] = has_failed_sql(source_table_dict)
dest_table_dict['query_string'] = source_table_dict['query']
dest_table_dict['database_type'] = source_table_dict['database']['connectionType']


# Catalog resolves successfully parsed CustomSQL to at least one upstream table,
# so an empty `tables` list means the lineage parser could not handle the query.
def has_failed_sql(table):
return len(table['tables']) == 0

if __name__ == '__main__':
main()