diff --git a/samples/custom-sql-statistics/.gitignore b/samples/custom-sql-statistics/.gitignore
new file mode 100644
index 0000000..63400aa
--- /dev/null
+++ b/samples/custom-sql-statistics/.gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.txt
\ No newline at end of file
diff --git a/samples/custom-sql-statistics/README.md b/samples/custom-sql-statistics/README.md
new file mode 100644
index 0000000..e1292b8
--- /dev/null
+++ b/samples/custom-sql-statistics/README.md
@@ -0,0 +1,21 @@
+# Overview
+
+The custom-sql-statistics Python script uses the Metadata API to find how much CustomSQL is used on a Tableau Site. It reports what percentage of Workbooks and Data Sources use CustomSQL, and it tracks how many CustomSQL queries were not accepted/supported by the Catalog lineage parser. Refer to the [official documentation for Catalog's support for CustomSQL](https://help.tableau.com/current/pro/desktop/en-us/customsql.htm#tableau-catalog-support-for-custom-sql) for more information on what SQL Catalog is intended to support.
+
+This script reports its statistics in two ways:
+1. A brief summary of how much CustomSQL is on the site, and how much of it was supported, written to `customSQL-stats-summary.txt`.
+2. Row-level data written to `customSQL-stats.csv` for deeper analysis (an illustrative row is shown below). **Note:** this file includes the actual text of the SQL queries used in each Data Source or Workbook; it does not include any data returned by the query, just the SQL used.
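+
+The CSV uses the column layout below (the column names come from the script itself). The values in the sample row are hypothetical placeholders, shown only to illustrate the shape of the output:
+
+```
+parent_content_type,parent_content_graph_id,custom_sql_graph_id,sql_failed_to_parse,query_string,database_type
+workbook,<workbook-graph-id>,<custom-sql-graph-id>,False,SELECT * FROM orders,postgres
+```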
+
+## Instructions for use
+Use the latest version of Python 3 and call the script like so (`--sitename` is optional and defaults to the `Default` site):
+```
+python3 custom-sql-statistics.py --server <server-url> --username <username> --sitename <site-name>
+```
+The script's results are written to two files: `customSQL-stats-summary.txt`, which summarizes the data, and `customSQL-stats.csv`, which gives row-level data on the results. The latter can be loaded into Tableau for more thorough analysis.
+
+For best results, run this script as a site or server admin so that full query data can be returned. If run as a non-admin, the `query_string` values may be blank due to permissions.
+
+Note that on a large server with a lot of CustomSQL usage, this script may take a long time to run, since it iterates through all content on the server through the Metadata API.
+
+### Known limitations
+This script does not include CustomSQL used in Prep flows.
diff --git a/samples/custom-sql-statistics/custom-sql-statistics.py b/samples/custom-sql-statistics/custom-sql-statistics.py
new file mode 100644
index 0000000..0a3cc3b
--- /dev/null
+++ b/samples/custom-sql-statistics/custom-sql-statistics.py
@@ -0,0 +1,231 @@
+####
+# This script can be used for getting more information about CustomSQL prevalence on a Tableau Server/Site.
+#
+# This script was written on Python 3.7.6 and was not tested to work on other versions of Python.
+####
+
+
+import argparse
+import csv
+import getpass
+import logging
+
+import tableauserverclient as TSC
+
+# See more examples here: https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_examples.html
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Reports on CustomSQL statistics in the Catalog graph. Outputs data into CSV files for reporting.')
+    parser.add_argument('--server', '-s', required=True, help='server address (include "http(s)://")')
+    parser.add_argument('--username', '-u', required=True, help='username to sign into server')
+    parser.add_argument('--logging-level', '-l', choices=['debug', 'info', 'error'], default='error',
+                        help='desired logging level (set to error by default)')
+    parser.add_argument('--sitename', '-n', help='site name to process CustomSQL statistics for (optional; defaults to the `Default` site)')
+
+    args = parser.parse_args()
+
+    password = getpass.getpass("Password: ")
+
+    # Set logging level based on user input, or error by default
+    logging_level = getattr(logging, args.logging_level.upper())
+    logging.basicConfig(level=logging_level)
+
+    tableau_auth = TSC.TableauAuth(args.username, password, args.sitename)
+    server = TSC.Server(args.server)
+    server.version = '3.3'
+
+    with server.auth.sign_in(tableau_auth):
+        logging.debug("Signed into Server")
+
+        # Query the Metadata API for CustomSQLTables, 20 per page. AFTER_TOKEN_SIGNAL
+        # is a placeholder that is replaced with a page cursor before each request.
+        query = """
+{
+customSQLTablesConnection(first: 20, after: AFTER_TOKEN_SIGNAL) {
+  nodes {
+    id
+
+    database {
+      connectionType
+    }
+
+    tables {
+      id
+    }
+
+    query
+
+    columns {
+      workbooks_directly_connected: referencedByFields {
+        datasource {
+          ... on EmbeddedDatasource {
+            workbook {
+              name
+              id
+            }
+          }
+        }
+      }
+      datasources_directly_connected: referencedByFields {
+        datasource {
+          ... on PublishedDatasource {
+            name
+            id
+          }
+        }
+      }
+    }
+  }
+  pageInfo {
+    hasNextPage
+    endCursor
+  }
+}
+}
+"""
+        print("--------------------------\nBeginning to query information about CustomSQLTables on this site...")
+        resp = server.metadata.query(query.replace('AFTER_TOKEN_SIGNAL', 'null'))
+        workbooks = {}
+        datasources = {}
+        table_stats = {'num_skipped': 0, 'num_tables_seen': 0, 'num_failed_parse': 0}
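+        # Page through the connection: substitute each response's `pageInfo.endCursor`
+        # for AFTER_TOKEN_SIGNAL to fetch the next page, until the Metadata API
+        # reports that `hasNextPage` is false.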
+        while True:
+            process_page(resp, workbooks, datasources, table_stats)
+
+            page_info = resp['data']['customSQLTablesConnection']['pageInfo']
+
+            if page_info['hasNextPage']:
+                resp = server.metadata.query(query.replace('AFTER_TOKEN_SIGNAL', '"' + page_info['endCursor'] + '"'))
+            else:
+                break
+
+        logging.debug("{} CustomSQLTables were skipped due to unexpected data".format(table_stats['num_skipped']))
+
+        totalCountsQuery = """
+{
+  total_workbooks_count: workbooksConnection { totalCount }
+  total_datasources_count: publishedDatasourcesConnection { totalCount }
+}
+"""
+        resp = server.metadata.query(totalCountsQuery)
+        total_workbooks = resp['data']['total_workbooks_count']['totalCount']
+        total_datasources = resp['data']['total_datasources_count']['totalCount']
+
+        ## Output the summary to the customSQL-stats-summary.txt file
+        with open("./customSQL-stats-summary.txt", 'w', newline='') as file:
+            print("--------------------------\nFinished processing CustomSQLTables on this site...", file=file)
+            print("Total # of CustomSQLTables on site={} and {} of them ({:.2f}%) were not parsed by Catalog".format(table_stats['num_tables_seen'], table_stats['num_failed_parse'], percentify(table_stats['num_failed_parse'], table_stats['num_tables_seen'])), file=file)
+
+            print("Total # of Workbooks on Site={}".format(total_workbooks), file=file)
+            print("# of Workbooks using CustomSQL={} ({:.2f}% of total)".format(len(workbooks), percentify(len(workbooks), total_workbooks)), file=file)
+
+            print("Total # of Published Data Sources on Site={}".format(total_datasources), file=file)
+            print("# of Published Data Sources using CustomSQL={} ({:.2f}% of total)".format(len(datasources), percentify(len(datasources), total_datasources)), file=file)
+
+        ## Output detailed row-level data to the CSV file
+        filename = './customSQL-stats.csv'
+        with open(filename, 'w', newline='') as file:
+            csv_writer = csv.writer(file)
+
+            columnHeaders = ['parent_content_type', 'parent_content_graph_id', 'custom_sql_graph_id', 'sql_failed_to_parse', 'query_string', 'database_type']
+            csv_writer.writerow(columnHeaders)
+
+            serialize_to_csv(csv_writer, workbooks, 'workbook')
+            serialize_to_csv(csv_writer, datasources, 'published datasource')
+
+
+# Serializes info to a CSV file, one row per CustomSQL table in each workbook or data source
+def serialize_to_csv(writer, collection, content_type):
+    for content_item_id, content_item in collection.items():
+        for cust_sql_table_id, cust_sql_table in content_item['customSQLTables'].items():
+            writer.writerow([content_type,
+                             content_item_id,
+                             cust_sql_table_id,
+                             cust_sql_table['sql_failed_to_parse'],
+                             cust_sql_table['query_string'],
+                             cust_sql_table['database_type']])
+
+
+def percentify(numerator, denominator):
+    # Guard against division by zero on sites with no matching content
+    if denominator == 0:
+        return 0.0
+    return (numerator / denominator) * 100
+
+
+def process_page(response, workbooks, datasources, table_stats):
+    customSQLTables = response['data']['customSQLTablesConnection']['nodes']
+
+    for table in customSQLTables:
+        table_stats['num_tables_seen'] += 1
+        table_stats['num_failed_parse'] += 1 if has_failed_sql(table) else 0
+
+        if len(table['columns']) == 0:
+            logging.debug("Table {} has no columns and will be skipped".format(table['id']))
+            table_stats['num_skipped'] += 1
+            continue
+
+        if len(table['columns'][0]['workbooks_directly_connected']) == 0:
+            logging.debug("Table {} has nothing in `workbooks_directly_connected` and will be skipped".format(table['id']))
+            table_stats['num_skipped'] += 1
+            continue
+
+        ## This CustomSQLTable connects to a workbook
+        if table['columns'][0]['workbooks_directly_connected'][0]['datasource']:
+            object_id = table['columns'][0]['workbooks_directly_connected'][0]['datasource']['workbook']['id']
+            process_table_for_collection(table, object_id, workbooks)
+
+        ## This CustomSQLTable connects to a published data source
+        else:
+            object_id = table['columns'][0]['datasources_directly_connected'][0]['datasource']['id']
+            process_table_for_collection(table, object_id, datasources)
+
+
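+# `collection` maps a workbook or published data source graph ID to the CustomSQL
+# tables it references, keyed by table graph ID:
+#   {object_id: {'customSQLTables': {table_id: {'sql_failed_to_parse': ..., 'query_string': ..., 'database_type': ...}}}}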
+def process_table_for_collection(table, object_id, collection):
+    ## First time we've seen this workbook or data source
+    if object_id not in collection:
+        collection[object_id] = {'customSQLTables': {}}
+
+    if table['id'] in collection[object_id]['customSQLTables']:
+        logging.debug('Seeing same CustomSQLTable twice. Skipping adding to dictionary. Table ID: {}'.format(table['id']))
+        return
+
+    collection[object_id]['customSQLTables'][table['id']] = {}
+    extract_sql_table_info(table, collection[object_id]['customSQLTables'][table['id']])
+
+    logging.info("Processed table id={} and added to collection".format(table['id']))
+
+
+# Unused variant of extract_sql_table_info that writes into an existing collection entry
+def test_extract(source_table_dict, collection, object_id):
+    table_info = collection[object_id]['customSQLTables'][source_table_dict['id']]
+    table_info['sql_failed_to_parse'] = has_failed_sql(source_table_dict)
+    table_info['query_string'] = source_table_dict['query']
+    table_info['database_type'] = source_table_dict['database']['connectionType']
+
+
+def extract_sql_table_info(source_table_dict, dest_table_dict):
+    dest_table_dict['sql_failed_to_parse'] = has_failed_sql(source_table_dict)
+    dest_table_dict['query_string'] = source_table_dict['query']
+    dest_table_dict['database_type'] = source_table_dict['database']['connectionType']
+
+
+def has_failed_sql(table):
+    # Catalog could not parse the custom SQL if no upstream tables were resolved
+    return len(table['tables']) == 0
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file