Extracting SQL details from all OLAP cubes in a project (real data lineage)

Manager, Sales Engineering • MicroStrategy

This Python script example will help you to: 1) List all OLAP cubes in your project 2) Download real SQL query for each cube 3) Parse the SQL and extract details about tables and columns being used 4) Export gathered details to CSV files for further processing

In my previous Community posts I have presented Data Lineage Python scripts that help understanding connections between all Strategy objects like Reports, Attributes, Metrics, Filters, Tables and others. Those connections were "hypothetical" meaning that relation between an attribute and table exist, but in reality this connection may never been used in real data queries (because other, faster, relations to other tables exist).

This example focuses on real relations and real SQL queries. Please see below Python code that you can use to extract this information from your projects. This has been tested with Strategy One update 11 and mstrio 11.3.10.101

Please note - I've added color formatting in the code below ( http://hilite.me/). Code copied from here might not work due to indentation issues. In that case, use attached file with sample code. Also this CRM is adding ';' after URLs.


from mstrio.project_objects import list_olap_cubes, OlapCube, SuperCube, list_all_cubes, list_reports, Report
from mstrio import connection
#from mstrio.project_objects.report import list_reports
import re
import datetime
import getpass
import csv, json, pickle, itertools
import pandas as pd

# libraries that you should install to parse SQL and get tables
from sql_metadata import Parser

# Define a variable which can be later used in a script
SERVER_URL = "https://env-XXXXXX.customer.cloud.Strategy.com/StrategyLibrary"
USERNAME = "mstr"
PASSWORD = getpass.getpass(prompt='Password ')
PROJECT_NAME = "Strategy Tutorial"

def export_to_csv(level_id, df):
    df.to_csv(f"{level_id}.csv", index=False, encoding='utf-8', sep=",", quoting=csv.QUOTE_ALL, escapechar="\\", header=True)

def unique_list(alist): # sorts a list of lists
    alist.sort()
    alist=list(alist for alist,_ in itertools.groupby(alist))
    return alist

#conn = get_connection(workstationData, project_name=PROJECT_NAME)
conn = connection.Connection(SERVER_URL, username=USERNAME, password=PASSWORD, project_name=PROJECT_NAME)
conn.connect()

# List OLAP Cubes with different conditions
#list_of_all_cubes = list_all_cubes(connection=conn)
list_of_all_cubes_as_dicts = list_all_cubes(connection=conn, to_dictionary=True)
print("\nFound " + str(len(list_of_all_cubes_as_dicts)) + " cubes\n")

# Limit number of cubes to be analyzed, 779 is data import cube (might contain custom SQL code)
cubes_subtypes=[776, 779]
list_of_all_cubes_as_dicts=[cu for cu in list_of_all_cubes_as_dicts if cu['subtype'] in cubes_subtypes]
print(f"Number of cubes: {len(list_of_all_cubes_as_dicts)}")

# Print
for cube in list_of_all_cubes_as_dicts: 
	print(f"{cube['name']} - ID: {cube['id']} - type: {cube['type']} - subtype: {cube['subtype']} - ext_type: {cube['ext_type']}")

tables_list, columns_list=[],[]
cubes_list=list_of_all_cubes_as_dicts[:]
pattern = r"(select\s+.*?)\n\n"

for i,cube in enumerate(cubes_list):
    cube_id, cube_name, cube_supbtype = cube["id"], cube["name"], cube["subtype"]
    print(f"\n{'='*5}| {i+1} / {len(cubes_list)} - {cube_name} - id: {cube_id} |{'='*30}")

    try:
        current_cube = OlapCube(conn, cube_id)
        sql_view = current_cube.export_sql_view()
        #print(sql_view)
    except:
        print(f"Failed to get the SQL for cube {cube_name} {cube_id}")
        sql_view = ""

    if sql_view:
        matches = re.findall(pattern, sql_view, flags=re.DOTALL | re.IGNORECASE)
        for match in matches:
            match=" ".join(match.split())
            parser = Parser(match)
            
            for table in parser.tables:
                if not table.startswith(("ZZ","*")):
                    #print(f"\"{cube_id}\",\"{cube_name}\",\"{table}\"")
                    tables_list.append([PROJECT_NAME,cube_id,cube_name,cube_supbtype,table])

            for column in parser.columns:
                if not column.startswith(("ZZ","*")):
                    #print(f"\"{cube_id}\",\"{cube_name}\",\"{column}\"")
                    columns_list.append([PROJECT_NAME,cube_id,cube_name,cube_supbtype,column])

print("FINITO")

### Export to CSV

tables_list=unique_list(tables_list)
columns_list=unique_list(columns_list)

headers=["Project Name", "Cube ID", "Cube Name", "Cube Suntype", "Table Name"]
df = pd.DataFrame(tables_list, columns = headers)

headers=["Project Name", "Cube ID", "Cube Name", "Cube Subtype", "Column Name"]
dfc = pd.DataFrame(columns_list, columns = headers)

export_to_csv("tables", df)
export_to_csv("columns", dfc)



# List all reports in the project
# Future development
reports = list_reports(connection=conn, to_dictionary=True)

rep=Report(conn, "CEF3C2B64463B7202F49AF81FCA9BE35")

Comment

0 comments

Details

Example

Published:

October 19, 2023

Last Updated:

October 19, 2023

from mstrio.project_objects import list_olap_cubes, OlapCube, SuperCube, list_all_cubes, list_reports, Report from mstrio import connection #from mstrio.project_objects.report import list_reports import re import datetime import getpass import csv, json, pickle, itertools import pandas as pd # libraries that you should install to parse SQL and get tables from sql_metadata import Parser # Define a variable which can be later used in a script SERVER_URL = "https://env-XXXXXX.customer.cloud.Strategy.com/StrategyLibrary" USERNAME = "mstr" PASSWORD = getpass.getpass(prompt='Password ') PROJECT_NAME = "Strategy Tutorial" def export_to_csv(level_id, df): df.to_csv(f"{level_id}.csv", index=False, encoding='utf-8', sep=",", quoting=csv.QUOTE_ALL, escapechar="\\", header=True) def unique_list(alist): # sorts a list of lists alist.sort() alist=list(alist for alist,_ in itertools.groupby(alist)) return alist #conn = get_connection(workstationData, project_name=PROJECT_NAME) conn = connection.Connection(SERVER_URL, username=USERNAME, password=PASSWORD, project_name=PROJECT_NAME) conn.connect() # List OLAP Cubes with different conditions #list_of_all_cubes = list_all_cubes(connection=conn) list_of_all_cubes_as_dicts = list_all_cubes(connection=conn, to_dictionary=True) print("\nFound " + str(len(list_of_all_cubes_as_dicts)) + " cubes\n") # Limit number of cubes to be analyzed, 779 is data import cube (might contain custom SQL code) cubes_subtypes=[776, 779] list_of_all_cubes_as_dicts=[cu for cu in list_of_all_cubes_as_dicts if cu['subtype'] in cubes_subtypes] print(f"Number of cubes: {len(list_of_all_cubes_as_dicts)}") # Print for cube in list_of_all_cubes_as_dicts: print(f"{cube['name']} - ID: {cube['id']} - type: {cube['type']} - subtype: {cube['subtype']} - ext_type: {cube['ext_type']}") tables_list, columns_list=[],[] cubes_list=list_of_all_cubes_as_dicts[:] pattern = r"(select\s+.*?)\n\n" for i,cube in enumerate(cubes_list): cube_id, cube_name, cube_supbtype = cube["id"], cube["name"], cube["subtype"] print(f"\n{'='*5}| {i+1} / {len(cubes_list)} - {cube_name} - id: {cube_id} |{'='*30}") try: current_cube = OlapCube(conn, cube_id) sql_view = current_cube.export_sql_view() #print(sql_view) except: print(f"Failed to get the SQL for cube {cube_name} {cube_id}") sql_view = "" if sql_view: matches = re.findall(pattern, sql_view, flags=re.DOTALL | re.IGNORECASE) for match in matches: match=" ".join(match.split()) parser = Parser(match) for table in parser.tables: if not table.startswith(("ZZ","*")): #print(f"\"{cube_id}\",\"{cube_name}\",\"{table}\"") tables_list.append([PROJECT_NAME,cube_id,cube_name,cube_supbtype,table]) for column in parser.columns: if not column.startswith(("ZZ","*")): #print(f"\"{cube_id}\",\"{cube_name}\",\"{column}\"") columns_list.append([PROJECT_NAME,cube_id,cube_name,cube_supbtype,column]) print("FINITO") ### Export to CSV tables_list=unique_list(tables_list) columns_list=unique_list(columns_list) headers=["Project Name", "Cube ID", "Cube Name", "Cube Suntype", "Table Name"] df = pd.DataFrame(tables_list, columns = headers) headers=["Project Name", "Cube ID", "Cube Name", "Cube Subtype", "Column Name"] dfc = pd.DataFrame(columns_list, columns = headers) export_to_csv("tables", df) export_to_csv("columns", dfc) # List all reports in the project # Future development reports = list_reports(connection=conn, to_dictionary=True) rep=Report(conn, "CEF3C2B64463B7202F49AF81FCA9BE35")