# -*- coding: UTF-8 -*-
"""Build a Sankey dashboard (HTML) from the CFPB consumer-complaints dataset.

The script reads DATASET_FILE, counts the most frequent products, issues and
companies, turns those counts into a list of Sankey edges and renders them
through ``sankey_template`` into OUTPUT_DIR/SANKEY-dashboard.html.
"""
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
import os
from collections import Counter

import pandas as pd
import numpy as np
from jinja2 import Template
from colorcet import glasbey

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
# CFPB data downloaded from
# https://www.consumerfinance.gov/data-research/consumer-complaints/search/?from=0&searchField=all&searchText=&size=25&sort=created_date_desc
DATASET_FILE = '../Consumer_Complaints.csv'

# directory to write the generated Sankey dashboard to
OUTPUT_DIR = '../dashboards'

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
# template for generating Sankey Diagram using Google charts
# NOTE(review): as visible here the template text contains no markup and no
# ``edges`` / ``color_list`` placeholders -- the HTML/JS body may have been
# lost upstream. Confirm against the original file before relying on it.
sankey_template = Template(""" Google Visualization API Sample
""")


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def _top_labels(values, limit):
    """Return the `limit` most frequent entries of `values` as plain strings.

    Labels are converted with ``str`` so that they are native Python strings
    rather than NumPy scalars: ``str(list)`` uses ``repr`` on each element and
    NumPy >= 2.0 renders scalars as ``np.str_('...')``, which is not valid
    inside the JavaScript array emitted into the dashboard.  An empty input
    simply yields an empty list.
    """
    return [str(label) for label, _ in Counter(values).most_common(limit)]


def _company_edges(frame, source, group, limit):
    """Return ``[source, company, count, group]`` rows for the top companies.

    `frame` is the already-filtered DataFrame whose 'Company' column is
    counted; at most `limit` rows are returned, most frequent first.
    """
    top_companies = Counter(frame['Company']).most_common(limit)
    return [[source, company, count, group] for company, count in top_companies]


def _build_edges(df, n_labels=10, n_companies=5):
    """Build the Sankey edge table for the dashboard.

    The first row is the header ``['Source', 'Target', 'Count', 'Product']``.
    Rows tagged 'All' link 'All' -> product -> company; rows tagged with a
    product name link product -> issue -> company for that product.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the 'Product', 'Issue' and 'Company' columns.
    n_labels : int
        How many products (and issues per product) to keep.
    n_companies : int
        How many companies to keep under each product / issue.
    """
    products = _top_labels(df['Product'], n_labels)
    edges = [['Source', 'Target', 'Count', 'Product']]

    # 1. Most common companies by product
    #-------------------------------------------------------------------------#
    for product in products:
        # filter once and reuse the subset for both the total and the counts
        product_df = df[df['Product'] == product]
        # len() gives a native int (np.sum would give np.int64, see _top_labels)
        edges.append(['All', product, len(product_df), 'All'])
        edges.extend(_company_edges(product_df, product, 'All', n_companies))

    # 2. Most common companies by issue, for each product
    #-------------------------------------------------------------------------#
    for product in products:
        product_df = df[df['Product'] == product]
        for issue in _top_labels(product_df['Issue'], n_labels):
            issue_df = product_df[product_df['Issue'] == issue]
            edges.append([product, issue, len(issue_df), product])
            edges.extend(_company_edges(issue_df, issue, product, n_companies))

    return edges


def main():
    """Load the dataset, build the edge table and write the dashboard file."""
    # load dataset into Pandas DataFrame
    #-------------------------------------------------------------------------#
    df = pd.read_csv(DATASET_FILE)

    # create output directory if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # list of hex colors for use in the Sankey diagram
    color_list = glasbey[:51]

    edges = _build_edges(df)

    # one edge per line keeps the generated page readable
    edges_str = str(edges).replace('], [', '],\n[')

    output_file = os.path.join(OUTPUT_DIR, 'SANKEY-dashboard.html')
    with open(output_file, 'w') as f:
        f.write(sankey_template.render(edges=edges_str, color_list=color_list))


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
if __name__ == '__main__':
    main()
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#