# -*- coding: UTF-8 -*-
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
import os
from collections import Counter
import pandas as pd
import numpy as np
from jinja2 import Template
from colorcet import glasbey
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
# CFPB consumer complaints dataset (CSV export), downloaded from
# https://www.consumerfinance.gov/data-research/consumer-complaints/search/?from=0&searchField=all&searchText=&size=25&sort=created_date_desc
# The path is relative, so it is resolved against the current working directory.
DATASET_FILE = '../Consumer_Complaints.csv'
# directory the rendered Sankey dashboard HTML file is written to
# (created at start-up if it does not already exist)
OUTPUT_DIR = '../dashboards'
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
# Jinja2 template for generating the Sankey diagram page using Google Charts.
# The script renders it with two variables:
#   edges      - the edge table, already serialized to a string
#   color_list - list of hex colors for the diagram
# NOTE(review): the template body below contains no `{{ edges }}` or
# `{{ color_list }}` placeholders, so rendering currently produces only the
# static text. The HTML/JavaScript markup appears to be missing - restore it.
sankey_template = Template("""
Google Visualization API Sample
""")
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def top_counts(series, k):
    """Return the ``k`` most frequent non-missing values of ``series``.

    Args:
        series: pandas Series whose values are counted.
        k: maximum number of (value, count) pairs to return.

    Returns:
        A list of ``(value, count)`` tuples ordered by descending count; ties
        keep first-seen order. Counts are plain Python ``int`` objects. The
        list is empty when the series has no non-missing values.
    """
    # Missing values are dropped so they never become a diagram label.
    return Counter(series.dropna()).most_common(k)


def build_sankey_edges(df, n_products=10, n_issues=10, n_companies=5):
    """Build the edge table for the Sankey dashboard.

    The first row is the header ``['Source', 'Target', 'Count', 'Product']``.
    The fourth column is ``'All'`` for the product overview rows and the
    product name for the per-product issue breakdown rows.

    Args:
        df: DataFrame with ``'Product'``, ``'Issue'`` and ``'Company'`` columns.
        n_products: number of most common products to include.
        n_issues: number of most common issues to include per product.
        n_companies: number of most common companies per product / per issue.

    Returns:
        A list of 4-element lists holding only native Python values, so that
        ``str()`` of the result does not depend on NumPy's scalar repr.
    """
    edges = [['Source', 'Target', 'Count', 'Product']]
    top_products = top_counts(df['Product'], n_products)

    # Filter the full frame once per product; both sections reuse the subset.
    product_rows = {
        product: df[df['Product'] == product] for product, _ in top_products
    }

    # 1. Most common companies by product
    #-------------------------------------------------------------------------#
    for product, product_total in top_products:
        edges.append(['All', product, product_total, 'All'])
        companies = top_counts(product_rows[product]['Company'], n_companies)
        for company, count in companies:
            edges.append([product, company, count, 'All'])

    # 2. Most common companies by issue, for each product
    #-------------------------------------------------------------------------#
    for product, _ in top_products:
        rows = product_rows[product]
        for issue, issue_total in top_counts(rows['Issue'], n_issues):
            edges.append([product, issue, issue_total, product])
            issue_rows = rows[rows['Issue'] == issue]
            for company, count in top_counts(issue_rows['Company'], n_companies):
                edges.append([issue, company, count, product])

    return edges


if __name__ == '__main__':
    # load dataset into Pandas DataFrame
    #-------------------------------------------------------------------------#
    df = pd.read_csv(DATASET_FILE)
    # create output directory if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # list of hex colors for use in the Sankey diagram (first 51 Glasbey colors)
    color_list = glasbey[:51]

    edges = build_sankey_edges(df)
    # Serialize the edge table as a list literal, one row per line.
    edges_str = str(edges).replace('], [', '],\n[')

    output_file = os.path.join(OUTPUT_DIR, 'SANKEY-dashboard.html')
    # Explicit UTF-8 so non-ASCII company names do not depend on the
    # platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(sankey_template.render(
            edges=edges_str,
            color_list=color_list))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#