# -*- coding: utf-8 -*-
"""
Generate samples into testing/samples dir to compare against future
images as a means of visually checking for errors.

Run with working directory as main dtreeviz dir so this code can see
dtreeviz package and data paths are set correctly.

Every ``viz_*`` function below fits a scikit-learn decision tree on one
data set and returns whatever ``dtreeviz(...)`` returns for it.  They all
share the same keyword parameters:

    orientation     forwarded to ``dtreeviz`` ("TD" and "LR" are the values
                    used by this script)
    max_depth       maximum depth of the fitted tree
    random_state    seed handed to the scikit-learn estimator
    fancy           forwarded to ``dtreeviz``
    pickX           when true, one randomly chosen training row is passed
                    to ``dtreeviz`` as ``X``; otherwise ``X`` is None
    label_fontsize  forwarded to ``dtreeviz``
    ticks_fontsize  forwarded to ``dtreeviz``
    fontname        forwarded to ``dtreeviz``; for boston and iris the value
                    "TakaoPGothic" additionally switches to Japanese
                    feature-name prefixes

The ``__main__`` driver discovers the functions by their ``viz_`` prefix,
so helpers must not use that prefix.
"""
# NOTE(review): the import list is kept exactly as found, in the original
# order, because the two star imports may rely on it.  Several names look
# unused in this script, and numpy.distutils, fetch_mldata and load_boston
# have been removed from recent NumPy / scikit-learn releases -- confirm
# against the pinned versions before pruning.
import numpy as np
import pandas as pd
import graphviz
import graphviz.backend
from numpy.distutils.system_info import f2py_info
from sklearn import tree
from sklearn.datasets import load_boston, load_iris, load_wine, load_digits, load_breast_cancer, load_diabetes, fetch_mldata
from matplotlib.figure import figaspect
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns
from dtreeviz.shadow import *
from numbers import Number
import matplotlib.patches as patches
from scipy import stats
from sklearn.neighbors import KernelDensity
import inspect
import sys
import tempfile
from dtreeviz.trees import *


# REGRESSION

def viz_boston(orientation="TD", max_depth=3, random_state=666, fancy=True,
               pickX=False, label_fontsize=12, ticks_fontsize=8,
               fontname="Arial"):
    """Fit a regression tree on the Boston data set and visualize it."""
    regr = tree.DecisionTreeRegressor(
        max_depth=max_depth, random_state=random_state)
    boston = load_boston()
    regr.fit(boston.data, boston.target)

    X = None
    if pickX:
        X = boston.data[np.random.randint(0, len(boston.data)), :]

    # Non-ASCII feature names exercise rendering with the Japanese font.
    if fontname == "TakaoPGothic":
        feature_names = [f"特徴量{x}" for x in boston.feature_names]
    else:
        feature_names = boston.feature_names

    viz = dtreeviz(regr,
                   boston.data,
                   boston.target,
                   target_name='price',
                   feature_names=feature_names,
                   orientation=orientation,
                   fancy=fancy,
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


def viz_diabetes(orientation="TD", max_depth=3, random_state=666, fancy=True,
                 pickX=False, label_fontsize=12, ticks_fontsize=8,
                 fontname="Arial"):
    """Fit a regression tree on the diabetes data set and visualize it."""
    diabetes = load_diabetes()
    regr = tree.DecisionTreeRegressor(
        max_depth=max_depth, random_state=random_state)
    regr.fit(diabetes.data, diabetes.target)

    X = None
    if pickX:
        X = diabetes.data[np.random.randint(0, len(diabetes.data)), :]

    viz = dtreeviz(regr,
                   diabetes.data,
                   diabetes.target,
                   target_name='progr',
                   feature_names=diabetes.feature_names,
                   orientation=orientation,
                   fancy=fancy,
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


def viz_sweets(orientation="TD", max_depth=3, random_state=666, fancy=True,
               pickX=False, label_fontsize=12, ticks_fontsize=8,
               fontname="Arial"):
    """Fit a regression tree on a sample of data/sweetrs.csv and visualize it."""
    sweets = pd.read_csv("data/sweetrs.csv")
    sweets = sweets.sample(n=500)  # just grab 500 of 17k for plotting

    X_train, y_train = sweets.drop('rating', axis=1), sweets['rating']
    regr = tree.DecisionTreeRegressor(
        max_depth=max_depth, random_state=random_state)
    regr.fit(X_train, y_train)

    X = None
    if pickX:
        X = X_train.iloc[np.random.randint(0, len(X_train))]

    viz = dtreeviz(regr,
                   X_train,
                   y_train,
                   target_name='rating',
                   # Use the training columns: the full frame still
                   # contains the 'rating' target, which would shift the
                   # labels unless it happens to be the last column.
                   feature_names=X_train.columns,
                   orientation=orientation,
                   fancy=fancy,
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


def viz_fires(orientation="TD", max_depth=3, random_state=666, fancy=True,
              pickX=False, label_fontsize=12, ticks_fontsize=8,
              fontname="Arial"):
    """Fit a regression tree on data/forestfires.csv and visualize it."""
    fires = pd.read_csv("data/forestfires.csv")
    # Replace the 'month' and 'day' columns by 1-based category codes so
    # the tree can be fitted on numeric data.
    fires['month'] = fires['month'].astype('category').cat.as_ordered()
    fires['month'] = fires['month'].cat.codes + 1
    fires['day'] = fires['day'].astype('category').cat.as_ordered()
    fires['day'] = fires['day'].cat.codes + 1

    X_train, y_train = fires.drop('area', axis=1), fires['area']
    regr = tree.DecisionTreeRegressor(
        max_depth=max_depth, random_state=random_state)
    regr.fit(X_train, y_train)

    X = None
    if pickX:
        X = X_train.iloc[np.random.randint(0, len(X_train))].values

    viz = dtreeviz(regr,
                   X_train,
                   y_train,
                   target_name='area',
                   # Training columns only; the full frame also holds the
                   # 'area' target column.
                   feature_names=X_train.columns,
                   orientation=orientation,
                   fancy=fancy,
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


# CLASSIFICATION

def viz_iris(orientation="TD", max_depth=3, random_state=666, fancy=True,
             pickX=False, label_fontsize=12, ticks_fontsize=8,
             fontname="Arial"):
    """Fit a classification tree on the iris data set and visualize it."""
    clf = tree.DecisionTreeClassifier(
        max_depth=max_depth, random_state=random_state)
    iris = load_iris()
    clf.fit(iris.data, iris.target)

    # Non-ASCII feature names exercise rendering with the Japanese font.
    if fontname == "TakaoPGothic":
        feature_names = [f"特徴量{x}" for x in iris.feature_names]
    else:
        feature_names = iris.feature_names

    X = None
    if pickX:
        X = iris.data[np.random.randint(0, len(iris.data)), :]

    viz = dtreeviz(clf,
                   iris.data,
                   iris.target,
                   target_name='variety',
                   feature_names=feature_names,
                   orientation=orientation,
                   class_names=["setosa",
                                "versicolor",
                                "virginica"],  # 0,1,2 targets
                   fancy=fancy,
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


def viz_digits(orientation="TD", max_depth=3, random_state=666, fancy=True,
               pickX=False, label_fontsize=12, ticks_fontsize=8,
               fontname="Arial"):
    """Fit a classification tree on the digits data set and visualize it."""
    clf = tree.DecisionTreeClassifier(
        max_depth=max_depth, random_state=random_state)
    digits = load_digits()

    # "8x8 image of integer pixels in the range 0..16."
    columns = [f'pixel[{i},{j}]' for i in range(8) for j in range(8)]

    clf.fit(digits.data, digits.target)

    X = None
    if pickX:
        X = digits.data[np.random.randint(0, len(digits.data)), :]

    viz = dtreeviz(clf,
                   digits.data,
                   digits.target,
                   target_name='number',
                   feature_names=columns,
                   orientation=orientation,
                   class_names=[chr(c) for c in range(ord('0'), ord('9')+1)],
                   fancy=fancy,
                   histtype='bar',
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


def viz_wine(orientation="TD", max_depth=3, random_state=666, fancy=True,
             pickX=False, label_fontsize=12, ticks_fontsize=8,
             fontname="Arial"):
    """Fit a classification tree on the wine data set and visualize it."""
    clf = tree.DecisionTreeClassifier(
        max_depth=max_depth, random_state=random_state)
    wine = load_wine()
    clf.fit(wine.data, wine.target)

    X = None
    if pickX:
        X = wine.data[np.random.randint(0, len(wine.data)), :]

    viz = dtreeviz(clf,
                   wine.data,
                   wine.target,
                   target_name='wine',
                   feature_names=wine.feature_names,
                   orientation=orientation,
                   class_names=list(wine.target_names),
                   fancy=fancy,
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


def viz_breast_cancer(orientation="TD", max_depth=3, random_state=666,
                      fancy=True, pickX=False, label_fontsize=12,
                      ticks_fontsize=8, fontname="Arial"):
    """Fit a classification tree on the breast cancer data set and visualize it."""
    clf = tree.DecisionTreeClassifier(
        max_depth=max_depth, random_state=random_state)
    cancer = load_breast_cancer()
    clf.fit(cancer.data, cancer.target)

    X = None
    if pickX:
        # The bound must be the number of samples.  len(cancer) would be
        # the number of keys of the dict-like Bunch returned by the loader.
        X = cancer.data[np.random.randint(0, len(cancer.data)), :]

    viz = dtreeviz(clf,
                   cancer.data,
                   cancer.target,
                   target_name='cancer',
                   feature_names=cancer.feature_names,
                   orientation=orientation,
                   class_names=list(cancer.target_names),
                   fancy=fancy,
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


def viz_knowledge(orientation="TD", max_depth=3, random_state=666, fancy=True,
                  pickX=False, label_fontsize=12, ticks_fontsize=8,
                  fontname="Arial"):
    """Fit a classification tree on data/knowledge.csv and visualize it."""
    # data from https://archive.ics.uci.edu/ml/datasets/User+Knowledge+Modeling
    clf = tree.DecisionTreeClassifier(
        max_depth=max_depth, random_state=random_state)
    know = pd.read_csv("data/knowledge.csv")

    # Map the textual 'UNS' labels to their position in target_names.
    target_names = ['very_low', 'Low', 'Middle', 'High']
    know['UNS'] = know['UNS'].map({n: i for i, n in enumerate(target_names)})

    X_train, y_train = know.drop('UNS', axis=1), know['UNS']
    clf.fit(X_train, y_train)

    X = None
    if pickX:
        X = X_train.iloc[np.random.randint(0, len(X_train))]

    viz = dtreeviz(clf,
                   X_train,
                   y_train,
                   target_name='UNS',
                   feature_names=X_train.columns.values,
                   orientation=orientation,
                   class_names=target_names,
                   fancy=fancy,
                   X=X,
                   label_fontsize=label_fontsize,
                   ticks_fontsize=ticks_fontsize,
                   fontname=fontname)
    return viz


def save(name, dirname, orientation, max_depth, fancy=True, pickX=False,
         fontname="Arial"):
    """
    Render the sample called `name` and save it as an SVG file in `dirname`.

    The visualization is produced by the module-level function
    ``viz_<name>``.  The file is named
    ``<name>-<orientation>-<max_depth>[-X]-<fontname>[-simple].svg`` where
    ``-X`` is present when `pickX` is true and ``-simple`` when `fancy` is
    false.

    Raises ValueError if this module defines no ``viz_<name>`` function.
    """
    print(f"Process {name} orientation={orientation} max_depth={max_depth} fancy={fancy}, pickX={pickX}, fontname={fontname}")

    # Resolve the generator from `name` instead of relying on a loop
    # variable leaked by the __main__ driver, so save() also works when
    # called from elsewhere.
    try:
        viz_func = globals()[f"viz_{name}"]
    except KeyError:
        raise ValueError(f"no viz_{name} function in this module") from None

    viz = viz_func(orientation=orientation,
                   max_depth=max_depth,
                   fancy=fancy,
                   pickX=pickX,
                   fontname=fontname)

    X = "-X" if pickX else ""
    filename = f"{name}-{orientation}-{max_depth}{X}-{fontname}"
    if not fancy:
        filename = filename+"-simple"

    path = f"{dirname}/{filename}.svg"
    print(path)
    viz.save(path)

    # NOTE: rendering a PNG directly through graphviz
    # (dot -Gdpi=300 -Tpng ...) did not give a good image.  Do the
    # conversion on the command line instead:
    #
    # $ convert -density 300x300 boston-TD-2.pdf foo.png


def main():
    """
    Generate the sample images for every ``viz_*`` function in this module.

    The output directory is the first command-line argument, or "." when
    none is given.
    """
    this_module = sys.modules[__name__]
    all_functions = inspect.getmembers(this_module, inspect.isfunction)
    # Keep only functions defined here; the star imports pull in others.
    these_functions = [t for t in all_functions
                       if inspect.getmodule(t[1]) == this_module]
    viz_funcs = [func for func_name, func in these_functions
                 if func_name.startswith('viz_')]

    if len(sys.argv) > 1:
        dirname = sys.argv[1]
    else:
        dirname = "."

    print(f"tmp dir is {tempfile.gettempdir()}")
    for func in viz_funcs:
        name = func.__name__[len("viz_"):]
        # if name!='sweets': continue
        save(name, dirname, "TD", 2)
        save(name, dirname, "TD", 4)
        if name == 'iris':
            save(name, dirname, "TD", 5)
            save(name, dirname, "TD", 5, pickX=True)
            save(name, dirname, "TD", 5, pickX=True, fontname="TakaoPGothic")
        if name == 'boston':
            # save(name, dirname, "TD", 3)
            # save(name, dirname, "TD", 5, fancy=False, pickX=True)
            # save(name, dirname, "LR", 5, fancy=False, pickX=True)
            save(name, dirname, "LR", 5, fancy=False, pickX=True, fontname="TakaoPGothic")
        if name == 'knowledge':
            save(name, dirname, "TD", 15, fancy=False, pickX=True)
        save(name, dirname, "LR", 3)
        save(name, dirname, "TD", 4, fancy=False)
        save(name, dirname, "LR", 2, pickX=True)
        save(name, dirname, "TD", 3, pickX=True)


if __name__ == '__main__':
    main()