{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Árvores de Decisão com Python: Base de Dados Risco de Crédito\n", "### Implementando o algoritmo de árvores de decisão usando a base de dados \"Risco de Crédito\" para classificar riscos de empréstimos" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# importando a biblioteca pandas do python\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# atribuindo ao \"dataframe\" todos os registros da base de dados\n", "dataframe = pd.read_csv(\"risco_credito.csv\", encoding = \"utf-8\", sep = \",\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
historiadividagarantiasrendarisco
0ruimaltanenhuma0_15alto
1desconhecidaaltanenhuma15_35alto
2desconhecidabaixanenhuma15_35moderado
3desconhecidabaixanenhumaacima_35alto
4desconhecidabaixanenhumaacima_35baixo
5desconhecidabaixaadequadaacima_35baixo
6ruimbaixanenhuma0_15alto
7ruimbaixaadequadaacima_35moderado
8boabaixanenhumaacima_35baixo
9boaaltaadequadaacima_35baixo
10boaaltanenhuma0_15alto
11boaaltanenhuma15_35moderado
12boaaltanenhumaacima_35baixo
13ruimaltanenhuma15_35alto
\n", "
" ], "text/plain": [ " historia divida garantias renda risco\n", "0 ruim alta nenhuma 0_15 alto\n", "1 desconhecida alta nenhuma 15_35 alto\n", "2 desconhecida baixa nenhuma 15_35 moderado\n", "3 desconhecida baixa nenhuma acima_35 alto\n", "4 desconhecida baixa nenhuma acima_35 baixo\n", "5 desconhecida baixa adequada acima_35 baixo\n", "6 ruim baixa nenhuma 0_15 alto\n", "7 ruim baixa adequada acima_35 moderado\n", "8 boa baixa nenhuma acima_35 baixo\n", "9 boa alta adequada acima_35 baixo\n", "10 boa alta nenhuma 0_15 alto\n", "11 boa alta nenhuma 15_35 moderado\n", "12 boa alta nenhuma acima_35 baixo\n", "13 ruim alta nenhuma 15_35 alto" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# visualizando a base de dados\n", "dataframe" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# separando os atributos previsores do meta classe\n", "previsores = dataframe.iloc[:, 0:4].values\n", "meta_classe = dataframe.iloc[:, 4].values" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['ruim' 'alta' 'nenhuma' '0_15']\n", " ['desconhecida' 'alta' 'nenhuma' '15_35']\n", " ['desconhecida' 'baixa' 'nenhuma' '15_35']\n", " ['desconhecida' 'baixa' 'nenhuma' 'acima_35']\n", " ['desconhecida' 'baixa' 'nenhuma' 'acima_35']\n", " ['desconhecida' 'baixa' 'adequada' 'acima_35']\n", " ['ruim' 'baixa' 'nenhuma' '0_15']\n", " ['ruim' 'baixa' 'adequada' 'acima_35']\n", " ['boa' 'baixa' 'nenhuma' 'acima_35']\n", " ['boa' 'alta' 'adequada' 'acima_35']\n", " ['boa' 'alta' 'nenhuma' '0_15']\n", " ['boa' 'alta' 'nenhuma' '15_35']\n", " ['boa' 'alta' 'nenhuma' 'acima_35']\n", " ['ruim' 'alta' 'nenhuma' '15_35']]\n" ] } ], "source": [ "# visualizando os atributos previsores\n", "print(previsores)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ "['alto' 'alto' 'moderado' 'alto' 'baixo' 'baixo' 'alto' 'moderado' 'baixo'\n", " 'baixo' 'alto' 'moderado' 'baixo' 'alto']\n" ] } ], "source": [ "# visualizando os atributos meta classe\n", "print(meta_classe)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# importando a biblioteca sklearn do python\n", "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# definindo e configurando o objeto \"labelencoder\"\n", "labelencoder = LabelEncoder()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# transformando as variáveis categóricas em numéricas, nos previsores, usando o objeto \"labelencoder\" \n", "# criado\n", "previsores[:, 0] = labelencoder.fit_transform(previsores[:, 0])\n", "previsores[:, 1] = labelencoder.fit_transform(previsores[:, 1])\n", "previsores[:, 2] = labelencoder.fit_transform(previsores[:, 2])\n", "previsores[:, 3] = labelencoder.fit_transform(previsores[:, 3])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[2 0 1 0]\n", " [1 0 1 1]\n", " [1 1 1 1]\n", " [1 1 1 2]\n", " [1 1 1 2]\n", " [1 1 0 2]\n", " [2 1 1 0]\n", " [2 1 0 2]\n", " [0 1 1 2]\n", " [0 0 0 2]\n", " [0 0 1 0]\n", " [0 0 1 1]\n", " [0 0 1 2]\n", " [2 0 1 1]]\n" ] } ], "source": [ "# visualizando os atributos previsores\n", "print(previsores)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# importando a biblioteca sklearn do python\n", "from sklearn.tree import DecisionTreeClassifier" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# configurando e definindo o objeto \"classificador\"\n", "classificador = DecisionTreeClassifier(criterion = \"entropy\")" ] }, { "cell_type": "code", "execution_count": 13, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',\n", " max_depth=None, max_features=None, max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, presort='deprecated',\n", " random_state=None, splitter='best')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fazendo o treinamento com a base de dados no objeto criado \"classificador\". Nele, será criado a árvore de\n", "# decisão\n", "classificador.fit(previsores, meta_classe)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0.48015651 0.03885431 0.03885431 0.44213486]\n" ] } ], "source": [ "# observando o resultando de ganho para cada um dos campos de dados. O maior valor tem mais relevância na\n", "# hierárquia da árvore de decisão\n", "print(classificador.feature_importances_)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "|**Valor de ganho na hierárquia da árvore de decisões**|\n", "|:-----------------------------------------------------|\n", "|1 - Renda|\n", "|2 - História|\n", "|3 - Dívida|\n", "|4 - Garantias|" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# importando a biblioteca sklearn do python\n", "from sklearn.tree import DecisionTreeClassifier, export" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# criando o arquivo \"arvore.dot\" com códigos renderizáveis no Graphviz\n", "export.export_graphviz(classificador, out_file = 'arvore.dot', \n", " feature_names = ['história', 'dívidas', 'garantia', 'renda'],\n", " class_names = ['alto', 'moderado', 'baixo'],\n", " filled = True, leaves_parallel = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 
Links para o Graphviz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- [Graphviz Online](https://dreampuf.github.io/GraphvizOnline/);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- [Árvore de Decisão para a base de dados risco de crédito](https://dreampuf.github.io/GraphvizOnline/#digraph%20Tree%20%7B%0Anode%20%5Bshape%3Dbox%2C%20style%3D%22filled%22%2C%20color%3D%22black%22%5D%20%3B%0Agraph%20%5Branksep%3Dequally%2C%20splines%3Dpolyline%5D%20%3B%0A0%20%5Blabel%3D%22renda%20%3C%3D%201.5%5Cnentropy%20%3D%201.531%5Cnsamples%20%3D%2014%5Cnvalue%20%3D%20%5B6%2C%205%2C%203%5D%5Cnclass%20%3D%20alto%22%2C%20fillcolor%3D%22%23fcf1e9%22%5D%20%3B%0A1%20%5Blabel%3D%22hist%C3%B3ria%20%3C%3D%201.5%5Cnentropy%20%3D%200.863%5Cnsamples%20%3D%207%5Cnvalue%20%3D%20%5B5%2C%200%2C%202%5D%5Cnclass%20%3D%20alto%22%2C%20fillcolor%3D%22%23efb388%22%5D%20%3B%0A0%20-%3E%201%20%5Blabeldistance%3D2.5%2C%20labelangle%3D45%2C%20headlabel%3D%22True%22%5D%20%3B%0A2%20%5Blabel%3D%22renda%20%3C%3D%200.5%5Cnentropy%20%3D%201.0%5Cnsamples%20%3D%204%5Cnvalue%20%3D%20%5B2%2C%200%2C%202%5D%5Cnclass%20%3D%20alto%22%2C%20fillcolor%3D%22%23ffffff%22%5D%20%3B%0A1%20-%3E%202%20%3B%0A3%20%5Blabel%3D%22entropy%20%3D%200.0%5Cnsamples%20%3D%201%5Cnvalue%20%3D%20%5B1%2C%200%2C%200%5D%5Cnclass%20%3D%20alto%22%2C%20fillcolor%3D%22%23e58139%22%5D%20%3B%0A2%20-%3E%203%20%3B%0A4%20%5Blabel%3D%22d%C3%ADvidas%20%3C%3D%200.5%5Cnentropy%20%3D%200.918%5Cnsamples%20%3D%203%5Cnvalue%20%3D%20%5B1%2C%200%2C%202%5D%5Cnclass%20%3D%20baixo%22%2C%20fillcolor%3D%22%23c09cf2%22%5D%20%3B%0A2%20-%3E%204%20%3B%0A5%20%5Blabel%3D%22hist%C3%B3ria%20%3C%3D%200.5%5Cnentropy%20%3D%201.0%5Cnsamples%20%3D%202%5Cnvalue%20%3D%20%5B1%2C%200%2C%201%5D%5Cnclass%20%3D%20alto%22%2C%20fillcolor%3D%22%23ffffff%22%5D%20%3B%0A4%20-%3E%205%20%3B%0A6%20%5Blabel%3D%22entropy%20%3D%200.0%5Cnsamples%20%3D%201%5Cnvalue%20%3D%20%5B0%2C%200%2C%201%5D%5Cnclass%20%3D%20baixo%22%2C%20fillcolor%3D%22%238139e5%22%5D%20%3B%0A5%20-%3E%206
%20%3B%0A7%20%5Blabel%3D%22entropy%20%3D%200.0%5Cnsamples%20%3D%201%5Cnvalue%20%3D%20%5B1%2C%200%2C%200%5D%5Cnclass%20%3D%20alto%22%2C%20fillcolor%3D%22%23e58139%22%5D%20%3B%0A5%20-%3E%207%20%3B%0A8%20%5Blabel%3D%22entropy%20%3D%200.0%5Cnsamples%20%3D%201%5Cnvalue%20%3D%20%5B0%2C%200%2C%201%5D%5Cnclass%20%3D%20baixo%22%2C%20fillcolor%3D%22%238139e5%22%5D%20%3B%0A4%20-%3E%208%20%3B%0A9%20%5Blabel%3D%22entropy%20%3D%200.0%5Cnsamples%20%3D%203%5Cnvalue%20%3D%20%5B3%2C%200%2C%200%5D%5Cnclass%20%3D%20alto%22%2C%20fillcolor%3D%22%23e58139%22%5D%20%3B%0A1%20-%3E%209%20%3B%0A10%20%5Blabel%3D%22hist%C3%B3ria%20%3C%3D%201.5%5Cnentropy%20%3D%201.149%5Cnsamples%20%3D%207%5Cnvalue%20%3D%20%5B1%2C%205%2C%201%5D%5Cnclass%20%3D%20moderado%22%2C%20fillcolor%3D%22%237beeab%22%5D%20%3B%0A0%20-%3E%2010%20%5Blabeldistance%3D2.5%2C%20labelangle%3D-45%2C%20headlabel%3D%22False%22%5D%20%3B%0A11%20%5Blabel%3D%22hist%C3%B3ria%20%3C%3D%200.5%5Cnentropy%20%3D%200.65%5Cnsamples%20%3D%206%5Cnvalue%20%3D%20%5B1%2C%205%2C%200%5D%5Cnclass%20%3D%20moderado%22%2C%20fillcolor%3D%22%2361ea9a%22%5D%20%3B%0A10%20-%3E%2011%20%3B%0A12%20%5Blabel%3D%22entropy%20%3D%200.0%5Cnsamples%20%3D%203%5Cnvalue%20%3D%20%5B0%2C%203%2C%200%5D%5Cnclass%20%3D%20moderado%22%2C%20fillcolor%3D%22%2339e581%22%5D%20%3B%0A11%20-%3E%2012%20%3B%0A13%20%5Blabel%3D%22garantia%20%3C%3D%200.5%5Cnentropy%20%3D%200.918%5Cnsamples%20%3D%203%5Cnvalue%20%3D%20%5B1%2C%202%2C%200%5D%5Cnclass%20%3D%20moderado%22%2C%20fillcolor%3D%22%239cf2c0%22%5D%20%3B%0A11%20-%3E%2013%20%3B%0A14%20%5Blabel%3D%22entropy%20%3D%200.0%5Cnsamples%20%3D%201%5Cnvalue%20%3D%20%5B0%2C%201%2C%200%5D%5Cnclass%20%3D%20moderado%22%2C%20fillcolor%3D%22%2339e581%22%5D%20%3B%0A13%20-%3E%2014%20%3B%0A15%20%5Blabel%3D%22entropy%20%3D%201.0%5Cnsamples%20%3D%202%5Cnvalue%20%3D%20%5B1%2C%201%2C%200%5D%5Cnclass%20%3D%20alto%22%2C%20fillcolor%3D%22%23ffffff%22%5D%20%3B%0A13%20-%3E%2015%20%3B%0A16%20%5Blabel%3D%22entropy%20%3D%200.0%5Cnsamples%20%3D%201%5Cnvalue%20%3D%20%5B0%2C%2
00%2C%201%5D%5Cnclass%20%3D%20baixo%22%2C%20fillcolor%3D%22%238139e5%22%5D%20%3B%0A10%20-%3E%2016%20%3B%0A%7Brank%3Dsame%20%3B%200%7D%20%3B%0A%7Brank%3Dsame%20%3B%201%3B%2010%7D%20%3B%0A%7Brank%3Dsame%20%3B%202%3B%2011%7D%20%3B%0A%7Brank%3Dsame%20%3B%204%3B%2013%7D%20%3B%0A%7Brank%3Dsame%20%3B%205%7D%20%3B%0A%7Brank%3Dsame%20%3B%203%3B%206%3B%207%3B%208%3B%209%3B%2012%3B%2014%3B%2015%3B%2016%7D%20%3B%0A%7D);" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "![Árvore de Decisão para a base de dados Risco de Crédito](Graphviz/graphviz.png)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "|**Cliente com Baixo Risco**|\n", "|:-------------------------------------|\n", "|História de Crédito: **Boa** (0)|\n", "|Dívida: **Alta** (0)|\n", "|Garantias: **Nenhuma** (1)|\n", "|Renda: **>35** (2)|" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "|**Cliente de Alto Risco**|\n", "|:-------------------------------------|\n", "|História de Crédito: **Ruim** (2)|\n", "|Dívida: **Alta** (0)|\n", "|Garantias: **Adequada** (0)|\n", "|Renda: **<15** (0)|" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# evaluating the model on the two clients described in the tables above\n", "resultado = classificador.predict([[0,0,1,2], [2,0,0,0]])" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['baixo' 'alto']\n" ] } ], "source": [ "# showing the predictions for the two clients above\n", "print(resultado)" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['alto' 'baixo' 'moderado']\n" ] } ], "source": [ "# showing the class labels stored in \"classificador\"\n", "print(classificador.classes_)" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
"# function that predicts the loan risk of one applicant\n",
"def risco_creditos(historia, divida, garantia, renda):\n",
"    \"\"\"Print the loan risk that the fitted tree predicts for one applicant.\n",
"\n",
"    Every argument is a category label of the matching column of the data set. It is\n",
"    replaced by the integer code that the column shows in the encoded 'previsores'\n",
"    array, where the labels of a column are numbered in sorted order. A value that is\n",
"    not one of the labels listed below is handed to the classifier unchanged.\n",
"\n",
"    Args:\n",
"        historia: 'boa', 'desconhecida' or 'ruim'.\n",
"        divida: 'alta' or 'baixa'.\n",
"        garantia: 'adequada' or 'nenhuma'.\n",
"        renda: '0_15', '15_35' or 'acima_35'.\n",
"\n",
"    Returns:\n",
"        None; the prediction is printed, not returned.\n",
"    \"\"\"\n",
"    codigos_historia = {\"boa\": 0, \"desconhecida\": 1, \"ruim\": 2}\n",
"    codigos_divida = {\"alta\": 0, \"baixa\": 1}\n",
"    # \"adequada\" sorts before \"nenhuma\", so the encoded data uses 0 for it and 1 for \"nenhuma\"\n",
"    codigos_garantia = {\"adequada\": 0, \"nenhuma\": 1}\n",
"    codigos_renda = {\"0_15\": 0, \"15_35\": 1, \"acima_35\": 2}\n",
"\n",
"    # same column order as the training data: historia, divida, garantias, renda\n",
"    registro = [codigos_historia.get(historia, historia),\n",
"                codigos_divida.get(divida, divida),\n",
"                codigos_garantia.get(garantia, garantia),\n",
"                codigos_renda.get(renda, renda)]\n",
"\n",
"    retorno = classificador.predict([registro])\n",
"\n",
"    print(\"Risco de Empréstimo: {}\".format(retorno))\n",
"\n",
"    return" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Risco de Empréstimo: ['baixo']\n" ] } ], "source": [ "# using the function to predict the risk of the first client\n", "risco_creditos(historia = \"boa\", divida = \"alta\", garantia = \"nenhuma\", renda = \"acima_35\")" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Risco de Empréstimo: ['alto']\n" ] } ], "source": [ "# using the function to predict the risk of the second client\n", "risco_creditos(historia = \"ruim\", divida = \"alta\", garantia = \"adequada\", renda = \"0_15\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Alguma dúvida? 
Entre em contato comigo:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- [Me envie um e-mail](mailto:alysson.barbosa@ee.ufcg.edu.br)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 4 }