{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"IEF-IVA-text-numeric-features.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"LI4LrAHPi4rN"},"source":["# Paso 1. Lectura del fichero"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"id":"t2mBw0-KGiO9","executionInfo":{"status":"ok","timestamp":1632768490431,"user_tz":-120,"elapsed":541,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}},"outputId":"8cb5ee54-59f8-4911-e89f-a0ac02a32ad0"},"source":["#Lectura del fichero\n","\n","import pandas as pd\n","github_storage = \"https://raw.githubusercontent.com/oeg-upm/Instituto-Estudios-Fiscales-ontologias/master/machine-learning/\"\n","\n","original_file = github_storage+\"datos-IVA.csv\"\n","\n","# Leer csv donde se encuentran los datos\n","df = pd.read_csv(original_file, sep=\";\", engine='python', header=None, names=['persona.actividadEmpresarial','sujetoPasivo.domicilioFiscal','operacion.tipoOperacion','operacion.lugarRealizacion','operacion.exencion','factura.importe','factura.fecha','operacion.docs','exencion.hechoImponibleCubiertoEnExencion','factura.descripcion','sujetoPasivo.acogidoARegimenEspecial','numFacturasMismoTopico','infraccion','tipoInfraccion'])\n","df = df.drop([0]) #eliminado porque no se lee bien la primera línea\n","df = df.drop(columns=['exencion.hechoImponibleCubiertoEnExencion']) #eliminado pues no tiene valores relevantes\n","df\n"],"execution_count":131,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
persona.actividadEmpresarialsujetoPasivo.domicilioFiscaloperacion.tipoOperacionoperacion.lugarRealizacionoperacion.exencionfactura.importefactura.fechaoperacion.docsfactura.descripcionsujetoPasivo.acogidoARegimenEspecialnumFacturasMismoTopicoinfracciontipoInfraccion
1venta de ropaToledoadquisicion de bienesMadridExencionRecargoEquivalencia140.022/7/21Camisa de hombre de color blanco. Pantalón vaq...Camisa de hombre de color blanco. Pantalón vaq...RecargoEquivalencia2.0falseNaN
2venta de calzadoMadridadquisicion de bienesMadridExencionRecargoEquivalencia180.023/7/21Mocasines de piel de color marrónMocasines de piel de color marrónRecargoEquivalencia3.0falseNaN
3venta de ropaToledoadquisicion de bienesToledoExencionRecargoEquivalencia230.024/7/21Camisa de hombre de color azul. Pantalones de ...Camisa de hombre de color azul. Pantalones de ...RecargoEquivalencia20.0falseNaN
4venta de calzadoAvilaadquisicion de bienesAvilaExencionRecargoEquivalencia120.025/7/21Zapatillas de deporte Adidas. Zapatos de cabal...Zapatillas de deporte Adidas. Zapatos de cabal...RecargoEquivalencia15.0falseNaN
5venta de artículos de pielMadridadquisicion de bienesToledoExencionRecargoEquivalencia560.026/7/21Abrigo de visón largo con mangas anchasAbrigo de visón largo con mangas anchasRecargoEquivalencia2.0trueinfraccionEnAdquisicionDeBienes
6joyeríaToledoadquisicion de bienesMadridExencionRecargoEquivalencia1200.027/7/21Reloj de oro con incrustaciones de diamantesReloj de oro con incrustaciones de diamantesRecargoEquivalencia3.0trueinfraccionEnAdquisicionDeBienes
7materiales de construcciónAvilaadquisicion de bienesAvilaExencionRecargoEquivalencia3400.028/7/2145 kg de cemento. 20 metros cuadrados de azule...45 kg de cemento. 20 metros cuadrados de azule...RecargoEquivalencia1.0trueinfraccionEnAdquisicionDeBienes
8joyeríaMadridadquisicion de bienesMadridExencionRecargoEquivalencia1800.029/7/21Collar de perlas.Collar de perlas.RecargoEquivalencia5.0trueinfraccionEnAdquisicionDeBienes
9materiales de construcciónMadridprestación de serviciosMadrid03500.030/7/2145 kg de cemento. 20 metros cuadrados de azule...45 kg de cemento. 20 metros cuadrados de azule...020.0falseNaN
10obras y edificacionesMadridprestación de serviciosMadrid012000.031/7/21Instalación de ventanas en habitaciones. Remod...Instalación de ventanas en habitaciones. Remod...03.0falseNaN
11materiales de construcciónToledoprestación de serviciosMadrid013500.01/8/21300kg de cemento. 850 metros cuadrados de azuj...300kg de cemento. 850 metros cuadrados de azuj...045.0falseNaN
12obras y edificacionesAvilaprestación de serviciosToledo01800.02/8/21Instalación de una ventana en buhardillaInstalación de una ventana en buhardilla03.0falseNaN
13materiales de construcciónMadridprestación de serviciosAvila02450.03/8/2145 kg de cemento. 20 metros cuadrados de azule...45 kg de cemento. 20 metros cuadrados de azule...0120.0truefaltaComunicacionEjecucionObra
14obras y edificacionesToledoprestación de serviciosMadrid013000.04/8/21Remodelación de fachada con fecha de 1/1/2020Remodelación de fachada con fecha de 1/1/202003.0truefaltaComunicacionEjecucionObra
15materiales de construcciónAvilaprestación de serviciosToledo011254.05/8/2145 kg de cemento. 20 metros cuadrados de azule...45 kg de cemento. 20 metros cuadrados de azule...045.0truefaltaComunicacionEjecucionObra
16obras y edificacionesMadridprestación de serviciosAvila0340.06/8/21Remodelación de fachada con fecha de 1/7/2021Remodelación de fachada con fecha de 1/7/202103.0truefaltaComunicacionEjecucionObra
17venta de electrónicaMadridadquisicionIntracomunitariaFrancia0340.07/8/21Venta de móvil de gama media. Samsung X3Venta de móvil de gama media. Samsung X304.0falseNaN
18venta de ropaToledoimportacionBienesAlemania0245.08/8/21cinco cqmisas de marca HMcinco cqmisas de marca HM025.0falseNaN
19venta de calzadoAvilaadquisicionIntracomunitariaFrancia01200.09/8/2130 mocasines de piel de vacuno de color marrón...30 mocasines de piel de vacuno de color marrón...030.0falseNaN
20joyeríaMadridimportacionBienesAlemania03422.010/8/21dos relojes de oro con incrustaciones de platados relojes de oro con incrustaciones de plata022.0falseNaN
21materiales de construcciónMadridadquisicionIntracomunitariaFrancia0234.011/8/21azulejos porcelánicos de alta gamaazulejos porcelánicos de alta gama0345.0trueinfraccionNoPresentarDeclaracion
22venta de artículos de pielToledoimportacionBienesAlemania0430.012/8/2110 bolsos de piel para mujer10 bolsos de piel para mujer022.0trueinfraccionNoPresentarDeclaracion
23venta de electrónicaAvilaadquisicionIntracomunitariaFrancia0222.013/8/212 calculadoras mutlifunción. 1 router inalámbrico2 calculadoras mutlifunción. 1 router inalámbrico01.0trueinfraccionNoPresentarDeclaracion
24venta de ropaMadridimportacionBienesAlemania013000.014/8/2130 camisas de caballero. 20 vaqueros de señora30 camisas de caballero. 20 vaqueros de señora023.0trueinfraccionNoPresentarDeclaracion
\n","
"],"text/plain":[" persona.actividadEmpresarial ... tipoInfraccion\n","1 venta de ropa ... NaN\n","2 venta de calzado ... NaN\n","3 venta de ropa ... NaN\n","4 venta de calzado ... NaN\n","5 venta de artículos de piel ... infraccionEnAdquisicionDeBienes\n","6 joyería ... infraccionEnAdquisicionDeBienes\n","7 materiales de construcción ... infraccionEnAdquisicionDeBienes\n","8 joyería ... infraccionEnAdquisicionDeBienes\n","9 materiales de construcción ... NaN\n","10 obras y edificaciones ... NaN\n","11 materiales de construcción ... NaN\n","12 obras y edificaciones ... NaN\n","13 materiales de construcción ... faltaComunicacionEjecucionObra\n","14 obras y edificaciones ... faltaComunicacionEjecucionObra\n","15 materiales de construcción ... faltaComunicacionEjecucionObra\n","16 obras y edificaciones ... faltaComunicacionEjecucionObra\n","17 venta de electrónica ... NaN\n","18 venta de ropa ... NaN\n","19 venta de calzado ... NaN\n","20 joyería ... NaN\n","21 materiales de construcción ... infraccionNoPresentarDeclaracion\n","22 venta de artículos de piel ... infraccionNoPresentarDeclaracion\n","23 venta de electrónica ... infraccionNoPresentarDeclaracion\n","24 venta de ropa ... infraccionNoPresentarDeclaracion\n","\n","[24 rows x 13 columns]"]},"metadata":{},"execution_count":131}]},{"cell_type":"markdown","metadata":{"id":"DOkrrLTvjIhh"},"source":["# Paso 2. Tratamiento de lenguaje natural. Preprocesado de columnas con textos"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5V9arbonGcaw","executionInfo":{"status":"ok","timestamp":1632768497268,"user_tz":-120,"elapsed":6489,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}},"outputId":"41114d41-1cac-4778-f8ba-305c0a471322"},"source":["# Librerías relacionadas con procesamiento de lenguaje natural\n","!python -m spacy download es_core_news_sm\n","\n","import nltk\n","nltk.download('stopwords')\n","nltk.download('wordnet')\n","nltk.download('punkt')\n","nltk.download('averaged_perceptron_tagger')\n","\n","from nltk.corpus import stopwords\n","from nltk.stem import PorterStemmer\n","from nltk.stem import WordNetLemmatizer\n","from nltk.stem import SnowballStemmer\n","\n","import spacy\n","import es_core_news_sm\n","\n","import codecs"],"execution_count":132,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting es_core_news_sm==2.2.5\n"," Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.2.5/es_core_news_sm-2.2.5.tar.gz (16.2 MB)\n","\u001b[K |████████████████████████████████| 16.2 MB 5.2 MB/s \n","\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from es_core_news_sm==2.2.5) (2.2.4)\n","Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (1.0.5)\n","Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (3.0.5)\n","Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (2.23.0)\n","Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (1.1.3)\n","Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (1.19.5)\n","Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (0.8.2)\n","Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (1.0.5)\n","Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (2.0.5)\n","Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (1.0.0)\n","Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (0.4.1)\n","Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (7.4.0)\n","Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (57.4.0)\n","Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->es_core_news_sm==2.2.5) (4.62.2)\n","Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->es_core_news_sm==2.2.5) (4.8.1)\n","Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->es_core_news_sm==2.2.5) (3.7.4.3)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->es_core_news_sm==2.2.5) (3.5.0)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->es_core_news_sm==2.2.5) (2.10)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->es_core_news_sm==2.2.5) (3.0.4)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->es_core_news_sm==2.2.5) (1.24.3)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->es_core_news_sm==2.2.5) (2021.5.30)\n","\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n","You can now load the model via spacy.load('es_core_news_sm')\n","[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data] Package stopwords is already up-to-date!\n","[nltk_data] Downloading package wordnet to /root/nltk_data...\n","[nltk_data] Package wordnet is already up-to-date!\n","[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n","[nltk_data] Downloading package averaged_perceptron_tagger to\n","[nltk_data] /root/nltk_data...\n","[nltk_data] Package averaged_perceptron_tagger is already up-to-\n","[nltk_data] date!\n"]}]},{"cell_type":"code","metadata":{"id":"wgaAHDeNGqAa","executionInfo":{"status":"ok","timestamp":1632768497269,"user_tz":-120,"elapsed":6,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}}},"source":["# Función de limpieza de tokens \n","def clear_first_token(token):\n"," result = \"\"\n"," char = '\\ufeff'\n"," for i in range(len(token)):\n"," if token[i] != char: result += token[i]\n"," return result\n","\n","def preprocess(text):\n"," # Texto en minusculas\n"," text_lower = text.lower()\n"," # Tokenize \n"," token_word = nltk.word_tokenize(text_lower, \"spanish\")\n"," # Algunos textos tienen caracteres raros al principio, por tanto hay que eliminarlos para que no influyan en el código \n"," print(token_word)\n"," token_word[0] = clear_first_token(token_word[0])\n"," # Stopwords de palabras españolas\n"," stopword_spanish = stopwords.words(\"spanish\")\n"," i = 0\n","\n"," while(i < len(token_word)):\n"," # Se eliminan los tokens que se encuentren dentro de las stopwords \n"," if token_word[i] in stopword_spanish:\n"," token_word.remove(token_word[i])\n"," #Se elimina cualquier token que sea distinto de caracteres alfanumericos\n"," elif not (token_word[i].isalpha()):\n"," token_word.remove(token_word[i])\n"," else:\n"," # Debido a que la libreria nltk no lematiza textos en español es necesario utilizar otra libreria de NLP la cual es Spacy\n"," word = sp(token_word[i])[0]\n"," token_word[i] = word.lemma_\n"," i = i+1\n"," return token_word"],"execution_count":133,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0c91XhcYGtpU","executionInfo":{"status":"ok","timestamp":1632768502430,"user_tz":-120,"elapsed":5166,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}},"outputId":"911589b1-70a8-44e4-c593-469128b4624e"},"source":["# Tokenización de las dos columnas com textos largos \n","sp = es_core_news_sm.load()\n","df['operacion.docs.tokens']=df['operacion.docs'].apply(preprocess)\n","df['factura.descripcion.tokens']=df['factura.descripcion'].apply(preprocess)"],"execution_count":134,"outputs":[{"output_type":"stream","name":"stdout","text":["['camisa', 'de', 'hombre', 'de', 'color', 'blanco', '.', 'pantalón', 'vaquero', 'de', 'talla', '42']\n","['mocasines', 'de', 'piel', 'de', 'color', 'marrón']\n","['camisa', 'de', 'hombre', 'de', 'color', 'azul', '.', 'pantalones', 'de', 'vestir', 'de', 'color', 'gris']\n","['zapatillas', 'de', 'deporte', 'adidas', '.', 'zapatos', 'de', 'caballero', 'de', 'talla', '43']\n","['abrigo', 'de', 'visón', 'largo', 'con', 'mangas', 'anchas']\n","['reloj', 'de', 'oro', 'con', 'incrustaciones', 'de', 'diamantes']\n","['45', 'kg', 'de', 'cemento', '.', '20', 'metros', 'cuadrados', 'de', 'azulejos', 'porcelanosa', '.', '30', 'sacos', 'de', 'yeso', '.', '2', 'kg', 'de', 'pegamento']\n","['collar', 'de', 'perlas', '.']\n","['45', 'kg', 'de', 'cemento', '.', '20', 'metros', 'cuadrados', 'de', 'azulejos', 'porcelanosa', '.', '30', 'sacos', 'de', 'yeso', '.', '2', 'kg', 'de', 'pegamento']\n","['instalación', 'de', 'ventanas', 'en', 'habitaciones', '.', 'remodelado', 'de', 'cocina', 'y', 'baño', 'principal']\n","['300kg', 'de', 'cemento', '.', '850', 'metros', 'cuadrados', 'de', 'azujelos', 'porcelánicos', '.']\n","['instalación', 'de', 'una', 'ventana', 'en', 'buhardilla']\n","['45', 'kg', 'de', 'cemento', '.', '20', 'metros', 'cuadrados', 'de', 'azulejos', 'porcelanosa', '.', '30', 'sacos', 'de', 'yeso', '.', '2', 'kg', 'de', 'pegamento', '.', 'con', 'fecha', 'de', '3/8/21']\n","['remodelación', 'de', 'fachada', 'con', 'fecha', 'de', '1/1/2020']\n","['45', 'kg', 'de', 'cemento', '.', '20', 'metros', 'cuadrados', 'de', 'azulejos', 'porcelanosa', '.', '30', 'sacos', 'de', 'yeso', '.', '2', 'kg', 'de', 'pegamento', '.', 'con', 'fecha', 'de', '3/8/21']\n","['remodelación', 'de', 'fachada', 'con', 'fecha', 'de', '1/7/2021']\n","['venta', 'de', 'móvil', 'de', 'gama', 'media', '.', 'samsung', 'x3']\n","['cinco', 'cqmisas', 'de', 'marca', 'hm']\n","['30', 'mocasines', 'de', 'piel', 'de', 'vacuno', 'de', 'color', 'marrón', 'y', 'negro']\n","['dos', 'relojes', 'de', 'oro', 'con', 'incrustaciones', 'de', 'plata']\n","['azulejos', 'porcelánicos', 'de', 'alta', 'gama']\n","['10', 'bolsos', 'de', 'piel', 'para', 'mujer']\n","['2', 'calculadoras', 'mutlifunción', '.', '1', 'router', 'inalámbrico']\n","['30', 'camisas', 'de', 'caballero', '.', '20', 'vaqueros', 'de', 'señora']\n","['camisa', 'de', 'hombre', 'de', 'color', 'blanco', '.', 'pantalón', 'vaquero', 'de', 'talla', '42']\n","['mocasines', 'de', 'piel', 'de', 'color', 'marrón']\n","['camisa', 'de', 'hombre', 'de', 'color', 'azul', '.', 'pantalones', 'de', 'vestir', 'de', 'color', 'gris']\n","['zapatillas', 'de', 'deporte', 'adidas', '.', 'zapatos', 'de', 'caballero', 'de', 'talla', '43']\n","['abrigo', 'de', 'visón', 'largo', 'con', 'mangas', 'anchas']\n","['reloj', 'de', 'oro', 'con', 'incrustaciones', 'de', 'diamantes']\n","['45', 'kg', 'de', 'cemento', '.', '20', 'metros', 'cuadrados', 'de', 'azulejos', 'porcelanosa', '.', '30', 'sacos', 'de', 'yeso', '.', '2', 'kg', 'de', 'pegamento']\n","['collar', 'de', 'perlas', '.']\n","['45', 'kg', 'de', 'cemento', '.', '20', 'metros', 'cuadrados', 'de', 'azulejos', 'porcelanosa', '.', '30', 'sacos', 'de', 'yeso', '.', '2', 'kg', 'de', 'pegamento']\n","['instalación', 'de', 'ventanas', 'en', 'habitaciones', '.', 'remodelado', 'de', 'cocina', 'y', 'baño', 'principal']\n","['300kg', 'de', 'cemento', '.', '850', 'metros', 'cuadrados', 'de', 'azujelos', 'porcelánicos', '.']\n","['instalación', 'de', 'una', 'ventana', 'en', 'buhardilla']\n","['45', 'kg', 'de', 'cemento', '.', '20', 'metros', 'cuadrados', 'de', 'azulejos', 'porcelanosa', '.', '30', 'sacos', 'de', 'yeso', '.', '2', 'kg', 'de', 'pegamento', '.', 'con', 'fecha', 'de', '3/8/21']\n","['remodelación', 'de', 'fachada', 'con', 'fecha', 'de', '1/1/2020']\n","['45', 'kg', 'de', 'cemento', '.', '20', 'metros', 'cuadrados', 'de', 'azulejos', 'porcelanosa', '.', '30', 'sacos', 'de', 'yeso', '.', '2', 'kg', 'de', 'pegamento', '.', 'con', 'fecha', 'de', '3/8/21']\n","['remodelación', 'de', 'fachada', 'con', 'fecha', 'de', '1/7/2021']\n","['venta', 'de', 'móvil', 'de', 'gama', 'media', '.', 'samsung', 'x3']\n","['cinco', 'cqmisas', 'de', 'marca', 'hm']\n","['30', 'mocasines', 'de', 'piel', 'de', 'vacuno', 'de', 'color', 'marrón', 'y', 'negro']\n","['dos', 'relojes', 'de', 'oro', 'con', 'incrustaciones', 'de', 'plata']\n","['azulejos', 'porcelánicos', 'de', 'alta', 'gama']\n","['10', 'bolsos', 'de', 'piel', 'para', 'mujer']\n","['2', 'calculadoras', 'mutlifunción', '.', '1', 'router', 'inalámbrico']\n","['30', 'camisas', 'de', 'caballero', '.', '20', 'vaqueros', 'de', 'señora']\n"]}]},{"cell_type":"markdown","metadata":{"id":"czHb82uvjUct"},"source":["# Paso 3. Preparación del pipeline de procesamiento"]},{"cell_type":"code","metadata":{"id":"yQbGMam8j7yY","executionInfo":{"status":"ok","timestamp":1632768502431,"user_tz":-120,"elapsed":19,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}}},"source":["import numpy as np\n","from sklearn.impute import SimpleImputer\n","from sklearn.preprocessing import StandardScaler, OneHotEncoder\n","from sklearn.compose import ColumnTransformer\n","from sklearn.linear_model import LogisticRegression\n","import matplotlib.pyplot as plt\n","from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.pipeline import Pipeline, FeatureUnion\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.preprocessing import FunctionTransformer\n","from sklearn.model_selection import GridSearchCV, StratifiedKFold\n","from sklearn.impute import SimpleImputer\n","from sklearn.preprocessing import StandardScaler, OneHotEncoder\n","from sklearn.compose import ColumnTransformer\n","from sklearn.linear_model import LogisticRegression"],"execution_count":135,"outputs":[]},{"cell_type":"code","metadata":{"id":"Y1ZaVOlW0fVT","executionInfo":{"status":"ok","timestamp":1632768502432,"user_tz":-120,"elapsed":18,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}}},"source":["# Function Transformer para el Feature Union\n","def get_numeric_data(x):\n"," return [record[:-2].astype(float) for record in x]\n","\n","def get_text_data(x):\n"," return [record[-1] for record in x]\n","\n","transfomer_numeric = FunctionTransformer(get_numeric_data)\n","transformer_text = FunctionTransformer(get_text_data)"],"execution_count":136,"outputs":[]},{"cell_type":"code","metadata":{"id":"9LnmaqVR0nOD","executionInfo":{"status":"ok","timestamp":1632769522060,"user_tz":-120,"elapsed":208,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}}},"source":["# Features\n","numeric_features = ['factura.importe', 'numFacturasMismoTopico']\n","numeric_transformer = Pipeline(steps=[\n"," ('imputer', SimpleImputer(strategy='median')),\n"," ('scaler', StandardScaler())])\n","\n","categorical_features = ['persona.actividadEmpresarial', 'sujetoPasivo.domicilioFiscal', 'operacion.tipoOperacion', 'operacion.lugarRealizacion', 'operacion.exencion', 'sujetoPasivo.acogidoARegimenEspecial']\n","categorical_transformer = OneHotEncoder(handle_unknown='ignore')\n","\n","textual_features = ['operacion.docs', 'factura.descripcion']\n","textual_transformer = Pipeline(steps=[\n"," ('vec', TfidfVectorizer(analyzer='word'))])\n","\n","#factura.fecha\n","#infraccion\n","#tipoInfraccion\n","#operacion.docs.tokens \n","#factura.descripcion.tokens \n","\n","preprocessor = ColumnTransformer(\n"," transformers=[\n"," ('num', numeric_transformer, numeric_features),\n"," ('cat', categorical_transformer, categorical_features),\n"," ('text', textual_transformer, textual_features)])\n","\n","\n","\n","# Pipeline para concatenar Vector Tfidf y datos numéricos\n","#pipeline = Pipeline([\n","# ('features', FeatureUnion([\n","# ('numeric_features', Pipeline([\n","# ('selector', transfomer_numeric)\n","# ])),\n","# ('text_features', Pipeline([\n","# ('selector', transformer_text),\n","# ('vec', TfidfVectorizer(analyzer='word'))\n","# ]))\n","# ])),\n","# ('clf', RandomForestClassifier())\n","#])\n","\n","\n","# Añadiendo el clasificador\n","pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n"," # ('features', FeatureUnion(['num','text'])),\n"," ('clf', RandomForestClassifier())])\n"],"execution_count":161,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"collapsed":true,"id":"Gw23I0Qs1EGe","executionInfo":{"status":"error","timestamp":1632769530075,"user_tz":-120,"elapsed":2029,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}},"outputId":"dae98c01-b079-4561-adc0-b86179991409"},"source":["# Grid Search Parameters for RandomForest\n","param_grid = {'clf__n_estimators': np.linspace(1, 100, 10, dtype=int),\n"," 'clf__min_samples_split': [3, 10],\n"," 'clf__min_samples_leaf': [3],\n"," 'clf__max_features': [7],\n"," 'clf__max_depth': [None],\n"," 'clf__criterion': ['gini'],\n"," 'clf__bootstrap': [False]}\n"," \n","# Training config\n","kfold = StratifiedKFold(n_splits=7)\n","scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}\n","refit = 'F1'\n","\n","# Perform GridSearch\n","rf_model = GridSearchCV(pipeline, param_grid=param_grid, cv=kfold, scoring=scoring, \n"," refit=refit, n_jobs=-1, return_train_score=True, verbose=1)\n","\n","\n","X_Train, X_Test, Y_Train, Y_Test = model_selection.train_test_split(df.iloc[:,0:11],df['infraccion'],test_size=0.2, shuffle = True, random_state=0)\n","\n","#X_Train = X_Train.iloc[:,0:11]\n","#X_train\n","#X_train = df.iloc[:,0:1]\n","#X_train = df['persona.actividadEmpresarial']\n","print(X_Train)\n","#Y_Train = Y_Train[:,0]\n","#print(Y_Train)\n","print(Y_Train)\n","\n","datatypes = df.dtypes\n","print('Data type of each column of Dataframe :')\n","print(datatypes)\n","filteredColumns = df.dtypes[df.dtypes == np.object]\n","# list of columns whose data type is object i.e. string\n","listOfColumnNames = list(filteredColumns.index)\n","print(listOfColumnNames)\n","\n","\n","rf_model.fit(X_Train, Y_Train)\n","rf_best = rf_model.best_estimator_"],"execution_count":162,"outputs":[{"output_type":"stream","name":"stdout","text":[" persona.actividadEmpresarial ... numFacturasMismoTopico\n","2 venta de calzado ... 3.0\n","14 obras y edificaciones ... 3.0\n","24 venta de ropa ... 23.0\n","17 venta de electrónica ... 4.0\n","9 materiales de construcción ... 20.0\n","7 materiales de construcción ... 1.0\n","18 venta de ropa ... 25.0\n","5 venta de artículos de piel ... 2.0\n","3 venta de ropa ... 20.0\n","6 joyería ... 3.0\n","19 venta de calzado ... 30.0\n","10 obras y edificaciones ... 3.0\n","8 joyería ... 5.0\n","20 joyería ... 22.0\n","4 venta de calzado ... 15.0\n","1 venta de ropa ... 2.0\n","22 venta de artículos de piel ... 22.0\n","16 obras y edificaciones ... 3.0\n","13 materiales de construcción ... 120.0\n","\n","[19 rows x 11 columns]\n","2 false\n","14 true\n","24 true\n","17 false\n","9 false\n","7 true\n","18 false\n","5 true\n","3 false\n","6 true\n","19 false\n","10 false\n","8 true\n","20 false\n","4 false\n","1 false\n","22 true\n","16 true\n","13 true\n","Name: infraccion, dtype: object\n","Data type of each column of Dataframe :\n","persona.actividadEmpresarial object\n","sujetoPasivo.domicilioFiscal object\n","operacion.tipoOperacion object\n","operacion.lugarRealizacion object\n","operacion.exencion object\n","factura.importe float64\n","factura.fecha object\n","operacion.docs object\n","factura.descripcion object\n","sujetoPasivo.acogidoARegimenEspecial object\n","numFacturasMismoTopico float64\n","infraccion object\n","tipoInfraccion object\n","operacion.docs.tokens object\n","factura.descripcion.tokens object\n","dtype: object\n","['persona.actividadEmpresarial', 'sujetoPasivo.domicilioFiscal', 'operacion.tipoOperacion', 'operacion.lugarRealizacion', 'operacion.exencion', 'factura.fecha', 'operacion.docs', 'factura.descripcion', 'sujetoPasivo.acogidoARegimenEspecial', 'infraccion', 'tipoInfraccion', 'operacion.docs.tokens', 'factura.descripcion.tokens']\n","Fitting 7 folds for each of 20 candidates, totalling 140 fits\n"]},{"output_type":"stream","name":"stderr","text":["[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n","[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed: 1.9s finished\n"]},{"output_type":"error","ename":"ValueError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0mrf_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_Train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY_Train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m \u001b[0mrf_best\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrf_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[0mrefit_start_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 738\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0my\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 739\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 740\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 741\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0mThis\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 349\u001b[0m \"\"\"\n\u001b[0;32m--> 350\u001b[0;31m \u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 351\u001b[0m with _print_elapsed_time('Pipeline',\n\u001b[1;32m 352\u001b[0m self._log_message(len(self.steps) - 1)):\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 313\u001b[0m \u001b[0mmessage_clsname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Pipeline'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 314\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_log_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstep_idx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 315\u001b[0;31m **fit_params_steps[name])\n\u001b[0m\u001b[1;32m 316\u001b[0m \u001b[0;31m# Replace the transformer of the step with the fitted\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0;31m# transformer. This is necessary when loading the transformer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/joblib/memory.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 350\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 352\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 353\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcall_and_shelve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36m_fit_transform_one\u001b[0;34m(transformer, X, y, weight, message_clsname, message, **fit_params)\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_print_elapsed_time\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_clsname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 727\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransformer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'fit_transform'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 728\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransformer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 729\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 730\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransformer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 538\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 539\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 540\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 541\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 542\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py\u001b[0m in \u001b[0;36m_hstack\u001b[0;34m(self, Xs)\u001b[0m\n\u001b[1;32m 621\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 622\u001b[0m \u001b[0mXs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msparse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0missparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mXs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 623\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 624\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m<__array_function__ internals>\u001b[0m in \u001b[0;36mhstack\u001b[0;34m(*args, **kwargs)\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/numpy/core/shape_base.py\u001b[0m in \u001b[0;36mhstack\u001b[0;34m(tup)\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_nx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_nx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m<__array_function__ internals>\u001b[0m in \u001b[0;36mconcatenate\u001b[0;34m(*args, **kwargs)\u001b[0m\n","\u001b[0;31mValueError\u001b[0m: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 19 and the array at index 2 has size 2"]}]},{"cell_type":"code","metadata":{"id":"-var7L_iIMNw","executionInfo":{"status":"aborted","timestamp":1632768502909,"user_tz":-120,"elapsed":26,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}}},"source":["from sklearn.model_selection import train_test_split\n","from sklearn.feature_extraction.text import CountVectorizer\n","from sklearn import model_selection, naive_bayes, svm\n","from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix\n","import pickle \n","import matplotlib.pyplot as plt\n","from collections import Counter"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"fyon9CbEpqVn"},"source":["# Paso 4. Evaluación del modelo (no realizado aún)"]},{"cell_type":"code","metadata":{"id":"_kfmUzPRxXla","executionInfo":{"status":"aborted","timestamp":1632768502912,"user_tz":-120,"elapsed":28,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}}},"source":["# Evaluacion del modelo \n","predictions_SVM = SVM.predict(Test_X_Tfidf)\n","print(\"Accuracy del modelo -> \",accuracy_score(predictions_SVM, Test_Y)*100)\n","\n","print(\"Porcentaje de acierto para cada una de las clases: \")\n","print(classification_report(Test_Y, predictions_SVM))\n","\n","print(\"Matriz de confusion test: \")\n","plt.clf()\n","fig,ax = plt.subplots(figsize=(12,12))\n","plot_confusion_matrix(SVM, Test_X_Tfidf, Test_Y, cmap=plt.cm.Blues, ax=ax, values_format='d')\n","plt.show() \n","\n","print(\"Matriz de confusion train: \")\n","plt.clf()\n","fig,ax = plt.subplots(figsize=(12,12))\n","plot_confusion_matrix(SVM, Train_X_Tfidf, Train_Y, cmap=plt.cm.Blues, ax=ax, values_format='d')\n","plt.show() \n"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"d9YiH6iBPfIr","executionInfo":{"status":"aborted","timestamp":1632768502921,"user_tz":-120,"elapsed":30,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}}},"source":["# Algoritmo SVM utilizando cross-validation\n","from sklearn.model_selection import cross_val_score\n","\n","Tfidf_vect = TfidfVectorizer()\n","Tfidf_vect.fit(df_procesado['Tokens'])\n","\n","X = Tfidf_vect.transform(df_procesado['Tokens'])\n","\n","SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', random_state=0)\n","scores = cross_val_score(SVM, X, df_procesado['ODS'], cv=10)\n","\n","for index, value in enumerate(scores):\n"," print(f\"Accuracy del fold {index} -> {value:.2%}\")\n","\n","print(f\"Accuracy medio: {scores.mean():.2%}, desviación estandar: {scores.std():.2%}\")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"TE1Vw4L1L99V","executionInfo":{"status":"aborted","timestamp":1632768502922,"user_tz":-120,"elapsed":31,"user":{"displayName":"Oscar Corcho","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi3cF7DAfhib0_PTrsWLs1ct7LY6beP-TSnuSL4=s64","userId":"18270436930746444615"}}},"source":["# Variables X e Y para entrenar y probar el algoritmo SVM\n","# Training = 100%\n","Train_X = df_procesado['Tokens']\n","Train_Y = df_procesado['ODS']\n","\n","# TF-IDF\n","Tfidf_vect = TfidfVectorizer()\n","Tfidf_vect.fit(df_procesado['Tokens'])\n","\n","Train_X_Tfidf = Tfidf_vect.transform(Train_X)"],"execution_count":null,"outputs":[]}]}