{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Import Packages" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "import string\n", "from nltk.corpus import stopwords\n", "from nltk.stem.snowball import SpanishStemmer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.ensemble import VotingClassifier\n", "from sklearn.multiclass import OneVsRestClassifier\n", "from xgboost import XGBClassifier\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import f1_score\n", "from sklearn.metrics import log_loss\n", "from nltk.corpus import stopwords\n", "from scipy.sparse import hstack" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load Data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# The sheet is selected positionally because the keyword was renamed from\n", "# `sheetname` to `sheet_name` across pandas releases; index 1 is the second sheet.\n", "train_data = pd.read_excel('train_universidad.xlsx', 1)\n", "test_data = pd.read_excel('test_universidad.xlsx')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get Columns" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Index(['COD_ENCUESTADO', 'Nombre Campus', 'NIVEL ACTUAL', 'Clave de carrera',\n", " 'Ciclo', 'COMENTARIO', 'IND_GEA', 'IND_DELEGADO',\n", " 'CANT_CURSOS_MATRICU_SIN_INGLES', 'UOD_depostista_ind_deportista',\n", " 'NPS'],\n", " dtype='object')" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### First 5 rows" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { 
"collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COD_ENCUESTADONombre CampusNIVEL ACTUALClave de carreraCicloCOMENTARIOIND_GEAIND_DELEGADOCANT_CURSOS_MATRICU_SIN_INGLESUOD_depostista_ind_deportistaNPS
0135011AC32Me gusta la u es paja bro y soy pitoNaNDelegado6.0NaN3
1236223AC251El metodo de blended no le hace bien a todosNaNDelegado5.0NaN3
283544AC311Los profesores, sus métodos de enseñanza bes ...NaNNaN5.0NaN4
3177454AC286Porque posee gran mayoría de profesores espec...GEANaN6.0NaN3
4108673AC341La pencionNaNNaN6.0NaN3
\n", "
" ], "text/plain": [ " COD_ENCUESTADO Nombre Campus NIVEL ACTUAL Clave de carrera Ciclo \\\n", "0 13501 1 AC 3 2 \n", "1 23622 3 AC 25 1 \n", "2 8354 4 AC 31 1 \n", "3 17745 4 AC 28 6 \n", "4 10867 3 AC 34 1 \n", "\n", " COMENTARIO IND_GEA IND_DELEGADO \\\n", "0 Me gusta la u es paja bro y soy pito NaN Delegado \n", "1 El metodo de blended no le hace bien a todos NaN Delegado \n", "2 Los profesores, sus métodos de enseñanza bes ... NaN NaN \n", "3 Porque posee gran mayoría de profesores espec... GEA NaN \n", "4 La pencion NaN NaN \n", "\n", " CANT_CURSOS_MATRICU_SIN_INGLES UOD_depostista_ind_deportista NPS \n", "0 6.0 NaN 3 \n", "1 5.0 NaN 3 \n", "2 5.0 NaN 4 \n", "3 6.0 NaN 3 \n", "4 6.0 NaN 3 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Append Data" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Stack train and test (test rows follow the train rows) so the cells below\n", "# preprocess both in one pass. pd.concat is used because DataFrame.append\n", "# is not available in current pandas releases.\n", "all_data = pd.concat([train_data, test_data])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding 'Comentario' length" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "all_data['COMENTARIO_LEN'] =all_data['COMENTARIO'].str.len()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Columns Sets" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data_cols= ['NIVEL ACTUAL',\n", " 'Ciclo', 'COMENTARIO', 'COMENTARIO_LEN', 'IND_GEA', 'IND_DELEGADO',\n", " 'CANT_CURSOS_MATRICU_SIN_INGLES', 'UOD_depostista_ind_deportista']\n", "\n", "model_cols= ['NIVEL ACTUAL',\n", " 'Ciclo', 'IND_GEA', 'IND_DELEGADO',\n", " 'CANT_CURSOS_MATRICU_SIN_INGLES', 'UOD_depostista_ind_deportista']\n", "\n", "model_log_cols= ['NIVEL ACTUAL',\n", " 'Ciclo', 'COMENTARIO_LEN','IND_GEA', 'IND_DELEGADO',\n", " 'CANT_CURSOS_MATRICU_SIN_INGLES', 
'UOD_depostista_ind_deportista']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cleaning Data" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Indicator columns hold a text label when the flag is set and NaN otherwise:\n", "# map the label to 1 and everything else (NaN included) to 0.\n", "# The raw IND_GEA label is 'GEA' (see train_data.head()), not the column name;\n", "# mapping on 'IND_GEA' turned every row into 0.\n", "all_data['IND_GEA'] = all_data['IND_GEA'].map({'GEA':1}).fillna(0)\n", "all_data['IND_DELEGADO'] = all_data['IND_DELEGADO'].map({'Delegado':1}).fillna(0)\n", "# NOTE(review): the 'Deportista' label does not appear in the sample rows shown - confirm against the raw data.\n", "all_data['UOD_depostista_ind_deportista'] = all_data['UOD_depostista_ind_deportista'].map({'Deportista':1}).fillna(0)\n", "# Missing course counts are filled with a hard-coded 4 - presumably a typical load; TODO confirm.\n", "all_data['CANT_CURSOS_MATRICU_SIN_INGLES'] = all_data['CANT_CURSOS_MATRICU_SIN_INGLES'].fillna(4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Encoding 'Nivel Actual'" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "le = LabelEncoder()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "all_data['NIVEL ACTUAL'] = le.fit_transform(all_data['NIVEL ACTUAL'])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "all_data = all_data.reset_index(drop=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Removing stopwords, punctuation, stemming." 
] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# A set gives constant-time membership tests during stop-word filtering.\n", "stop = set(stopwords.words('spanish'))\n", "stemmer = SpanishStemmer()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Translation table that deletes every character in string.punctuation.\n", "pretable = dict.fromkeys(string.punctuation)\n", "table = str.maketrans(pretable)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Drop stop words, comparing case-insensitively so capitalised forms such as\n", "# 'El' or 'La' are removed too, then strip punctuation from the remaining words.\n", "all_data['COMENTARIO'] = all_data['COMENTARIO'].apply(lambda x: ' '.join([word.translate(table) for word in x.split() if word.lower() not in stop]))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Normalise two known misspellings. Both steps must go through the .str accessor:\n", "# Series.replace only substitutes cells whose entire value equals the pattern,\n", "# which left 'pencion' untouched inside longer comments.\n", "all_data['COMENTARIO'] = (all_data['COMENTARIO']\n", "    .str.replace('enseñansa', 'enseñanza')\n", "    .str.replace('pencion', 'pension'))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Second stop-word pass: catches words that only match once punctuation has been\n", "# stripped (e.g. a stop word that was followed by a comma); the survivors are stemmed.\n", "all_data['COMENTARIO'] = all_data['COMENTARIO'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split() if word.lower() not in stop]))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CANT_CURSOS_MATRICU_SIN_INGLESCOD_ENCUESTADOCOMENTARIOCicloClave de carreraIND_DELEGADOIND_GEANIVEL ACTUALNPSNombre CampusUOD_depostista_ind_deportistaCOMENTARIO_LEN
06.013501me gust u paj bro pit231.00.003.010.037
15.023622el metod blend hac bien1251.00.003.030.045
25.08354los profesor metod enseñ bes buen el uso tecno...1310.00.004.040.0168
36.017745porqu pose gran mayor profesor especializ enseñ6280.00.003.040.073
46.010867la pencion1340.00.003.030.011
\n", "
" ], "text/plain": [ " CANT_CURSOS_MATRICU_SIN_INGLES COD_ENCUESTADO \\\n", "0 6.0 13501 \n", "1 5.0 23622 \n", "2 5.0 8354 \n", "3 6.0 17745 \n", "4 6.0 10867 \n", "\n", " COMENTARIO Ciclo Clave de carrera \\\n", "0 me gust u paj bro pit 2 3 \n", "1 el metod blend hac bien 1 25 \n", "2 los profesor metod enseñ bes buen el uso tecno... 1 31 \n", "3 porqu pose gran mayor profesor especializ enseñ 6 28 \n", "4 la pencion 1 34 \n", "\n", " IND_DELEGADO IND_GEA NIVEL ACTUAL NPS Nombre Campus \\\n", "0 1.0 0.0 0 3.0 1 \n", "1 1.0 0.0 0 3.0 3 \n", "2 0.0 0.0 0 4.0 4 \n", "3 0.0 0.0 0 3.0 4 \n", "4 0.0 0.0 0 3.0 3 \n", "\n", " UOD_depostista_ind_deportista COMENTARIO_LEN \n", "0 0.0 37 \n", "1 0.0 45 \n", "2 0.0 168 \n", "3 0.0 73 \n", "4 0.0 11 " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Split Data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# all_data was built with the train rows first and its index reset to 0..N-1, so the\n", "# first len(train_data) labels are the training rows and the rest are the rows to score.\n", "# .loc slices are label-based and inclusive, hence the `- 1`.\n", "n_train = len(train_data)\n", "X = all_data.loc[:n_train - 1,data_cols]\n", "y = all_data.loc[:n_train - 1,'NPS']\n", "X_final = all_data.loc[n_train:,data_cols]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x_train,x_test, y_train, y_test = train_test_split(X,y, test_size=0.1,random_state=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vectorize 'Comentarios'" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "vec = TfidfVectorizer(ngram_range=(1,3), min_df=0.001, max_df=0.6,strip_accents='unicode')" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=, encoding='utf-8', input='content',\n", " lowercase=True, max_df=0.6, max_features=None, min_df=0.001,\n", " 
ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,\n", " stop_words=None, strip_accents='unicode', sublinear_tf=False,\n", " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", " vocabulary=None)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vec" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=, encoding='utf-8', input='content',\n", " lowercase=True, max_df=0.6, max_features=None, min_df=0.001,\n", " ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,\n", " stop_words=None, strip_accents='unicode', sublinear_tf=False,\n", " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", " vocabulary=None)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vec.fit(x_train['COMENTARIO'])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['10',\n", " '100',\n", " '11',\n", " 'abastec',\n", " 'abren',\n", " 'abrir',\n", " 'acab',\n", " 'academ',\n", " 'acced',\n", " 'acces',\n", " 'accesibil',\n", " 'acept',\n", " 'acerc',\n", " 'acondicion',\n", " 'acord',\n", " 'acredit',\n", " 'acredit wasc',\n", " 'activ',\n", " 'actual',\n", " 'actualiz',\n", " 'acuerd',\n", " 'adecu',\n", " 'adem',\n", " 'ademas',\n", " 'adicional',\n", " 'administr',\n", " 'administracion',\n", " 'admision',\n", " 'adquir',\n", " 'afect',\n", " 'agrad',\n", " 'agrand',\n", " 'agreg',\n", " 'ahi',\n", " 'ahor',\n", " 'air',\n", " 'air acondicion',\n", " 'al',\n", " 'algui',\n", " 'algun',\n", " 'algun curs',\n", " 'algun profesor',\n", " 'almorz',\n", " 'almuerz',\n", " 'alt',\n", " 'alta',\n", " 'alta calid',\n", " 'altas',\n", " 'alto',\n", " 'alto nivel',\n", " 'altos',\n", " 'alumn',\n", " 
'alumn aprend',\n", " 'alumn buen',\n", " 'alumn campus',\n", " 'alumn deb',\n", " 'alumn epe',\n", " 'alumn hac',\n", " 'alumn mal',\n", " 'alumn mas',\n", " 'alumn matricul',\n", " 'alumn mejor',\n", " 'alumn pesim',\n", " 'alumn profesor',\n", " 'alumn pued',\n", " 'alumn sed',\n", " 'alumn sol',\n", " 'alumn univers',\n", " 'ambient',\n", " 'ambient estudi',\n", " 'ambit',\n", " 'amig',\n", " 'ampli',\n", " 'ampliacion',\n", " 'ano',\n", " 'anos',\n", " 'anterior',\n", " 'antigu',\n", " 'apart',\n", " 'apertur',\n", " 'aplic',\n", " 'apoy',\n", " 'apoy alumn',\n", " 'app',\n", " 'aprend',\n", " 'aprendizaj',\n", " 'aqu',\n", " 'are',\n", " 'are estudi',\n", " 'are verd',\n", " 'arquitectur',\n", " 'arregl',\n", " 'ascensor',\n", " 'asesor',\n", " 'asesori',\n", " 'asi',\n", " 'asi mism',\n", " 'asient',\n", " 'asim',\n", " 'asist',\n", " 'asistent',\n", " 'aspect',\n", " 'atencion',\n", " 'atencion alumn',\n", " 'atencion alumn pesim',\n", " 'atencion client',\n", " 'atencion telefon',\n", " 'atend',\n", " 'atiend',\n", " 'atras',\n", " 'audiovisual',\n", " 'aul',\n", " 'aul virtual',\n", " 'aument',\n", " 'aument pension',\n", " 'aun',\n", " 'aun falt',\n", " 'aunqu',\n", " 'avanz',\n", " 'avec',\n", " 'avis',\n", " 'ayud',\n", " 'ayud alumn',\n", " 'baj',\n", " 'baj pension',\n", " 'baj preci',\n", " 'ban',\n", " 'bas',\n", " 'basic',\n", " 'bastant',\n", " 'bec',\n", " 'benefici',\n", " 'bibliotec',\n", " 'bien',\n", " 'blackboard',\n", " 'blend',\n", " 'blend curs',\n", " 'bolet',\n", " 'bols',\n", " 'bols trabaj',\n", " 'bonit',\n", " 'break',\n", " 'brind',\n", " 'brind alumn',\n", " 'brind buen',\n", " 'brind servici',\n", " 'brind univers',\n", " 'buen',\n", " 'buen ambient',\n", " 'buen atencion',\n", " 'buen calid',\n", " 'buen docent',\n", " 'buen educ',\n", " 'buen ensen',\n", " 'buen infraestructur',\n", " 'buen infraestructur buen',\n", " 'buen metodolog',\n", " 'buen nivel',\n", " 'buen plan',\n", " 'buen plan docent',\n", " 'buen prof',\n", " 
'buen profesional',\n", " 'buen profesor',\n", " 'buen profesor buen',\n", " 'buen servici',\n", " 'buen sistem',\n", " 'buen univers',\n", " 'bus',\n", " 'busc',\n", " 'cad',\n", " 'cad ano',\n", " 'cad carrer',\n", " 'cad cicl',\n", " 'cad curs',\n", " 'cad dia',\n", " 'cad rat',\n", " 'cad vez',\n", " 'cafet',\n", " 'cafeteri',\n", " 'cafetin',\n", " 'calcul',\n", " 'calid',\n", " 'calid docent',\n", " 'calid educ',\n", " 'calid ensen',\n", " 'calid ensen buen',\n", " 'calid ensen profesor',\n", " 'calid profesor',\n", " 'calid servici',\n", " 'calif',\n", " 'calific',\n", " 'cambi',\n", " 'cambi horari',\n", " 'cambi mall',\n", " 'cambi mall curricul',\n", " 'cambi sed',\n", " 'camin',\n", " 'camp',\n", " 'campus',\n", " 'campus monterr',\n", " 'campus san',\n", " 'canal',\n", " 'cantid',\n", " 'cantid alumn',\n", " 'caos',\n", " 'capac',\n", " 'capacit',\n", " 'car',\n", " 'carg',\n", " 'carnet',\n", " 'carpet',\n", " 'carrer',\n", " 'cas',\n", " 'casi',\n", " 'casiller',\n", " 'caus',\n", " 'centr',\n", " 'centr atencion',\n", " 'centr atencion alumn',\n", " 'centr inform',\n", " 'cerc',\n", " 'cercan',\n", " 'cerr',\n", " 'charl',\n", " 'chever',\n", " 'chic',\n", " 'cicl',\n", " 'cienci',\n", " 'cierr',\n", " 'ciert',\n", " 'civil',\n", " 'clar',\n", " 'clas',\n", " 'clas blend',\n", " 'clas deb',\n", " 'clas dinam',\n", " 'clas presencial',\n", " 'clas profesor',\n", " 'clas virtual',\n", " 'client',\n", " 'cobr',\n", " 'col',\n", " 'colaps',\n", " 'colegi',\n", " 'coloc',\n", " 'com',\n", " 'comedor',\n", " 'comod',\n", " 'companer',\n", " 'compar',\n", " 'competent',\n", " 'competit',\n", " 'complej',\n", " 'complet',\n", " 'complic',\n", " 'compr',\n", " 'comprend',\n", " 'compromis',\n", " 'comput',\n", " 'comun',\n", " 'comunic',\n", " 'comunicacion',\n", " 'con',\n", " 'conect',\n", " 'conexion',\n", " 'conform',\n", " 'conoc',\n", " 'consegu',\n", " 'consider',\n", " 'constant',\n", " 'consult',\n", " 'cont',\n", " 'contabil',\n", " 'contact',\n", " 
'contact web',\n", " 'conten',\n", " 'conten curs',\n", " 'contest',\n", " 'continu',\n", " 'contrasen',\n", " 'contrat',\n", " 'control',\n", " 'convalid',\n", " 'conveni',\n", " 'conveni internacional',\n", " 'conveni univers',\n", " 'coordin',\n", " 'corre',\n", " 'correct',\n", " 'cort',\n", " 'cos',\n", " 'cost',\n", " 'cost pension',\n", " 'costos',\n", " 'cre',\n", " 'cre deb',\n", " 'creativ',\n", " 'crec',\n", " 'credit',\n", " 'cruz',\n", " 'cual',\n", " 'cualqu',\n", " 'cualqui',\n", " 'cuand',\n", " 'cuant',\n", " 'cubicul',\n", " 'cuelg',\n", " 'cuent',\n", " 'cuent buen',\n", " 'cuest',\n", " 'cultural',\n", " 'cumpl',\n", " 'curricul',\n", " 'curs',\n", " 'curs blend',\n", " 'curs carrer',\n", " 'curs deb',\n", " 'curs deb ser',\n", " 'curs ingles',\n", " 'curs llev',\n", " 'curs matemat',\n", " 'curs numer',\n", " 'curs profesor',\n", " 'da',\n", " 'dad',\n", " 'dan',\n", " 'dan facil',\n", " 'dan solucion',\n", " 'dar',\n", " 'dar mas',\n", " 'darl',\n", " 'de',\n", " 'deb',\n", " 'deb ampli',\n", " 'deb dar',\n", " 'deb dej',\n", " 'deb exist',\n", " 'deb hab',\n", " 'deb hab mas',\n", " 'deb hac',\n", " 'deb implement',\n", " 'deb mejor',\n", " 'deb mejor atencion',\n", " 'deb pon',\n", " 'deb ser',\n", " 'deb ser mas',\n", " 'deb ser presencial',\n", " 'deb sub',\n", " 'deb ten',\n", " 'deb trabaj',\n", " 'deberi',\n", " 'deberi hab',\n", " 'deberi hab mas',\n", " 'deberi mejor',\n", " 'deberi ser',\n", " 'deberi ser mas',\n", " 'deberi ten',\n", " 'dec',\n", " 'dedic',\n", " 'deficient',\n", " 'definit',\n", " 'dej',\n", " 'dej des',\n", " 'del',\n", " 'demand',\n", " 'demas',\n", " 'demasi',\n", " 'demasi alumn',\n", " 'demasi gent',\n", " 'demasi lent',\n", " 'demor',\n", " 'den',\n", " 'dentr',\n", " 'dentr univers',\n", " 'depend',\n", " 'deport',\n", " 'derech',\n", " 'des',\n", " 'desarroll',\n", " 'desastr',\n", " 'descans',\n", " 'descuent',\n", " 'desd',\n", " 'desempen',\n", " 'desorden',\n", " 'desorganiz',\n", " 'despues',\n", " 
'detall',\n", " 'dia',\n", " 'dia dia',\n", " 'dias',\n", " 'dic',\n", " 'dict',\n", " 'dict clas',\n", " 'didact',\n", " 'diferent',\n", " 'dificil',\n", " 'dificult',\n", " 'dinam',\n", " 'diner',\n", " 'direct',\n", " 'disen',\n", " 'disminu',\n", " 'dispon',\n", " 'disponibil',\n", " 'disponibil horari',\n", " 'disposicion',\n", " 'distint',\n", " 'distribu',\n", " 'divers',\n", " 'docenci',\n", " 'docent',\n", " 'docent infraestructur',\n", " 'doming',\n", " 'dos',\n", " 'dud',\n", " 'econom',\n", " 'educ',\n", " 'educ brind',\n", " 'educ buen',\n", " 'educacion',\n", " 'efect',\n", " 'eficient',\n", " 'egres',\n", " 'ejempl',\n", " 'ejercici',\n", " 'el',\n", " 'el ambient',\n", " 'el aul',\n", " 'el cambi',\n", " 'el campus',\n", " 'el horari',\n", " 'el internet',\n", " 'el metod',\n", " 'el metod ensen',\n", " 'el nivel',\n", " 'el nivel ensen',\n", " 'el nivel exigent',\n", " 'el orden',\n", " 'el prestigi',\n", " 'el proces',\n", " 'el servici',\n", " 'el servici atencion',\n", " 'el sistem',\n", " 'el trat',\n", " 'el uso',\n", " 'el wifi',\n", " 'eleccion',\n", " 'eleg',\n", " 'elev',\n", " 'elimin',\n", " 'ello',\n", " 'embarg',\n", " 'empez',\n", " 'emple',\n", " 'empres',\n", " 'en',\n", " 'en aspect',\n", " 'en atencion',\n", " 'en atencion alumn',\n", " 'en curs',\n", " 'en horari',\n", " 'en matricul',\n", " 'en proces',\n", " 'en servici',\n", " 'en sistem',\n", " 'encant',\n", " 'encarg',\n", " 'enchuf',\n", " 'encim',\n", " 'encontr',\n", " 'encuentr',\n", " 'encuest',\n", " 'enfoc',\n", " 'ensen',\n", " 'ensen bien',\n", " 'ensen brind',\n", " 'ensen buen',\n", " 'ensen calid',\n", " 'ensen curs',\n", " 'ensen exigent',\n", " 'ensen infraestructur',\n", " 'ensen profesor',\n", " 'ensen univers',\n", " 'entend',\n", " 'entiend',\n", " 'entrad',\n", " 'entrar',\n", " 'entreg',\n", " 'envi',\n", " 'epe',\n", " 'equip',\n", " 'error',\n", " 'es',\n", " 'es buen',\n", " 'es buen univers',\n", " 'es univers',\n", " 'escog',\n", " 'escuch',\n", " 
'esfuerz',\n", " 'eso',\n", " 'espaci',\n", " 'espaci alumn',\n", " 'espaci deport',\n", " 'espaci descans',\n", " 'espaci estudi',\n", " 'espaci pod',\n", " 'esparc',\n", " 'especial',\n", " 'especializ',\n", " 'especif',\n", " 'esper',\n", " 'esta',\n", " 'establec',\n", " 'estacion',\n", " 'estadist',\n", " 'estan',\n", " 'estoy',\n", " 'estructur',\n", " 'estudi',\n", " 'estudi buen',\n", " 'estudi epe',\n", " 'estudi univers',\n", " 'estudiantil',\n", " 'etc',\n", " 'evalu',\n", " 'event',\n", " 'evit',\n", " 'exam',\n", " 'exam admision',\n", " 'examen',\n", " 'excelent',\n", " 'excelent academ',\n", " 'excelent profesor',\n", " 'exces',\n", " 'exig',\n", " 'exigent',\n", " 'exigent academ',\n", " 'exigent buen',\n", " 'exigent curs',\n", " 'exigent profesor',\n", " 'exist',\n", " 'exit',\n", " 'expect',\n", " 'experient',\n", " 'experient profesor',\n", " 'experient univers',\n", " 'explic',\n", " 'explic bien',\n", " 'extra',\n", " 'extranjer',\n", " 'facil',\n", " 'facil brind',\n", " 'facil dan',\n", " 'facil estudi',\n", " 'facil horari',\n", " 'facil pag',\n", " 'facilit',\n", " 'facult',\n", " 'fall',\n", " 'falt',\n", " 'falt espaci',\n", " 'falt mas',\n", " 'falt mejor',\n", " 'famili',\n", " 'favor',\n", " 'fech',\n", " 'filtr',\n", " 'fin',\n", " 'fin seman',\n", " 'final',\n", " 'fisic',\n", " 'flexibil',\n", " 'flexibil horari',\n", " 'flexibl',\n", " 'foment',\n", " 'form',\n", " 'form ensen',\n", " 'formacion',\n", " 'funcion',\n", " 'futur',\n", " 'gan',\n", " 'gast',\n", " 'gener',\n", " 'general',\n", " 'genial',\n", " 'gent',\n", " 'gent trabaj',\n", " 'gestion',\n", " 'gimnasi',\n", " 'graci',\n", " 'grad',\n", " 'gran',\n", " 'gran cantid',\n", " 'grand',\n", " 'gratis',\n", " 'gratuit',\n", " 'grup',\n", " 'grupal',\n", " 'guard',\n", " 'gust',\n", " 'gust univers',\n", " 'gustari',\n", " 'hab',\n", " 'hab mas',\n", " 'habil',\n", " 'habilit',\n", " 'habl',\n", " 'hac',\n", " 'hac clas',\n", " 'hac trabaj',\n", " 'haci',\n", " 'haci 
alumn',\n", " 'hag',\n", " 'hast',\n", " 'hay',\n", " 'hay curs',\n", " 'hay demasi',\n", " 'hay much',\n", " 'hay much gent',\n", " 'hay profesor',\n", " 'hech',\n", " 'herramient',\n", " 'higien',\n", " 'hor',\n", " 'hor almuerz',\n", " 'hor clas',\n", " 'hor punt',\n", " 'horari',\n", " 'horari bus',\n", " 'horari clas',\n", " 'horari curs',\n", " 'horari deb',\n", " 'horari flexibl',\n", " 'horari sed',\n", " 'horribl',\n", " 'human',\n", " 'ide',\n", " 'igual',\n", " 'imag',\n", " 'implement',\n", " 'implement mas',\n", " 'import',\n", " 'impos',\n", " 'impresion',\n", " 'inclu',\n", " 'inclus',\n", " 'incomod',\n", " 'inconvenient',\n", " 'increment',\n", " 'indic',\n", " 'ineficient',\n", " 'inform',\n", " 'informacion',\n", " 'infraestructur',\n", " 'infraestructur buen',\n", " 'infraestructur calid',\n", " 'infraestructur profesor',\n", " 'infraestructur univers',\n", " 'ing',\n", " 'ingeni',\n", " 'ingenieri',\n", " 'ingles',\n", " 'ingles deb',\n", " 'ingres',\n", " 'ingres univers',\n", " 'inici',\n", " 'inmediat',\n", " 'innov',\n", " 'innovacion',\n", " 'instal',\n", " 'institu',\n", " 'insuficient',\n", " 'integr',\n", " 'interaccion',\n", " 'intercambi',\n", " 'interes',\n", " 'intern',\n", " 'internacional',\n", " 'internet',\n", " 'internet lent',\n", " 'intranet',\n", " 'investig',\n", " 'ipad',\n", " 'ipads',\n", " 'ir',\n", " 'isidr',\n", " 'it',\n", " 'junt',\n", " 'just',\n", " 'la',\n", " 'la atencion',\n", " 'la atencion alumn',\n", " 'la bibliotec',\n", " 'la buen',\n", " 'la buen ensen',\n", " 'la calid',\n", " 'la calid docent',\n", " 'la calid educ',\n", " 'la calid ensen',\n", " 'la calid profesor',\n", " 'la comod',\n", " 'la educ',\n", " 'la ensen',\n", " 'la ensen buen',\n", " 'la ensen profesor',\n", " 'la exigent',\n", " 'la exigent profesor',\n", " 'la experient',\n", " 'la facil',\n", " 'la flexibil',\n", " 'la form',\n", " 'la form ensen',\n", " 'la infraestructur',\n", " 'la innov',\n", " 'la mall',\n", " 'la matricul',\n", " 
'la mensual',\n", " 'la metodolog',\n", " 'la metodolog ensen',\n", " 'la organiz',\n", " 'la pension',\n", " 'la pension sub',\n", " 'la tecnolog',\n", " 'la tecnologi',\n", " 'la univers',\n", " 'la univers buen',\n", " 'labor',\n", " 'laboral',\n", " 'laboratori',\n", " 'lad',\n", " 'larg',\n", " 'las',\n", " 'las clas',\n", " 'las facil',\n", " 'las instal',\n", " 'las oportun',\n", " 'las pension',\n", " 'le',\n", " 'lectur',\n", " 'leer',\n", " 'lej',\n", " 'lent',\n", " 'libr',\n", " 'libr bibliotec',\n", " 'lim',\n", " 'limit',\n", " 'limpi',\n", " 'limpiez',\n", " 'limpiez ban',\n", " 'lin',\n", " 'line',\n", " 'llam',\n", " 'lleg',\n", " 'lleg tard',\n", " 'llen',\n", " 'llev',\n", " 'llev curs',\n", " 'lo',\n", " 'lo mas',\n", " 'lo mas valor',\n", " 'lo valor',\n", " 'local',\n", " 'logr',\n", " 'los',\n", " 'los buen',\n", " 'los buen profesor',\n", " 'los conveni',\n", " 'los curs',\n", " 'los curs blend',\n", " 'los docent',\n", " 'los horari',\n", " 'los profesor',\n", " 'los profesor buen',\n", " 'los servici',\n", " 'los taller',\n", " 'lucr',\n", " 'lueg',\n", " 'lug',\n", " 'lugar',\n", " 'lugar estudi',\n", " 'mac',\n", " 'macs',\n", " 'maestr',\n", " 'mal',\n", " 'mal atencion',\n", " 'mal atencion alumn',\n", " 'mal servici',\n", " 'malisim',\n", " 'mall',\n", " 'mall curricul',\n", " 'man',\n", " 'manan',\n", " 'mand',\n", " 'manej',\n", " 'maner',\n", " 'manten',\n", " 'maquet',\n", " 'maquin',\n", " 'mas',\n", " 'mas alumn',\n", " 'mas are',\n", " 'mas cubicul',\n", " 'mas curs',\n", " 'mas espaci',\n", " 'mas espaci estudi',\n", " 'mas exigent',\n", " 'mas facil',\n", " 'mas grand',\n", " 'mas hor',\n", " 'mas horari',\n", " 'mas libr',\n", " 'mas lugar',\n", " 'mas rap',\n", " 'mas salon',\n", " 'mas taller',\n", " 'mas valor',\n", " 'mat',\n", " 'matemat',\n", " 'materi',\n", " 'material',\n", " 'matricul',\n", " 'matricul curs',\n", " 'matricul line',\n", " 'maxim',\n", " 'mayor',\n", " 'mayor cantid',\n", " 'mayor espaci',\n", " 
'mayor profesor',\n", " 'mayori',\n", " 'me',\n", " 'me encant',\n", " 'me gust',\n", " 'me parec',\n", " 'me parec buen',\n", " 'medi',\n", " 'mediant',\n", " 'medicin',\n", " 'mejor',\n", " 'mejor aspect',\n", " 'mejor atencion',\n", " 'mejor atencion alumn',\n", " 'mejor aul',\n", " 'mejor calid',\n", " 'mejor horari',\n", " 'mejor infraestructur',\n", " 'mejor internet',\n", " 'mejor matricul',\n", " 'mejor organiz',\n", " 'mejor proces',\n", " 'mejor proces matricul',\n", " 'mejor profesor',\n", " 'mejor servici',\n", " 'mejor servici atencion',\n", " 'mejor sistem',\n", " 'mejor tem',\n", " 'mejor univers',\n", " 'mejor wifi',\n", " 'men',\n", " 'menor',\n", " 'mensual',\n", " 'menu',\n", " 'merc',\n", " 'mes',\n", " 'met',\n", " 'metod',\n", " 'metod ensen',\n", " 'metodolog',\n", " 'metodolog ensen',\n", " 'metodologi',\n", " 'metodologi ensen',\n", " 'mi',\n", " 'miguel',\n", " 'minim',\n", " 'minut',\n", " 'mism',\n", " 'mo',\n", " 'mod',\n", " 'modal',\n", " 'modal blend',\n", " 'modern',\n", " 'modul',\n", " 'molest',\n", " 'moment',\n", " 'moment matricul',\n", " 'mont',\n", " 'monterr',\n", " 'motiv',\n", " 'movil',\n", " 'much',\n", " 'much alumn',\n", " 'much cos',\n", " 'much facil',\n", " 'much gent',\n", " 'much oportun',\n", " 'much person',\n", " 'much vec',\n", " 'music',\n", " 'muy',\n", " 'muy buen',\n", " 'muy car',\n", " 'nadi',\n", " 'neces',\n", " 'necesari',\n", " 'necesit',\n", " 'necesit mas',\n", " 'negoci',\n", " 'ningun',\n", " 'nivel',\n", " 'nivel academ',\n", " 'nivel educ',\n", " 'nivel ensen',\n", " 'nivel exigent',\n", " 'no',\n", " 'no deb',\n", " 'no gust',\n", " 'no sub',\n", " 'no suficient',\n", " 'noch',\n", " 'normal',\n", " 'nos',\n", " 'not',\n", " 'nuev',\n", " 'numer',\n", " 'nunc',\n", " 'nunc contest',\n", " 'objet',\n", " 'oblig',\n", " 'obten',\n", " 'ocasion',\n", " 'ocup',\n", " 'ofrec',\n", " 'ofrec univers',\n", " 'onlin',\n", " 'opcion',\n", " 'opinion',\n", " 'oportun',\n", " 'oportun laboral',\n", " 
'optim',\n", " 'orden',\n", " 'organiz',\n", " 'organizacion',\n", " 'orient',\n", " 'otorg',\n", " 'pabellon',\n", " 'pacienci',\n", " 'pag',\n", " 'pag pension',\n", " 'pagin',\n", " 'pais',\n", " 'par',\n", " 'parec',\n", " 'parec buen',\n", " 'parec buen univers',\n", " 'parec univers',\n", " 'part',\n", " 'particip',\n", " 'pas',\n", " 'ped',\n", " 'pens',\n", " 'pension',\n", " 'pension alta',\n", " 'pension cad',\n", " 'pension cad ano',\n", " 'pension cad cicl',\n", " 'pension car',\n", " 'pension deb',\n", " 'pension sub',\n", " 'pension sub cad',\n", " 'peor',\n", " 'pequen',\n", " 'per',\n", " 'perd',\n", " 'perfect',\n", " 'perjud',\n", " 'perjudic',\n", " 'permit',\n", " 'person',\n", " 'person trabaj',\n", " 'personal',\n", " 'personaliz',\n", " 'peru',\n", " 'pes',\n", " 'pesim',\n", " 'pesim atencion',\n", " 'pid',\n", " 'piens',\n", " 'pierd',\n", " 'pis',\n", " 'piscin',\n", " 'plan',\n", " 'plan docent',\n", " 'plat',\n", " 'plataform',\n", " 'plataform virtual',\n", " 'pm',\n", " 'poblacion',\n", " 'poc',\n", " 'poc espaci',\n", " 'pod',\n", " ...]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vec.get_feature_names()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x_train_tokens = vec.transform(x_train['COMENTARIO'])" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x_test_tokens = vec.transform(x_test['COMENTARIO'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stacking together tokens and categorical features" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Put the structured columns to the left of the TF-IDF columns.\n", "# .values yields the same ndarray as the removed DataFrame.as_matrix().\n", "full_x_train = hstack((x_train[model_cols].values,x_train_tokens))\n", "full_x_test = hstack((x_test[model_cols].values,x_test_tokens))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Voting 
Classifier" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "clf_log2 = LogisticRegression(C= 1, class_weight= None, solver= 'newton-cg', random_state=1)\n", "clf_xgb = XGBClassifier( objective='multi:softprob', scale_pos_weight=1, \n", " max_depth= 9,gamma=0.3, colsample_bytree= 0.9, subsample= 0.8,seed=27)\n", "clf_nb2 = OneVsRestClassifier(MultinomialNB())" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "clf_voting = VotingClassifier(estimators=[('lr',clf_log2),('xgb', clf_xgb),('nb',clf_nb2)], voting='soft')" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "VotingClassifier(estimators=[('lr', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=1, solver='newton-cg', tol=0.0001,\n", " verbose=0, warm_start=False)), ('xgb', XGBClassifi...assifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),\n", " n_jobs=1))],\n", " n_jobs=1, voting='soft', weights=None)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf_voting.fit(full_x_train,y_train)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": true }, "outputs": [], "source": [ "predict_clf_voting = clf_voting.predict(full_x_test)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy: 0.7145\n", "log_loss: 0.735936697571\n" ] } ], "source": [ "# scikit-learn metrics take (y_true, y_pred). `labels` ties the probability\n", "# columns to the class order the ensemble was fitted with.\n", "print('accuracy: %s' % accuracy_score(y_test, predict_clf_voting))\n", "print('log_loss: %s' % log_loss(y_test, clf_voting.predict_proba(full_x_test), labels=clf_voting.classes_))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Results for submission" ] 
}, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x_final_tokens = vec.transform(X_final['COMENTARIO'])" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# .values yields the same ndarray as the removed DataFrame.as_matrix().\n", "final_x_test = hstack((X_final[model_cols].values,x_final_tokens))" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": true }, "outputs": [], "source": [ "final_predict = clf_voting.predict_proba(final_x_test)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Select the respondent ids with X_final's own index so they always line up\n", "# row-for-row with the predicted probabilities.\n", "final_cod = all_data.loc[X_final.index,'COD_ENCUESTADO'].copy().reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": true }, "outputs": [], "source": [ "final_predict_df = pd.concat([final_cod,pd.DataFrame(final_predict,columns = ['NPS1','NPS2','NPS3','NPS4'])],axis=1)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": true }, "outputs": [], "source": [ "final_predict_df.to_csv('submission.csv',index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }