In [1]:
#imports
import pandas as pd
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_average_precision_score


from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from nltk import word_tokenize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data preparation

In [2]:
# Read the training and test splits.
df = pd.read_csv('../input/dataset/train.csv')
dftest = pd.read_csv('../input/dataset/test.csv')

# Alternative input files (disabled):
#df = pd.read_csv('../input/dataset10/train10.csv')
#dftest = pd.read_csv('../input/dataset10/test10.csv')

# Discard the 'Unnamed: 0' column that comes with both CSV files
# (presumably a saved row index -- verify against the files).
del df['Unnamed: 0']
del dftest['Unnamed: 0']

# Remove the double quotes surrounding each description.
df['descripcion'] = df['descripcion'].map(lambda text: text.strip('"'))
dftest['descripcion'] = dftest['descripcion'].map(lambda text: text.strip('"'))

df.head()

Unnamed: 0,descripcion,03,09,14,15,16,18,19,22,24,...,73,75,76,77,79,80,85,90,92,98
0,Contrato Administrativo de Servicios de diseño...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2019(Y)1535 Construcción escalera de emergenci...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Suministro de energía electrica de diversas in...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Servicio desplazamiento del personal operativo...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Contrato de suministro de gas natural en los ...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Label names = every column except the text column. Selected by name (not
# by position) so it stays consistent with how y_train / y_test are built
# and does not depend on 'descripcion' being the first column.
labels = df.columns.drop('descripcion')

In [4]:
# Inputs: the free-text description of each record.
X_train, X_test = df['descripcion'], dftest['descripcion']

# Targets: all remaining columns of each frame.
y_train = df.drop(columns=['descripcion'])
y_test = dftest.drop(columns=['descripcion'])

y_train

Unnamed: 0,03,09,14,15,16,18,19,22,24,30,...,73,75,76,77,79,80,85,90,92,98
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72424,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72425,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72427,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Models

In [5]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_average_precision_score
    
# adapted from: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(ytest, y_pred, y_score=None):
    """Compute a fixed set of multi-label evaluation metrics.

    Parameters
    ----------
    ytest : array-like
        Ground-truth label indicators. Used as ``y_true`` for *every*
        metric (the function no longer reads the notebook-level ``y_test``).
    y_pred : array-like
        Predicted label indicators, same shape as ``ytest``.
    y_score : array-like, optional
        Per-label scores (probabilities or decision values) for the
        ranking-based metrics: ROC AUC, coverage error and label ranking
        average precision. When omitted, ``y_pred`` is used for those
        metrics as well.

    Returns
    -------
    dict
        Keys: ``'f1'`` (micro-averaged), ``'roc_auc'`` (micro-averaged),
        ``'accuracy'``, ``'coverage_error'`` and
        ``'label_ranking_average_precision_score'``.
    """
    y_true = ytest
    # Ranking metrics are meant for continuous scores; fall back to the hard
    # predictions when the caller has none.
    ranking_input = y_pred if y_score is None else y_score

    return {
        'f1': f1_score(y_true=y_true, y_pred=y_pred, average='micro'),
        'roc_auc': roc_auc_score(y_true, ranking_input, average='micro'),
        'accuracy': accuracy_score(y_true, y_pred),
        'coverage_error': coverage_error(y_true, ranking_input),
        'label_ranking_average_precision_score':
            label_ranking_average_precision_score(y_true, ranking_input),
    }

In [6]:
from sklearn.multiclass import OneVsRestClassifier

In [7]:
# TF-IDF over word 1- to 3-grams (Spanish stop words removed) feeding a
# single decision tree.
tfidf_tree = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            stop_words=stopwords.words('spanish'),
            lowercase=True,
            use_idf=True,
            max_features=None,
        ),
    ),
    ('tree', DecisionTreeClassifier(random_state=42)),
])

# fit() returns the pipeline itself, so training and prediction can chain.
ypred = tfidf_tree.fit(X_train, y_train).predict(X_test)

In [8]:
# Report the decision-tree pipeline's scores on the test split.
print('tfidf_tree:')

print(multi_label_metrics(ytest=y_test, y_pred=ypred))

tfidf_tree:


{'f1': 0.6247494183579404, 'roc_auc': 0.7993711656698566, 'accuracy': 0.5994136975710328, 'coverage_error': 17.67556858449842, 'label_ranking_average_precision_score': 0.6415911108734487}


In [9]:
from sklearn.neighbors import KNeighborsClassifier

# TF-IDF over word 1- to 3-grams (Spanish stop words removed) feeding a
# 5-nearest-neighbours classifier.
tfidf_knn = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            stop_words=stopwords.words('spanish'),
            lowercase=True,
            use_idf=True,
            max_features=None,
        ),
    ),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# fit() returns the pipeline itself, so training and prediction can chain.
ypred = tfidf_knn.fit(X_train, y_train).predict(X_test)

#ypredprob = tfidf_knn.predict_proba(X_test)
#print(ypredprob)

print('tfidf_knn:')
print(multi_label_metrics(ytest=y_test, y_pred=ypred))

tfidf_knn:


{'f1': 0.6233266282860513, 'roc_auc': 0.754034082546732, 'accuracy': 0.5210038013014625, 'coverage_error': 21.68468526512467, 'label_ranking_average_precision_score': 0.5560795536653822}


In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier


# TF-IDF over word 1- to 3-grams (Spanish stop words removed) feeding a
# multinomial naive Bayes model wrapped in OneVsRestClassifier.
tfidf_NB = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            stop_words=stopwords.words('spanish'),
            lowercase=True,
            use_idf=True,
            max_features=None,
        ),
    ),
    ('nb', OneVsRestClassifier(MultinomialNB())),
])

# fit() returns the pipeline itself, so training and prediction can chain.
ypred2 = tfidf_NB.fit(X_train, y_train).predict(X_test)

print('tfidf_nb:')

print(multi_label_metrics(ytest=y_test, y_pred=ypred2))

tfidf_nb:


{'f1': 0.21822499557891015, 'roc_auc': 0.5614031276289032, 'accuracy': 0.13472070098576122, 'coverage_error': 39.0722891566265, 'label_ranking_average_precision_score': 0.1588181603418836}


In [11]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF over word 1- to 3-grams (Spanish stop words removed) feeding a
# random forest with a fixed seed.
tfidf_random_forest = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            stop_words=stopwords.words('spanish'),
            lowercase=True,
            use_idf=True,
            max_features=None,
        ),
    ),
    ('randomforest', RandomForestClassifier(random_state=42)),
])

# fit() returns the pipeline itself, so training and prediction can chain.
ypred3 = tfidf_random_forest.fit(X_train, y_train).predict(X_test)

print('tfidf_random_forest:')
print(multi_label_metrics(ytest=y_test, y_pred=ypred3))

tfidf_random_forest:


{'f1': 0.6383469843633656, 'roc_auc': 0.7434141602805479, 'accuracy': 0.5138522002448296, 'coverage_error': 22.322337478255267, 'label_ranking_average_precision_score': 0.5416257063669199}


In [12]:
from sklearn.ensemble import AdaBoostClassifier

# TF-IDF over word 1- to 3-grams (Spanish stop words removed) feeding a
# seeded AdaBoost model wrapped in OneVsRestClassifier.
tfidf_adaboost = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            stop_words=stopwords.words('spanish'),
            lowercase=True,
            use_idf=True,
            max_features=None,
        ),
    ),
    ('adaboost', OneVsRestClassifier(AdaBoostClassifier(random_state=42))),
])

# fit() returns the pipeline itself, so training and prediction can chain.
# Note: ypred3 is the same name the random-forest cell assigns to.
ypred3 = tfidf_adaboost.fit(X_train, y_train).predict(X_test)

print('tfidf_adaboost:')
print(multi_label_metrics(ytest=y_test, y_pred=ypred3))

tfidf_adaboost:


{'f1': 0.6022293427870193, 'roc_auc': 0.7491995654601569, 'accuracy': 0.45196830101153274, 'coverage_error': 22.465820501256363, 'label_ranking_average_precision_score': 0.5127305400913543}


In [None]:
from sklearn.svm import SVC

# TF-IDF over word 1- to 3-grams (Spanish stop words removed) feeding a
# linear SVM wrapped in OneVsRestClassifier.
#
# The step is named 'linearsvc', so it uses LinearSVC (imported in the first
# cell). A bare SVC() defaults to kernel='rbf', which would make this run
# identical to the explicit RBF experiment in tfidf_SVMrbf; kernel SVC also
# scales poorly with the number of training samples.
tfidf_SVC = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word",
                                   max_features=None,
                                   stop_words=stopwords.words('spanish'),
                                   use_idf=True,
                                   lowercase=True)),
    ('linearsvc', OneVsRestClassifier(LinearSVC(random_state=42)))
])


tfidf_SVC.fit(X_train, y_train)
ypred2 = tfidf_SVC.predict(X_test)


print('tfidf_SVC:')
print(multi_label_metrics(y_test, ypred2))

In [None]:
from sklearn.svm import SVC

# TF-IDF over word 1- to 3-grams (Spanish stop words removed) feeding a
# seeded RBF-kernel SVM wrapped in OneVsRestClassifier.
tfidf_SVMrbf = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            stop_words=stopwords.words('spanish'),
            lowercase=True,
            use_idf=True,
            max_features=None,
        ),
    ),
    ('rbfsvc', OneVsRestClassifier(SVC(kernel='rbf', random_state=42))),
])

# fit() returns the pipeline itself, so training and prediction can chain.
ypred2 = tfidf_SVMrbf.fit(X_train, y_train).predict(X_test)

print('tfidf_SVMrbf:')
print(multi_label_metrics(ytest=y_test, y_pred=ypred2))