In [80]:
import os

# --- Notebook configuration: everything a reader might want to tune ---

# Width of the learned word vectors (Embedding output dimension).
embedding_dim = 200
# Root folder of the corpus; it holds one 'training' and one 'test' sub-folder.
corpus_base_dir = os.path.join('..', '..', '..', 'corpora', 'wired_it_20190821')

# Vocabulary bound shared by the text Tokenizer (num_words) and the Embedding layer.
vocab_size = 10000
# Every document is padded / truncated to this many token ids.
max_length = 1000
trunc_type = 'post'
padding_type = 'post'
# Explicit out-of-vocabulary placeholder. An empty string also works as a Tokenizer
# oov_token, but it shows up as a confusing '' key in word_index; the token ids
# produced by the tokenizer are the same either way.
oov_tok = "<OOV>"
num_epochs = 30

training_dir = os.path.join(corpus_base_dir, 'training')
test_dir = os.path.join(corpus_base_dir, 'test')

# Class labels; each one is also the name of a sub-folder inside the training and
# test directories. The order matters: label ids are derived from it further down.
classes = [
    'attualit_',
    'attualit__ambiente',
    'attualit__media',
    'attualit__politica',
    'attualit__tech',
    'economia_business',
    'economia_finanza',
    'economia_lavoro',
    'economia_startup',
    'gadget_accessori',
    'gadget_audio_e_tv',
    'gadget_computer',
    'gadget_elettrodomestici',
    'gadget_foto_e_video',
    'gadget_motori',
    'gadget_outdoor',
    'gadget_videogiochi',
    'internet_regole',
    'internet_social_network',
    'internet_tlc',
    'internet_web',
    'lifestyle_design',
    'lifestyle_food',
    'lifestyle_mobilit_',
    'lifestyle_salute',
    'lifestyle_viaggi',
    'lol',
    'mobile_app',
    'mobile_smartphone',
    'mobile_tablet',
    'play_cinema',
    'play_cultura',
    'play_fumetti',
    'play_libri',
    'play_musica',
    'play_tv',
    'scienza',
    'scienza_biotech',
    'scienza_ecologia',
    'scienza_lab',
    'scienza_medicina',
    'scienza_spazio',
]

In [47]:
def read_file_to_text(file_path):
    """Return the content of a UTF-8 text file as one single-line string.

    Args:
        file_path: path of the file to read.

    Returns:
        The file content with every newline character replaced by one space.
    """
    # The read (and the return) must sit inside the `with` body so the handle is
    # open while reading and closed as soon as the function returns.
    with open(file_path, 'r', encoding="utf8") as file:
        return file.read().replace('\n', ' ')
def list_text_files(folder):
    """Return the paths of the '.txt' files directly inside `folder`.

    Args:
        folder: directory to scan (not recursive).

    Returns:
        A list of paths (`folder` joined with each file name), sorted by file name.
    """
    # os.listdir makes no ordering promise; sorting keeps the document order (and
    # everything derived from it) identical across runs and machines.
    return [
        os.path.join(folder, file_name)
        for file_name in sorted(os.listdir(folder))
        if file_name.endswith('.txt')
    ]
def flatten(l):
    """Concatenate the items of every inner iterable of `l` into one flat list.

    Only one nesting level is removed; the items themselves are left untouched.
    """
    flat = []
    for inner in l:
        for element in inner:
            flat.append(element)
    return flat

In [48]:
def load_class_text_pairs(folder):
    """Lazily pair every document below `folder` with its class label.

    `folder` is scanned once per entry of the module-level `classes` list, looking
    for text files in the sub-folder named after the class.

    Returns:
        A lazy iterator with one element per class; each element is itself a lazy
        iterator of `(class_label, document_text)` tuples. Files are read only when
        the inner iterators are consumed.
    """
    def pairs_for(class_label):
        # The file listing happens right away; file contents are read on demand.
        paths = list_text_files(os.path.join(folder, class_label))
        return ((class_label, read_file_to_text(path)) for path in paths)

    return (pairs_for(class_label) for class_label in classes)

In [49]:
# Read the whole training split and unzip the (label, text) pairs into two parallel
# tuples: labels in `training_classes`, document bodies in `training_texts`.
training_classes, training_texts = zip(
    *flatten(load_class_text_pairs(training_dir))
)

In [50]:
# Same as for the training split: labels end up in `test_classes`, document bodies
# in `test_texts`, position by position.
test_classes, test_texts = zip(
    *flatten(load_class_text_pairs(test_dir))
)

In [51]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tensorflow as tf
import numpy as np

In [52]:
# Word-level tokenizer whose vocabulary is bounded by `vocab_size`; words outside
# of it are encoded with the out-of-vocabulary token instead of being dropped.
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)

In [53]:
# Build the word index from the training texts only; the test texts are never
# shown to the tokenizer at fit time.
tokenizer.fit_on_texts(training_texts)

In [54]:
# Encode every training document as a list of word ids, then bring all sequences to
# exactly `max_length` entries. `truncating` is passed explicitly: pad_sequences
# defaults to 'pre', which would silently ignore the configured `trunc_type`.
training_sequences = tokenizer.texts_to_sequences(training_texts)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [55]:
# Encode the test documents with the tokenizer fitted on the training texts, then
# pad / truncate them to `max_length`. `truncating` is passed explicitly because
# pad_sequences defaults to 'pre', which would silently ignore `trunc_type`.
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [81]:
# Bag-of-embeddings classifier: embed every token, average the vectors over the
# sequence, and classify the averaged vector with a single softmax layer.
# Variants tried earlier and left out: GlobalMaxPooling1D instead of the average
# pooling, and an extra Dense(100, activation='relu') before the output layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
# One unit more than the number of classes: the label ids used as targets start
# at 1, so output index 0 never corresponds to a real class.
model.add(tf.keras.layers.Dense(len(classes) + 1, activation='softmax'))

In [82]:
# Targets are integer class ids (not one-hot vectors), hence the sparse variant
# of the categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type) Output Shape Param # 
embedding_5 (Embedding) (None, 1000, 200) 2000000 
_________________________________________________________________
global_average_pooling1d_4 ( (None, 200) 0 
_________________________________________________________________
dense_6 (Dense) (None, 43) 8643 
Total params: 2,008,643
Trainable params: 2,008,643
Non-trainable params: 0
_________________________________________________________________


In [83]:
# '_' is left out of the filter characters because class names use it as a
# separator: a label must NOT be split on it, it has to stay one single token.
classes_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
classes_tokenizer.fit_on_texts(classes)

# Encode the labels of both splits with the same tokenizer, one sequence per
# document (presumably one id each, since every label is meant to be one token).
training_classes_seq, test_classes_seq = (
    np.array(classes_tokenizer.texts_to_sequences(split_labels))
    for split_labels in (training_classes, test_classes)
)

In [84]:
# First id of every encoded training label.
training_classes_encoded = [label_seq[0] for label_seq in training_classes_seq]

# Label -> id mapping learned by the label tokenizer, shown for reference ...
classes_word_index = classes_tokenizer.word_index
print(classes_word_index)
# ... and its inverse (id -> label), used to name classes when reporting.
classes_by_index = {index: key for key, index in classes_word_index.items()}

{'attualit_': 1, 'attualit__ambiente': 2, 'attualit__media': 3, 'attualit__politica': 4, 'attualit__tech': 5, 'economia_business': 6, 'economia_finanza': 7, 'economia_lavoro': 8, 'economia_startup': 9, 'gadget_accessori': 10, 'gadget_audio_e_tv': 11, 'gadget_computer': 12, 'gadget_elettrodomestici': 13, 'gadget_foto_e_video': 14, 'gadget_motori': 15, 'gadget_outdoor': 16, 'gadget_videogiochi': 17, 'internet_regole': 18, 'internet_social_network': 19, 'internet_tlc': 20, 'internet_web': 21, 'lifestyle_design': 22, 'lifestyle_food': 23, 'lifestyle_mobilit_': 24, 'lifestyle_salute': 25, 'lifestyle_viaggi': 26, 'lol': 27, 'mobile_app': 28, 'mobile_smartphone': 29, 'mobile_tablet': 30, 'play_cinema': 31, 'play_cultura': 32, 'play_fumetti': 33, 'play_libri': 34, 'play_musica': 35, 'play_tv': 36, 'scienza': 37, 'scienza_biotech': 38, 'scienza_ecologia': 39, 'scienza_lab': 40, 'scienza_medicina': 41, 'scienza_spazio': 42}


In [85]:
# Train for `num_epochs` epochs, using the test split as validation data;
# verbose=2 prints one summary line per epoch.
history = model.fit(
    training_padded,
    training_classes_seq,
    epochs=num_epochs,
    validation_data=(test_padded, test_classes_seq),
    verbose=2,
)

Train on 13351 samples, validate on 5709 samples
Epoch 1/30
13351/13351 - 4s - loss: 3.6755 - acc: 0.0517 - val_loss: 3.6100 - val_acc: 0.0855
Epoch 2/30
13351/13351 - 4s - loss: 3.5382 - acc: 0.1020 - val_loss: 3.4647 - val_acc: 0.1450
Epoch 3/30
13351/13351 - 4s - loss: 3.3164 - acc: 0.1935 - val_loss: 3.2081 - val_acc: 0.2091
Epoch 4/30
13351/13351 - 4s - loss: 3.0274 - acc: 0.2819 - val_loss: 2.9484 - val_acc: 0.2859
Epoch 5/30
13351/13351 - 4s - loss: 2.7583 - acc: 0.3591 - val_loss: 2.7162 - val_acc: 0.3614
Epoch 6/30
13351/13351 - 4s - loss: 2.5210 - acc: 0.4245 - val_loss: 2.5182 - val_acc: 0.4122
Epoch 7/30
13351/13351 - 4s - loss: 2.3152 - acc: 0.4742 - val_loss: 2.3549 - val_acc: 0.4449
Epoch 8/30
13351/13351 - 4s - loss: 2.1378 - acc: 0.5104 - val_loss: 2.2150 - val_acc: 0.4771
Epoch 9/30
13351/13351 - 5s - loss: 1.9860 - acc: 0.5459 - val_loss: 2.1024 - val_acc: 0.5024
Epoch 10/30
13351/13351 - 4s - loss: 1.8553 - acc: 0.5719 - val_loss: 2.0122 - val_acc: 0.4971
Epoch 11/3

In [86]:
from sklearn import metrics

# Class probabilities for every test document; argmax picks the predicted label id.
classes_probabilties = model.predict(test_padded, batch_size=32, verbose=1)
predicted = np.argmax(classes_probabilties, axis=1).tolist()
expected = flatten(test_classes_seq)

# Label ids that actually occur in the test split, in ascending order. With
# average=None the metric functions return one score per entry of `labels`, in
# that same order, so these ids (not a running counter) identify each score.
evaluated_labels = np.unique(expected)

precision_scores = metrics.precision_score(expected, predicted, labels=evaluated_labels, average=None)
recall_scores = metrics.recall_score(expected, predicted, labels=evaluated_labels, average=None)
f1_scores = metrics.f1_score(expected, predicted, labels=evaluated_labels, average=None)
precion_recall_f1_scores = list(zip(precision_scores, recall_scores, f1_scores))

# Per-class report, printed as a pipe-separated table.
print('|Class Label'.ljust(40, ' ') + '|Pre |Rec |F1')
print('|--- |--- |--- |---')
for label_id, precision_recall_f1 in zip(evaluated_labels, precion_recall_f1_scores):
    # Look the name up by the real label id: a class missing from the test split
    # must not shift the names of the classes that follow it.
    classLabel = classes_by_index[int(label_id)]
    print('|%s|%.2f|%.2f|%.2f' % (classLabel.ljust(40, ' '), precision_recall_f1[0], precision_recall_f1[1], precision_recall_f1[2]))

# Micro- and macro-averaged scores over the same set of labels.
precision_score_micro_averaged = metrics.precision_score(expected, predicted, labels=evaluated_labels, average='micro')
precision_score_macro_averaged = metrics.precision_score(expected, predicted, labels=evaluated_labels, average='macro')

recall_score_micro_averaged = metrics.recall_score(expected, predicted, labels=evaluated_labels, average='micro')
recall_score_macro_averaged = metrics.recall_score(expected, predicted, labels=evaluated_labels, average='macro')

f1_score_micro_averaged = metrics.f1_score(expected, predicted, labels=evaluated_labels, average='micro')
f1_score_macro_averaged = metrics.f1_score(expected, predicted, labels=evaluated_labels, average='macro')

print()
print('|Average Type |Prec |Rec |F1')
print('|--- |--- |--- |---')
print('|micro|%.2f|%.2f|%.2f' % (precision_score_micro_averaged, recall_score_micro_averaged, f1_score_micro_averaged))
print('|macro|%.2f|%.2f|%.2f' % (precision_score_macro_averaged, recall_score_macro_averaged, f1_score_macro_averaged))

|Class Label |Pre |Rec |F1
|--- |--- |--- |---
|attualit_ |0.22|0.21|0.21
|attualit__ambiente |0.53|0.62|0.57
|attualit__media |0.54|0.41|0.46
|attualit__politica |0.56|0.59|0.57
|attualit__tech |0.35|0.30|0.32
|economia_business |0.39|0.43|0.41
|economia_finanza |0.75|0.66|0.70
|economia_lavoro |0.73|0.65|0.68
|economia_startup |0.73|0.65|0.69
|gadget_accessori |0.41|0.50|0.45
|gadget_audio_e_tv |0.80|0.79|0.80
|gadget_computer |0.69|0.64|0.66
|gadget_elettrodomestici |0.59|0.21|0.31
|gadget_foto_e_video |0.79|0.78|0.78
|gadget_motori |0.81|0.77|0.79
|gadget_outdoor |0.62|0.59|0.61
|gadget_videogiochi |0.84|0.85|0.85
|internet_regole |0.66|0.28|0.40
|internet_social_network |0.64|0.72|0.67
|internet_tlc |0.60|0.51|0.55
|internet_web |0.50|0.47|0.48
|lifestyle_design |0.49|0.59|0.54
|lifestyle_food |0.74|0.70|0.72
|lifestyle_mobilit_ |0.70|0.70|0.70
|lifestyle_salute |0.43|0.43|0.43
|lifestyle_viaggi |0.57|0.44|0.50
|lol |0.30|0.74|0.43
|mobile_app |0.62|0.60|0.61
|mobile_smartphone |0