{ "cells": [ { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "# Corpus location (relative to this notebook) and model / tokenizer hyper-parameters.\n", "embedding_dim = 200\n", "corpus_base_dir = os.path.join('..', '..', '..', 'corpora', 'wired_it_20190821')\n", "\n", "vocab_size = 10000\n", "max_length = 1000\n", "trunc_type='post'\n", "padding_type='post'\n", "oov_tok = \"\"\n", "num_epochs = 30\n", "\n", "training_dir = os.path.join(corpus_base_dir, 'training')\n", "test_dir = os.path.join(corpus_base_dir, 'test')\n", "# Class labels; load_class_text_pairs() reads one sub-folder with this name per class.\n", "classes = ['attualit_','attualit__ambiente','attualit__media','attualit__politica','attualit__tech','economia_business','economia_finanza','economia_lavoro','economia_startup','gadget_accessori','gadget_audio_e_tv','gadget_computer','gadget_elettrodomestici','gadget_foto_e_video','gadget_motori','gadget_outdoor','gadget_videogiochi','internet_regole','internet_social_network','internet_tlc','internet_web','lifestyle_design','lifestyle_food','lifestyle_mobilit_','lifestyle_salute','lifestyle_viaggi','lol','mobile_app','mobile_smartphone','mobile_tablet','play_cinema','play_cultura','play_fumetti','play_libri','play_musica','play_tv','scienza','scienza_biotech','scienza_ecologia','scienza_lab','scienza_medicina','scienza_spazio']" ] },
{ "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "def read_file_to_text(file_path):\n", "    \"\"\"Return the content of a UTF-8 text file with every newline replaced by a space.\"\"\"\n", "    with open(file_path, 'r', encoding=\"utf8\") as file:\n", "        return file.read().replace('\\n', ' ')\n", "\n", "\n", "def list_text_files(folder):\n", "    \"\"\"Return the paths of the '.txt' files located directly in `folder`, sorted by file name.\"\"\"\n", "    # os.listdir() gives no ordering guarantee; sorting keeps the sample order reproducible.\n", "    return [os.path.join(folder, file_name) for file_name in sorted(os.listdir(folder)) if file_name.endswith('.txt')]\n", "\n", "\n", "def flatten(l):\n", "    \"\"\"Flatten one nesting level: an iterable of iterables becomes a single flat list.\"\"\"\n", "    return [item for sublist in l for item in sublist]" ] },
{ "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "def load_class_text_pairs(folder):\n", "    \"\"\"Read every document stored under `folder`, grouped by class.\n", "\n", "    `folder` must contain one sub-folder per entry of `classes`.\n", "    Returns one list per class (in `classes` order) of (class_label, text) tuples.\n", "    \"\"\"\n", "    return [\n", "        [(class_label, read_file_to_text(file_path))\n", "         for file_path in list_text_files(os.path.join(folder, class_label))]\n", "        for class_label in classes\n", "    ]" ] },
{ "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "training_classes, training_texts = zip(*flatten(list(load_class_text_pairs(training_dir))))" ] },
{ "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "test_classes, test_texts = zip(*flatten(list(load_class_text_pairs(test_dir))))" ] },
{ "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from functools import reduce\n", "import tensorflow as tf\n", "import numpy as np" ] },
{ "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)" ] },
{ "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "# The vocabulary is built from the training texts only.\n", "tokenizer.fit_on_texts(training_texts)" ] },
{ "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "training_sequences = tokenizer.texts_to_sequences(training_texts)\n", "# Pass truncating explicitly: pad_sequences cuts long texts from the front by default, which would ignore trunc_type.\n", "training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)" ] },
{ "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "test_sequences = tokenizer.texts_to_sequences(test_texts)\n", "# Same padding / truncation settings as the training set.\n", "test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)" ] },
{ "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "model = tf.keras.Sequential([\n", "    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),\n", "    tf.keras.layers.GlobalAveragePooling1D(),\n", "    # One extra output unit: the label ids produced by classes_tokenizer start at 1, so id 0 is never used.\n", "    tf.keras.layers.Dense(len(classes) + 1, activation='softmax')\n", "])" ] },
{ "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_5\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_5 (Embedding) (None, 1000, 200) 2000000 \n", "_________________________________________________________________\n", "global_average_pooling1d_4 ( (None, 200) 0 \n", "_________________________________________________________________\n", "dense_6 (Dense) (None, 43) 8643 \n", "=================================================================\n", "Total params: 2,008,643\n", "Trainable params: 2,008,643\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "# Sparse loss: the targets are integer label ids, not one-hot vectors.\n", "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "model.summary()" ] },
{ "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "# '_' is deliberately left out of `filters`: class names use '_' as a separator and every class name must stay a single, unsplit token.\n", "classes_tokenizer = Tokenizer(filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^`{|}~\\t\\n')\n", "classes_tokenizer.fit_on_texts(classes)\n", "\n", "training_classes_seq = np.array(classes_tokenizer.texts_to_sequences(training_classes))\n", "test_classes_seq = np.array(classes_tokenizer.texts_to_sequences(test_classes))" ] },
{ "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'attualit_': 1, 'attualit__ambiente': 2, 'attualit__media': 3, 'attualit__politica': 4, 'attualit__tech': 5, 'economia_business': 6, 'economia_finanza': 7, 'economia_lavoro': 8, 'economia_startup': 9, 'gadget_accessori': 10, 'gadget_audio_e_tv': 11, 'gadget_computer': 12, 'gadget_elettrodomestici': 13, 'gadget_foto_e_video': 14, 'gadget_motori': 15, 'gadget_outdoor': 16, 'gadget_videogiochi': 17, 'internet_regole': 18, 
'internet_social_network': 19, 'internet_tlc': 20, 'internet_web': 21, 'lifestyle_design': 22, 'lifestyle_food': 23, 'lifestyle_mobilit_': 24, 'lifestyle_salute': 25, 'lifestyle_viaggi': 26, 'lol': 27, 'mobile_app': 28, 'mobile_smartphone': 29, 'mobile_tablet': 30, 'play_cinema': 31, 'play_cultura': 32, 'play_fumetti': 33, 'play_libri': 34, 'play_musica': 35, 'play_tv': 36, 'scienza': 37, 'scienza_biotech': 38, 'scienza_ecologia': 39, 'scienza_lab': 40, 'scienza_medicina': 41, 'scienza_spazio': 42}\n" ] } ], "source": [ "classes_word_index = classes_tokenizer.word_index\n", "print(classes_word_index)\n", "# Reverse lookup table: label id -> class name.\n", "classes_by_index = {label_id: class_name for class_name, label_id in classes_word_index.items()}\n", "# First label id of every training sample (each class name is expected to tokenize to a single id).\n", "training_classes_encoded = [label_ids[0] for label_ids in training_classes_seq]" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 13351 samples, validate on 5709 samples\n", "Epoch 1/30\n", "13351/13351 - 4s - loss: 3.6755 - acc: 0.0517 - val_loss: 3.6100 - val_acc: 0.0855\n", "Epoch 2/30\n", "13351/13351 - 4s - loss: 3.5382 - acc: 0.1020 - val_loss: 3.4647 - val_acc: 0.1450\n", "Epoch 3/30\n", "13351/13351 - 4s - loss: 3.3164 - acc: 0.1935 - val_loss: 3.2081 - val_acc: 0.2091\n", "Epoch 4/30\n", "13351/13351 - 4s - loss: 3.0274 - acc: 0.2819 - val_loss: 2.9484 - val_acc: 0.2859\n", "Epoch 5/30\n", "13351/13351 - 4s - loss: 2.7583 - acc: 0.3591 - val_loss: 2.7162 - val_acc: 0.3614\n", "Epoch 6/30\n", "13351/13351 - 4s - loss: 2.5210 - acc: 0.4245 - val_loss: 2.5182 - val_acc: 0.4122\n", "Epoch 7/30\n", "13351/13351 - 4s - loss: 2.3152 - acc: 0.4742 - val_loss: 2.3549 - val_acc: 0.4449\n", "Epoch 8/30\n", "13351/13351 - 4s - loss: 2.1378 - acc: 0.5104 - val_loss: 2.2150 - val_acc: 0.4771\n", "Epoch 9/30\n", "13351/13351 - 5s - loss: 1.9860 - acc: 0.5459 - val_loss: 2.1024 - val_acc: 0.5024\n", "Epoch 10/30\n", "13351/13351 - 4s - loss: 1.8553 - acc: 
0.5719 - val_loss: 2.0122 - val_acc: 0.4971\n", "Epoch 11/30\n", "13351/13351 - 4s - loss: 1.7406 - acc: 0.5931 - val_loss: 1.9336 - val_acc: 0.5225\n", "Epoch 12/30\n", "13351/13351 - 4s - loss: 1.6384 - acc: 0.6180 - val_loss: 1.8646 - val_acc: 0.5279\n", "Epoch 13/30\n", "13351/13351 - 4s - loss: 1.5480 - acc: 0.6328 - val_loss: 1.8047 - val_acc: 0.5530\n", "Epoch 14/30\n", "13351/13351 - 4s - loss: 1.4645 - acc: 0.6550 - val_loss: 1.7604 - val_acc: 0.5539\n", "Epoch 15/30\n", "13351/13351 - 4s - loss: 1.3898 - acc: 0.6699 - val_loss: 1.7153 - val_acc: 0.5586\n", "Epoch 16/30\n", "13351/13351 - 4s - loss: 1.3191 - acc: 0.6874 - val_loss: 1.6837 - val_acc: 0.5605\n", "Epoch 17/30\n", "13351/13351 - 4s - loss: 1.2527 - acc: 0.7028 - val_loss: 1.6510 - val_acc: 0.5738\n", "Epoch 18/30\n", "13351/13351 - 4s - loss: 1.1915 - acc: 0.7182 - val_loss: 1.6226 - val_acc: 0.5751\n", "Epoch 19/30\n", "13351/13351 - 4s - loss: 1.1320 - acc: 0.7328 - val_loss: 1.6016 - val_acc: 0.5731\n", "Epoch 20/30\n", "13351/13351 - 4s - loss: 1.0761 - acc: 0.7475 - val_loss: 1.5790 - val_acc: 0.5835\n", "Epoch 21/30\n", "13351/13351 - 4s - loss: 1.0235 - acc: 0.7616 - val_loss: 1.5623 - val_acc: 0.5866\n", "Epoch 22/30\n", "13351/13351 - 4s - loss: 0.9718 - acc: 0.7747 - val_loss: 1.5457 - val_acc: 0.5903\n", "Epoch 23/30\n", "13351/13351 - 4s - loss: 0.9213 - acc: 0.7882 - val_loss: 1.5372 - val_acc: 0.5934\n", "Epoch 24/30\n", "13351/13351 - 4s - loss: 0.8760 - acc: 0.8046 - val_loss: 1.5226 - val_acc: 0.5954\n", "Epoch 25/30\n", "13351/13351 - 4s - loss: 0.8304 - acc: 0.8145 - val_loss: 1.5140 - val_acc: 0.5919\n", "Epoch 26/30\n", "13351/13351 - 5s - loss: 0.7861 - acc: 0.8238 - val_loss: 1.5082 - val_acc: 0.5964\n", "Epoch 27/30\n", "13351/13351 - 4s - loss: 0.7437 - acc: 0.8398 - val_loss: 1.4991 - val_acc: 0.5999\n", "Epoch 28/30\n", "13351/13351 - 4s - loss: 0.7039 - acc: 0.8494 - val_loss: 1.4925 - val_acc: 0.6008\n", "Epoch 29/30\n", "13351/13351 - 4s - loss: 0.6656 - acc: 
0.8593 - val_loss: 1.4927 - val_acc: 0.6012\n", "Epoch 30/30\n", "13351/13351 - 4s - loss: 0.6287 - acc: 0.8697 - val_loss: 1.4877 - val_acc: 0.6036\n" ] } ], "source": [ "# NOTE(review): the test split doubles as validation data here, so the val_* figures are not an independent estimate.\n", "history = model.fit(\n", "    training_padded,\n", "    training_classes_seq,\n", "    epochs=num_epochs,\n", "    validation_data=(test_padded, test_classes_seq),\n", "    verbose=2,\n", ")" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5709/5709 [==============================] - 0s 49us/sample\n", "|Class Label |Pre |Rec |F1\n", "|--- |--- |--- |---\n", "|attualit_ |0.22|0.21|0.21\n", "|attualit__ambiente |0.53|0.62|0.57\n", "|attualit__media |0.54|0.41|0.46\n", "|attualit__politica |0.56|0.59|0.57\n", "|attualit__tech |0.35|0.30|0.32\n", "|economia_business |0.39|0.43|0.41\n", "|economia_finanza |0.75|0.66|0.70\n", "|economia_lavoro |0.73|0.65|0.68\n", "|economia_startup |0.73|0.65|0.69\n", "|gadget_accessori |0.41|0.50|0.45\n", "|gadget_audio_e_tv |0.80|0.79|0.80\n", "|gadget_computer |0.69|0.64|0.66\n", "|gadget_elettrodomestici |0.59|0.21|0.31\n", "|gadget_foto_e_video |0.79|0.78|0.78\n", "|gadget_motori |0.81|0.77|0.79\n", "|gadget_outdoor |0.62|0.59|0.61\n", "|gadget_videogiochi |0.84|0.85|0.85\n", "|internet_regole |0.66|0.28|0.40\n", "|internet_social_network |0.64|0.72|0.67\n", "|internet_tlc |0.60|0.51|0.55\n", "|internet_web |0.50|0.47|0.48\n", "|lifestyle_design |0.49|0.59|0.54\n", "|lifestyle_food |0.74|0.70|0.72\n", "|lifestyle_mobilit_ |0.70|0.70|0.70\n", "|lifestyle_salute |0.43|0.43|0.43\n", "|lifestyle_viaggi |0.57|0.44|0.50\n", "|lol |0.30|0.74|0.43\n", "|mobile_app |0.62|0.60|0.61\n", "|mobile_smartphone |0.87|0.80|0.83\n", "|mobile_tablet |0.82|0.76|0.79\n", "|play_cinema |0.74|0.84|0.79\n", "|play_cultura |0.40|0.36|0.38\n", "|play_fumetti |0.90|0.87|0.88\n", "|play_libri |0.75|0.75|0.75\n", "|play_musica |0.68|0.83|0.75\n", "|play_tv |0.84|0.76|0.80\n", "|scienza |0.41|0.27|0.32\n", "|scienza_biotech 
|0.50|0.30|0.37\n", "|scienza_ecologia |0.52|0.59|0.55\n", "|scienza_lab |0.45|0.46|0.46\n", "|scienza_medicina |0.63|0.55|0.58\n", "|scienza_spazio |0.77|0.90|0.83\n", "\n", "|Average Type |Prec |Rec |F1\n", "|--- |--- |--- |---\n", "|micro|0.60|0.60|0.60\n", "|macro|0.62|0.59|0.59\n" ] } ], "source": [
"from sklearn import metrics\n",
"\n",
"# Predicted label id = index of the most probable output unit.\n",
"classes_probabilities = model.predict(test_padded, batch_size=32, verbose=1)\n",
"predicted = np.argmax(classes_probabilities, axis=1).tolist()\n",
"expected = flatten(test_classes_seq)\n",
"# Label ids that actually occur in the test set (sorted ascending); computed once and reused for every metric.\n",
"labels = np.unique(expected)\n",
"\n",
"precision_scores = metrics.precision_score(expected, predicted, labels=labels, average=None)\n",
"recall_scores = metrics.recall_score(expected, predicted, labels=labels, average=None)\n",
"f1_scores = metrics.f1_score(expected, predicted, labels=labels, average=None)\n",
"precision_recall_f1_scores = list(zip(precision_scores, recall_scores, f1_scores))\n",
"\n",
"print('|Class Label'.ljust(40, ' ') + '|Pre |Rec |F1')\n",
"print('|--- |--- |--- |---')\n",
"# Per-class scores are returned in `labels` order, so each row is named after its own label id\n",
"# instead of assuming the ids are exactly 1..N with no class missing from the test set.\n",
"for label, (precision, recall, f1) in zip(labels, precision_recall_f1_scores):\n",
"    classLabel = classes_by_index[int(label)]\n",
"    print('|%s|%.2f|%.2f|%.2f' % (classLabel.ljust(40, ' '), precision, recall, f1))\n",
"\n",
"precision_score_micro_averaged = metrics.precision_score(expected, predicted, labels=labels, average='micro')\n",
"precision_score_macro_averaged = metrics.precision_score(expected, predicted, labels=labels, average='macro')\n",
"\n",
"recall_score_micro_averaged = metrics.recall_score(expected, predicted, labels=labels, average='micro')\n",
"recall_score_macro_averaged = metrics.recall_score(expected, predicted, labels=labels, average='macro')\n",
"\n",
"f1_score_micro_averaged = metrics.f1_score(expected, predicted, labels=labels, average='micro')\n",
"f1_score_macro_averaged = metrics.f1_score(expected, predicted, labels=labels, average='macro')\n",
"\n",
"print()\n",
"print('|Average Type |Prec |Rec |F1')\n",
"print('|--- |--- |--- |---')\n",
"print('|micro|%.2f|%.2f|%.2f' % (precision_score_micro_averaged, recall_score_micro_averaged, f1_score_micro_averaged))\n",
"print('|macro|%.2f|%.2f|%.2f' % (precision_score_macro_averaged, recall_score_macro_averaged, f1_score_macro_averaged))"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }