{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[16:55:32] DEBUG Fast version of gensim.models.doc2vec is being used\n", "[16:55:32] DEBUG Fast version of Fasttext is being used\n", "[16:55:32] INFO 'pattern' package not found; tag filters are not available for English\n", "[nltk_data] Downloading package punkt to /home/cris/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [ "# Load the \"autoreload\" extension\n", "%load_ext autoreload\n", "\n", "# always reload modules marked with \"%aimport\"\n", "%autoreload 1\n", "\n", "import os\n", "import sys\n", "\n", "import numpy as np\n", "from sklearn.metrics import roc_curve\n", "from sklearn.model_selection import train_test_split\n", "\n", "# add the 'src' directory as one where we can import modules\n", "src_dir = os.path.join(os.getcwd(), os.pardir, 'src')\n", "sys.path.append(src_dir)\n", "\n", "# import my method from the source code\n", "%aimport data.read_data\n", "%aimport models.train_model\n", "%aimport features.build_features\n", "%aimport visualization.visualize\n", "from data.read_data import read_data, get_stopwords\n", "from features.build_features import get_vec, to_categorical, replace_na, to_tfidf, stack_sparse, to_sparse_int, get_fasttext\n", "from models.train_model import split_train, score_function, model_ridge, model_xgb, model_ensembler\n", "from visualization.visualize import plot_roc, plot_scatter" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDreview_contentreview_titlereview_starsproductTarget
00En appelant un acheteur pour demander si l'écr...La Police s'inscrit en acheteur privé sur Pric...52fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5...0
11Alors, là, on a affaire au plus grand Navet ja...Chef D'Oeuvre Absolu en vue...57b56d9d378d9e999d293f301ac43d044cd7b4786d09afb...1
22Effet garanti sur la terrase. Ils donnent immé...Effet garanti sur la terrase. Ils donnent immé...37b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c...0
33tres bon rapport qualite prix tre pratique en ...bon produit477d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88...1
44Ordinateur de bureau trés bien pour quelqu'un ...Apple Power MAC G43f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b...1
\n", "
" ], "text/plain": [ " ID review_content \\\n", "0 0 En appelant un acheteur pour demander si l'écr... \n", "1 1 Alors, là, on a affaire au plus grand Navet ja... \n", "2 2 Effet garanti sur la terrase. Ils donnent immé... \n", "3 3 tres bon rapport qualite prix tre pratique en ... \n", "4 4 Ordinateur de bureau trés bien pour quelqu'un ... \n", "\n", " review_title review_stars \\\n", "0 La Police s'inscrit en acheteur privé sur Pric... 5 \n", "1 Chef D'Oeuvre Absolu en vue... 5 \n", "2 Effet garanti sur la terrase. Ils donnent immé... 3 \n", "3 bon produit 4 \n", "4 Apple Power MAC G4 3 \n", "\n", " product Target \n", "0 2fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5... 0 \n", "1 7b56d9d378d9e999d293f301ac43d044cd7b4786d09afb... 1 \n", "2 7b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c... 0 \n", "3 77d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88... 1 \n", "4 f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b... 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the training frame and the stop-word list; keep the 'Target' column as y.\n", "train = read_data(test=False)\n", "stopwords = get_stopwords()\n", "y = train['Target']\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Feature engineering\n", "# NOTE(review): the helpers come from features.build_features and are not shown here;\n", "# the call order is kept as-is in case any of them touches `train`.\n", "text_columns = ['review_content', 'review_title']\n", "train = replace_na(train, text_columns)\n", "\n", "X_dummies = to_categorical(train, 'review_stars')\n", "X_content = to_tfidf(train, 'review_content', stopwords)\n", "X_title = to_tfidf(train, 'review_title', stopwords)\n", "X_length = to_sparse_int(train, 'review_content')\n", "\n", "# Stack the four feature blocks, in this order, into one matrix.\n", "feature_blocks = [X_dummies, X_content, X_title, X_length]\n", "sparse_merge = stack_sparse(feature_blocks)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[16:55:47] INFO loading projection weights from ../data/external/wiki.fr.bin\n", "[16:55:47] DEBUG {'kw': {}, 'mode': 'rb', 'uri': '../data/external/wiki.fr.bin'}\n", "[16:55:47] DEBUG encoding_wrapper: {'errors': 'strict', 
'encoding': None, 'mode': 'rb', 'fileobj': <_io.BufferedReader name='../data/external/wiki.fr.bin'>}\n", "[16:56:20] INFO loaded (1152449, 300) matrix from ../data/external/wiki.fr.bin\n" ] } ], "source": [ "# Load the fastText model (this cell's log shows ../data/external/wiki.fr.bin being read),\n", "# then pass the raw review text through get_vec together with the stop-word list.\n", "model_fasttext = get_fasttext()\n", "review_texts = train['review_content'].values\n", "xtrain = get_vec(review_texts, model_fasttext, stopwords)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# One call splits all three inputs together, so the sparse features, the\n", "# get_vec output and the labels keep matching rows. 33% is held out; seed fixed at 7.\n", "(X_train_tfv, X_test_tfv,\n", " X_train_ft, X_test_ft,\n", " y_train, y_test) = train_test_split(\n", "    sparse_merge, xtrain, y,\n", "    test_size=0.33, random_state=7,\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[16:57:12] INFO Found 2 classes\n", "[16:57:12] INFO Training Level 0 Fold # 1. Model # 0\n", "[16:58:48] INFO Predicting Level 0. Fold # 1. Model # 0\n", "[16:58:49] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.707828\n", "[16:58:49] INFO Training Level 0 Fold # 2. Model # 0\n", "[17:00:29] INFO Predicting Level 0. Fold # 2. Model # 0\n", "[17:00:30] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.701661\n", "[17:00:30] INFO Training Level 0 Fold # 3. Model # 0\n", "[17:02:09] INFO Predicting Level 0. Fold # 3. Model # 0\n", "[17:02:09] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.708444\n", "[17:02:09] INFO Level 0. Model # 0. Mean Score = 0.705978. Std Dev = 0.003062\n", "[17:02:09] INFO Training Level 0 Fold # 1. Model # 1\n", "[17:02:34] INFO Predicting Level 0. Fold # 1. Model # 1\n", "[17:02:35] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.700502\n", "[17:02:35] INFO Training Level 0 Fold # 2. Model # 1\n", "[17:03:00] INFO Predicting Level 0. Fold # 2. Model # 1\n", "[17:03:02] INFO Level 0. Fold # 2. Model # 1. Validation Score = 0.693814\n", "[17:03:02] INFO Training Level 0 Fold # 3. Model # 1\n", "[17:03:27] INFO Predicting Level 0. Fold # 3. Model # 1\n", "[17:03:29] INFO Level 0. Fold # 3. Model # 1. 
Validation Score = 0.701992\n", "[17:03:29] INFO Level 0. Model # 1. Mean Score = 0.698769. Std Dev = 0.003556\n", "[17:03:29] INFO Saving predictions for level # 0\n", "[17:03:29] INFO Training Level 1 Fold # 1. Model # 0\n", "[17:03:32] INFO Predicting Level 1. Fold # 1. Model # 0\n", "[17:03:32] INFO Level 1. Fold # 1. Model # 0. Validation Score = 0.691023\n", "[17:03:32] INFO Training Level 1 Fold # 2. Model # 0\n", "[17:03:35] INFO Predicting Level 1. Fold # 2. Model # 0\n", "[17:03:35] INFO Level 1. Fold # 2. Model # 0. Validation Score = 0.688373\n", "[17:03:35] INFO Training Level 1 Fold # 3. Model # 0\n", "[17:03:37] INFO Predicting Level 1. Fold # 3. Model # 0\n", "[17:03:37] INFO Level 1. Fold # 3. Model # 0. Validation Score = 0.696277\n", "[17:03:37] INFO Level 1. Model # 0. Mean Score = 0.691891. Std Dev = 0.003284\n", "[17:03:37] INFO Saving predictions for level # 1\n" ] } ], "source": [ "# Fit the ensemble on the training split only; the held-out rows are scored later.\n", "# Per this cell's log: two level-0 models and one level-1 model, three folds each.\n", "# Slow step: the logged timestamps span roughly six and a half minutes.\n", "ens = model_ensembler(X_train_tfv, X_train_ft, y_train)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[17:03:38] INFO Training Fulldata Level 0. Model # 0\n", "[17:05:45] INFO Predicting Test Level 0. Model # 0\n", "[17:05:46] INFO Training Fulldata Level 0. Model # 1\n", "[17:06:23] INFO Predicting Test Level 0. Model # 1\n", "[17:06:27] INFO Training Fulldata Level 1. Model # 0\n", "[17:06:31] INFO Predicting Test Level 1. 
Model # 0\n" ] }, { "data": { "text/plain": [ "0.71845231796402476" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Test inputs keyed by ensemble level: X_test_tfv is listed twice under level 0\n", "# and X_test_ft once under level 1 (the log shows two level-0 models, one level-1).\n", "test_data_dict = {\n", "    0: [X_test_tfv, X_test_tfv],\n", "    1: [X_test_ft],\n", "}\n", "n_test_rows = X_test_ft.shape[0]\n", "preds = ens.predict(test_data_dict, lentest=n_test_rows)\n", "\n", "# Blend the two levels: element-wise mean of column 1 from each level's predictions.\n", "# NOTE(review): column 1 is presumably the positive-class score - confirm in ens.predict.\n", "level0_col1 = preds[0][:, 1]\n", "level1_col1 = preds[1][:, 1]\n", "preds1 = np.mean((level0_col1, level1_col1), axis=0)\n", "\n", "# The last expression is shown as the cell result: the held-out score.\n", "score_function(y_test, preds1)" ] } ], "metadata": { "kernelspec": { "display_name": "cdiscount kernel", "language": "python", "name": "cdiscount" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }