{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[16:55:32] DEBUG Fast version of gensim.models.doc2vec is being used\n",
      "[16:55:32] DEBUG Fast version of Fasttext is being used\n",
      "[16:55:32] INFO 'pattern' package not found; tag filters are not available for English\n",
      "[nltk_data] Downloading package punkt to /home/cris/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# Load the \"autoreload\" extension\n",
    "%load_ext autoreload\n",
    "\n",
    "# always reload modules marked with \"%aimport\"\n",
    "%autoreload 1\n",
    "\n",
    "import os\n",
    "import sys\n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# add the 'src' directory as one where we can import modules\n",
    "src_dir = os.path.join(os.getcwd(), os.pardir, 'src')\n",
    "sys.path.append(src_dir)\n",
    "\n",
    "# import my method from the source code\n",
    "%aimport data.read_data\n",
    "%aimport models.train_model\n",
    "%aimport features.build_features\n",
    "%aimport visualization.visualize\n",
    "from data.read_data import read_data, get_stopwords\n",
    "from features.build_features import get_vec, to_categorical, replace_na, to_tfidf, stack_sparse, to_sparse_int, get_fasttext\n",
    "from models.train_model import split_train, score_function, model_ridge, model_xgb, model_ensembler\n",
    "from visualization.visualize import plot_roc, plot_scatter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>review_content</th>\n",
       "      <th>review_title</th>\n",
       "      <th>review_stars</th>\n",
       "      <th>product</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>En appelant un acheteur pour demander si l'écr...</td>\n",
       "      <td>La Police s'inscrit en acheteur privé sur Pric...</td>\n",
       "      <td>5</td>\n",
       "      <td>2fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Alors, là, on a affaire au plus grand Navet ja...</td>\n",
       "      <td>Chef D'Oeuvre Absolu en vue...</td>\n",
       "      <td>5</td>\n",
       "      <td>7b56d9d378d9e999d293f301ac43d044cd7b4786d09afb...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Effet garanti sur la terrase. Ils donnent immé...</td>\n",
       "      <td>Effet garanti sur la terrase. Ils donnent immé...</td>\n",
       "      <td>3</td>\n",
       "      <td>7b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>tres bon rapport qualite prix tre pratique en ...</td>\n",
       "      <td>bon produit</td>\n",
       "      <td>4</td>\n",
       "      <td>77d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>Ordinateur de bureau trés bien pour quelqu'un ...</td>\n",
       "      <td>Apple Power MAC G4</td>\n",
       "      <td>3</td>\n",
       "      <td>f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID                                     review_content  \\\n",
       "0   0  En appelant un acheteur pour demander si l'écr...   \n",
       "1   1  Alors, là, on a affaire au plus grand Navet ja...   \n",
       "2   2  Effet garanti sur la terrase. Ils donnent immé...   \n",
       "3   3  tres bon rapport qualite prix tre pratique en ...   \n",
       "4   4  Ordinateur de bureau trés bien pour quelqu'un ...   \n",
       "\n",
       "                                        review_title  review_stars  \\\n",
       "0  La Police s'inscrit en acheteur privé sur Pric...             5   \n",
       "1                     Chef D'Oeuvre Absolu en vue...             5   \n",
       "2  Effet garanti sur la terrase. Ils donnent immé...             3   \n",
       "3                                        bon produit             4   \n",
       "4                                 Apple Power MAC G4             3   \n",
       "\n",
       "                                             product  Target  \n",
       "0  2fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5...       0  \n",
       "1  7b56d9d378d9e999d293f301ac43d044cd7b4786d09afb...       1  \n",
       "2  7b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c...       0  \n",
       "3  77d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88...       1  \n",
       "4  f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b...       1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = read_data(test=False)\n",
    "y = train['Target']\n",
    "stopwords = get_stopwords()\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Feature engineering\n",
    "train = replace_na(train, ['review_content', 'review_title'])\n",
    "X_dummies = to_categorical(train, 'review_stars')\n",
    "X_content = to_tfidf(train, 'review_content', stopwords)\n",
    "X_title = to_tfidf(train, 'review_title', stopwords)\n",
    "X_length = to_sparse_int(train, 'review_content')\n",
    "\n",
    "sparse_merge = stack_sparse([X_dummies, X_content, X_title, X_length])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[16:55:47] INFO loading projection weights from ../data/external/wiki.fr.bin\n",
      "[16:55:47] DEBUG {'kw': {}, 'mode': 'rb', 'uri': '../data/external/wiki.fr.bin'}\n",
      "[16:55:47] DEBUG encoding_wrapper: {'errors': 'strict', 'encoding': None, 'mode': 'rb', 'fileobj': <_io.BufferedReader name='../data/external/wiki.fr.bin'>}\n",
      "[16:56:20] INFO loaded (1152449, 300) matrix from ../data/external/wiki.fr.bin\n"
     ]
    }
   ],
   "source": [
    "model_fasttext = get_fasttext()\n",
    "xtrain = get_vec(train['review_content'].values, model_fasttext, stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train_tfv, X_test_tfv, X_train_ft, X_test_ft, y_train, y_test = train_test_split(sparse_merge, xtrain, y, test_size=0.33, random_state=7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[16:57:12] INFO Found 2 classes\n",
      "[16:57:12] INFO Training Level 0 Fold # 1. Model # 0\n",
      "[16:58:48] INFO Predicting Level 0. Fold # 1. Model # 0\n",
      "[16:58:49] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.707828\n",
      "[16:58:49] INFO Training Level 0 Fold # 2. Model # 0\n",
      "[17:00:29] INFO Predicting Level 0. Fold # 2. Model # 0\n",
      "[17:00:30] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.701661\n",
      "[17:00:30] INFO Training Level 0 Fold # 3. Model # 0\n",
      "[17:02:09] INFO Predicting Level 0. Fold # 3. Model # 0\n",
      "[17:02:09] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.708444\n",
      "[17:02:09] INFO Level 0. Model # 0. Mean Score = 0.705978. Std Dev = 0.003062\n",
      "[17:02:09] INFO Training Level 0 Fold # 1. Model # 1\n",
      "[17:02:34] INFO Predicting Level 0. Fold # 1. Model # 1\n",
      "[17:02:35] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.700502\n",
      "[17:02:35] INFO Training Level 0 Fold # 2. Model # 1\n",
      "[17:03:00] INFO Predicting Level 0. Fold # 2. Model # 1\n",
      "[17:03:02] INFO Level 0. Fold # 2. Model # 1. Validation Score = 0.693814\n",
      "[17:03:02] INFO Training Level 0 Fold # 3. Model # 1\n",
      "[17:03:27] INFO Predicting Level 0. Fold # 3. Model # 1\n",
      "[17:03:29] INFO Level 0. Fold # 3. Model # 1. Validation Score = 0.701992\n",
      "[17:03:29] INFO Level 0. Model # 1. Mean Score = 0.698769. Std Dev = 0.003556\n",
      "[17:03:29] INFO Saving predictions for level # 0\n",
      "[17:03:29] INFO Training Level 1 Fold # 1. Model # 0\n",
      "[17:03:32] INFO Predicting Level 1. Fold # 1. Model # 0\n",
      "[17:03:32] INFO Level 1. Fold # 1. Model # 0. Validation Score = 0.691023\n",
      "[17:03:32] INFO Training Level 1 Fold # 2. Model # 0\n",
      "[17:03:35] INFO Predicting Level 1. Fold # 2. Model # 0\n",
      "[17:03:35] INFO Level 1. Fold # 2. Model # 0. Validation Score = 0.688373\n",
      "[17:03:35] INFO Training Level 1 Fold # 3. Model # 0\n",
      "[17:03:37] INFO Predicting Level 1. Fold # 3. Model # 0\n",
      "[17:03:37] INFO Level 1. Fold # 3. Model # 0. Validation Score = 0.696277\n",
      "[17:03:37] INFO Level 1. Model # 0. Mean Score = 0.691891. Std Dev = 0.003284\n",
      "[17:03:37] INFO Saving predictions for level # 1\n"
     ]
    }
   ],
   "source": [
    "ens = model_ensembler(X_train_tfv, X_train_ft, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[17:03:38] INFO Training Fulldata Level 0. Model # 0\n",
      "[17:05:45] INFO Predicting Test Level 0. Model # 0\n",
      "[17:05:46] INFO Training Fulldata Level 0. Model # 1\n",
      "[17:06:23] INFO Predicting Test Level 0. Model # 1\n",
      "[17:06:27] INFO Training Fulldata Level 1. Model # 0\n",
      "[17:06:31] INFO Predicting Test Level 1. Model # 0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.71845231796402476"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data_dict = {0: [X_test_tfv, X_test_tfv], 1: [X_test_ft]}\n",
    "preds = ens.predict(test_data_dict, lentest=X_test_ft.shape[0])\n",
    "preds1 = np.mean((preds[0][:,1], preds[1][:,1]),axis=0)\n",
    "score_function(y_test, preds1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cdiscount kernel",
   "language": "python",
   "name": "cdiscount"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}