{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ML Pipeline Preparation\n",
    "Follow the instructions below to help you create your ML pipeline.\n",
    "### 1. Import libraries and load data from database.\n",
    "- Import Python libraries\n",
    "- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)\n",
    "- Define feature and target variables X and Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "from sqlalchemy import create_engine\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import NLP libraries\n",
    "import re\n",
    "import nltk \n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "# nltk.download('punkt')\n",
    "# nltk.download('stopwords')\n",
    "# nltk.download('wordnet') # download for lemmatization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import sklearn\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.multioutput import MultiOutputClassifier\n",
    "from sklearn.metrics import precision_score, recall_score, f1_score\n",
    "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Open the SQLite database file and read the DisasterResponse table into a DataFrame\n",
    "engine = create_engine('sqlite:///data/DisasterResponse.db')\n",
    "df = pd.read_sql_table(table_name='DisasterResponse', con=engine)\n",
    "\n",
    "# X: the message text; Y: every column left after dropping the four listed ones\n",
    "X = df.loc[:, 'message']\n",
    "Y = df.drop(columns=['id', 'message', 'original', 'genre'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Write a tokenization function to process your text data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Matches http/https URLs. NOTE: keep whitespace out of the character classes;\n",
    "# with a space in there a match runs past the end of the URL and swallows the\n",
    "# words that follow it.\n",
    "URL_PATTERN = re.compile(\n",
    "    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'\n",
    ")\n",
    "\n",
    "# Built once rather than on every tokenize() call; a frozenset gives\n",
    "# constant-time membership tests.\n",
    "STOPWORDS = frozenset(stopwords.words('english'))\n",
    "\n",
    "\n",
    "def tokenize(text):\n",
    "    \"\"\"Turn a raw message into a list of cleaned, lemmatized tokens.\n",
    "\n",
    "    Steps: replace every URL with the literal token 'urlplaceholder', split the\n",
    "    text with NLTK's word_tokenize, lower-case / strip / lemmatize each token,\n",
    "    then drop English stopwords.\n",
    "\n",
    "    Args:\n",
    "        text (str): the message text to tokenize.\n",
    "\n",
    "    Returns:\n",
    "        list of str: the cleaned tokens, in their original order.\n",
    "    \"\"\"\n",
    "    # Detect and replace urls\n",
    "    text = URL_PATTERN.sub('urlplaceholder', text)\n",
    "\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    clean_tokens = []\n",
    "    for tok in word_tokenize(text):\n",
    "        # Lower-case BEFORE lemmatizing: WordNet lemmas are stored in lower case,\n",
    "        # so a capitalised token such as 'Houses' would otherwise come back unchanged.\n",
    "        token = lemmatizer.lemmatize(tok.lower().strip())\n",
    "        if token not in STOPWORDS:\n",
    "            clean_tokens.append(token)\n",
    "\n",
    "    return clean_tokens"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Build a machine learning pipeline\n",
    "- You'll find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def build_pipeline(n_estimators=100, n_jobs=6, random_state=None):\n",
    "    \"\"\"Build the (unfitted) text-classification pipeline.\n",
    "\n",
    "    Steps, in order:\n",
    "      'vec'   - CountVectorizer using the notebook's `tokenize` function\n",
    "      'tfidf' - TfidfTransformer\n",
    "      'clf'   - MultiOutputClassifier wrapping a RandomForestClassifier\n",
    "\n",
    "    The step names are part of the interface: grid-search parameter keys such\n",
    "    as 'clf__estimator__n_estimators' refer to them.\n",
    "\n",
    "    Args:\n",
    "        n_estimators (int): number of trees in each random forest.\n",
    "        n_jobs (int): forwarded to RandomForestClassifier's `n_jobs`.\n",
    "        random_state (int or None): forwarded to RandomForestClassifier; pass an\n",
    "            int for repeatable forests, None keeps them unseeded.\n",
    "\n",
    "    Returns:\n",
    "        sklearn.pipeline.Pipeline: the assembled pipeline.\n",
    "    \"\"\"\n",
    "    forest = RandomForestClassifier(n_estimators=n_estimators,\n",
    "                                    n_jobs=n_jobs,\n",
    "                                    random_state=random_state)\n",
    "\n",
    "    # count words -> tf-idf -> one forest per output column\n",
    "    return Pipeline([\n",
    "        ('vec', CountVectorizer(tokenizer=tokenize)),\n",
    "        ('tfidf', TfidfTransformer()),\n",
    "        ('clf', MultiOutputClassifier(forest))\n",
    "    ])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Train pipeline\n",
    "- Split data into train and test sets\n",
    "- Train pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip_..._score=False, random_state=None, verbose=0,\n",
       "            warm_start=False),\n",
       "           n_jobs=None))])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Hold out a test set (train_test_split's default split size). The fixed\n",
    "# random_state makes the split repeatable across kernel restarts.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)\n",
    "pipeline = build_pipeline()\n",
    "pipeline.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. Test your model\n",
    "Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def build_report(pipeline, X_test, y_test, average='macro'):\n",
    "    \"\"\"Score a fitted model column by column on held-out data.\n",
    "\n",
    "    Args:\n",
    "        pipeline: fitted estimator exposing `predict` (a Pipeline or a fitted\n",
    "            GridSearchCV both work).\n",
    "        X_test: inputs to predict on.\n",
    "        y_test (pandas.DataFrame): true labels, one column per output category.\n",
    "        average (str): averaging mode handed to f1_score / precision_score /\n",
    "            recall_score. Defaults to 'macro'. With 'micro' on a single-label\n",
    "            column all three metrics collapse to plain accuracy, so the three\n",
    "            report columns would be identical; pass average='micro' explicitly\n",
    "            if that is really what you want.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame: indexed by y_test's column names, with the columns\n",
    "        'f1 score', 'precision' and 'recall'.\n",
    "    \"\"\"\n",
    "    # predict on the X_test; column i of y_pred is compared with column i of y_test\n",
    "    y_pred = pipeline.predict(X_test)\n",
    "\n",
    "    scorers = (('f1 score', f1_score),\n",
    "               ('precision', precision_score),\n",
    "               ('recall', recall_score))\n",
    "\n",
    "    # one row of metrics per output column\n",
    "    rows = []\n",
    "    for i in range(len(y_test.columns)):\n",
    "        y_true_col = y_test.iloc[:, i].values\n",
    "        y_pred_col = y_pred[:, i]\n",
    "        rows.append([scorer(y_true_col, y_pred_col, average=average)\n",
    "                     for _, scorer in scorers])\n",
    "\n",
    "    # build dataframe\n",
    "    return pd.DataFrame(rows,\n",
    "                        columns=[label for label, _ in scorers],\n",
    "                        index=y_test.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>f1 score</th>\n",
       "      <th>precision</th>\n",
       "      <th>recall</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>related</th>\n",
       "      <td>0.801648</td>\n",
       "      <td>0.801648</td>\n",
       "      <td>0.801648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>request</th>\n",
       "      <td>0.894873</td>\n",
       "      <td>0.894873</td>\n",
       "      <td>0.894873</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>offer</th>\n",
       "      <td>0.995880</td>\n",
       "      <td>0.995880</td>\n",
       "      <td>0.995880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aid_related</th>\n",
       "      <td>0.777388</td>\n",
       "      <td>0.777388</td>\n",
       "      <td>0.777388</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>medical_help</th>\n",
       "      <td>0.920507</td>\n",
       "      <td>0.920507</td>\n",
       "      <td>0.920507</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>medical_products</th>\n",
       "      <td>0.956057</td>\n",
       "      <td>0.956057</td>\n",
       "      <td>0.956057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>search_and_rescue</th>\n",
       "      <td>0.971773</td>\n",
       "      <td>0.971773</td>\n",
       "      <td>0.971773</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>security</th>\n",
       "      <td>0.982301</td>\n",
       "      <td>0.982301</td>\n",
       "      <td>0.982301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>military</th>\n",
       "      <td>0.968111</td>\n",
       "      <td>0.968111</td>\n",
       "      <td>0.968111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>child_alone</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>water</th>\n",
       "      <td>0.958193</td>\n",
       "      <td>0.958193</td>\n",
       "      <td>0.958193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>food</th>\n",
       "      <td>0.940189</td>\n",
       "      <td>0.940189</td>\n",
       "      <td>0.940189</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shelter</th>\n",
       "      <td>0.935002</td>\n",
       "      <td>0.935002</td>\n",
       "      <td>0.935002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>clothing</th>\n",
       "      <td>0.986726</td>\n",
       "      <td>0.986726</td>\n",
       "      <td>0.986726</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>money</th>\n",
       "      <td>0.978639</td>\n",
       "      <td>0.978639</td>\n",
       "      <td>0.978639</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>missing_people</th>\n",
       "      <td>0.989930</td>\n",
       "      <td>0.989930</td>\n",
       "      <td>0.989930</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>refugees</th>\n",
       "      <td>0.968721</td>\n",
       "      <td>0.968721</td>\n",
       "      <td>0.968721</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>death</th>\n",
       "      <td>0.962161</td>\n",
       "      <td>0.962161</td>\n",
       "      <td>0.962161</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_aid</th>\n",
       "      <td>0.871224</td>\n",
       "      <td>0.871224</td>\n",
       "      <td>0.871224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>infrastructure_related</th>\n",
       "      <td>0.934696</td>\n",
       "      <td>0.934696</td>\n",
       "      <td>0.934696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>transport</th>\n",
       "      <td>0.953769</td>\n",
       "      <td>0.953769</td>\n",
       "      <td>0.953769</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>buildings</th>\n",
       "      <td>0.951785</td>\n",
       "      <td>0.951785</td>\n",
       "      <td>0.951785</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>electricity</th>\n",
       "      <td>0.981233</td>\n",
       "      <td>0.981233</td>\n",
       "      <td>0.981233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tools</th>\n",
       "      <td>0.994812</td>\n",
       "      <td>0.994812</td>\n",
       "      <td>0.994812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hospitals</th>\n",
       "      <td>0.990540</td>\n",
       "      <td>0.990540</td>\n",
       "      <td>0.990540</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shops</th>\n",
       "      <td>0.995270</td>\n",
       "      <td>0.995270</td>\n",
       "      <td>0.995270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aid_centers</th>\n",
       "      <td>0.987946</td>\n",
       "      <td>0.987946</td>\n",
       "      <td>0.987946</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_infrastructure</th>\n",
       "      <td>0.955600</td>\n",
       "      <td>0.955600</td>\n",
       "      <td>0.955600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weather_related</th>\n",
       "      <td>0.878700</td>\n",
       "      <td>0.878700</td>\n",
       "      <td>0.878700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>floods</th>\n",
       "      <td>0.949954</td>\n",
       "      <td>0.949954</td>\n",
       "      <td>0.949954</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>storm</th>\n",
       "      <td>0.937290</td>\n",
       "      <td>0.937290</td>\n",
       "      <td>0.937290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fire</th>\n",
       "      <td>0.991456</td>\n",
       "      <td>0.991456</td>\n",
       "      <td>0.991456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>earthquake</th>\n",
       "      <td>0.971010</td>\n",
       "      <td>0.971010</td>\n",
       "      <td>0.971010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cold</th>\n",
       "      <td>0.980623</td>\n",
       "      <td>0.980623</td>\n",
       "      <td>0.980623</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_weather</th>\n",
       "      <td>0.946903</td>\n",
       "      <td>0.946903</td>\n",
       "      <td>0.946903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>direct_report</th>\n",
       "      <td>0.867562</td>\n",
       "      <td>0.867562</td>\n",
       "      <td>0.867562</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        f1 score  precision    recall\n",
       "related                 0.801648   0.801648  0.801648\n",
       "request                 0.894873   0.894873  0.894873\n",
       "offer                   0.995880   0.995880  0.995880\n",
       "aid_related             0.777388   0.777388  0.777388\n",
       "medical_help            0.920507   0.920507  0.920507\n",
       "medical_products        0.956057   0.956057  0.956057\n",
       "search_and_rescue       0.971773   0.971773  0.971773\n",
       "security                0.982301   0.982301  0.982301\n",
       "military                0.968111   0.968111  0.968111\n",
       "child_alone             1.000000   1.000000  1.000000\n",
       "water                   0.958193   0.958193  0.958193\n",
       "food                    0.940189   0.940189  0.940189\n",
       "shelter                 0.935002   0.935002  0.935002\n",
       "clothing                0.986726   0.986726  0.986726\n",
       "money                   0.978639   0.978639  0.978639\n",
       "missing_people          0.989930   0.989930  0.989930\n",
       "refugees                0.968721   0.968721  0.968721\n",
       "death                   0.962161   0.962161  0.962161\n",
       "other_aid               0.871224   0.871224  0.871224\n",
       "infrastructure_related  0.934696   0.934696  0.934696\n",
       "transport               0.953769   0.953769  0.953769\n",
       "buildings               0.951785   0.951785  0.951785\n",
       "electricity             0.981233   0.981233  0.981233\n",
       "tools                   0.994812   0.994812  0.994812\n",
       "hospitals               0.990540   0.990540  0.990540\n",
       "shops                   0.995270   0.995270  0.995270\n",
       "aid_centers             0.987946   0.987946  0.987946\n",
       "other_infrastructure    0.955600   0.955600  0.955600\n",
       "weather_related         0.878700   0.878700  0.878700\n",
       "floods                  0.949954   0.949954  0.949954\n",
       "storm                   0.937290   0.937290  0.937290\n",
       "fire                    0.991456   0.991456  0.991456\n",
       "earthquake              0.971010   0.971010  0.971010\n",
       "cold                    0.980623   0.980623  0.980623\n",
       "other_weather           0.946903   0.946903  0.946903\n",
       "direct_report           0.867562   0.867562  0.867562"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-category scores of the baseline pipeline on the held-out test split\n",
    "build_report(pipeline, X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6. Improve your model\n",
    "Use grid search to find better parameters. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
       "       estimator=Pipeline(memory=None,\n",
       "     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip_..._score=False, random_state=None, verbose=0,\n",
       "            warm_start=False),\n",
       "           n_jobs=None))]),\n",
       "       fit_params=None, iid='warn', n_jobs=6,\n",
       "       param_grid={'clf__estimator__max_features': ['sqrt', 0.5], 'clf__estimator__n_estimators': [50, 100]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring=None, verbose=0)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Hyper-parameter grid: 2 x 2 = 4 candidate settings for the random forest that\n",
    "# MultiOutputClassifier wraps in the pipeline's 'clf' step.\n",
    "parameters = {'clf__estimator__max_features':['sqrt', 0.5],\n",
    "              'clf__estimator__n_estimators':[50, 100]}\n",
    "\n",
    "# 5-fold cross-validated search over the grid. No `scoring` is passed, so the\n",
    "# pipeline's own default `score` method ranks the candidates.\n",
    "# NOTE(review): the forest created in build_pipeline() also uses n_jobs = 6, so\n",
    "# this search may oversubscribe the CPU - confirm and consider lowering one of them.\n",
    "cv = GridSearchCV(estimator=pipeline, param_grid = parameters, cv = 5, n_jobs = 6)\n",
    "cv.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7. Test your model\n",
    "Show the accuracy, precision, and recall of the tuned model.  \n",
    "\n",
    "Since this project focuses on code quality, process, and pipelines, there is no minimum performance metric needed to pass. However, make sure to fine-tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>f1 score</th>\n",
       "      <th>precision</th>\n",
       "      <th>recall</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>related</th>\n",
       "      <td>0.801953</td>\n",
       "      <td>0.801953</td>\n",
       "      <td>0.801953</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>request</th>\n",
       "      <td>0.888007</td>\n",
       "      <td>0.888007</td>\n",
       "      <td>0.888007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>offer</th>\n",
       "      <td>0.995270</td>\n",
       "      <td>0.995270</td>\n",
       "      <td>0.995270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aid_related</th>\n",
       "      <td>0.765334</td>\n",
       "      <td>0.765334</td>\n",
       "      <td>0.765334</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>medical_help</th>\n",
       "      <td>0.920659</td>\n",
       "      <td>0.920659</td>\n",
       "      <td>0.920659</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>medical_products</th>\n",
       "      <td>0.962313</td>\n",
       "      <td>0.962313</td>\n",
       "      <td>0.962313</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>search_and_rescue</th>\n",
       "      <td>0.970552</td>\n",
       "      <td>0.970552</td>\n",
       "      <td>0.970552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>security</th>\n",
       "      <td>0.978944</td>\n",
       "      <td>0.978944</td>\n",
       "      <td>0.978944</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>military</th>\n",
       "      <td>0.966890</td>\n",
       "      <td>0.966890</td>\n",
       "      <td>0.966890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>child_alone</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>water</th>\n",
       "      <td>0.966280</td>\n",
       "      <td>0.966280</td>\n",
       "      <td>0.966280</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>food</th>\n",
       "      <td>0.951480</td>\n",
       "      <td>0.951480</td>\n",
       "      <td>0.951480</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shelter</th>\n",
       "      <td>0.948581</td>\n",
       "      <td>0.948581</td>\n",
       "      <td>0.948581</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>clothing</th>\n",
       "      <td>0.989014</td>\n",
       "      <td>0.989014</td>\n",
       "      <td>0.989014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>money</th>\n",
       "      <td>0.978486</td>\n",
       "      <td>0.978486</td>\n",
       "      <td>0.978486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>missing_people</th>\n",
       "      <td>0.990388</td>\n",
       "      <td>0.990388</td>\n",
       "      <td>0.990388</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>refugees</th>\n",
       "      <td>0.971620</td>\n",
       "      <td>0.971620</td>\n",
       "      <td>0.971620</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>death</th>\n",
       "      <td>0.973604</td>\n",
       "      <td>0.973604</td>\n",
       "      <td>0.973604</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_aid</th>\n",
       "      <td>0.868630</td>\n",
       "      <td>0.868630</td>\n",
       "      <td>0.868630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>infrastructure_related</th>\n",
       "      <td>0.929661</td>\n",
       "      <td>0.929661</td>\n",
       "      <td>0.929661</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>transport</th>\n",
       "      <td>0.954379</td>\n",
       "      <td>0.954379</td>\n",
       "      <td>0.954379</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>buildings</th>\n",
       "      <td>0.956363</td>\n",
       "      <td>0.956363</td>\n",
       "      <td>0.956363</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>electricity</th>\n",
       "      <td>0.979707</td>\n",
       "      <td>0.979707</td>\n",
       "      <td>0.979707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tools</th>\n",
       "      <td>0.993592</td>\n",
       "      <td>0.993592</td>\n",
       "      <td>0.993592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hospitals</th>\n",
       "      <td>0.988404</td>\n",
       "      <td>0.988404</td>\n",
       "      <td>0.988404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shops</th>\n",
       "      <td>0.994355</td>\n",
       "      <td>0.994355</td>\n",
       "      <td>0.994355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aid_centers</th>\n",
       "      <td>0.987794</td>\n",
       "      <td>0.987794</td>\n",
       "      <td>0.987794</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_infrastructure</th>\n",
       "      <td>0.952395</td>\n",
       "      <td>0.952395</td>\n",
       "      <td>0.952395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weather_related</th>\n",
       "      <td>0.880684</td>\n",
       "      <td>0.880684</td>\n",
       "      <td>0.880684</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>floods</th>\n",
       "      <td>0.955752</td>\n",
       "      <td>0.955752</td>\n",
       "      <td>0.955752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>storm</th>\n",
       "      <td>0.945682</td>\n",
       "      <td>0.945682</td>\n",
       "      <td>0.945682</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fire</th>\n",
       "      <td>0.991913</td>\n",
       "      <td>0.991913</td>\n",
       "      <td>0.991913</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>earthquake</th>\n",
       "      <td>0.973146</td>\n",
       "      <td>0.973146</td>\n",
       "      <td>0.973146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cold</th>\n",
       "      <td>0.983064</td>\n",
       "      <td>0.983064</td>\n",
       "      <td>0.983064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_weather</th>\n",
       "      <td>0.941105</td>\n",
       "      <td>0.941105</td>\n",
       "      <td>0.941105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>direct_report</th>\n",
       "      <td>0.856424</td>\n",
       "      <td>0.856424</td>\n",
       "      <td>0.856424</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        f1 score  precision    recall\n",
       "related                 0.801953   0.801953  0.801953\n",
       "request                 0.888007   0.888007  0.888007\n",
       "offer                   0.995270   0.995270  0.995270\n",
       "aid_related             0.765334   0.765334  0.765334\n",
       "medical_help            0.920659   0.920659  0.920659\n",
       "medical_products        0.962313   0.962313  0.962313\n",
       "search_and_rescue       0.970552   0.970552  0.970552\n",
       "security                0.978944   0.978944  0.978944\n",
       "military                0.966890   0.966890  0.966890\n",
       "child_alone             1.000000   1.000000  1.000000\n",
       "water                   0.966280   0.966280  0.966280\n",
       "food                    0.951480   0.951480  0.951480\n",
       "shelter                 0.948581   0.948581  0.948581\n",
       "clothing                0.989014   0.989014  0.989014\n",
       "money                   0.978486   0.978486  0.978486\n",
       "missing_people          0.990388   0.990388  0.990388\n",
       "refugees                0.971620   0.971620  0.971620\n",
       "death                   0.973604   0.973604  0.973604\n",
       "other_aid               0.868630   0.868630  0.868630\n",
       "infrastructure_related  0.929661   0.929661  0.929661\n",
       "transport               0.954379   0.954379  0.954379\n",
       "buildings               0.956363   0.956363  0.956363\n",
       "electricity             0.979707   0.979707  0.979707\n",
       "tools                   0.993592   0.993592  0.993592\n",
       "hospitals               0.988404   0.988404  0.988404\n",
       "shops                   0.994355   0.994355  0.994355\n",
       "aid_centers             0.987794   0.987794  0.987794\n",
       "other_infrastructure    0.952395   0.952395  0.952395\n",
       "weather_related         0.880684   0.880684  0.880684\n",
       "floods                  0.955752   0.955752  0.955752\n",
       "storm                   0.945682   0.945682  0.945682\n",
       "fire                    0.991913   0.991913  0.991913\n",
       "earthquake              0.973146   0.973146  0.973146\n",
       "cold                    0.983064   0.983064  0.983064\n",
       "other_weather           0.941105   0.941105  0.941105\n",
       "direct_report           0.856424   0.856424  0.856424"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same per-category report for the grid-searched model; predictions come from\n",
    "# the fitted GridSearchCV object `cv`\n",
    "build_report(cv, X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'clf__estimator__max_features': 0.5, 'clf__estimator__n_estimators': 100}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parameter combination from the grid that the search selected as best\n",
    "cv.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 8. Try improving your model further. Here are a few ideas:\n",
    "* try other machine learning algorithms\n",
    "* add other features besides the TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>f1 score</th>\n",
       "      <th>precision</th>\n",
       "      <th>recall</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>related</th>\n",
       "      <td>0.762893</td>\n",
       "      <td>0.762893</td>\n",
       "      <td>0.762893</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>request</th>\n",
       "      <td>0.892127</td>\n",
       "      <td>0.892127</td>\n",
       "      <td>0.892127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>offer</th>\n",
       "      <td>0.994049</td>\n",
       "      <td>0.994049</td>\n",
       "      <td>0.994049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aid_related</th>\n",
       "      <td>0.767318</td>\n",
       "      <td>0.767318</td>\n",
       "      <td>0.767318</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>medical_help</th>\n",
       "      <td>0.923711</td>\n",
       "      <td>0.923711</td>\n",
       "      <td>0.923711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>medical_products</th>\n",
       "      <td>0.961398</td>\n",
       "      <td>0.961398</td>\n",
       "      <td>0.961398</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>search_and_rescue</th>\n",
       "      <td>0.970705</td>\n",
       "      <td>0.970705</td>\n",
       "      <td>0.970705</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>security</th>\n",
       "      <td>0.977876</td>\n",
       "      <td>0.977876</td>\n",
       "      <td>0.977876</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>military</th>\n",
       "      <td>0.971468</td>\n",
       "      <td>0.971468</td>\n",
       "      <td>0.971468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>child_alone</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>water</th>\n",
       "      <td>0.963381</td>\n",
       "      <td>0.963381</td>\n",
       "      <td>0.963381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>food</th>\n",
       "      <td>0.946903</td>\n",
       "      <td>0.946903</td>\n",
       "      <td>0.946903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shelter</th>\n",
       "      <td>0.941868</td>\n",
       "      <td>0.941868</td>\n",
       "      <td>0.941868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>clothing</th>\n",
       "      <td>0.987946</td>\n",
       "      <td>0.987946</td>\n",
       "      <td>0.987946</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>money</th>\n",
       "      <td>0.977418</td>\n",
       "      <td>0.977418</td>\n",
       "      <td>0.977418</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>missing_people</th>\n",
       "      <td>0.989319</td>\n",
       "      <td>0.989319</td>\n",
       "      <td>0.989319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>refugees</th>\n",
       "      <td>0.969484</td>\n",
       "      <td>0.969484</td>\n",
       "      <td>0.969484</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>death</th>\n",
       "      <td>0.968721</td>\n",
       "      <td>0.968721</td>\n",
       "      <td>0.968721</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_aid</th>\n",
       "      <td>0.868782</td>\n",
       "      <td>0.868782</td>\n",
       "      <td>0.868782</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>infrastructure_related</th>\n",
       "      <td>0.928746</td>\n",
       "      <td>0.928746</td>\n",
       "      <td>0.928746</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>transport</th>\n",
       "      <td>0.955447</td>\n",
       "      <td>0.955447</td>\n",
       "      <td>0.955447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>buildings</th>\n",
       "      <td>0.954837</td>\n",
       "      <td>0.954837</td>\n",
       "      <td>0.954837</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>electricity</th>\n",
       "      <td>0.980928</td>\n",
       "      <td>0.980928</td>\n",
       "      <td>0.980928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tools</th>\n",
       "      <td>0.993592</td>\n",
       "      <td>0.993592</td>\n",
       "      <td>0.993592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hospitals</th>\n",
       "      <td>0.987489</td>\n",
       "      <td>0.987489</td>\n",
       "      <td>0.987489</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shops</th>\n",
       "      <td>0.994049</td>\n",
       "      <td>0.994049</td>\n",
       "      <td>0.994049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aid_centers</th>\n",
       "      <td>0.986726</td>\n",
       "      <td>0.986726</td>\n",
       "      <td>0.986726</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_infrastructure</th>\n",
       "      <td>0.951938</td>\n",
       "      <td>0.951938</td>\n",
       "      <td>0.951938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weather_related</th>\n",
       "      <td>0.876259</td>\n",
       "      <td>0.876259</td>\n",
       "      <td>0.876259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>floods</th>\n",
       "      <td>0.953616</td>\n",
       "      <td>0.953616</td>\n",
       "      <td>0.953616</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>storm</th>\n",
       "      <td>0.938969</td>\n",
       "      <td>0.938969</td>\n",
       "      <td>0.938969</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fire</th>\n",
       "      <td>0.991150</td>\n",
       "      <td>0.991150</td>\n",
       "      <td>0.991150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>earthquake</th>\n",
       "      <td>0.970705</td>\n",
       "      <td>0.970705</td>\n",
       "      <td>0.970705</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cold</th>\n",
       "      <td>0.981843</td>\n",
       "      <td>0.981843</td>\n",
       "      <td>0.981843</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other_weather</th>\n",
       "      <td>0.942630</td>\n",
       "      <td>0.942630</td>\n",
       "      <td>0.942630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>direct_report</th>\n",
       "      <td>0.858865</td>\n",
       "      <td>0.858865</td>\n",
       "      <td>0.858865</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        f1 score  precision    recall\n",
       "related                 0.762893   0.762893  0.762893\n",
       "request                 0.892127   0.892127  0.892127\n",
       "offer                   0.994049   0.994049  0.994049\n",
       "aid_related             0.767318   0.767318  0.767318\n",
       "medical_help            0.923711   0.923711  0.923711\n",
       "medical_products        0.961398   0.961398  0.961398\n",
       "search_and_rescue       0.970705   0.970705  0.970705\n",
       "security                0.977876   0.977876  0.977876\n",
       "military                0.971468   0.971468  0.971468\n",
       "child_alone             1.000000   1.000000  1.000000\n",
       "water                   0.963381   0.963381  0.963381\n",
       "food                    0.946903   0.946903  0.946903\n",
       "shelter                 0.941868   0.941868  0.941868\n",
       "clothing                0.987946   0.987946  0.987946\n",
       "money                   0.977418   0.977418  0.977418\n",
       "missing_people          0.989319   0.989319  0.989319\n",
       "refugees                0.969484   0.969484  0.969484\n",
       "death                   0.968721   0.968721  0.968721\n",
       "other_aid               0.868782   0.868782  0.868782\n",
       "infrastructure_related  0.928746   0.928746  0.928746\n",
       "transport               0.955447   0.955447  0.955447\n",
       "buildings               0.954837   0.954837  0.954837\n",
       "electricity             0.980928   0.980928  0.980928\n",
       "tools                   0.993592   0.993592  0.993592\n",
       "hospitals               0.987489   0.987489  0.987489\n",
       "shops                   0.994049   0.994049  0.994049\n",
       "aid_centers             0.986726   0.986726  0.986726\n",
       "other_infrastructure    0.951938   0.951938  0.951938\n",
       "weather_related         0.876259   0.876259  0.876259\n",
       "floods                  0.953616   0.953616  0.953616\n",
       "storm                   0.938969   0.938969  0.938969\n",
       "fire                    0.991150   0.991150  0.991150\n",
       "earthquake              0.970705   0.970705  0.970705\n",
       "cold                    0.981843   0.981843  0.981843\n",
       "other_weather           0.942630   0.942630  0.942630\n",
       "direct_report           0.858865   0.858865  0.858865"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Improved model: same CountVectorizer + TF-IDF features as before,\n",
    "# with a 100-estimator AdaBoostClassifier wrapped in MultiOutputClassifier.\n",
    "pipeline_improved = Pipeline(steps=[\n",
    "    ('vect', CountVectorizer(tokenizer=tokenize)),\n",
    "    ('tfidf', TfidfTransformer()),\n",
    "    ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier(n_estimators=100))),\n",
    "])\n",
    "\n",
    "# Fit on the training data, then predict on the test data.\n",
    "pipeline_improved.fit(X_train, y_train)\n",
    "y_pred_improved = pipeline_improved.predict(X_test)\n",
    "\n",
    "# Last expression of the cell: the score table is rendered as the cell output.\n",
    "build_report(pipeline_improved, X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 9. Export your model as a pickle file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Export the fitted `pipeline` object to 'rf_model.pkl'.\n",
    "# The context manager guarantees the file handle is flushed and closed\n",
    "# as soon as the dump finishes (or fails), instead of leaking it.\n",
    "with open('rf_model.pkl', 'wb') as model_file:\n",
    "    pickle.dump(pipeline, model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Export the fitted AdaBoost pipeline to 'adaboost_model.pkl'.\n",
    "# The context manager guarantees the file handle is flushed and closed\n",
    "# as soon as the dump finishes (or fails), instead of leaking it.\n",
    "# NOTE: the pipeline's vectorizer references the `tokenize` function, which\n",
    "# pickle stores by name - it must be importable wherever the model is loaded.\n",
    "with open('adaboost_model.pkl', 'wb') as model_file:\n",
    "    pickle.dump(pipeline_improved, model_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 10. Use this notebook to complete `train.py`\n",
    "Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}