{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# things we need for NLP\n",
    "import nltk\n",
    "from nltk.stem.lancaster import LancasterStemmer\n",
    "stemmer = LancasterStemmer()\n",
    "\n",
    "# things we need for Tensorflow\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import our chat-bot intents file\n",
    "import json\n",
    "with open('intents.json') as json_data:\n",
    "    intents = json.load(json_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /Users/hgavini/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /Users/hgavini/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import ssl\n",
    "\n",
    "try:\n",
    "    _create_unverified_https_context = ssl._create_unverified_context\n",
    "except AttributeError:\n",
    "    pass\n",
    "else:\n",
    "    ssl._create_default_https_context = _create_unverified_https_context\n",
    "\n",
    "nltk.download('punkt')\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "45 documents\n",
      "9 classes ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']\n",
      "82 unique stemmed words [\"'s\", ',', 'a', 'advers', 'al', 'anyon', 'ar', 'awesom', 'be', 'behavy', 'blood', 'by', 'bye', 'can', 'caus', 'chat', 'check', 'could', 'dat', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'giv', 'good', 'goodby', 'hav', 'hello', 'help', 'hi', 'hist', 'hospit', 'how', 'i', 'id', 'is', 'lat', 'list', 'load', 'loc', 'log', 'look', 'lookup', 'man', 'me', 'mod', 'nearby', 'next', 'nic', 'of', 'off', 'op', 'paty', 'pharm', 'press', 'provid', 'react', 'rel', 'result', 'search', 'see', 'show', 'suit', 'support', 'task', 'thank', 'that', 'ther', 'til', 'tim', 'to', 'transf', 'up', 'want', 'what', 'which', 'with', 'you']\n"
     ]
    }
   ],
   "source": [
    "words = []\n",
    "classes = []\n",
    "documents = []\n",
    "ignore_words = ['?']\n",
    "# loop through each sentence in our intents patterns\n",
    "for intent in intents['intents']:\n",
    "    for pattern in intent['patterns']:\n",
    "        # tokenize each word in the sentence\n",
    "        w = nltk.word_tokenize(pattern)\n",
    "        # add to our words list\n",
    "        words.extend(w)\n",
    "        # add to documents in our corpus\n",
    "        documents.append((w, intent['tag']))\n",
    "        # add to our classes list\n",
    "        if intent['tag'] not in classes:\n",
    "            classes.append(intent['tag'])\n",
    "\n",
    "# stem and lower each word and remove duplicates\n",
    "words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]\n",
    "words = sorted(list(set(words)))\n",
    "\n",
    "# remove duplicates\n",
    "classes = sorted(list(set(classes)))\n",
    "\n",
    "print (len(documents), \"documents\")\n",
    "print (len(classes), \"classes\", classes)\n",
    "print (len(words), \"unique stemmed words\", words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create our training data\n",
    "training = []\n",
    "output = []\n",
    "# create an empty array for our output\n",
    "output_empty = [0] * len(classes)\n",
    "\n",
    "# training set, bag of words for each sentence\n",
    "for doc in documents:\n",
    "    # initialize our bag of words\n",
    "    bag = []\n",
    "    # list of tokenized words for the pattern\n",
    "    pattern_words = doc[0]\n",
    "    # stem each word\n",
    "    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]\n",
    "    # create our bag of words array\n",
    "    for w in words:\n",
    "        bag.append(1) if w in pattern_words else bag.append(0)\n",
    "\n",
    "    # output is a '0' for each tag and '1' for current tag\n",
    "    output_row = list(output_empty)\n",
    "    output_row[classes.index(doc[1])] = 1\n",
    "\n",
    "    training.append([bag, output_row])\n",
    "\n",
    "# shuffle our features and turn into np.array\n",
    "random.shuffle(training)\n",
    "training = np.array(training, dtype=object)\n",
    "\n",
    "# create train and test lists\n",
    "train_x = list(training[:,0])\n",
    "train_y = list(training[:,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save all of our data structures\n",
    "import pickle\n",
    "pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( \"training_data.parquet\", \"wb\" ) )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}