{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# things we need for NLP\n", "import nltk\n", "from nltk.stem.lancaster import LancasterStemmer\n", "stemmer = LancasterStemmer()\n", "\n", "# things we need for Tensorflow\n", "import numpy as np\n", "import tensorflow as tf\n", "import random" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# import our chat-bot intents file\n", "import json\n", "with open('intents.json') as json_data:\n", " intents = json.load(json_data)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /Users/hgavini/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /Users/hgavini/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import ssl\n", "\n", "try:\n", " _create_unverified_https_context = ssl._create_unverified_context\n", "except AttributeError:\n", " pass\n", "else:\n", " ssl._create_default_https_context = _create_unverified_https_context\n", "\n", "nltk.download('punkt')\n", "nltk.download('wordnet')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "45 documents\n", "9 classes ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']\n", "82 unique stemmed words [\"'s\", ',', 'a', 'advers', 'al', 'anyon', 'ar', 'awesom', 'be', 'behavy', 'blood', 'by', 'bye', 'can', 'caus', 'chat', 'check', 'could', 'dat', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'giv', 'good', 'goodby', 'hav', 'hello', 'help', 'hi', 'hist', 'hospit', 'how', 'i', 'id', 'is', 'lat', 'list', 'load', 'loc', 'log', 'look', 'lookup', 'man', 'me', 'mod', 'nearby', 'next', 'nic', 'of', 'off', 'op', 'paty', 'pharm', 'press', 'provid', 'react', 'rel', 'result', 'search', 'see', 'show', 'suit', 'support', 'task', 'thank', 'that', 'ther', 'til', 'tim', 'to', 'transf', 'up', 'want', 'what', 'which', 'with', 'you']\n" ] } ], "source": [ "words = []\n", "classes = []\n", "documents = []\n", "ignore_words = ['?']\n", "# loop through each sentence in our intents patterns\n", "for intent in intents['intents']:\n", " for pattern in intent['patterns']:\n", " # tokenize each word in the sentence\n", " w = nltk.word_tokenize(pattern)\n", " # add to our words list\n", " words.extend(w)\n", " # add to documents in our corpus\n", " documents.append((w, intent['tag']))\n", " # add to our classes list\n", " if intent['tag'] not in classes:\n", " classes.append(intent['tag'])\n", "\n", "# stem and lower each word and remove duplicates\n", "words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]\n", "words = sorted(list(set(words)))\n", "\n", "# remove duplicates\n", "classes = sorted(list(set(classes)))\n", "\n", "print (len(documents), \"documents\")\n", "print (len(classes), \"classes\", classes)\n", "print (len(words), \"unique stemmed words\", words)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# create our training data\n", "training = []\n", "output = []\n", "# create an empty array for our output\n", "output_empty = [0] * len(classes)\n", "\n", "# training set, bag of words for each sentence\n", "for doc in documents:\n", " # initialize our bag of words\n", " bag = []\n", " # list of tokenized words for the pattern\n", " pattern_words = doc[0]\n", " # stem each word\n", " pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]\n", " # create our bag of words array\n", " for w in words:\n", " bag.append(1) if w in pattern_words else bag.append(0)\n", "\n", " # output is a '0' for each tag and '1' for current tag\n", " output_row = list(output_empty)\n", " output_row[classes.index(doc[1])] = 1\n", "\n", " training.append([bag, output_row])\n", "\n", "# shuffle our features and turn into np.array\n", "random.shuffle(training)\n", "training = np.array(training, dtype=object)\n", "\n", "# create train and test lists\n", "train_x = list(training[:,0])\n", "train_y = list(training[:,1])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# save all of our data structures\n", "import pickle\n", "pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( \"training_data.parquet\", \"wb\" ) )\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 4 }