In [1]:
# things we need for NLP
import nltk
from nltk.stem.lancaster import LancasterStemmer
# One shared Lancaster stemmer instance; the vocabulary-building and
# bag-of-words cells below call stemmer.stem() on lower-cased tokens.
stemmer = LancasterStemmer()

# things we need for Tensorflow
import numpy as np
import tensorflow as tf
import random

In [2]:
# import our chat-bot intents file
import json

# Decode the file as UTF-8 explicitly: without an encoding argument open()
# falls back to the locale's preferred encoding, so any non-ASCII text in the
# intents could be read differently from one platform to another.
with open('intents.json', encoding='utf-8') as json_data:
    # `intents` is the parsed JSON document; later cells iterate over
    # intents['intents'] and read each entry's 'patterns' and 'tag'.
    intents = json.load(json_data)

In [3]:
import ssl

# Swap the ssl module's default HTTPS context factory for the unverified one
# before fetching the NLTK data packages. The swap is skipped on Python builds
# whose ssl module has no `_create_unverified_context`.
# NOTE(review): the replacement is never undone, so certificate checks stay
# disabled for anything in this kernel that relies on the default HTTPS
# context, not only for the two downloads below. Confirm this is intended.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Tokenizer models and the WordNet corpus; the final call's return value is
# what the cell displays.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/hgavini/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hgavini/nltk_data...
[nltk_data] Package wordnet is already up-to-date!


True

In [4]:
words = []
classes = []
documents = []
ignore_words = ['?']
# Walk every example pattern of every intent.
for intent in intents['intents']:
    for pattern in intent['patterns']:
        tag = intent['tag']
        # Split the pattern into tokens.
        tokens = nltk.word_tokenize(pattern)
        # Raw tokens go into the running word list ...
        words += tokens
        # ... and the (tokens, tag) pair into the corpus.
        documents.append((tokens, tag))
        # Record each tag the first time it is seen.
        if tag not in classes:
            classes.append(tag)

# Lower-case and stem every token that is not ignored; collecting the stems in
# a set drops the duplicates, and sorting gives a stable vocabulary order.
stems = set()
for token in words:
    if token in ignore_words:
        continue
    stems.add(stemmer.stem(token.lower()))
words = sorted(stems)

# De-duplicate and sort the tags as well.
classes = sorted(set(classes))

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique stemmed words", words)

45 documents
9 classes ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']
82 unique stemmed words ["'s", ',', 'a', 'advers', 'al', 'anyon', 'ar', 'awesom', 'be', 'behavy', 'blood', 'by', 'bye', 'can', 'caus', 'chat', 'check', 'could', 'dat', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'giv', 'good', 'goodby', 'hav', 'hello', 'help', 'hi', 'hist', 'hospit', 'how', 'i', 'id', 'is', 'lat', 'list', 'load', 'loc', 'log', 'look', 'lookup', 'man', 'me', 'mod', 'nearby', 'next', 'nic', 'of', 'off', 'op', 'paty', 'pharm', 'press', 'provid', 'react', 'rel', 'result', 'search', 'see', 'show', 'suit', 'support', 'task', 'thank', 'that', 'ther', 'til', 'tim', 'to', 'transf', 'up', 'want', 'what', 'which', 'with', 'you']


In [7]:
# create our training data
training = []
output = []
# Template label row: one 0 per class, copied and marked for each document.
output_empty = [0] * len(classes)

# Training set: one [bag_of_words, one_hot_label] pair per document.
for pattern_tokens, tag in documents:
    # Stem the pattern's tokens the same way the vocabulary was stemmed
    # (lower-case, then stemmer.stem). A set is enough because only membership
    # matters, and it keeps the lookup below cheap.
    pattern_stems = {stemmer.stem(token.lower()) for token in pattern_tokens}
    # Bag of words: 1 where the vocabulary word occurs in this pattern, else 0,
    # in the same order as `words`.
    bag = [1 if vocab_word in pattern_stems else 0 for vocab_word in words]

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# Shuffle whole rows so each bag stays paired with its label.
# NOTE(review): no seed is set in this cell, so the row order differs between
# runs unless `random` is seeded elsewhere; seed it if reproducibility matters.
random.shuffle(training)

# Split features and labels straight from the Python list, before the numpy
# conversion. Slicing columns out of the object array only returns plain lists
# while bag and label lengths differ; if len(words) == len(classes) numpy
# builds a 3-D array and the slices would be ndarrays instead.
train_x = [features for features, _ in training]
train_y = [label for _, label in training]

# Kept as an object array, as before, for any later cell that reads `training`.
training = np.array(training, dtype=object)

In [8]:
# save all of our data structures
import pickle

# Write through a context manager so the file is flushed and closed even if
# pickling fails; the bare open(...) used before never closed the handle.
# NOTE(review): the payload is a pickle, not Parquet, despite the extension.
# The path is left unchanged because other code may open this exact name.
with open("training_data.parquet", "wb") as training_file:
    pickle.dump(
        {'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},
        training_file,
    )
