{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# use natural language toolkit\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem.lancaster import LancasterStemmer\n", "# word stemmer\n", "stemmer = LancasterStemmer()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "12 sentences of training data\n" ] } ], "source": [ "# 3 classes of training data\n", "training_data = []\n", "training_data.append({\"class\":\"greeting\", \"sentence\":\"how are you?\"})\n", "training_data.append({\"class\":\"greeting\", \"sentence\":\"how is your day?\"})\n", "training_data.append({\"class\":\"greeting\", \"sentence\":\"good day\"})\n", "training_data.append({\"class\":\"greeting\", \"sentence\":\"how is it going today?\"})\n", "\n", "training_data.append({\"class\":\"goodbye\", \"sentence\":\"have a nice day\"})\n", "training_data.append({\"class\":\"goodbye\", \"sentence\":\"see you later\"})\n", "training_data.append({\"class\":\"goodbye\", \"sentence\":\"have a nice day\"})\n", "training_data.append({\"class\":\"goodbye\", \"sentence\":\"talk to you soon\"})\n", "\n", "training_data.append({\"class\":\"sandwich\", \"sentence\":\"make me a sandwich\"})\n", "training_data.append({\"class\":\"sandwich\", \"sentence\":\"can you make a sandwich?\"})\n", "training_data.append({\"class\":\"sandwich\", \"sentence\":\"having a sandwich today?\"})\n", "training_data.append({\"class\":\"sandwich\", \"sentence\":\"what's for lunch?\"})\n", "print (\"%s sentences of training data\" % len(training_data))" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Corpus words and counts: {'how': 3, 'ar': 1, 'mak': 2, 'see': 1, 'is': 2, 'can': 1, 'me': 1, 'good': 1, 'hav': 3, 'talk': 1, 'lunch': 
1, 'soon': 1, 'yo': 1, 'you': 4, 'day': 4, 'to': 1, 'nic': 2, 'lat': 1, 'a': 5, 'what': 1, 'for': 1, 'today': 2, 'sandwich': 3, 'it': 1, 'going': 1} \n", "\n", "Class words: {'goodbye': ['hav', 'a', 'nic', 'day', 'see', 'you', 'lat', 'hav', 'a', 'nic', 'day', 'talk', 'to', 'you', 'soon'], 'sandwich': ['mak', 'me', 'a', 'sandwich', 'can', 'you', 'mak', 'a', 'sandwich', 'hav', 'a', 'sandwich', 'today', 'what', 'for', 'lunch'], 'greeting': ['how', 'ar', 'you', 'how', 'is', 'yo', 'day', 'good', 'day', 'how', 'is', 'it', 'going', 'today']}\n" ] } ], "source": [ "# capture unique stemmed words in the training corpus\n", "corpus_words = {}\n", "class_words = {}\n", "# turn a list into a set (of unique items) and then a list again (this removes duplicates)\n", "classes = list(set([a['class'] for a in training_data]))\n", "for c in classes:\n", "    # prepare a list of words within each class\n", "    class_words[c] = []\n", "\n", "# loop through each sentence in our training data\n", "for data in training_data:\n", "    # tokenize each sentence into words\n", "    for word in nltk.word_tokenize(data['sentence']):\n", "        # ignore some things (punctuation and possessive suffix tokens)\n", "        if word not in [\"?\", \"'s\"]:\n", "            # stem and lowercase each word\n", "            stemmed_word = stemmer.stem(word.lower())\n", "            # have we not seen this word already?\n", "            if stemmed_word not in corpus_words:\n", "                corpus_words[stemmed_word] = 1\n", "            else:\n", "                corpus_words[stemmed_word] += 1\n", "\n", "            # add the word to our words in class list\n", "            class_words[data['class']].extend([stemmed_word])\n", "\n", "# we now have each stemmed word and the number of occurrences of the word in our training corpus (the word's commonality)\n", "print (\"Corpus words and counts: %s \\n\" % corpus_words)\n", "# also we have all words in each class\n", "print (\"Class words: %s\" % class_words)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# we can now calculate a score for a new 
sentence\n", "sentence = \"good day for us to have lunch?\"\n", "\n", "# calculate a score for a given class\n", "def calculate_class_score(sentence, class_name, show_details=True):\n", "    score = 0\n", "    # tokenize each word in our new sentence\n", "    for word in nltk.word_tokenize(sentence):\n", "        # stem the word once and reuse it below\n", "        stemmed_word = stemmer.stem(word.lower())\n", "        # check to see if the stem of the word is in any of our classes\n", "        if stemmed_word in class_words[class_name]:\n", "            # treat each word with same weight\n", "            score += 1\n", "\n", "            if show_details:\n", "                print (\" match: %s\" % stemmed_word)\n", "    return score" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " match: day\n", " match: to\n", " match: hav\n", "Class: goodbye Score: 3 \n", "\n", " match: for\n", " match: hav\n", " match: lunch\n", "Class: sandwich Score: 3 \n", "\n", " match: good\n", " match: day\n", "Class: greeting Score: 2 \n", "\n" ] } ], "source": [ "# now we can find the class with the highest score\n", "for c in class_words.keys():\n", "    print (\"Class: %s Score: %s \\n\" % (c, calculate_class_score(sentence, c)))" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# calculate a score for a given class taking into account word commonality\n", "def calculate_class_score_commonality(sentence, class_name, show_details=True):\n", "    score = 0\n", "    # tokenize each word in our new sentence\n", "    for word in nltk.word_tokenize(sentence):\n", "        # stem the word once and reuse it below\n", "        stemmed_word = stemmer.stem(word.lower())\n", "        # check to see if the stem of the word is in any of our classes\n", "        if stemmed_word in class_words[class_name]:\n", "            # treat each word with relative weight; use 1.0 so the\n", "            # division stays a float division on the declared Python 2\n", "            # kernel (plain 1/n truncates every weight to 0 there)\n", "            score += (1.0 / corpus_words[stemmed_word])\n", "\n", "            if show_details:\n", "                print (\" match: %s (%s)\" % (stemmed_word, 1.0 / corpus_words[stemmed_word]))\n", "    return score" ] }, { "cell_type": 
"code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " match: day (0.25)\n", " match: to (1.0)\n", " match: hav (0.3333333333333333)\n", "Class: goodbye Score: 1.5833333333333333 \n", "\n", " match: for (1.0)\n", " match: hav (0.3333333333333333)\n", " match: lunch (1.0)\n", "Class: sandwich Score: 2.333333333333333 \n", "\n", " match: good (1.0)\n", " match: day (0.25)\n", "Class: greeting Score: 1.25 \n", "\n" ] } ], "source": [ "# now we can find the class with the highest score\n", "for c in class_words.keys():\n", "    print (\"Class: %s Score: %s \\n\" % (c, calculate_class_score_commonality(sentence, c)))" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# return the class with highest score for sentence\n", "def classify(sentence):\n", "    # start from the no-match answer and keep the strict maximum,\n", "    # so ties keep the first class seen and no match returns (None, 0)\n", "    best_match = (None, 0)\n", "    # score the sentence against every class\n", "    for class_name in class_words.keys():\n", "        class_score = calculate_class_score_commonality(sentence, class_name, show_details=False)\n", "        # keep track of the highest score seen so far\n", "        if class_score > best_match[1]:\n", "            best_match = (class_name, class_score)\n", "\n", "    return best_match" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('sandwich', 2.5)" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"make me some lunch?\")" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('sandwich', 2.033333333333333)" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"sudo make me a sandwich\")" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ 
"('greeting', 2.083333333333333)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"how are you doing today?\")" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('goodbye', 2.25)" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"talk to you tomorrow\")" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('greeting', 1.25)" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"who are you?\")" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(None, 0)" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"am I crazy?\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }