{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# use natural language toolkit\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem.lancaster import LancasterStemmer\n", "# word stemmer\n", "stemmer = LancasterStemmer()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "12 sentences of training data\n" ] } ], "source": [ "# 3 classes of training data\n", "training_data = []\n", "training_data.append({\"class\":\"greeting\", \"sentence\":\"how are you?\"})\n", "training_data.append({\"class\":\"greeting\", \"sentence\":\"how is your day?\"})\n", "training_data.append({\"class\":\"greeting\", \"sentence\":\"good day\"})\n", "training_data.append({\"class\":\"greeting\", \"sentence\":\"how is it going today?\"})\n", "\n", "training_data.append({\"class\":\"goodbye\", \"sentence\":\"have a nice day\"})\n", "training_data.append({\"class\":\"goodbye\", \"sentence\":\"see you later\"})\n", "training_data.append({\"class\":\"goodbye\", \"sentence\":\"have a nice day\"})\n", "training_data.append({\"class\":\"goodbye\", \"sentence\":\"talk to you soon\"})\n", "\n", "training_data.append({\"class\":\"sandwich\", \"sentence\":\"make me a sandwich\"})\n", "training_data.append({\"class\":\"sandwich\", \"sentence\":\"can you make a sandwich?\"})\n", "training_data.append({\"class\":\"sandwich\", \"sentence\":\"having a sandwich today?\"})\n", "training_data.append({\"class\":\"sandwich\", \"sentence\":\"what's for lunch?\"})\n", "print (\"%s sentences of training data\" % len(training_data))" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Corpus words and counts: {'how': 3, 'ar': 1, 'mak': 2, 'see': 1, 'is': 2, 'can': 1, 'me': 1, 'good': 1, 'hav': 3, 'talk': 1, 'lunch': 
1, 'soon': 1, 'yo': 1, 'you': 4, 'day': 4, 'to': 1, 'nic': 2, 'lat': 1, 'a': 5, 'what': 1, 'for': 1, 'today': 2, 'sandwich': 3, 'it': 1, 'going': 1} \n", "\n", "Class words: {'goodbye': ['hav', 'a', 'nic', 'day', 'see', 'you', 'lat', 'hav', 'a', 'nic', 'day', 'talk', 'to', 'you', 'soon'], 'sandwich': ['mak', 'me', 'a', 'sandwich', 'can', 'you', 'mak', 'a', 'sandwich', 'hav', 'a', 'sandwich', 'today', 'what', 'for', 'lunch'], 'greeting': ['how', 'ar', 'you', 'how', 'is', 'yo', 'day', 'good', 'day', 'how', 'is', 'it', 'going', 'today']}\n" ] } ], "source": [ "# capture unique stemmed words in the training corpus\n", "corpus_words = {}\n", "class_words = {}\n", "# turn a list into a set (of unique items) and then a list again (this removes duplicates)\n", "classes = list(set([a['class'] for a in training_data]))\n", "for c in classes:\n", "    # prepare a list of words within each class\n", "    class_words[c] = []\n", "\n", "# loop through each sentence in our training data\n", "for data in training_data:\n", "    # tokenize each sentence into words\n", "    for word in nltk.word_tokenize(data['sentence']):\n", "        # ignore some things (punctuation and possessive suffix tokens)\n", "        if word not in [\"?\", \"'s\"]:\n", "            # stem and lowercase each word\n", "            stemmed_word = stemmer.stem(word.lower())\n", "            # have we not seen this word already?\n", "            if stemmed_word not in corpus_words:\n", "                corpus_words[stemmed_word] = 1\n", "            else:\n", "                corpus_words[stemmed_word] += 1\n", "\n", "            # add the word to our words in class list\n", "            class_words[data['class']].extend([stemmed_word])\n", "\n", "# we now have each stemmed word and the number of occurrences of the word in our training corpus (the word's commonality)\n", "print (\"Corpus words and counts: %s \\n\" % corpus_words)\n", "# also we have all words in each class\n", "print (\"Class words: %s\" % class_words)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# we can now calculate a score for a new 
sentence\n", "sentence = \"good day for us to have lunch?\"\n", "\n", "# calculate a score for a given class\n", "def calculate_class_score(sentence, class_name, show_details=True):\n", "    score = 0\n", "    # tokenize each word in our new sentence\n", "    for word in nltk.word_tokenize(sentence):\n", "        # stem the word once and reuse it below\n", "        stemmed_word = stemmer.stem(word.lower())\n", "        # check to see if the stem of the word is in any of our classes\n", "        if stemmed_word in class_words[class_name]:\n", "            # treat each word with same weight\n", "            score += 1\n", "\n", "            if show_details:\n", "                print (\" match: %s\" % stemmed_word)\n", "    return score" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " match: day\n", " match: to\n", " match: hav\n", "Class: goodbye Score: 3 \n", "\n", " match: for\n", " match: hav\n", " match: lunch\n", "Class: sandwich Score: 3 \n", "\n", " match: good\n", " match: day\n", "Class: greeting Score: 2 \n", "\n" ] } ], "source": [ "# now we can find the class with the highest score\n", "for c in class_words.keys():\n", "    print (\"Class: %s Score: %s \\n\" % (c, calculate_class_score(sentence, c)))" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# calculate a score for a given class taking into account word commonality\n", "def calculate_class_score_commonality(sentence, class_name, show_details=True):\n", "    score = 0\n", "    # tokenize each word in our new sentence\n", "    for word in nltk.word_tokenize(sentence):\n", "        # stem the word once and reuse it below\n", "        stemmed_word = stemmer.stem(word.lower())\n", "        # check to see if the stem of the word is in any of our classes\n", "        if stemmed_word in class_words[class_name]:\n", "            # treat each word with relative weight; use 1.0 so the\n", "            # division stays a float division on the declared Python 2\n", "            # kernel (plain 1/n truncates every weight to 0 there)\n", "            score += (1.0 / corpus_words[stemmed_word])\n", "\n", "            if show_details:\n", "                print (\" match: %s (%s)\" % (stemmed_word, 1.0 / corpus_words[stemmed_word]))\n", "    return score" ] }, { "cell_type": 
"code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " match: day (0.25)\n", " match: to (1.0)\n", " match: hav (0.3333333333333333)\n", "Class: goodbye Score: 1.5833333333333333 \n", "\n", " match: for (1.0)\n", " match: hav (0.3333333333333333)\n", " match: lunch (1.0)\n", "Class: sandwich Score: 2.333333333333333 \n", "\n", " match: good (1.0)\n", " match: day (0.25)\n", "Class: greeting Score: 1.25 \n", "\n" ] } ], "source": [ "# now we can find the class with the highest score\n", "for c in class_words.keys():\n", "    print (\"Class: %s Score: %s \\n\" % (c, calculate_class_score_commonality(sentence, c)))" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# return the class with highest score for sentence\n", "def classify(sentence):\n", "    # start from the no-match answer and keep the strict maximum,\n", "    # so ties keep the first class seen and no match returns (None, 0)\n", "    best_match = (None, 0)\n", "    # score the sentence against every class\n", "    for class_name in class_words.keys():\n", "        class_score = calculate_class_score_commonality(sentence, class_name, show_details=False)\n", "        # keep track of the highest score seen so far\n", "        if class_score > best_match[1]:\n", "            best_match = (class_name, class_score)\n", "\n", "    return best_match" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('sandwich', 2.5)" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"make me some lunch?\")" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('sandwich', 2.033333333333333)" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"sudo make me a sandwich\")" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ 
"('greeting', 2.083333333333333)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"how are you doing today?\")" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('goodbye', 2.25)" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"talk to you tomorrow\")" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('greeting', 1.25)" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"who are you?\")" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(None, 0)" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify(\"am I crazy?\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }