{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "03d3a48a",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.corpus import names\n",
    "from pylab import *\n",
    "import random as pyrandom"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38897036",
   "metadata": {},
   "source": [
    "# Part-of-Speech Tagging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "7101d69a",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from nltk.corpus import brown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "893549d7",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load the tagged words of the Brown corpus 'news' category.\n",
    "# NOTE(review): tagged_words is not referenced by the other cells visible in this notebook.\n",
    "tagged_words = brown.tagged_words(categories='news')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "2d8fdc90",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def features(s,i,y):\n",
    "    f = dict(ltag=y[i-1] if i>0 else \"^\", # previous tag\n",
    "             lword=s[i-1] if i>0 else \"^\", # previous word\n",
    "             s1 = s[i][-1:], # current word features\n",
    "             s2 = s[i][-2:],\n",
    "             s3 = s[i][-3:])\n",
    "    return f"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fbe6ff18",
   "metadata": {},
   "source": [
    "# Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "7ccac27e",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Build one (featureset, tag) example per token of the Brown 'news' sentences.\n",
    "news_sents = brown.tagged_sents(categories='news')\n",
    "data = []\n",
    "for sy in news_sents:\n",
    "    s, y = zip(*sy)  # split the sentence into parallel word and tag tuples\n",
    "    for i, tag in enumerate(y):\n",
    "        # The corpus tags y are passed in, so ltag comes from the gold previous tag.\n",
    "        data.append((features(s, i, y), tag))\n",
    "\n",
    "# Hold out the first tenth of the examples for testing; train on the rest.\n",
    "n = len(data)\n",
    "cut = n // 10\n",
    "test_set, training_set = data[:cut], data[cut:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "66ba816e",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Fit NLTK's Naive Bayes classifier on the (featureset, tag) training examples.\n",
    "classifier = nltk.NaiveBayesClassifier.train(training_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "fd3afe50",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8176031824962705"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Score the classifier on the held-out test_set examples.\n",
    "nltk.classify.accuracy(classifier,test_set)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24401659",
   "metadata": {},
   "source": [
    "# Greedy Decoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "596ce943",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class MyTagger:\n",
    "    def __init__(self,classifier):\n",
    "        self.classifier = classifier\n",
    "    def tag(self,s):\n",
    "        y = []\n",
    "        for i in range(len(s)):\n",
    "            f = features(s,i,y)\n",
    "            y.append(classifier.classify(features(s,i,y)))\n",
    "        return zip(s,y)            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "bb6adcd5",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Wrap the trained classifier in the MyTagger class defined above.\n",
    "tagger = MyTagger(classifier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "cab4c5cb",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('The', 'AT'),\n",
       " ('quick', 'NN'),\n",
       " ('brown', 'NN'),\n",
       " ('fox', 'NPS-TL'),\n",
       " ('jumped', 'VBD'),\n",
       " ('over', 'RP'),\n",
       " ('the', 'AT'),\n",
       " ('lazy', 'JJ'),\n",
       " ('dogs.', 'NP')]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# str.split() tokenises on whitespace only, so the final period stays attached to 'dogs.'.\n",
    "tagger.tag(\"The quick brown fox jumped over the lazy dogs.\".split())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd64592b",
   "metadata": {},
   "source": [
    "# More Advanced Models"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "662450d0",
   "metadata": {},
   "source": [
    "- Viterbi Decoding\n",
    "- MEMM\n",
    "- Conditional Random Fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05f70ce3",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}