{ "cells": [ { "cell_type": "code", "execution_count": 23, "id": "03d3a48a", "metadata": { "collapsed": false }, "outputs": [], "source": [ "import nltk\n", "from nltk.corpus import names\n", "from pylab import *\n", "import random as pyrandom" ] }, { "cell_type": "markdown", "id": "38897036", "metadata": {}, "source": [ "# Parts of Speech Tagging" ] }, { "cell_type": "code", "execution_count": 24, "id": "7101d69a", "metadata": { "collapsed": false }, "outputs": [], "source": [ "from nltk.corpus import brown" ] }, { "cell_type": "code", "execution_count": 25, "id": "893549d7", "metadata": { "collapsed": false }, "outputs": [], "source": [ "tagged_words = brown.tagged_words(categories='news')" ] }, { "cell_type": "code", "execution_count": 44, "id": "2d8fdc90", "metadata": { "collapsed": false }, "outputs": [], "source": [ "def features(s,i,y):\n", " f = dict(ltag=y[i-1] if i>0 else \"^\", # previous tag\n", " lword=s[i-1] if i>0 else \"^\", # previous word\n", " s1 = s[i][-1:], # current word features\n", " s2 = s[i][-2:],\n", " s3 = s[i][-3:])\n", " return f" ] }, { "cell_type": "markdown", "id": "fbe6ff18", "metadata": {}, "source": [ "# Training" ] }, { "cell_type": "code", "execution_count": 35, "id": "7ccac27e", "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = []\n", "for sy in brown.tagged_sents(categories='news'):\n", " s,y = zip(*sy)\n", " for i in range(len(s)):\n", " data.append((features(s,i,y),y[i]))\n", "n = len(data)\n", "training_set = data[n//10:]\n", "test_set = data[:n//10]" ] }, { "cell_type": "code", "execution_count": 36, "id": "66ba816e", "metadata": { "collapsed": false }, "outputs": [], "source": [ "classifier = nltk.NaiveBayesClassifier.train(training_set)" ] }, { "cell_type": "code", "execution_count": 37, "id": "fd3afe50", "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.8176031824962705" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.classify.accuracy(classifier,test_set)" ] }, { "cell_type": "markdown", "id": "24401659", "metadata": {}, "source": [ "# Greedy Decoding" ] }, { "cell_type": "code", "execution_count": 46, "id": "596ce943", "metadata": { "collapsed": false }, "outputs": [], "source": [ "class MyTagger:\n", " def __init__(self,classifier):\n", " self.classifier = classifier\n", " def tag(self,s):\n", " y = []\n", " for i in range(len(s)):\n", " f = features(s,i,y)\n", " y.append(classifier.classify(features(s,i,y)))\n", " return zip(s,y) " ] }, { "cell_type": "code", "execution_count": 40, "id": "bb6adcd5", "metadata": { "collapsed": false }, "outputs": [], "source": [ "tagger = MyTagger(classifier)" ] }, { "cell_type": "code", "execution_count": 45, "id": "cab4c5cb", "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('The', 'AT'),\n", " ('quick', 'NN'),\n", " ('brown', 'NN'),\n", " ('fox', 'NPS-TL'),\n", " ('jumped', 'VBD'),\n", " ('over', 'RP'),\n", " ('the', 'AT'),\n", " ('lazy', 'JJ'),\n", " ('dogs.', 'NP')]" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tagger.tag(\"The quick brown fox jumped over the lazy dogs.\".split())" ] }, { "cell_type": "markdown", "id": "dd64592b", "metadata": {}, "source": [ "# More Advanced Models" ] }, { "cell_type": "markdown", "id": "662450d0", "metadata": {}, "source": [ "- Viterbi Decoding\n", "- MEMM\n", "- Conditional Random Fields" ] }, { "cell_type": "code", "execution_count": null, "id": "05f70ce3", "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 5 }