{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "38c97229",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.corpus import names\n",
    "from pylab import *\n",
    "import random as pyrandom"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "01d28868",
   "metadata": {},
   "source": [
    "# Sentence Segmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "4b05b3e9",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load the raw Treebank sentences, keeping only those with more than\n",
    "# three tokens and without a \"START\" entry.\n",
    "sents = [sentence\n",
    "         for sentence in nltk.corpus.treebank_raw.sents()\n",
    "         if len(sentence)>3 and \"START\" not in sentence]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "98e6db7e",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Flatten the sentences into one token stream and record, for each\n",
    "# sentence, the index of its last token within that stream.\n",
    "tokens = [tok for sentence in sents for tok in sentence]\n",
    "boundaries = []\n",
    "end = 0\n",
    "for sentence in sents:\n",
    "    end += len(sentence)\n",
    "    boundaries.append(end-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "6990a8a0",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', 'A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.', 'The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even', 'brief', 'exposures', 'to', 'it', 'causing', 'symptoms', 'that', 'show', 'up', 'decades', 'later', ',', 'researchers', 'said', '.', 'Lorillard', 'Inc', '.,', 'the', 'unit', 'of', 'New', 'York', '-', 'based', 'Loews', 'Corp', '.', 'that', 'makes', 'Kent', 'cigarettes', ',', 'stopped', 'using', 'crocidolite', 'in', 'its', 'Micronite', 'cigarette', 'filters', 'in', '1956', '.', 'Although', 'preliminary', 'findings', 'were', 'reported', 'more', 'than', 'a', 'year', 'ago', ',', 'the', 'latest', 'results', 'appear', 'in', 'today', \"'\", 's', 'New', 'England', 'Journal', 'of', 'Medicine', ',', 'a', 'forum', 'likely', 'to', 'bring', 'new', 'attention', 'to', 'the', 'problem', '.', 'A', 'Lorillard', 'spokewoman', 'said', ',', '\"']\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first 200 entries of the flattened token stream.\n",
    "# print(...) with one argument behaves the same on Python 2 and 3.\n",
    "print(tokens[:200])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "9c84af47",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def features(s,i):\n",
    "    return dict(current=tokens[i],\n",
    "                prev=tokens[i-1],\n",
    "                next=tokens[i+1],\n",
    "                upper=tokens[i+1][0].isupper(),\n",
    "                plen=len(tokens[i-1]),\n",
    "                nlen=len(tokens[i+1]))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "fa07a737",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5951\n"
     ]
    }
   ],
   "source": [
    "# Build one labelled example per candidate punctuation token:\n",
    "# (feature dict, True if the token ends a sentence).\n",
    "boundary_set = set(boundaries)  # constant-time lookup instead of a list scan\n",
    "data = []\n",
    "# The first and last token are skipped so each candidate has both neighbours.\n",
    "for i in range(1,len(tokens)-1):\n",
    "    if tokens[i] not in [\".\",\"?\",\"!\"]: continue\n",
    "    c = (i in boundary_set)\n",
    "    f = features(tokens,i)\n",
    "    data.append((f,c))\n",
    "# Shuffle with a dedicated, fixed-seed generator so the train/test split\n",
    "# (and therefore the reported accuracy) is the same on every run.\n",
    "pyrandom.Random(0).shuffle(data)\n",
    "n = len(data)\n",
    "print(n)\n",
    "# Hold out the first tenth for testing; train on the remainder.\n",
    "training_set = data[n//10:]\n",
    "test_set = data[:n//10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "7262c00e",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9798319327731092"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit a Naive Bayes model on the training split, then show its accuracy\n",
    "# on the held-out split as the cell's output.\n",
    "classifier = nltk.NaiveBayesClassifier.train(training_set)\n",
    "test_accuracy = nltk.classify.accuracy(classifier,test_set)\n",
    "test_accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "06641c58",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Classify the \".\" at index 2 of a small hand-written token list.\n",
    "sample_words = \"The quick . brown\".split()\n",
    "classifier.classify(features(sample_words,2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "bf296af7",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def segment_sentences(words):\n",
    "    sentences = [[words[0]]]\n",
    "    for i in range(1,len(words)):\n",
    "        sentences[-1].append(words[i])\n",
    "        c = words[i] in [\".\",\"?\",\"!\"] and classifier.classify(features(words,i))\n",
    "        if c: sentences.append([])\n",
    "    if sentences[-1]==[]: sentences = sentences[:-1]\n",
    "    return sentences\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "40939b59",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Smith', 'ran', '.'], ['J', '.', 'Smith', 'really', 'ran', '.']]"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Try the segmenter on a short example containing an initial (\"J .\").\n",
    "sample_text = \"\"\"Smith ran . J . Smith really ran . \"\"\"\n",
    "segment_sentences(sample_text.split())"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}