{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "38c97229", "metadata": { "collapsed": false }, "outputs": [], "source": [ "import nltk\n", "from nltk.corpus import names\n", "from pylab import *\n", "import random as pyrandom" ] }, { "cell_type": "markdown", "id": "01d28868", "metadata": {}, "source": [ "# Sentence Segmentation" ] }, { "cell_type": "code", "execution_count": 34, "id": "4b05b3e9", "metadata": { "collapsed": false }, "outputs": [], "source": [ "sents = nltk.corpus.treebank_raw.sents()\n", "sents = [s for s in sents if len(s)>3]\n", "sents = [s for s in sents if \"START\" not in s]" ] }, { "cell_type": "code", "execution_count": 35, "id": "98e6db7e", "metadata": { "collapsed": false }, "outputs": [], "source": [ "tokens = []\n", "boundaries = []\n", "for s in sents:\n", " tokens += s\n", " boundaries.append(len(tokens)-1)" ] }, { "cell_type": "code", "execution_count": 36, "id": "6990a8a0", "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', 'A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.', 'The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even', 'brief', 'exposures', 'to', 'it', 'causing', 'symptoms', 'that', 'show', 'up', 'decades', 'later', ',', 'researchers', 'said', '.', 'Lorillard', 'Inc', '.,', 'the', 'unit', 'of', 'New', 'York', '-', 'based', 'Loews', 'Corp', '.', 'that', 'makes', 'Kent', 'cigarettes', ',', 'stopped', 'using', 'crocidolite', 'in', 'its', 'Micronite', 'cigarette', 'filters', 'in', '1956', '.', 'Although', 'preliminary', 'findings', 'were', 'reported', 'more', 'than', 'a', 'year', 'ago', ',', 'the', 'latest', 'results', 'appear', 'in', 'today', \"'\", 's', 'New', 'England', 'Journal', 'of', 'Medicine', ',', 'a', 'forum', 'likely', 'to', 'bring', 'new', 'attention', 'to', 'the', 'problem', '.', 'A', 'Lorillard', 'spokewoman', 'said', ',', '\"']\n" ] } ], "source": [ "print tokens[:200]" ] }, { "cell_type": "code", "execution_count": 42, "id": "9c84af47", "metadata": { "collapsed": false }, "outputs": [], "source": [ "def features(s,i):\n", " return dict(current=tokens[i],\n", " prev=tokens[i-1],\n", " next=tokens[i+1],\n", " upper=tokens[i+1][0].isupper(),\n", " plen=len(tokens[i-1]),\n", " nlen=len(tokens[i+1]))\n", " " ] }, { "cell_type": "code", "execution_count": 46, "id": "fa07a737", "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5951\n" ] } ], "source": [ "data = []\n", "for i in range(1,len(tokens)-1):\n", " if tokens[i] not in [\".\",\"?\",\"!\"]: continue\n", " c = (i in boundaries)\n", " f = features(tokens,i)\n", " data.append((f,c))\n", "pyrandom.shuffle(data)\n", "n = len(data)\n", "print n\n", "training_set = data[n//10:]\n", "test_set = data[:n//10]" ] }, { "cell_type": "code", "execution_count": 47, "id": "7262c00e", "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.9798319327731092" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier = nltk.NaiveBayesClassifier.train(training_set)\n", "nltk.classify.accuracy(classifier,test_set)" ] }, { "cell_type": "code", "execution_count": 48, "id": "06641c58", "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier.classify(features(\"The quick . brown\".split(),2))" ] }, { "cell_type": "code", "execution_count": 59, "id": "bf296af7", "metadata": { "collapsed": false }, "outputs": [], "source": [ "def segment_sentences(words):\n", " sentences = [[words[0]]]\n", " for i in range(1,len(words)):\n", " sentences[-1].append(words[i])\n", " c = words[i] in [\".\",\"?\",\"!\"] and classifier.classify(features(words,i))\n", " if c: sentences.append([])\n", " if sentences[-1]==[]: sentences = sentences[:-1]\n", " return sentences\n", " " ] }, { "cell_type": "code", "execution_count": 61, "id": "40939b59", "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['Smith', 'ran', '.'], ['J', '.', 'Smith', 'really', 'ran', '.']]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "segment_sentences(\"\"\"Smith ran . J . Smith really ran . \"\"\".split())" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 5 }