# Natural Language Toolkit: TnT Tagger
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Sam Huston <sjh900@gmail.com>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

'''
Implementation of 'TnT - A Statistical Part of Speech Tagger'
by Thorsten Brants

http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf
'''

from __future__ import print_function

from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tag.api import TaggerI

class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words; see the __init__ function.

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence-delimited input.
      - However, it still produces good results if the training
        data and testing data are separated on all punctuation, e.g. [,.?!]
      - Input for training is expected to be a list of sentences,
        where each sentence is a list of (word, tag) tuples.
      - Input for the tag function is a single sentence;
        input for the tagdata function is a list of sentences.
        Output is of a similar form.

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()


    TnT uses a second-order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Prod(P(t_i|t_i-1,t_i-2) * P(w_i|t_i))] P(t_T+1|t_T)

    i.e. the tag sequence maximising the product, over the sentence,
    of the tag-transition and word-emission probabilities, times the
    probability of the end-of-sentence marker given the final tag.

    The set of possible tags for a given word is derived
    from the training data: it is the set of all tags
    with which that exact word has been seen.

    The probability of a tag for a given word is the linear
    interpolation of three Markov models: a zero-order, a
    first-order, and a second-order model:

      P(t_i|t_i-1,t_i-2) = l1*P(t_i) + l2*P(t_i|t_i-1) +
                           l3*P(t_i|t_i-1,t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However, this does not result in a significant
    gain in the accuracy of the results.

    ''' 
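
    # A minimal usage sketch with hypothetical toy data (any list of
    # tagged sentences in the same format will do; two sentences are
    # used so that every tag occurs more than once during training):
    #
    #   train = [[('the', 'DT'), ('dog', 'NN'), ('barked', 'VBD'), ('.', '.')],
    #            [('the', 'DT'), ('cat', 'NN'), ('meowed', 'VBD'), ('.', '.')]]
    #   tagger = TnT()
    #   tagger.train(train)
    #   tagger.tag(['the', 'dog'])   # -> [('the', 'DT'), ('dog', 'NN')]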

 

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. The tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk: TaggerI
        :param Trained: indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: beam search degree (see above)
        :type  N: int
        :param C: capitalization flag
        :type  C: boolean

        The initializer creates the frequency distributions to be used
        for tagging.

        The _lx values represent the weights with which the tri/bi/uni
        taggers are combined to calculate the probability.

        The N value is the number of possible solutions to maintain
        while tagging; a good value for this is 1000.

        C is a boolean value which specifies whether or not to use the
        capitalization of the word as additional information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger.
        '''

        self._uni  = FreqDist()
        self._bi   = ConditionalFreqDist()
        self._tri  = ConditionalFreqDist()
        self._wd   = ConditionalFreqDist()
        self._eos  = ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained

        self._unk = unk

        # statistics: counts of known and unknown words seen while tagging
        self.unknown = 0
        self.known = 0

 

    def train(self, data):
        '''
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        :param data: list of lists of (word, tag) tuples
        :type data: list(list(tuple(str, str)))

        ''' 
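
        # For example, a sketch of attaching an unknown-word backoff,
        # assuming nltk's DefaultTagger (hypothetical variable names):
        #
        #   from nltk.tag import DefaultTagger
        #   unk = DefaultTagger('NN')                # needs no training,
        #   tagger = TnT(unk=unk, Trained=True)      # so pass Trained=True
        #   tagger.train(tagged_sents)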

 

        # Ensure that the local C flag is initialized before use
        C = False

        if self._unk is not None and not self._T:
            self._unk.train(data)

        for sent in data:
            history = ['BOS', 'BOS']
            for w, t in sent:

                # if capitalization is requested,
                # and the word begins with a capital,
                # set the local flag C to True
                if self._C and w[0].isupper():
                    C = True

                self._wd[w].inc(t)
                self._uni.inc((t, C))
                self._bi[history[1]].inc((t, C))
                self._tri[tuple(history)].inc((t, C))

                history.append((t, C))
                history.pop(0)

                # reset the local flag C for the next word
                C = False

            # record the tag that ended the sentence
            # (guard against empty sentences, where t would be unbound)
            if sent:
                self._eos[t].inc('EOS')

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

 

    def _compute_lambda(self):
        '''
        Creates lambda values based upon the training data.

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        For each tag trigram (t1, t2, t3), depending on which of

        - (f(t1,t2,t3)-1) / (f(t1,t2)-1)
        - (f(t2,t3)-1) / (f(t2)-1)
        - (f(t3)-1) / (N-1)

        is the maximum value, increment l3, l2, or l1 by f(t1,t2,t3).

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)

        ''' 
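
        # A worked example of the case analysis above (hypothetical
        # counts): with f(t1,t2,t3)=3, f(t1,t2)=4, f(t2,t3)=6, f(t2)=10,
        # f(t3)=20 and N=100 tokens:
        #
        #   c3 = (3-1)/(4-1)    ~= 0.67
        #   c2 = (6-1)/(10-1)   ~= 0.56
        #   c1 = (20-1)/(100-1) ~= 0.19
        #
        # c3 is the maximum, so l3 is incremented by f(t1,t2,t3) = 3.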

 

        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in the system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in the system
            # (NOTE: tag actually represents (tag,C),
            # however this has no effect within this function)
            for tag in self._tri[history].samples():

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1))
                c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1))
                c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3 and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) / 2.0
                    tl3 += float(self._tri[history][tag]) / 2.0

                # if c1 and c2 are equal and larger than c3
                # (the benefit of splitting the count here is unverified)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) / 2.0
                    tl2 += float(self._tri[history][tag]) / 2.0

                # otherwise there might be a problem,
                # e.g. all values = 0
                else:
                    pass

        # lambda normalisation:
        # ensures that l1 + l2 + l3 = 1
        self._l1 = tl1 / (tl1 + tl2 + tl3)
        self._l2 = tl2 / (tl1 + tl2 + tl3)
        self._l3 = tl3 / (tl1 + tl2 + tl3)

    def _safe_div(self, v1, v2):
        '''
        Safe floating point division function; does not allow division by 0.
        Returns -1 if the denominator is 0.
        '''
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

 

    def tagdata(self, data):
        '''
        Tags each sentence in a list of sentences.

        :param data: list of lists of words
        :type data: [[string,],]
        :return: list of lists of (word, tag) tuples

        Invokes the tag(sent) function for each sentence and
        compiles the results into a list of tagged sentences,
        where each tagged sentence is a list of (word, tag) tuples.
        '''
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res

    def tag(self, data):
        '''
        Tags a single sentence.

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls the recursive function '_tagword'
        to produce a list of tags.

        Associates the sequence of returned tags
        with the correct words in the input sequence.

        Returns a list of (word, tag) tuples.
        '''

        current_state = [(['BOS', 'BOS'], 1.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        for i in range(len(sent)):
            # unpack and discard the C flags
            # (the first two entries of tags are the BOS markers)
            (t, C) = tags[i+2]
            res.append((sent[i], t))

        return res

    def _tagword(self, sent, current_states):
        '''
        :param sent: list of words remaining in the sentence
        :type sent: [word,]
        :param current_states: list of possible tag combinations for
                               the sentence so far, and the probability
                               associated with each tag combination
        :type current_states: [([tag,], prob),]

        Tags the first word in the sentence and
        recursively tags the remainder of the sentence.

        Uses the formula specified above to calculate the probability
        of a particular tag.

        ''' 
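
        # Sketch of how the beam evolves (hypothetical numbers): starting
        # from [(['BOS', 'BOS'], 1.0)], after tagging 'the' the states
        # might look like
        #   [(['BOS', 'BOS', ('DT', False)], 0.9),
        #    (['BOS', 'BOS', ('NN', False)], 0.1)]
        # After every word the list is re-sorted by probability and
        # truncated to the best N entries.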

 

        # if this word marks the end of the sentence,
        # return the most probable tag
        if sent == []:
            (h, p) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if capitalisation is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper():
            C = True

        # if the word is known,
        # compute the set of possible tags
        # and their associated probabilities
        if word in self._wd.conditions():
            self.known += 1

            for (history, curr_sent_prob) in current_states:
                probs = []

                for t in self._wd[word].samples():
                    p_uni = self._uni.freq((t, C))
                    p_bi = self._bi[history[-1]].freq((t, C))
                    p_tri = self._tri[tuple(history[-2:])].freq((t, C))
                    p_wd = float(self._wd[word][t]) / float(self._uni[(t, C)])
                    p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                    p2 = p * p_wd

                    probs.append(((t, C), p2))

                # compute the result of appending each tag to this history
                for (tag, prob) in probs:
                    new_states.append((history + [tag], curr_sent_prob * prob))

        # otherwise the word is new, and its set of possible tags is unknown
        else:
            self.unknown += 1

            # since a set of possible tags,
            # and the probability of each specific tag,
            # can not be returned from most classifiers,
            # specify that any unknown word is tagged with certainty
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ('Unk', C)

            # otherwise apply the unknown word tagger
            else:
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t, C)

            for (history, prob) in current_states:
                history.append(tag)

            new_states = current_states

        # now that a set of possible new_states has been computed,
        # sort the states by probability.
        # _cmp_tup is a comparison function;
        # the set is now ordered greatest to least probability
        new_states.sort(self._cmp_tup)

        # delete everything after N (the threshold);
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]

        # compute the tags for the rest of the sentence
        # and return the best list of tags for the sentence
        return self._tagword(sent, new_states)

    def _cmp_tup(self, tuple1, tuple2):
        # comparison function for sorting (history, prob) tuples
        # in order of decreasing probability
        (_h1, p1) = tuple1
        (_h2, p2) = tuple2
        if (p2 - p1) > 0:
            return 1
        else:
            return -1

########################################
# helper function -- basic sentence tokenizer
########################################

def basic_sent_chop(data, raw=True):
    '''
    Basic method for tokenizing input into sentences
    for this tagger:

    :param data: list of tokens (words or (word, tag) tuples)
    :type data: [string,] or [(string, string),]
    :param raw: boolean flag marking the input data
                as a list of words or a list of tagged words
    :type raw: bool
    :return: list of sentences,
             where each sentence is a list of tokens
             of the same form as the input

    The function takes a list of tokens and separates the tokens into
    lists, where each list represents a sentence fragment.
    It can separate both tagged and raw sequences into
    basic sentences.

    Sentence markers are the set of [,.!?]

    This is a simple method which enhances the performance of the TnT
    tagger. Better sentence tokenization will further enhance the results.

    ''' 
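
    # A small worked example (hypothetical input; raw=True is the default):
    #
    #   basic_sent_chop(['the', 'dog', 'barked', '.', 'the', 'cat', 'meowed', '!'])
    #   # -> [['the', 'dog', 'barked', '.'], ['the', 'cat', 'meowed', '!']]
    #
    # Note that trailing tokens not followed by a sentence marker are dropped.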

 

    new_data = []
    curr_sent = []
    sent_mark = [',', '.', '?', '!']

    if raw:
        for word in data:
            if word in sent_mark:
                curr_sent.append(word)
                new_data.append(curr_sent)
                curr_sent = []
            else:
                curr_sent.append(word)

    else:
        for (word, tag) in data:
            if word in sent_mark:
                curr_sent.append((word, tag))
                new_data.append(curr_sent)
                curr_sent = []
            else:
                curr_sent.append((word, tag))
    return new_data

def demo():
    from nltk.tag import tnt
    from nltk.corpus import brown
    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = tnt.TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j+100]
        for i in range(len(s)):
            print(s[i], '--', t[i])
        print()

def demo2():
    from nltk import tag
    from nltk.tag import tnt
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = tnt.TnT(N=1000, C=False)
    s = tnt.TnT(N=1000, C=True)
    t.train(d[(11)*100:])
    s.train(d[(11)*100:])

    for i in range(10):
        tacc = tag.accuracy(t, d[i*100:((i+1)*100)])
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = tag.accuracy(s, d[i*100:((i+1)*100)])
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn))

 

def demo3():
    from nltk import tag
    from nltk.corpus import treebank, brown
    from nltk.tag import tnt

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    # 10-fold cross-validation over both corpora
    for i in range(10):

        t = tnt.TnT(N=1000, C=False)
        s = tnt.TnT(N=1000, C=False)

        dtest = d[(i*d10):((i+1)*d10)]
        etest = e[(i*e10):((i+1)*e10)]

        dtrain = d[:(i*d10)] + d[((i+1)*d10):]
        etrain = e[:(i*e10)] + e[((i+1)*e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = tag.accuracy(t, dtest)
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = tag.accuracy(s, etest)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / sp_kn)
        tallacc += tacc
        sallacc += sacc

    # report the average over the 10 folds
    # (t was trained and tested on treebank, s on brown)
    print("treebank: acc over words known:", tknacc / 10)
    print("        : overall accuracy:", tallacc / 10)
    print("        : words known:", tknown / 10)
    print("brown: acc over words known:", sknacc / 10)
    print("     : overall accuracy:", sallacc / 10)
    print("     : words known:", sknown / 10)

if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)