Coverage for nltk.model.ngram: 75%
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2012 NLTK Project
# Authors: Steven Bird <sb@csse.unimelb.edu.au>
#          Daniel Blanchard <dan.blanchard@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              SimpleGoodTuringProbDist)
from nltk.util import ingrams
from nltk.model.api import ModelI
""" Default estimator function using a SimpleGoodTuringProbDist. """ # can't be an instance method of NgramModel as they # can't be pickled either. return SimpleGoodTuringProbDist(fdist)
""" A processing interface for assigning a probability to the next word. """
    # add cutoff
    def __init__(self, n, train, estimator=None,
                 *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text. An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.
        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        1.682...
        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list of string
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra properties
            for the probability distributions of individual conditions,
            such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified,
            the number of bins in the underlying ConditionalFreqDist is
            passed to the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kw_args: Extra keyword arguments for estimator.
        :type estimator_kw_args: (any)
        """
        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)

        # count each ngram of the padded training text; the context is the
        # first n-1 tokens, the counted token is the last
        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        if not estimator_args and not estimator_kw_args:
            # backward-compatibility: pass the number of bins to the estimator
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator,
                                              *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator,
                                       *estimator_args, **estimator_kw_args)
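    # Illustration (a sketch, not part of the original listing): with n=3 and
    # train=['a', 'b', 'c'], the padded stream is '', '', 'a', 'b', 'c', so the
    # counted trigrams are ('', '', 'a'), ('', 'a', 'b') and ('a', 'b', 'c');
    # the empty-string prefix lets the first real words get full-length contexts.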
""" Evaluate the probability of this word in this context using Katz Backoff.
:param word: the word to get the probability of :type word: str :param context: the context the word is in :type context: list(str) """
else:
else:
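    # Backoff arithmetic, illustrated (a sketch with hypothetical tokens):
    # for a trigram model, if ('the', 'dog', 'ran') never occurred in
    # training, prob('ran', ['the', 'dog']) evaluates to
    #     _alpha(('the', 'dog')) * _backoff.prob('ran', ['dog'])
    # i.e. the mass the discounted trigram distribution reserves for unseen
    # events is redistributed according to the bigram model.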
""" Evaluate the (negative) log probability of this word in this context.
:param word: the word to get the probability of :type word: str :param context: the context the word is in :type context: list(str) """
    def choose_random_word(self, context):
        '''
        Randomly select a word that is likely to appear in this context.

        :param context: the context the word is in
        :type context: list(str)
        '''
return self.generate(1, context)[-1]
    # NB, this will always start with same word since model
    # is trained on a single text
    def generate(self, num_words, context=()):
        '''
        Generate random text based on the language model.

        :param num_words: number of words to generate
        :type num_words: int
        :param context: initial words in generated string
        :type context: list(str)
        '''
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text
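    # Usage sketch (assuming the Brown-trained trigram model from the class
    # docstring):
    #
    #     lm.generate(10, context=['The', 'Fulton'])
    #
    # returns a 12-item list: the two seed words followed by ten generated words.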
    def _generate_one(self, context):
        context = (self._prefix + tuple(context))[-self._n + 1:]
        # print "Context (%d): <%s>" % (self._n, ','.join(context))
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'
""" Calculate the approximate cross-entropy of the n-gram model for a given evaluation text. This is the average log probability of each word in the text.
:param text: words to use for evaluation :type text: list(str) """
# Add prefix to front to correctly handle first n-1 words
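    # Accounting sketch for the loop above: with n=3 and a text of N words,
    # the (n-1)-item prefix pads the front, the loop scores positions
    # n-1 .. N+n-2 of the padded text, so every original word contributes
    # exactly one logprob term and the sum is divided by N.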
""" Calculates the perplexity of the given text. This is simply 2 ** cross-entropy for the text.
:param text: words to calculate perplexity of :type text: list(str) """
return pow(2.0, self.entropy(text))
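    # Worked relation (using the entropy doctest above): perplexity is
    # 2 ** entropy, so a cross-entropy of 1.682 bits/word corresponds to a
    # perplexity of 2 ** 1.682, roughly 3.2, i.e. the model is on average as
    # uncertain as a uniform choice among about 3.2 words.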
    def __contains__(self, item):
        return tuple(item) in self._model

    def __getitem__(self, item):
        return self._model[tuple(item)]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
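# End-to-end usage sketch (assumptions: NLTK 2.x with the Brown corpus
# downloaded; NgramModel importable from nltk.model):
#
#     from nltk.corpus import brown
#     from nltk.probability import LidstoneProbDist
#     from nltk.model import NgramModel
#
#     est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
#     lm = NgramModel(2, brown.words(categories='news'), est)
#     print lm.prob('jury', ['the'])                  # P(jury | the)
#     print lm.perplexity(['The', 'jury', 'said', '.'])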