Coverage for nltk.align : 91%
![](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: Aligned Sentences # # Copyright (C) 2001-2012 NLTK Project # Author: Will Zhang <wilzzha@gmail.com> # Guan Gui <ggui@student.unimelb.edu.au> # Steven Bird <stevenbird1@gmail.com> # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT
""" Return an aligned sentence object, which encapsulates two sentences along with an ``Alignment`` between them.
>>> from nltk.align import AlignedSent >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'], ... ['the', 'house', 'is', 'small'], '0-2 1-3 2-1 3-0') >>> algnsent.words ['klein', 'ist', 'das', 'Haus'] >>> algnsent.mots ['the', 'house', 'is', 'small'] >>> algnsent.alignment Alignment([(0, 2), (1, 3), (2, 1), (3, 0)]) >>> algnsent.precision('0-2 1-3 2-1 3-3') 0.75 >>> from nltk.corpus import comtrans >>> print(comtrans.aligned_sents()[54]) <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'> >>> print(comtrans.aligned_sents()[54].alignment) 0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
:param words: source language words :type words: list(str) :param mots: target language words :type mots: list(str) :param alignment: the word-level alignments between the source and target language :type alignment: Alignment """
encoding = 'latin-1'):
def words(self):
def mots(self):
""" Check whether the alignments are legal.
:param a: alignment to be checked :raise IndexError: if alignment is out of sentence boundary :rtype: boolean """ raise IndexError("Alignment is outside boundary of words")
""" Return a string representation for this ``AlignedSent``.
:rtype: str """
""" Return a human-readable string representation for this ``AlignedSent``.
:rtype: str """
""" Return the aligned sentence pair, reversing the directionality
:rtype: AlignedSent """ self._alignment.invert())
""" Return the precision of an aligned sentence with respect to a "gold standard" reference ``AlignedSent``.
:type reference: AlignedSent or Alignment :param reference: A "gold standard" reference aligned sentence. :rtype: float or None """ # Get alignments in set of 2-tuples form # The "possible" precision is used since it doesn't penalize for finding # an alignment that was marked as "possible" (NAACL corpus)
else:
""" Return the recall of an aligned sentence with respect to a "gold standard" reference ``AlignedSent``.
:type reference: AlignedSent or Alignment :param reference: A "gold standard" reference aligned sentence. :rtype: float or None """ # Get alignments in set of 2-tuples form # The "sure" recall is used so we don't penalize for missing an # alignment that was only marked as "possible".
else:
# Call NLTK's existing functions for recall
""" Return the Alignment Error Rate (AER) of an aligned sentence with respect to a "gold standard" reference ``AlignedSent``.
Return an error rate between 0.0 (perfect alignment) and 1.0 (no alignment).
>>> from nltk.align import AlignedSent >>> s = AlignedSent(["the", "cat"], ["le", "chat"], [(0, 0), (1, 1)]) >>> s.alignment_error_rate(s) 0.0
:type reference: AlignedSent or Alignment :param reference: A "gold standard" reference aligned sentence. :type possible: AlignedSent or Alignment or None :param possible: A "gold standard" reference of possible alignments (defaults to *reference* if None) :rtype: float or None """ # Get alignments in set of 2-tuples form else:
# Set possible alignment possible = possible.alignment else: else: # Possible alignment is just sure alignment
# Sanity check
# Return the Alignment Error Rate float(len(align) + len(sure)))
""" A storage class for representing alignment between two sequences, s1, s2. In general, an alignment is a set of tuples of the form (i, j, ...) representing an alignment between the i-th element of s1 and the j-th element of s2. Tuples are extensible (they might contain additional data, such as a boolean to indicate sure vs possible alignments).
>>> from nltk.align import Alignment >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)]) >>> a.invert() Alignment([(0, 0), (1, 0), (2, 1), (2, 2)]) >>> print(a.invert()) 0-0 1-0 2-1 2-2 >>> a[0] [(0, 1), (0, 0)] >>> a.invert()[2] [(2, 1), (2, 2)] >>> b = Alignment([(0, 0), (0, 1)]) >>> b.issubset(a) True >>> c = Alignment('0-0 0-1') >>> b == c True """
else:
""" Look up the alignments that map from a given index or slice. """
""" Return an Alignment object, being the inverted mapping. """
""" Work out the range of the mapping from the given positions. If no positions are specified, compute the range of the entire mapping. """ image = set() if not self._index: self._build_index() if not positions: positions = list(range(len(self._index))) for p in positions: image.update(f for _,f in self._index[p]) return sorted(image)
""" Produce a Giza-formatted string representing the alignment. """
""" Produce a Giza-formatted string representing the alignment. """
""" Build a list self._index such that self._index[i] is a list of the alignments originating from word i. """
""" This class implements the Expectation Maximization algorithm for IBM Model 1. The algorithm runs upon a sentence-aligned parallel corpus and generates word alignments in aligned sentence pairs. The process is divided into 2 stages:
- Stage 1: Calculates word-to-word translation probabilities by collecting evidence of an English word being the translation of a foreign word from the parallel corpus. - Stage 2: Generates updated word alignments for the sentence pairs, based on the translation probabilities from Stage 1.
>>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']), ... AlignedSent(['the', 'book'], ['das', 'Buch']), ... AlignedSent(['a', 'book'], ['ein', 'Buch'])] >>> ibm1 = IBMModel1(corpus) >>> print("%.1f" % ibm1.probabilities['book', 'Buch']) 1.0 >>> print("%.1f" % ibm1.probabilities['book', 'das']) 0.0 >>> print("%.1f" % ibm1.probabilities['book', None]) 0.5
:param aligned_sents: The parallel text ``corpus.Iterable`` containing AlignedSent instances of aligned sentence pairs from the corpus. :type aligned_sents: list(AlignedSent) :param convergent_threshold: The threshold value of convergence. An entry is considered converged if the delta from ``old_t`` to ``new_t`` is less than this value. The algorithm terminates when all entries are converged. This parameter is optional; the default is 0.01. :type convergent_threshold: float """
# Dictionary of translation probabilities t(e,f).
""" Perform Expectation Maximization training to learn word-to-word translation probabilities. """
# Collect up sets of all English and foreign words # add the NULL token to the foreign word set.
# Initialise t(e|f) uniformly
# count(e|f) # total(f)
# Compute normalization
# Collect counts
# Estimate probabilities
# Have we converged (num_converged, num_probs, 100.0*num_converged/num_probs))
""" Return a list of AlignedSents with Alignments calculated using IBM-Model 1. """
raise ValueError("No probabilities calculated")
# Alignment Learning from t(e|f)
# for every English word # find the foreign word that maximizes t(e|f) # NULL token is the initial candidate
# only output alignment with non-NULL mapping
# substitute the alignment of AlignedSent with the yielded one aligned_sent.mots, alignment))
i, j, p = pair_string.split("-") return int(i), int(j)
import doctest doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) |