Coverage for nltk.tree : 74%
![](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# -*- coding: utf-8 -*- # Natural Language Toolkit: Text Trees # # Copyright (C) 2001-2012 NLTK Project # Author: Edward Loper <edloper@gradient.cis.upenn.edu> # Steven Bird <sb@csse.unimelb.edu.au> # Peter Ljunglöf <peter.ljunglof@gu.se> # Nathan Bodenstab <bodenstab@cslu.ogi.edu> (tree transforms) # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT
Class for representing hierarchical language structures, such as syntax trees and morphological trees. """
# TODO: add LabelledTree (can be used for dependency trees)
###################################################################### ## Trees ######################################################################
""" A Tree represents a hierarchical grouping of leaves and subtrees. For example, each constituent in a syntax tree is represented by a single Tree.
A tree's children are encoded as a list of leaves and subtrees, where a leaf is a basic (non-tree) value; and a subtree is a nested Tree.
>>> from nltk.tree import Tree >>> print(Tree(1, [2, Tree(3, [4]), 5])) (1 2 (3 4) 5) >>> vp = Tree('VP', [Tree('V', ['saw']), ... Tree('NP', ['him'])]) >>> s = Tree('S', [Tree('NP', ['I']), vp]) >>> print(s) (S (NP I) (VP (V saw) (NP him))) >>> print(s[1]) (VP (V saw) (NP him)) >>> print(s[1,1]) (NP him) >>> t = Tree("(S (NP I) (VP (V saw) (NP him)))") >>> s == t True >>> t[1][1].node = "X" >>> print(t) (S (NP I) (VP (V saw) (X him))) >>> t[0], t[1,1] = t[1,1], t[0] >>> print(t) (S (X him) (VP (V saw) (NP I)))
The length of a tree is the number of children it has.
>>> len(t) 2
Any other properties that a Tree defines are known as node properties, and are used to add information about individual hierarchical groupings. For example, syntax trees use a NODE property to label syntactic constituents with phrase tags, such as "NP" and "VP".
Several Tree methods use "tree positions" to specify children or descendants of a tree. Tree positions are defined as follows:
- The tree position *i* specifies a Tree's *i*\ th child. - The tree position ``()`` specifies the Tree itself. - If *p* is the tree position of descendant *d*, then *p+i* specifies the *i*\ th child of *d*.
I.e., every tree position is either a single index *i*, specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*, specifying ``tree[i1][i2]...[iN]``.
Construct a new tree. This constructor can be called in one of two ways:
- ``Tree(node, children)`` constructs a new tree with the specified node value and list of children.
- ``Tree(s)`` constructs a new tree by parsing the string ``s``. It is equivalent to calling the class method ``Tree.parse(s)``. """ raise TypeError("%s: Expected a node value and child list " "or a single string" % type(self).__name__) raise TypeError("%s() argument 2 should be a list, not a " "string" % type(self).__name__) else:
#//////////////////////////////////////////////////////////// # Comparison operators #////////////////////////////////////////////////////////////
if not isinstance(other, Tree): return False return self.node <= other.node or list.__le__(self, other) if not isinstance(other, Tree): return False return self.node >= other.node or list.__ge__(self, other)
#//////////////////////////////////////////////////////////// # Disabled list operations #////////////////////////////////////////////////////////////
raise TypeError('Tree does not support multiplication') raise TypeError('Tree does not support multiplication') raise TypeError('Tree does not support addition') raise TypeError('Tree does not support addition')
#//////////////////////////////////////////////////////////// # Indexing (with support for tree positions) #////////////////////////////////////////////////////////////
else: else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__))
raise IndexError('The tree position () may not be ' 'assigned to.') else: else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__))
elif isinstance(index, (list, tuple)): if len(index) == 0: raise IndexError('The tree position () may not be deleted.') elif len(index) == 1: del self[index[0]] else: del self[index[0]][index[1:]] else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__))
#//////////////////////////////////////////////////////////// # Basic tree operations #////////////////////////////////////////////////////////////
""" Return the leaves of the tree.
>>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.leaves() ['the', 'dog', 'chased', 'the', 'cat']
:return: a list containing this tree's leaves. The order reflects the order of the leaves in the tree's hierarchical structure. :rtype: list """ else:
""" Return a flat version of the tree, with all non-root non-terminals removed.
>>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> print(t.flatten()) (S the dog chased the cat)
:return: a tree consisting of this tree's root connected directly to its leaves, omitting all intervening non-terminal nodes. :rtype: Tree """
""" Return the height of the tree.
>>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.height() 5 >>> print(t[0,0]) (D the) >>> t[0,0].height() 2
:return: The height of this tree. The height of a tree containing no children is 1; the height of a tree containing only leaves is 2; and the height of any other tree is one plus the maximum of its children's heights. :rtype: int """ else:
""" >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.treepositions() # doctest: +ELLIPSIS [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...] >>> for pos in t.treepositions('leaves'): ... t[pos] = t[pos][::-1].upper() >>> print(t) (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC))))
:param order: One of: ``preorder``, ``postorder``, ``bothorder``, ``leaves``. """ else:
""" Generate all the subtrees of this tree, optionally restricted to trees matching the filter function.
>>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> for s in t.subtrees(lambda t: t.height() == 2): ... print(s) (D the) (N dog) (V chased) (D the) (N cat)
:type filter: function :param filter: the function to filter all local trees """
""" Generate the productions that correspond to the non-terminal nodes of the tree. For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the form P -> C1 C2 ... Cn.
>>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.productions() [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased', NP -> D N, D -> 'the', N -> 'cat']
:rtype: list(Production) """
raise TypeError('Productions can only be generated from trees having node labels that are strings')
""" Return a sequence of pos-tagged words extracted from the tree.
>>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.pos() [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]
:return: a list of tuples containing leaves and pre-terminals (part-of-speech tags). The order reflects the order of the leaves in the tree's hierarchical structure. :rtype: list(tuple) """ else:
""" :return: The tree position of the ``index``-th leaf in this tree. I.e., if ``tp=self.leaf_treeposition(i)``, then ``self[tp]==self.leaves()[i]``.
:raise IndexError: If this tree contains fewer than ``index+1`` leaves, or if ``index<0``. """ if index < 0: raise IndexError('index must be non-negative')
stack = [(self, ())] while stack: value, treepos = stack.pop() if not isinstance(value, Tree): if index == 0: return treepos else: index -= 1 else: for i in range(len(value)-1, -1, -1): stack.append( (value[i], treepos+(i,)) )
raise IndexError('index must be less than or equal to len(self)')
""" :return: The tree position of the lowest descendant of this tree that dominates ``self.leaves()[start:end]``. :raise ValueError: if ``end <= start`` """ if end <= start: raise ValueError('end must be greater than start') # Find the tree positions of the start & end leaves, and # take the longest common subsequence. start_treepos = self.leaf_treeposition(start) end_treepos = self.leaf_treeposition(end-1) # Find the first index where they mismatch: for i in range(len(start_treepos)): if i == len(end_treepos) or start_treepos[i] != end_treepos[i]: return start_treepos[:i] return start_treepos
#//////////////////////////////////////////////////////////// # Transforms #////////////////////////////////////////////////////////////
""" This method can modify a tree in three ways:
1. Convert a tree into its Chomsky Normal Form (CNF) equivalent -- Every subtree has either two non-terminals or one terminal as its children. This process requires the creation of more "artificial" non-terminal nodes. 2. Markov (horizontal) smoothing of children in new artificial nodes 3. Vertical (parent) annotation of nodes
:param factor: Right or left factoring method (default = "right") :type factor: str = [left|right] :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings) :type horzMarkov: int | None :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation) :type vertMarkov: int | None :param childChar: A string used in construction of the artificial nodes, separating the head of the original subtree from the child nodes that have yet to be expanded (default = "|") :type childChar: str :param parentChar: A string used to separate the node representation from its vertical annotation :type parentChar: str """ from .treetransforms import chomsky_normal_form chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
""" This method modifies the tree in three ways:
1. Transforms a tree in Chomsky Normal Form back to its original structure (branching greater than two) 2. Removes any parent annotation (if it exists) 3. (optional) expands unary subtrees (if previously collapsed with collapseUnary(...) )
:param expandUnary: Flag to expand unary or not (default = True) :type expandUnary: bool :param childChar: A string separating the head node from its children in an artificial node (default = "|") :type childChar: str :param parentChar: A string separating the node label from its parent annotation (default = "^") :type parentChar: str :param unaryChar: A string joining two non-terminals in a unary production (default = "+") :type unaryChar: str """ from .treetransforms import un_chomsky_normal_form un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
""" Collapse subtrees with a single child (ie. unary productions) into a new non-terminal (Tree node) joined by 'joinChar'. This is useful when working with algorithms that do not allow unary productions, and completely removing the unary productions would require loss of useful information. The Tree is modified directly (since it is passed by reference) and no value is returned.
:param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. Part-of-Speech tags) since they are always unary productions :type collapsePOS: bool :param collapseRoot: 'False' (default) will not modify the root production if it is unary. For the Penn WSJ treebank corpus, this corresponds to the TOP -> productions. :type collapseRoot: bool :param joinChar: A string used to connect collapsed node values (default = "+") :type joinChar: str """ from .treetransforms import collapse_unary collapse_unary(self, collapsePOS, collapseRoot, joinChar)
#//////////////////////////////////////////////////////////// # Convert, copy #////////////////////////////////////////////////////////////
def convert(cls, tree): """ Convert a tree between different subtypes of Tree. ``cls`` determines which class will be used to encode the new tree.
:type tree: Tree :param tree: The tree that should be converted. :return: The new Tree. """ else:
else: newcopy = self.copy(deep=True) for pos in newcopy.treepositions('leaves'): newcopy[pos] = leaf_freezer(newcopy[pos]) newcopy = frozen_class.convert(newcopy)
#//////////////////////////////////////////////////////////// # Parsing #////////////////////////////////////////////////////////////
node_pattern=None, leaf_pattern=None, remove_empty_top_bracketing=False): """ Parse a bracketed tree string and return the resulting tree. Trees are represented as nested bracketings, such as::
(S (NP (NNP John)) (VP (V runs)))
:type s: str :param s: The string to parse
:type brackets: str (length=2) :param brackets: The bracket characters used to mark the beginning and end of trees and subtrees.
:type parse_node: function :type parse_leaf: function :param parse_node, parse_leaf: If specified, these functions are applied to the substrings of ``s`` corresponding to nodes and leaves (respectively) to obtain the values for those nodes and leaves. They should have the following signature:
parse_node(str) -> value
For example, these functions could be used to parse nodes and leaves whose values should be some type other than string (such as ``FeatStruct``). Note that by default, node strings and leaf strings are delimited by whitespace and brackets; to override this default, use the ``node_pattern`` and ``leaf_pattern`` arguments.
:type node_pattern: str :type leaf_pattern: str :param node_pattern, leaf_pattern: Regular expression patterns used to find node and leaf substrings in ``s``. By default, both patterns are defined to match any sequence of non-whitespace non-bracket characters.
:type remove_empty_top_bracketing: bool :param remove_empty_top_bracketing: If the resulting tree has an empty node label, and is length one, then return its single child instead. This is useful for treebank trees, which sometimes contain an extra level of bracketing.
:return: A tree corresponding to the string representation ``s``. If this class method is called using a subclass of Tree, then it will return a tree of that type. :rtype: Tree """ # Construct a regexp that will tokenize the string. open_pattern, node_pattern, close_pattern, leaf_pattern)) # Walk through each token, updating a stack of trees. # Beginning of a tree/subtree # End of a tree/subtree else: # Leaf node else:
# check that we got exactly one complete tree. else:
# If the tree has an extra level with node='', then get rid of # it. E.g.: "((S (NP ...) (VP ...)))" # return the tree.
def _parse_error(cls, s, match, expecting): """ Display a friendly error message when parsing a tree string fails. :param s: The string we're parsing. :param match: regexp match of the problem token. :param expecting: what we expected to see instead. """ # Construct a basic error message else: cls.__name__, expecting, token, ' '*12, pos) # Add a display showing the error token itself:
#//////////////////////////////////////////////////////////// # Visualization & String Representation #////////////////////////////////////////////////////////////
""" Open a new window containing a graphical diagram of this tree. """ from nltk.draw.tree import draw_trees draw_trees(self)
""" :return: A pretty-printed string representation of this tree. :rtype: str :param margin: The right margin at which to do line-wrapping. :type margin: int :param indent: The indentation level at which printing begins. This number is used to decide how far to indent subsequent lines. :type indent: int :param nodesep: A string that is used to separate the node from the children. E.g., the default value ``':'`` gives trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``. """
# Try writing it on one line.
# If it doesn't fit on one line, then write it on multi-lines. else: nodesep, parens, quotes) elif isinstance(child, compat.string_types) and not quotes: s += '\n'+' '*(indent+2)+ '%s' % child else: s += '\n'+' '*(indent+2)+ '%r' % child
r""" Returns a representation of the tree compatible with the LaTeX qtree package. This consists of the string ``\Tree`` followed by the parse tree represented in bracketed notation.
For example, the following result was generated from a parse tree of the sentence ``The announcement astounded us``::
\Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]
See http://www.ling.upenn.edu/advice/latex.html for the LaTeX style file for the qtree package.
:return: A latex qtree representation of this tree. :rtype: str """
else: " ".join(childstrs), parens[1]) else: " ".join(childstrs), parens[1])
# Precompute our hash value. This ensures that we're really # immutable. It also means we only have to calculate it once. "must be immutable" % type(self).__name__)
raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__) raise ValueError('%s may not be modified' % type(self).__name__)
"""Get the node value""" """ Set the node value. This will only succeed the first time the node value is set, which should occur in ImmutableTree.__init__(). """
###################################################################### ## Parented trees ######################################################################
""" An abstract base class for a ``Tree`` that automatically maintains pointers to parent nodes. These parent pointers are updated whenever any change is made to a tree's structure. Two subclasses are currently defined:
- ``ParentedTree`` is used for tree structures where each subtree has at most one parent. This class should be used in cases where there is no "sharing" of subtrees.
- ``MultiParentedTree`` is used for tree structures where a subtree may have zero or more parents. This class should be used in cases where subtrees may be shared.
Subclassing =========== The ``AbstractParentedTree`` class redefines all operations that modify a tree's structure to call two methods, which are used by subclasses to update parent information:
- ``_setparent()`` is called whenever a new child is added. - ``_delparent()`` is called whenever a child is removed. """
#//////////////////////////////////////////////////////////// # Parent management #////////////////////////////////////////////////////////////
""" Update the parent pointer of ``child`` to point to ``self``. This method is only called if the type of ``child`` is ``Tree``; i.e., it is not called when adding a leaf to a tree. This method is always called before the child is actually added to the child list of ``self``.
:type child: Tree :type index: int :param index: The index of ``child`` in ``self``. :raise TypeError: If ``child`` is a tree with an inappropriate type. Typically, if ``child`` is a tree, then its type needs to match the type of ``self``. This prevents mixing of different tree types (single-parented, multi-parented, and non-parented). :param dry_run: If true, then don't actually set the child's parent pointer; just check for any error conditions, and raise an exception if one is found. """ raise NotImplementedError()
""" Update the parent pointer of ``child`` to not point to self. This method is only called if the type of ``child`` is ``Tree``; i.e., it is not called when removing a leaf from a tree. This method is always called before the child is actually removed from the child list of ``self``.
:type child: Tree :type index: int :param index: The index of ``child`` in ``self``. """ raise NotImplementedError()
#//////////////////////////////////////////////////////////// # Methods that add/remove children #//////////////////////////////////////////////////////////// # Every method that adds or removes a child must make # appropriate calls to _setparent() and _delparent().
# del ptree[start:stop] # Clear all the children pointers. # Delete the children from our child list.
# del ptree[i] # Clear the child's parent pointer. # Remove the child from our child list.
# del ptree[()] # del ptree[(i,)] # del ptree[i1, i2, i3] else:
else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__))
# ptree[start:stop] = value # make a copy of value, in case it's an iterator # Check for any error conditions, so we can avoid ending # up in an inconsistent state if an error does occur. # clear the child pointers of all parents we're removing # set the child pointers of the new children. We do this # after clearing *all* child pointers, in case we're e.g. # reversing the elements in a tree. # finally, update the content of the child list itself.
# ptree[i] = value # if the value is not changing, do nothing. return # Set the new child's parent pointer. # Remove the old child's parent pointer # Update our child list.
# ptree[()] = value # ptree[(i,)] = value # ptree[i1, i2, i3] = value else:
else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__))
# Handle negative indexes. Note that if index < -len(self), # we do *not* raise an IndexError, unlike __getitem__. This # is done for consistency with list.__getitem__ and list.index. # Set the child's parent, and update our child list.
# n.b.: like `list`, this is done by equality, not identity! # To remove a specific child, use del ptree[i].
# We need to implement __getslice__ and friends, even though # they're deprecated, because otherwise list.__getslice__ will get # called (since we're subclassing from list). Just delegate to # __getitem__ etc., but use max(0, start) and max(0, stop) # because negative indices are already handled *before* # __getslice__ is called; and we don't want to double-count them. return self.__getitem__(slice(max(0, start), max(0, stop)))
""" A ``Tree`` that automatically maintains parent pointers for single-parented trees. The following are methods for querying the structure of a parented tree: ``parent``, ``parent_index``, ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``.
Each ``ParentedTree`` may have at most one parent. In particular, subtrees may not be shared. Any attempt to reuse a single ``ParentedTree`` as a child of more than one parent (or as multiple children of the same parent) will cause a ``ValueError`` exception to be raised.
``ParentedTrees`` should never be used in the same tree as ``Trees`` or ``MultiParentedTrees``. Mixing tree implementations may result in incorrect parent pointers and in ``TypeError`` exceptions. """ """The parent of this Tree, or None if it has no parent."""
#///////////////////////////////////////////////////////////////// # Methods #/////////////////////////////////////////////////////////////////
"""The parent of this tree, or None if it has no parent."""
""" The index of this tree in its parent. I.e., ``ptree.parent()[ptree.parent_index()] is ptree``. Note that ``ptree.parent_index()`` is not necessarily equal to ``ptree.parent.index(ptree)``, since the ``index()`` method returns the first child that is equal to its argument. """
"""The left sibling of this tree, or None if it has none.""" return self._parent[parent_index-1]
"""The right sibling of this tree, or None if it has none.""" return self._parent[parent_index+1]
""" The root of this tree. I.e., the unique ancestor of this tree whose parent is None. If ``ptree.parent()`` is None, then ``ptree`` is its own root. """
""" The tree position of this tree, relative to the root of the tree. I.e., ``ptree.root[ptree.treeposition] is ptree``. """
#///////////////////////////////////////////////////////////////// # Parent Management #/////////////////////////////////////////////////////////////////
# Sanity checks
# Delete child's parent pointer.
# If the child's type is incorrect, then complain. raise TypeError('Can not insert a non-ParentedTree '+ 'into a ParentedTree')
# If child already has a parent, then complain. raise ValueError('Can not insert a subtree that already ' 'has a parent.')
# Set child's parent pointer & index.
""" A ``Tree`` that automatically maintains parent pointers for multi-parented trees. The following are methods for querying the structure of a multi-parented tree: ``parents()``, ``parent_indices()``, ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``.
Each ``MultiParentedTree`` may have zero or more parents. In particular, subtrees may be shared. If a single ``MultiParentedTree`` is used as multiple children of the same parent, then that parent will appear multiple times in its ``parents()`` method.
``MultiParentedTrees`` should never be used in the same tree as ``Trees`` or ``ParentedTrees``. Mixing tree implementations may result in incorrect parent pointers and in ``TypeError`` exceptions. """ """A list of this tree's parents. This list should not contain duplicates, even if a parent contains this tree multiple times."""
#///////////////////////////////////////////////////////////////// # Methods #/////////////////////////////////////////////////////////////////
""" The set of parents of this tree. If this tree has no parents, then ``parents`` is the empty set. To check if a tree is used as multiple children of the same parent, use the ``parent_indices()`` method.
:type: list(MultiParentedTree) """
""" A list of all left siblings of this tree, in any of its parent trees. A tree may be its own left sibling if it is used as multiple contiguous children of the same parent. A tree may appear multiple times in this list if it is the left sibling of this tree with respect to multiple parents.
:type: list(MultiParentedTree) """ for (parent, index) in self._get_parent_indices() if index > 0]
""" A list of all right siblings of this tree, in any of its parent trees. A tree may be its own right sibling if it is used as multiple contiguous children of the same parent. A tree may appear multiple times in this list if it is the right sibling of this tree with respect to multiple parents.
:type: list(MultiParentedTree) """ for (parent, index) in self._get_parent_indices() if index < (len(parent)-1)]
for parent in self._parents for index, child in enumerate(parent) if child is self]
""" The set of all roots of this tree. This set is formed by tracing all possible parent paths until trees with no parents are found.
:type: list(MultiParentedTree) """
for parent in self._parents: parent._get_roots_helper(result) else:
""" Return a list of the indices where this tree occurs as a child of ``parent``. If this child does not occur as a child of ``parent``, then the empty list is returned. The following is always true::
for parent_index in ptree.parent_indices(parent): parent[parent_index] is ptree """ if parent not in self._parents: return [] else: return [index for (index, child) in enumerate(parent) if child is self]
""" Return a list of all tree positions that can be used to reach this multi-parented tree starting from ``root``. I.e., the following is always true::
for treepos in ptree.treepositions(root): root[treepos] is ptree """ else: return [treepos+(index,) for parent in self._parents for treepos in parent.treepositions(root) for (index, child) in enumerate(parent) if child is self]
#///////////////////////////////////////////////////////////////// # Parent Management #/////////////////////////////////////////////////////////////////
# Sanity checks
# If the only copy of child in self is at index, then delete # self from child's parent list. else:
# If the child's type is incorrect, then complain. raise TypeError('Can not insert a non-MultiParentedTree '+ 'into a MultiParentedTree')
# Add self as a parent pointer if it's not already listed. if parent is self: break else:
###################################################################### ## Probabilistic trees ######################################################################
# We have to patch up these methods to make them work right: return '%s (p=%s)' % (Tree.__repr__(self), self.prob()) return Tree.__cmp__(self, other) or cmp(self.prob(), other.prob()) return not (self == other) def convert(cls, val): else: return cls(val.node, children, prob=1.0) else:
# We have to patch up these methods to make them work right: return '%s [%s]' % (Tree.__repr__(self), self.prob()) return '%s [%s]' % (self.pprint(margin=60), self.prob()) c = Tree.__cmp__(self, other) if c != 0: return c return cmp(self.prob(), other.prob()) if not isinstance(other, Tree): return False return Tree.__eq__(self, other) and self.prob()==other.prob() return not (self == other) if not deep: return type(self)(self.node, self, prob=self.prob()) else: return type(self).convert(self) def convert(cls, val): else: return cls(val.node, children, prob=1.0) else:
else:
###################################################################### ## Parsing ######################################################################
""" Use Tree.parse(s, remove_empty_top_bracketing=True) instead. """ raise NameError("Use Tree.parse(s, remove_empty_top_bracketing=True) instead.")
""" Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings, as shown in the following example (X represents a Chinese character): S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)
:return: A tree corresponding to the string representation. :rtype: Tree :param s: The string to be converted :type s: str """ tokens = re.split(r'([()| ])', s) for i in range(len(tokens)): if tokens[i] == '(': tokens[i-1], tokens[i] = tokens[i], tokens[i-1] # pull nonterminal inside parens elif ':' in tokens[i]: fields = tokens[i].split(':') if len(fields) == 2: # non-terminal tokens[i] = fields[1] else: tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" elif tokens[i] == '|': tokens[i] = ''
treebank_string = " ".join(tokens) return Tree.parse(treebank_string, remove_empty_top_bracketing=True)
# s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier # s = re.sub(r'\w+:', '', s) # remove role tags
# return s
###################################################################### ## Demonstration ######################################################################
""" A demonstration showing how Trees and Trees can be used. This demonstration creates a Tree, and loads a Tree from the Treebank corpus, and shows the results of calling several of their methods. """
from nltk import tree
# Demonstrate tree parsing. s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))' t = Tree(s) print("Convert bracketed string into tree:") print(t) print(t.__repr__())
print("Display tree properties:") print(t.node) # tree's constituent type print(t[0]) # tree's first child print(t[1]) # tree's second child print(t.height()) print(t.leaves()) print(t[1]) print(t[1,1]) print(t[1,1,0])
# Demonstrate tree modification. the_cat = t[0] the_cat.insert(1, tree.Tree.parse('(JJ big)')) print("Tree modification:") print(t) t[1,1,1] = tree.Tree.parse('(NN cake)') print(t) print()
# Tree transforms print("Collapse unary:") t.collapse_unary() print(t) print("Chomsky normal form:") t.chomsky_normal_form() print(t) print()
# Demonstrate probabilistic trees. pt = tree.ProbabilisticTree('x', ['y', 'z'], prob=0.5) print("Probabilistic Tree:") print(pt) print()
# Demonstrate parsing of treebank output format. t = tree.Tree.parse(t.pprint()) print("Convert tree to bracketed string and back again:") print(t) print()
# Demonstrate LaTeX output print("LaTeX output:") print(t.pprint_latex_qtree()) print()
# Demonstrate Productions print("Production output:") print(t.productions()) print()
# Demonstrate tree nodes containing objects other than strings t.node = ('test', 3) print(t)
'ProbabilisticTree', 'Tree', 'bracket_parse', 'sinica_parse', 'ParentedTree', 'MultiParentedTree', 'ImmutableParentedTree', 'ImmutableMultiParentedTree']
import doctest doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
|